% bioRxiv preprint entry (journal = bioRxiv, publisher = Cold Spring Harbor Laboratory).
% The consortium is double-braced so BibTeX treats it as one indivisible
% corporate name instead of splitting it into given/family parts.
% Personal names use the "Last, First" form so that multi-word surnames
% (De Bakker, Van Duijn, de Ridder) are parsed unambiguously.
% The eprint and url values are kept exactly as exported.
@article{043430,
  author        = {{The Computational Pan-Genomics Consortium} and
                   Marschall, Tobias and
                   Marz, Manja and
                   Abeel, Thomas and
                   Dijkstra, Louis and
                   Dutilh, Bas E. and
                   Ghaffaari, Ali and
                   Kersey, Paul and
                   Kloosterman, Wigard P. and
                   M{\"a}kinen, Veli and
                   Novak, Adam M. and
                   Paten, Benedict and
                   Porubsky, David and
                   Rivals, Eric and
                   Alkan, Can and
                   Baaijens, Jasmijn and
                   De Bakker, Paul I. W. and
                   Boeva, Valentina and
                   Bonnal, Raoul J. P. and
                   Chiaromonte, Francesca and
                   Chikhi, Rayan and
                   Ciccarelli, Francesca D. and
                   Cijvat, Robin and
                   Datema, Erwin and
                   Van Duijn, Cornelia M. and
                   Eichler, Evan E. and
                   Ernst, Corinna and
                   Eskin, Eleazar and
                   Garrison, Erik and
                   El-Kebir, Mohammed and
                   Klau, Gunnar W. and
                   Korbel, Jan O. and
                   Lameijer, Eric-Wubbo and
                   Langmead, Benjamin and
                   Martin, Marcel and
                   Medvedev, Paul and
                   Mu, John C. and
                   Neerincx, Pieter and
                   Ouwens, Klaasjan and
                   Peterlongo, Pierre and
                   Pisanti, Nadia and
                   Rahmann, Sven and
                   Raphael, Ben and
                   Reinert, Knut and
                   de Ridder, Dick and
                   de Ridder, Jeroen and
                   Schlesner, Matthias and
                   Schulz-Trieglaff, Ole and
                   Sanders, Ashley D. and
                   Sheikhizadeh, Siavash and
                   Shneider, Carl and
                   Smit, Sandra and
                   Valenzuela, Daniel and
                   Wang, Jiayin and
                   Wessels, Lodewyk and
                   Zhang, Ying and
                   Guryev, Victor and
                   Vandin, Fabio and
                   Ye, Kai and
                   Sch{\"o}nhuth, Alexander},
  title         = {Computational Pan-Genomics: Status, Promises and Challenges},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2016},
  elocation-id  = {043430},
  doi           = {10.1101/043430},
  url           = {https://www.biorxiv.org/content/early/2016/08/25/043430},
  eprint        = {https://www.biorxiv.org/content/early/2016/08/25/043430.full.pdf},
  abstract      = {Many disciplines, from human genetics and oncology to plant breeding, microbiology and virology, commonly face the challenge of analyzing rapidly increasing numbers of genomes. In case of Homo sapiens, the number of sequenced genomes will approach hundreds of thousands in the next few years. Simply scaling up established bioinformatics pipelines will not be sufficient for leveraging the full potential of such rich genomic datasets. Instead, novel, qualitatively different computational methods and paradigms are needed.
We will witness the rapid extension of computational pan-genomics, a new sub-area of research in computational biology. In this paper, we generalize existing definitions and understand a pan-genome as any collection of genomic sequences to be analyzed jointly or to be used as a reference. We examine already available approaches to construct and use pan-genomes, discuss the potential benefits of future technologies and methodologies, and review open challenges from the vantage point of the above-mentioned biological disciplines. As a prominent example for a computational paradigm shift, we particularly highlight the transition from the representation of reference genomes as strings to representations as graphs. We outline how this and other challenges from different application domains translate into common computational problems, point out relevant bioinformatics techniques and identify open problems in computer science. With this review, we aim to increase awareness that a joint approach to computational pan-genomics can help address many of the problems currently faced in various domains.},
}