% bioRxiv preprint describing the TreeToReads simulation pipeline.
% NOTE(review): `eprint` holds the full-text PDF URL as exported, not an
% archive identifier; eprint-aware styles may print it oddly -- confirm
% against the target bibliography style before relying on it.
% `elocation-id` is not a standard field and is ignored by standard styles.
@article{McTavish037655,
  author        = {McTavish, Emily Jane and Pettengill, James and Davis, Steve and Rand, Hugh and Strain, Errol and Allard, Marc and Timme, Ruth E.},
  title         = {{TreeToReads} - a pipeline for simulating raw reads from phylogenies},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2016},
  elocation-id  = {037655},
  doi           = {10.1101/037655},
  url           = {https://www.biorxiv.org/content/early/2016/01/22/037655},
  eprint        = {https://www.biorxiv.org/content/early/2016/01/22/037655.full.pdf},
  abstract      = {Using genome-wide SNP-based methods for tracking pathogens has become standard practice in academia and public health agencies. There are multiple computational approaches available that perform a similar task: call variants by mapping short read data against a reference genome, quality filter these variants, then concatenate the variants into a sequence matrix for downstream phylogenetic analysis. However, there are no existing methods to validate the accuracy of these approaches despite the fact that we know there are parameters that can affect whether a SNP is called, or the correct tree is recovered. We present a simulation approach (TreeToReads) to generate raw read data from mutated genomes simulated under a known phylogeny. The user can vary parameters of interest at each step in the simulation (e.g., topology, model of sequence evolution, and read coverage) to assess the robustness of a given result, which is critical within both research and applied settings. Source code, examples, and a tutorial are available at https://github.com/snacktavish/TreeToReads.},
}