% Collins and Hrbek (2015), preprint hosted on bioRxiv (Cold Spring Harbor Laboratory).
% The journal field carries the preprint server name; doi and url are the primary locators.
% elocation-id and pdfurl are non-standard fields that standard styles ignore.
% pdfurl holds the full-text PDF link; it is deliberately not stored in eprint,
% which is reserved for archive identifiers rather than URLs.
@article{Collins032565,
  author       = {Collins, Rupert A. and Hrbek, Tomas},
  title        = {An in silico comparison of reduced-representation and sequence-capture protocols for phylogenomics},
  journal      = {bioRxiv},
  publisher    = {Cold Spring Harbor Laboratory},
  year         = {2015},
  elocation-id = {032565},
  doi          = {10.1101/032565},
  url          = {https://www.biorxiv.org/content/early/2015/11/21/032565},
  pdfurl       = {https://www.biorxiv.org/content/early/2015/11/21/032565.full.pdf},
  abstract     = {In the age of genome-scale DNA sequencing, choice of molecular marker arguably remains an important decision in planning a phylogenetic study. Using published genomes from 23 primate species, we make a standardized comparison of four of the most frequently used protocols in phylogenomics, viz., targeted sequence-enrichment using ultraconserved element and exon-capture probes, and reduced genomic representation using restriction-site-associated DNA sequencing (RADseq and ddRAD-seq). Here we present a procedure to perform in silico extractions from genomes and create directly comparable datasets for each class of marker. We then compare these datasets in terms of both phylogenetic resolution and ability to consistently and precisely estimate clade ages using fossil-calibrated molecular-clock models. Furthermore, we were also able to directly compare these results to previously published datasets from Sanger-sequenced nuclear exons and mitochondrial genomes under the same analytical conditions. Our results show{\textemdash}although with the exception of the mitochondrial genome and ddRADseq datasets{\textemdash}that for uncontroversial nodes all data classes performed equally well, i.e. they recovered the same well supported topology. However, for one difficult-to-resolve node comprising a rapid diversification (subfamilial relationships among the Cebidae), we report well supported but conflicting topologies among the marker classes, likely the result of mismodelling of gene tree heterogeneity.
Likewise, clade age estimates showed consistent discrepancies between datasets; for recent nodes, clade ages estimated by nuclear exon datasets were younger than those of the UCE, RAD and mitochondrial data, but vice versa for the deepest nodes in the primate phylogeny. This effect can be explained by temporal differences in phylogenetic informativeness and choice of clock model used. Finally, we conclude by emphasizing that while huge numbers of loci are probably not required for uncontroversial phylogenetic questions{\textemdash}for which practical considerations such as cost and ease of data generation/sharing/aggregating therefore become increasingly important{\textemdash}accurately modelling heterogeneous data remains as relevant as ever for the more recalcitrant problems.},
}