% Ochoa, Hernaez, Goldfeder, Weissman, Ashley (2015): bioRxiv preprint on lossy
% compression of quality scores and its effect on variant calling.
% The citation key and the article entry type are kept unchanged so existing
% citations keep resolving and rendering the same way.
% NOTE(review): the eprint field holds a full-text PDF URL rather than an archive
% identifier; it is left as found -- confirm the target style handles it sensibly.
@article{Ochoa029843,
  author        = {Ochoa, Idoia and Hernaez, Mikel and Goldfeder, Rachel and Weissman, Tsachy and Ashley, Euan},
  title         = {Effect of lossy compression of quality scores on variant calling},
  journal       = {bioRxiv},
  year          = {2015},
  publisher     = {Cold Spring Harbor Laboratory},
  elocation-id  = {029843},
  doi           = {10.1101/029843},
  url           = {https://www.biorxiv.org/content/early/2015/10/26/029843},
  eprint        = {https://www.biorxiv.org/content/early/2015/10/26/029843.full.pdf},
  abstract      = {Recent advancements in sequencing technology have led to a drastic reduction in the cost of genome sequencing. This development has generated an unprecedented amount of genomic data that must be stored, processed, and communicated. To facilitate this effort, compression of genomic files has been proposed. Specifically, lossy compression of quality scores is emerging as a natural candidate for reducing the growing costs of storage. A main goal of performing DNA sequencing in population studies and clinical settings is to identify genetic variation. Though the field agrees that smaller files are advantageous, the cost of lossy compression, in terms of variant discovery, is unclear. Bioinformatic algorithms to identify SNPs and INDELs from next-generation DNA sequencing data use base quality score information; here, we evaluate the effect of lossy compression of quality scores on SNP and INDEL detection. We analyze several lossy compressors introduced recently in the literature. Specifically, we investigate how the output of the variant caller when using the original data (uncompressed) differs from that obtained when quality scores are replaced by those generated by a lossy compressor. Using gold standard genomic datasets such as the GIAB (Genome In A Bottle) consensus sequence for NA12878 and simulated data, we are able to analyze how accurate the output of the variant calling is, both for the original data and that previously lossily compressed. We show that lossy compression can significantly alleviate the storage while maintaining variant calling performance comparable to that with the uncompressed data. Further, in some cases lossy compression can lead to variant calling performance which is superior to that using the uncompressed file. We envisage our findings and framework serving as a benchmark in future development and analyses of lossy genomic data compressors. The Supplementary Data can be found at http://web.stanford.edu/~iochoa/supplementEffectLossy.zip.},
}