% bioRxiv preprint (2015) describing the Rail-dbGaP protocol and the Rail-RNA v0.2 tool.
% The braces around {Rail-dbGaP} in the title keep its mixed case under sentence-casing styles.
@article{Nellore035287,
  author        = {Nellore, Abhinav and Wilks, Christopher and Hansen, Kasper D and Leek, Jeffrey T and Langmead, Ben},
  title         = {{Rail-dbGaP}: a protocol and tool for analyzing protected genomic data in a commercial cloud},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2015},
  elocation-id  = {035287},
  doi           = {10.1101/035287},
  url           = {https://www.biorxiv.org/content/early/2015/12/24/035287},
  eprint        = {https://www.biorxiv.org/content/early/2015/12/24/035287.full.pdf},
  abstract      = {Motivation: Public archives contain thousands of trillions of bases of valuable sequencing data. More than 40\% of the Sequence Read Archive is human data protected by provisions such as dbGaP. To analyze dbGaP-protected data, researchers must typically work with IT administrators and signing officials to ensure all levels of security are implemented at their institution. This is a major obstacle, impeding reproducibility and reducing the utility of archived data. Results: We present a protocol and software tool for analyzing protected data in a commercial cloud. The protocol is applicable to any MapReduce tool running on Amazon Web Services. The tool, Rail-RNA v0.2, is a spliced aligner for RNA-seq data, which we demonstrate by running on 9,662 samples from the dbGaP-protected GTEx consortium dataset. These are important first steps toward making it easy for typical biomedical investigators to study protected data, regardless of their local IT resources or expertise. Availability: Rail-RNA is available from http://rail.bio, and detailed instructions on running Rail-RNA on dbGaP-protected data using Amazon Web Services are available at http://docs.rail.bio/dbgap/. Contact: anellore@gmail.com, langmea@cs.jhu.edu},
}