% Pare, Mao and Deng (2017): bioRxiv preprint on a machine-learning heuristic
% (MLH) for polygenic gene scores. Citation key: Pare107409.
% Notes for maintainers:
%   - `eprint` is reserved for archive identifiers, so the full-text PDF link
%     is kept in the non-standard (style-ignored) `pdfurl` field instead.
%   - In the abstract, the tilde and the R-squared are written in math mode so
%     they typeset as intended (a bare tilde is a non-breaking space in LaTeX).
@article{Pare107409,
  author        = {Pare, Guillaume and Mao, Shihong and Deng, Wei Q.},
  title         = {A machine-learning heuristic to improve gene score prediction of polygenic traits},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2017},
  elocation-id  = {107409},
  doi           = {10.1101/107409},
  url           = {https://www.biorxiv.org/content/early/2017/02/09/107409},
  pdfurl        = {https://www.biorxiv.org/content/early/2017/02/09/107409.full.pdf},
  abstract      = {The advent of precision medicine is largely dependent on the availability of accurate and highly predictive gene scores. While progress has been made identifying genetic determinants of polygenic traits, the phenotypic variance explained by gene scores derived from genome-wide associations remains modest. Machine-learning techniques have proven very useful for solving a broad range of prediction problems, yet are not widely applied to complex traits prediction using gene scores. We propose a novel machine-learning heuristic (MLH) to improve the predictive performance of gene scores. It is based on two innovative features. We first apply gradient boosted regression trees models to leverage a large number of SNPs and optimize the weights of individual SNPs included in the gene scores. We show a calibration set sample size of $\sim$200 individuals is sufficient for optimal performance. We then correct for linkage disequilibrium (LD) between SNPs using a novel procedure, enabling retention of all SNPs in the gene score irrespective of LD. Our novel heuristic yielded a prediction $R^2$ of 0.237, 0.082 for height and BMI using GIANT summary association statistics in the UKBiobank study (N=130K; 1.98M SNPs), explaining 46.6\% and 32.6\% of the overall polygenic variance, respectively. Corresponding area under the ROC was 0.602 for diabetes in the UKBiobank using DIAGRAM association statistics. MLH outperformed other gene score heuristics for height and BMI and was equivalent to LDpred for diabetes. Results were independently validated in participants of the HRS (N=8,292) study. Our report demonstrates the potential of machine-learning methods for polygenic trait prediction. Our method has wide-ranging applications, from predicting medically important traits to creating stronger instrumental variables for Mendelian randomization studies.},
}