...
#Additional test files for running PRSice (Section 8):
BMI_1kgph3_chr16_snps_summarystat_2024.txt
3.1- Base Data
We will use as the base data part of GWAS Anthropometric 2015 BMI summary statistics ( https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4382211/), made available by the GIANT consortium and were extracted from their online portal
...
Code Block | ||
---|---|---|
| ||
#### PRSice2 (adapted from https://choishingwan.github.io/PRSice/) #### ##Some quality control steps before using summary statistics file in PRSice2 ## *** make sure to review any documents that come with the summary statistics to understand how the summary statistics were derived and what the column names stand for *** # In R: ss=read.table("BMI_1kgph3_chr16_snps_summarystat_2024.txt",header=T) nrow(ss) #[1] 68385 head(ss) summary(ss$INFO) #imputation quality summary(ss$N) #N for each SNP in source GWAS summary(ss$Freq1.Hapmap) #minor allele frequency for each SNP nrow(ss[ss$INFO<0.8,]) #[1] 199 nrow(ss[ss$N<0.5*max(ss$N),]) #[1] 2629 nrow(ss[ss$Freq1.Hapmap<=0.01 | is.na(ss$Freq1.Hapmap),]) #[1] 1590 #Some quality control for imputation quality, N, and minor allele frequency: ss1=ss[ss$INFO>0.8 & ss$N>0.5*max(ss$N) & ss$Freq1.Hapmap>0.01 & !is.na(ss$Freq1.Hapmap),] nrow(ss1) #[1] 64933 colnames(ss1)<-c("SNP","A1","A2","Freq1.Hapmap","BETA","SE","P","N","INFO") head(ss1) write.table(ss1,"BMI_1kgph3_chr16_snps_summarystat_2024_short.txt", col.names=T, row.names=F, quote=F, sep='\t') ##Now we are ready to try PRSice2! ##Need to set up the following modules in R for PRSice2 to run properly: module load lang/R/3.5.1-Python-3.8.5-Anaconda3-2020.11 R --vanilla library(ggplot2) library(optparse#if R packages not already installed: install.packages("ggplot2") install.packages("optparse") #may need to install this package manually library(methods) library(tools) library( by downloading from https://cran.r-project.org/web/packages/optparse/index.html; install.packages(c("optparse")) install.packages("methods") install.packages("tools") install.packages("data.table") install.packages("grDevices") install.packages("RColorBrewer") #once R packages are installed: library(ggplot2) library(optparse) library(methods) library(tools) library(data.table) library(grDevices) library(RColorBrewer) #if using Linux, exit R (if using Windows: stay in R): q() #to exit R ## Linux -- Using our tutorial files: Rscript PRSice.R --prsice ./#may need to make PRSice_linux executable chmod u+x ./PRSice_linux # Rscript PRSice.R --prsice ./PRSice_linux --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_nocov2024 ## Windows (in R) -- Using our tutorial files: system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_nocov2024") ## Linux -- Adding a covariate: Rscript PRSice.R --prsice ./PRSice_linux --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_sexcov2024 ## Windows (in R) -- Adding a covariate: system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_sexcov2024") ## Check the log files to see what went on behind the scene. # How many SNPs went into the PRS calculations? #Hint: check the *.prsice file ## Now we check the output file with PRS (in R): prs=read.table("dummybmi_sexcov2024.best",header=T) pheno=read.table("1kgph3_dummybmi20200804.txt",header=T) nrow(prs) #[1] 476 nrow(pheno) #[1] 476 head(prs) head(pheno) data=merge(pheno,prs,by.x=c("V1","V2"),by.y=c("FID","IID")) nrow(data) #[1] 476 head(data) lmprs<-lm(data=data, dummybmi~PRS + sex) summary(lmprs) #notice the magnitude of the effect estimate mean(data$PRS) sd(data$PRS) data$stdprs<-scale(data$PRS) #standardizing the PRS to a mean of 0 and standard deviation of 1 mean(data$stdprs) sd(data$stdprs) lmprs<-lm(data=data, dummybmi~stdprs + sex) summary(lmprs) #notice how the magnitude of the effect estimate has changed and everything else stayed the same ## Scatterplot of BMI vs best PRS pdf("Scatterplot_dummybmi_vs_PRSice2_best_stdprs.pdf") plot(data$stdprs,data$dummybmi) abline(lm(data$dummybmi ~ data$stdprs), col = "red", lty = 1, lwd = 1) dev.off() ####################################################################### ## Additional Notes/Examples: # add "--perm 10000": *.summary file with additional column showing empirical p value # "--all-score": outputs PRS for all thresholds tested -- beware of large output file ##This is sample script (***Need to change the actual file names***): #Rscript workdir/PRSice.R --dir workdir --prsice ./workdir/PRSice_linux --base workdir/source_gwas_sumstat_info_9_prsicein# "--quantile 10": generates plot and file for PRS quantiles (deciles in this case) # add "--gtf gene.gtf --msigdb set.txt --multi-plot 10" for PRSet --> outputs plot for top 10 gene sets (*.gtf file with genome boundary of each gene from MAGMA; set.txt with gene IDs in each gene set from MSigDB -- refer to https://www.gsea-msigdb.org/gsea/msigdb/index.jsp for gene set definitions # ##This is sample script (***Need to change the actual file names***): #Rscript workdir/PRSice.R --dir workdir --prsice ./workdir/PRSice_linux --base workdir/source_gwas_sumstat_info_9_prsicein.txt --target indir/target_gwas_cleaned_plink --bar-levels 5e-8,0.00001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 --seed 1234 --perm 10000 --fastscore --all-score --no-regress T --out outdir/target_gwas_cleaned_source_gwas_prs # #system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target indir/target_gwas_cleaned_plink1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --barpheno-levelscol dummybmi 5e-8,0.00001,0.00005,0.0001,0.0005,0.--cov 1kgph3_dummybmi20200804.txt --cov-col sex --bar-levels 0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 --seed 1234fastscore --permthread 100001 --fastscorestat BETA --all-scorepvalue P --nobinary-regresstarget TF --out outdir/target_gwas_cleaned_source_gwas_prs ## PRSice2 within R on Windows (from tutorial): system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base TOY_BASE_GWAS.assoc --target TOY_TARGET_DATAdummybmi_sexcov2024setpvalthreshs") #system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --thread 1 --stat OR BETA --pvalue P --binary-target T") F --gtf NCBI37.3.gene.loc.gtf --msigdb c2.all.v2024.1.Hs.entrez.gmt --multi-plot 10 --out dummybmi_sexcov2024") |
Running PRS-CS
Code Block | ||
---|---|---|
| ||
## PRS-CS #need effective sample size of source GWAS (neff =2*N_cases*N_controls/(N_cases+N_controls)) python workdir/PRScs/PRScs.py --ref_dir=workdir/ldblk_1kg_eur --bim_prefix=indir/target_gwas_cleaned_plink --sst_file=workdir/source_gwas_sumstat_prscsin.txt --n_gwas=[neff] --seed=1234 --out_dir=outdir/target_gwas_source_gwas_sumstat_prscsout # Merge sumstat outputs from PRS-CS chr1-22 in R: R mci<-c() mcall<-c() for (i in 1:22) { mci<-read.table(paste("outdir//target_gwas_source_gwas_sumstat_prscsout_pst_eff_a1_b0.5_phiauto_chr",i,".txt",sep=""),header=F) mcall<-rbind(mcall,mci) } colnames(mcall)<-c("CHR","SNP","BP","A1","A2","B") mcalla=subset(mcall,select=c("SNP","A1","B")) colnames(mcalla)<-c("SNP","A1","Score") write.table(mcalla,"outdir//target_gwas_source_gwas_sumstat_prscsout_pst_eff_a1_b0.5_phiauto_chr1-22.raw", row.names=F, col.names=T, quote=F, sep='\t') # Run polygenic scoring in PLINK: plink --bfile indir/target_gwas_cleaned_plink --score outdir/target_gwas_source_gwas_sumstat_prscsout_pst_eff_a1_b0.5_phiauto_chr1-22.raw --out outdir/target_gwas_source_gwas_sumstat_prscsout |
...