...
#Additional test files for running PRSice (Section 8):
BMI_1kgph3_chr16_snps_summarystat_2024.txt
3.1- Base Data
We will use as the base data part of GWAS Anthropometric 2015 BMI summary statistics ( https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4382211/), made available by the GIANT consortium and were extracted from their online portal
...
Tip: If the SNPs do not have the same id between the target and base datasets, merge by chromosome, position (and alleles if the files include multi-allelic variants).
### To mark ambiguous G/C or A/T SNPs for removal
...
Code Block |
---|
bmikg1aa=bmikg1[as.character(bmikg1$A1) == bmikg1$V5 & as.character(bmikg1$A2) == bmikg1$V6,]
nrow(bmikg1aa) ##28,842 SNPs in scenario (i) |
### Scenario (ii): To find allele matches for flipped strand between base GWAS summary statistics file and target GWAS data -- beta will be unchanged
...
Code Block |
---|
bmikg1ab=bmikg1[bmikg1$A1 == "A" & bmikg1$A2 == "C" & bmikg1$V5 == "T" & bmikg1$V6 == "G",]
bmikg1ac=bmikg1[bmikg1$A1 == "A" & bmikg1$A2 == "G" & bmikg1$V5 == "T" & bmikg1$V6 == "C",]
bmikg1ad=bmikg1[bmikg1$A1 == "C" & bmikg1$A2 == "A" & bmikg1$V5 == "G" & bmikg1$V6 == "T",]
bmikg1ae=bmikg1[bmikg1$A1 == "C" & bmikg1$A2 == "T" & bmikg1$V5 == "G" & bmikg1$V6 == "A",]
bmikg1af=bmikg1[bmikg1$A1 == "G" & bmikg1$A2 == "A" & bmikg1$V5 == "C" & bmikg1$V6 == "T",]
bmikg1ag=bmikg1[bmikg1$A1 == "G" & bmikg1$A2 == "T" & bmikg1$V5 == "C" & bmikg1$V6 == "A",]
bmikg1ah=bmikg1[bmikg1$A1 == "T" & bmikg1$A2 == "C" & bmikg1$V5 == "A" & bmikg1$V6 == "G",]
bmikg1ai=bmikg1[bmikg1$A1 == "T" & bmikg1$A2 == "G" & bmikg1$V5 == "A" & bmikg1$V6 == "C",]
bmikg1a=rbind(bmikg1aa,bmikg1ab,bmikg1ac,bmikg1ad,bmikg1ae,bmikg1af,bmikg1ag,bmikg1ah,bmikg1ai) # to combine all the datasets created above + bmikg1aa with non-ambiguous SNPs
nrow(bmikg1a) ##28,854 SNPs in scenarios (i) & (ii) |
### To add column “B” for the beta to be used in polygenic risk scoring (note: “b” is the original beta)
...
Code Block |
---|
bmikg1ba=bmikg1[as.character(bmikg1$A1) == bmikg1$V6 & as.character(bmikg1$A2) == bmikg1$V5,]
nrow(bmikg1ba) ##26,883 SNPs in scenario (iii) |
### Scenario (iv): To find flipped strands but perfect allele switches between A1 and A2 (i.e., A1 (column “A1”) in base GWAS summary statistics file is the same as the complementary allele for A2 (column “V6”) in target GWAS data, and vice versa) -- beta will have opposite sign
...
Code Block |
---|
bmikg1bb=bmikg1[bmikg1$A1 == "A" & bmikg1$A2 == "C" & bmikg1$V5 == "G" & bmikg1$V6 == "T",]
bmikg1bc=bmikg1[bmikg1$A1 == "A" & bmikg1$A2 == "G" & bmikg1$V5 == "C" & bmikg1$V6 == "T",]
bmikg1bd=bmikg1[bmikg1$A1 == "C" & bmikg1$A2 == "A" & bmikg1$V5 == "T" & bmikg1$V6 == "G",]
bmikg1be=bmikg1[bmikg1$A1 == "C" & bmikg1$A2 == "T" & bmikg1$V5 == "A" & bmikg1$V6 == "G",]
bmikg1bf=bmikg1[bmikg1$A1 == "G" & bmikg1$A2 == "A" & bmikg1$V5 == "T" & bmikg1$V6 == "C",]
bmikg1bg=bmikg1[bmikg1$A1 == "G" & bmikg1$A2 == "T" & bmikg1$V5 == "A" & bmikg1$V6 == "C",]
bmikg1bh=bmikg1[bmikg1$A1 == "T" & bmikg1$A2 == "C" & bmikg1$V5 == "G" & bmikg1$V6 == "A",]
bmikg1bi=bmikg1[bmikg1$A1 == "T" & bmikg1$A2 == "G" & bmikg1$V5 == "C" & bmikg1$V6 == "A",]
bmikg1b=rbind(bmikg1ba,bmikg1bb,bmikg1bc,bmikg1bd,bmikg1be,bmikg1bf,bmikg1bg,bmikg1bh,bmikg1bi)
nrow(bmikg1b) ##26,898 SNPs in scenarios (iii) & (iv) |
### NOTE: Need to check for allelic mismatches beyond those assessed in the steps above (same strands or flipped strands, same alleles or switched alleles): additional checks would include checking for INDELs (insertions/deletions) and non-matching alleles (not included in this workshop)
### HERE: our data have 0 SNPs with mismatching A1 and A2 between GWAS summary statistics and target GWAS data
### NOTE: there are 0 INDELs with mismatching A1 and A2 between Base GWAS summary stat and target GWAS data
Code Block |
---|
nrow(bmikg1b)+nrow(bmikg1a) ##55,752 |
### ### B=-b for SNPs with switched alleles above (beta will have an opposite sign for Scenarios (iii) and (iv))
Code Block |
---|
bmikg1b$B=0-bmikg1b$b |
### same number of non-ambiguous SNPs that we began with.
Code Block |
---|
nrow(bmikg1b)+nrow(bmikg1a) ##55,752 |
### ***NOTE: Use column “V5” as Allele1 & column “V6” as Allele2 & column “B” as the effect estimate***
...
### As an example, we will use either (a) p-value threshold of 0.5, or (b) a range of p-value thresholds from base GWAS summary statistics to select SNPs for polygenic risk scoring in the target GWAS dataset
### ***NOTE: Use column “V5” as Allele1 & column “V6” as Allele2 & column “B” as the effect estimate***
### Example (a) p=<0.5
Code Block |
---|
bmikg3a=bmikg3[bmikg3$P<0.5,] bmikg3a1=subset(bmikg3a,select=c("SNP","V5","B")) nrow(bmikg3a) ##3254 # Renaming column headings from "SNP", "V5", and "B" to "SNP", "A1", and "Score" colnames(bmikg3a1)<-c("SNP","A1","Score") write.table(bmikg3a1,"1kgph3_chr16_test_clumped_1_threshold_5.raw",col.names=T,row.names=F,quote=F,sep='\t') |
...
Code Block |
---|
prs1phen=read.csv("test_clump_1_prs_5.csv",header=T)
head(prs1phen)
lmprs<-lm(data=prs1phen, dummybmi~SCORE + sex)
summary(lmprs) |
6.2- PLOTTING
#adding sex as a covariate here; consider adding population structure PCs as covariates
summary(lmprs) |
6.2- PLOTTING
Code Block |
---|
## Histogram showing distribution of |
Code Block |
## Histogram showing distribution of polygenic risk scores (shown for p-value threshold of 0.5 from Example (a))
pdf("Histogram_testscore_pT_5.pdf")
hist(prs1phen$SCORE, breaks=100, main = paste("Distribution of PRS scores (n =", length(prs1phen$SCORE), ")"))
dev.off()
## Scatterplot of BMI vs PRS (shown for p-value threshold of 0.5 from Example (a))
pdf("Scatterplot_dummybmi_vs_PRS_pT_5.pdf")
plot(prs1phen$SCORE,prs1phen$dummybmi)
abline(lm(prs1phen$dummybmi ~ prs1phen$SCORE), col = "red",
lty = 1, lwd = 1)
dev.off()
# Plots of prediction R-squared values vs p-value thresholds (for Example (b))
prs_files <- list.files(pattern = "1kgph3_chr16_profile_20200226.*.*.profile$");
prs_list <- lapply(prs_files, function(rr) read.table(rr, head=T))
r_squared<-NA
for (j in 1:length(prs_list)){
prs1phen=merge(prs_list[[j]] ,phen, by.x="FID", by.y="V1")
lmprs<-lm(data= prs1phen, dummybmi~SCORE + sex)
r_squared[j]<-summary(lmprs)$r.squared
}
p_threshold <- as.numeric(sapply(sapply(prs_files, function(rr) strsplit(rr, "_20200226*\\.")[[1]][[2]]), function(ee) strsplit(ee, ".profile")[[1]]))
## Lineplot of prediction R-squared values vs p-value thresholds
pdf("Lineplot_R-squared_vs_p-value_threshold.pdf")
plot(p_threshold, r_squared, xlab = "P-value threshold", ylab="Prediction R-squared")
lines(p_threshold, r_squared);
abline(v=p_threshold[which.max(r_squared)], col=2)
dev.off()
## Barplot of prediction R-squared values vs p-value thresholds -- colour gradient from blue for lowest R-squared value to red for highest R-squared value; legend showing R-squared values in the order of the p-value thresholds
rankfac <- rank(r_squared)
rbPal <- colorRampPalette(c('blue','red'))
Col <- rbPal(length(rankfac))[as.numeric(cut(rankfac,breaks = length(rankfac)))]
pdf("Barplot_R-squared_vs_p-value_threshold_bluetored.pdf")
barplot(r_squared, names.arg=p_threshold, xlab = "P-value threshold", ylab="Prediction R-squared",col=Col, legend=round(r_squared, digits=4), args.legend=list(title="R-squared"), xlim=c(0,12))
dev.off() |
...
Some additional codes to try (need to download and install required items following instructions):
8- Additional Commands (under development)
Running PRSice2
Code Block | ||
---|---|---|
| ||
###### PRSice2 (adapted from https://choishingwan.github.io/PRSice/) #### ##Some quality control steps before using summary statistics file in PRSice2 ## *** make sure to review any documents that come with the summary statistics to understand how the summary statistics were derived and what the column names stand for *** # In R: ss=read.table("BMI_1kgph3_chr16_snps_summarystat_2024.txt",header=T) nrow(ss) #[1] 68385 head(ss) summary(ss$INFO) #imputation quality summary(ss$N) #N for each SNP in source GWAS summary(ss$Freq1.Hapmap) #minor allele frequency for each SNP nrow(ss[ss$INFO<0.8,]) #[1] 199 nrow(ss[ss$N<0.5*max(ss$N),]) #[1] 2629 nrow(ss[ss$Freq1.Hapmap<=0.01 | is.na(ss$Freq1.Hapmap),]) #[1] 1590 #Some quality control for imputation quality, N, and minor allele frequency: ss1=ss[ss$INFO>0.8 & ss$N>0.5*max(ss$N) & ss$Freq1.Hapmap>0.01 & !is.na(ss$Freq1.Hapmap),] nrow(ss1) #[1] 64933 colnames(ss1)<-c("SNP","A1","A2","Freq1.Hapmap","BETA","SE","P","N","INFO") head(ss1) write.table(ss1,"BMI_1kgph3_chr16_snps_summarystat_2024_short.txt", col.names=T, row.names=F, quote=F, sep='\t') ##Now we are ready to try PRSice2! ##Need to set up the following modules in R for PRSice2 to run properly: module load lang/R/3.5.1-Python-3.8.5-Anaconda3-2020.11 R --vanilla #if R packages not already installed: install.packages("ggplot2") install.packages("optparse") #may need to install this package manually by downloading from https://cran.r-project.org/web/packages/optparse/index.html; install.packages(c("optparse")) install.packages("methods") install.packages("tools") install.packages("data.table") install.packages("grDevices") install.packages("RColorBrewer") #once R packages are installed: library(ggplot2) library(optparse) library(methods) library(tools) library(data.table) library(grDevices) library(RColorBrewer) #if using Linux, exit R (if using Windows: stay in R): q() #to exit R ## Linux -- Using our tutorial files: #may need to make PRSice_linux executable chmod u+x ./PRSice_linux # Rscript PRSice.R --prsice ./PRSice_linux --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_nocov2024 ## Windows (in R) -- Using our tutorial files: system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_nocov2024") ## Linux -- Adding a covariate: Rscript PRSice.R --prsice ./PRSice_linux --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_sexcov2024 ## Windows (in R) -- Adding a covariate: system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_sexcov2024") ## Check the log files to see what went on behind the scene. # How many SNPs went into the PRS calculations? #Hint: check the *.prsice file ## Now we check the output file with PRS (in R): prs=read.table("dummybmi_sexcov2024.best",header=T) pheno=read.table("1kgph3_dummybmi20200804.txt",header=T) nrow(prs) #[1] 476 nrow(pheno) #[1] 476 head(prs) head(pheno) data=merge(pheno,prs,by.x=c("V1","V2"),by.y=c("FID","IID")) nrow(data) #[1] 476 head(data) lmprs<-lm(data=data, dummybmi~PRS + sex) summary(lmprs) #notice the magnitude of the effect estimate mean(data$PRS) sd(data$PRS) data$stdprs<-scale(data$PRS) #standardizing the PRS to a mean of 0 and standard deviation of 1 mean(data$stdprs) sd(data$stdprs) lmprs<-lm(data=data, dummybmi~stdprs + sex) summary(lmprs) #notice how the magnitude of the effect estimate has changed and everything else stayed the same ## Scatterplot of BMI vs best PRS pdf("Scatterplot_dummybmi_vs_PRSice2_best_stdprs.pdf") plot(data$stdprs,data$dummybmi) abline(lm(data$dummybmi ~ data$stdprs), col = "red", lty = 1, lwd = 1) dev.off() ####################################################################### ## Additional Notes/Examples: # add "--perm 10000": *.summary file with additional column showing empirical p value # "--all-score": outputs PRS for all thresholds tested -- beware of large output file # "--quantile 10": generates plot and file for PRS quantiles (deciles in this case) # add "--gtf gene.gtf --msigdb set.txt --multi-plot 10" for PRSet --> outputs plot for top 10 gene sets (*.gtf file with genome boundary of each gene from MAGMA; set.txt with gene IDs in each gene set from MSigDB -- refer to https://www.gsea-msigdb.org/gsea/msigdb/index.jsp for gene set definitions # ##This is sample script (***Need to change the actual file names***): #Rscript workdir/PRSice.R --dir workdir --prsice ./workdir/PRSice_linux --base workdir/source_gwas_sumstat_info_9_prsicein.txt --target indir/target_gwas_cleaned_plink --bar-levels 5e-8,0.00001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 --seed 1234 --perm 10000 --fastscore --all-score --no-regress T --out outdir/target_gwas_cleaned_source_gwas_prs # #system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --bar-levels 0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 --fastscore --thread 1 --stat BETA --pvalue P --binary-target F --out dummybmi_sexcov2024setpvalthreshs") #system("Rscript.exe PRSice.R --prsice ./PRSice_win64.exe --base BMI_1kgph3_chr16_snps_summarystat_2024_short.txt --target 1kgph3_chr16 --pheno 1kgph3_dummybmi20200804.txt --pheno-col dummybmi --cov 1kgph3_dummybmi20200804.txt --cov-col sex --thread 1 --stat BETA --pvalue P --binary-target F --gtf NCBI37.3.gene.loc.gtf --msigdb c2.all.v2024.1.Hs.entrez.gmt --multi-plot 10 --out dummybmi_sexcov2024") |
Running PRS-CS
Code Block | ||
---|---|---|
| ||
## PRS-CS #need effective sample size of source GWAS (neff =2*N_cases*N_controls/(N_cases+N_controls)) module load lang/R/3.5.1-Python-3.8.5-Anaconda3-2020.11 R --vanilla library(ggplot2) library(optparse) library(method) library(tools) library(data.table) library(grDevices) library(RColorBrewer) q() Rscript workdir/PRSice.R --dir workdir --prsice ./workdir/PRSice_linux --base workdir/source_gwas_sumstat_info_9_prsicein.txt --target indir/target_gwas_cleaned_plink --bar-levels 5e-8,0.00001,0.00005,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 --seed 1234 --perm 10000 --fastscore --all-score --no-regress T --out outdir/target_gwas_cleaned_source_gwas_prs ## PRS-CS python workdir/PRScs/PRScs.py --ref_dir=workdir/ldblk_1kg_eur --bim_prefix=indir/target_gwas_cleaned_plink --sst_file=workdir/source_gwas_sumstat_prscsin.txt --n_gwas=[neff] --seed=1234 --out_dir=outdir/target_gwas_source_gwas_sumstat_prscsout # Merge sumstat outputs from PRS-CS chr1-22 in R: R mci<-c() mcall<-c() for (i in 1:22) { mci<-read.table(paste("outdir//target_gwas_source_gwas_sumstat_prscsout_pst_eff_a1_b0.5_phiauto_chr",i,".txt",sep=""),header=F) mcall<-rbind(mcall,mci) } colnames(mcall)<-c("CHR","SNP","BP","A1","A2","B") mcalla=subset(mcall,select=c("SNP","A1","B")) colnames(mcalla)<-c("SNP","A1","Score") write.table(mcalla,"outdir//target_gwas_source_gwas_sumstat_prscsout_pst_eff_a1_b0.5_phiauto_chr1-22.raw", row.names=F, col.names=T, quote=F, sep='\t') # Run polygenic scoring in PLINK: plink --bfile indir/target_gwas_cleaned_plink --score outdir/target_gwas_source_gwas_sumstat_prscsout_pst_eff_a1_b0.5_phiauto_chr1-22.raw --out outdir/target_gwas_source_gwas_sumstat_prscsout |
...
9- References
- Bulik-Sullivan BK, Loh PR, Finucane HK, Ripke S, Yang J; Schizophrenia Working Group of the Psychiatric Genomics Consortium, Patterson N, Daly MJ, Price AL, Neale BM. LD Score regression distinguishes confounding from polygenicity in genome-wide association studies. Nature Genetics, 2015; 47:291–295. PMID: 25642630 PMCID: PMC4495769 DOI: 10.1038/ng.3211
- Choi, S.W., Mak, T.S. & O’Reilly, P.F. Tutorial: a guide to performing polygenic risk score analyses. Nat Protoc. 2020; 15: 2759–2772. https://doi-org.myaccess.library.utoronto.ca/10.1038/s41596-020-0353-1
- Ge T, Chen C-Y, Ni Y, Feng Y-CA, Smoller JW. Polygenic prediction via Bayesian regression and continuous shrinkage priors. Nat Commun. 2019 Apr 16;10(1): 1776. PMID: 30992449 PMCID: PMC6467998 DOI:10.1038/s41467-019-09718-5
- Locke AE, Kahali B, Berndt SI, Justice AE, Pers TH, Day FR, Powell C, Vedantam S, Buchkovich ML, Yang J, Croteau-Chonka DC, Esko T et al. Genetic studies of body mass index yield new insights for obesity biology. Nature. 2015; 518:197-206. PMID: 25673413 PMCID: PMC4382211 DOI: 10.1038/nature14177
- Ni G, Zeng J, Revez JA, Wang Y, Zheng Z, Ge T, Restuadi R, Kiewa J, Nyholt DR, Coleman JRI, Smoller JW, Schizophrenia Working Group of the Psychiatric Genomics Consortium, Major Depressive Disorder Working Group of the Psychiatric Genomics Consortium, Yang J, Visscher PM, Wray NR. A comparison of ten polygenic score methods for psychiatric disorders applied across multiiple cohorts. Biol Psychiatry. 2021 Nov 1; 90(9): 611–620. PMID: 34304866 URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8500913/
- Pain O, Glanville KP, Hagenaars SP, Selzam S, Fürtjes AE, Gaspar HA, Coleman JRI, Rimfeld K, Breen G, Plomin R, Folkersen L, Lewis CM. Evaluation of polygenic prediction methodology within a reference-standardized framework. PLoS Genet. 2021 May 4;17(5):e1009021. doi: 10.1371/journal.pgen.1009021. eCollection 2021 May. PMID: 33945532 URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8121285/
- Speed D, Holmes J, Balding DJ. Evaluating and improving heritability models using summary statistics. Nature Genetics, 2020; 52: 458–462. PMID: 32203469 DOI: 10.1038/s41588-020-0600-y
...