Created by Genetics Working Group CAMH, last modified on October 27th, 2020
This is a guide for a genome-wide data cleaning and analysis in Plink.
(1) Apply quality control measures for genome-wide genotyping data sets;
(2) Perform population stratification analysis;
(3) Perform basic association analysis;
(4) Get familiarized with Plink and R environments.
## All material is located in the following folder:
https://support.scinet.utoronto.ca/education/go.php/542/file_storage/index.php
Please download all files and create a folder in your PC or cluster to run the analysis. Remember: plink must be stored in the same folder if running in your PC.
Anderson, C.A., et al. (2010) Data quality control in genetic #case-control association studies. Nat Protoc. 5:1564-1573. doi:10.1038/nprot.2010.116
Clarke, G.M., et al. (2011) Basic statistical analysis in genetic case-control studies. Nat Protoc. 6: 121-133. doi:10.1038/nprot.2010.182.
Weale, Michael. (2011, Sept. 23) Notes on GWAS QC/QA. from
https://sites.google.com/site/mikeweale/software/gwascode
Marees AT, de Kluiver H, Stringer S, et al. A tutorial on conducting genome-wide association studies: Quality control and statistical analysis. Int J Methods Psychiatr Res. 2018;27(2):e1608. doi:10.1002/mpr.1608
plink --file Raw_DATA --make-bed --out Raw_DATA_b36 |
## Visualize the 1Kgenomes .bim file using the command ‘head’ and then use “awk” to create a new file containing the columns #2 (Variant identifier) and #4 (base pair coordinate) of 1Kgenomes .bim file to use as build 37 SNP map
head 1KGPh3_b37fwd2019.bim awk '{print $2 , $4}' 1KGPh3_b37fwd2019.bim > 1KGPh3_b37fwd2019_snps_b37map.txt head 1KGPh3_b3Quality Control Steps7fwd2019_snps_b37map.txt |
## As the target dataset map is in build 36, use the 1KGPh3_b37fwd2019_snps_b37map.txt to filter variants present in 1K genomes dataset and update the map of the target dataset to build 37, using PLINK flags --extract and --update-map
plink --bfile Raw_DATA_b36 --extract 1KGPh3_b37fwd2019_snps_b37map.txt --update-map 1KGPh3_b37fwd2019_snps_b37map.txt --make-bed --out sample_b37 |
## Your map file is now on the build 37
plink --bfile sample_b37 --check-sex --out sample_sex |
## Interpretation:
# output produces the file "sample.sexcheck" with the following columns:
# FID IID PEDSEX SNPSEX STATUS F
# where “STATUS” is either “OK” or “PROBLEM”
## Use ‘grep’ in the terminal to select rows containing the pattern ‘PROBLEM’ and print these rows to a text file containing all the individuals, which failed in ‘sex check’ to be removed from dataset
grep "PROBLEM" sample_sex.sexcheck > fail-sexcheck-qc.txt |
## It is also possible to impute-sex using genotype of X-chromosome. If you choose to impute despite of removing individuals that failed sex-check, run the following command: # plink --bfile sample_b37 --impute-sex --make-bed --out sample_b37_sexqc
plink --bfile sample_b37 --missing --out check_individuals |
## Interpretation:
# Two output are generated: "sample.imiss" (individual) and "sample.lmiss" (locus).
# The file.imiss header: FID IID MISS_PHENO N_MISS N_GENO F_MISS
# where N_MISS is the number and F_MISS is the proportion of missing SNPs per individual
## Plot in R
IMISS<-read.table("check_individuals.imiss", header=T, as.is=T) pdf("individualcallrate.pdf") plot( (1:dim(IMISS)[1])/(dim(IMISS)[1]-1), sort(1-IMISS$F_MISS), main="Individual call rate cumulative distribution", xlab="Quantile", ylab="Call Rate" ); grid() dev.off() q() |
## The proportion of SNPs successfully genotyped is called Call rate (CR) and it can be obtained by sample or by SNP. Usually, only SNPs and individuals showing CR>95% or higher are kept in the sample. Select the threshold based on the analyses you intend to run.
## Let’s remove individuals with missing rate > 0.05 (CR<95%) use --mind 0.05 in Plink
plink --bfile sample_b37 --mind 0.05 --make-bed --out sample-remove-missing |
plink --bfile sample-remove-missing --het --out het |
## Interpretation:
# which produces the file het.het with the following columns
# file.het header: FID IID O(HOM) E(HOM) N(NM) F
# O(HOM) = observed # homozygous genos; N(NM) = nonmissing genos per individual
(ie. IBD>0.185) (Figure "IBSplot.pdf")
## For analysis of duplicate and related individuals, we will generate a series of temporary files. ## We will go back to "sample-remove-missing" when we have a list of unrelated and unique individuals.
## Exclude SNPs in high LD, and also snps from chr 23, 24, 25, and 26
## Visualize the last lines of the .bim file.
Plot data in R
(N(NM) – O(HOM)]/N(NM)= mean heterozygosity rate per individual)
het<-read.table("het.het", h=T) het$meanHet <- (het$N.NM. - het$O.HOM.)/het$N.NM. pdf("het-histogram.pdf") hist(het$meanHet, col="green", ylab="Heterozygosity Rate", xlab="Heterozygosity", main= "Distribution of heterozygosity per individual") linea<- mean(het$meanHet) + 2*sd(het$meanHet) lineb<- mean(het$meanHet) - 2*sd(het$meanHet) abline(v=linea, col="red") #red lines on the graph represent +/- 2 standard deviations from the mean abline(v=lineb, col="red") linea<- mean(het$meanHet) + 3*sd(het$meanHet) lineb<- mean(het$meanHet) - 3*sd(het$meanHet) abline(v=linea, col="blue") #blue lines on the graph represent +/-3 standard deviations from the mean abline(v=lineb, col="blue") dev.off() |
## (ie. IBD>0.185) (Figure "IBSplot.pdf")
## For analysis of duplicate and related individuals, we will generate a series of temporary files. ## We will go back to "sample-remove-missing" when we have a list of unrelated and unique individuals.
## Exclude SNPs in high LD, and also snps from chr 23, 24, 25, and 26
## Visualize the last lines of the .bim file.
tail sample-remove-missing.bim |
## Use ‘awk’ to make a text file with SNPs mapped on chromosomes 1 to 22:
awk '{ if ($1 == 1 || $1 == 22) print $2 }' sample-remove-missing.bim > snp.txt |
## Use ‘--extract and snp.txt’ to make new binary files containing only autosomal SNPs
plink --bfile sample-remove-missing --extract snp.txt --make-bed --out sample-remove-missing-chr |
## Use ‘tail’ to confirm non-autosomal SNPs were removed
tail sample-remove-missing-chr.bim |
## The file highLDregions4bim_b37_24JoK.txt contains coordinate ranges of high LD regions of the genome. These regions must be excluded before pruning process. Use ‘--exclude highLDregions4bim_b37_24JoK.txt --range’ to remove SNPs in these regions during pruning and use ‘--indep-pairwise 50 5 0.2’ to prune the target dataset. The parameters ‘50 5 0.2’ stand respectively for: the window size, the number of SNPs to shift the window at each step, and the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously.
plink --bfile sample-remove-missing-chr --exclude highLDregions4bim_b37_24JoK.txt --range --indep-pairwise 50 5 0.2 --out prune |
## Interpretation:
## It produces two files prune.prune.in and prune.prune.out. Prune.prune.in file contains independent SNPs only, which can be used to calculate relatedness in the target dataset.
## Generate pairwise IBS for all individuals
## Use ‘--extract prune.prune.in’ and ‘--genome’ to calculate IBS using only independent SNPs.
plink --bfile sample-remove-missing-chr --extract prune.prune.in --genome --out IBS |
## Interpretation:
## IBS.genome file header: FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
## PI_HAT= proportion (IBD) per pair of individuals (P(IBD=2)+0.5*P(IBD=1))
## Typically, one individual in a pair that achieved an IBD of over 0.1875 (ie. halfway between 1 = #duplicates /monozygotic twins, and 0.5= first degree relatives) is removed
IBS<-read.table("IBS.genome", h=T) pdf("IBSplot.pdf") hist(IBS$PI_HAT, col="green", breaks=100, xlab= "Estimated mean pairwise IBD", main="Pairwise Identity-by-descent") abline(v=c(0.185)) #related individuals are assumed to have a pairwise IBD of over 0.185 dev.off() q() |
## Use the following commands to create a file with pairs of subjects related (PI_HAT>0.2)
awk '{ if ($8 >0.2) print $0 }' IBS.genome > pihat_min0.2.txt |
## For each pair of 'related' individuals with a pihat > 0.2, we recommend to remove the individual with the lowest call rate, but you also can choose to remove individuals with poor phenotype quality for example. write the individuals to be removed in a text file, suc as fail_pihat.txt
## If you choose to filter by CR, you can use an UNIX text editor (e.g., vi(m) ) to check which individual has the lowest CR in the 'related pair' and include it in the list of individuals to be removed. Use the ‘--missing’ command to check CR.
## Extract independent SNPs from target dataset by using ‘--extract’ and the file prune.prune.in generated above.
plink --bfile sample-remove-missing-chr --extract prune.prune.in --make-bed --out sample-remove-missing-chr-pruned |
## Observe that the dataset is smaller now for population stratification cleaning
## Preparing 1000G (keep only SNPs in common with data set pruned)
## Merge study sample with 1000G and keep only SNPs with CR >0.03
plink --bfile sample-remove-missing-chr-pruned --bmerge 1KGP_Phase3_Pruned_forMDS_2019.bed 1KGP_Phase3_Pruned_forMDS_2019.bim 1KGP_Phase3_Pruned_forMDS_2019.fam --geno 0.03 --make-bed --out sample_1000G_merged_tmp plink --bfile sample_1000G_merged_tmp --genome --out sample_1000G_merged_ibd |
## Check Eigenvalues in "MDS.mds.eigvals" file.
plink --bfile sample_1000G_merged_tmp --read-genome sample_1000G_merged_ibd.genome --cluster --mds-plot 10 eigvals --out MDS |
## Split 1000G populations
plink --bfile 1KGP_Phase3_Pruned_forMDS_2019 --keep 1KGP3_eur20170320.txt --make-bed --out 1000G_europeans plink --bfile 1KGP_Phase3_Pruned_forMDS_2019 --keep 1KGP3_afr20170320.txt --make-bed --out 1000G_africans plink --bfile 1KGP_Phase3_Pruned_forMDS_2019 --keep 1KGP3_eas20170320.txt --make-bed --out 1000G_asians |
## plotting with ggplot2 package in R
install.packages("ggplot2") library(ggplot2) pcas<-read.table("MDS.mds", header=TRUE) sample.pop<-read.table("sample_1000G_merged_tmp.fam", header=FALSE) eur<-read.table("1000G_europeans.fam", header=FALSE) afr<-read.table("1000G_africans.fam", header=FALSE) asn<-read.table("1000G_asians.fam", header=FALSE) mds<-read.table("MDS.mds", h=T) pop<-merge(sample.pop, mds, by.x=c("V1","V2"),by.y=c("FID","IID")) pop1<-merge(eur, mds, by.x=c("V1","V2"),by.y=c("FID","IID")) pop2<-merge(asn, mds, by.x=c("V1","V2"),by.y=c("FID","IID")) pop3<-merge(afr, mds, by.x=c("V1","V2"),by.y=c("FID","IID")) ## Population Structure Plot for all populations (Figure "sample_population-all.pdf") pdf("sample_population-all.pdf") ggplot(pop1, aes(C1,C2)) + geom_point(shape=21,colour = "lightBlue", fill="lightBlue", size = 3) + geom_point(shape=21,data=pop, colour = "white",fill = "black", size = 3, alpha = 0.5) + geom_point(shape=21,data=pop2, colour = "maroon",fill = "maroon", size = 3, alpha = 0.5)+ geom_point(shape=21,data=pop3, colour = "lightGreen",fill = "lightGreen", size = 3, alpha = 0.5) + annotate("text", x=0.15,y=-0.1,label = "bold(Sample)",color = "black",parse = TRUE)+ annotate("text", x=0.15,y=-0.11,label = "bold(CEU)",color = "lightBlue",parse = TRUE)+ annotate("text", x=0.15,y=-0.12,label = "bold(CHB+JPT)",color = "maroon", parse = TRUE)+ annotate("text", x=0.15,y=-0.13,label = "bold(AFR)",color = "lightGreen",parse = TRUE) dev.off() |
## Now that we have the list of individuals that fail QC steps, we will return to the target dataset in the build37 with all the chromosomes and CR>95% (sample-remove-missing.bed .bim .fam) and remove these individuals from the sample by using the flag ‘--remove’. Then we will decide the best thresholds for SNP quality according to our final goal. In the following ‘notes’, you can see some examples of parameters of SNP quality for performing a simple GWAS.
## Notes:
## --hwe 0.000001 - markers with HWE P values less than 0.000001 (select case/control)
## --maf 0.01 - markers with a minor allele frequency less than 0.01
## --geno 0.01 - markers with a call-rate threshold of less than 1% (see plot)
## Keep in mind that you can choose different parameters according to your goals.
## Identify markers with excessive missing data rate and determine threshold (Figure "SNPcoverage.pdf")
plink --bfile sample-remove-missing --remove fail_inds.txt --mind 0.02 --missing --out lmiss |
## Interpretation:
## It produces the files .imiss and .lmiss
## The lmiss file header: CHR SNP N_MISS N_GENO F_MISS
## Where N_MISS is the number of individuals missing this SNP and F_MISS is the proportion of the sample missing for this SNP
## Plot in R
x<-read.table("lmiss.lmiss", header=T, as.is=T) pdf("SNPcoverage.pdf") plot( (1:dim(x)[1])/(dim(x)[1]-1), sort(1-x$F_MISS), main="SNP coverage cumulative distribution", xlab="Quantile", ylab="Coverage" ); grid() dev.off() q() |
## SNPs that fail this step will be removed using the --geno flag (find the threshold)
## Remove all markers and samples failing quality control (by default the --hwe option in plink only filters for controls)
plink --bfile sample-remove-missing --remove fail_inds.txt --mind 0.02 --hwe 1e-6 --maf 0.01 --geno 0.01 --make-bed --out sample_clean_final_temp |
## This second HWE step only focuses on cases because in the controls all SNPs with a HWE p-value < hwe 1e-6 were already been removed
/plink --bfile sample_clean_final_temp --hwe 1e-10 --hwe-all --make-bed --out sample_clean_final |
## QC is ready now!
R x<-read.table("res.assoc.logistic", h=T, as.is=T) #or use clean-DATA.assoc.linear source('qq_plot_v5.R') pdf("qqPlot_data_clean_final.assoc.logistic.pdf") qq.plot(x[ ,9], xlab= "Expected P value", ylab= "Observed P value", main="qqPlot for SNPs in clean-DATA") dev.off() |
data<-read.table("res.assoc.logistic", h=T, as.is=T) #or use clean-DATA.assoc.linear source('manhattan_v1.R') pdf("Manhattan_res.assoc.logistic.pdf") d<-subset(data, data$TEST=="ADD") X<-data.frame(CHR=d$CHR, BP=d$BP, P=d$P) manhattan( X, GWthresh=5e-8, GreyZoneThresh=1e-5, DrawGWline=TRUE ) dev.off() |
unadj1<-read.table("res.assoc.logistic", h=T, as.is=T) unadj2=unadj1[which(unadj1$TEST==c("ADD")),] significant=unadj2[which(unadj2$P<0.0005),] significant write.table(significant, "case-control-res.txt", col.names=TRUE, row.names=FALSE, quote=FALSE) q() |