#################################################################################################################
##### EasyQC-script to perform study-level and meta-level QC on imputed 1000G data
##### EasyQC version: 9.0
##### Programmer: Thomas Winkler, 2014-09-22
##### Contact: thomas.winkler@klinik.uni-regensburg.de
#################################################################################################################
## Layout of this file: one statement per line; comments stand on their own lines; the parameters of long
## commands continue on tab-indented lines directly below the command keyword.

### Please DEFINE here format and input columns of the following EASYIN files
DEFINE --pathOut /path2output/results
	--strMissing .
	--strSeparator TAB
	--acolIn SNP;CHR;POS;STRAND;EFFECT_ALLELE;OTHER_ALLELE;N;EAF;BETA;SE;PVAL;IMPUTATION
	--acolInClasses character;character;integer;character;character;character;numeric;numeric;numeric;numeric;numeric;numeric
	--acolNewName SNP;CHR;POS;STRAND;EFFECT_ALLELE;OTHER_ALLELE;N;EAF;BETA;SE;PVAL;IMPUTATION
## Please DO NOT CHANGE --acolNewName values because these reflect the column names used throughout the script
## If the study used different column names, please amend the respective value at --acolIn, the column will then
## be automatically renamed to the respective --acolNewName value

### Please DEFINE here all input files:
EASYIN --fileIn /path2input/GWAS1000G.STUDY1.file1.txt.gz
EASYIN --fileIn /path2input/GWAS1000G.STUDY1.file2.txt.gz
EASYIN --fileIn /path2input/GWAS1000G.STUDY1.file3.txt.gz
## ...

#################################################################################################################
## EASYQC Scripting interface:
START EASYQC

####################
## 1. Sanity checks:
## Missing values in any required column:
CLEAN --rcdClean is.na(EFFECT_ALLELE)&is.na(OTHER_ALLELE) --strCleanName numDrop_Missing_Alleles
CLEAN --rcdClean is.na(PVAL) --strCleanName numDrop_Missing_P
CLEAN --rcdClean is.na(BETA) --strCleanName numDrop_Missing_BETA
CLEAN --rcdClean is.na(SE) --strCleanName numDrop_Missing_SE
CLEAN --rcdClean is.na(EAF) --strCleanName numDrop_Missing_EAF
CLEAN --rcdClean is.na(N) --strCleanName numDrop_Missing_N
CLEAN --rcdClean is.na(IMPUTATION) --strCleanName numDrop_Missing_Imputation
## Values outside their valid range:
CLEAN --rcdClean PVAL<0|PVAL>1 --strCleanName numDrop_invalid_PVAL
CLEAN --rcdClean SE<=0|SE==Inf|SE>=10 --strCleanName numDrop_invalid_SE
CLEAN --rcdClean abs(BETA)>=10 --strCleanName numDrop_invalid_BETA
CLEAN --rcdClean EAF<0|EAF>1 --strCleanName numDrop_invalid_EAF
CLEAN --rcdClean IMPUTATION<0 --strCleanName numDrop_invalid_IMPUTATION

## This is important for data reduction, because some studies report an unnecessarily large number of
## significant digits
EDITCOL --rcdEditCol signif(EAF,4) --colEdit EAF
EDITCOL --rcdEditCol signif(BETA,4) --colEdit BETA
EDITCOL --rcdEditCol signif(SE,4) --colEdit SE
EDITCOL --rcdEditCol signif(PVAL,4) --colEdit PVAL

####################
## 2. Prepare files for filtering and apply minimum thresholds:
## Exclude monomorphic SNPs:
CLEAN --rcdClean (EAF==0)|(EAF==1) --strCleanName numDrop_Monomorph
## Create column with minor allele count (MAC = 2 * min(EAF, 1-EAF) * N, kept to 4 significant digits):
ADDCOL --rcdAddCol signif(2*pmin(EAF,1-EAF)*N,4) --colOut MAC
## If you do not want to apply filters at this stage, please comment out the following rows or amend the
## filter thresholds according to your needs.
CLEAN --rcdClean N<30 --strCleanName numDrop_Nlt30
CLEAN --rcdClean MAC<=6 --strCleanName numDrop_MAClet6
CLEAN --rcdClean (!is.na(IMPUTATION))&IMPUTATION<0.3 --strCleanName numDrop_lowImpQual

####################
## 3. Harmonization of allele coding (I/D)
## The aim of this step is to compile uniform allele codes A/C/G/T or I/D from different versions of the
## given alleles
HARMONIZEALLELES --colInA1 EFFECT_ALLELE --colInA2 OTHER_ALLELE

####################
## 4. Harmonization of marker names (compile 'cptid')
CREATECPTID --fileMap /path2ref/rsmid_map.1000G_ALL_p1v3.merged_mach_impute.v1.txt.gz
	--colMapMarker rsmid
	--colMapChr chr
	--colMapPos pos
	--colInMarker SNP
	--colInA1 EFFECT_ALLELE
	--colInA2 OTHER_ALLELE
	--colInChr CHR
	--colInPos POS
## TO DO: Define the path to the reference file 'rsmid_map.1000G_ALL_p1v3.merged_mach_impute.v1.txt.gz' at --fileMap.
## The mapping file can be found on our website www.genepi-regensburg.de/easyqc.
## In case CHR or POS are not given in the input files, please remove "--colInChr CHR" and "--colInPos POS" from the
## command and remove "CHR;POS;" from --acolIn and --acolNewName as well as the respective "character;integer;"
## from --acolInClasses

####################
## 5. Filter duplicate SNPs
## This will count duplicates and throw out the SNP with the lower sample size:
CLEANDUPLICATES --colInMarker cptid --strMode samplesize --colN N
## The duplicates are written to the output in a separate file "*duplicates.txt"

####################
## 6. AF Checks
### TO DO: Define the path to the reference file 'allelefreq.1000G_[ANCESTRY]_p1v3.impute_legends.noMono.noDup.noX.v2.gz' at --fileRef:
### Please use the reference file ancestry that matches the ancestry of the study
MERGE --colInMarker cptid
	--fileRef /path2ref/allelefreq.1000G_EUR_p1v3.impute_legends.noMono.noDup.noX.v2.gz
	--acolIn cptid;a0;a1;eaf
	--acolInClasses character;character;character;numeric
	--strRefSuffix .ref
	--colRefMarker cptid
	--blnWriteNotInRef 1
## The reference columns are addressed below with the --strRefSuffix appended (a0.ref, a1.ref, eaf.ref).
ADJUSTALLELES --colInStrand STRAND
	--colInA1 EFFECT_ALLELE
	--colInA2 OTHER_ALLELE
	--colInFreq EAF
	--colInBeta BETA
	--colRefA1 a0.ref
	--colRefA2 a1.ref
	--blnMetalUseStrand 1
	--blnRemoveMismatch 1
	--blnRemoveInvalid 1
## All mismatches will be removed (e.g. A/T in input, A/C in reference)
AFCHECK --colInFreq EAF --colRefFreq eaf.ref --numLimOutlier 0.2 --blnPlotAll 0
## With --blnPlotAll 0, only outlying SNPs with |Freq-Freq.ref|>0.2 are plotted (much less computation time)

####################
## 7. Rearrange columns and Write CLEANED output
GETCOLS --acolOut cptid;SNP;STRAND;EFFECT_ALLELE;OTHER_ALLELE;EAF;IMPUTATION;BETA;SE;PVAL;N;MAC
WRITE --strPrefix CLEANED. --strMissing . --strMode gz

####################
## 8. Plot Z versus P
PZPLOT --colBeta BETA --colSe SE --colPval PVAL

####################
## 9. QQ plot
QQPLOT --acolQQPlot PVAL --numPvalOffset 0.05 --strMode subplot

####################
## 10. Summary Stats post-QC
## N_max is the largest non-missing N of the file; it is reused as the x-axis of the two plots below.
CALCULATE --rcdCalc max(N,na.rm=T) --strCalcName N_max
GC --colPval PVAL --blnSuppressCorrection 1
## GC-PLOT: Lambda.PVAL.GC against N_max, with horizontal guide lines at 1 (orange) and 1.1 (red).
RPLOT --rcdRPlotX N_max
	--rcdRPlotY Lambda.PVAL.GC
	--arcdAdd2Plot abline(h=1,col='orange');abline(h=1.1,col='red')
	--strAxes lim(0,NULL,0,NULL)
	--strPlotName GC-PLOT

####################
## 11. SE-N Plot - Trait transformation
CALCULATE --rcdCalc median(SE,na.rm=T) --strCalcName SE_median
CALCULATE --rcdCalc median(1/sqrt(2*EAF*(1-EAF)), na.rm=T) --strCalcName c_trait_transf
## SEN-PLOT: c_trait_transf/SE_median against sqrt(N_max), with the identity line (orange) as guide.
RPLOT --rcdRPlotX sqrt(N_max)
	--rcdRPlotY c_trait_transf/SE_median
	--arcdAdd2Plot abline(0,1,col='orange')
	--strAxes zeroequal
	--strPlotName SEN-PLOT

STOP EASYQC
#################################################################################################################
#################################################################################################################