## R code illustrating a processing pipeline for manuscript "Separating wheat from the chaff: a prioritisation pipeline for the analysis of metabolomics datasets" ## A. Jankevics, M.E. Merlo, M. de Vries, R.J. Vonk, E. Takano, R. Breitling ## Please send any comments and questions on the code to Rainer.Breitling@glasgow.ac.uk (Rainer Breitling) and a.jankevics@rug.nl (Andris Jankevics) ## mzmatch.R library must be installed. ## Installation instructions and comprehensive tutorial on the mzMatch software is available at: ## http://mzmatch.sourceforge.net/tutorial.mzmatch.r.advanced.html ## Data files in mzXML format used in this script are available at: ## http://mzmatch.sourceforge.net/metabolomics.html library (mzmatch.R) mzmatch.init (6000) ## Set folder in which you data files are located setwd ("/home/DATA/Darbam/DATI/OrbiTrap/Standards/datafiles") ## list all files in folder including subfolders mzXMLfiles <- dir (full.names=TRUE,pattern="\\.mzXML$",recursive=TRUE) outputfiles <- paste(sub("\\.mzXML","",mzXMLfiles),".peakml",sep="") ## Use XCMS for peak picking and export individual peakml file afterwards for (i in 1:length(mzXMLfiles)) { xset <- xcmsSet(mzXMLfiles[i], method='centWave', ppm=5, peakwidth=c(10,50), snthresh=1, prefilter=c(3,100), integrate=2, mzdiff=-0.00005, verbose.columns=TRUE,fitgauss=TRUE,mzCenterFun="wMean") PeakML.xcms.write.SingleMeasurement (xset=xset,filename=outputfiles[i],ionisation="detect",addscans=2,writeRejected=FALSE,ppm=10) } ## List folders of different measurement conditions (Biological_Sample_C18, Biological_Sample_HILIC, Standards_mixture_C18 and Standards_Mixture_HILIC). Furthemore 4 data sets will be processed recursively by using R loops. MainClasses <- dir () ## Retention time correction and combine technical replicates in the single peakml file. for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) SubClasses <- dir () dir.create ("combined") dir.create ("RTcor") for (a in 1:length(SubClasses)) { FILESf <- dir (SubClasses[a],full.names=TRUE,pattern="\\.peakml$") mzmatch.ipeak.align.CowCoda(i=paste(FILESf,collapse=","), o="RTcor", ppm=5, codadw=0.8, order=5, maxrt=40, v=T) FILESf <- dir ("RTcor",full.names=TRUE,pattern="\\.peakml$") outputfile <- paste("combined/",MainClasses[i],"_",SubClasses[a],".peakml",sep="") mzmatch.ipeak.Combine (i=paste(FILESf,collapse=","),v=T,rtwindow=30,o=outputfile,combination="set",ppm=5,label=paste(MainClasses[i],"_",SubClasses[a],sep="")) file.remove (FILESf) } file.remove ("RTcor") setwd ("..") } ## Use gap filer tool to fill in peaks between replicates missed in peak picking step. for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) dir.create ("combined_filled") if (length(dir(dir ()[1]))>2) { FILESf <- dir ("combined",full.names=TRUE,pattern="\\.peakml$") FILESs <- dir ("combined",pattern="\\.peakml$") for (a in 1:length(FILESf)) { PeakML.GapFiller (filename=FILESf[a], outputfile=paste("combined_filled/",FILESs[a],sep="")) } } else { FILESf <- dir ("combined",full.names=TRUE,pattern="\\.peakml$") FILESs <- dir ("combined",pattern="\\.peakml$") file.copy (FILESf,"combined_filled") } setwd ("..") } ## Keep only peaks which are detected in all replicates. for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) dir.create ("combined_nfiltered") dir.create ("combined_nrejected") FILESf <- dir ("combined_filled",full.names=TRUE,pattern="\\.peakml$") FILESs <- dir ("combined_filled",pattern="\\.peakml$") for (a in 1:length(FILESf)) { project <- .jnew("peakml/util/rjava/Project", rep("A",3), rep("A",3), rep("A",3)) cat ("Loading peakML file in memory (it can take some time, sorry) \n") st <- system.time(.jcall(project, returnSig="V", method="load",FILESf[a])) cat ("Done in:",st[3],"s \n") samplenames <- .jcall(project,returnSig="[S", method="getMeasurementNames") mzmatch.ipeak.filter.SimpleFilter(i=FILESf[a], o=paste("combined_nfiltered/",FILESs[a],sep=""), rejected=paste("combined_nrejected/",FILESs[a],sep=""), mindetections=length(samplenames), v=T) } setwd ("..") } ## Combine dilution samples in one data set (peakml file) and apply retention time correction for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) dir.create ("RTcor") FILESf <- dir ("combined_nfiltered",full.names=TRUE,pattern="\\.peakml$") mzmatch.ipeak.align.CowCoda(i=paste(FILESf,collapse=","), o="RTcor", ppm=5, codadw=0.8, order=5, maxrt=40, v=T) FILESf <- dir ("RTcor",full.names=TRUE,pattern="\\.peakml$") OUTPUT <- c("combined_nfiltered.peakml") mzmatch.ipeak.Combine (i=paste(FILESf,collapse=","),v=T,rtwindow=60,o=OUTPUT,combination="set",ppm=5) file.remove (FILESf) file.remove ("RTcor") setwd ("..") } ## Use gap filer tool to fill in peaks between dilution series missed in peak picking step. for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) PeakML.GapFiller (filename="combined_nfiltered.peakml", outputfile="combined_nfiltered_gf.peakml") setwd ("..") } ## Match related peaks (annotate fragment, isotopes, adducts etc.) for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) FILES <- dir (pattern="\\.peakml$") FILESo <- paste("REL_",FILES,sep="") FILESo2 <- paste("BP_",FILES,sep="") mzmatch.ipeak.sort.RelatedPeaks (i=FILES[1],v=T,o=FILESo[1],basepeaks=FILESo2[1],ppm=5,rtwindow=20) setwd ("..") } ## Apply metabolite identification to the files. ## Two output files will be made. On including ID's from Scocys. And the second one with ID's from KEGG DB <- dir(paste(.find.package("mzmatch.R"),"/dbs",sep=""),full.names=TRUE) DBS <- paste(DB[c(1,8,9)],collapse=",") DBS2 <- paste(DB[c(1,3,9)],collapse=",") for (i in 1:length(MainClasses)) { setwd (MainClasses[i]) FILES <- dir (pattern="\\REL_") FILESo <- paste("Scocyc_ID_",FILES,sep="") FILESo2 <- paste("KEGG_ID_",FILES,sep="") mzmatch.ipeak.util.Identify (i=FILES,v=T,o=FILESo,ppm=5,databases=DBS,JHeapSize=2500) mzmatch.ipeak.util.Identify (i=FILES,v=T,o=FILESo2,ppm=5,databases=DBS2,JHeapSize=2500) setwd ("..") } ### Calculate dilution trend correlation values ## Files what inlcudes annotations for isotopes, fragments and adducts as well identifiactions will be used. FILES <- dir (pattern="\\Scocyc_ID_",recursive=TRUE) FILES <- append(FILES,dir (pattern="\\KEGG_ID_",recursive=TRUE)) ## Define names for the output files OFILES <- sub (".peakml","",FILES) OFILES <- paste(OFILES,"_Ann.peakml",sep="") for (i in 1:length(FILES)) { mzmatch.ipeak.convert.ConvertToText(JHeapSize=1425, i=FILES[i], o="out.txt", v=T) peakt <- read.csv("out.txt",sep="\t",row.names=1) peakt <- t(peakt[,2:ncol(peakt)]) out <- rep (NA,ncol(peakt)) for (coln in 1:ncol(peakt)) { ## 2 replicates were used for C18 column and 3 for HILIC. ints <- peakt[,coln] if (length(ints)==16) { int1 <- ints[seq(1,16,2)] int2 <- ints[seq(2,16,2)] int3 <- NULL } if (length(ints)==24) { int1 <- ints[seq(1,24,3)] int2 <- ints[seq(2,24,3)] int3 <- ints[seq(3,24,3)] } ## Calculate median for intensitys of dilution points. Correlation values of are stored in vector 'out' avg <- apply(rbind(int1,int2,int3),2,median,na.rm=TRUE) nas <- which(is.na(avg)) nas <- append(nas,which(avg==0)) if (length(nas)>=5) { out[coln] <- NA } else { if (length(nas)==0) { SCL <- cor(1:8,log(avg,base=2)) } else { SCL <- cor(c(1:8)[-c(nas)],log(avg[-c(nas)],base=2)) } out[coln] <- round(SCL,2) } } ## Generate extra annotations for identifiers from contaminants and standards data bases Annotations <- PeakML.get.attributes (FILES[i], attribute="getGroupAnnotation",annotations=c("identification","relation.ship")) idents <-rep(NA,length(out)) for (a in 1: length(idents)) { id <- Annotations[[1]][a] id <- unlist(strsplit(id,", ")) wid <- grep ("STD_",id) cid <- grep ("CONTAMINANTDB_",id) if (length(cid)!=0) { idents[a] <- "CONT" } if (length(wid)!=0) { idents[a] <- "STD" } } BPS <- rep (NA, length(out)) BPS[Annotations[[2]]=="bp"] <- "bp" BPS[Annotations[[2]]=="potential bp"] <- "bp" BPS[Annotations[[2]]=="fragment?"] <- "bp" GroupAttValues <- rbind (out,idents,BPS) GroupAttrName <- c("corr.l","ids","BPS") PeakML.AddGroupsAttribute (filename=FILES[i],outputfile=OFILES[i],GroupAttributes=NULL,ionisation="detect",GrouptAttrName=GroupAttrName,GroupAttValues=GroupAttValues) } ## Write output of the files to csv text files. Including data before and after filtering. FILES <- dir (pattern="\\_Ann.peakml",recursive=TRUE) OFILES <- sub (".peakml","",FILES) OFILES <- paste(OFILES,"_idonly.peakml",sep="") for (i in 1:length(FILES)) { mzmatch.ipeak.filter.SimpleFilter(JHeapSize=1425, i=FILES[i], o=OFILES[i], annotations="identification", v=T) } DB <- dir(paste(.find.package("mzmatch.R"),"/dbs",sep=""),full.names=TRUE) DBS <- paste(DB[c(1,8,9)],collapse=",") DBS2 <- paste(DB[c(1,3,9)],collapse=",") FILES <- dir (pattern="Scocyc_ID_REL_combined_nfiltered_gf_Ann_idonly.peakml",recursive=TRUE) OFILES <- sub (".peakml","",FILES) OFILES <- paste(OFILES,".tsv",sep="") for (i in 1:length(FILES)) { mzmatch.ipeak.convert.ConvertToText(JHeapSize=1425, i=FILES[i], o=OFILES[i], databases=DBS, annotations="identification,ppm,corr.l,relation.ship,BPS", v=T) } FILES <- dir (pattern="KEGG_ID_REL_combined_nfiltered_gf_Ann_idonly.peakml",recursive=TRUE) OFILES <- sub (".peakml","",FILES) OFILES <- paste(OFILES,".tsv",sep="") for (i in 1:length(FILES)) { mzmatch.ipeak.convert.ConvertToText(JHeapSize=1425, i=FILES[i], o=OFILES[i], databases=DBS2, annotations="identification,ppm,corr.l,relation.ship,BPS", v=T) } ## Add number of the isobaric peaks, for the given mass as the extra column. And export peaks with dilution trend correlation valuse < -0.85 to separate files FILES <- dir (pattern="\\.tsv",recursive=TRUE) OFILE <- sub (".tsv","",FILES) OFILES <- paste(OFILE,"_isobar.tsv",sep="") OFILES2 <- paste(OFILE,"_isobar_corr.tsv",sep="") mzXMLfiles <- dir(pattern="\\.mzXML",recursive=TRUE) filenames <- vector("list",8) filenames[[1]] <- mzXMLfiles[1:16] filenames[[2]] <- mzXMLfiles[1:16] filenames[[3]] <- mzXMLfiles[17:40] filenames[[4]] <- mzXMLfiles[17:40] filenames[[5]] <- mzXMLfiles[41:56] filenames[[6]] <- mzXMLfiles[41:56] filenames[[7]] <- mzXMLfiles[57:80] filenames[[8]] <- mzXMLfiles[57:80] for (i in 1:length(FILES)) { DATA <- read.csv(FILES[i],sep="\t") ## isobaric id's (assigned the sama identifiers) IDS <- DATA$identification IDScount <- table(IDS) NumofPossiblecomp <- rep(NA,nrow(DATA)) for (a in 1:length(IDScount)) { hits <- which(IDS==(names(IDScount[a]))) NumofPossiblecomp[hits] <- IDScount[a] } DATA <- cbind(DATA,NumofPossiblecomp) colnames(DATA) <- c("Mass","RT",filenames[[i]],"identification","ppm","corr.l","relation.ship","BPS","Number.of.isobaric.masses") hits <- which(DATA$corr.l<=-0.85 & !is.na(DATA$corr.l)) DATA2 <- DATA[hits,] write.table (DATA,sep="\t",file=OFILES[i],row.names=FALSE) write.table (DATA2,sep="\t",file=OFILES2[i],row.names=FALSE) } ## Study design file in tab separated file. Column <- c(rep ("C18",16),rep("HILIC",24),rep ("C18",16),rep("HILIC",24)) Cond <- c(rep ("Biological Sample",40),rep("Standards Mixture",40)) Dilution <- c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8) Dilution[Dilution==1] <- "1/8" Dilution[Dilution==2] <- "1/16" Dilution[Dilution==3] <- "1/32" Dilution[Dilution==4] <- "1/64" Dilution[Dilution==5] <- "1/128" Dilution[Dilution==6] <- "1/256" Dilution[Dilution==7] <- "1/512" Dilution[Dilution==8] <- "1/1024" A <- "A" B <- "B" C <- "C" replicate <- c(A,B,A,B,A,B,A,B,A,B,A,B,A,B,A,B,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,A,B,A,B,A,B,A,B,A,B,A,B,A,B,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C,A,B,C) DESC <- cbind(mzXMLfiles,Cond,Column,replicate,Dilution) colnames(DESC) <- c("Filename","Taxonomy","Chromatographic column","Technical replicate","Dilution factor") write.table (DESC,sep="\t",file="experiment_design.tsv",row.names=FALSE) ## Generate a data for Tables 2 and 4 in manuscript. FILES <- dir (pattern="Scocyc_ID_REL_combined_nfiltered_gf_Ann.peakml",recursive=TRUE) counts <- matrix(nrow=length(FILES),ncol=28) SC_list_bp <- vector("list",length(FILES)) SC_list_rp <- vector("list",length(FILES)) for (i in 1:length(FILES)) { Annotations_S <- PeakML.get.attributes (FILES[i], attribute="getGroupAnnotation",annotations=c("identification","relation.ship","ids","corr.l")) Annotations_S <- do.call (cbind,Annotations_S) ## Unfiltered STD_bp <- Annotations_S[which(Annotations_S[,3]=="STD" & Annotations_S[,2]=="bp"),1] STD_bpu <- unique(STD_bp) STD_rp <- Annotations_S[which(Annotations_S[,3]=="STD" & Annotations_S[,2]!="bp"),1] STD_rpu <- unique(STD_rp) CONT_bp <- Annotations_S[which(Annotations_S[,3]=="CONT" & Annotations_S[,2]=="bp"),1] CONT_bpu <- unique (CONT_bp) CONT_rp <- Annotations_S[which(Annotations_S[,3]=="CONT" & Annotations_S[,2]!="bp"),1] CONT_rpu <- unique(CONT_rp) SC_bp <- Annotations_S[which(is.na(Annotations_S[,3]) & !is.na(Annotations_S[,1]) & Annotations_S[,2]=="bp"),1] SC_bpu <- unique (SC_bp) SC_rp <- Annotations_S[which(is.na(Annotations_S[,3]) & !is.na(Annotations_S[,1]) & Annotations_S[,2]!="bp"),1] SC_rpu <- unique (SC_rp) UnIdent_bp <- length(which(is.na(Annotations_S[,1]) & Annotations_S[,2]=="bp")) UnIdent_rp <- length(which(is.na(Annotations_S[,1]) & Annotations_S[,2]!="bp")) BF <- c(length(STD_bp),length(STD_rp),length(CONT_bp),length(CONT_rp),length(SC_bp),length(SC_rp),UnIdent_bp,UnIdent_rp, length(STD_bpu),length(STD_rpu),length(CONT_bpu),length(CONT_rpu),length(SC_bpu),length(SC_rpu)) ## corr < .85 VEC <- as.numeric(Annotations_S[,4])*-1 Annotations_S <- Annotations_S[which(VEC>0.85),] STD_bp <- Annotations_S[which(Annotations_S[,3]=="STD" & Annotations_S[,2]=="bp"),1] STD_bpu <- unique(STD_bp) STD_rp <- Annotations_S[which(Annotations_S[,3]=="STD" & Annotations_S[,2]!="bp"),1] STD_rpu <- unique(STD_rp) CONT_bp <- Annotations_S[which(Annotations_S[,3]=="CONT" & Annotations_S[,2]=="bp"),1] CONT_bpu <- unique (CONT_bp) CONT_rp <- Annotations_S[which(Annotations_S[,3]=="CONT" & Annotations_S[,2]!="bp"),1] CONT_rpu <- unique(CONT_rp) SC_bp <- Annotations_S[which(is.na(Annotations_S[,3]) & !is.na(Annotations_S[,1]) & Annotations_S[,2]=="bp"),1] SC_bpu <- unique (SC_bp) SC_rp <- Annotations_S[which(is.na(Annotations_S[,3]) & !is.na(Annotations_S[,1]) & Annotations_S[,2]!="bp"),1] SC_rpu <- unique (SC_rp) UnIdent_bp <- length(which(is.na(Annotations_S[,1]) & Annotations_S[,2]=="bp")) UnIdent_rp <- length(which(is.na(Annotations_S[,1]) & Annotations_S[,2]!="bp")) AF <- c(length(STD_bp),length(STD_rp),length(CONT_bp),length(CONT_rp),length(SC_bp),length(SC_rp),UnIdent_bp,UnIdent_rp, length(STD_bpu),length(STD_rpu),length(CONT_bpu),length(CONT_rpu),length(SC_bpu),length(SC_rpu)) counts[i,] <- c(BF,AF) SC_list_bp[[i]] <- SC_bpu SC_list_rp[[i]] <- SC_rpu } save.image ("image.Rdata") counts <- t(counts) cc <- cbind (counts[1:14,1],counts[15:28,1],counts[1:14,2],counts[15:28,2],counts[1:14,3],counts[15:28,3],counts[1:14,4],counts[15:28,4]) write.csv (cc,file="Scocyc_counts_table.csv")