--- title: "Bioinformatics Pipeline-18s" author: "Cathy Cavallo" date: "6/24/2018" output: html_document --- #Next Generation Sequencing Bioinformatics Pipeline ## Broad spectrum primer example -- 18s_SSU This code will allow the user to clean and organise their DNA sample library and then calculate measures of taxon abundance for analysis. Important steps include: * Removing + samples with less than 100 target reads + Non-target sequences + Specific samples (e.g. those that did not meet thresholds for inclusion, like CT) * Producing group totals for higher taxonomic levels * Calculating the frequency of occurrence (FOO) and relative read abundance (RRA) of items in the dataset * Matching samples with sample metadata for analysis ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) #knitr::opts_knit$set(root.dir = '/Volumes/CathyMacBookPro/A Working PhD/CPUFE MS2 RMD') ``` ```{r echo=FALSE} #Set up path (Windows only) .libPaths('C:/Users/ccavallo/rpkgs') ``` ##Preparation ####Load packages ```{r} library(dplyr) library(tidyverse) library(data.table) library(tidyr) ``` ####Load identified sample library ```{r} Metazoa<-read.csv("CCavallo_metazoa_2016_2018_20180709.csv",stringsAsFactors = FALSE) ``` ####Inspect ```{r} head(Metazoa) ``` ##Clean ####Sum all reads in each sample and food reads in each sample ```{r} Total<-colSums(Metazoa[,6:765])#Totals for numeric columns (from first to last sample columns) PreyOnly<-Metazoa[Metazoa$Type == "Prey",] TotalFood<-colSums(PreyOnly[,6:765])#remember to change 6:744 to the column numbers for your library ``` ####Create a vector to retain only samples w > 100 food reads ```{r} ColumnIndex<-which(as.vector(TotalFood >= 100)) ColumnIndex<-ColumnIndex+5 #Add 5 to all values in this vector because we skipped the first five columns when calculating the column sums. i.e. what was column 1 will become column 6, column 3 becomes column 8 etc.. ``` ####Drop non-food taxa and drop samples with < 100 food reads ```{r} ncol(Metazoa) MetazoaNoLowreads<-PreyOnly[,c(1:5, ColumnIndex)] tail(MetazoaNoLowreads) ncol(MetazoaNoLowreads) ``` ##Organise ####Create rows with group totals You will need to define the names of your groups in a column called "Group" in your library prior to importing into R. ```{r} MetazoaGrpTotals<-MetazoaNoLowreads%>% group_by(Group)%>% summarise_at(-c(1:4),funs(sum)) ``` ####Add the group totals to your library ```{r} ColsToAppend<-data.frame(matrix(rep(NA, (nrow(MetazoaGrpTotals)*4)), ncol = 4)) names(ColsToAppend)<-names(MetazoaNoLowreads[,1:4]) MetazoaWGrpTotals<-cbind(ColsToAppend, MetazoaGrpTotals) MetazoaTotalData<-rbind( MetazoaNoLowreads, MetazoaWGrpTotals) ``` ####Drop redundant columns Remember that the column numbers you choose to drop will depend on what is in your library.You may not need to drop any. ```{r} MetazoaForFilters<-MetazoaTotalData[,c(3,5:ncol(MetazoaTotalData))] ``` ##Calculate Now that your library is organised and includes only the samples and sequences necessary for your analysis, you can start calculating some measures of presence and abundance! ###Relative read abundance (RRA) ####Get sample totals again ```{r} MetazoaForFilters$Class[is.na(MetazoaForFilters$Class)]<-"GroupTotal" SampleTotals<-colSums(MetazoaForFilters[which(MetazoaForFilters$Class != "GroupTotal"),3:ncol(MetazoaForFilters)]) ``` ####Check that the totals for groups and totals for Metazoa are the same. ```{r} GROUPTotals<-colSums(MetazoaForFilters[which(MetazoaForFilters$Class == "GroupTotal"),3:ncol(MetazoaForFilters)]) SampleTotals<-c(NA, NA, SampleTotals) tail(SampleTotals) tail(GROUPTotals) ``` ####Bind them and inspect ```{r} MetazoaForFilters<-rbind( MetazoaForFilters, SampleTotals) MetazoaForFilters$Group[nrow(MetazoaForFilters)]<-"Total" head(MetazoaForFilters) tail(MetazoaForFilters) ``` ####Convert each cell or group total to a percentage of the sample total This converts the library into a sheet of percentages. ```{r} class(MetazoaForFilters$Group) MetazoaRRA<-MetazoaForFilters MetazoaRRA[, 3:ncol(MetazoaRRA)]<-NaN #replace values with NaN (not a number) for(i in 3:ncol(MetazoaForFilters)){#Loop from column 3 til the end (3:ncol) ActiveTotal<-MetazoaForFilters[MetazoaForFilters$Group == "Total", i]#subset the data so that only the total is left for the column we are working in for(j in 1:nrow(MetazoaForFilters)){#and then from the first row to the last row ActiveRRA<-(MetazoaForFilters[j,i]/ActiveTotal)*100 #calculate RRA Column i, row j. MetazoaRRA[j, i]<-ActiveRRA #in the Metazoa RRA data frame update the exact corresponding cell with the RRA value you have just calculated } } tail(MetazoaRRA) tail(MetazoaForFilters) ``` ###Frequency of occurrence (FOO) Frequency of occurrence describes the percentage of samples in the dataset in which a given taxon is present, but, like all presence–absence records, can overestimate the importance of common prey items that are ingested in very small amounts, including in the digestive tract of targeted prey (secondary predation). ####Convert all cells where RRA is >1% to "1", cells <1% get "0". This converts the library to presence/absence type data. ```{r} MetazoaFOO<-MetazoaRRA MetazoaFOO[, 3:ncol(MetazoaFOO)]<-NaN #replace values with NaN (not a number) for(i in 3:ncol(MetazoaRRA)){#Loop from column 3 til the end (3:ncol) for(j in 1:nrow(MetazoaRRA)){#and then from the first row to the last row if(MetazoaRRA[j,i]>=1){ MetazoaFOO[j,i]<-1#present }else{ MetazoaFOO[j,i]<-0#absent } #calculate RRA Column i, row j. } } tail(MetazoaFOO) tail(MetazoaRRA) ``` ####Rename group totals ```{r} for(i in which(MetazoaRRA$Class=="GroupTotal")){ MetazoaRRA$Class[i]<-paste(MetazoaRRA$Group[i],"Total") } for(i in which(MetazoaFOO$Class=="GroupTotal")){ MetazoaFOO$Class[i]<-paste(MetazoaFOO$Group[i],"Total") } for(i in which(MetazoaForFilters$Class=="GroupTotal")){ MetazoaForFilters$Class[i]<-paste(MetazoaForFilters$Group[i],"Total") } head(MetazoaRRA) tail(MetazoaRRA) head(MetazoaFOO) tail(MetazoaFOO) ``` ####Clean away some stuff you don't need ```{r} rm(ColsToAppend,MetazoaWGrpTotals,MetazoaGrpTotals,MetazoaTotalData,Metazoa,Check,MetazoaNoLowreads,PreyOnly,MetazoaRRAt,MetazoaFOOt,MetazoaGrpTotals) ``` ##Combine with sample metadata ####Transpose data back into a format that makes more sense ```{r} names<-MetazoaRRA$Class MetazoaRRAt<-as.data.frame(t(MetazoaRRA[,-c(1:2)]))#transpose all but the first column (Sample) colnames(MetazoaRRAt)<-names#set column names MetazoaRRAt<-cbind(rownames(MetazoaRRAt),data.frame(MetazoaRRAt)) names(MetazoaRRAt)[1]<-"Sample" head(MetazoaRRAt) MetazoaRRAt$Sample <- substr(x = MetazoaRRAt$Sample, start = 2,stop=100) #If the sample name starts with 'X' this should work. If it starts with 'X.' you will need to change the start = to 3 head(MetazoaRRAt) tail(MetazoaRRAt) summary(MetazoaRRAt) names<-MetazoaFOO$Class MetazoaFOOt<-as.data.frame(t(MetazoaFOO[,-c(1:2)])) colnames(MetazoaFOOt)<-names MetazoaFOOt<-cbind(rownames(MetazoaFOOt),data.frame(MetazoaFOOt)) names(MetazoaFOOt)[1]<-"Sample" head(MetazoaFOOt) names<-MetazoaForFilters$Class MetazoaForFilterst<-as.data.frame(t(MetazoaForFilters[,-c(1:2)])) colnames(MetazoaForFilterst)<-names MetazoaForFilterst<-cbind(rownames(MetazoaForFilterst),data.frame(MetazoaForFilterst)) names(MetazoaForFilterst)[1]<-"Sample" head(MetazoaForFilterst) ``` ####Drop leading X's if you have them ```{r} MetazoaFOOt$Sample <- substr(x = MetazoaFOOt$Sample, start =2,stop=100) head(MetazoaFOOt) tail(MetazoaFOOt) summary(MetazoaFOOt) MetazoaForFilterst$Sample <- substr(x = MetazoaForFilterst$Sample, start =2,stop=100) head(MetazoaForFilterst) tail(MetazoaForFilterst) summary(MetazoaForFilterst) ``` ###Combine sequence data with sample metadata/ field data based on an identifying sample number ####Bring in the sample field data (tied to sample number) ```{r} fielddata1<-read.csv("metadataIDs_metazoa_20180710.csv",stringsAsFactors = FALSE) head(fielddata1) fielddata<-fielddata1[c(1:10)] head(fielddata) ``` ####Rename sample in the metadata/field data database to match sample name in sequence library ```{r} colnames(fielddata)[colnames(fielddata)=="X20180215.labels"]<-"SampleField" head(fielddata) ``` ####Then bind sample field data to sample diet data for both RRA and FOO sheets You may have samples in either your library or field database that you want to drop. You can drop these here. ```{r} todrop<-c("X2016a_CRC321_s376","X2018_CRC177_s034","X2018_CRC323_s226","X2018_CRC165_s230","X2018_CRC321_s267","X2018_CRC183_s279","X2018_CRC264_s170","X2018_CRC264_s171","X2018_CRC422_s216","X2018_CRC16095_s096","X2018_CRC122_s210","X2018_CRC122_s209","X2018_CRC321_s267","X2018_CRC323_s227","X2018_CRC099_s095", "X2016a_CRC376_s327") todrop2<-c("X2016a_CRC321_s376","X2018_CRC177_s034","X2018_CRC323_s226","X2018_CRC165_s230","X2018_CRC321_s267","X2018_CRC183_s279","X2018_CRC264_s170","X2018_CRC264_s171","X2018_CRC422_s216","X2018_CRC16095_s096","X2018_CRC122_s210","X2018_CRC122_s209","X2018_CRC321_s267","X2018_CRC323_s227","X2018_CRC099_s095", "X2016a_CRC376_s327","X2016a_CRC376_s327") MetazoaRRAtX<-MetazoaRRAt[!rownames(MetazoaRRAt)%in%todrop,] MetazoaFOOtX<-MetazoaFOOt[!rownames(MetazoaFOOt)%in%todrop2,] MetazoaForFilterst<-MetazoaForFilterst[!rownames(MetazoaForFilterst)%in%todrop,] write.csv(MetazoaFOOtX,"Metazoasamplesfiltered.csv") ``` ####Now bring in the data ```{r} fielddatanames<-names(fielddata) MetazoaRRAnames<-names(MetazoaRRAtX) summary(fielddata) summary(MetazoaRRAt) tail(fielddata) tail(MetazoaRRAt) ``` ####Clear away some extraneous data frames and things ```{r} rm(ActiveSample,fielddataRow,RRARow,RowsBound1,MetazoaRRAcomplet,ActiveTotal,ActiveRRA) ``` ####Match up your field data with your RRA data ```{r} MetazoaRRAcomplet<-data.frame() i=1 for(i in 1:nrow(MetazoaRRAtX)){ ActiveSample<-MetazoaRRAtX$Sample[i]# isolating that one cell (sample in row i) fielddataRow<-fielddata[fielddata$SampleField == ActiveSample,]#subset to a single row in RRA. Works for pulling the small sheet into the big sheet. RRARow<-MetazoaRRAtX[i,] #i is always the active row RowsBound1<-cbind(fielddataRow,RRARow) MetazoaRRAcomplet<-rbind(MetazoaRRAcomplet,RowsBound1) } head(MetazoaRRAcomplet) MetazoaRRAcomplet<-data.frame(MetazoaRRAcomplet) ``` ####Match up your field data with your FOO data ```{r} i=2 MetazoaFOOnames<-names(MetazoaFOOtX) MetazoaFOOcomplet<-data.frame() for(i in 1:nrow(MetazoaFOOtX)){ ActiveSample<-MetazoaFOOtX$Sample[i] fielddataRow<-fielddata[fielddata$SampleField == ActiveSample,] FOORow<-MetazoaFOOtX[i,] RowsBound<-cbind(fielddataRow,FOORow) MetazoaFOOcomplet<-rbind(MetazoaFOOcomplet,RowsBound) } tail(MetazoaFOOcomplet) MetazoaFOOcomplet<-data.frame(MetazoaFOOcomplet) ``` ####Match up your field data with MetazoaForFilters ```{r} MetazoaForFilterstcomplet<-data.frame() i=187 for(i in 1:nrow(MetazoaForFilterst)){ ActiveSample<-MetazoaForFilterst$Sample[i] fielddataRow<-fielddata[fielddata$SampleField == ActiveSample,] RRARow<-MetazoaForFilterst[i,] RowsBound1<-cbind(fielddataRow,RRARow) MetazoaForFilterstcomplet<-rbind(MetazoaForFilterstcomplet,RowsBound1) } head(MetazoaForFilterstcomplet) ``` ####Drop extra dataframes and objects ```{r} rm(FOORow,RRARow,RowsBound,RowsBound1,MetazoaFOO,MetazoaRRA,MetazoaForFilters,MetazoaFOOt,MetazoaRRAt,MetazoaForFilterst,MetazoaFOOtX,MetazoaRRAtX,fielddata,fielddata1,fielddataRow) ``` ####Drop columns you aren't interested in There is also an opportunity to rename some columns here if you want. ```{r} drop<-c("SampleFinal","Sample","labels","Sample.1") MetazoaRRAcomplete<-MetazoaRRAcomplet[,!(names(MetazoaRRAcomplet)%in%drop)] setnames(MetazoaRRAcomplete, old=c("Date", "Month", "Site","Stage", "Burrow","Year"), new=c("DATE", "MONTH","SITE", "STAGE","BURROW","SEASON")) tail(MetazoaRRAcomplete) drop<-c("SampleFinal","Sample","labels","Sample.1") MetazoaFOOcomplete<-MetazoaFOOcomplet[,!(names(MetazoaFOOcomplet)%in%drop)] setnames(MetazoaFOOcomplete, old=c("Date", "Month", "Site","Stage", "Burrow","Year"), new=c("DATE", "MONTH","SITE", "STAGE","BURROW","SEASON")) tail(MetazoaFOOcomplete) drop<-c("SampleFinal","Sample","labels","Sample") MetazoaForFilterstcomplete<-MetazoaForFilterstcomplet[,!(names(MetazoaForFilterstcomplet)%in%drop)] setnames(MetazoaForFilterstcomplete, old=c("Date", "Month", "Site","Stage", "Burrow","Year"), new=c("DATE", "MONTH","SITE", "STAGE","BURROW","SEASON")) tail(MetazoaForFilterstcomplete) ``` ####You may need to correct some column classes ```{r} MetazoaRRAcomplete$MONTH<-as.character(MetazoaRRAcomplete$MONTH) MetazoaRRAcomplete$BURROW<-as.character(MetazoaRRAcomplete$BURROW) MetazoaRRAcomplete$SEASON<-as.character(MetazoaRRAcomplete$SEASON) MetazoaRRAcomplete$DATE<-as.Date(MetazoaRRAcomplete$DATE,format="%d/%m/%Y") MetazoaFOOcomplete$MONTH<-as.character(MetazoaFOOcomplete$MONTH) MetazoaFOOcomplete$BURROW<-as.character(MetazoaFOOcomplete$BURROW) MetazoaFOOcomplete$SEASON<-as.character(MetazoaFOOcomplete$SEASON) MetazoaFOOcomplete$DATE<-as.Date(MetazoaFOOcomplete$DATE,format="%d/%m/%Y") MetazoaForFilterstcomplete$MONTH<-as.character(MetazoaForFilterstcomplete$MONTH) MetazoaForFilterstcomplete$BURROW<-as.character(MetazoaForFilterstcomplete$BURROW) MetazoaForFilterstcomplete$SEASON<-as.character(MetazoaForFilterstcomplete$SEASON) MetazoaForFilterstcomplete$DATE<-as.Date(MetazoaForFilterstcomplete$DATE,format="%d/%m/%Y") summary(MetazoaRRAcomplete) summary(MetazoaFOOcomplete) ``` ##Export your data I use this version to record a new version each time and avoid overwriting previous versions. ```{r} currentDate <- Sys.Date() csvFileName1 <- paste("MetazoaFOO_",currentDate,".csv",sep="") csvFileName2 <- paste("MetazoaRRA_",currentDate,".csv",sep="") write.csv(MetazoaFOOcomplete, file=csvFileName1) write.csv(MetazoaRRAcomplete, file=csvFileName2) csvFileName3 <- paste("MetazoaForFilters_",currentDate,".csv",sep="") write.csv(MetazoaForFilterstcomplete, file=csvFileName3) ```