##### All necesssary functions for the hands on ######### ## Now we want to compute Tau (tissue-specificity), a trait for these Zebrafish genes #Function require a vector with expression of one gene in different tissues. #Mean value per gene is calculated fmean <- function(x) { if(!all(is.na(x))) { res <- mean(x, na.rm=TRUE) } else { res <- NA } return(res) } ###***###***### ###+++### #Function require a vector with expression of one gene in different tissues. #If expression for one tissue is not known, gene specificity for this gene is NA #Minimum 2 tissues fTau <- function(x) { if(all(!is.na(x))) { if(min(x, na.rm=TRUE) >= 0) { if(max(x)!=0) { x <- (1-(x/max(x))) res <- sum(x, na.rm=TRUE) res <- res/(length(x)-1) } else { res <- 0 } } else { res <- NA #print("Expression values have to be positive!") } } else { res <- NA #print("No data for this gene avalable.") } return(res) } ## We need to use the function written to parse trees with nodes labeled with events parse_trees<-function(line) { out = tryCatch( { treetxt <- textConnection(line) tree <- treeio::read.nhx(treetxt) close(treetxt) tree@data$label <- c(tree@phylo$tip.label, tree@phylo$node.label) serial_no <- nrow(tree@data) ntips <- length(tree@phylo$tip.label) internal_nodes <- (ntips+1):(ntips+tree@phylo$Nnode) # Adding a column of events of interest tree@data$events <- NA tree@data$events[ tree@data$D == "Y" ] <- "duplication" tree@data$events[ tree@data$D == 0 & tree@data$node > ntips ] <- "speciation" tree@data$events <- factor(tree@data$events, levels=c( "speciation", "duplication")) return(tree) }, error = function(condition){ message("Error") return(NA) }, finally = { message("done") }) return(out) } ## This function adds branch length and node depth to our dataframe of tree@data for further use in building time calibration matrix tree.nodedepth <- function(tree) { gene_tree <- tree@phylo ##Computing branch length for the trees gene_tree <- ape::compute.brlen(gene_tree) gene_tree_data <- tree@data gene_tree_nodes<-gene_tree_data$node ##Computing node depth and returned it to the tree into "@data" slot nodedepth <- ape::node.depth(gene_tree, method = 1) tree@phylo$edge.length<-gene_tree$edge.length tree@data$depth_node<-nodedepth return(tree) } ## The following function is used to drop tips for which we do not have tau data ## That means we need to prune trees ## We also used the criteria that the pruned trees should have atleast 4 tips with tau data drop.leaf <- function(tree) { ##Reading the tips labels and data table tree_tip <- tree@phylo$tip.label gene_tree_data <- tree@data ntips <- length(tree@phylo$tip.label) ##Identify the tips without expression data tips_to_remove <- tree_tip[which(is.na(gene_tree_data[1:length(tree_tip),]$Tau))] ##Identify the tips with expression data tips_with_Tau <- tree_tip[which(!is.na(gene_tree_data[1:length(tree_tip),]$Tau))] ##Identifying atleast one speciation node Speciation_node <- length(gene_tree_data$events[ gene_tree_data$D == 0 & gene_tree_data$node > ntips ]) ##Returning the pruned tree if atleast 4 tips have Tau data and atleast 1 speciation node if(length(tips_with_Tau) >= 4 & Speciation_node >=1) { pruned.tree<-treeio::drop.tip(tree, tips_to_remove) return(pruned.tree) } else{return(NA)} } ## This function is written to modify the labels of all duplication nodes modify.label <- function(tree) { gene_tree <- tree@phylo gene_tree_data <- tree@data ## Collecting internal clade labels of duplication nodes to modify them Internal_clade_label <- gene_tree_data$label[which(!is.na(gene_tree_data$events))] dup_node<-gene_tree_data$node[which(gene_tree_data$events=="duplication")] dup_node_data_label<-gene_tree_data$label[dup_node] #dup_node_data_S<-gene_tree_data$S[dup_node] tree@data$label[dup_node] <- paste(dup_node_data_label,"_d",sep = "") #tree@data$S[dup_node] <- paste(dup_node_data_S,".d",sep = "") return(tree) } ## We need the following function to calibrate the trees ## This function calibrates our gene trees based on the calibration time points ## Maintaining the trees original topology tree.calibrate <- function(tree, timeframe, model="relaxed") { gene_tree <- tree@phylo gene_tree_data <- tree@data ## Identifyting nodes and their depth (those are not representing tips) gene_tree_nodes <- gene_tree_data$node[which(!is.na(gene_tree_data$events))] node.depth <- gene_tree_data$depth_node[gene_tree_nodes] ## Merging tree@data based on "label" tree_data<-merge(gene_tree_data,timeframe,by=c("label")) if( nrow(tree_data) > 0 ) { tree_data<- tree_data[order(tree_data$depth_node, decreasing = T),] ## Buiding calibration matrix calibration_matrix <- data.frame( node = tree_data$node, age.min = tree_data$Mya, age.max = tree_data$Mya, soft.bounds = NA) ## Time calibrating trees calibrate_trees <- try(ape::chronos(gene_tree, calibration = calibration_matrix, model = model)) ##Trees those are not time calibrated can not be used further ##To avoid error due to non calibrated tree we did the following if( "phylo" %in% class(calibrate_trees)) { class(calibrate_trees) <- "phylo" tree@phylo <- calibrate_trees } else{tree = NA} return(tree) } } ## This function is written to remodify the labels of all duplication nodes remodify.label <- function(tree) { gene_tree <- tree@phylo gene_tree_data <- tree@data ## Collecting internal clade labels of duplication nodes to modify them Internal_clade_label <- gene_tree_data$label[which(!is.na(gene_tree_data$events))] Internal_node<-gene_tree_data$node[which(!is.na(gene_tree_data$events))] New_internal_clade_label<-sapply(Internal_clade_label, function(x) {x <- unlist(strsplit(toString(x), split='_d', fixed=TRUE))[1]}) tree@data$label[Internal_node] <-as.character(New_internal_clade_label) #tree@data$S[dup_node] <- paste(dup_node_data_S,".d",sep = "") return(tree) } ## Hence we need the following function ## This function adds age of the nodes to the "@data" slot node.age <-function (tree) { ## Tree and "@data" slot gene_tree<-tree@phylo gene_data<-tree@data Internal_label<-gene_data$label[which(!is.na(gene_data$events))] Internal_node<-data.frame(node=gene_data$node[which(!is.na(gene_data$events))]) ## Computing node age and adding the values to the "@data"slot age.count<-ape::branching.times(gene_tree) names(age.count)<-Internal_node #age<-age.count[which(names(age.count) %in% gene_data$node)] tree@data$node.age <- c(rep(0, length(gene_tree$tip.label)), as.numeric(age.count)) return(tree) } ##This function calculates Phylogenetic indepent contrast for our calibrated trees contrast.calculation<-function(tree) { ## Tree data gene_tree<-tree@phylo gene_data<-tree@data ## Collecting tip data Tau.tip<-tree@data$Tau[which(is.na(tree@data$events))] ## Initializing variable tree@data$PIC <- NULL tree@data$Variance <- NULL ## Calculating the phylogenetic independent contrasts of tree ## Returning the results to "data" frame of the tree ## To resolve polytomic tree we used "multi2di" pic.tree <- ape::pic(Tau.tip, multi2di(gene_tree), var.contrasts=TRUE) ## However this using "multi2di", adds PIC and variance for nodes(afer resolving) for which data are absent in @data slot ## New we only took pic and variance data for nodes for which duplication and speciation data available contra<-pic.tree[,1][which(row.names(pic.tree) %in% gene_data$label)] var<-pic.tree[,2][which(row.names(pic.tree) %in% gene_data$label)] tree@data$PIC <- c(rep(NA, length(gene_tree$tip.label)), contra) tree@data$Variance <- c(rep( NA, length(gene_tree$tip.label)),var) tree@data$pic.abs <- abs(tree@data$PIC) ## absolute PIC values return(tree) } ##This function will help to analyze data in pairs pairwise.processing<-function(tree) { ##Reading data in the "@data" slot and counting number of tips gene_data<-tree@data tips<-length(tree@phylo$tip.label) ##Initializing dataframe all_pairwise<-NULL PR_results<- NULL #Making tibble of pairwise data and adding data in it all_pairwise<-data.frame(t(combn(tips,2))) colnames(all_pairwise) <-c("tip1","tip2") all_pairwise$tip.label1<-gene_data$label[all_pairwise$tip1] all_pairwise$tip.label2<-gene_data$label[all_pairwise$tip2] all_pairwise$species1<-gene_data$S[all_pairwise$tip1] all_pairwise$species2<-gene_data$S[all_pairwise$tip2] all_pairwise$Tau1<-gene_data$Tau[all_pairwise$tip1] all_pairwise$Tau2<-gene_data$Tau[all_pairwise$tip2] # Obtaining the most recent common ancestor for the pairwise tip combination all_pairwise$mrca<-apply(all_pairwise, 1, function(y) getMRCA(tree@phylo, c(as.integer(y['tip1']), as.integer(y['tip2'])))) ##Merging two data frames PR_results<-left_join(all_pairwise,gene_data,by = c( "mrca" = "node")) drops <- c("B","SIS") # B and SIS have inconsistent types in binding rows, so we remove those columns PR_results<-PR_results[ , !(names(PR_results) %in% drops)] return(PR_results) } ##plotting function of boxplot boxplot.new<-function(dataframe,pval) { dodge <- position_dodge(width = 0.51) ggplot(dataframe,aes(x=events, y=pic.abs, fill=events ) ) + #geom_violin(position = dodge, alpha=0.3)+ guides( colour = guide_legend( override.aes = list( shape = 16 ) ) ) + geom_boxplot( width=0.5,outlier.colour=NA, position = dodge, notch = T) + #scale_fill_manual(values=group.colors)+ xlab( NULL ) + ylab(expression(bold("Phylogenetic Independent Contrast (PIC)"))) + #ylim(min(simulation.df$pic), max(simulation.df$pic)) + ylim(0, 0.03) + theme_classic()+ theme(legend.title=element_blank(),legend.position=c(1.9,1.9)) + theme(axis.text=element_text(size=10,face="bold")) + theme(legend.text=element_text(size=10,face="bold")) + #theme(plot.title = element_text(hjust = 0.5)) + annotate("text", x = 1.5, y = 0.025, label= pval, fontface = 2) } ## Function to create scatterplot result of our data scatterplot<-function(dataframe,pval) { ggplot(dataframe, aes(x=Mya, y=R, color=Event)) + geom_point(alpha = 0.8, shape = 20,size = 4) + xlab(expression(bold("Million years "))) + ylab(expression(bold("Pearson correlation coeffcient, R"))) + xlim(0,500) + ylim(0, 1) + theme_classic() + theme(legend.title=element_blank(),legend.position=c(0.85,0.55)) + theme(axis.text=element_text(size=10,face="bold")) + annotate("text", x = 400, y = 0.8, label=pval, fontface = 2) } ## This function is written to perform ANCOVA test ANCOVA_pairwise <- function (data) { capture.output(cat("\n\n ANCOVA result \n\n"), append=TRUE, file="Genomicus_empirical.txt") ## Separating speciation and duplication evenets from the data all_spec<-data[data$Event=="speciation",] all_dupl<- data[data$Event=="duplication",] AP1<-NULL AP1<- lm(all_dupl$R~all_dupl$Mya) AO1<-NULL AO1<- lm(all_spec$R~all_spec$Mya) ## Linear regression model1 with interaction term reg_imp1<-NULL reg_imp2<-NULL reg_imp1 <-aov(R~Mya+Event+Mya*Event, data = data) interaction_effect<-summary(reg_imp1) interaction_P<-interaction_effect[[1]][["Pr(>F)"]][3] event_effect_P<- interaction_effect[[1]][["Pr(>F)"]][2] covariate_effect_P<- interaction_effect[[1]][["Pr(>F)"]][1] ## Linear regression model2 without interaction term reg_imp2 <-aov(R~Mya+Event, data = data) without_interaction_effect<-summary(reg_imp2) event_effect_P1<- without_interaction_effect[[1]][["Pr(>F)"]][2] covariate_effect_P1<- without_interaction_effect[[1]][["Pr(>F)"]][1] ## testing which model is better between thw two test<-anova(reg_imp1,reg_imp2) pValue_comp <- test[6][2,1] if(pValue_comp >= 0.05) { if(event_effect_P1 >= 0.05) { event_effect_P1 <- format(event_effect_P1, digits= 3, scientific = TRUE) label_p1 =paste0("ANCOVA, P = ",event_effect_P1) } if((event_effect_P1 < 0.05)) { if (event_effect_P1 == 0){label_p1 = paste0("ANCOVA, P < 2.2e-16")} if ((event_effect_P1 != 0)) { event_effect_P1 <- format(event_effect_P1, digits= 3, scientific = TRUE) label_p1 =paste0("ANCOVA, P = ",event_effect_P1) } ##Now to check whether there is a significant difference in the intercepts ##We will fit linear regressions separately for duplication and speciation in this case dupli<-NULL speci<-NULL dupli<-summary(AP1) dupli_intercept <-dupli[[4]][1] speci<-summary(AO1) speci_intercept <-speci[[4]][1] if(dupli_intercept > speci_intercept) { cat("\nThis means paralogs have a higher intercept than orthologs\n\n") } if(dupli_intercept < speci_intercept) { cat("\nThis means orthologs have a higher intercept than paralogs\n\n") } } } if(pValue_comp < 0.05) { if (event_effect_P == 0){label_p1 = paste0("ANCOVA, P < 2.2e-16")} if ((event_effect_P != 0)) { event_effect_P <- format(event_effect_P, digits= 3, scientific = TRUE) label_p1 =paste0("ANCOVA, P = ",event_effect_P) } dupli<-NULL speci<-NULL dupli<-summary(AP1) dupli_intercept <-dupli[[4]][1] speci<-summary(AO1) speci_intercept <-speci[[4]][1] if(dupli_intercept > speci_intercept) { cat("\nThis means paralogs have a higher intercept than orthologs\n\n") } if(dupli_intercept < speci_intercept) { cat("\nThis means orthologs have a higher intercept than paralogs\n\n") } } return(label_p1) }