##### All necesssary functions for the hands on #########

## Now we want to compute Tau (tissue-specificity), a trait for these Zebrafish genes
#Function require a vector with expression of one gene in different tissues.
#Mean value per gene is calculated
fmean <- function(x)
{
  if(!all(is.na(x))) {
    res <- mean(x, na.rm=TRUE)
  } else {
    res <- NA
  }
  return(res)
}
###***###***###	
###+++###
#Function require a vector with expression of one gene in different tissues.
#If expression for one tissue is not known, gene specificity for this gene is NA
#Minimum 2 tissues
fTau <- function(x)
{
  if(all(!is.na(x))) {
    if(min(x, na.rm=TRUE) >= 0) {
      if(max(x)!=0) {
        x <- (1-(x/max(x)))
        res <- sum(x, na.rm=TRUE)
        res <- res/(length(x)-1)
      } else {
        res <- 0
      }
    } else {
      res <- NA
      #print("Expression values have to be positive!")
    } 
  } else {
    res <- NA
    #print("No data for this gene avalable.")
  } 
  return(res)
}


## We need to use the function written to parse trees with nodes labeled with events
parse_trees<-function(line)
{
  out = tryCatch(
    {
      treetxt <- textConnection(line)
      tree <- treeio::read.nhx(treetxt)
      close(treetxt)
      
      tree@data$label <- c(tree@phylo$tip.label, tree@phylo$node.label)
      serial_no <- nrow(tree@data)
      ntips <- length(tree@phylo$tip.label) 
      internal_nodes <- (ntips+1):(ntips+tree@phylo$Nnode) 
      
      # Adding a column of events of interest
      tree@data$events <- NA
      tree@data$events[ tree@data$D == "Y" ] <- "duplication"
      tree@data$events[ tree@data$D == 0 & tree@data$node > ntips ] <- "speciation"
      tree@data$events <- factor(tree@data$events, levels=c( "speciation", "duplication"))
      return(tree)
    },
    error = function(condition){
      message("Error")
      return(NA)
    },
    
    finally = {
      message("done")
    })
  return(out)
}

## This function adds branch length and node depth to our dataframe of tree@data for further use in building time calibration matrix
tree.nodedepth <- function(tree)
{
  gene_tree <- tree@phylo
  
  ##Computing branch length for the trees
  gene_tree <- ape::compute.brlen(gene_tree)
  gene_tree_data <- tree@data
  gene_tree_nodes<-gene_tree_data$node
  
  ##Computing node depth and returned it to the tree into "@data" slot
  nodedepth <- ape::node.depth(gene_tree, method = 1)
  tree@phylo$edge.length<-gene_tree$edge.length
  tree@data$depth_node<-nodedepth
  return(tree)
}

## The following function is used to drop tips for which we do not have tau data
## That means we need to prune trees
## We also used the criteria that the pruned trees should have atleast 4 tips with tau data
drop.leaf <- function(tree) 
{
  ##Reading the tips labels and data table
  tree_tip <- tree@phylo$tip.label
  gene_tree_data <- tree@data
  ntips <- length(tree@phylo$tip.label)
  
  ##Identify the tips without expression data
  tips_to_remove <- tree_tip[which(is.na(gene_tree_data[1:length(tree_tip),]$Tau))]
  
  ##Identify the tips with expression data
  tips_with_Tau <- tree_tip[which(!is.na(gene_tree_data[1:length(tree_tip),]$Tau))]
  
  ##Identifying atleast one speciation node
  Speciation_node <- length(gene_tree_data$events[ gene_tree_data$D == 0 & gene_tree_data$node > ntips ])
  
  ##Returning the pruned tree if atleast 4 tips have Tau data and atleast 1 speciation node
  if(length(tips_with_Tau) >= 4 & Speciation_node >=1)
  {
    pruned.tree<-treeio::drop.tip(tree, tips_to_remove)
    return(pruned.tree)
  }
  else{return(NA)}
}

## This function is written to modify the labels of all duplication nodes 
modify.label <- function(tree)
{
  gene_tree <- tree@phylo
  gene_tree_data <- tree@data
  
  ## Collecting internal clade labels of duplication nodes to modify them
  Internal_clade_label <- gene_tree_data$label[which(!is.na(gene_tree_data$events))]
  dup_node<-gene_tree_data$node[which(gene_tree_data$events=="duplication")]
  dup_node_data_label<-gene_tree_data$label[dup_node]
  #dup_node_data_S<-gene_tree_data$S[dup_node]
  tree@data$label[dup_node] <- paste(dup_node_data_label,"_d",sep = "")
  #tree@data$S[dup_node] <- paste(dup_node_data_S,".d",sep = "")
  return(tree)
}

## We need the following function to calibrate the trees
## This function calibrates our gene trees based on the calibration time points
## Maintaining the trees original topology
tree.calibrate <- function(tree, timeframe, model="relaxed")
{ 
  
  gene_tree <- tree@phylo
  gene_tree_data <- tree@data
  
  ## Identifyting nodes and their depth (those are not representing tips)
  gene_tree_nodes <- gene_tree_data$node[which(!is.na(gene_tree_data$events))] 
  node.depth <- gene_tree_data$depth_node[gene_tree_nodes]
  
  ## Merging tree@data based on "label"
  tree_data<-merge(gene_tree_data,timeframe,by=c("label"))
  if( nrow(tree_data) > 0 )
  {
    tree_data<- tree_data[order(tree_data$depth_node, decreasing = T),]
    
    ## Buiding calibration matrix
    calibration_matrix <- data.frame(
      node = tree_data$node,
      age.min = tree_data$Mya,
      age.max = tree_data$Mya,
      soft.bounds = NA) 
    ## Time calibrating trees
    calibrate_trees <- try(ape::chronos(gene_tree, calibration = calibration_matrix,  model = model))
    
    ##Trees those are not time calibrated can not be used further
    ##To avoid error due to non calibrated tree we did the following
    if( "phylo" %in% class(calibrate_trees))
    {
      class(calibrate_trees) <- "phylo"
      tree@phylo <- calibrate_trees
    }
    
    else{tree = NA}
    return(tree)
  }
}

## This function is written to remodify the labels of all duplication nodes 
remodify.label <- function(tree)
{
  gene_tree <- tree@phylo
  gene_tree_data <- tree@data
  
  
  ## Collecting internal clade labels of duplication nodes to modify them
  Internal_clade_label <- gene_tree_data$label[which(!is.na(gene_tree_data$events))]
  Internal_node<-gene_tree_data$node[which(!is.na(gene_tree_data$events))]
  New_internal_clade_label<-sapply(Internal_clade_label,
                                   function(x) 
                                   {x <- unlist(strsplit(toString(x), split='_d', fixed=TRUE))[1]})
  tree@data$label[Internal_node] <-as.character(New_internal_clade_label)
  #tree@data$S[dup_node] <- paste(dup_node_data_S,".d",sep = "")
  return(tree)
}

## Hence we need the following function
## This function adds age of the nodes to the "@data" slot
node.age <-function (tree)
{
  ## Tree and "@data" slot
  gene_tree<-tree@phylo
  gene_data<-tree@data
  
  Internal_label<-gene_data$label[which(!is.na(gene_data$events))]
  Internal_node<-data.frame(node=gene_data$node[which(!is.na(gene_data$events))])
  
  ## Computing node age and adding the values to the "@data"slot
  age.count<-ape::branching.times(gene_tree)
  names(age.count)<-Internal_node
  
  #age<-age.count[which(names(age.count) %in% gene_data$node)]
  tree@data$node.age <- c(rep(0, length(gene_tree$tip.label)), as.numeric(age.count))
  return(tree)
}  

##This function calculates Phylogenetic indepent contrast for our calibrated trees
contrast.calculation<-function(tree)
{
  ## Tree data
  gene_tree<-tree@phylo
  gene_data<-tree@data
  
  ## Collecting tip data 
  Tau.tip<-tree@data$Tau[which(is.na(tree@data$events))]
  
  ## Initializing variable
  tree@data$PIC <- NULL
  tree@data$Variance <- NULL
  
  ## Calculating the phylogenetic independent contrasts of tree
  ## Returning the results to "data" frame of the tree 
  ## To resolve polytomic tree we used "multi2di"
  pic.tree <- ape::pic(Tau.tip, multi2di(gene_tree), var.contrasts=TRUE)
  
  ## However this using "multi2di", adds PIC and variance for nodes(afer resolving) for which data are absent in @data slot
  ## New we only took pic and variance data for nodes for which duplication and speciation data available
  contra<-pic.tree[,1][which(row.names(pic.tree) %in% gene_data$label)]
  var<-pic.tree[,2][which(row.names(pic.tree) %in% gene_data$label)]
  tree@data$PIC <- c(rep(NA, length(gene_tree$tip.label)), contra)
  tree@data$Variance <- c(rep( NA, length(gene_tree$tip.label)),var)
  tree@data$pic.abs <- abs(tree@data$PIC) ## absolute PIC values
  return(tree)
}

##This function will help to analyze data in pairs
pairwise.processing<-function(tree)
{
  ##Reading data in the "@data" slot and counting number of tips
  gene_data<-tree@data
  tips<-length(tree@phylo$tip.label)
  
  ##Initializing dataframe
  all_pairwise<-NULL
  PR_results<- NULL
  
  #Making tibble of pairwise data and adding data in it
  all_pairwise<-data.frame(t(combn(tips,2)))
  colnames(all_pairwise) <-c("tip1","tip2")
  all_pairwise$tip.label1<-gene_data$label[all_pairwise$tip1]
  all_pairwise$tip.label2<-gene_data$label[all_pairwise$tip2]
  all_pairwise$species1<-gene_data$S[all_pairwise$tip1]
  all_pairwise$species2<-gene_data$S[all_pairwise$tip2]
  all_pairwise$Tau1<-gene_data$Tau[all_pairwise$tip1]
  all_pairwise$Tau2<-gene_data$Tau[all_pairwise$tip2]
  
  # Obtaining the most recent common ancestor for the pairwise tip combination
  all_pairwise$mrca<-apply(all_pairwise, 1, function(y) getMRCA(tree@phylo, c(as.integer(y['tip1']), as.integer(y['tip2']))))
  
  ##Merging two data frames
  PR_results<-left_join(all_pairwise,gene_data,by = c( "mrca" = "node"))
  drops <- c("B","SIS")  # B and SIS have inconsistent types in binding rows, so we remove those columns
  PR_results<-PR_results[ , !(names(PR_results) %in% drops)]
  return(PR_results)
}


##plotting function of boxplot
boxplot.new<-function(dataframe,pval)
{
  dodge <- position_dodge(width = 0.51)
  ggplot(dataframe,aes(x=events, y=pic.abs, fill=events ) ) + 
    #geom_violin(position = dodge, alpha=0.3)+
    guides( colour = guide_legend( override.aes = list( shape = 16 ) ) ) +
    geom_boxplot( width=0.5,outlier.colour=NA, position = dodge, notch = T) +
    #scale_fill_manual(values=group.colors)+
    xlab( NULL ) +
    ylab(expression(bold("Phylogenetic Independent Contrast (PIC)"))) +
    #ylim(min(simulation.df$pic), max(simulation.df$pic)) +
    ylim(0, 0.03) +
    theme_classic()+
    theme(legend.title=element_blank(),legend.position=c(1.9,1.9)) +
    theme(axis.text=element_text(size=10,face="bold")) +
    theme(legend.text=element_text(size=10,face="bold")) +
    #theme(plot.title = element_text(hjust = 0.5)) +
    annotate("text", x = 1.5, y = 0.025, label= pval, fontface = 2)
}


## Function to create scatterplot result of our data
scatterplot<-function(dataframe,pval)
{
  ggplot(dataframe, aes(x=Mya, y=R, color=Event)) + 
    geom_point(alpha = 0.8, shape = 20,size = 4) +
    xlab(expression(bold("Million years "))) +
    ylab(expression(bold("Pearson correlation coeffcient, R"))) +
    xlim(0,500) +
    ylim(0, 1) +
    theme_classic() +
    theme(legend.title=element_blank(),legend.position=c(0.85,0.55)) +
    theme(axis.text=element_text(size=10,face="bold")) +
    annotate("text", x = 400, y = 0.8, label=pval, fontface = 2)
}

## This function is written to perform ANCOVA test
ANCOVA_pairwise <- function (data)
{
  capture.output(cat("\n\n ANCOVA result \n\n"), append=TRUE, file="Genomicus_empirical.txt") 
  
  ## Separating speciation and duplication evenets from the data
  all_spec<-data[data$Event=="speciation",]
  all_dupl<- data[data$Event=="duplication",]
  
  AP1<-NULL
  AP1<- lm(all_dupl$R~all_dupl$Mya)
  AO1<-NULL
  AO1<- lm(all_spec$R~all_spec$Mya)
 
  
  ## Linear regression model1 with interaction term
  reg_imp1<-NULL
  reg_imp2<-NULL
  reg_imp1 <-aov(R~Mya+Event+Mya*Event, data = data)
  interaction_effect<-summary(reg_imp1)
  interaction_P<-interaction_effect[[1]][["Pr(>F)"]][3]
  event_effect_P<- interaction_effect[[1]][["Pr(>F)"]][2]
  covariate_effect_P<- interaction_effect[[1]][["Pr(>F)"]][1]

  
  ## Linear regression model2 without interaction term
  reg_imp2 <-aov(R~Mya+Event, data = data)
  without_interaction_effect<-summary(reg_imp2)
  event_effect_P1<- without_interaction_effect[[1]][["Pr(>F)"]][2]
  covariate_effect_P1<- without_interaction_effect[[1]][["Pr(>F)"]][1]
  
  ## testing which model is better between thw two
  test<-anova(reg_imp1,reg_imp2)
  
  pValue_comp <- test[6][2,1]
  
  
  if(pValue_comp >= 0.05) 
  {
    if(event_effect_P1 >= 0.05) 
    {
      event_effect_P1 <- format(event_effect_P1, digits= 3, scientific = TRUE)
      label_p1 =paste0("ANCOVA, P = ",event_effect_P1)

    }
    if((event_effect_P1 < 0.05))
    {
      if (event_effect_P1 == 0){label_p1 = paste0("ANCOVA, P < 2.2e-16")}
      if ((event_effect_P1 != 0))
      {
        event_effect_P1 <- format(event_effect_P1, digits= 3, scientific = TRUE)
        label_p1 =paste0("ANCOVA, P = ",event_effect_P1)
      }
      
      ##Now to check whether there is a significant difference in the intercepts
      ##We will fit linear regressions separately for duplication and speciation in this case
      dupli<-NULL
      speci<-NULL
      dupli<-summary(AP1)
      dupli_intercept <-dupli[[4]][1]
      speci<-summary(AO1)
      speci_intercept <-speci[[4]][1]
      if(dupli_intercept > speci_intercept)
      {
        cat("\nThis means paralogs have a higher intercept than orthologs\n\n")
      }
      if(dupli_intercept < speci_intercept)
      {
        cat("\nThis means orthologs have a higher intercept than paralogs\n\n")
      }
    }
    
  }
  if(pValue_comp < 0.05) 
  {
    
    if (event_effect_P == 0){label_p1 = paste0("ANCOVA, P < 2.2e-16")}
    if ((event_effect_P != 0))
    {
      event_effect_P <- format(event_effect_P, digits= 3, scientific = TRUE)
      label_p1 =paste0("ANCOVA, P = ",event_effect_P)
    }
    
    dupli<-NULL
    speci<-NULL
    dupli<-summary(AP1)
    dupli_intercept <-dupli[[4]][1]
    speci<-summary(AO1)
    speci_intercept <-speci[[4]][1]
    if(dupli_intercept > speci_intercept)
    {
      cat("\nThis means paralogs have a higher intercept than orthologs\n\n")
    }
    if(dupli_intercept < speci_intercept)
    {
      cat("\nThis means orthologs have a higher intercept than paralogs\n\n")
    }
  }
  return(label_p1)
}