Phylogenetic Poisson regression simulation

#############################################################
#############BELOW WORKS FOR GEE POISSON PHYLOGENTIC#########
#############################################################
rm(list=ls())
setwd("~/Dropbox/FCU/Teaching/Mentoring/2016Spring-2021Fall_AssociateProfessor/2020Fall/ChiYuWu/rcodeCYW")

library(ape)
library(MASS)
library(ggplot2)
library(geepack)

compar.gee2<-function (formula, data = NULL, family = gaussian, phy, corStruct, scale.fix = FALSE, scale.value = 1){
  # formula=  rawA~X
  # phy=tree
  # scale.fix = FALSE
  # scale.value = 1
  # data = NULL
  # theta=2
  #family =  negative.binomial(2)
  if (requireNamespace("gee", quietly = TRUE))
    gee <- gee::gee
  else stop("package 'gee' not available")
  if (!missing(corStruct)) {
    if (!missing(phy))
      warning("the phylogeny was ignored because you gave a 'corStruct' object")
    R <- vcv(corStruct, corr = TRUE)
  }
  else {
    R <- vcv(phy, corr = TRUE)
  }
  if (is.null(data))
    data <- parent.frame()
  else {
    nmsR <- rownames(R)
    if (!identical(rownames(data), nmsR)) {
      if (!any(is.na(match(rownames(data), nmsR))))
        data <- data[nmsR, ]
      else {
        msg <- if (missing(corStruct))
          "the tip labels of the tree"
        else "those of the correlation structure"
        msg <- paste("the rownames of the data.frame and",
                     msg, "do not match: the former were ignored in the analysis")
        warning(msg)
      }
    }
  }
  effect.assign <- attr(model.matrix(formula, data = data), "assign")
  for (i in all.vars(formula)) {
    if (any(is.na(eval(parse(text = i), envir = data))))
      stop("the present method cannot be used with missing data: you may consider removing the species with missing data from your tree with the function 'drop.tip'.")
  }
  id <- rep(1, dim(R)[1])
  ###############################
  #  negative.binomial(2)
  library(gee)
  family = "poisson"
  geemod <- do.call("gee",
                    list(formula,
                         id,
                         data = data,
                         family = family,
                         R = R,
                         corstr = "fixed",
                         scale.fix = scale.fix,
                         scale.value = scale.value,
                         tol=1e-4,maxit=25)
  )
  W <- geemod$naive.variance
  fname <- if (is.function(family))
    deparse(substitute(family))
  else if (is.list(family))
    family$family
  else family
  if (fname == "binomial")
    W <- summary(glm(formula, family = quasibinomial, data = data))$cov.scaled
  N <- geemod$nobs
  if (!missing(corStruct))
    phy <- attr(corStruct, "tree")
  dfP <- sum(phy$edge.length) * N/sum(diag(vcv(phy)))
  Y <- geemod$y
  MU <- geemod$fitted.values
  Qlik <- switch(fname, 
                 gaussian = -sum((Y - MU)^2)/2,
                 binomial = sum(Y *log(MU/(1 - MU)) + log(1 - MU)),
                 poisson = sum(Y * log(MU) - MU),
                 Gamma = sum(Y/MU + log(MU)),
                 inverse.gaussian = sum(-Y/(2 * MU^2) + 1/MU),
  )
  Ai <- do.call("gee", list(formula, id, data = data, family = family,
                            corstr = "independence", scale.fix = scale.fix, scale.value = scale.value))$naive.variance
  QIC <- -2 * Qlik + 2 * sum(diag(solve(Ai) %*% W))
  print(geemod$family)
  obj <- list(call = match.call(), effect.assign = effect.assign,
              nobs = N, QIC = QIC, coefficients = geemod$coefficients,
              residuals = geemod$residuals, fitted.values = MU, family = geemod$family$family,
              link = geemod$family$link, scale = geemod$scale, W = W,
              dfP = dfP)
  class(obj) <- "compar.gee"
  obj
}


### Poisson

ordsamplep.poi<-function (n, lambda, Sigma){
  #  n=sims
  #  lambda=lambda.array
  #  Sigma=C
  k <- length(lambda)
  #  n<-2
  valori <- mvrnorm(n, rep(0, k), Sigma)
  for (i in 1:k)
  {
    #    i<-1
    valori[, i] <- qpois(pnorm(valori[,i]), lambda[i])
  }
  return(valori)
}


simCoefPoi<-function(sims=sims,taxa=taxa,lambda=lambda,predictor=predictor){
  coef.array <- array(NA,c(sims,2))
  coef.raw<-array(NA,c(4,2))
  colnames(coef.raw)<-c("beta_0","beta_1")

  C<-vcv(tree)
  C<-C/max(C)
  simY<-ordsamplep.poi(n=sims, lambda=lambda, Sigma=C)
  for (simIndex in 1:sims){
    #simIndex<-1
    print(simY[simIndex,])
    #?compar.gee
    formula=simY[simIndex,]~predictor
    fitsim <-compar.gee2(simY[simIndex,]~predictor,phy=tree,family="poisson",scale.fix = TRUE)
    #fitsim <-compar.gee2(simY[simIndex,]~predictor,phy=tree,family="poisson",scale.fix = TRUE)
    coefsim<-fitsim$coefficients
    coef.array [simIndex,]<-coefsim
    #}
  }
  return(coef.array)
}


#set.seed(8122)
sims<- 10000
true.param<-c(3,5)
names(true.param)<-c("b0","b1")

taxa.array<-c(16,32,64,128)
treetype.array=c("coal",  "balanced","left", "star")
coef.raw<-array(NA,c(length(treetype.array),length(taxa.array),2))
tree.coef.array<-array(NA,c(length(treetype.array), length(taxa.array),sims,2))
for(treetypeIndex in 1:length(treetype.array)){
  #treetypeIndex<-1
  treetype<-treetype.array[treetypeIndex]

  for (taxaIndex in 1:length(taxa.array)){
    #taxaIndex<-1
    taxa <- taxa.array[taxaIndex]
    if(treetype=="coal"){tree<-rcoal(taxa)}
    if(treetype=="balanced"){tree<-compute.brlen(stree(taxa,type = "balanced"))}
    if(treetype=="left"){tree<-compute.brlen(stree(n=taxa,type = "left"))}
    if(treetype=="star"){tree<-compute.brlen(stree(n=taxa,type = "star"))}

    #response<-rpois(taxa,lambda=20)
    predictor<-rexp(n=taxa,rate=4)
    lambda<-exp(true.param["b0"]+true.param["b1"]*predictor)
    coef.array <- simCoefPoi(sims=sims,taxa=taxa,lambda=lambda,predictor=predictor)
    tree.coef.array[treetypeIndex,taxaIndex,,]<-  coef.array
  }
}

save.image("phypoisim2.rda")


# outtable<-array(NA,c(length(taxa.array),length(treetype.array)*2))
# rownames(outtable)<-taxa.array
# outtable
# 
# dim(tree.coef.array)
# for(treetypeIndex in 1:length(treetype.array)){
#   for(taxaIndex in 1:length(taxa.array)){
#     outtable[taxaIndex,(2*treetypeIndex-1):(2*treetypeIndex)]  <- c(paste(round(mean(tree.coef.array[treetypeIndex,taxaIndex,,1]),3),"(",
#                                                                           round(sd(tree.coef.array[treetypeIndex,taxaIndex,,1]),3),")",sep="")
#                                                                     ,
#                                                                     paste(round(mean(tree.coef.array[treetypeIndex,taxaIndex,,2]),3),"(",
#                                                                           round(sd(tree.coef.array[treetypeIndex,taxaIndex,,2]),3),")",sep="")
#     )
#   }
# }

# for(treetypeIndex in 1:length(treetype.array)){
#   for(taxaIndex in 1:length(taxa.array)){
#     # For each pair of treetype and taxa, get the data for coef 1 and 2
#     coef1 <- tree.coef.array[treetypeIndex,taxaIndex,,1]
#     coef2 <- tree.coef.array[treetypeIndex,taxaIndex,,2]
#     
#     # Compute Q1 - 1.5*IQR and Q3 + 1.5*IQR for coef 1 and 2
#     bounds1 <- quantile(coef1, c(0.25, 0.75)) + c(-1, 1) * 1.5 * IQR(coef1)
#     bounds2 <- quantile(coef2, c(0.25, 0.75)) + c(-1, 1) * 1.5 * IQR(coef2)
#     
#     # Remove outliers for coef 1 and 2
#     coef1_no_outliers <- coef1[coef1 >= bounds1[1] & coef1 <= bounds1[2]]
#     coef2_no_outliers <- coef2[coef2 >= bounds2[1] & coef2 <= bounds2[2]]
#     
#     # Compute median and sd without outliers and update the table
#     outtable[taxaIndex,(2*treetypeIndex-1):(2*treetypeIndex)]  <- c(
#       paste(round(median(coef1_no_outliers),2),"(",round(sd(coef1_no_outliers),2),")",sep=""),
#       paste(round(median(coef2_no_outliers),2),"(",round(sd(coef2_no_outliers),2),")",sep="")
#     )
#   }
# }

load("~/Dropbox/FCU/Teaching/Mentoring/2016Spring-2021Fall_AssociateProfessor/2020Fall/ChiYuWu/rcodeCYW/phypoisim2.rda")


outtable<-array(NA,c(length(taxa.array),length(treetype.array)*2))
rownames(outtable)<-taxa.array
outtable

##     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## 16    NA   NA   NA   NA   NA   NA   NA   NA
## 32    NA   NA   NA   NA   NA   NA   NA   NA
## 64    NA   NA   NA   NA   NA   NA   NA   NA
## 128   NA   NA   NA   NA   NA   NA   NA   NA

dim(tree.coef.array)

## [1]     4     4 10000     2

for(treetypeIndex in 1:length(treetype.array)){
  for(taxaIndex in 1:length(taxa.array)){
    b0_values <- tree.coef.array[treetypeIndex,taxaIndex,,1][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,1])]
    b1_values <- tree.coef.array[treetypeIndex,taxaIndex,,2][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,2])]
    
    outtable[taxaIndex,(2*treetypeIndex-1):(2*treetypeIndex)]  <- c(paste(round(mean(b0_values, na.rm=T),3),"(",round(sd(b0_values, na.rm=T),3),")",sep=""),
                                                                    paste(round(mean(b1_values, na.rm=T),3),"(",round(sd(b1_values, na.rm=T),3),")",sep="")
    )
  }
}

print(outtable)

##     [,1]       [,2]       [,3]           [,4]           [,5]          
## 16  "3(0.034)" "5(0.031)" "2.998(0.086)" "5.002(0.088)" "2.999(0.059)"
## 32  "3(0.009)" "5(0.006)" "3(0.028)"     "5(0.024)"     "2.998(0.082)"
## 64  "3(0.009)" "5(0.008)" "3(0.025)"     "5(0.028)"     "3(0.054)"    
## 128 "3(0.008)" "5(0.007)" "3(0.004)"     "5(0.003)"     "3(0.02)"     
##     [,6]           [,7]           [,8]      
## 16  "5.001(0.057)" "2.999(0.055)" "5(0.114)"
## 32  "5.003(0.121)" "3(0.029)"     "5(0.025)"
## 64  "5.001(0.071)" "3(0.019)"     "5(0.019)"
## 128 "5(0.019)"     "3(0.014)"     "5(0.015)"

library(xtable)
xtable(outtable)

## % latex table generated in R 4.3.1 by xtable 1.8-4 package
## % Sun Jul  9 14:47:03 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllllllll}
##   \hline
##  & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\ 
##   \hline
## 16 & 3(0.034) & 5(0.031) & 2.998(0.086) & 5.002(0.088) & 2.999(0.059) & 5.001(0.057) & 2.999(0.055) & 5(0.114) \\ 
##   32 & 3(0.009) & 5(0.006) & 3(0.028) & 5(0.024) & 2.998(0.082) & 5.003(0.121) & 3(0.029) & 5(0.025) \\ 
##   64 & 3(0.009) & 5(0.008) & 3(0.025) & 5(0.028) & 3(0.054) & 5.001(0.071) & 3(0.019) & 5(0.019) \\ 
##   128 & 3(0.008) & 5(0.007) & 3(0.004) & 5(0.003) & 3(0.02) & 5(0.019) & 3(0.014) & 5(0.015) \\ 
##    \hline
## \end{tabular}
## \end{table}

# Prepare data for b0
df_b0 <- data.frame()
for(treetypeIndex in 1:length(treetype.array)){
  for(taxaIndex in 1:length(taxa.array)){
    b0_values <- tree.coef.array[treetypeIndex,taxaIndex,,1][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,1])]
    
    temp_df_b0 <- data.frame(
      taxa = taxa.array[taxaIndex],
      treetype = treetype.array[treetypeIndex],
      value = b0_values
    )
    
    df_b0 <- rbind(df_b0, temp_df_b0)
  }
}

head(df_b0)

##   taxa treetype    value
## 1   16     coal 2.985514
## 2   16     coal 3.003005
## 3   16     coal 2.926589
## 4   16     coal 2.993321
## 5   16     coal 3.018333
## 6   16     coal 3.025342

# Load library
library(ggplot2)
df_b0$taxa<-as.factor(df_b0$taxa)
# Create boxplot for b0 across all tree types
p_b0 <- ggplot(df_b0, aes(x=treetype, y=value, fill=taxa)) +
  geom_boxplot() +
  labs(x = "Tree Type", 
       y = expression(beta[0] ~ "Estimates"), 
       fill = "Taxa Size",
       title = "Phylogenetic Poisson Regression:" ~ beta[0] ~ "Estimates")   +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))  # This line centers the title

# Print the plot
print(p_b0)

# Prepare data for b1
df_b1 <- data.frame()
for(treetypeIndex in 1:length(treetype.array)){
  for(taxaIndex in 1:length(taxa.array)){
    b1_values <- tree.coef.array[treetypeIndex,taxaIndex,,2][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,2])]
    
    temp_df_b1 <- data.frame(
      taxa = taxa.array[taxaIndex],
      treetype = treetype.array[treetypeIndex],
      value = b1_values
    )
    
    df_b1 <- rbind(df_b1, temp_df_b1)
  }
}

head(df_b1)

##   taxa treetype    value
## 1   16     coal 5.011077
## 2   16     coal 5.001785
## 3   16     coal 5.067866
## 4   16     coal 5.000886
## 5   16     coal 4.986326
## 6   16     coal 4.981057

# Load library
library(ggplot2)
df_b1$taxa<-as.factor(df_b1$taxa)
# Create boxplot for b1 across all tree types
p_b1 <- ggplot(df_b1, aes(x=treetype, y=value, fill=taxa)) +
  geom_boxplot() +
  labs(x = "Tree Type", 
       y = expression(beta[1] ~ "Estimates"), 
       fill = "Taxa Size",
       title = "Phylogenetic Poisson Regression:" ~ beta[1] ~ "Estimates")   +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))  # This line centers the title

# Print the plot
print(p_b1)

library(gridExtra)
grid.arrange(p_b0,p_b1,ncol=2)