#############################################################
#############BELOW WORKS FOR GEE POISSON PHYLOGENTIC#########
#############################################################
rm(list=ls())
setwd("~/Dropbox/FCU/Teaching/Mentoring/2016Spring-2021Fall_AssociateProfessor/2020Fall/ChiYuWu/rcodeCYW")
library(ape)
library(MASS)
library(ggplot2)
library(geepack)
compar.gee2<-function (formula, data = NULL, family = gaussian, phy, corStruct, scale.fix = FALSE, scale.value = 1){
# formula= rawA~X
# phy=tree
# scale.fix = FALSE
# scale.value = 1
# data = NULL
# theta=2
#family = negative.binomial(2)
if (requireNamespace("gee", quietly = TRUE))
gee <- gee::gee
else stop("package 'gee' not available")
if (!missing(corStruct)) {
if (!missing(phy))
warning("the phylogeny was ignored because you gave a 'corStruct' object")
R <- vcv(corStruct, corr = TRUE)
}
else {
R <- vcv(phy, corr = TRUE)
}
if (is.null(data))
data <- parent.frame()
else {
nmsR <- rownames(R)
if (!identical(rownames(data), nmsR)) {
if (!any(is.na(match(rownames(data), nmsR))))
data <- data[nmsR, ]
else {
msg <- if (missing(corStruct))
"the tip labels of the tree"
else "those of the correlation structure"
msg <- paste("the rownames of the data.frame and",
msg, "do not match: the former were ignored in the analysis")
warning(msg)
}
}
}
effect.assign <- attr(model.matrix(formula, data = data), "assign")
for (i in all.vars(formula)) {
if (any(is.na(eval(parse(text = i), envir = data))))
stop("the present method cannot be used with missing data: you may consider removing the species with missing data from your tree with the function 'drop.tip'.")
}
id <- rep(1, dim(R)[1])
###############################
# negative.binomial(2)
library(gee)
family = "poisson"
geemod <- do.call("gee",
list(formula,
id,
data = data,
family = family,
R = R,
corstr = "fixed",
scale.fix = scale.fix,
scale.value = scale.value,
tol=1e-4,maxit=25)
)
W <- geemod$naive.variance
fname <- if (is.function(family))
deparse(substitute(family))
else if (is.list(family))
family$family
else family
if (fname == "binomial")
W <- summary(glm(formula, family = quasibinomial, data = data))$cov.scaled
N <- geemod$nobs
if (!missing(corStruct))
phy <- attr(corStruct, "tree")
dfP <- sum(phy$edge.length) * N/sum(diag(vcv(phy)))
Y <- geemod$y
MU <- geemod$fitted.values
Qlik <- switch(fname,
gaussian = -sum((Y - MU)^2)/2,
binomial = sum(Y *log(MU/(1 - MU)) + log(1 - MU)),
poisson = sum(Y * log(MU) - MU),
Gamma = sum(Y/MU + log(MU)),
inverse.gaussian = sum(-Y/(2 * MU^2) + 1/MU),
)
Ai <- do.call("gee", list(formula, id, data = data, family = family,
corstr = "independence", scale.fix = scale.fix, scale.value = scale.value))$naive.variance
QIC <- -2 * Qlik + 2 * sum(diag(solve(Ai) %*% W))
print(geemod$family)
obj <- list(call = match.call(), effect.assign = effect.assign,
nobs = N, QIC = QIC, coefficients = geemod$coefficients,
residuals = geemod$residuals, fitted.values = MU, family = geemod$family$family,
link = geemod$family$link, scale = geemod$scale, W = W,
dfP = dfP)
class(obj) <- "compar.gee"
obj
}
### Poisson
ordsamplep.poi<-function (n, lambda, Sigma){
# n=sims
# lambda=lambda.array
# Sigma=C
k <- length(lambda)
# n<-2
valori <- mvrnorm(n, rep(0, k), Sigma)
for (i in 1:k)
{
# i<-1
valori[, i] <- qpois(pnorm(valori[,i]), lambda[i])
}
return(valori)
}
simCoefPoi<-function(sims=sims,taxa=taxa,lambda=lambda,predictor=predictor){
coef.array <- array(NA,c(sims,2))
coef.raw<-array(NA,c(4,2))
colnames(coef.raw)<-c("beta_0","beta_1")
C<-vcv(tree)
C<-C/max(C)
simY<-ordsamplep.poi(n=sims, lambda=lambda, Sigma=C)
for (simIndex in 1:sims){
#simIndex<-1
print(simY[simIndex,])
#?compar.gee
formula=simY[simIndex,]~predictor
fitsim <-compar.gee2(simY[simIndex,]~predictor,phy=tree,family="poisson",scale.fix = TRUE)
#fitsim <-compar.gee2(simY[simIndex,]~predictor,phy=tree,family="poisson",scale.fix = TRUE)
coefsim<-fitsim$coefficients
coef.array [simIndex,]<-coefsim
#}
}
return(coef.array)
}
#set.seed(8122)
sims<- 10000
true.param<-c(3,5)
names(true.param)<-c("b0","b1")
taxa.array<-c(16,32,64,128)
treetype.array=c("coal", "balanced","left", "star")
coef.raw<-array(NA,c(length(treetype.array),length(taxa.array),2))
tree.coef.array<-array(NA,c(length(treetype.array), length(taxa.array),sims,2))
for(treetypeIndex in 1:length(treetype.array)){
#treetypeIndex<-1
treetype<-treetype.array[treetypeIndex]
for (taxaIndex in 1:length(taxa.array)){
#taxaIndex<-1
taxa <- taxa.array[taxaIndex]
if(treetype=="coal"){tree<-rcoal(taxa)}
if(treetype=="balanced"){tree<-compute.brlen(stree(taxa,type = "balanced"))}
if(treetype=="left"){tree<-compute.brlen(stree(n=taxa,type = "left"))}
if(treetype=="star"){tree<-compute.brlen(stree(n=taxa,type = "star"))}
#response<-rpois(taxa,lambda=20)
predictor<-rexp(n=taxa,rate=4)
lambda<-exp(true.param["b0"]+true.param["b1"]*predictor)
coef.array <- simCoefPoi(sims=sims,taxa=taxa,lambda=lambda,predictor=predictor)
tree.coef.array[treetypeIndex,taxaIndex,,]<- coef.array
}
}
save.image("phypoisim2.rda")
# outtable<-array(NA,c(length(taxa.array),length(treetype.array)*2))
# rownames(outtable)<-taxa.array
# outtable
#
# dim(tree.coef.array)
# for(treetypeIndex in 1:length(treetype.array)){
# for(taxaIndex in 1:length(taxa.array)){
# outtable[taxaIndex,(2*treetypeIndex-1):(2*treetypeIndex)] <- c(paste(round(mean(tree.coef.array[treetypeIndex,taxaIndex,,1]),3),"(",
# round(sd(tree.coef.array[treetypeIndex,taxaIndex,,1]),3),")",sep="")
# ,
# paste(round(mean(tree.coef.array[treetypeIndex,taxaIndex,,2]),3),"(",
# round(sd(tree.coef.array[treetypeIndex,taxaIndex,,2]),3),")",sep="")
# )
# }
# }
# for(treetypeIndex in 1:length(treetype.array)){
# for(taxaIndex in 1:length(taxa.array)){
# # For each pair of treetype and taxa, get the data for coef 1 and 2
# coef1 <- tree.coef.array[treetypeIndex,taxaIndex,,1]
# coef2 <- tree.coef.array[treetypeIndex,taxaIndex,,2]
#
# # Compute Q1 - 1.5*IQR and Q3 + 1.5*IQR for coef 1 and 2
# bounds1 <- quantile(coef1, c(0.25, 0.75)) + c(-1, 1) * 1.5 * IQR(coef1)
# bounds2 <- quantile(coef2, c(0.25, 0.75)) + c(-1, 1) * 1.5 * IQR(coef2)
#
# # Remove outliers for coef 1 and 2
# coef1_no_outliers <- coef1[coef1 >= bounds1[1] & coef1 <= bounds1[2]]
# coef2_no_outliers <- coef2[coef2 >= bounds2[1] & coef2 <= bounds2[2]]
#
# # Compute median and sd without outliers and update the table
# outtable[taxaIndex,(2*treetypeIndex-1):(2*treetypeIndex)] <- c(
# paste(round(median(coef1_no_outliers),2),"(",round(sd(coef1_no_outliers),2),")",sep=""),
# paste(round(median(coef2_no_outliers),2),"(",round(sd(coef2_no_outliers),2),")",sep="")
# )
# }
# }
load("~/Dropbox/FCU/Teaching/Mentoring/2016Spring-2021Fall_AssociateProfessor/2020Fall/ChiYuWu/rcodeCYW/phypoisim2.rda")
outtable<-array(NA,c(length(taxa.array),length(treetype.array)*2))
rownames(outtable)<-taxa.array
outtable
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## 16 NA NA NA NA NA NA NA NA
## 32 NA NA NA NA NA NA NA NA
## 64 NA NA NA NA NA NA NA NA
## 128 NA NA NA NA NA NA NA NA
dim(tree.coef.array)
## [1] 4 4 10000 2
for(treetypeIndex in 1:length(treetype.array)){
for(taxaIndex in 1:length(taxa.array)){
b0_values <- tree.coef.array[treetypeIndex,taxaIndex,,1][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,1])]
b1_values <- tree.coef.array[treetypeIndex,taxaIndex,,2][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,2])]
outtable[taxaIndex,(2*treetypeIndex-1):(2*treetypeIndex)] <- c(paste(round(mean(b0_values, na.rm=T),3),"(",round(sd(b0_values, na.rm=T),3),")",sep=""),
paste(round(mean(b1_values, na.rm=T),3),"(",round(sd(b1_values, na.rm=T),3),")",sep="")
)
}
}
print(outtable)
## [,1] [,2] [,3] [,4] [,5]
## 16 "3(0.034)" "5(0.031)" "2.998(0.086)" "5.002(0.088)" "2.999(0.059)"
## 32 "3(0.009)" "5(0.006)" "3(0.028)" "5(0.024)" "2.998(0.082)"
## 64 "3(0.009)" "5(0.008)" "3(0.025)" "5(0.028)" "3(0.054)"
## 128 "3(0.008)" "5(0.007)" "3(0.004)" "5(0.003)" "3(0.02)"
## [,6] [,7] [,8]
## 16 "5.001(0.057)" "2.999(0.055)" "5(0.114)"
## 32 "5.003(0.121)" "3(0.029)" "5(0.025)"
## 64 "5.001(0.071)" "3(0.019)" "5(0.019)"
## 128 "5(0.019)" "3(0.014)" "5(0.015)"
library(xtable)
xtable(outtable)
## % latex table generated in R 4.3.1 by xtable 1.8-4 package
## % Sun Jul 9 14:47:03 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllllllll}
## \hline
## & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
## \hline
## 16 & 3(0.034) & 5(0.031) & 2.998(0.086) & 5.002(0.088) & 2.999(0.059) & 5.001(0.057) & 2.999(0.055) & 5(0.114) \\
## 32 & 3(0.009) & 5(0.006) & 3(0.028) & 5(0.024) & 2.998(0.082) & 5.003(0.121) & 3(0.029) & 5(0.025) \\
## 64 & 3(0.009) & 5(0.008) & 3(0.025) & 5(0.028) & 3(0.054) & 5.001(0.071) & 3(0.019) & 5(0.019) \\
## 128 & 3(0.008) & 5(0.007) & 3(0.004) & 5(0.003) & 3(0.02) & 5(0.019) & 3(0.014) & 5(0.015) \\
## \hline
## \end{tabular}
## \end{table}
# Prepare data for b0
df_b0 <- data.frame()
for(treetypeIndex in 1:length(treetype.array)){
for(taxaIndex in 1:length(taxa.array)){
b0_values <- tree.coef.array[treetypeIndex,taxaIndex,,1][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,1])]
temp_df_b0 <- data.frame(
taxa = taxa.array[taxaIndex],
treetype = treetype.array[treetypeIndex],
value = b0_values
)
df_b0 <- rbind(df_b0, temp_df_b0)
}
}
head(df_b0)
## taxa treetype value
## 1 16 coal 2.985514
## 2 16 coal 3.003005
## 3 16 coal 2.926589
## 4 16 coal 2.993321
## 5 16 coal 3.018333
## 6 16 coal 3.025342
# Load library
library(ggplot2)
df_b0$taxa<-as.factor(df_b0$taxa)
# Create boxplot for b0 across all tree types
p_b0 <- ggplot(df_b0, aes(x=treetype, y=value, fill=taxa)) +
geom_boxplot() +
labs(x = "Tree Type",
y = expression(beta[0] ~ "Estimates"),
fill = "Taxa Size",
title = "Phylogenetic Poisson Regression:" ~ beta[0] ~ "Estimates") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # This line centers the title
# Print the plot
print(p_b0)
# Prepare data for b1
df_b1 <- data.frame()
for(treetypeIndex in 1:length(treetype.array)){
for(taxaIndex in 1:length(taxa.array)){
b1_values <- tree.coef.array[treetypeIndex,taxaIndex,,2][is.finite(tree.coef.array[treetypeIndex,taxaIndex,,2])]
temp_df_b1 <- data.frame(
taxa = taxa.array[taxaIndex],
treetype = treetype.array[treetypeIndex],
value = b1_values
)
df_b1 <- rbind(df_b1, temp_df_b1)
}
}
head(df_b1)
## taxa treetype value
## 1 16 coal 5.011077
## 2 16 coal 5.001785
## 3 16 coal 5.067866
## 4 16 coal 5.000886
## 5 16 coal 4.986326
## 6 16 coal 4.981057
# Load library
library(ggplot2)
df_b1$taxa<-as.factor(df_b1$taxa)
# Create boxplot for b1 across all tree types
p_b1 <- ggplot(df_b1, aes(x=treetype, y=value, fill=taxa)) +
geom_boxplot() +
labs(x = "Tree Type",
y = expression(beta[1] ~ "Estimates"),
fill = "Taxa Size",
title = "Phylogenetic Poisson Regression:" ~ beta[1] ~ "Estimates") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) # This line centers the title
# Print the plot
print(p_b1)
library(gridExtra)
grid.arrange(p_b0,p_b1,ncol=2)