rm(list=ls())
library(ggpubr)
## 載入需要的套件:ggplot2
library(dplyr)
##
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
##
## filter, lag
## 下列物件被遮斷自 'package:base':
##
## intersect, setdiff, setequal, union
library(nonpar)
setwd("~/Dropbox/JournalSubmission/0phyregDiversity/rcode/sims/")
load("fulloutput.rda")
#load(url("https://tonyjhwueng.info/pcmreg/fulloutput.rda"))
####################
###full.raw.output##
####################
full.raw.output<-as.data.frame(full.raw.output)
full.raw.output$intercept<-as.numeric(full.raw.output$intercept)
full.raw.output$slope<-as.numeric(full.raw.output$slope)
full.raw.output$aic<-as.numeric(full.raw.output$aic)
full.raw.output$r.squared<-as.numeric(full.raw.output$r.squared)
full.raw.output$rsq<-as.numeric(full.raw.output$r.squared)
full.raw.output$sigma2<-as.numeric(full.raw.output$sigma2)
full.raw.output$mse<-as.numeric(full.raw.output$mse)
full.raw.output$size<-as.factor(full.raw.output$size)
full.raw.output$tree<-as.factor(full.raw.output$tree)
full.raw.output$model<-as.factor(full.raw.output$model)
str(full.raw.output)
## 'data.frame': 308 obs. of 10 variables:
## $ intercept: num 4.8 4.41 4.65 5.08 5.7 ...
## $ slope : num 3.08 3.74 4.42 3.18 1.47 ...
## $ sigma2 : num 14.1 38.6 116.6 196.2 243.1 ...
## $ aic : num 85.1 211.1 459.2 701.6 922.5 ...
## $ r.squared: num 0.52705 0.31968 0.17111 0.06746 0.00956 ...
## $ mse : num 1.45 2.02 2.61 2.35 3.28 ...
## $ tree : Factor w/ 4 levels "BalancedTree",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ model : Factor w/ 7 levels "BM","delta","EB",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ size : Factor w/ 11 levels "100","150","20",..: 3 10 1 2 4 5 6 7 8 9 ...
## $ rsq : num 0.52705 0.31968 0.17111 0.06746 0.00956 ...
ls(full.raw.output)
## [1] "aic" "intercept" "model" "mse" "r.squared" "rsq"
## [7] "sigma2" "size" "slope" "tree"
####################
###full.sim.output##
####################
head(full.sim.output)
## intercept slope sigma2
## temp.sim.clade.output "4.52970746363137" "4.0402783799179" "11.281103254218"
## temp.sim.root.output "4.61755031685766" "3.73183075438454" "24.2752871835689"
## temp.sim.clade.output "4.03380673311132" "3.67543134070236" "16.6221528686918"
## temp.sim.root.output "4.48323972916464" "3.26835306446401" "23.4297256659566"
## temp.sim.clade.output "4.97649883813443" "3.04383334651632" "8.74081904505687"
## temp.sim.root.output "4.83391799459114" "3.210843499219" "8.91847058127604"
## aic r.squared mse
## temp.sim.clade.output "90.937856921515" "0.596650814845572" "2.38209113851361"
## temp.sim.root.output "97.5337287023863" "0.382736192466198" "2.1248245133016"
## temp.sim.clade.output "98.690002288419" "0.430769786524581" "2.75999961502552"
## temp.sim.root.output "96.8246626874404" "0.286404316904917" "2.82911020382223"
## temp.sim.clade.output "85.835353921787" "0.635626204758499" "2.42642099643274"
## temp.sim.root.output "77.5070412595935" "0.600091773475409" "2.24448910492674"
## tree model size coltype collevel sims
## temp.sim.clade.output "BalancedTree" "BM" "20" "clade" "1" "1"
## temp.sim.root.output "BalancedTree" "BM" "20" "root" "1" "1"
## temp.sim.clade.output "BalancedTree" "BM" "20" "clade" "1" "2"
## temp.sim.root.output "BalancedTree" "BM" "20" "root" "1" "2"
## temp.sim.clade.output "BalancedTree" "BM" "20" "clade" "1" "3"
## temp.sim.root.output "BalancedTree" "BM" "20" "root" "1" "3"
full.sim.output<-as.data.frame(full.sim.output)
dim(full.sim.output)
## [1] 492800 12
colnames(full.sim.output)
## [1] "intercept" "slope" "sigma2" "aic" "r.squared" "mse"
## [7] "tree" "model" "size" "coltype" "collevel" "sims"
head(full.sim.output)
## intercept slope sigma2
## temp.sim.clade.output 4.52970746363137 4.0402783799179 11.281103254218
## temp.sim.root.output 4.61755031685766 3.73183075438454 24.2752871835689
## temp.sim.clade.output.1 4.03380673311132 3.67543134070236 16.6221528686918
## temp.sim.root.output.1 4.48323972916464 3.26835306446401 23.4297256659566
## temp.sim.clade.output.2 4.97649883813443 3.04383334651632 8.74081904505687
## temp.sim.root.output.2 4.83391799459114 3.210843499219 8.91847058127604
## aic r.squared mse
## temp.sim.clade.output 90.937856921515 0.596650814845572 2.38209113851361
## temp.sim.root.output 97.5337287023863 0.382736192466198 2.1248245133016
## temp.sim.clade.output.1 98.690002288419 0.430769786524581 2.75999961502552
## temp.sim.root.output.1 96.8246626874404 0.286404316904917 2.82911020382223
## temp.sim.clade.output.2 85.835353921787 0.635626204758499 2.42642099643274
## temp.sim.root.output.2 77.5070412595935 0.600091773475409 2.24448910492674
## tree model size coltype collevel sims
## temp.sim.clade.output BalancedTree BM 20 clade 1 1
## temp.sim.root.output BalancedTree BM 20 root 1 1
## temp.sim.clade.output.1 BalancedTree BM 20 clade 1 2
## temp.sim.root.output.1 BalancedTree BM 20 root 1 2
## temp.sim.clade.output.2 BalancedTree BM 20 clade 1 3
## temp.sim.root.output.2 BalancedTree BM 20 root 1 3
full.sim.output$intercept<-as.numeric(full.sim.output$intercept)
full.sim.output$slope<-as.numeric(full.sim.output$slope)
full.sim.output$sigma2<-as.numeric(full.sim.output$sigma2)
full.sim.output$aic<-as.numeric(full.sim.output$aic)
full.sim.output$r.squared<-as.numeric(full.sim.output$r.squared)
full.sim.output$rsq<-as.numeric(full.sim.output$r.squared)
full.sim.output$mse<-as.numeric(full.sim.output$mse)
full.sim.output$size<-as.factor(full.sim.output$size)
full.sim.output$sims<-as.numeric(full.sim.output$sims)
full.sim.output$collevel<-as.factor(full.sim.output$collevel)
full.sim.output$coltype<-as.factor(full.sim.output$coltype)
full.sim.output$tree<-as.factor(full.sim.output$tree)
full.sim.output$model<-as.factor(full.sim.output$model)
full.sim.output<-full.sim.output%>%
mutate(colbil=ifelse(collevel %in%c(1:2),1, 0))
full.sim.output$colbil<-as.factor(full.sim.output$colbil)
str(full.sim.output)
## 'data.frame': 492800 obs. of 14 variables:
## $ intercept: num 4.53 4.62 4.03 4.48 4.98 ...
## $ slope : num 4.04 3.73 3.68 3.27 3.04 ...
## $ sigma2 : num 11.28 24.28 16.62 23.43 8.74 ...
## $ aic : num 90.9 97.5 98.7 96.8 85.8 ...
## $ r.squared: num 0.597 0.383 0.431 0.286 0.636 ...
## $ mse : num 2.38 2.12 2.76 2.83 2.43 ...
## $ tree : Factor w/ 4 levels "BalancedTree",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ model : Factor w/ 7 levels "BM","delta","EB",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ size : Factor w/ 11 levels "100","150","20",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ coltype : Factor w/ 2 levels "clade","root": 1 2 1 2 1 2 1 2 1 2 ...
## $ collevel : Factor w/ 8 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sims : num 1 1 2 2 3 3 4 4 5 5 ...
## $ rsq : num 0.597 0.383 0.431 0.286 0.636 ...
## $ colbil : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
ls(full.sim.output)
## [1] "aic" "colbil" "collevel" "coltype" "intercept" "model"
## [7] "mse" "r.squared" "rsq" "sigma2" "sims" "size"
## [13] "slope" "tree"
result0<-round(rbind(
c(median(full.raw.output$intercept,na.rm=T),
median(full.raw.output$slope,na.rm=T),
median(full.raw.output$sigma2,na.rm=T),
median(full.raw.output$aic,na.rm=T),
median(full.raw.output$r.squared,na.rm=T),
median(full.raw.output$mse,na.rm=T)),
c(median(full.sim.output$intercept,na.rm=T),
median(full.sim.output$slope,na.rm=T),
median(full.sim.output$sigma2,na.rm=T),
median(full.sim.output$aic,na.rm=T),
median(full.sim.output$r.squared,na.rm=T),
median(full.sim.output$mse,na.rm=T))
),3)
colnames(result0)<-c(
"intercept",
"slope",
"sigma2",
"aic",
"r.squared",
"mse"
)
rownames(result0)<-c("raw","sims")
library(xtable)
xtable(result0,digits=3)
## % latex table generated in R 4.2.0 by xtable 1.8-4 package
## % Sat Sep 10 21:09:37 2022
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrr}
## \hline
## & intercept & slope & sigma2 & aic & r.squared & mse \\
## \hline
## raw & 4.992 & 2.981 & 25.742 & 1114.414 & 0.324 & 2.317 \\
## sims & 4.997 & 2.999 & 26.638 & 1117.378 & 0.364 & 2.328 \\
## \hline
## \end{tabular}
## \end{table}
#############################
###### paired t test #######
############################
removeoutlier<-function(data=data){
quartiles <- quantile(data, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(data)
Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR
data_no_outlier <- subset(data, data > Lower & data < Upper)
return(list(data_no_outlier=data_no_outlier, pos= (data > Lower & data < Upper) ))
}
###### intercept #######
#t.test(full.raw.output$intercept, full.sim.output$intercept, alternative = "two.sided", var.equal = FALSE)
full.raw.output.intercept<-removeoutlier(full.raw.output$intercept)
full.sim.output.intercept<-removeoutlier(full.sim.output$intercept)
df.intercept<-data.frame(
intercept=c(full.raw.output.intercept$data_no_outlier, full.sim.output.intercept$data_no_outlier),
datatype=c(rep("raw",times=length(full.raw.output.intercept$data_no_outlier)),rep("sim",times=length(full.sim.output.intercept$data_no_outlier)))
)
p.intercept <- ggboxplot(df.intercept, x = "datatype", y = "intercept",
color = "datatype", palette =c("#00AFBB", "#E7B800"),
add = "jitter", shape = "datatype")+ylim(4,6)
my_comparisons <- list( c("raw", "sim"))
p.intercept <- p.intercept + stat_compare_means(comparisons = my_comparisons)+ggtitle("Intercept")+theme(plot.title = element_text(hjust = 0.5))+ylab("Value")+ theme(legend.position="none")
p.intercept
###### slope #######
t.test(full.raw.output$slope, full.sim.output$slope, alternative = "two.sided", var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: full.raw.output$slope and full.sim.output$slope
## t = -0.67983, df = 307.7, p-value = 0.4971
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.16464141 0.08008885
## sample estimates:
## mean of x mean of y
## 2.943177 2.985453
full.raw.output.slope<-removeoutlier(full.raw.output$slope)
full.sim.output.slope<-removeoutlier(full.sim.output$slope)
df.slope<-data.frame(
slope=c(full.raw.output.slope$data_no_outlier, full.sim.output.slope$data_no_outlier),
datatype=c(rep("raw",times=length(full.raw.output.slope$data_no_outlier)),rep("sim",times=length(full.sim.output.slope$data_no_outlier)))
)
p.slope <- ggboxplot(df.slope, x = "datatype", y = "slope",
color = "datatype", palette =c("#00AFBB", "#E7B800"),
add = "jitter", shape = "datatype")+ylim(2,4)
my_comparisons <- list( c("raw", "sim"))
p.slope <- p.slope + stat_compare_means(comparisons = my_comparisons)+ggtitle("Slope")+theme(plot.title = element_text(hjust = 0.5))+ylab("Value")+ theme(legend.position="none")
p.slope
###### sigma2 #######
t.test(full.raw.output$sigma2, full.sim.output$sigma2, alternative = "two.sided", var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: full.raw.output$sigma2 and full.sim.output$sigma2
## t = -3.4443, df = 502.94, p-value = 0.0006205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -360.49221 -98.60907
## sample estimates:
## mean of x mean of y
## 384.2519 613.8025
# https://universeofdatascience.com/how-to-remove-outliers-from-data-in-r/
# remove outlier
full.raw.output.sigma2<-removeoutlier(full.raw.output$sigma2)
full.sim.output.sigma2<-removeoutlier(full.sim.output$sigma2)
df.sigma2<-data.frame(
sigma2=c(full.raw.output.sigma2$data_no_outlier, full.sim.output.sigma2$data_no_outlier),
datatype=c(rep("raw",times=length(full.raw.output.sigma2$data_no_outlier)),rep("sim",times=length(full.sim.output.sigma2$data_no_outlier)))
)
p.sigma2 <- ggboxplot(df.sigma2,
x = "datatype",
y = "sigma2",
color = "datatype",
palette =c("#00AFBB", "#E7B800"),
add = "jitter",
shape = "datatype")
my_comparisons <- list( c("raw", "sim"))
p.sigma2 <- p.sigma2 + stat_compare_means(comparisons = my_comparisons) +ggtitle( bquote(sigma^2))+theme(plot.title = element_text(hjust = 0.5))+ylab("Value")+ theme(legend.position="none")
p.sigma2