rm(list=ls())
library(ggpubr)
## 載入需要的套件:ggplot2
library(dplyr)
## 
## 載入套件:'dplyr'
## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag
## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nonpar)

setwd("~/Dropbox/JournalSubmission/0phyregDiversity/rcode/sims/")
load("fulloutput.rda")

#load(url("https://tonyjhwueng.info/pcmreg/fulloutput.rda"))
####################
###full.raw.output##
####################

full.raw.output<-as.data.frame(full.raw.output)
full.raw.output$intercept<-as.numeric(full.raw.output$intercept)
full.raw.output$slope<-as.numeric(full.raw.output$slope)
full.raw.output$aic<-as.numeric(full.raw.output$aic)
full.raw.output$r.squared<-as.numeric(full.raw.output$r.squared)
full.raw.output$rsq<-as.numeric(full.raw.output$r.squared)
full.raw.output$sigma2<-as.numeric(full.raw.output$sigma2)
full.raw.output$mse<-as.numeric(full.raw.output$mse)
full.raw.output$size<-as.factor(full.raw.output$size)
full.raw.output$tree<-as.factor(full.raw.output$tree)
full.raw.output$model<-as.factor(full.raw.output$model)
str(full.raw.output)
## 'data.frame':    308 obs. of  10 variables:
##  $ intercept: num  4.8 4.41 4.65 5.08 5.7 ...
##  $ slope    : num  3.08 3.74 4.42 3.18 1.47 ...
##  $ sigma2   : num  14.1 38.6 116.6 196.2 243.1 ...
##  $ aic      : num  85.1 211.1 459.2 701.6 922.5 ...
##  $ r.squared: num  0.52705 0.31968 0.17111 0.06746 0.00956 ...
##  $ mse      : num  1.45 2.02 2.61 2.35 3.28 ...
##  $ tree     : Factor w/ 4 levels "BalancedTree",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ model    : Factor w/ 7 levels "BM","delta","EB",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ size     : Factor w/ 11 levels "100","150","20",..: 3 10 1 2 4 5 6 7 8 9 ...
##  $ rsq      : num  0.52705 0.31968 0.17111 0.06746 0.00956 ...
ls(full.raw.output)
##  [1] "aic"       "intercept" "model"     "mse"       "r.squared" "rsq"      
##  [7] "sigma2"    "size"      "slope"     "tree"
####################
###full.sim.output##
####################

head(full.sim.output)
##                       intercept          slope              sigma2            
## temp.sim.clade.output "4.52970746363137" "4.0402783799179"  "11.281103254218" 
## temp.sim.root.output  "4.61755031685766" "3.73183075438454" "24.2752871835689"
## temp.sim.clade.output "4.03380673311132" "3.67543134070236" "16.6221528686918"
## temp.sim.root.output  "4.48323972916464" "3.26835306446401" "23.4297256659566"
## temp.sim.clade.output "4.97649883813443" "3.04383334651632" "8.74081904505687"
## temp.sim.root.output  "4.83391799459114" "3.210843499219"   "8.91847058127604"
##                       aic                r.squared           mse               
## temp.sim.clade.output "90.937856921515"  "0.596650814845572" "2.38209113851361"
## temp.sim.root.output  "97.5337287023863" "0.382736192466198" "2.1248245133016" 
## temp.sim.clade.output "98.690002288419"  "0.430769786524581" "2.75999961502552"
## temp.sim.root.output  "96.8246626874404" "0.286404316904917" "2.82911020382223"
## temp.sim.clade.output "85.835353921787"  "0.635626204758499" "2.42642099643274"
## temp.sim.root.output  "77.5070412595935" "0.600091773475409" "2.24448910492674"
##                       tree           model size coltype collevel sims
## temp.sim.clade.output "BalancedTree" "BM"  "20" "clade" "1"      "1" 
## temp.sim.root.output  "BalancedTree" "BM"  "20" "root"  "1"      "1" 
## temp.sim.clade.output "BalancedTree" "BM"  "20" "clade" "1"      "2" 
## temp.sim.root.output  "BalancedTree" "BM"  "20" "root"  "1"      "2" 
## temp.sim.clade.output "BalancedTree" "BM"  "20" "clade" "1"      "3" 
## temp.sim.root.output  "BalancedTree" "BM"  "20" "root"  "1"      "3"
full.sim.output<-as.data.frame(full.sim.output)
dim(full.sim.output)
## [1] 492800     12
colnames(full.sim.output)
##  [1] "intercept" "slope"     "sigma2"    "aic"       "r.squared" "mse"      
##  [7] "tree"      "model"     "size"      "coltype"   "collevel"  "sims"
head(full.sim.output)
##                                intercept            slope           sigma2
## temp.sim.clade.output   4.52970746363137  4.0402783799179  11.281103254218
## temp.sim.root.output    4.61755031685766 3.73183075438454 24.2752871835689
## temp.sim.clade.output.1 4.03380673311132 3.67543134070236 16.6221528686918
## temp.sim.root.output.1  4.48323972916464 3.26835306446401 23.4297256659566
## temp.sim.clade.output.2 4.97649883813443 3.04383334651632 8.74081904505687
## temp.sim.root.output.2  4.83391799459114   3.210843499219 8.91847058127604
##                                      aic         r.squared              mse
## temp.sim.clade.output    90.937856921515 0.596650814845572 2.38209113851361
## temp.sim.root.output    97.5337287023863 0.382736192466198  2.1248245133016
## temp.sim.clade.output.1  98.690002288419 0.430769786524581 2.75999961502552
## temp.sim.root.output.1  96.8246626874404 0.286404316904917 2.82911020382223
## temp.sim.clade.output.2  85.835353921787 0.635626204758499 2.42642099643274
## temp.sim.root.output.2  77.5070412595935 0.600091773475409 2.24448910492674
##                                 tree model size coltype collevel sims
## temp.sim.clade.output   BalancedTree    BM   20   clade        1    1
## temp.sim.root.output    BalancedTree    BM   20    root        1    1
## temp.sim.clade.output.1 BalancedTree    BM   20   clade        1    2
## temp.sim.root.output.1  BalancedTree    BM   20    root        1    2
## temp.sim.clade.output.2 BalancedTree    BM   20   clade        1    3
## temp.sim.root.output.2  BalancedTree    BM   20    root        1    3
full.sim.output$intercept<-as.numeric(full.sim.output$intercept)
full.sim.output$slope<-as.numeric(full.sim.output$slope)
full.sim.output$sigma2<-as.numeric(full.sim.output$sigma2)
full.sim.output$aic<-as.numeric(full.sim.output$aic)
full.sim.output$r.squared<-as.numeric(full.sim.output$r.squared)
full.sim.output$rsq<-as.numeric(full.sim.output$r.squared)
full.sim.output$mse<-as.numeric(full.sim.output$mse)
full.sim.output$size<-as.factor(full.sim.output$size)
full.sim.output$sims<-as.numeric(full.sim.output$sims)
full.sim.output$collevel<-as.factor(full.sim.output$collevel)
full.sim.output$coltype<-as.factor(full.sim.output$coltype)
full.sim.output$tree<-as.factor(full.sim.output$tree)
full.sim.output$model<-as.factor(full.sim.output$model)
full.sim.output<-full.sim.output%>%
  mutate(colbil=ifelse(collevel %in%c(1:2),1, 0))
full.sim.output$colbil<-as.factor(full.sim.output$colbil)
str(full.sim.output)
## 'data.frame':    492800 obs. of  14 variables:
##  $ intercept: num  4.53 4.62 4.03 4.48 4.98 ...
##  $ slope    : num  4.04 3.73 3.68 3.27 3.04 ...
##  $ sigma2   : num  11.28 24.28 16.62 23.43 8.74 ...
##  $ aic      : num  90.9 97.5 98.7 96.8 85.8 ...
##  $ r.squared: num  0.597 0.383 0.431 0.286 0.636 ...
##  $ mse      : num  2.38 2.12 2.76 2.83 2.43 ...
##  $ tree     : Factor w/ 4 levels "BalancedTree",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ model    : Factor w/ 7 levels "BM","delta","EB",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ size     : Factor w/ 11 levels "100","150","20",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ coltype  : Factor w/ 2 levels "clade","root": 1 2 1 2 1 2 1 2 1 2 ...
##  $ collevel : Factor w/ 8 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sims     : num  1 1 2 2 3 3 4 4 5 5 ...
##  $ rsq      : num  0.597 0.383 0.431 0.286 0.636 ...
##  $ colbil   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
ls(full.sim.output)
##  [1] "aic"       "colbil"    "collevel"  "coltype"   "intercept" "model"    
##  [7] "mse"       "r.squared" "rsq"       "sigma2"    "sims"      "size"     
## [13] "slope"     "tree"
result0<-round(rbind(
c(median(full.raw.output$intercept,na.rm=T),
median(full.raw.output$slope,na.rm=T),
median(full.raw.output$sigma2,na.rm=T),
median(full.raw.output$aic,na.rm=T),
median(full.raw.output$r.squared,na.rm=T),
median(full.raw.output$mse,na.rm=T)),
c(median(full.sim.output$intercept,na.rm=T),
median(full.sim.output$slope,na.rm=T),
median(full.sim.output$sigma2,na.rm=T),
median(full.sim.output$aic,na.rm=T),
median(full.sim.output$r.squared,na.rm=T),
median(full.sim.output$mse,na.rm=T))
),3)

colnames(result0)<-c(
  "intercept", 
  "slope",    
  "sigma2",    
  "aic",      
  "r.squared", 
  "mse"      
)
rownames(result0)<-c("raw","sims")
library(xtable)
xtable(result0,digits=3)
## % latex table generated in R 4.2.0 by xtable 1.8-4 package
## % Sat Sep 10 21:09:37 2022
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrr}
##   \hline
##  & intercept & slope & sigma2 & aic & r.squared & mse \\ 
##   \hline
## raw & 4.992 & 2.981 & 25.742 & 1114.414 & 0.324 & 2.317 \\ 
##   sims & 4.997 & 2.999 & 26.638 & 1117.378 & 0.364 & 2.328 \\ 
##    \hline
## \end{tabular}
## \end{table}
#############################
###### paired t test #######
############################


removeoutlier<-function(data=data){
  quartiles <- quantile(data, probs=c(.25, .75), na.rm = FALSE)
  IQR <- IQR(data)
  Lower <- quartiles[1] - 1.5*IQR
  Upper <- quartiles[2] + 1.5*IQR 
  data_no_outlier <- subset(data, data > Lower & data < Upper)
  return(list(data_no_outlier=data_no_outlier,  pos= (data > Lower & data < Upper) ))
}

###### intercept #######
#t.test(full.raw.output$intercept, full.sim.output$intercept, alternative = "two.sided", var.equal = FALSE)

full.raw.output.intercept<-removeoutlier(full.raw.output$intercept)
full.sim.output.intercept<-removeoutlier(full.sim.output$intercept)

df.intercept<-data.frame(
  intercept=c(full.raw.output.intercept$data_no_outlier, full.sim.output.intercept$data_no_outlier),  
  datatype=c(rep("raw",times=length(full.raw.output.intercept$data_no_outlier)),rep("sim",times=length(full.sim.output.intercept$data_no_outlier)))
)

p.intercept <- ggboxplot(df.intercept, x = "datatype", y = "intercept",
                         color = "datatype", palette =c("#00AFBB", "#E7B800"),
                         add = "jitter", shape = "datatype")+ylim(4,6)
my_comparisons <- list( c("raw", "sim"))
p.intercept <- p.intercept + stat_compare_means(comparisons = my_comparisons)+ggtitle("Intercept")+theme(plot.title = element_text(hjust = 0.5))+ylab("Value")+ theme(legend.position="none")
p.intercept

###### slope #######
t.test(full.raw.output$slope, full.sim.output$slope, alternative = "two.sided", var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  full.raw.output$slope and full.sim.output$slope
## t = -0.67983, df = 307.7, p-value = 0.4971
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.16464141  0.08008885
## sample estimates:
## mean of x mean of y 
##  2.943177  2.985453
full.raw.output.slope<-removeoutlier(full.raw.output$slope)
full.sim.output.slope<-removeoutlier(full.sim.output$slope)

df.slope<-data.frame(
  slope=c(full.raw.output.slope$data_no_outlier, full.sim.output.slope$data_no_outlier),  
  datatype=c(rep("raw",times=length(full.raw.output.slope$data_no_outlier)),rep("sim",times=length(full.sim.output.slope$data_no_outlier)))
)

p.slope <- ggboxplot(df.slope, x = "datatype", y = "slope",
                     color = "datatype", palette =c("#00AFBB", "#E7B800"),
                     add = "jitter", shape = "datatype")+ylim(2,4)
my_comparisons <- list( c("raw", "sim"))
p.slope <- p.slope + stat_compare_means(comparisons = my_comparisons)+ggtitle("Slope")+theme(plot.title = element_text(hjust = 0.5))+ylab("Value")+ theme(legend.position="none")
p.slope

###### sigma2 #######
t.test(full.raw.output$sigma2, full.sim.output$sigma2, alternative = "two.sided", var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  full.raw.output$sigma2 and full.sim.output$sigma2
## t = -3.4443, df = 502.94, p-value = 0.0006205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -360.49221  -98.60907
## sample estimates:
## mean of x mean of y 
##  384.2519  613.8025
# https://universeofdatascience.com/how-to-remove-outliers-from-data-in-r/
# remove outlier

full.raw.output.sigma2<-removeoutlier(full.raw.output$sigma2)
full.sim.output.sigma2<-removeoutlier(full.sim.output$sigma2)

df.sigma2<-data.frame(
  sigma2=c(full.raw.output.sigma2$data_no_outlier, full.sim.output.sigma2$data_no_outlier),  
  datatype=c(rep("raw",times=length(full.raw.output.sigma2$data_no_outlier)),rep("sim",times=length(full.sim.output.sigma2$data_no_outlier)))
)

p.sigma2 <- ggboxplot(df.sigma2, 
                      x = "datatype", 
                      y = "sigma2",
                      color = "datatype", 
                      palette =c("#00AFBB", "#E7B800"),
                      add = "jitter",
                      shape = "datatype")
my_comparisons <- list( c("raw", "sim"))
p.sigma2 <- p.sigma2 + stat_compare_means(comparisons = my_comparisons) +ggtitle( bquote(sigma^2))+theme(plot.title = element_text(hjust = 0.5))+ylab("Value")+ theme(legend.position="none")
p.sigma2