#let's clean our work environment first
rm(list = ls())
#set up the working directory
setwd ('C:\\Users\\u23916\\Dropbox\\R workshop\\2021-22 series\\session 12')
##LOGISTIC GENERALIZED LINEAR MODEL
#upload data
boar_data <- read.csv ('Boar.csv', header = T)
#The outcome variable has a binary structure (presence-absence),
#so we need a GLM with a binomial error structure and a logistic link. Let's run a logistic GLM with the boar data:
#we want to test whether age, length and sex significantly predict a boar's likelihood of being
#infected by tuberculosis
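#(a brief note: a logistic GLM models the infection probability p through the logit link,
#log(p / (1 - p)) = b0 + b1*LengthCT + ..., so the coefficients are on the log-odds scale)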
#let's explore the data first
##DATA EXPLORATION##
#1.remove missing values#
colSums (is.na(boar_data)) #identify whether there are missing values
boar_data2 <- na.exclude (boar_data) #create a new data set without missing values
#2.check for outliers using Cleveland dotplot#
dotchart (boar_data2 [,"LengthCT"])
#To assess collinearity between continuous covariates and categorical covariates,
#conditional boxplots can be used.
source (file = "HighstatLibV8.R")
install.packages("lattice")
library (lattice)
Mybwplot(boar_data2, "LengthCT", "AgeClass") #LengthCT and AgeClass are clearly collinear
Mybwplot(boar_data2, "LengthCT", "SEX")
#because LengthCT and AgeClass are collinear, we include only LengthCT (plus SEX) in the logistic GLM
glm_boar <- glm (Tb ~ LengthCT + SEX, family = binomial, data = boar_data2) #remember that glm_boar is
#an arbitrary name I gave to the model
summary (glm_boar) #let's generate the output
#using the step function we can select the best model
step (glm_boar)
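#as an aside, an alternative to step()'s AIC-based selection is a likelihood-ratio test
#for dropping each term; drop1() is base R and works for any glm object
drop1 (glm_boar, test = "Chisq")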
#step() suggests dropping SEX, so the best model is the following
glm_boar2 <- glm (Tb ~ LengthCT, family = binomial, data = boar_data2)
summary (glm_boar2)
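#to interpret the coefficients, we can exponentiate them into odds ratios
#(a small illustrative addition; confint() on a glm computes profile-likelihood intervals)
exp (coef(glm_boar2)) #change in the odds of infection per one-unit increase in LengthCT
exp (confint(glm_boar2)) #95% confidence intervals on the odds-ratio scale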
#we can plot the results; we sort by LengthCT first so the fitted curve is drawn left to right
pred_boar <- predict (glm_boar2, boar_data2, type = "response")
plot (x = boar_data2$LengthCT, y = boar_data2$Tb, xlab = "Boar Length", ylab = "Tb infection")
ord <- order (boar_data2$LengthCT)
lines (boar_data2$LengthCT[ord], pred_boar[ord])
#we can check for influential observations
plot (cooks.distance(glm_boar2), ylim = c(0,1),
ylab = "Cook distance values", type = "h") #there are not influential observations
##GENERALIZED LINEAR MIXED MODEL (GLMM) ANALYSIS
#upload data
spider_data <- read.table(file = "Spiders.txt",
header = TRUE,
dec = ".")
#before we run the GLMM, let's run the analysis with ordinary linear regression
spider_reg <- lm (Hlog10 ~ HerbLayer, data = spider_data)
summary (spider_reg)
#let's plot the results
plot (spider_data$HerbLayer, spider_data$Hlog10, xlab = "Percentage of herb layer",
ylab = "Shannon index", pch = 16)
abline (spider_reg, lwd = 3)
#so the percentage of herb layer seems to significantly explain the Shannon-Weaver index.
#However, each plot was sampled multiple times, so we need to check whether Plot ID also
#explains variation in the Shannon-Weaver index
#let's plot the residuals for each plot
spider_data$fPlot <- factor (spider_data$Plot) #convert plot from number to factor
spider_res <- resid(spider_reg) #calculate regression residuals
boxplot(spider_res ~ fPlot,
data = spider_data,
xlab = "Plot",
ylab = "Residuals",
cex.lab = 1.5)
abline(h = 0, lty = 2)
#you can see that residuals vary substantially across plots
#one way to account for this is to include Plot as a covariate in the regression model
spider_reg2 <- lm (Hlog10 ~ HerbLayer + fPlot, data = spider_data)
summary (spider_reg2)
#as the output shows, the overall model is significant (p < 0.001) and explains a higher percentage of variation (61%)
#than the model without Plot (57%). However, we now estimate 31 regression parameters, most of which we don't need.
#We want a single regression parameter for the effect of percentage of herb layer on the Shannon-Weaver index
#while controlling for Plot ID; that is exactly what a linear mixed model (LMM) does
install.packages('lmerTest')
library (lmerTest)
spider_lmm <- lmer (Hlog10 ~ HerbLayer + (1|fPlot), data = spider_data)
summary (spider_lmm)
#similar to multiple regression analysis, we can fit many predictors. When predictors are on different scales,
#however, it is recommended to z-transform the data in order to make the effect sizes of the different predictors
#comparable. Z-transformation involves subtracting the mean from each data point and dividing the difference
#by the standard deviation; z-transformed data have mean = 0 and sd = 1
#in R you can easily z-transform using the scale() function
spider_data$zgroundveg <- scale(spider_data$GroundVeg) #GroundVeg = Percentage of Ground Vegetation
spider_data$zherblayer <- scale(spider_data$HerbLayer) #HerbLayer = Percentage of Herb Layer cover
spider_data$zlitter <- scale (spider_data$Litter) #Litter = Percentage of Litter Content
#we can check mean and sd of the rescaled data
mean (spider_data$zgroundveg)
sd (spider_data$zgroundveg)
mean (spider_data$zherblayer)
sd (spider_data$zherblayer)
mean (spider_data$zlitter)
sd (spider_data$zlitter)
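#to confirm what scale() does, here is the manual equivalent for one variable (a quick sanity check)
zherb_manual <- (spider_data$HerbLayer - mean(spider_data$HerbLayer)) / sd(spider_data$HerbLayer)
all.equal (as.numeric(spider_data$zherblayer), zherb_manual) #should return TRUE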
#now we can run the model
spider_lmm2 <- lmer (Hlog10 ~ zgroundveg + zherblayer + zlitter + (1|fPlot), data = spider_data)
summary (spider_lmm2)
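#to pull out pieces of the fit individually (handy for reporting; all three functions come with lme4):
fixef (spider_lmm2) #fixed-effect estimates
ranef (spider_lmm2) #estimated random intercepts, one per plot
VarCorr (spider_lmm2) #random-effect and residual standard deviations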
#while the model above is called "random intercept",
#a more commonly used model is a random intercept and random slope model (see PowerPoint for the difference)
#here's what a random intercept and random slope model looks like
spider_lmm3 <- lmer (Hlog10 ~ zgroundveg + zherblayer + zlitter + (1 + zherblayer |fPlot), data = spider_data)
summary (spider_lmm3)
#the warning "boundary (singular) fit: see ?isSingular"
#means that one or more estimated variances are very close to zero
#this might be because the relationship between zherblayer and Hlog10 changes little
#across plots, so a random intercept alone may be enough
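#we can check this directly before plotting (isSingular and VarCorr are part of lme4)
isSingular (spider_lmm3) #TRUE confirms a singular fit
VarCorr (spider_lmm3) #look for a random-slope standard deviation near zero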
#we can also see this graphically with the following code
par(mar = c(5,5,2,2))
plot(x = spider_data$HerbLayer,
y = spider_data$Hlog10,
xlab = "Percentage of herb layer",
ylab = "Shannon index",
cex.lab = 1.5, pch = 16)
PlotLevels <- levels(spider_data$fPlot)
for (i in PlotLevels){
  #for each plot, predict over the observed range of HerbLayer and draw the fitted line
  MinMaxi <- range(spider_data$HerbLayer[spider_data$fPlot == i])
  MyDatai <- data.frame(HerbLayer = seq(from = MinMaxi[1],
                                        to = MinMaxi[2],
                                        length.out = 10),
                        fPlot = i)
  Pi <- predict(spider_reg2, newdata = MyDatai)
  lines(MyDatai$HerbLayer, Pi, lty = 1, col = 1, lwd = 1)
}
#calculate R2
install.packages ("performance")
library (performance)
r2 (spider_lmm2)
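#for mixed models, r2() reports a marginal R2 (variance explained by the fixed effects alone)
#and a conditional R2 (fixed plus random effects)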
#now we need to check the validity of the model. We can use the same approach as for regression
#1. normality of residuals
par(mfrow = c(2, 2), mar = c(5, 5, 2, 2)) #this is to generate a multi-panel graph
spider_resid_lmm <- resid (spider_lmm3)
hist(spider_resid_lmm, main = "", xlab = "Residuals")
qqnorm(spider_resid_lmm, main = "")
qqline(spider_resid_lmm)
#2.homogeneity of variance: we plot the residuals vs fitted values
spider_fitted_lmm <- fitted(spider_lmm3) #calculate fitted values
plot(spider_fitted_lmm, spider_resid_lmm, xlab = "Fitted values",
ylab = "Residuals")
abline(h = 0)
##Poisson Generalized Linear Mixed Model
pollen <- read.csv ("Pollen.csv", header = T)
#let's explore the data
#1.Cleveland plot
dotchart(pollen$Dandelion)
#2. relationship between dandelion pollen count and time, per treatment
xyplot (Dandelion ~ Time|Treatment,
xlab = "Time (days)", data = pollen, col = 1,
ylab = "Number of dandelion pollen grains",
layout = c(3,1), groups = Hive, type = "l")
#this shows no clear outliers and suggests an interaction between Time and Treatment
pollen$fHive <- factor (pollen$Hive)
pollen_glmm <- glmer (Dandelion ~ Time * Treatment + (1|fHive), data = pollen, family = poisson)
summary (pollen_glmm)
#as you can see, there is no "control" level in the output. That's because
#"control" is used as the reference (baseline) level. We can change the reference level as follows
pollen <- within(pollen, Treatment <- as.factor(Treatment))
pollen$Treatment <- relevel (pollen$Treatment, "Protein")
pollen_glmm <- glmer (Dandelion ~ Time * Treatment + (1|fHive), data = pollen, family = poisson)
summary (pollen_glmm)
#we have to check for overdispersion
pollen_resid_glmm <- resid (pollen_glmm, type = "pearson")
N <- nrow (pollen)
p <- length (fixef(pollen_glmm)) + 1 #number of fixed effects, plus one for the random-intercept variance
Overdispersion <- sum (pollen_resid_glmm^2)/ (N-p)
Overdispersion
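#as a cross-check, the performance package loaded earlier offers a ready-made test
#(assuming your version of performance supports merMod objects, which recent versions do)
check_overdispersion (pollen_glmm)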
#the dispersion statistic is well above 1, so the model is overdispersed.
#similar to Poisson GLM, we can try to understand the source of overdispersion by plotting residuals vs fitted
#values and by plotting residuals vs each covariate
#residuals vs fitted values
pollen_fitted_glmm <- fitted (pollen_glmm) #fitted() already returns response-scale values for a glmer model
par (mfrow = c(2,2), mar = c(5,5,2,2))
plot (pollen_fitted_glmm, pollen_resid_glmm, xlab = "Fitted values", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#residuals vs Time
plot (pollen$Time, pollen_resid_glmm, xlab = "Time", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#residuals vs Treatment
boxplot (pollen_resid_glmm ~ pollen$Treatment, xlab = "Treatment", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#residuals vs Hive
plot (pollen_resid_glmm ~ pollen$fHive, xlab = "Hive", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#there doesn't seem to be a clear pattern in the residuals, so it is better to switch to a negative binomial model
install.packages("glmmTMB")
library (glmmTMB)
pollen_glmmnb <- glmmTMB(Dandelion ~ Time * Treatment + (1|fHive), family = nbinom2, data = pollen)
summary (pollen_glmmnb)
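#we can compare the Poisson and negative binomial fits by AIC
#(both model objects have logLik methods, so base R's AIC works directly)
AIC (pollen_glmm, pollen_glmmnb) #the model with the lower AIC is preferred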
#let's run model validation
pollen_resid_glmmnb <- resid (pollen_glmmnb, type = "pearson")
N <- nrow (pollen)
#for glmmTMB, fixef() returns a list; the conditional-model coefficients are in $cond.
#we add 2: one for the random-intercept variance and one for the NB dispersion parameter
pnb <- length (fixef(pollen_glmmnb)$cond) + 2
Overdispersionnb <- sum (pollen_resid_glmmnb^2)/ (N - pnb)
Overdispersionnb
#model is no longer overdispersed