#let's clean our work environment first
rm(list = ls())
#set up the working directory
setwd ('C:\\Users\\u23916\\Dropbox\\R workshop\\2021-22 series\\session 12')
##LOGISTIC GENERALIZED LINEAR MODEL
#upload data
boar_data <- read.csv ('Boar.csv', header = T)
#The outcome variable has a binary structure (presence-absence),
#so we need a GLM with a binomial error structure and a logistic link. Let's run a logistic GLM with the boar data:
#we want to test whether age, length and sex significantly predict a boar's likelihood of being
#infected by tuberculosis
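#(a brief note: a logistic GLM models the infection probability p through the logit link,
#log(p / (1 - p)) = b0 + b1*LengthCT + ..., so the coefficients are on the log-odds scale)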
#let's explore the data first
##DATA EXPLORATION##
#1.remove missing values#
colSums (is.na(boar_data)) #identify whether there are missing values
boar_data2 <- na.exclude (boar_data) #create a new data set without missing values
#2.check for outliers using Cleveland dotplot#
dotchart (boar_data2 [,"LengthCT"])
#To assess collinearity between continuous covariates and categorical covariates,
#conditional boxplots can be used.
source (file = "HighstatLibV8.R")
install.packages("lattice")
library (lattice)
Mybwplot(boar_data2, "LengthCT", "AgeClass") #LengthCT and AgeClass are clearly collinear
Mybwplot(boar_data2, "LengthCT", "SEX")
#because LengthCT and AgeClass are collinear, we include only LengthCT (plus SEX) in the logistic GLM
glm_boar <- glm (Tb ~ LengthCT + SEX, family = binomial, data = boar_data2) #remember that glm_boar is
#an arbitrary name I gave to the model
summary (glm_boar) #let's generate the output
#using the step function we can select the best model
step (glm_boar)
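#as an aside, an alternative to step()'s AIC-based selection is a likelihood-ratio test
#for dropping each term; drop1() is base R and works for any glm object
drop1 (glm_boar, test = "Chisq")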
#step() suggests dropping SEX, so the best model is the following
glm_boar2 <- glm (Tb ~ LengthCT, family = binomial, data = boar_data2)
summary (glm_boar2)
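#to interpret the coefficients, we can exponentiate them into odds ratios
#(a small illustrative addition; confint() on a glm computes profile-likelihood intervals)
exp (coef(glm_boar2)) #change in the odds of infection per one-unit increase in LengthCT
exp (confint(glm_boar2)) #95% confidence intervals on the odds-ratio scale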
#we can plot the results; we sort by LengthCT first so the fitted curve is drawn left to right
pred_boar <- predict (glm_boar2, boar_data2, type = "response")
plot (x = boar_data2$LengthCT, y = boar_data2$Tb, xlab = "Boar Length", ylab = "Tb infection")
ord <- order (boar_data2$LengthCT)
lines (boar_data2$LengthCT[ord], pred_boar[ord])
#we can check for influential observations
plot (cooks.distance(glm_boar2), ylim = c(0,1),
ylab = "Cook distance values", type = "h") #there are not influential observations
##GENERALIZED LINEAR MIXED MODEL (GLMM) ANALYSIS
#upload data
spider_data <- read.table(file = "Spiders.txt",
header = TRUE,
dec = ".")
#before we run the GLMM, let's run the analysis with ordinary linear regression
spider_reg <- lm (Hlog10 ~ HerbLayer, data = spider_data)
summary (spider_reg)
#let's plot the results
plot (spider_data$HerbLayer, spider_data$Hlog10, xlab = "Percentage of herb layer",
ylab = "Shannon index", pch = 16)
abline (spider_reg, lwd = 3)
#so the percentage of herb layer seems to significantly explain the Shannon-Weaver index.
#However, each plot was sampled multiple times, so we need to check whether Plot ID also
#explains variation in the Shannon-Weaver index
#let's plot the residuals for each plot
spider_data$fPlot <- factor (spider_data$Plot) #convert plot from number to factor
spider_res <- resid(spider_reg) #calculate regression residuals
boxplot(spider_res ~ fPlot,
data = spider_data,
xlab = "Plot",
ylab = "Residuals",
cex.lab = 1.5)
abline(h = 0, lty = 2)
#you can see that residuals vary substantially across plots
#one way to account for this is to include Plot as a covariate in the regression model
spider_reg2 <- lm (Hlog10 ~ HerbLayer + fPlot, data = spider_data)
summary (spider_reg2)
#as the output shows, the overall model is significant (p < 0.001) and explains a higher percentage of variation (61%)
#than the model without Plot (57%). However, we now estimate 31 regression parameters, most of which we don't need.
#We want a single regression parameter for the effect of percentage of herb layer on the Shannon-Weaver index
#while controlling for Plot ID; that is exactly what a linear mixed model (LMM) does
install.packages('lmerTest')
library (lmerTest)
spider_lmm <- lmer (Hlog10 ~ HerbLayer + (1|fPlot), data = spider_data)
summary (spider_lmm)
#similar to multiple regression analysis, we can fit many predictors. When predictors are on different scales,
#however, it is recommended to z-transform the data in order to make the effect sizes of the different predictors
#comparable. Z-transformation involves subtracting the mean from each data point and dividing the difference
#by the standard deviation; z-transformed data have mean = 0 and sd = 1
#in R you can easily z-transform using the scale() function
spider_data$zgroundveg <- scale(spider_data$GroundVeg) #GroundVeg = Percentage of Ground Vegetation
spider_data$zherblayer <- scale(spider_data$HerbLayer) #HerbLayer = Percentage of Herb Layer cover
spider_data$zlitter <- scale (spider_data$Litter) #Litter = Percentage of Litter Content
#we can check mean and sd of the rescaled data
mean (spider_data$zgroundveg)
sd (spider_data$zgroundveg)
mean (spider_data$zherblayer)
sd (spider_data$zherblayer)
mean (spider_data$zlitter)
sd (spider_data$zlitter)
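#to confirm what scale() does, here is the manual equivalent for one variable (a quick sanity check)
zherb_manual <- (spider_data$HerbLayer - mean(spider_data$HerbLayer)) / sd(spider_data$HerbLayer)
all.equal (as.numeric(spider_data$zherblayer), zherb_manual) #should return TRUE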
#now we can run the model
spider_lmm2 <- lmer (Hlog10 ~ zgroundveg + zherblayer + zlitter + (1|fPlot), data = spider_data)
summary (spider_lmm2)
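#to pull out pieces of the fit individually (handy for reporting; all three functions come with lme4):
fixef (spider_lmm2) #fixed-effect estimates
ranef (spider_lmm2) #estimated random intercepts, one per plot
VarCorr (spider_lmm2) #random-effect and residual standard deviations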
#while the model above is called "random intercept",
#a more commonly used model is a random intercept and random slope model (see PowerPoint for the difference)
#here's what a random intercept and random slope model looks like
spider_lmm3 <- lmer (Hlog10 ~ zgroundveg + zherblayer + zlitter + (1 + zherblayer |fPlot), data = spider_data)
summary (spider_lmm3)
#the warning "boundary (singular) fit: see ?isSingular"
#means that one or more estimated variances are very close to zero
#this might be because the relationship between zherblayer and Hlog10 changes little
#across plots, so a random intercept alone may be enough
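#we can check this directly before plotting (isSingular and VarCorr are part of lme4)
isSingular (spider_lmm3) #TRUE confirms a singular fit
VarCorr (spider_lmm3) #look for a random-slope standard deviation near zero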
#we can also see this graphically with the following code
par(mar = c(5,5,2,2))
plot(x = spider_data$HerbLayer,
y = spider_data$Hlog10,
xlab = "Percentage of herb layer",
ylab = "Shannon index",
cex.lab = 1.5, pch = 16)
PlotLevels <- levels(spider_data$fPlot)
for (i in PlotLevels){
  #for each plot, predict over the observed range of HerbLayer and draw the fitted line
  MinMaxi <- range(spider_data$HerbLayer[spider_data$fPlot == i])
  MyDatai <- data.frame(HerbLayer = seq(from = MinMaxi[1],
                                        to = MinMaxi[2],
                                        length.out = 10),
                        fPlot = i)
  Pi <- predict(spider_reg2, newdata = MyDatai)
  lines(MyDatai$HerbLayer, Pi, lty = 1, col = 1, lwd = 1)
}
#calculate R2
install.packages ("performance")
library (performance)
r2 (spider_lmm2)
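#for mixed models, r2() reports a marginal R2 (variance explained by the fixed effects alone)
#and a conditional R2 (fixed plus random effects)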
#now we need to check the validity of the model. We can use the same approach as for regression
#1. normality of residuals
par(mfrow = c(2, 2), mar = c(5, 5, 2, 2)) #this is to generate a multi-panel graph
spider_resid_lmm <- resid (spider_lmm3)
hist(spider_resid_lmm, main = "", xlab = "Residuals")
qqnorm(spider_resid_lmm, main = "")
qqline(spider_resid_lmm)
#2.homogeneity of variance: we plot the residuals vs fitted values
spider_fitted_lmm <- fitted(spider_lmm3) #calculate fitted values
plot(spider_fitted_lmm, spider_resid_lmm, xlab = "Fitted values",
ylab = "Residuals")
abline(h = 0)
##Poisson Generalized Linear Mixed Model
pollen <- read.csv ("Pollen.csv", header = T)
#let's explore the data
#1.Cleveland plot
dotchart(pollen$Dandelion)
#2. relationship between dandelion pollen count and time, per treatment
xyplot (Dandelion ~ Time|Treatment,
xlab = "Time (days)", data = pollen, col = 1,
ylab = "Number of dandelion pollen grains",
layout = c(3,1), groups = Hive, type = "l")
#this shows no clear outliers and suggests an interaction between Time and Treatment
pollen$fHive <- factor (pollen$Hive)
pollen_glmm <- glmer (Dandelion ~ Time * Treatment + (1|fHive), data = pollen, family = poisson)
summary (pollen_glmm)
#as you can see, there is no "control" level in the output. That's because
#"control" is used as the reference (baseline) level. We can change the reference level as follows
pollen <- within(pollen, Treatment <- as.factor(Treatment))
pollen$Treatment <- relevel (pollen$Treatment, "Protein")
pollen_glmm <- glmer (Dandelion ~ Time * Treatment + (1|fHive), data = pollen, family = poisson)
summary (pollen_glmm)
#we have to check for overdispersion
pollen_resid_glmm <- resid (pollen_glmm, type = "pearson")
N <- nrow (pollen)
p <- length (fixef(pollen_glmm)) + 1 #number of fixed effects, plus one for the random-intercept variance
Overdispersion <- sum (pollen_resid_glmm^2)/ (N-p)
Overdispersion
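#as a cross-check, the performance package loaded earlier offers a ready-made test
#(assuming your version of performance supports merMod objects, which recent versions do)
check_overdispersion (pollen_glmm)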
#the dispersion statistic is well above 1, so the model is overdispersed.
#similar to Poisson GLM, we can try to understand the source of overdispersion by plotting residuals vs fitted
#values and by plotting residuals vs each covariate
#residuals vs fitted values
pollen_fitted_glmm <- fitted (pollen_glmm) #fitted() already returns response-scale values for a glmer model
par (mfrow = c(2,2), mar = c(5,5,2,2))
plot (pollen_fitted_glmm, pollen_resid_glmm, xlab = "Fitted values", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#residuals vs Time
plot (pollen$Time, pollen_resid_glmm, xlab = "Time", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#residuals vs Treatment
boxplot (pollen_resid_glmm ~ pollen$Treatment, xlab = "Treatment", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#residuals vs Hive
plot (pollen_resid_glmm ~ pollen$fHive, xlab = "Hive", ylab = "Pearson residuals")
abline (h = 0, lty = 2)
#there doesn't seem to be a clear pattern in the residuals, so it is better to switch to a negative binomial model
install.packages("glmmTMB")
library (glmmTMB)
pollen_glmmnb <- glmmTMB(Dandelion ~ Time * Treatment + (1|fHive), family = nbinom2, data = pollen)
summary (pollen_glmmnb)
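#we can compare the Poisson and negative binomial fits by AIC
#(both model objects have logLik methods, so base R's AIC works directly)
AIC (pollen_glmm, pollen_glmmnb) #the model with the lower AIC is preferred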
#let's run model validation
pollen_resid_glmmnb <- resid (pollen_glmmnb, type = "pearson")
N <- nrow (pollen)
#for glmmTMB, fixef() returns a list; the conditional-model coefficients are in $cond.
#we add 2: one for the random-intercept variance and one for the NB dispersion parameter
pnb <- length (fixef(pollen_glmmnb)$cond) + 2
Overdispersionnb <- sum (pollen_resid_glmmnb^2)/ (N - pnb)
Overdispersionnb
#model is no longer overdispersed