#let's clean our work environment first
rm(list = ls())

#set up the working directory
setwd('C:\\Users\\kabur\\Dropbox\\teaching\\Animal Behaviour\\Research methods\\2020-2021\\practical_2')

#upload the data
HVS <- read.csv("HVS.csv", header = T)

####CORRELATION####
#let's run the correlation between orbital volume and latitude
#let's first check the data distribution
#to check the data distribution we use the shapiro.test
shapiro.test(HVS$OrbV)   #the distribution of orbital volume is normal
shapiro.test(HVS$LatAbs) #data are not normally distributed for latitude
#so a Spearman's rank correlation test is the best option in this case, but I'll show how to run
#the analysis with both Pearson's correlation and Spearman's
cor.test(HVS$OrbV, HVS$LatAbs)
cor.test(HVS$OrbV, HVS$LatAbs, method = "spearman")
plot(HVS$LatAbs, HVS$OrbV, xlab = "Absolute Latitude", ylab = "Orbital Volume") #note: the variable is LatAbs

#How to report the output:
#Our correlation analysis examining the association between absolute latitude and orbital volume
#showed a positive correlation between the two variables (r = 0.53, n = 55, p < 0.001)
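#if you prefer to pull the numbers for the report straight from R rather than copying them from
#the console, the test output can be stored and its elements accessed. A minimal sketch
#("cor_out" is just an illustrative name, not part of the original practical):
cor_out <- cor.test(HVS$OrbV, HVS$LatAbs)
cor_out$estimate #the correlation coefficient r
cor_out$p.value  #the p-value
sum(complete.cases(HVS$OrbV, HVS$LatAbs)) #n, the number of complete pairs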
####LINEAR REGRESSION####
#now you have to install and load the relevant packages
install.packages('lattice') #the "lattice" package is necessary for multi-panel graphs
install.packages('car')     #the "car" package provides the "vif" function, which we need to check for collinearity
library(lattice)
library(car)
source(file = "HighstatLibV8.R")
#HighstatLibV8.R is a script of helper functions that I downloaded from the book
#"Beginner's Guide to Generalized Additive Models with R". It will help with data exploration

##DATA EXPLORATION##
#1. remove missing values
colSums(is.na(HVS)) #the output shows that there is one missing value (for the FM variable)
HVS2 <- na.exclude(HVS)
#we remove the missing value. Note that you have now created a new data set (HVS2) which is the
#same as the original except that the missing value is gone. Also note that na.exclude deletes
#the entire row that contains the missing value
colSums(is.na(HVS2)) #let's check again for missing values

#2. check for outliers using a Cleveland dotplot
MyVar <- c("OrbV", "LatAbs", "CC", "FM", "MI", "MT") #a vector ("MyVar") with only the continuous variables
Mydotplot(HVS2[, MyVar]) #Mydotplot is a function contained in HighstatLibV8.R
#there are no obvious outliers

#3. check for collinearity between predictors
MyVar2 <- c("LatAbs", "CC", "FM", "MI", "MT") #a vector ("MyVar2") with all continuous predictors
Mypairs(HVS2[, MyVar2])
#the panel shows strong correlations between latitude and illuminance, between latitude and
#temperature, and between illuminance and temperature
#To assess collinearity between continuous covariates and categorical covariates,
#conditional boxplots can be used
Mybwplot(HVS2, MyVar2, "Gender")
#Gender seems to be correlated with CC
#CONCLUSION: given the collinearity between temperature, illuminance and latitude, we can drop
#temperature and illuminance. Also, given the collinearity between CC and Gender, we can drop
#Gender and keep CC

#now we can run the multiple linear regression
reg_model <- lm(OrbV ~ LatAbs + CC + FM, data = HVS2)
summary(reg_model)
#the results show a significant positive relationship between latitude and orbital volume and
#between cranial capacity and orbital volume

#we can also use stepwise model selection to find a regression model that fits better than reg_model.
#there are different ways to do model selection. One of them is the "step" function, which uses the
#Akaike Information Criterion (AIC): the lower the AIC value, the better the model
step(reg_model)
#the output confirms that the best model is the one that includes only latitude and CC
reg_model2 <- lm(OrbV ~ LatAbs + CC, data = HVS2)
summary(reg_model2)

##Plot the regression##
install.packages("ggplot2")
library("ggplot2")
ggplot(HVS2, aes(CC, OrbV)) +
  xlab("Cranial Capacity") +
  ylab("Orbital Volume") +
  geom_point() +
  stat_smooth(method = lm) +
  theme(plot.title = element_text(color = "black", size = 20, hjust = .5),
        axis.text = element_text(color = "black", size = 20, hjust = 1, vjust = .5, face = "plain"),
        axis.title = element_text(color = "black", size = 20, vjust = 2, face = "plain"),
        panel.background = element_rect(fill = "transparent", colour = NA),
        axis.line = element_line(color = "black", size = .2))

#now we need to check the validity of the model. To do that we check several assumptions
#1. homogeneity of variance: we plot the residuals vs the fitted values
E2 <- rstandard(reg_model2) #standardized residuals
F2 <- fitted(reg_model2)    #fitted values
plot(x = F2, y = E2, xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
#INTERPRETATION: this graph does not show any specific pattern, either in spread or in groups of
#residuals being consistently above or below zero at certain points on the horizontal axis

#2. lack of influential observations: we use Cook's distance values
plot(cooks.distance(reg_model2), type = "h", ylim = c(0, 1),
     xlab = "Observations", ylab = "Cook's distance")
abline(h = 1, lwd = 2, lty = 2)
#there are different commonly used cut-off points: 1, 4/n, or 4/(n - p), where n is the sample size
#and p is the number of parameters; a sketch of how to apply them is shown below
#regardless of which cut-off we use here, there does not seem to be any influential observation
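#a minimal sketch of how the alternative cut-offs above could be computed and used to flag
#observations (the object names here are illustrative, not part of the original practical):
n_obs <- nrow(HVS2)               #sample size n
n_par <- length(coef(reg_model2)) #number of parameters p (here: intercept + 2 slopes)
which(cooks.distance(reg_model2) > 4 / n_obs)           #observations exceeding the 4/n cut-off
which(cooks.distance(reg_model2) > 4 / (n_obs - n_par)) #observations exceeding the 4/(n - p) cut-off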
#3. normality of residuals: check via a histogram, a qq-plot and the shapiro.test
hist(E2, main = "", xlab = "Residuals")
shapiro.test(E2)
#INTERPRETATION: residuals are normally distributed

#4. collinearity: you can use the "vif" function
vif(reg_model2)
#as a rule of thumb, VIF values below 4 suggest that collinearity is not a problem

##Generalized Linear Model##
#let's upload the "Fish_data" dataset
fish_data <- read.csv('Fish_data.csv', header = T)
#let's first run a regression analysis
reg_model3 <- lm(TotAbund ~ MeanDepth, data = fish_data)
summary(reg_model3)
#ocean depth seems to negatively predict fish abundance
#let's run the model validation
par(mfrow = c(1, 1)) #reset to a single panel (rather than a multi-panel layout)
res_reg_model3 <- rstandard(reg_model3)
fit_reg_model3 <- fitted(reg_model3)
plot(x = fit_reg_model3, y = res_reg_model3, xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
#as you can see, the graph shows heterogeneity in the residuals (i.e., higher variation in the
#residuals for higher fitted values)
#the graph also shows negative fitted values, which do not make biological sense
#a Poisson Generalized Linear Model (GLM) can cope with heterogeneity of residuals and avoids
#negative fitted values
#let's run the Poisson GLM
glm_model <- glm(TotAbund ~ MeanDepth, data = fish_data, family = poisson)
summary(glm_model)
#let's plot the data with the fitted curve
pred_glm_model <- predict(glm_model, type = "response") #predictions for the observed data
plot(fish_data$MeanDepth, fish_data$TotAbund, xlab = "Mean depth (km)", ylab = "Total Abundance (N)")
ord <- order(fish_data$MeanDepth) #sort by depth so the fitted line is drawn smoothly
lines(fish_data$MeanDepth[ord], pred_glm_model[ord])
#let's check for overdispersion
E1 <- resid(glm_model, type = "pearson")
N <- nrow(fish_data)
p <- length(coef(glm_model))
Dispersion <- sum(E1^2) / (N - p)
Dispersion
#the model is overdispersed because the Dispersion value is much higher than 1
#we can try to understand the source of overdispersion
#are there outliers?
plot(cooks.distance(glm_model), type = "h", xlab = "Observations", ylab = "Cook's distance")
#to identify the observation with the high value
I <- 1:nrow(fish_data)
I[cooks.distance(glm_model) > 10]
#we can exclude observation #23 from the fish data and re-run the analysis
fish_data_new <- fish_data[-23, ]
glm_model_nooutlier <- glm(TotAbund ~ MeanDepth, data = fish_data_new, family = poisson)
#let's check for overdispersion again
E2 <- resid(glm_model_nooutlier, type = "pearson")
N2 <- nrow(fish_data_new)
p2 <- length(coef(glm_model_nooutlier))
Dispersion2 <- sum(E2^2) / (N2 - p2)
Dispersion2
#Dispersion = 103.8, so the model is still overdispersed
#maybe we need to add covariates?
#if we plot residuals vs Period we see that there is a difference across the two periods
E1 <- resid(glm_model, type = "pearson")
boxplot(E1 ~ Period, data = fish_data, ylab = "Pearson residuals", xlab = "Period")
abline(h = 0, v = 0, lty = 2)
#so we can see whether adding Period to the model decreases the overdispersion
#the first step is to convert Period into a factor
fish_data$fPeriod <- factor(fish_data$Period)
#now let's run the GLM
glm_model2 <- glm(TotAbund ~ MeanDepth * fPeriod, data = fish_data, family = poisson)
summary(glm_model2)
#let's check for overdispersion
E3 <- resid(glm_model2, type = "pearson")
N3 <- nrow(fish_data)
p3 <- length(coef(glm_model2))
Dispersion3 <- sum(E3^2) / (N3 - p3)
Dispersion3
#Dispersion = 110.8: still overdispersed
#let's fit a negative binomial distribution instead
install.packages('MASS')
library(MASS) #the glm.nb function is contained within the MASS package
glm_nb_model <- glm.nb(TotAbund ~ MeanDepth * fPeriod, data = fish_data)
summary(glm_nb_model)
E4 <- resid(glm_nb_model, type = "pearson")
N4 <- nrow(fish_data)
p4 <- length(coef(glm_nb_model))
Dispersion4 <- sum(E4^2) / (N4 - p4)
Dispersion4
#success! the data are no longer overdispersed
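#we have now computed the same dispersion statistic four times; a small helper function (an
#illustrative sketch, not part of the original practical) avoids repeating the calculation:
check_dispersion <- function(model, data) {
  E <- resid(model, type = "pearson")          #Pearson residuals
  sum(E^2) / (nrow(data) - length(coef(model))) #sum of squared residuals over residual df
}
check_dispersion(glm_nb_model, fish_data) #returns the same value as Dispersion4 above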
#model validation
#let's plot the residuals
res_glm_nb <- resid(glm_nb_model, type = "pearson")
fit_glm_nb <- fitted(glm_nb_model)
plot(x = fit_glm_nb, y = res_glm_nb, xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
#the data look good. Compare this graph with the previous graph of residuals vs fitted values
#from the Poisson model

#one last piece of information: you can use an "offset" if you want to model rates and control
#for the size of the geographical area (or the observation time)
#for example
glm_nb_model2 <- glm.nb(TotAbund ~ MeanDepth * fPeriod + offset(log(SweptArea)), data = fish_data)
summary(glm_nb_model2)
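#the offset fixes the coefficient of log(SweptArea) at 1, so the model effectively describes the
#rate TotAbund/SweptArea. A minimal sketch of predicting the expected count for a hypothetical
#haul (the depth and swept-area values below are made up for illustration; "new_haul" is an
#illustrative name, not part of the original practical):
new_haul <- data.frame(MeanDepth = 2,                       #a hypothetical depth of 2 km
                       fPeriod = fish_data$fPeriod[1],      #reuse an existing level of fPeriod
                       SweptArea = mean(fish_data$SweptArea)) #an average-sized swept area
predict(glm_nb_model2, newdata = new_haul, type = "response")
#the prediction scales linearly with SweptArea: doubling the swept area doubles the expected count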