#let's clean our work environment first
rm(list = ls())

#set up the working directory
setwd('C:\\Users\\kabur\\Dropbox\\teaching\\Animal Behaviour\\Research methods\\2020-2021\\practical_2')

#upload the data
HVS <- read.csv("HVS.csv", header = T)

####CORRELATION####
#let's run the correlation between orbital volume and latitude
#let's first check the data distribution
#to check the data distribution we use the shapiro.test
shapiro.test(HVS$OrbV)   #the distribution of orbital volume is normal
shapiro.test(HVS$LatAbs) #data are not normally distributed for latitude
#so a Spearman's rank correlation test is the best option in this case, but I'll show how to run
#the analysis with both Pearson's correlation and Spearman's
cor.test(HVS$OrbV, HVS$LatAbs)
cor.test(HVS$OrbV, HVS$LatAbs, method = "spearman")
plot(HVS$LatAbs, HVS$OrbV, xlab = "Absolute Latitude", ylab = "Orbital Volume") #note: the variable is LatAbs

#How to report the output:
#Our correlation analysis examining the association between absolute latitude and orbital volume
#showed a positive correlation between the two variables (r = 0.53, n = 55, p < 0.001)
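#if you prefer to pull the numbers for the report straight from R rather than copying them from
#the console, the test output can be stored and its elements accessed. A minimal sketch
#("cor_out" is just an illustrative name, not part of the original practical):
cor_out <- cor.test(HVS$OrbV, HVS$LatAbs)
cor_out$estimate #the correlation coefficient r
cor_out$p.value  #the p-value
sum(complete.cases(HVS$OrbV, HVS$LatAbs)) #n, the number of complete pairs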
####LINEAR REGRESSION####
#now you have to install and load the relevant packages
install.packages('lattice') #the "lattice" package is necessary for multi-panel graphs
install.packages('car')     #the "car" package provides the "vif" function, which we need to check for collinearity
library(lattice)
library(car)
source(file = "HighstatLibV8.R")
#HighstatLibV8.R is a script of helper functions that I downloaded from the book
#"Beginner's Guide to Generalized Additive Models with R". It will help with data exploration

##DATA EXPLORATION##
#1. remove missing values
colSums(is.na(HVS)) #the output shows that there is one missing value (for the FM variable)
HVS2 <- na.exclude(HVS)
#we remove the missing value. Note that you have now created a new data set (HVS2) which is the
#same as the original except that the missing value is gone. Also note that na.exclude deletes
#the entire row that contains the missing value
colSums(is.na(HVS2)) #let's check again for missing values

#2. check for outliers using a Cleveland dotplot
MyVar <- c("OrbV", "LatAbs", "CC", "FM", "MI", "MT") #a vector ("MyVar") with only the continuous variables
Mydotplot(HVS2[, MyVar]) #Mydotplot is a function contained in HighstatLibV8.R
#there are no obvious outliers

#3. check for collinearity between predictors
MyVar2 <- c("LatAbs", "CC", "FM", "MI", "MT") #a vector ("MyVar2") with all continuous predictors
Mypairs(HVS2[, MyVar2])
#the panel shows strong correlations between latitude and illuminance, between latitude and
#temperature, and between illuminance and temperature
#To assess collinearity between continuous covariates and categorical covariates,
#conditional boxplots can be used
Mybwplot(HVS2, MyVar2, "Gender")
#Gender seems to be correlated with CC
#CONCLUSION: given the collinearity between temperature, illuminance and latitude, we can drop
#temperature and illuminance. Also, given the collinearity between CC and Gender, we can drop
#Gender and keep CC

#now we can run the multiple linear regression
reg_model <- lm(OrbV ~ LatAbs + CC + FM, data = HVS2)
summary(reg_model)
#the results show a significant positive relationship between latitude and orbital volume and
#between cranial capacity and orbital volume

#we can also use stepwise model selection to find a regression model that fits better than reg_model.
#there are different ways to do model selection. One of them is the "step" function, which uses the
#Akaike Information Criterion (AIC): the lower the AIC value, the better the model
step(reg_model)
#the output confirms that the best model is the one that includes only latitude and CC
reg_model2 <- lm(OrbV ~ LatAbs + CC, data = HVS2)
summary(reg_model2)

##Plot the regression##
install.packages("ggplot2")
library("ggplot2")
ggplot(HVS2, aes(CC, OrbV)) +
  xlab("Cranial Capacity") +
  ylab("Orbital Volume") +
  geom_point() +
  stat_smooth(method = lm) +
  theme(plot.title = element_text(color = "black", size = 20, hjust = .5),
        axis.text = element_text(color = "black", size = 20, hjust = 1, vjust = .5, face = "plain"),
        axis.title = element_text(color = "black", size = 20, vjust = 2, face = "plain"),
        panel.background = element_rect(fill = "transparent", colour = NA),
        axis.line = element_line(color = "black", size = .2))

#now we need to check the validity of the model. To do that we check several assumptions
#1. homogeneity of variance: we plot the residuals vs the fitted values
E2 <- rstandard(reg_model2) #standardized residuals
F2 <- fitted(reg_model2)    #fitted values
plot(x = F2, y = E2, xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
#INTERPRETATION: this graph does not show any specific pattern, either in spread or in groups of
#residuals being consistently above or below zero at certain points on the horizontal axis

#2. lack of influential observations: we use Cook's distance values
plot(cooks.distance(reg_model2), type = "h", ylim = c(0, 1),
     xlab = "Observations", ylab = "Cook's distance")
abline(h = 1, lwd = 2, lty = 2)
#there are different commonly used cut-off points: 1, 4/n, or 4/(n - p), where n is the sample size
#and p is the number of parameters; a sketch of how to apply them is shown below
#regardless of which cut-off we use here, there does not seem to be any influential observation
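#a minimal sketch of how the alternative cut-offs above could be computed and used to flag
#observations (the object names here are illustrative, not part of the original practical):
n_obs <- nrow(HVS2)               #sample size n
n_par <- length(coef(reg_model2)) #number of parameters p (here: intercept + 2 slopes)
which(cooks.distance(reg_model2) > 4 / n_obs)           #observations exceeding the 4/n cut-off
which(cooks.distance(reg_model2) > 4 / (n_obs - n_par)) #observations exceeding the 4/(n - p) cut-off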
#3. normality of residuals: check via a histogram, a qq-plot and the shapiro.test
hist(E2, main = "", xlab = "Residuals")
shapiro.test(E2)
#INTERPRETATION: residuals are normally distributed

#4. collinearity: you can use the "vif" function
vif(reg_model2)
#as a rule of thumb, VIF values below 4 suggest that collinearity is not a problem

##Generalized Linear Model##
#let's upload the "Fish_data" dataset
fish_data <- read.csv('Fish_data.csv', header = T)
#let's first run a regression analysis
reg_model3 <- lm(TotAbund ~ MeanDepth, data = fish_data)
summary(reg_model3)
#ocean depth seems to negatively predict fish abundance
#let's run the model validation
par(mfrow = c(1, 1)) #reset to a single panel (rather than a multi-panel layout)
res_reg_model3 <- rstandard(reg_model3)
fit_reg_model3 <- fitted(reg_model3)
plot(x = fit_reg_model3, y = res_reg_model3, xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
#as you can see, the graph shows heterogeneity in the residuals (i.e., higher variation in the
#residuals for higher fitted values)
#the graph also shows negative fitted values, which do not make biological sense
#a Poisson Generalized Linear Model (GLM) can cope with heterogeneity of residuals and avoids
#negative fitted values
#let's run the Poisson GLM
glm_model <- glm(TotAbund ~ MeanDepth, data = fish_data, family = poisson)
summary(glm_model)
#let's plot the data with the fitted curve
pred_glm_model <- predict(glm_model, type = "response") #predictions for the observed data
plot(fish_data$MeanDepth, fish_data$TotAbund, xlab = "Mean depth (km)", ylab = "Total Abundance (N)")
ord <- order(fish_data$MeanDepth) #sort by depth so the fitted line is drawn smoothly
lines(fish_data$MeanDepth[ord], pred_glm_model[ord])
#let's check for overdispersion
E1 <- resid(glm_model, type = "pearson")
N <- nrow(fish_data)
p <- length(coef(glm_model))
Dispersion <- sum(E1^2) / (N - p)
Dispersion
#the model is overdispersed because the Dispersion value is much higher than 1
#we can try to understand the source of overdispersion
#are there outliers?
plot(cooks.distance(glm_model), type = "h", xlab = "Observations", ylab = "Cook's distance")
#to identify the observation with the high value
I <- 1:nrow(fish_data)
I[cooks.distance(glm_model) > 10]
#we can exclude observation #23 from the fish data and re-run the analysis
fish_data_new <- fish_data[-23, ]
glm_model_nooutlier <- glm(TotAbund ~ MeanDepth, data = fish_data_new, family = poisson)
#let's check for overdispersion again
E2 <- resid(glm_model_nooutlier, type = "pearson")
N2 <- nrow(fish_data_new)
p2 <- length(coef(glm_model_nooutlier))
Dispersion2 <- sum(E2^2) / (N2 - p2)
Dispersion2
#Dispersion = 103.8, so the model is still overdispersed
#maybe we need to add covariates?
#if we plot residuals vs Period we see that there is a difference across the two periods
E1 <- resid(glm_model, type = "pearson")
boxplot(E1 ~ Period, data = fish_data, ylab = "Pearson residuals", xlab = "Period")
abline(h = 0, v = 0, lty = 2)
#so we can see whether adding Period to the model decreases the overdispersion
#the first step is to convert Period into a factor
fish_data$fPeriod <- factor(fish_data$Period)
#now let's run the GLM
glm_model2 <- glm(TotAbund ~ MeanDepth * fPeriod, data = fish_data, family = poisson)
summary(glm_model2)
#let's check for overdispersion
E3 <- resid(glm_model2, type = "pearson")
N3 <- nrow(fish_data)
p3 <- length(coef(glm_model2))
Dispersion3 <- sum(E3^2) / (N3 - p3)
Dispersion3
#Dispersion = 110.8: still overdispersed
#let's fit a negative binomial distribution instead
install.packages('MASS')
library(MASS) #the glm.nb function is contained within the MASS package
glm_nb_model <- glm.nb(TotAbund ~ MeanDepth * fPeriod, data = fish_data)
summary(glm_nb_model)
E4 <- resid(glm_nb_model, type = "pearson")
N4 <- nrow(fish_data)
p4 <- length(coef(glm_nb_model))
Dispersion4 <- sum(E4^2) / (N4 - p4)
Dispersion4
#success! the data are no longer overdispersed
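#we have now computed the same dispersion statistic four times; a small helper function (an
#illustrative sketch, not part of the original practical) avoids repeating the calculation:
check_dispersion <- function(model, data) {
  E <- resid(model, type = "pearson")          #Pearson residuals
  sum(E^2) / (nrow(data) - length(coef(model))) #sum of squared residuals over residual df
}
check_dispersion(glm_nb_model, fish_data) #returns the same value as Dispersion4 above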
#model validation
#let's plot the residuals
res_glm_nb <- resid(glm_nb_model, type = "pearson")
fit_glm_nb <- fitted(glm_nb_model)
plot(x = fit_glm_nb, y = res_glm_nb, xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)
#the data look good. Compare this graph with the previous graph of residuals vs fitted values
#from the Poisson model

#one last piece of information: you can use an "offset" if you want to model rates and control
#for the size of the geographical area (or the observation time)
#for example
glm_nb_model2 <- glm.nb(TotAbund ~ MeanDepth * fPeriod + offset(log(SweptArea)), data = fish_data)
summary(glm_nb_model2)
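#the offset fixes the coefficient of log(SweptArea) at 1, so the model effectively describes the
#rate TotAbund/SweptArea. A minimal sketch of predicting the expected count for a hypothetical
#haul (the depth and swept-area values below are made up for illustration; "new_haul" is an
#illustrative name, not part of the original practical):
new_haul <- data.frame(MeanDepth = 2,                       #a hypothetical depth of 2 km
                       fPeriod = fish_data$fPeriod[1],      #reuse an existing level of fPeriod
                       SweptArea = mean(fish_data$SweptArea)) #an average-sized swept area
predict(glm_nb_model2, newdata = new_haul, type = "response")
#the prediction scales linearly with SweptArea: doubling the swept area doubles the expected count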