####let's look at the different types of data R uses####
#let's clean our work environment first
rm(list = ls())

####ANOVA#####
##Set working directory to tell R where your data are located
setwd ('C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 5') 

#Upload data
install.packages("readxl")
library (readxl)
data_monkeys <- read_excel('data_monkeys.xls')

#let's suppose we want to test whether grooming rates significantly differ between species
#let's calculate grooming rates first
data_monkeys$gr_rates <- data_monkeys$Grooming/data_monkeys$Observation_time

#If you want to replace headers, you can use the following coding
colnames(data_monkeys)[colnames(data_monkeys)=="Focal_ID"] <- "ID"
colnames(data_monkeys)[colnames(data_monkeys)=="Observation_time"]<- "Obs_time" 

#you can check the the presence of outliers using the boxplot function
boxplot(data_monkeys$gr_rates) 

#the boxplot confirms the outlier. You can identify the datapoint as follows
out <- boxplot.stats(data_monkeys$gr_rates)$out
out_ind <- which(data_monkeys$gr_rates%in% c(out))

#you can create a new table without row 53
data_monkeys2<- data_monkeys[-c(53),]

#ANOVA has two main assumptions: (1) normality of data distribution; and (2) homogeneity of variance.
#let's test both
#STEP1: Check that grooming rates for each Species  are normally distributed
install.packages ("rstatix")
library(rstatix)
data_monkeys2 %>%
  group_by(Species) %>%
  shapiro_test(gr_rates)

#data are NOT normally distributed

#STEP2: Check whether the assumption of homogeneity of variance is not violated
install.packages ('stats')
library (stats)
bartlett.test(data_monkeys2$gr_rates ~ data_monkeys2$Species)

#assumption of homogeneity of variance is met

#since data are NOT normally distributed, we should be using the kruskal-Wallis. However
#for the sake of the practical, I'll show you how to run the anova first
#So now let's run the ANOVA test
anova_test <-aov (data_monkeys2$gr_rates~data_monkeys2$Species)
summary (anova_test)

#calculate effect size and confidence interval
install.packages ("effectsize")
library (effectsize)
eta_squared(anova_test,ci = 0.95)

#rule of thumb for eta squared assessment
#etasquared = 0.01 small
#etasquared = 0.06 medium
#etasuqared >0.14 large

#the ANOVA tells you that there is a difference between category, but you now need to assess which category
#actually differ
TukeyHSD(anova_test)

#How to report the results:
#We conducted a one-way ANOVA to test whether grooming rates differed 
#across species. The analysis showed that there was a significant difference 
#in grooming rates between the different macaque species (F (2,88) = 92.99, p < 0.001, eta2 = 0.68, 95%CI [0.59, 1.00]). 
#Post-hoc analysis revealed that rhesus macaques engaged in significantly more frequent grooming than the other two species (p < 0.001)

#alternatively you can run the kruskal-wallis test on the raw data
kruskal.test(data_monkeys2$gr_rates~ data_monkeys2$Species)
kruskal_effsize(data_monkeys2, gr_rates~ Species, ci = T)

#similarly you can run a series of pair-wise post-hoc wilcoxon tests following kruskal-wallis test
pairwise.wilcox.test(data_monkeys2$gr_rates, data_monkeys2$Species,
                     p.adjust.method = "BH")


# plot the data
#let's calculate mean and sd grooming rates for each species
library (plyr)
library (ggplot2)
plot_data <-ddply(data_monkeys2, .(Species), summarize,              #to summarize the data by mean and sd
                  mean = round(mean(gr_rates), 2),
                  sd = round(sd(gr_rates), 2))

#geom_bar is to edit the characteristics of the bars
#geom_errorbar is to insert the error bars
plot_data$Species <- factor (c ("bonnet", "long-tailed", "rhesus"), levels =c ("bonnet", "rhesus", "long-tailed"))
p <- ggplot (plot_data, aes (x = Species, y = mean)) + geom_bar (stat = "identity", fill = "white", colour = "black")+   
  geom_errorbar( aes(ymin=mean-sd, ymax=mean+sd),  width = .2) 

#ggtitle is for the title (\n to have two lines)
#xlab/ylab is for the label of the axes
q <- p +ggtitle("Difference in grooming between macaque species")+ 
  xlab("Species") + ylab("Average grooming rates (seconds)") +
  scale_x_discrete(labels=c("Bonnet", "Rhesus", "Long-tailed"))

#theme is to set up the size and colour of the labels; you'll need to have element "axis.text.x", "axis.title.x" to edit specific axes
#panel.background generates a blank background

t <- q + theme(plot.title = element_text(color = "black", size = 20, hjust = .5), 
               axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"), 
               axis.title= element_text (color = "black", size = 20, vjust =3),
               panel.background = element_rect(fill = "transparent",colour = NA), 
               axis.line = element_line(color="black", size = .2))

tiff("Grooming and Species.tif", res = 300, height = 25, width = 30, units = 'cm')
t + annotate ("segment", x = (1), xend = (2), y = 0.44, yend = 0.44 )+
  annotate ("segment", x = (2), xend = (3), y = 0.40, yend = 0.40) +
  annotate ("text", x = 1.5, y = 0.46, label ="***", size = 5) + 
  annotate ("text", x = 2.5, y = 0.42, label ='***', size = 5)
dev.off ()

#TWO-WAY ANOVA
#The two-way ANOVA compares the mean differences between groups that have been split on two independent variables (called factors). The primary purpose of a two-way ANOVA is to understand if there is an interaction 
#between the two independent variables on the dependent variable.
#for example, do macaque's grooming rates differ between species AND sexes and is there
#any interaction between species and sex?
#there is no equivalent of K-W for a Two-Way ANOVA so we can transform the data
data_monkeys2$sq_gr_rates <- sqrt (data_monkeys2$gr_rates)

#we also need to convert sex from number to factor and can convert 0 to males and 1 to females
data_monkeys2$Sex <- factor (data_monkeys2$Sex, levels = c ("0", "1"), labels = c ("males", "females"))

#now let's check that the square-root trasnformed data are now normally distributed and have equal variance
data_monkeys2 %>%
  group_by(Species) %>%
  shapiro_test(sq_gr_rates)

#Let's check whether the assumption of homogeneity of variance is met
bartlett.test(data_monkeys2$sq_gr_rates ~ data_monkeys2$Species)

#Is there any interaction between Sex and Species?
two_way_anova_test <-aov (data_monkeys2$sq_gr_rates~data_monkeys2$Species* data_monkeys2$Sex)
summary (two_way_anova_test)
TukeyHSD(two_way_anova_test)
eta_squared(two_way_anova_test,ci = 0.95)

#here's how we can report the output:
#A two-way ANOVA was conducted in order to examine the effect of sex 
#and species on grooming rates. We found that there was a significant 
#main effect of both species (F(2, 85) = 84.32, p < 0.001, ??2 = 0.66, 
#95%CI [0.57, 1.00]) and sex (F(1, 85) = 19.85,p < 0.001, ??2 = 0.19, 
#95%CI [0.08, 1.00]) and their interaction (F(1, 85) = 3.82,
#p = 0.025, eta2 = 0.08, 95%CI [0.01, 1.00])

r <- ggplot (data_monkeys2, aes (x = Species, y = sq_gr_rates)) + geom_boxplot (aes (fill = Sex)) + 
  labs(fill = "Sex") +  ggtitle("Grooming rates")+ 
  xlab("Species") + ylab("Social Grooming (square-root seconds)") +
  theme(plot.title = element_text(color = "black", size = 20, hjust = .5), 
        axis.text= element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"), 
        axis.title = element_text(color = "black", size = 20, vjust = 2, face = "plain"),
        panel.background = element_rect(fill = "transparent",colour = NA), 
        axis.line = element_line(color="black", size = .2),
        legend.text = element_text(size = 20),
        legend.title = element_text (size = 25)) + scale_colour_manual(values= c("black","red"))

r +  annotate ("segment", x = (0.8), xend = (1.2), y = 0.53, yend = 0.53 )+
  annotate ("segment", x = (1.8), xend = (2.2), y = 0.53, yend = 0.53) +
  annotate ("segment", x = (2.8), xend = (3.2), y = 0.73, yend = 0.73) +
  annotate ("segment", x = (2), xend = (3), y = 0.80, yend = 0.80) +
  annotate ("segment", x = (1), xend = (3), y = 0.90, yend = 0.90) +
  annotate ("text", x = 1, y = 0.56, label ="***", size = 5) + 
  annotate ("text", x = 2, y = 0.56, label ='***', size = 5) +
  annotate ("text", x = 3, y = 0.75, label = "ns", size = 5) +
  annotate ("text", x = 2.5, y = 0.82, label = "***", size = 5) +
  annotate ("text", x = 2, y = 0.92, label = "***", size = 5) 

#REPEATED-MEASURED ANOVA
#The repeated-measures ANOVA is used for analyzing data where 
#same subjects are measured more than once. It can be one-way or two-way.
#The assumptions are the same as for the Independent-measures ANOVA, namely:
#1.Lack of outliers
#2.Normality
#3.Homogeneity of variance

#let's upload the data called "data_mice"
data_mice <- read.csv ("data_mice.csv", header = T)

#let's identify the outliers using the function "identify_outliers" within the package
#rstatix
data_mice %>%
  group_by(time) %>%
  identify_outliers(weight)

#you should look at "is.extreme" column. There are no extreme outliers

#let's test the normality assumption
data_mice %>%
  group_by(time) %>%
  shapiro_test(weight)

#you can use qqplot to assess normality visually
install.packages ("ggpubr")
library (ggpubr)
ggqqplot(data_mice, "weight", facet.by = "time")

#data have a normal distribution

#eventual deviations from the assumption of homogeneity of variance is automatically 
#corrected using the anova_test function

#so let's run the analysis
res.aov <- anova_test(data = data_mice, dv = weight, wid = id, within = time)
get_anova_table(res.aov)

#We can run post-hoc analysis as below
pwc <- data_mice %>%
  pairwise_t_test(
    weight ~ time, paired = TRUE,
    p.adjust.method = "bonferroni"
  )
pwc

#the results can be reported as below:
#Mice's weight was statistically significantly different at the different time points (F(2, 18) = 55.5, p < 0.0001, generalized eta squared = 0.82)
#Post-hoc analyses with a Bonferroni adjustment revealed that all the pairwise differences, between time points, were statistically significantly different (p <= 0.05).

#let's plot the results!
# Visualization: box plots with p-values
bxp <- ggboxplot(data_mice, x = "time", y = "weight", add = "point")
bxp_labs <-bxp + labs(subtitle = get_test_label(res.aov, detailed = TRUE),
    caption = get_pwc_label(pwc))

#we can add p values
pwc <- pwc %>% add_xy_position(x = "time")
bxp_labs+stat_pvalue_manual(pwc)

#MANOVA
#MANOVA is used when you want to test multiple response variables
#for example, in the "iris" data set we used a couple of classes ago, we want to test
#whether species identity explains both Petal and Sepal length
#let's upload the data
iris <- read.csv ("iris.csv", header = T)

#let's run the MANOVA
res.man <- manova(cbind(Sepal.Length, Petal.Length) ~ Species, data = iris)
summary(res.man)

#using the function summary.aov you can run the anova for each
summary.aov(res.man)