####let's look at the different types of data R uses####
#let's clean our work environment first
rm(list = ls())
####ANOVA#####
##Set working directory to tell R where your data are located
setwd ('C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 5')
#Upload data
install.packages("readxl")
library (readxl)
data_monkeys <- read_excel('data_monkeys.xls')
#let's suppose we want to test whether grooming rates significantly differ between species
#let's calculate grooming rates first
data_monkeys$gr_rates <- data_monkeys$Grooming/data_monkeys$Observation_time
#If you want to replace headers, you can use the following coding
colnames(data_monkeys)[colnames(data_monkeys)=="Focal_ID"] <- "ID"
colnames(data_monkeys)[colnames(data_monkeys)=="Observation_time"]<- "Obs_time"
#you can check the the presence of outliers using the boxplot function
boxplot(data_monkeys$gr_rates)
#the boxplot confirms the outlier. You can identify the datapoint as follows
out <- boxplot.stats(data_monkeys$gr_rates)$out
out_ind <- which(data_monkeys$gr_rates%in% c(out))
#you can create a new table without row 53
data_monkeys2<- data_monkeys[-c(53),]
#ANOVA has two main assumptions: (1) normality of data distribution; and (2) homogeneity of variance.
#let's test both
#STEP1: Check that grooming rates for each Species are normally distributed
install.packages ("rstatix")
library(rstatix)
data_monkeys2 %>%
group_by(Species) %>%
shapiro_test(gr_rates)
#data are NOT normally distributed
#STEP2: Check whether the assumption of homogeneity of variance is not violated
install.packages ('stats')
library (stats)
bartlett.test(data_monkeys2$gr_rates ~ data_monkeys2$Species)
#assumption of homogeneity of variance is met
#since data are NOT normally distributed, we should be using the kruskall-wallis. However
#for the sake of the practical, I'll show you how to run the anova first
#So now let's run the ANOVA test
anova_test <-aov (data_monkeys2$gr_rates~data_monkeys2$Species)
summary (anova_test)
#calculate effect size and confidence interval
install.packages ("effectsize")
library (effectsize)
eta_squared(anova_test,ci = 0.95)
#rule of thumb for eta squared assessment
#etasquared = 0.01 small
#etasquared = 0.06 medium
#etasuqared >0.14 large
#the ANOVA tells you that there is a difference between category, but you now need to assess which category
#actually differ
TukeyHSD(anova_test)
#How to report the results:
#We conducted a one-way ANOVA to test whether grooming rates differed
#across species. The analysis showed that there was a significant difference
#in grooming rates between the different macaque species (F (2,88) = 92.99, p < 0.001, ??2 = 0.68, 95%CI [0.59, 1.00]).
#Post-hoc analysis revealed that rhesus macaques engaged in significantly more frequent grooming than the other two species (p < 0.001)
#alternatively you can run the kruskall-wallis test on the raw data
kruskal.test(data_monkeys2$gr_rates~ data_monkeys2$Species)
kruskal_effsize(data_monkeys2, gr_rates~ Species, ci = T)
#similarly you can run a series of pair-wise post-hoc wilcoxon tests following kruskall-wallis test
pairwise.wilcox.test(data_monkeys2$gr_rates, data_monkeys2$Species,
p.adjust.method = "BH")
# plot the data
#let's calculate mean and sd grooming rates for each species
library (plyr)
library (ggplot2)
plot_data <-ddply(data_monkeys2, .(Species), summarize, #to summarize the data by mean and sd
mean = round(mean(gr_rates), 2),
sd = round(sd(gr_rates), 2))
#geom_bar is to edit the characteristics of the bars
#geom_errorbar is to insert the error bars
plot_data$Species <- factor (c ("bonnet", "long-tailed", "rhesus"), levels =c ("bonnet", "rhesus", "long-tailed"))
p <- ggplot (plot_data, aes (x = Species, y = mean)) + geom_bar (stat = "identity", fill = "white", colour = "black")+
geom_errorbar( aes(ymin=mean-sd, ymax=mean+sd), width = .2)
#ggtitle is for the title (\n to have two lines)
#xlab/ylab is for the label of the axes
q <- p +ggtitle("Difference in grooming between macaque species")+
xlab("Species") + ylab("Average grooming rates (seconds)") +
scale_x_discrete(labels=c("Bonnet", "Rhesus", "Long-tailed"))
#theme is to set up the size and colour of the labels; you'll need to have element "axis.text.x", "axis.title.x" to edit specific axes
#panel.background generates a blank background
t <- q + theme(plot.title = element_text(color = "black", size = 20, hjust = .5),
axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"),
axis.title= element_text (color = "black", size = 20, vjust =3),
panel.background = element_rect(fill = "transparent",colour = NA),
axis.line = element_line(color="black", size = .2))
tiff("Grooming and Species.tif", res = 300, height = 25, width = 30, units = 'cm')
t + annotate ("segment", x = (1), xend = (2), y = 0.44, yend = 0.44 )+
annotate ("segment", x = (2), xend = (3), y = 0.40, yend = 0.40) +
annotate ("text", x = 1.5, y = 0.46, label ="***", size = 5) +
annotate ("text", x = 2.5, y = 0.42, label ='***', size = 5)
dev.off ()
#TWO-WAY ANOVA
#The two-way ANOVA compares the mean differences between groups that have been split on two independent variables (called factors). The primary purpose of a two-way ANOVA is to understand if there is an interaction
#between the two independent variables on the dependent variable.
#for example, do macaque's grooming rates differ between species AND sexes and is there
#any interaction between species and sex?
#there is no equivalent of K-W for a Two-Way ANOVA so we can transform the data
data_monkeys2$sq_gr_rates <- sqrt (data_monkeys2$gr_rates)
#we also need to convert sex from number to factor and can convert 0 to males and 1 to females
data_monkeys2$Sex <- factor (data_monkeys2$Sex, levels = c ("0", "1"), labels = c ("males", "females"))
#now let's check that the square-root trasnformed data are now normally distributed and have equal variance
data_monkeys2 %>%
group_by(Species) %>%
shapiro_test(sq_gr_rates)
#Let's check whether the assumption of homogeneity of variance is met
bartlett.test(data_monkeys2$sq_gr_rates ~ data_monkeys2$Species)
#Is there any interaction between Sex and Species?
two_way_anova_test <-aov (data_monkeys2$sq_gr_rates~data_monkeys2$Species* data_monkeys2$Sex)
summary (two_way_anova_test)
TukeyHSD(two_way_anova_test)
eta_squared(two_way_anova_test,ci = 0.95)
#here's how we can report the output:
#A two-way ANOVA was conducted in order to examine the effect of sex
#and species on grooming rates. We found that there was a significant
#main effect of both species (F(2, 85) = 84.32, p < 0.001, ??2 = 0.66,
#95%CI [0.57, 1.00]) and sex (F(1, 85) = 19.85,p < 0.001, ??2 = 0.19,
#95%CI [0.08, 1.00]) and their interaction (F(1, 85) = 3.82,
#p = 0.025, ??2 = 0.08, 95%CI [0.01, 1.00])
r <- ggplot (data_monkeys2, aes (x = Species, y = sq_gr_rates)) + geom_boxplot (aes (fill = Sex)) +
labs(fill = "Sex") + ggtitle("Grooming rates")+
xlab("Species") + ylab("Social Grooming (square-root seconds)") +
theme(plot.title = element_text(color = "black", size = 20, hjust = .5),
axis.text= element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"),
axis.title = element_text(color = "black", size = 20, vjust = 2, face = "plain"),
panel.background = element_rect(fill = "transparent",colour = NA),
axis.line = element_line(color="black", size = .2),
legend.text = element_text(size = 20),
legend.title = element_text (size = 25)) + scale_colour_manual(values= c("black","red"))
r + annotate ("segment", x = (0.8), xend = (1.2), y = 0.53, yend = 0.53 )+
annotate ("segment", x = (1.8), xend = (2.2), y = 0.53, yend = 0.53) +
annotate ("segment", x = (2.8), xend = (3.2), y = 0.73, yend = 0.73) +
annotate ("segment", x = (2), xend = (3), y = 0.80, yend = 0.80) +
annotate ("segment", x = (1), xend = (3), y = 0.90, yend = 0.90) +
annotate ("text", x = 1, y = 0.56, label ="***", size = 5) +
annotate ("text", x = 2, y = 0.56, label ='***', size = 5) +
annotate ("text", x = 3, y = 0.75, label = "ns", size = 5) +
annotate ("text", x = 2.5, y = 0.82, label = "***", size = 5) +
annotate ("text", x = 2, y = 0.92, label = "***", size = 5)
#REPEATED-MEASURED ANOVA
#The repeated-measures ANOVA is used for analyzing data where
#same subjects are measured more than once. It can be one-way or two-way.
#The assumptions are the same as for the Independent-measures ANOVA, namely:
#1.Lack of outliers
#2.Normality
#3.Homogeneity of variance
#let's upload the data called "data_mice"
data_mice <- read.csv ("data_mice.csv", header = T)
#let's identify the outliers using the function "identify_outliers" within the package
#rstatix
data_mice %>%
group_by(time) %>%
identify_outliers(weight)
#you should look at "is.extreme" column. There are no extreme outliers
#let's test the normality assumption
data_mice %>%
group_by(time) %>%
shapiro_test(weight)
#you can use qqplot to assess normality visually
ggqqplot(data_mice, "weight", facet.by = "time")
#data have a normal distribution
#eventual deviatons from the assumption of homogeneity of variance is automatically
#corrected using the anova_test function
#so let's run the analysis
res.aov <- anova_test(data = data_mice, dv = weight, wid = id, within = time)
get_anova_table(res.aov)
#We can run post-hoc analysis as below
pwc <- data_mice %>%
pairwise_t_test(
weight ~ time, paired = TRUE,
p.adjust.method = "bonferroni"
)
pwc
#the results can be reported as below:
#Mice's weight was statistically significantly different at the different time points (F(2, 18) = 55.5, p < 0.0001, generalized eta squared = 0.82)
#Post-hoc analyses with a Bonferroni adjustment revealed that all the pairwise differences, between time points, were statistically significantly different (p <= 0.05).
#let's plot the results!
# Visualization: box plots with p-values
bxp <- ggboxplot(data_mice, x = "time", y = "weight", add = "point")
bxp_labs <-bxp + labs(subtitle = get_test_label(res.aov, detailed = TRUE),
caption = get_pwc_label(pwc))
#we can add p values
pwc <- pwc %>% add_xy_position(x = "time")
bxp_labs+stat_pvalue_manual(pwc)
#MANOVA
#MANOVA is used when you want to test multiple response variables
#for example, in the "iris" data set we used a couple of classes ago, we want to test
#whether species identity explains both Petal and Sepal length
#let's upload the data
iris <- read.csv ("iris.csv", header = T)
#let's run the MANOVA
res.man <- manova(cbind(Sepal.Length, Petal.Length) ~ Species, data = iris)
summary(res.man)
#using the function summary.aov you can run the anova for each
summary.aov(res.man)