####let's look at the different types of data R uses#### #let's clean our work environment first rm(list = ls()) ####ANOVA##### ##Set working directory to tell R where your data are located setwd ('C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 5') #Upload data install.packages("readxl") library (readxl) data_monkeys <- read_excel('data_monkeys.xls') #let's suppose we want to test whether grooming rates significantly differ between species #let's calculate grooming rates first data_monkeys$gr_rates <- data_monkeys$Grooming/data_monkeys$Observation_time #If you want to replace headers, you can use the following coding colnames(data_monkeys)[colnames(data_monkeys)=="Focal_ID"] <- "ID" colnames(data_monkeys)[colnames(data_monkeys)=="Observation_time"]<- "Obs_time" #you can check the the presence of outliers using the boxplot function boxplot(data_monkeys$gr_rates) #the boxplot confirms the outlier. You can identify the datapoint as follows out <- boxplot.stats(data_monkeys$gr_rates)$out out_ind <- which(data_monkeys$gr_rates%in% c(out)) #you can create a new table without row 53 data_monkeys2<- data_monkeys[-c(53),] #ANOVA has two main assumptions: (1) normality of data distribution; and (2) homogeneity of variance. #let's test both #STEP1: Check that grooming rates for each Species are normally distributed install.packages ("rstatix") library(rstatix) data_monkeys2 %>% group_by(Species) %>% shapiro_test(gr_rates) #data are NOT normally distributed #STEP2: Check whether the assumption of homogeneity of variance is not violated install.packages ('stats') library (stats) bartlett.test(data_monkeys2$gr_rates ~ data_monkeys2$Species) #assumption of homogeneity of variance is met #since data are NOT normally distributed, we should be using the kruskal-Wallis. However #for the sake of the practical, I'll show you how to run the anova first #So now let's run the ANOVA test anova_test <-aov (data_monkeys2$gr_rates~data_monkeys2$Species) summary (anova_test) #calculate effect size and confidence interval install.packages ("effectsize") library (effectsize) eta_squared(anova_test,ci = 0.95) #rule of thumb for eta squared assessment #etasquared = 0.01 small #etasquared = 0.06 medium #etasuqared >0.14 large #the ANOVA tells you that there is a difference between category, but you now need to assess which category #actually differ TukeyHSD(anova_test) #How to report the results: #We conducted a one-way ANOVA to test whether grooming rates differed #across species. The analysis showed that there was a significant difference #in grooming rates between the different macaque species (F (2,88) = 92.99, p < 0.001, eta2 = 0.68, 95%CI [0.59, 1.00]). #Post-hoc analysis revealed that rhesus macaques engaged in significantly more frequent grooming than the other two species (p < 0.001) #alternatively you can run the kruskal-wallis test on the raw data kruskal.test(data_monkeys2$gr_rates~ data_monkeys2$Species) kruskal_effsize(data_monkeys2, gr_rates~ Species, ci = T) #similarly you can run a series of pair-wise post-hoc wilcoxon tests following kruskal-wallis test pairwise.wilcox.test(data_monkeys2$gr_rates, data_monkeys2$Species, p.adjust.method = "BH") # plot the data #let's calculate mean and sd grooming rates for each species library (plyr) library (ggplot2) plot_data <-ddply(data_monkeys2, .(Species), summarize, #to summarize the data by mean and sd mean = round(mean(gr_rates), 2), sd = round(sd(gr_rates), 2)) #geom_bar is to edit the characteristics of the bars #geom_errorbar is to insert the error bars plot_data$Species <- factor (c ("bonnet", "long-tailed", "rhesus"), levels =c ("bonnet", "rhesus", "long-tailed")) p <- ggplot (plot_data, aes (x = Species, y = mean)) + geom_bar (stat = "identity", fill = "white", colour = "black")+ geom_errorbar( aes(ymin=mean-sd, ymax=mean+sd), width = .2) #ggtitle is for the title (\n to have two lines) #xlab/ylab is for the label of the axes q <- p +ggtitle("Difference in grooming between macaque species")+ xlab("Species") + ylab("Average grooming rates (seconds)") + scale_x_discrete(labels=c("Bonnet", "Rhesus", "Long-tailed")) #theme is to set up the size and colour of the labels; you'll need to have element "axis.text.x", "axis.title.x" to edit specific axes #panel.background generates a blank background t <- q + theme(plot.title = element_text(color = "black", size = 20, hjust = .5), axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"), axis.title= element_text (color = "black", size = 20, vjust =3), panel.background = element_rect(fill = "transparent",colour = NA), axis.line = element_line(color="black", size = .2)) tiff("Grooming and Species.tif", res = 300, height = 25, width = 30, units = 'cm') t + annotate ("segment", x = (1), xend = (2), y = 0.44, yend = 0.44 )+ annotate ("segment", x = (2), xend = (3), y = 0.40, yend = 0.40) + annotate ("text", x = 1.5, y = 0.46, label ="***", size = 5) + annotate ("text", x = 2.5, y = 0.42, label ='***', size = 5) dev.off () #TWO-WAY ANOVA #The two-way ANOVA compares the mean differences between groups that have been split on two independent variables (called factors). The primary purpose of a two-way ANOVA is to understand if there is an interaction #between the two independent variables on the dependent variable. #for example, do macaque's grooming rates differ between species AND sexes and is there #any interaction between species and sex? #there is no equivalent of K-W for a Two-Way ANOVA so we can transform the data data_monkeys2$sq_gr_rates <- sqrt (data_monkeys2$gr_rates) #we also need to convert sex from number to factor and can convert 0 to males and 1 to females data_monkeys2$Sex <- factor (data_monkeys2$Sex, levels = c ("0", "1"), labels = c ("males", "females")) #now let's check that the square-root trasnformed data are now normally distributed and have equal variance data_monkeys2 %>% group_by(Species) %>% shapiro_test(sq_gr_rates) #Let's check whether the assumption of homogeneity of variance is met bartlett.test(data_monkeys2$sq_gr_rates ~ data_monkeys2$Species) #Is there any interaction between Sex and Species? two_way_anova_test <-aov (data_monkeys2$sq_gr_rates~data_monkeys2$Species* data_monkeys2$Sex) summary (two_way_anova_test) TukeyHSD(two_way_anova_test) eta_squared(two_way_anova_test,ci = 0.95) #here's how we can report the output: #A two-way ANOVA was conducted in order to examine the effect of sex #and species on grooming rates. We found that there was a significant #main effect of both species (F(2, 85) = 84.32, p < 0.001, ??2 = 0.66, #95%CI [0.57, 1.00]) and sex (F(1, 85) = 19.85,p < 0.001, ??2 = 0.19, #95%CI [0.08, 1.00]) and their interaction (F(1, 85) = 3.82, #p = 0.025, eta2 = 0.08, 95%CI [0.01, 1.00]) r <- ggplot (data_monkeys2, aes (x = Species, y = sq_gr_rates)) + geom_boxplot (aes (fill = Sex)) + labs(fill = "Sex") + ggtitle("Grooming rates")+ xlab("Species") + ylab("Social Grooming (square-root seconds)") + theme(plot.title = element_text(color = "black", size = 20, hjust = .5), axis.text= element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"), axis.title = element_text(color = "black", size = 20, vjust = 2, face = "plain"), panel.background = element_rect(fill = "transparent",colour = NA), axis.line = element_line(color="black", size = .2), legend.text = element_text(size = 20), legend.title = element_text (size = 25)) + scale_colour_manual(values= c("black","red")) r + annotate ("segment", x = (0.8), xend = (1.2), y = 0.53, yend = 0.53 )+ annotate ("segment", x = (1.8), xend = (2.2), y = 0.53, yend = 0.53) + annotate ("segment", x = (2.8), xend = (3.2), y = 0.73, yend = 0.73) + annotate ("segment", x = (2), xend = (3), y = 0.80, yend = 0.80) + annotate ("segment", x = (1), xend = (3), y = 0.90, yend = 0.90) + annotate ("text", x = 1, y = 0.56, label ="***", size = 5) + annotate ("text", x = 2, y = 0.56, label ='***', size = 5) + annotate ("text", x = 3, y = 0.75, label = "ns", size = 5) + annotate ("text", x = 2.5, y = 0.82, label = "***", size = 5) + annotate ("text", x = 2, y = 0.92, label = "***", size = 5) #REPEATED-MEASURED ANOVA #The repeated-measures ANOVA is used for analyzing data where #same subjects are measured more than once. It can be one-way or two-way. #The assumptions are the same as for the Independent-measures ANOVA, namely: #1.Lack of outliers #2.Normality #3.Homogeneity of variance #let's upload the data called "data_mice" data_mice <- read.csv ("data_mice.csv", header = T) #let's identify the outliers using the function "identify_outliers" within the package #rstatix data_mice %>% group_by(time) %>% identify_outliers(weight) #you should look at "is.extreme" column. There are no extreme outliers #let's test the normality assumption data_mice %>% group_by(time) %>% shapiro_test(weight) #you can use qqplot to assess normality visually install.packages ("ggpubr") library (ggpubr) ggqqplot(data_mice, "weight", facet.by = "time") #data have a normal distribution #eventual deviations from the assumption of homogeneity of variance is automatically #corrected using the anova_test function #so let's run the analysis res.aov <- anova_test(data = data_mice, dv = weight, wid = id, within = time) get_anova_table(res.aov) #We can run post-hoc analysis as below pwc <- data_mice %>% pairwise_t_test( weight ~ time, paired = TRUE, p.adjust.method = "bonferroni" ) pwc #the results can be reported as below: #Mice's weight was statistically significantly different at the different time points (F(2, 18) = 55.5, p < 0.0001, generalized eta squared = 0.82) #Post-hoc analyses with a Bonferroni adjustment revealed that all the pairwise differences, between time points, were statistically significantly different (p <= 0.05). #let's plot the results! # Visualization: box plots with p-values bxp <- ggboxplot(data_mice, x = "time", y = "weight", add = "point") bxp_labs <-bxp + labs(subtitle = get_test_label(res.aov, detailed = TRUE), caption = get_pwc_label(pwc)) #we can add p values pwc <- pwc %>% add_xy_position(x = "time") bxp_labs+stat_pvalue_manual(pwc) #MANOVA #MANOVA is used when you want to test multiple response variables #for example, in the "iris" data set we used a couple of classes ago, we want to test #whether species identity explains both Petal and Sepal length #let's upload the data iris <- read.csv ("iris.csv", header = T) #let's run the MANOVA res.man <- manova(cbind(Sepal.Length, Petal.Length) ~ Species, data = iris) summary(res.man) #using the function summary.aov you can run the anova for each summary.aov(res.man)