#let's clean our work environment first rm(list = ls()) #CHI-SQUARE OF INDEPENDENCE#### #The chi-square test of independence evaluates whether there is a significant #association between two categorical variables #Set working directory setwd ('C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 3') #let's upload the data iris_data <- read.csv ("iris.csv", header = T, stringsAsFactors = F) #let's create a contingency table iris_data_table <- table(iris_data$Species, iris_data$size) #you can now run the test chi.test <- chisq.test(iris_data_table) #you can also extract the expected data and the residuals chi.test$expected chi.test$residuals #you can visualize the results with a mosaic plot install.packages('vcd') library(vcd) mosaic(~ Species + size, direction = c("v", "h"), data = iris_data, shade = TRUE ) #the results can be reported as follows: ??(2) = 86.04, p < 0.001 #INDEPENDENT-SAMPLE T-TEST#### #Upload data bci <- read.csv('BCI.csv', header = T, stringsAsFactors = FALSE) #let's check the names of the bat species unique(bci$Species) #let's select only data coming from Myotis daubentonii and Pipistrellus pipistrellus bci_reduced <- bci [which(bci$Species== c("Myotis daubentonii", 'Pipistrellus pipistrellus')),] #now it's time for some data analysis; #let's suppose we want to test whether Forearm length significantly differs #between the two bat species. The test you need to use in this case is #either a "t-test" or "Mann-Whitney U-test" depending on whether t-test assumptions #are met or not #t-test assumptions are two: #1.data distribution is normal #2.the two groups have equal variance install.packages ("rstatix") library(rstatix) #STEP1a: Check that Forarm length is normally distributed for each species bci_reduced %>% group_by(Species) %>% shapiro_test(Forearm) #Interpretation: data are not normally distributed #STEP1b: construct histograms to visually see data distribution #we can use two methods to produce histograms #METHOD 1 with(bci_reduced, hist(Forearm[Species == "Myotis daubentonii"])) with(bci_reduced, hist(Forearm[Species == "Pipistrellus pipistrellus"])) #METHOD 2 (GGPLOT2) install.packages (ggplot2) library (ggplot2) ggplot(bci_reduced, aes(x = Forearm)) + geom_histogram(aes(color = Species), fill = "white", position = "identity", bins = 30) #you can see also through box plot boxplot(bci_reduced$Forearm, ylab = "Species" ) #there seems to be two outliers. Let's remove them #we can identify those outliers out <- boxplot.stats(bci_reduced$Forearm)$out out_ind <- which(bci_reduced$Forearm %in% c(out)) #let's remove rows 72 and 154 bci_reduced2<- bci_reduced [-c(72, 154),] #let's re_run the normality test bci_reduced2 %>% group_by(Species) %>% shapiro_test(Forearm) #Now data are normally distributed #STEP2: check for variance of means var.test(Forearm ~ Species, data = bci_reduced2) #Interpretation: assumption on homogeneity of variance is NOT violated #Since data (without those two outliers) meet #the assumptions of normality and homogeneity of variance #we can run the t.test #t-test is run as follows t.test (bci_reduced2$Forearm~bci_reduced2$Species) #let's check the mean Forearm length and sd by Species data_sum<- ddply(bci_reduced2, .(Species), summarize, mean = round(mean(Forearm), 2), sd = round(sd(Forearm), 2)) #we should calculate effect size too install.packages ("lsr") library (lsr) cohensD(bci_reduced2$Forearm~bci_reduced2$Species) #small effect size: 0.2-0.4 #medium effect size: 0.5-0.7 #large effect size: >0.8 #so there is a significant difference in Forearm length between the two species #this is how it should be reported: #This study found that M. daubentonii exhibit, on average, significantly longer forearms (mean± SD: 37.3 ± 1.11 cm) #compared to P. pipistrellus (mean ± SD: 32.3 ± 1.08cm; t(163)= 29.9, p< 0.001, d = 4.6, 95% CI [4.7, 5.4]). #MANN-WHITNEY U-TEST##### #let's suppose the assumptions are not met #in this case we can run the mann-whitney U-test wilcox.test(bci_reduced2$Forearm~bci_reduced2$Species) #effect size and confidence interval install.packages ("rcompanion") library(rcompanion) wilcoxonR(x = bci_reduced2$Forearm, g = bci_reduced2$Species, ci = TRUE) #install ggplot2 package install.packages ('ggplot2') library (ggplot2) #bar graph my_title <- expression(paste("Difference in forearm length between ", italic("M. daubentonii"), "and ", italic ("P. pipistrellus"))) ggplot (data_des, aes (x = Species, y = mean)) + geom_bar (stat = "identity", fill = "white", colour = "black")+ geom_errorbar( aes(ymin=mean-sd, ymax=mean+sd), width = .2) + ggtitle (my_title) + xlab ("Species") + ylab ("Forearm (cm)")+ theme(plot.title = element_text(color = "black", size = 20, hjust = .5), axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "italic"), axis.title= element_text(color = "black", size = 20, vjust =3), panel.background = element_rect(fill = "transparent",colour = NA), axis.line = element_line(color="black", size = .2)) #PAIRED T-TEST#### #let's suppose we are testing a drug on mice in the lab and we want to compare mice weight #before and after the administration of the drug #let's upload the data mice_data <- read.csv ("mice_treatment.csv", header = T) #Paired T-test needs to meet the assumption that #the difference between the two treatment is normally distributed d <- with(mice_data, weight[group == "before"] - weight[group == "after"]) shapiro.test (d) #so the difference between the two treatments is normally distributed #we can now run the paired t-test t.test(mice_data$weight ~mice_data$group, paired = TRUE) cohensD(mice_data$weight ~mice_data$group, method = "paired") ddply(mice_data, .(group), summarize, mean = round(mean(weight), 2), sd = round(sd(weight), 2)) #how to report the output: the paired t-test analysis revealed that #mice increased the body weight following treatment (mean ± SD: 393.6 ± 29.4 #compared to before they were administered the treatment #(mean ± SD: 199.2 ± 8.47; t(9) = 20.8, p < 0.001, d = 6.6, 95% CI [173.4, 215.5]) #if the data do not meet the assumptions, instead, you should run a Wilcoxon-signed rank test, as follows: wilcox.test(mice_data$weight ~mice_data$group, paired = TRUE, conf.int = T) # mice_data %>% wilcox_effsize (weight ~group, paired = TRUE) #effect size #let's check the mean Forearm length and sd by Species ddply(mice_data, .(group), summarize, median = round(median(weight), 2), sd = round(sd(weight), 2)) #how to report; #A Wilcoxon signed-rank test analysis showed that mice significantly #increased their body weight treatment #after they were exposed to the treatment #(median ± SD = 392.9 ± 29.40g) compared to before they underwent treatment #(median ± SD: 195.3 ± 18.47; V = 55, p = 0.002; r = 0.88; 95% CI [172.8, 216.6]). #let's plot the results mice_data$group <- factor (mice_data$group, levels = c ("before", "after"), ordered = T) ggplot (mice_data, aes (x = group, y = weight)) + geom_boxplot () + geom_jitter ()+ ggtitle("Difference in body weight between treatements")+ xlab("Period") + ylab("body weight (g)") + theme(plot.title = element_text(color = "black", size = 20, hjust = .5), axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"), axis.title= element_text(color = "black", size = 20, vjust =3), panel.background = element_rect(fill = "transparent",colour = NA), axis.line = element_line(color="black", size = .2))