#let's clean our work environment first
rm(list = ls())
#CHI-SQUARE OF INDEPENDENCE####
#The chi-square test of independence evaluates whether there is a significant
#association between two categorical variables
#Set working directory
setwd ('C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 3')
#let's upload the data
iris_data <- read.csv ("iris.csv", header = T, stringsAsFactors = F)
#let's create a contingency table
iris_data_table <- table(iris_data$Species, iris_data$size)
#you can now run the test
chi.test <- chisq.test(iris_data_table)
#you can also extract the expected data and the residuals
chi.test$expected
chi.test$residuals
#you can visualize the results with a mosaic plot
install.packages('vcd')
library(vcd)
mosaic(~ Species + size,
direction = c("v", "h"),
data = iris_data,
shade = TRUE
)
#the results can be reported as follows: ??(2) = 86.04, p < 0.001
#INDEPENDENT-SAMPLE T-TEST####
#Upload data
bci <- read.csv('BCI.csv', header = T, stringsAsFactors = FALSE)
#let's check the names of the bat species
unique(bci$Species)
#let's select only data coming from Myotis daubentonii and Pipistrellus pipistrellus
bci_reduced <- bci [which(bci$Species== c("Myotis daubentonii", 'Pipistrellus pipistrellus')),]
#now it's time for some data analysis;
#let's suppose we want to test whether Forearm length significantly differs
#between the two bat species. The test you need to use in this case is
#either a "t-test" or "Mann-Whitney U-test" depending on whether t-test assumptions
#are met or not
#t-test assumptions are two:
#1.data distribution is normal
#2.the two groups have equal variance
install.packages ("rstatix")
library(rstatix)
#STEP1a: Check that Forarm length is normally distributed for each species
bci_reduced %>%
group_by(Species) %>%
shapiro_test(Forearm)
#Interpretation: data are not normally distributed
#STEP1b: construct histograms to visually see data distribution
#we can use two methods to produce histograms
#METHOD 1
with(bci_reduced, hist(Forearm[Species == "Myotis daubentonii"]))
with(bci_reduced, hist(Forearm[Species == "Pipistrellus pipistrellus"]))
#METHOD 2 (GGPLOT2)
install.packages (ggplot2)
library (ggplot2)
ggplot(bci_reduced, aes(x = Forearm)) +
geom_histogram(aes(color = Species), fill = "white",
position = "identity", bins = 30)
#you can see also through box plot
boxplot(bci_reduced$Forearm,
ylab = "Species"
)
#there seems to be two outliers. Let's remove them
#we can identify those outliers
out <- boxplot.stats(bci_reduced$Forearm)$out
out_ind <- which(bci_reduced$Forearm %in% c(out))
#let's remove rows 72 and 154
bci_reduced2<- bci_reduced [-c(72, 154),]
#let's re_run the normality test
bci_reduced2 %>%
group_by(Species) %>%
shapiro_test(Forearm)
#Now data are normally distributed
#STEP2: check for variance of means
var.test(Forearm ~ Species, data = bci_reduced2)
#Interpretation: assumption on homogeneity of variance is NOT violated
#Since data (without those two outliers) meet
#the assumptions of normality and homogeneity of variance
#we can run the t.test
#t-test is run as follows
t.test (bci_reduced2$Forearm~bci_reduced2$Species)
#let's check the mean Forearm length and sd by Species
data_sum<- ddply(bci_reduced2, .(Species), summarize,
mean = round(mean(Forearm), 2),
sd = round(sd(Forearm), 2))
#we should calculate effect size too
install.packages ("lsr")
library (lsr)
cohensD(bci_reduced2$Forearm~bci_reduced2$Species)
#small effect size: 0.2-0.4
#medium effect size: 0.5-0.7
#large effect size: >0.8
#so there is a significant difference in Forearm length between the two species
#this is how it should be reported:
#This study found that M. daubentonii exhibit, on average, significantly longer forearms (mean± SD: 37.3 ± 1.11 cm)
#compared to P. pipistrellus (mean ± SD: 32.3 ± 1.08cm; t(163)= 29.9, p< 0.001, d = 4.6, 95% CI [4.7, 5.4]).
#MANN-WHITNEY U-TEST#####
#let's suppose the assumptions are not met
#in this case we can run the mann-whitney U-test
wilcox.test(bci_reduced2$Forearm~bci_reduced2$Species)
#effect size and confidence interval
install.packages ("rcompanion")
library(rcompanion)
wilcoxonR(x = bci_reduced2$Forearm,
g = bci_reduced2$Species,
ci = TRUE)
#install ggplot2 package
install.packages ('ggplot2')
library (ggplot2)
#bar graph
my_title <- expression(paste("Difference in forearm length between ", italic("M. daubentonii"), "and ", italic ("P. pipistrellus")))
ggplot (data_des, aes (x = Species, y = mean)) + geom_bar (stat = "identity", fill = "white", colour = "black")+
geom_errorbar( aes(ymin=mean-sd, ymax=mean+sd), width = .2) +
ggtitle (my_title) + xlab ("Species") + ylab ("Forearm (cm)")+
theme(plot.title = element_text(color = "black", size = 20, hjust = .5),
axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "italic"),
axis.title= element_text(color = "black", size = 20, vjust =3),
panel.background = element_rect(fill = "transparent",colour = NA),
axis.line = element_line(color="black", size = .2))
#PAIRED T-TEST####
#let's suppose we are testing a drug on mice in the lab and we want to compare mice weight
#before and after the administration of the drug
#let's upload the data
mice_data <- read.csv ("mice_treatment.csv", header = T)
#Paired T-test needs to meet the assumption that
#the difference between the two treatment is normally distributed
d <- with(mice_data,
weight[group == "before"] - weight[group == "after"])
shapiro.test (d)
#so the difference between the two treatments is normally distributed
#we can now run the paired t-test
t.test(mice_data$weight ~mice_data$group, paired = TRUE)
cohensD(mice_data$weight ~mice_data$group, method = "paired")
ddply(mice_data, .(group), summarize,
mean = round(mean(weight), 2),
sd = round(sd(weight), 2))
#how to report the output: the paired t-test analysis revealed that
#mice increased the body weight following treatment (mean ± SD: 393.6 ± 29.4
#compared to before they were administered the treatment
#(mean ± SD: 199.2 ± 8.47; t(9) = 20.8, p < 0.001, d = 6.6, 95% CI [173.4, 215.5])
#if the data do not meet the assumptions, instead, you should run a Wilcoxon-signed rank test, as follows:
wilcox.test(mice_data$weight ~mice_data$group, paired = TRUE, conf.int = T) #
mice_data %>% wilcox_effsize (weight ~group, paired = TRUE) #effect size
#let's check the mean Forearm length and sd by Species
ddply(mice_data, .(group), summarize,
median = round(median(weight), 2),
sd = round(sd(weight), 2))
#how to report;
#A Wilcoxon signed-rank test analysis showed that mice significantly
#increased their body weight treatment #after they were exposed to the treatment
#(median ± SD = 392.9 ± 29.40g) compared to before they underwent treatment
#(median ± SD: 195.3 ± 18.47; V = 55, p = 0.002; r = 0.88; 95% CI [172.8, 216.6]).
#let's plot the results
mice_data$group <- factor (mice_data$group, levels = c ("before", "after"), ordered = T)
ggplot (mice_data, aes (x = group, y = weight)) + geom_boxplot () +
geom_jitter ()+ ggtitle("Difference in body weight between treatements")+
xlab("Period") + ylab("body weight (g)") +
theme(plot.title = element_text(color = "black", size = 20, hjust = .5),
axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"),
axis.title= element_text(color = "black", size = 20, vjust =3),
panel.background = element_rect(fill = "transparent",colour = NA),
axis.line = element_line(color="black", size = .2))