#let's clean our work environment first
rm(list = ls())

#CHI-SQUARE OF INDEPENDENCE####
#The chi-square test of independence evaluates whether there is a significant 
#association between two categorical variables

#Set working directory 
setwd ('C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 3') 

#let's upload the data
iris_data <- read.csv ("iris.csv", header = T, stringsAsFactors = F)

#let's create a contingency table
iris_data_table <- table(iris_data$Species, iris_data$size)

#you can now run the test
chi.test <- chisq.test(iris_data_table)

#you can also extract the expected data and the residuals
chi.test$expected

chi.test$residuals

#you can visualize the results with a mosaic plot
install.packages('vcd')
library(vcd)

mosaic(~ Species + size,
       direction = c("v", "h"),
       data = iris_data,
       shade = TRUE
)

#the results can be reported as follows: ??(2) = 86.04, p < 0.001

#INDEPENDENT-SAMPLE T-TEST####
#Upload data
bci <- read.csv('BCI.csv', header = T, stringsAsFactors = FALSE)

#let's check the names of the bat species
unique(bci$Species)

#let's select only data coming from Myotis daubentonii and Pipistrellus pipistrellus
bci_reduced <- bci [which(bci$Species== c("Myotis daubentonii", 'Pipistrellus pipistrellus')),]


#now it's time for some data analysis; 
#let's suppose we want to test whether Forearm length significantly differs
#between the two bat species. The test you need to use in this case is 
#either a "t-test" or "Mann-Whitney U-test" depending on whether t-test assumptions
#are met or not

#t-test assumptions are two:
#1.data distribution is normal
#2.the two groups have equal variance

install.packages ("rstatix")
library(rstatix)

#STEP1a: Check that Forarm length is normally distributed for each species
bci_reduced %>%
  group_by(Species) %>%
  shapiro_test(Forearm)

#Interpretation: data are not normally distributed

#STEP1b: construct histograms to visually see data distribution
#we can use two methods to produce histograms
#METHOD 1
with(bci_reduced, hist(Forearm[Species == "Myotis daubentonii"]))
with(bci_reduced, hist(Forearm[Species == "Pipistrellus pipistrellus"]))

#METHOD 2 (GGPLOT2)
install.packages (ggplot2)
library (ggplot2)
ggplot(bci_reduced, aes(x = Forearm)) +
  geom_histogram(aes(color = Species), fill = "white",
                 position = "identity", bins = 30)

#you can see also through box plot
boxplot(bci_reduced$Forearm,
        ylab = "Species"
) 

#there seems to be two outliers. Let's remove them
#we can identify those outliers
out <- boxplot.stats(bci_reduced$Forearm)$out
out_ind <- which(bci_reduced$Forearm %in% c(out))

#let's remove rows 72 and 154
bci_reduced2<- bci_reduced [-c(72, 154),]

#let's re_run the normality test
bci_reduced2 %>%
  group_by(Species) %>%
  shapiro_test(Forearm)

#Now data are normally distributed


#STEP2: check for variance of means
var.test(Forearm ~ Species, data = bci_reduced2)


#Interpretation: assumption on homogeneity of variance is NOT violated

#Since data (without those two outliers) meet 
#the assumptions of normality and homogeneity of variance
#we can run the t.test

#t-test is run as follows
t.test (bci_reduced2$Forearm~bci_reduced2$Species)

#let's check the mean Forearm length and sd by Species
data_sum<- ddply(bci_reduced2, .(Species), summarize,              
                 mean = round(mean(Forearm), 2),
                 sd = round(sd(Forearm), 2))

#we should calculate effect size too
install.packages ("lsr")
library (lsr)
cohensD(bci_reduced2$Forearm~bci_reduced2$Species)

#small effect size: 0.2-0.4
#medium effect size: 0.5-0.7
#large effect size: >0.8

#so there is a significant difference in Forearm length between the two species
#this is how it should be reported:
#This study found that M. daubentonii exhibit, on average, significantly longer forearms (meaną SD: 37.3 ą 1.11 cm) 
#compared to P. pipistrellus (mean ą SD: 32.3 ą 1.08cm; t(163)= 29.9, p< 0.001, d = 4.6, 95% CI [4.7, 5.4]).

#MANN-WHITNEY U-TEST#####
#let's suppose the assumptions are not met
#in this case we can run the mann-whitney U-test

wilcox.test(bci_reduced2$Forearm~bci_reduced2$Species)

#effect size and confidence interval
install.packages ("rcompanion")
library(rcompanion)
wilcoxonR(x  = bci_reduced2$Forearm,
          g  = bci_reduced2$Species,
          ci = TRUE)

#install ggplot2 package
install.packages ('ggplot2')
library (ggplot2)

#bar graph
my_title <- expression(paste("Difference in forearm length between ", italic("M. daubentonii"), "and ", italic ("P. pipistrellus")))
ggplot (data_des, aes (x = Species, y = mean)) + geom_bar (stat = "identity", fill = "white", colour = "black")+   
  geom_errorbar( aes(ymin=mean-sd, ymax=mean+sd),  width = .2) +
  ggtitle (my_title) + xlab ("Species") + ylab ("Forearm (cm)")+
  theme(plot.title = element_text(color = "black", size = 20, hjust = .5), 
        axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "italic"), 
        axis.title= element_text(color = "black", size = 20, vjust =3),
        panel.background = element_rect(fill = "transparent",colour = NA), 
        axis.line = element_line(color="black", size = .2))


#PAIRED T-TEST####
#let's suppose we are testing a drug on mice in the lab and we want to compare mice weight
#before and after the administration of the drug
#let's upload the data
mice_data <- read.csv ("mice_treatment.csv", header = T)

#Paired T-test needs to meet the assumption that 
#the difference between the two treatment is normally distributed

d <- with(mice_data, 
          weight[group == "before"] - weight[group == "after"])
shapiro.test (d)

#so the difference between the two treatments is normally distributed
#we can now run the paired t-test

t.test(mice_data$weight ~mice_data$group, paired = TRUE)
cohensD(mice_data$weight ~mice_data$group, method = "paired")

ddply(mice_data, .(group), summarize,              
                 mean = round(mean(weight), 2),
                 sd = round(sd(weight), 2))

#how to report the output: the paired t-test analysis revealed that
#mice increased the body weight following treatment (mean ą SD: 393.6 ą 29.4
#compared to before they were administered the treatment
#(mean ą SD: 199.2 ą 8.47; t(9) = 20.8, p < 0.001, d = 6.6, 95% CI [173.4, 215.5])

#if the data do not meet the assumptions, instead, you should run a Wilcoxon-signed rank test, as follows:
wilcox.test(mice_data$weight ~mice_data$group, paired = TRUE, conf.int = T) #
mice_data %>% wilcox_effsize (weight ~group, paired = TRUE) #effect size

#let's check the mean Forearm length and sd by Species
ddply(mice_data, .(group), summarize,              
                     median = round(median(weight), 2),
                     sd = round(sd(weight), 2))

#how to report;
#A Wilcoxon signed-rank test analysis showed that mice significantly 
#increased their body weight treatment #after they were exposed to the treatment 
#(median ą SD = 392.9 ą 29.40g) compared to before they underwent treatment
#(median ą SD: 195.3 ą 18.47; V = 55, p = 0.002; r = 0.88; 95% CI [172.8, 216.6]).

#let's plot the results
mice_data$group <- factor (mice_data$group, levels = c ("before", "after"), ordered = T)
ggplot (mice_data, aes (x = group, y = weight)) + geom_boxplot () +
  geom_jitter ()+ ggtitle("Difference in body weight between treatements")+ 
  xlab("Period") + ylab("body weight (g)") +
  theme(plot.title = element_text(color = "black", size = 20, hjust = .5), 
        axis.text = element_text(color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"), 
        axis.title= element_text(color = "black", size = 20, vjust =3),
        panel.background = element_rect(fill = "transparent",colour = NA), 
        axis.line = element_line(color="black", size = .2))