# Load the data
iris_data <- read.csv("iris.csv", header = TRUE)

# Generate a new table that contains only the species "setosa" and "versicolor".
# `%in%` tests every row against the whole set of names. Comparing with `==`
# against a length-2 vector recycles that vector row by row, so a row is kept
# only when it happens to line up with the matching element and roughly half
# of the wanted rows are silently dropped.
keep_species <- c("setosa", "versicolor")
iris_data2 <- iris_data[iris_data$Species %in% keep_species, ]

# Drop factor levels that no longer occur (a no-op when Species is character),
# so later grouping steps only ever see the two retained species.
iris_data2 <- droplevels(iris_data2)

# Compare petal length (column "Petal.Length") between the two species.
# NOTE(review): the original task comment named the sepal ("Sepal.length"),
# but every call in this script analyses Petal.Length -- confirm which
# variable is intended.
# Either the independent-samples t-test or the Mann-Whitney U-test can be
# used, so check the t-test assumptions first.

# Install rstatix only when it is missing instead of reinstalling on every run
if (!requireNamespace("rstatix", quietly = TRUE)) {
  install.packages("rstatix")
}
library(rstatix)

# Distribution of the data: Shapiro-Wilk normality test within each species
iris_data2 %>%
  group_by(Species) %>%
  shapiro_test(Petal.Length)

# Data are normally distributed

# Homogeneity of variances between the two species (F test)
var.test(Petal.Length ~ Species, data = iris_data2)

# The assumption of homogeneity of variance is violated (p < 0.001)

# Since one of the two assumptions is violated, run the Mann-Whitney U-test
wilcox.test(Petal.Length ~ Species, data = iris_data2)

# Effect size (r) and its confidence interval

# Install rcompanion only when it is missing instead of reinstalling every run
if (!requireNamespace("rcompanion", quietly = TRUE)) {
  install.packages("rcompanion")
}
library(rcompanion)

# NOTE(review): with ci = TRUE the interval is presumably obtained by
# bootstrap resampling (see the rcompanion documentation), so a fixed seed is
# set to make the reported bounds reproducible between runs.
set.seed(123)
wilcoxonR(
  x = iris_data2$Petal.Length,
  g = iris_data2$Species,
  ci = TRUE
)

# Calculate mean, median and standard deviation of petal length for the two
# species. The script never attaches plyr, so ddply() / .() / summarize are
# not available; the same table is built here with base R only.

# One numeric vector of petal lengths per species; drop = TRUE skips any
# factor level that has no rows left after the subsetting step.
petal_by_species <- split(
  iris_data2$Petal.Length,
  iris_data2$Species,
  drop = TRUE
)

# One row per species, with the same column names the ddply() call produced
data.frame(
  Species = names(petal_by_species),
  mean = vapply(petal_by_species, mean, numeric(1)),
  median = vapply(petal_by_species, median, numeric(1)),
  sd = vapply(petal_by_species, sd, numeric(1)),
  row.names = NULL
)

# Plot the results: one box plot per species with the raw observations on top.
# ggplot2 is attached here because nothing earlier in the script loads it.
library(ggplot2)

ggplot(iris_data2, aes(x = Species, y = Petal.Length)) +
  # Hide the box-plot outlier markers: geom_jitter() already draws every
  # observation, so outliers would otherwise appear twice.
  geom_boxplot(outlier.shape = NA) +
  # Jitter horizontally only, so each point stays at its measured petal length
  geom_jitter(width = 0.2, height = 0) +
  ggtitle("Difference in Petal length between plant species") +
  xlab("Species") +
  ylab("Petal length (cm)") +
  theme(
    plot.title = element_text(color = "black", size = 20, hjust = .5),
    axis.text = element_text(
      color = "black", size = 20, hjust = .5, vjust = .5, face = "plain"
    ),
    axis.title = element_text(color = "black", size = 20, vjust = 3),
    panel.background = element_rect(fill = "transparent", colour = NA),
    # NOTE(review): recent ggplot2 releases prefer `linewidth` over `size`
    # for lines; `size` is kept so older installations keep working.
    axis.line = element_line(color = "black", size = .2)
  )