# Start from a clean workspace. Note that this only removes the objects
# listed by ls(); loaded packages and options are left untouched.
rm(list = ls())
# The first step is to tell R where your data are located, i.e. to set the
# working directory. Adjust this path to the folder holding the workshop files.
setwd("C:\\Users\\kabur\\Dropbox\\R workshop\\2021-22 series\\session 1")
# Upload the data. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be overwritten, whereas TRUE and FALSE are reserved words.
monkeys <- read.csv("data_monkeys.csv", header = TRUE, stringsAsFactors = FALSE)
# List the objects currently held in the workspace.
ls()
# Check the names of the column headers.
names(monkeys)
# Check the characteristics (structure) of the data.
str(monkeys)
# Check the first six rows.
head(monkeys, n = 6)
# Check the last six rows.
tail(monkeys, n = 6)
# Check how many rows and columns there are in the data set.
dim(monkeys)
# Select an individual data point by its coordinates: [row number, column number].
monkeys[1, 4]
# A value found at a specific coordinate can also be saved as a new object.
adriana <- monkeys[2, 2]
# Extract a whole section: [from row:to row, from column:to column].
monkeys[1:4, 5:7]
# Extract a single column using $.
monkeys$Focal_ID
# Rename columns: locate the header that equals the old name and overwrite it
# with the new one.
names(monkeys)[names(monkeys) == "Focal_ID"] <- "ID"
names(monkeys)[names(monkeys) == "Tot_N_HM_interactions"] <- "hm_int"
names(monkeys)[names(monkeys) == "Tot_Provision"] <- "prov"
names(monkeys)[names(monkeys) == "Grooming_given"] <- "gr"
# Recheck the header names.
names(monkeys)
# Change the format of the data from number to character: the values of
# monkeys$sex are stored as text in a new column, csex.
monkeys$csex <- as.character(monkeys$sex)
# Recheck the data format.
str(monkeys)
# Turn the codes into meaningful text: "0" becomes "am" (adult male) and
# "1" becomes "af" (adult female).
monkeys$csex[monkeys$csex == "0"] <- "am"
monkeys$csex[monkeys$csex == "1"] <- "af"
# Round observation time to the nearest whole number (digits = 0 keeps no
# decimal places).
monkeys$Obs_time <- round(monkeys$Obs_time, digits = 0)
# Recheck the data.
head(monkeys)
# All sorts of mathematical calculations can be run on a column.
sum(monkeys$Obs_time) # sum
mean(monkeys$Obs_time) # mean
median(monkeys$Obs_time) # median
# Create a new column computed from two other columns (here: their sum).
monkeys[["Tot_HMINT"]] <- monkeys[["hm_int"]] + monkeys[["prov"]]
# Statistical information for different groups/factors can be extracted with
# the ddply() function from the "plyr" package. The package is installed only
# when it is missing, so re-running the script does not download and reinstall
# it every time.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)
# Summarize the data by csex: mean and standard deviation of hm_int, each
# rounded to two decimal places.
summarize_data <- ddply(
  monkeys, .(csex), summarize,
  mean = round(mean(hm_int), 2),
  sd = round(sd(hm_int), 2)
)
# Check the output.
summarize_data
# The headers of the object "summarize_data" can be changed as well.
colnames(summarize_data)[colnames(summarize_data) == "csex"] <- "sex"
# Check the headers.
head(summarize_data)
# Objects are removed from the environment with the rm() function.
rm(summarize_data)
# We can ask R whether specific cells hold a specific value. With "==" we get
# one TRUE/FALSE per row telling us whether that row is a female ...
monkeys$csex == "af"
# ... and summing those TRUE/FALSE values counts how many females there are.
sum(monkeys$csex == "af")
# which() returns the positions of the rows that meet a condition, so it can be
# used to run a calculation on one category only. For example, the total time
# female monkeys were observed for:
sum(monkeys$Obs_time[which(monkeys$csex == "af")])
# Create a new table that includes only the females.
females <- monkeys[which(monkeys$csex == "af"), ]
# Exclude data points whose observation time is too short or too long (i.e.,
# outliers). A logical mask is used instead of females[-which(...), ] because
# which() returns integer(0) when no row matches, and indexing with
# -integer(0) selects zero rows, which would silently empty the whole table.
# `%in% TRUE` treats a missing Obs_time as "not an outlier", so such rows are
# kept at this step, just as -which() kept them.
is_outlier <- females$Obs_time < 3 | females$Obs_time > 10
females_nooutlier <- females[!(is_outlier %in% TRUE), ]
# See whether there are missing data points: each line of the output gives the
# row and column position of one NA.
which(is.na(females_nooutlier), arr.ind = TRUE)
# Take out every row that contains at least one missing data point.
# complete.cases() is TRUE for the rows without any NA. (Negating the
# row/column matrix returned by which(..., arr.ind = TRUE) would also treat the
# column numbers as rows to delete, and would delete every row when no NA is
# present.)
data_clean <- females_nooutlier[complete.cases(females_nooutlier), ]
# Recheck for missing data points; no positions should be listed now.
which(is.na(data_clean), arr.ind = TRUE)
# Remove a specific ID. %in% yields a plain TRUE/FALSE for every row, so this
# also works when the ID does not occur in the data.
data_clean_norw <- data_clean[!(data_clean$ID %in% "rw"), ]