species <- read.csv("C:/Users/dan/Desktop/a Pre Processing 2/species.csv")
surveys <- read.csv("C:/Users/dan/Desktop/a Pre Processing 2/surveys.csv")
WHO <- read.csv("C:/Users/dan/Desktop/a Pre Processing 2/WHO.csv")
#View(WHO)
Task_1 <- WHO %>%
gather(new_sp_m014,
new_sp_m1524,
new_sp_m2534,
new_sp_m3544,
new_sp_m4554,
new_sp_m5564,
new_sp_m65,
new_sn_m014,
new_sn_m1524,
new_sn_m2534,
new_sn_m3544,
new_sn_m4554,
new_sn_m5564,
new_sn_m65,
new_ep_m014,
new_ep_m1524,
new_ep_m2534,
new_ep_m3544,
new_ep_m4554,
new_ep_m5564,
new_ep_m65,
new_rel_m014,
new_rel_m1524,
new_rel_m2534,
new_rel_m3544,
new_rel_m4554,
new_rel_m5564,
new_rel_m65,
new_sp_f014,
new_sp_f1524,
new_sp_f2534,
new_sp_f3544,
new_sp_f4554,
new_sp_f5564,
new_sp_f65,
new_sn_f014,
new_sn_f1524,
new_sn_f2534,
new_sn_f3544,
new_sn_f4554,
new_sn_f5564,
new_sn_f65,
new_ep_f014,
new_ep_f1524,
new_ep_f2534,
new_ep_f3544,
new_ep_f4554,
new_ep_f5564,
new_ep_f65,
new_rel_f014,
new_rel_f1524,
new_rel_f2534,
new_rel_f3544,
new_rel_f4554,
new_rel_f5564,
new_rel_f65,
key = "code", value = "value" ) #group_by(new_sp) %>% grepl('m') %>%
#head(Task_1, n=10)
#str(WHO)
#Task_2 <- Task_1 %>% grepl('m')
Task_2 <- Task_1 %>% separate(code, into = c("new", "var", sep = "_") )
colnames(Task_2)[7] <- "age"
#colnames(Task_2)[8] <- "sex"
#Task_2 <- cbind.data.frame(Task_1,new,var1)
#head(Task_2 , n=10)
#Task_2 <-as.data.frame(Task_2)
library(stringi)
Task_2$sex <- substring(Task_2$age, 1, 1)
Task_2$age <- stri_sub(Task_2$age,2)
Task_2 <-as.data.frame(Task_2)
#head(Task_2 , n=10)
#View(Task_2)
var <- Task_2$var
age <- Task_2$age
sex <- Task_2$sex
value <- Task_2$value
Task_2 <- Task_2 %>% select(-contains("age"))%>%
select(-contains("var"))%>%
select(-contains("sex"))%>%
select(-contains("value"))
head(Task_2 , n=10)
## country iso2 iso3 year new
## 1 Afghanistan AF AFG 1980 new
## 2 Afghanistan AF AFG 1981 new
## 3 Afghanistan AF AFG 1982 new
## 4 Afghanistan AF AFG 1983 new
## 5 Afghanistan AF AFG 1984 new
## 6 Afghanistan AF AFG 1985 new
## 7 Afghanistan AF AFG 1986 new
## 8 Afghanistan AF AFG 1987 new
## 9 Afghanistan AF AFG 1988 new
## 10 Afghanistan AF AFG 1989 new
Task_2 <- cbind(Task_2, sex, age, var, value)
head(Task_2 , n=10)
## country iso2 iso3 year new sex age var value
## 1 Afghanistan AF AFG 1980 new m 014 sp NA
## 2 Afghanistan AF AFG 1981 new m 014 sp NA
## 3 Afghanistan AF AFG 1982 new m 014 sp NA
## 4 Afghanistan AF AFG 1983 new m 014 sp NA
## 5 Afghanistan AF AFG 1984 new m 014 sp NA
## 6 Afghanistan AF AFG 1985 new m 014 sp NA
## 7 Afghanistan AF AFG 1986 new m 014 sp NA
## 8 Afghanistan AF AFG 1987 new m 014 sp NA
## 9 Afghanistan AF AFG 1988 new m 014 sp NA
## 10 Afghanistan AF AFG 1989 new m 014 sp NA
all good to here
The WHO data set is not in a tidy format yet. The “rel”, “ep”, “sn”, and “sp” keys need to be in their own columns as we will treat each of these as a separate variable. In this step, move the “rel”, “ep”, “sn”, and “sp” keys into their own columns. ## theres a problem here
Task_3 <- Task_2 %>% separate(var, c("var","ep", "rel","sn", "sp"), extra='drop') %>%
select('country','iso2','iso3','year','new','sex','age','ep',
'rel','sn', 'sp','value')
## Warning: Expected 5 pieces. Missing pieces filled with `NA` in 405440
## rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
## 20, ...].
head(Task_3, n=4)
## country iso2 iso3 year new sex age ep rel sn sp value
## 1 Afghanistan AF AFG 1980 new m 014 <NA> <NA> <NA> <NA> NA
## 2 Afghanistan AF AFG 1981 new m 014 <NA> <NA> <NA> <NA> NA
## 3 Afghanistan AF AFG 1982 new m 014 <NA> <NA> <NA> <NA> NA
## 4 Afghanistan AF AFG 1983 new m 014 <NA> <NA> <NA> <NA> NA
Using “mutate()”, factorise sex and age. For “age” variable, create labels and also order the variable. Labels would be: <15, 15-24, 25-34, 35-44, 45-54, 55-64, >=65.
Task_4 <- Task_3 %>%
mutate(
age = ifelse(age == "014", "<15", age) ,
age = ifelse(age == "1524", "15-24", age) ,
age = ifelse(age == "2534", "25-34", age) ,
age = ifelse(age == "3444", "35-44", age) ,
age = ifelse(age == "4554", "45-54", age) ,
age = ifelse(age == "5564", "55-64", age) ,
age = ifelse(age == "065", ">=65", age)
) #%>% arrange( desc(Task_4) )
head(Task_4)
## country iso2 iso3 year new sex age ep rel sn sp value
## 1 Afghanistan AF AFG 1980 new m <15 <NA> <NA> <NA> <NA> NA
## 2 Afghanistan AF AFG 1981 new m <15 <NA> <NA> <NA> <NA> NA
## 3 Afghanistan AF AFG 1982 new m <15 <NA> <NA> <NA> <NA> NA
## 4 Afghanistan AF AFG 1983 new m <15 <NA> <NA> <NA> <NA> NA
## 5 Afghanistan AF AFG 1984 new m <15 <NA> <NA> <NA> <NA> NA
## 6 Afghanistan AF AFG 1985 new m <15 <NA> <NA> <NA> <NA> NA
Drop the redundant columns “iso2” and “new”, and filter any three countries from the tidy version of the WHO data set. Name this subset of the data frame as “WHO_subset”.
Task_4 <- as.data.frame(Task_4)
Task_4$iso2 <- NULL ## Drop the column iso2 and new
Task_4$new <- NULL
head(Task_4)
## country iso3 year sex age ep rel sn sp value
## 1 Afghanistan AFG 1980 m <15 <NA> <NA> <NA> <NA> NA
## 2 Afghanistan AFG 1981 m <15 <NA> <NA> <NA> <NA> NA
## 3 Afghanistan AFG 1982 m <15 <NA> <NA> <NA> <NA> NA
## 4 Afghanistan AFG 1983 m <15 <NA> <NA> <NA> <NA> NA
## 5 Afghanistan AFG 1984 m <15 <NA> <NA> <NA> <NA> NA
## 6 Afghanistan AFG 1985 m <15 <NA> <NA> <NA> <NA> NA
# WHO_subset <- Task_4 %>% (filter( country == Canada &&
# country == Ghana &&
# country == Greece))
#WHO_subset <- Task_4$country %>% filter( country == Canada & country == Ghana & country == Greece )
Combine “surveys” and “species” data frames using the key variable “species_id”. Add the species information (“genus”, “species”, “taxa”) to the “surveys” data. Rename as “surveys_combined”.
#library(dplyr)
surveys_combined <- surveys %>% left_join(species, by = "species_id")
## Warning: Column `species_id` joining factors with different levels,
## coercing to character vector
head(surveys_combined)
## record_id month day year species_id sex hindfoot_length weight
## 1 1 7 16 1977 NL M 32 NA
## 2 2 7 16 1977 NL M 33 NA
## 3 3 7 16 1977 DM F 37 NA
## 4 4 7 16 1977 DM M 36 NA
## 5 5 7 16 1977 DM M 35 NA
## 6 6 7 16 1977 PF M 14 NA
## genus species taxa
## 1 Neotoma albigula Rodent
## 2 Neotoma albigula Rodent
## 3 Dipodomys merriami Rodent
## 4 Dipodomys merriami Rodent
## 5 Dipodomys merriami Rodent
## 6 Perognathus flavus Rodent
s2 <- cbind.data.frame( surveys_combined$species,
surveys_combined$sex,
surveys_combined$hindfoot_length,
surveys_combined$weight,
surveys_combined$month )
average_male_hf_1 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '1', na.rm = TRUE)
average_male_hf_2 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '2', na.rm = TRUE)
average_male_hf_3 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '3', na.rm = TRUE)
average_male_hf_4 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '4', na.rm = TRUE)
average_male_hf_5 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '5', na.rm = TRUE)
average_male_hf_6 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '6', na.rm = TRUE)
average_male_hf_7 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '7', na.rm = TRUE)
average_male_hf_8 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '8', na.rm = TRUE)
average_male_hf_9 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '9', na.rm = TRUE)
average_male_hf_10 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '10', na.rm = TRUE)
average_male_hf_11 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '11', na.rm = TRUE)
average_male_hf_12 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$hindfoot_length` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '12', na.rm = TRUE)
average_male_w_1 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '1', na.rm = TRUE)
average_male_w_2 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '2', na.rm = TRUE)
average_male_w_3 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '3', na.rm = TRUE)
average_male_w_4 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '4', na.rm = TRUE)
average_male_w_5 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '5', na.rm = TRUE)
average_male_w_6 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '6', na.rm = TRUE)
average_male_w_7 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '7', na.rm = TRUE)
average_male_w_8 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '8', na.rm = TRUE)
average_male_w_9 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '9', na.rm = TRUE)
average_male_w_10 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '10', na.rm = TRUE)
average_male_w_11 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '11', na.rm = TRUE)
average_male_w_12 = mean(s2$`surveys_combined$sex` == 'M' &
s2$`surveys_combined$weight` &
s2$`surveys_combined$species`=='merriami' &
s2$`surveys_combined$month` == '12', na.rm = TRUE)
cat(" Average Male Hindfooot length for merriami in Jan is " ,
round( average_male_hf_1, 4),
"\n and the average weight for Jan is ",
round( average_male_w_1, 4),
"\n Average Male Hindfooot length for merriami in Feb is " ,
round( average_male_hf_2, 4),
"\n and the average weight for Feb is ",
round( average_male_w_2, 4),
"\n Average Male Hindfooot length for merriami in Mar is " ,
round( average_male_hf_3, 4),
"\n and the average weight for Mar is ",
round( average_male_w_3, 4),
"\n Average Male Hindfooot length for merriami in Apr is " ,
round( average_male_hf_4, 4),
"\n and the average weight for Apr is ",
round( average_male_w_4, 4),
"\n Average Male Hindfooot length for merriami in May is " ,
round( average_male_hf_5, 4),
"\n and the average weight for May is ",
round( average_male_w_5, 4),
"\n Average Male Hindfooot lenght for merriami in Jun is " ,
round( average_male_hf_6, 4),
"\n and the average weight for Jun is ",
round( average_male_w_6, 4),
"\n Average Male Hindfooot length for merriami in Jul is " ,
round( average_male_hf_7, 4),
"\n and the average weight for Jul is ",
round( average_male_w_7, 4),
"\n Average Male Hindfooot length for merriami in Aug is " ,
round( average_male_hf_8, 4),
"\n and the average weight for Aug is ",
round( average_male_w_8, 4),
"\n Average Male Hindfooot length for merriami in Sep is " ,
round( average_male_hf_9, 4),
"\n and the average weight for Sep is ",
round( average_male_w_9, 4),
"\n Average Male Hindfooot length for merriami in Oct is " ,
round( average_male_hf_10, 4),
"\n and the average weight for Oct is ",
round( average_male_w_10, 4),
"\n Average Male Hindfooot length for merriami in Nov is " ,
round( average_male_hf_11, 4),
"\n and the average weight for Nov is ",
round( average_male_w_11, 4),
"\n Average Male Hindfooot length for merriami in Dec is " ,
round( average_male_hf_12, 4),
"\n and the average weight for Dec is ",
round( average_male_w_12, 4))
## Average Male Hindfooot length for merriami in Jan is 0.0132
## and the average weight for Jan is 0.0135
## Average Male Hindfooot length for merriami in Feb is 0.0153
## and the average weight for Feb is 0.0152
## Average Male Hindfooot length for merriami in Mar is 0.016
## and the average weight for Mar is 0.017
## Average Male Hindfooot length for merriami in Apr is 0.0142
## and the average weight for Apr is 0.0167
## Average Male Hindfooot length for merriami in May is 0.0144
## and the average weight for May is 0.0143
## Average Male Hindfooot lenght for merriami in Jun is 0.0102
## and the average weight for Jun is 0.0107
## Average Male Hindfooot length for merriami in Jul is 0.0123
## and the average weight for Jul is 0.0132
## Average Male Hindfooot length for merriami in Aug is 0.0101
## and the average weight for Aug is 0.0094
## Average Male Hindfooot length for merriami in Sep is 0.0112
## and the average weight for Sep is 0.0114
## Average Male Hindfooot length for merriami in Oct is 0.0147
## and the average weight for Oct is 0.0145
## Average Male Hindfooot length for merriami in Nov is 0.0141
## and the average weight for Nov is 0.014
## Average Male Hindfooot length for merriami in Dec is 0.0135
## and the average weight for Dec is 0.0135
surveys_combined_year <- surveys_combined %>% group_by(year)%>%filter(year == '1980')
sum(is.na(surveys_combined_year$weight))
## [1] 98
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
surveys_weight_imputed <- impute(surveys_combined_year$weight, fun = mean)
## Now check this imputed column
sum(is.na(surveys_weight_imputed))
## [1] 0
# is.imputed(surveys_weight_imputed) <== ## I would use this but its to long for individual
## checks
## This looks for NaN, Inf, -Inf , and then tells me if they are
## present in this particular column
is.special <- function(surveys_weight_imputed){
if (is.numeric(surveys_weight_imputed)) !is.finite(surveys_weight_imputed) else is.na(surveys_weight_imputed)
}
sum (sapply(surveys_weight_imputed, is.special))
## [1] 0
boxplot(surveys_combined$hindfoot_length)
surveys_combined$hindfoot_length <- as.numeric(surveys_combined$hindfoot_length)
sum(is.na(surveys_combined$hindfoot_length))
## [1] 4111
# Use Imputational method
surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)
# Define a function to cap the values outside the limits
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
## Apply the cap then recheck and structure check
surveys_combined$hindfoot_length <- surveys_combined$hindfoot_length %>% cap()
sum(is.na(surveys_combined$hindfoot_length))
## [1] 0
#str(surveys_combined$hindfoot_length)
surveys_combined$hindfoot_length <- as.numeric(surveys_combined$hindfoot_length)
boxplot(surveys_combined$hindfoot_length)