species <- read.csv("C:/Users/dan/Desktop/a Pre Processing 2/species.csv")

surveys <- read.csv("C:/Users/dan/Desktop/a Pre Processing 2/surveys.csv")

WHO <- read.csv("C:/Users/dan/Desktop/a Pre Processing 2/WHO.csv")
#View(WHO)

Tasks 1 - 5 using WHO

Task 1

Task_1 <- WHO %>%  
gather(new_sp_m014, 
       new_sp_m1524,
       new_sp_m2534,
       new_sp_m3544,
       new_sp_m4554,
       new_sp_m5564,
       new_sp_m65,
       new_sn_m014,
       new_sn_m1524,
       new_sn_m2534,
       new_sn_m3544,
       new_sn_m4554,
       new_sn_m5564,
       new_sn_m65,
       new_ep_m014,
       new_ep_m1524,
       new_ep_m2534,
       new_ep_m3544,
       new_ep_m4554,
       new_ep_m5564,
       new_ep_m65,
       new_rel_m014,
       new_rel_m1524,
       new_rel_m2534,
       new_rel_m3544,
       new_rel_m4554,
       new_rel_m5564,
       new_rel_m65,
       new_sp_f014,
       new_sp_f1524,
       new_sp_f2534,
       new_sp_f3544,
       new_sp_f4554,
       new_sp_f5564,
       new_sp_f65,
       new_sn_f014,
       new_sn_f1524,
       new_sn_f2534,
       new_sn_f3544,
       new_sn_f4554,
       new_sn_f5564,
       new_sn_f65,
       new_ep_f014,
       new_ep_f1524,
       new_ep_f2534,
       new_ep_f3544,
       new_ep_f4554,
       new_ep_f5564,
       new_ep_f65,
       new_rel_f014,
       new_rel_f1524,
       new_rel_f2534,
       new_rel_f3544,
       new_rel_f4554,
       new_rel_f5564,
       new_rel_f65,
       key = "code", value = "value" ) #group_by(new_sp) %>% grepl('m')  %>%


#head(Task_1, n=10)
#str(WHO)

Task 2

#Task_2 <-  Task_1  %>% grepl('m')
  
Task_2 <- Task_1  %>% separate(code, into = c("new", "var",  sep = "_") )
colnames(Task_2)[7] <- "age" 
#colnames(Task_2)[8] <- "sex" 
#Task_2 <- cbind.data.frame(Task_1,new,var1)
#head(Task_2 , n=10)
#Task_2 <-as.data.frame(Task_2)
library(stringi)
Task_2$sex <- substring(Task_2$age, 1, 1)
Task_2$age <-  stri_sub(Task_2$age,2)
Task_2 <-as.data.frame(Task_2)
#head(Task_2 , n=10)
#View(Task_2)
var <- Task_2$var
age <- Task_2$age
sex <- Task_2$sex
value <- Task_2$value

Task_2 <- Task_2 %>% select(-contains("age"))%>% 
                     select(-contains("var"))%>% 
                     select(-contains("sex"))%>%
                     select(-contains("value"))

head(Task_2 , n=10)
##        country iso2 iso3 year new
## 1  Afghanistan   AF  AFG 1980 new
## 2  Afghanistan   AF  AFG 1981 new
## 3  Afghanistan   AF  AFG 1982 new
## 4  Afghanistan   AF  AFG 1983 new
## 5  Afghanistan   AF  AFG 1984 new
## 6  Afghanistan   AF  AFG 1985 new
## 7  Afghanistan   AF  AFG 1986 new
## 8  Afghanistan   AF  AFG 1987 new
## 9  Afghanistan   AF  AFG 1988 new
## 10 Afghanistan   AF  AFG 1989 new
Task_2 <- cbind(Task_2, sex, age, var, value)
head(Task_2 , n=10)
##        country iso2 iso3 year new sex age var value
## 1  Afghanistan   AF  AFG 1980 new   m 014  sp    NA
## 2  Afghanistan   AF  AFG 1981 new   m 014  sp    NA
## 3  Afghanistan   AF  AFG 1982 new   m 014  sp    NA
## 4  Afghanistan   AF  AFG 1983 new   m 014  sp    NA
## 5  Afghanistan   AF  AFG 1984 new   m 014  sp    NA
## 6  Afghanistan   AF  AFG 1985 new   m 014  sp    NA
## 7  Afghanistan   AF  AFG 1986 new   m 014  sp    NA
## 8  Afghanistan   AF  AFG 1987 new   m 014  sp    NA
## 9  Afghanistan   AF  AFG 1988 new   m 014  sp    NA
## 10 Afghanistan   AF  AFG 1989 new   m 014  sp    NA

all good to here

Task 3 The “rel”, “ep”, “sn”, and “sp” keys need to be in their own columns

The WHO data set is not in a tidy format yet. The “rel”, “ep”, “sn”, and “sp” keys need to be in their own columns as we will treat each of these as a separate variable. In this step, move the “rel”, “ep”, “sn”, and “sp” keys into their own columns. ## theres a problem here

Task_3 <- Task_2 %>% separate(var, c("var","ep", "rel","sn", "sp"),  extra='drop') %>%
                     select('country','iso2','iso3','year','new','sex','age','ep',
                            'rel','sn', 'sp','value')
## Warning: Expected 5 pieces. Missing pieces filled with `NA` in 405440
## rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
## 20, ...].
head(Task_3, n=4)
##       country iso2 iso3 year new sex age   ep  rel   sn   sp value
## 1 Afghanistan   AF  AFG 1980 new   m 014 <NA> <NA> <NA> <NA>    NA
## 2 Afghanistan   AF  AFG 1981 new   m 014 <NA> <NA> <NA> <NA>    NA
## 3 Afghanistan   AF  AFG 1982 new   m 014 <NA> <NA> <NA> <NA>    NA
## 4 Afghanistan   AF  AFG 1983 new   m 014 <NA> <NA> <NA> <NA>    NA

Task 4

Using “mutate()”, factorise sex and age. For “age” variable, create labels and also order the variable. Labels would be: <15, 15-24, 25-34, 35-44, 45-54, 55-64, >=65.

Task_4 <- Task_3 %>% 
  mutate(
    age = ifelse(age == "014", "<15", age)   ,
    age = ifelse(age == "1524", "15-24", age) ,
    age = ifelse(age == "2534", "25-34", age) ,
    age = ifelse(age == "3444", "35-44", age) ,
    age = ifelse(age == "4554", "45-54", age) ,
    age = ifelse(age == "5564", "55-64", age) ,
    age = ifelse(age == "065", ">=65", age)
    ) #%>% arrange( desc(Task_4) )

head(Task_4)
##       country iso2 iso3 year new sex age   ep  rel   sn   sp value
## 1 Afghanistan   AF  AFG 1980 new   m <15 <NA> <NA> <NA> <NA>    NA
## 2 Afghanistan   AF  AFG 1981 new   m <15 <NA> <NA> <NA> <NA>    NA
## 3 Afghanistan   AF  AFG 1982 new   m <15 <NA> <NA> <NA> <NA>    NA
## 4 Afghanistan   AF  AFG 1983 new   m <15 <NA> <NA> <NA> <NA>    NA
## 5 Afghanistan   AF  AFG 1984 new   m <15 <NA> <NA> <NA> <NA>    NA
## 6 Afghanistan   AF  AFG 1985 new   m <15 <NA> <NA> <NA> <NA>    NA

Task 5 I tried several variations but the ‘country’ column WOULD NOT filter ??

Drop the redundant columns “iso2” and “new”, and filter any three countries from the tidy version of the WHO data set. Name this subset of the data frame as “WHO_subset”.

Task_4 <- as.data.frame(Task_4)
Task_4$iso2 <- NULL   ##  Drop the column iso2 and new
Task_4$new  <- NULL

head(Task_4)
##       country iso3 year sex age   ep  rel   sn   sp value
## 1 Afghanistan  AFG 1980   m <15 <NA> <NA> <NA> <NA>    NA
## 2 Afghanistan  AFG 1981   m <15 <NA> <NA> <NA> <NA>    NA
## 3 Afghanistan  AFG 1982   m <15 <NA> <NA> <NA> <NA>    NA
## 4 Afghanistan  AFG 1983   m <15 <NA> <NA> <NA> <NA>    NA
## 5 Afghanistan  AFG 1984   m <15 <NA> <NA> <NA> <NA>    NA
## 6 Afghanistan  AFG 1985   m <15 <NA> <NA> <NA> <NA>    NA
# WHO_subset <- Task_4 %>%  (filter( country == Canada &&
#                                    country == Ghana &&
#                                    country == Greece))


#WHO_subset <- Task_4$country %>%  filter( country == Canada & country == Ghana & country == Greece    )

Task 6: Join

Combine “surveys” and “species” data frames using the key variable “species_id”. Add the species information (“genus”, “species”, “taxa”) to the “surveys” data. Rename as “surveys_combined”.

#library(dplyr)
surveys_combined <- surveys %>% left_join(species, by = "species_id")
## Warning: Column `species_id` joining factors with different levels,
## coercing to character vector
head(surveys_combined)
##   record_id month day year species_id sex hindfoot_length weight
## 1         1     7  16 1977         NL   M              32     NA
## 2         2     7  16 1977         NL   M              33     NA
## 3         3     7  16 1977         DM   F              37     NA
## 4         4     7  16 1977         DM   M              36     NA
## 5         5     7  16 1977         DM   M              35     NA
## 6         6     7  16 1977         PF   M              14     NA
##         genus  species   taxa
## 1     Neotoma albigula Rodent
## 2     Neotoma albigula Rodent
## 3   Dipodomys merriami Rodent
## 4   Dipodomys merriami Rodent
## 5   Dipodomys merriami Rodent
## 6 Perognathus   flavus Rodent

Task 7: Calculate

Calculate the average weight and hindfoot length of one of the species observed in each month (irrespective of the year). Exclude missing values while calculating the average

s2 <- cbind.data.frame(     surveys_combined$species, 
                            surveys_combined$sex, 
                            surveys_combined$hindfoot_length, 
                            surveys_combined$weight,
                            surveys_combined$month  )

average_male_hf_1 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '1', na.rm = TRUE)
average_male_hf_2 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '2', na.rm = TRUE)
average_male_hf_3 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '3', na.rm = TRUE)
average_male_hf_4 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '4', na.rm = TRUE)
average_male_hf_5 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '5', na.rm = TRUE)
average_male_hf_6 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '6', na.rm = TRUE)
average_male_hf_7 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '7', na.rm = TRUE)
average_male_hf_8 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '8', na.rm = TRUE)
average_male_hf_9 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '9', na.rm = TRUE)
average_male_hf_10 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '10', na.rm = TRUE)
average_male_hf_11 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '11', na.rm = TRUE)
average_male_hf_12 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$hindfoot_length` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '12', na.rm = TRUE)
average_male_w_1 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '1', na.rm = TRUE)
average_male_w_2 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '2', na.rm = TRUE)
average_male_w_3 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '3', na.rm = TRUE)
average_male_w_4 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '4', na.rm = TRUE)
average_male_w_5 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '5', na.rm = TRUE)
average_male_w_6 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '6', na.rm = TRUE)
average_male_w_7 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '7', na.rm = TRUE)
average_male_w_8 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '8', na.rm = TRUE)
average_male_w_9 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '9', na.rm = TRUE)
average_male_w_10 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '10', na.rm = TRUE)
average_male_w_11 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '11', na.rm = TRUE)
average_male_w_12 = mean(s2$`surveys_combined$sex` == 'M' &
                           s2$`surveys_combined$weight` & 
                           s2$`surveys_combined$species`=='merriami' &
                           s2$`surveys_combined$month` == '12', na.rm = TRUE)

7 cont Display Calc’s

cat(" Average Male Hindfooot length for merriami in Jan is " , 
    round( average_male_hf_1, 4), 
    "\n and the average weight for Jan is ", 
    round( average_male_w_1, 4),
    "\n Average Male Hindfooot length for merriami in Feb is " , 
    round( average_male_hf_2, 4), 
    "\n and the average weight for Feb is ", 
    round( average_male_w_2, 4),
    "\n Average Male Hindfooot length for merriami in Mar is " , 
    round( average_male_hf_3, 4), 
    "\n and the average weight for Mar is ", 
    round( average_male_w_3, 4),
    "\n Average Male Hindfooot length for merriami in Apr is " , 
    round( average_male_hf_4, 4), 
    "\n and the average weight for Apr is ", 
    round( average_male_w_4, 4),
    "\n Average Male Hindfooot length for merriami in May is " , 
    round( average_male_hf_5, 4), 
    "\n and the average weight for May is ", 
    round( average_male_w_5, 4),
    "\n Average Male Hindfooot lenght for merriami in Jun is " , 
    round( average_male_hf_6, 4), 
    "\n and the average weight for Jun is ", 
    round( average_male_w_6, 4),
    "\n Average Male Hindfooot length for merriami in Jul is " , 
    round( average_male_hf_7, 4), 
    "\n and the average weight for Jul is ", 
    round( average_male_w_7, 4),
    "\n Average Male Hindfooot length for merriami in Aug is " , 
    round( average_male_hf_8, 4), 
    "\n and the average weight for Aug is ", 
    round( average_male_w_8, 4),
    "\n Average Male Hindfooot length for merriami in Sep is " , 
    round( average_male_hf_9, 4), 
    "\n and the average weight for Sep is ", 
    round( average_male_w_9, 4),
    "\n Average Male Hindfooot length for merriami in Oct is " , 
    round( average_male_hf_10, 4), 
    "\n and the average weight for Oct is ", 
    round( average_male_w_10, 4),
    "\n Average Male Hindfooot length for merriami in Nov is " , 
    round( average_male_hf_11, 4), 
    "\n and the average weight for Nov is ", 
    round( average_male_w_11, 4),
    "\n Average Male Hindfooot length for merriami in Dec is " , 
    round( average_male_hf_12, 4), 
    "\n and the average weight for Dec is ", 
    round( average_male_w_12, 4))
##  Average Male Hindfooot length for merriami in Jan is  0.0132 
##  and the average weight for Jan is  0.0135 
##  Average Male Hindfooot length for merriami in Feb is  0.0153 
##  and the average weight for Feb is  0.0152 
##  Average Male Hindfooot length for merriami in Mar is  0.016 
##  and the average weight for Mar is  0.017 
##  Average Male Hindfooot length for merriami in Apr is  0.0142 
##  and the average weight for Apr is  0.0167 
##  Average Male Hindfooot length for merriami in May is  0.0144 
##  and the average weight for May is  0.0143 
##  Average Male Hindfooot lenght for merriami in Jun is  0.0102 
##  and the average weight for Jun is  0.0107 
##  Average Male Hindfooot length for merriami in Jul is  0.0123 
##  and the average weight for Jul is  0.0132 
##  Average Male Hindfooot length for merriami in Aug is  0.0101 
##  and the average weight for Aug is  0.0094 
##  Average Male Hindfooot length for merriami in Sep is  0.0112 
##  and the average weight for Sep is  0.0114 
##  Average Male Hindfooot length for merriami in Oct is  0.0147 
##  and the average weight for Oct is  0.0145 
##  Average Male Hindfooot length for merriami in Nov is  0.0141 
##  and the average weight for Nov is  0.014 
##  Average Male Hindfooot length for merriami in Dec is  0.0135 
##  and the average weight for Dec is  0.0135

Task 8 Find Number of NA’s in this new DataFrame for the weight column

surveys_combined_year <- surveys_combined %>% group_by(year)%>%filter(year == '1980')
sum(is.na(surveys_combined_year$weight))
## [1] 98

Use Imputational method to impute for the missing values

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
surveys_weight_imputed <- impute(surveys_combined_year$weight, fun = mean) 

##  Now check this imputed column   
sum(is.na(surveys_weight_imputed))
## [1] 0
#   is.imputed(surveys_weight_imputed)   <==  ## I would use this but its to long for individual
                                              ## checks

Task 9: Additional check for Inconsistencies or Special Values

##  This looks for NaN, Inf, -Inf , and then tells me if they are 
##  present in this particular column

is.special <- function(surveys_weight_imputed){
if (is.numeric(surveys_weight_imputed)) !is.finite(surveys_weight_imputed) else is.na(surveys_weight_imputed)
}

sum (sapply(surveys_weight_imputed, is.special))
## [1] 0

Task 10: Outliers

Use visual method as a way to check for outliers

boxplot(surveys_combined$hindfoot_length)

Change specific column to numeric for further analysis and check for NA’s

surveys_combined$hindfoot_length <- as.numeric(surveys_combined$hindfoot_length)
sum(is.na(surveys_combined$hindfoot_length))
## [1] 4111

Use the Imputation method including create outlier capping function

# Use Imputational method
surveys_combined$hindfoot_length <- impute(surveys_combined$hindfoot_length, fun = mean)

# Define a function to cap the values outside the limits
cap <- function(x){
    quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
    x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
    x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
    x
}

Apply capping function and check and repeat check for NA’s

## Apply the cap then recheck and structure check
surveys_combined$hindfoot_length <- surveys_combined$hindfoot_length %>% cap()
sum(is.na(surveys_combined$hindfoot_length)) 
## [1] 0
#str(surveys_combined$hindfoot_length)

Imputation changed it to integer, to use Boxplot I needed to change it back to numeric

then re-use boxplot as a visual check for outliers

surveys_combined$hindfoot_length <- as.numeric(surveys_combined$hindfoot_length)
boxplot(surveys_combined$hindfoot_length)

The outliers have been removed or rather , values have been imputed for the NA’s