Setup

library(readr)
## Warning: package 'readr' was built under R version 3.4.4
library(magrittr)
## Warning: package 'magrittr' was built under R version 3.4.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(dplyr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.4
library(deductive)
## Warning: package 'deductive' was built under R version 3.4.4
library(validate)
## Warning: package 'validate' was built under R version 3.4.4
library(outliers)

Read WHO Data

WHO <- read_csv("C:/Users/Alfredo/Downloads/WHO.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   country = col_character(),
##   iso2 = col_character(),
##   iso3 = col_character()
## )
## See spec(...) for full column specifications.

Tidy Task 1:

WHO1 <- WHO %>% 
  gather(value='value',key='code',5:60)

WHO1 %>% head()

Tidy Task 2:

WHO2a <- WHO1 %>% 
  separate(code, into = c('new','var','sexage'),sep='_')
WHO2b <- WHO2a %>% 
  separate(sexage, into = c('sex','age'),sep=1)

WHO2b %>% head()

Tidy Task 3:

WHO3 <- spread(WHO2b,key=var,value = value)

WHO3 %>% head()

Tidy Task 4:

WHO4 <- mutate(WHO3,
               sex=as.factor(sex),
               age=as.factor(age))

str(WHO4)
## Classes 'tbl_df', 'tbl' and 'data.frame':    101360 obs. of  11 variables:
##  $ country: chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ iso2   : chr  "AF" "AF" "AF" "AF" ...
##  $ iso3   : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ year   : int  1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 ...
##  $ new    : chr  "new" "new" "new" "new" ...
##  $ sex    : Factor w/ 2 levels "f","m": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age    : Factor w/ 7 levels "014","1524","2534",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ep     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ rel    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ sn     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ sp     : int  NA NA NA NA NA NA NA NA NA NA ...
WHO4 <- mutate(WHO4,
              age=factor(age,labels = c('<15','15-24','25-34','35-44','45-54','55-64','65>=')))

WHO4 <- WHO4 %>% arrange(age)

WHO4 %>% head()

Task 5: Filter & Select

WHO_subset <- WHO4 %>% select(-(iso2),-(new)) %>%
  filter(country=='Argentina'|country=='Australia'|country=='Italy')

WHO_subset %>% head()

Read Species and Surveys data sets

su <- read_csv("C:/Users/Alfredo/Downloads/surveys.csv")
## Parsed with column specification:
## cols(
##   record_id = col_integer(),
##   month = col_integer(),
##   day = col_integer(),
##   year = col_integer(),
##   species_id = col_character(),
##   sex = col_character(),
##   hindfoot_length = col_integer(),
##   weight = col_integer()
## )
sp <- read_csv("C:/Users/Alfredo/Downloads/species.csv")
## Parsed with column specification:
## cols(
##   species_id = col_character(),
##   genus = col_character(),
##   species = col_character(),
##   taxa = col_character()
## )

Task 6: Join

surveys_combined <- su %>% left_join(sp,by='species_id')

surveys_combined %>% head()

Task 7: Calculate

surveys_combined %>% filter(species_id=='DM') %>% group_by(month) %>% summarise(avg_weight=mean(weight,na.rm = TRUE),
                                                   avg_hindfoot=mean(hindfoot_length,na.rm = TRUE))

Task 8: Missing Values

surveys_combined_year <- surveys_combined %>% filter(year=='1977')

surveys_combined_year %>% group_by(species) %>% summarise(sum_weight=sum(is.na(weight)))
surveys_combined_year %>% head()
surveys_weight_imputed <- surveys_combined_year

surveys_weight_imputed$weight[is.na(surveys_weight_imputed$weight)] <- mean(surveys_weight_imputed$weight, na.rm = TRUE)

surveys_weight_imputed %>% head()

Task 9: Inconsistencies or Special Values

sum(is.infinite(surveys_weight_imputed$weight))
## [1] 0
sum(is.nan(surveys_weight_imputed$weight))
## [1] 0
## Both sum of infinite and NaN values is 0; hence, there are no special values left for the variable 'weight'.

Task 10: Outliers

y <- surveys_combined$hindfoot_length
y <- na.omit(y)
y %>% boxplot(main='Box Plot of hindfoot length', ylab='hindfoot lenght')

## There are clear outliers, so we use capping in order to handle them.

z.scores <- y %>% scores(type='z')
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.8530 -0.8665  0.2835  0.0000  0.7017  4.2565
cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
}

hfl_capped <- y %>% cap()

summary(hfl_capped)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   21.00   32.00   29.29   36.00   58.00
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   21.00   32.00   29.29   36.00   70.00
## Comparing both cases, the Max value decreases from 70 to 58.