library(readr)
## Warning: package 'readr' was built under R version 3.4.4
library(magrittr)
## Warning: package 'magrittr' was built under R version 3.4.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(dplyr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.4
library(deductive)
## Warning: package 'deductive' was built under R version 3.4.4
library(validate)
## Warning: package 'validate' was built under R version 3.4.4
library(outliers)
WHO <- read_csv("C:/Users/Alfredo/Downloads/WHO.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## country = col_character(),
## iso2 = col_character(),
## iso3 = col_character()
## )
## See spec(...) for full column specifications.
WHO1 <- WHO %>%
gather(value='value',key='code',5:60)
WHO1 %>% head()
WHO2a <- WHO1 %>%
separate(code, into = c('new','var','sexage'),sep='_')
WHO2b <- WHO2a %>%
separate(sexage, into = c('sex','age'),sep=1)
WHO2b %>% head()
WHO3 <- spread(WHO2b,key=var,value = value)
WHO3 %>% head()
WHO4 <- mutate(WHO3,
sex=as.factor(sex),
age=as.factor(age))
str(WHO4)
## Classes 'tbl_df', 'tbl' and 'data.frame': 101360 obs. of 11 variables:
## $ country: chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ iso2 : chr "AF" "AF" "AF" "AF" ...
## $ iso3 : chr "AFG" "AFG" "AFG" "AFG" ...
## $ year : int 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 ...
## $ new : chr "new" "new" "new" "new" ...
## $ sex : Factor w/ 2 levels "f","m": 2 2 2 2 2 2 2 2 2 2 ...
## $ age : Factor w/ 7 levels "014","1524","2534",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ep : int NA NA NA NA NA NA NA NA NA NA ...
## $ rel : int NA NA NA NA NA NA NA NA NA NA ...
## $ sn : int NA NA NA NA NA NA NA NA NA NA ...
## $ sp : int NA NA NA NA NA NA NA NA NA NA ...
WHO4 <- mutate(WHO4,
age=factor(age,labels = c('<15','15-24','25-34','35-44','45-54','55-64','65>=')))
WHO4 <- WHO4 %>% arrange(age)
WHO4 %>% head()
WHO_subset <- WHO4 %>% select(-(iso2),-(new)) %>%
filter(country=='Argentina'|country=='Australia'|country=='Italy')
WHO_subset %>% head()
su <- read_csv("C:/Users/Alfredo/Downloads/surveys.csv")
## Parsed with column specification:
## cols(
## record_id = col_integer(),
## month = col_integer(),
## day = col_integer(),
## year = col_integer(),
## species_id = col_character(),
## sex = col_character(),
## hindfoot_length = col_integer(),
## weight = col_integer()
## )
sp <- read_csv("C:/Users/Alfredo/Downloads/species.csv")
## Parsed with column specification:
## cols(
## species_id = col_character(),
## genus = col_character(),
## species = col_character(),
## taxa = col_character()
## )
surveys_combined <- su %>% left_join(sp,by='species_id')
surveys_combined %>% head()
surveys_combined %>% filter(species_id=='DM') %>% group_by(month) %>% summarise(avg_weight=mean(weight,na.rm = TRUE),
avg_hindfoot=mean(hindfoot_length,na.rm = TRUE))
surveys_combined_year <- surveys_combined %>% filter(year=='1977')
surveys_combined_year %>% group_by(species) %>% summarise(sum_weight=sum(is.na(weight)))
surveys_combined_year %>% head()
surveys_weight_imputed <- surveys_combined_year
surveys_weight_imputed$weight[is.na(surveys_weight_imputed$weight)] <- mean(surveys_weight_imputed$weight, na.rm = TRUE)
surveys_weight_imputed %>% head()
sum(is.infinite(surveys_weight_imputed$weight))
## [1] 0
sum(is.nan(surveys_weight_imputed$weight))
## [1] 0
## Both sum of infinite and NaN values is 0; hence, there are no special values left for the variable 'weight'.
y <- surveys_combined$hindfoot_length
y <- na.omit(y)
y %>% boxplot(main='Box Plot of hindfoot length', ylab='hindfoot lenght')
## There are clear outliers, so we use capping in order to handle them.
z.scores <- y %>% scores(type='z')
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.8530 -0.8665 0.2835 0.0000 0.7017 4.2565
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
hfl_capped <- y %>% cap()
summary(hfl_capped)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 21.00 32.00 29.29 36.00 58.00
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 21.00 32.00 29.29 36.00 70.00
## Comparing both cases, the Max value decreases from 70 to 58.