Setup

library(readr)
library(magrittr)
library(tidyr)
library(dplyr)
library(Hmisc)
library(outliers)

Read WHO Data

who <- read_csv("WHO.csv")
Parsed with column specification:
cols(
  .default = col_integer(),
  country = col_character(),
  iso2 = col_character(),
  iso3 = col_character()
)
See spec(...) for full column specifications.
who

Tidy Task 1:

who_1 <- who %>% gather(c(5:60), key = "code", value = "value")
who_1

Tidy Task 2:

who_2 <- who_1 %>% separate(code, into = c("new","var","sex_age"), sep="_")
who_2 <- who_2 %>% separate(sex_age, into = c("sex","age"), sep = 1)
who_2

Tidy Task 3:

who_3 <- who_2 %>% spread(key = var, value = value)
who_3

Tidy Task 4:

who_4 <- who_3 %>% mutate(sex = factor(sex, levels = c("m","f")), age = factor(age, levels = c("014","1524","2534","3544","4554","5564","65"), labels = c("<15","15-24","25-34","35-44","45-54","55-64", "65>="), ordered = T))
who_4

Task 5: Filter & Select

who_tidy <- who_4 %>% select(., -c(iso2,new))
WHO_subset <- who_tidy %>% filter(country == "Australia"|country =="China"|country =="Russian Federation")
WHO_subset

Read Species and Surveys data sets

species <- read_csv("species.csv")
Parsed with column specification:
cols(
  species_id = col_character(),
  genus = col_character(),
  species = col_character(),
  taxa = col_character()
)
surveys <- read_csv("surveys.csv")
Parsed with column specification:
cols(
  record_id = col_integer(),
  month = col_integer(),
  day = col_integer(),
  year = col_integer(),
  species_id = col_character(),
  sex = col_character(),
  hindfoot_length = col_integer(),
  weight = col_integer()
)
head(species)
head(surveys)

Task 6: Join

surveys_combined <- left_join(surveys, species, by = "species_id")
head(surveys_combined)

Task 7: Calculate

flavus_avg <- surveys_combined %>% filter(., species == "flavus") %>% group_by(., month) %>% summarise(., avg_weight = mean(weight, na.rm = T), avg_hindfoot = mean(hindfoot_length, na.rm = T))
head(flavus_avg)

Task 8: Missing Values

surveys_combined_year <- surveys_combined %>% filter(., year == "1977")
surveys_combined_year %>% group_by(.,species) %>% summarise (total_na = sum(is.na(weight)))
surveys_weight_imputed <- surveys_combined_year %>% 
  group_by(species) %>% 
  mutate(weight = ifelse(is.na(weight), mean(weight, na.rm = TRUE), weight))
surveys_weight_imputed

Task 9: Inconsistencies or Special Values

is.special <- function(x){if(is.numeric(x)) !is.finite(x) else is.nan(x)}
sapply(surveys_weight_imputed$weight, is.special)
  [1]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [16] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
 [31] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [46] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
 [76] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[106]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[136] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[166] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
[196] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[211] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
[226] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[256] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
[271] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
[286] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
[301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
[316] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
[331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
[346] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[361] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[376] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[391] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE
[406]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
[421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[436] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[451] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
[466] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
[481] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[496] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE

Explaination:

Task 10: Outliers

boxplot(surveys_combined$hindfoot_length, main = "Box Plot of hindfoot length")

cap <- function(x){
    quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ),na.rm = T)
    x[ x < quantiles[2] - 1.5*IQR(x,na.rm = T)] <- quantiles[1]
    x[ x > quantiles[3] + 1.5*IQR(x,na.rm = T)] <- quantiles[4]
    x
}
hindfood_capped <- surveys_combined$hindfoot_length %>% cap()
hindfood_capped
   [1] 32 33 37 36 35 14 NA 37 34 20 53 38 35 NA 36 36 48 22 NA 48 34 31 36 21 35 31 36 38
  [29] NA 52 37 35 36 NA 38 22 35 33 36 36 34 46 36 35 36 35 32 36 17 32 36 26 36 37 36 34
  [57] NA 45 33 20 35 35 35 37 34 35 35 32 15 21 36 31 44 12 32 47 NA 16 34 48 14 35 37 35
  [85] 35 33 11 35 20 35 50 35 NA 36 38 36 36 38 37 54 35 35 35 43 35 NA NA 21 35 NA 37 37
 [113] 13 32 34 37 19 33 35 35 NA 33 37 36 35 30 33 34 34 34 37 33 37 45 35 16 35 37 15 33
 [141] 36 34 35 40 37 36 37 37 50 35 35 38 50 47 NA 35 36 36 53 37 37 39 21 36 50 13 36 36
 [169] 37 36 NA NA 36 37 33 45 NA 33 37 34 35 49 37 52 19 34 36 48 33 36 47 36 NA NA 50 15
 [197] 37 35 36 52 47 32 38 37 50 52 53 37 35 36 48 NA 35 NA 36 13 19 13 36 52 NA 52 50 NA
 [225] NA 37 NA 38 35 NA NA 48 25 35 37 47 50 20 48 53 NA 19 38 36 37 37 54 NA 36 51 36 53
 [253] 38 36 35 36 37 36 36 36 36 19 37 49 38 NA 46 36 37 36 50 NA 37 20 37 50 20 50 19 37
 [281] NA NA 34 37 36 50 38 37 38 33 47 36 36 37 36 18 NA 36 19 NA 48 36 36 37 35 33 36 48
 [309] 49 NA 18 36 35 NA 36 37 32 33 36 35 NA 47 33 NA NA NA 50 NA 47 37 50 16 NA 37 52 36
 [337] 35 37 49 36 50 34 NA NA 34 37 38 36 NA 37 NA 36 NA 38 NA 32 50 16 36 35 36 51 NA 37
 [365] 38 NA 51 16 36 35 36 36 34 37 15 15 48 33 36 33 48 37 52 36 50 NA NA 36 NA NA 15 53
 [393] NA 48 36 36 16 50 36 NA NA NA NA 37 NA NA 50  9 36 53 36 52 49 34 36 NA NA NA NA 38
 [421] 51 37 36 37 36 36 37 33 NA 15 NA NA 38 16 NA 37 37 36 36 16 NA NA 35 38 NA NA 37 51
 [449] 35 33 51 NA 36 36 NA NA NA NA NA NA 36 36 49 49 NA 51 32 35 NA 15 NA 49 NA 15 NA NA
 [477] 47 NA 36 47 35 37 49 37 52 NA 38 37 NA 37 48 49 49 NA NA 36 NA 16 51 NA NA 36 NA NA
 [505] 36 NA 35 36 NA 16 36 48 NA 16 NA 38 26 36 38 15 36 37 36 NA 34 51 37 51 36 37 NA NA
 [533] NA NA NA NA NA 36 34 47 36 36 49 NA 38 NA NA 53 38 36 50 51 50 NA 48 36 37 49 36 50
 [561] 53 NA NA 51 NA 36 36 NA NA NA NA 38 NA 34 38 36 NA NA NA NA NA NA NA NA 53 35 NA NA
 [589] 39 16 38 39 38 38 37 NA 33 NA 15 37 36 37 NA 22 NA NA 36 34 49 37 20 NA 49 49 49 36
 [617] 38 47 39 49 NA 46 NA 35 35 20 NA NA NA 18 20 NA 55 NA 38 36 38 37 35 50 38 36 35 38
 [645] 38 32 52 37 35 15 38 14 47 37 32 49 54 21 20 37 NA 15 18 35 35 21 35 21 50 20 20 15
 [673] NA 49 NA NA 52 49 49 NA NA NA NA 35 NA NA NA 48 53 36 50 NA NA 37 NA 37 36 NA 36 37
 [701] NA 35 NA 35 NA NA NA 20 NA 49 37 36 15 NA 36 35 NA 35 37 47 37 NA 16 37 NA NA NA NA
 [729] 36 NA 47 19 33 36 18 NA 36 NA 19 NA 48 18 36 21 37 20 35 NA NA NA 35 49 36 37 32 20
 [757] NA 34 49 48 NA NA 50 35 36 35 NA NA NA 37 35 NA 37 36 37 37 20 36 NA NA 37 37 47 NA
 [785] 39 52 50 36 52 50 53 34 49 35 33 50 50 48 36 35 46 NA 48 34 47 47 35 51 47 NA NA NA
 [813] NA 55 38 46 37 49 NA 35 34 48 33 38 NA 20 36 NA 48 38 NA NA 14 38 53 49 48 37 51 37
 [841] NA 38 50 46 32 NA 35 32 36 48 46 36 49 36 NA 37 NA NA 13 NA 19 52 48 NA 37 37 37 50
 [869] 33 49 NA 52 50 37 33 36 50 NA NA NA 35 52 20 49 36 NA 34 34 50 49 35 50 48 35 50 50
 [897] 21 35 33 37 48 NA 35 52 33 34 36 33 36 37 48 NA 37 NA 18 50 52 21 35 15 48 50 34 37
 [925] 36 35 48 NA 48 36 46 37 20 47 NA NA NA 38 51 51 36 32 NA 50 49 36 33 37 49 34 21 34
 [953] 52 36 47 49 33 36 47 NA 36 47 50 36 NA 35 49 13 15 NA 32 NA 47 49 35 15 34 18 NA 51
 [981] 21 51 NA NA NA NA 36 NA 50 NA 23 38 54 33 16 48 49 51 34 46
 [ reached getOption("max.print") -- omitted 34549 entries ]

Explaination:



LS0tCnRpdGxlOiAiTUFUSDIzNDkgU2VtZXN0ZXIgMiwgMjAxOCIKYXV0aG9yOiAiQW5uYSBLcmlub2Noa2luYSBzMzcxMjc2MSwgU2hhbiBKaWFuZyBzMzU5MjM2OSwgWGl5dWUgU2h1IHMzNzA1NDc0IgpzdWJ0aXRsZTogQXNzaWdubWVudCAyCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0Ci0tLQoKIyMgU2V0dXAKCgpgYGB7ciwgZWNobyA9IFRSVUUsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkocmVhZHIpCmxpYnJhcnkobWFncml0dHIpCmxpYnJhcnkodGlkeXIpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoSG1pc2MpCmxpYnJhcnkob3V0bGllcnMpCmBgYAoKIyMgUmVhZCBXSE8gRGF0YQoKYGBge3J9CndobyA8LSByZWFkX2NzdigiV0hPLmNzdiIpCndobwpgYGAKCiMjIFRpZHkgVGFzayAxOgoKYGBge3IsIGVjaG89VFJVRX0Kd2hvXzEgPC0gd2hvICU+JSBnYXRoZXIoYyg1OjYwKSwga2V5ID0gImNvZGUiLCB2YWx1ZSA9ICJ2YWx1ZSIpCndob18xCmBgYAoKIyMgVGlkeSBUYXNrIDI6CgpgYGB7cn0Kd2hvXzIgPC0gd2hvXzEgJT4lIHNlcGFyYXRlKGNvZGUsIGludG8gPSBjKCJuZXciLCJ2YXIiLCJzZXhfYWdlIiksIHNlcD0iXyIpCndob18yIDwtIHdob18yICU+JSBzZXBhcmF0ZShzZXhfYWdlLCBpbnRvID0gYygic2V4IiwiYWdlIiksIHNlcCA9IDEpCndob18yCmBgYAoKIyMgVGlkeSBUYXNrIDM6CgpgYGB7cn0Kd2hvXzMgPC0gd2hvXzIgJT4lIHNwcmVhZChrZXkgPSB2YXIsIHZhbHVlID0gdmFsdWUpCndob18zCmBgYAoKIyMgVGlkeSBUYXNrIDQ6IAoKYGBge3J9Cndob180IDwtIHdob18zICU+JSBtdXRhdGUoc2V4ID0gZmFjdG9yKHNleCwgbGV2ZWxzID0gYygibSIsImYiKSksIGFnZSA9IGZhY3RvcihhZ2UsIGxldmVscyA9IGMoIjAxNCIsIjE1MjQiLCIyNTM0IiwiMzU0NCIsIjQ1NTQiLCI1NTY0IiwiNjUiKSwgbGFiZWxzID0gYygiPDE1IiwiMTUtMjQiLCIyNS0zNCIsIjM1LTQ0IiwiNDUtNTQiLCI1NS02NCIsICI2NT49IiksIG9yZGVyZWQgPSBUKSkKd2hvXzQKYGBgCgojIyBUYXNrIDU6IEZpbHRlciAmIFNlbGVjdAoKYGBge3J9Cndob190aWR5IDwtIHdob180ICU+JSBzZWxlY3QoLiwgLWMoaXNvMixuZXcpKQpXSE9fc3Vic2V0IDwtIHdob190aWR5ICU+JSBmaWx0ZXIoY291bnRyeSA9PSAiQXVzdHJhbGlhInxjb3VudHJ5ID09IkNoaW5hInxjb3VudHJ5ID09IlJ1c3NpYW4gRmVkZXJhdGlvbiIpCldIT19zdWJzZXQKYGBgCgoKIyMgUmVhZCBTcGVjaWVzIGFuZCBTdXJ2ZXlzIGRhdGEgc2V0cwoKYGBge3J9CnNwZWNpZXMgPC0gcmVhZF9jc3YoInNwZWNpZXMuY3N2IikKc3VydmV5cyA8LSByZWFkX2Nzdigic3VydmV5cy5jc3YiKQpoZWFkKHNwZWNpZXMpCmhlYWQoc3VydmV5cykKYGBgCgojIyBUYXNrIDY6IEpvaW4gIAoKYGBge3J9CnN1cnZleXNfY29tYmluZWQgPC0gbGVmdF9qb2luKHN1cnZleXMsIHNwZWNpZXMsIGJ5ID0gInNwZWNpZXNfaWQiKQpoZWFkKHN1cnZleXNfY29tYmluZWQpCmBgYAoKIyMgVGFzayA3OiBDYWxjdWxhdGUgCgpgYGB7cn0KZmxhdnVzX2F2ZyA8LSBzdXJ2ZXlzX2NvbWJpbmVkICU+JSBmaWx0ZXIoLiwgc3BlY2llcyA9PSAiZmxhdnVzIikgJT4lIGdyb3VwX2J5KC4sIG1vbnRoKSAlPiUgc3VtbWFyaXNlKC4sIGF2Z193ZWlnaHQgPSBtZWFuKHdlaWdodCwgbmEucm0gPSBUKSwgYXZnX2hpbmRmb290ID0gbWVhbihoaW5kZm9vdF9sZW5ndGgsIG5hLnJtID0gVCkpCmhlYWQoZmxhdnVzX2F2ZykKYGBgCgojIyBUYXNrIDg6IE1pc3NpbmcgVmFsdWVzCgpgYGB7cn0Kc3VydmV5c19jb21iaW5lZF95ZWFyIDwtIHN1cnZleXNfY29tYmluZWQgJT4lIGZpbHRlciguLCB5ZWFyID09ICIxOTc3IikKc3VydmV5c19jb21iaW5lZF95ZWFyICU+JSBncm91cF9ieSguLHNwZWNpZXMpICU+JSBzdW1tYXJpc2UgKHRvdGFsX25hID0gc3VtKGlzLm5hKHdlaWdodCkpKQoKc3VydmV5c193ZWlnaHRfaW1wdXRlZCA8LSBzdXJ2ZXlzX2NvbWJpbmVkX3llYXIgJT4lIAogIGdyb3VwX2J5KHNwZWNpZXMpICU+JSAKICBtdXRhdGUod2VpZ2h0ID0gaWZlbHNlKGlzLm5hKHdlaWdodCksIG1lYW4od2VpZ2h0LCBuYS5ybSA9IFRSVUUpLCB3ZWlnaHQpKQpzdXJ2ZXlzX3dlaWdodF9pbXB1dGVkCmBgYAoKIyMgVGFzayA5OiBJbmNvbnNpc3RlbmNpZXMgb3IgU3BlY2lhbCBWYWx1ZXMKCmBgYHtyfQoKaXMuc3BlY2lhbCA8LSBmdW5jdGlvbih4KXtpZihpcy5udW1lcmljKHgpKSAhaXMuZmluaXRlKHgpIGVsc2UgaXMubmFuKHgpfQpzYXBwbHkoc3VydmV5c193ZWlnaHRfaW1wdXRlZCR3ZWlnaHQsIGlzLnNwZWNpYWwpCmBgYAoKRXhwbGFpbmF0aW9uOiAKCiogRm9yIHNvbWUgc3BlY2llcyAoZS5nLiBhbGJpZ3VsYSksIHRoZSB3ZWlnaHQgZW50cmllcyBhcmUgTkEgdG8gc3RhcnQgd2l0aCwgc28gd2hlbiB3ZSBjYWxjdWxhdGUgdGhlIGF2ZXJhZ2Ugd2VpZ2h0IGZvciB0aGVzZSBzcGVjaWVzLCB3ZSB3aWxsIGdldCBOQU4uCgojIyBUYXNrIDEwOiBPdXRsaWVycwoKYGBge3J9CmJveHBsb3Qoc3VydmV5c19jb21iaW5lZCRoaW5kZm9vdF9sZW5ndGgsIG1haW4gPSAiQm94IFBsb3Qgb2YgaGluZGZvb3QgbGVuZ3RoIikKY2FwIDwtIGZ1bmN0aW9uKHgpewogICAgcXVhbnRpbGVzIDwtIHF1YW50aWxlKCB4LCBjKC4wNSwgMC4yNSwgMC43NSwgLjk1ICksbmEucm0gPSBUKQogICAgeFsgeCA8IHF1YW50aWxlc1syXSAtIDEuNSpJUVIoeCxuYS5ybSA9IFQpXSA8LSBxdWFudGlsZXNbMV0KICAgIHhbIHggPiBxdWFudGlsZXNbM10gKyAxLjUqSVFSKHgsbmEucm0gPSBUKV0gPC0gcXVhbnRpbGVzWzRdCiAgICB4Cn0KaGluZGZvb2RfY2FwcGVkIDwtIHN1cnZleXNfY29tYmluZWQkaGluZGZvb3RfbGVuZ3RoICU+JSBjYXAoKQpoaW5kZm9vZF9jYXBwZWQKYGBgCgpFeHBsYWluYXRpb246CgoqIFdlIHVzZWQgdGhlIGNhcHBpbmcgbWV0aG9kLCByZXBsYWNpbmcgdGhlIG91dGxpZXJzIHdpdGggdGhlIG5lYXJlc3QgbmVpZ2hib3VycyB0aGF0IGFyZSBub3Qgb3V0bGllcnMuIFdlIHVzZWQgdGhlIHZhbHVlIG9mIHRoZSA1dGggcGVyY2VudGlsZSB0byBkZWFsIHdpdGggdGhlIG91dGxpZXJzIG91dHNpZGUgdGhlIGxvd2VyIGxpbWl0IGFuZCB0aGUgOTV0aCBwZXJjZW50aWxlIHRvIGRlYWwgd2l0aCB0aGUgb3V0bGllcnMgYWJvdmUgdGhlIHVwcGVyIGxpbWl0LiAKCjxicj4KPGJyPgo=