library(readr)
library(magrittr)
library(tidyr)
library(dplyr)
library(Hmisc)
library(outliers)
who <- read_csv("WHO.csv")
Parsed with column specification:
cols(
.default = col_integer(),
country = col_character(),
iso2 = col_character(),
iso3 = col_character()
)
See spec(...) for full column specifications.
who
who_1 <- who %>% gather(c(5:60), key = "code", value = "value")
who_1
who_2 <- who_1 %>% separate(code, into = c("new","var","sex_age"), sep="_")
who_2 <- who_2 %>% separate(sex_age, into = c("sex","age"), sep = 1)
who_2
who_3 <- who_2 %>% spread(key = var, value = value)
who_3
who_4 <- who_3 %>% mutate(sex = factor(sex, levels = c("m","f")), age = factor(age, levels = c("014","1524","2534","3544","4554","5564","65"), labels = c("<15","15-24","25-34","35-44","45-54","55-64", "65>="), ordered = T))
who_4
who_tidy <- who_4 %>% select(., -c(iso2,new))
WHO_subset <- who_tidy %>% filter(country == "Australia"|country =="China"|country =="Russian Federation")
WHO_subset
species <- read_csv("species.csv")
Parsed with column specification:
cols(
species_id = col_character(),
genus = col_character(),
species = col_character(),
taxa = col_character()
)
surveys <- read_csv("surveys.csv")
Parsed with column specification:
cols(
record_id = col_integer(),
month = col_integer(),
day = col_integer(),
year = col_integer(),
species_id = col_character(),
sex = col_character(),
hindfoot_length = col_integer(),
weight = col_integer()
)
head(species)
head(surveys)
surveys_combined <- left_join(surveys, species, by = "species_id")
head(surveys_combined)
flavus_avg <- surveys_combined %>% filter(., species == "flavus") %>% group_by(., month) %>% summarise(., avg_weight = mean(weight, na.rm = T), avg_hindfoot = mean(hindfoot_length, na.rm = T))
head(flavus_avg)
surveys_combined_year <- surveys_combined %>% filter(., year == "1977")
surveys_combined_year %>% group_by(.,species) %>% summarise (total_na = sum(is.na(weight)))
surveys_weight_imputed <- surveys_combined_year %>%
group_by(species) %>%
mutate(weight = ifelse(is.na(weight), mean(weight, na.rm = TRUE), weight))
surveys_weight_imputed
is.special <- function(x){if(is.numeric(x)) !is.finite(x) else is.nan(x)}
sapply(surveys_weight_imputed$weight, is.special)
[1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[16] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[31] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[46] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
[76] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[106] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[136] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[166] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[196] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[211] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[226] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[256] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
[271] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
[286] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
[301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[316] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
[331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
[346] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[361] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[376] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[391] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
[406] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE
[421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[436] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[451] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
[466] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
[481] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
[496] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
Explaination:
boxplot(surveys_combined$hindfoot_length, main = "Box Plot of hindfoot length")
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ),na.rm = T)
x[ x < quantiles[2] - 1.5*IQR(x,na.rm = T)] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x,na.rm = T)] <- quantiles[4]
x
}
hindfood_capped <- surveys_combined$hindfoot_length %>% cap()
hindfood_capped
[1] 32 33 37 36 35 14 NA 37 34 20 53 38 35 NA 36 36 48 22 NA 48 34 31 36 21 35 31 36 38
[29] NA 52 37 35 36 NA 38 22 35 33 36 36 34 46 36 35 36 35 32 36 17 32 36 26 36 37 36 34
[57] NA 45 33 20 35 35 35 37 34 35 35 32 15 21 36 31 44 12 32 47 NA 16 34 48 14 35 37 35
[85] 35 33 11 35 20 35 50 35 NA 36 38 36 36 38 37 54 35 35 35 43 35 NA NA 21 35 NA 37 37
[113] 13 32 34 37 19 33 35 35 NA 33 37 36 35 30 33 34 34 34 37 33 37 45 35 16 35 37 15 33
[141] 36 34 35 40 37 36 37 37 50 35 35 38 50 47 NA 35 36 36 53 37 37 39 21 36 50 13 36 36
[169] 37 36 NA NA 36 37 33 45 NA 33 37 34 35 49 37 52 19 34 36 48 33 36 47 36 NA NA 50 15
[197] 37 35 36 52 47 32 38 37 50 52 53 37 35 36 48 NA 35 NA 36 13 19 13 36 52 NA 52 50 NA
[225] NA 37 NA 38 35 NA NA 48 25 35 37 47 50 20 48 53 NA 19 38 36 37 37 54 NA 36 51 36 53
[253] 38 36 35 36 37 36 36 36 36 19 37 49 38 NA 46 36 37 36 50 NA 37 20 37 50 20 50 19 37
[281] NA NA 34 37 36 50 38 37 38 33 47 36 36 37 36 18 NA 36 19 NA 48 36 36 37 35 33 36 48
[309] 49 NA 18 36 35 NA 36 37 32 33 36 35 NA 47 33 NA NA NA 50 NA 47 37 50 16 NA 37 52 36
[337] 35 37 49 36 50 34 NA NA 34 37 38 36 NA 37 NA 36 NA 38 NA 32 50 16 36 35 36 51 NA 37
[365] 38 NA 51 16 36 35 36 36 34 37 15 15 48 33 36 33 48 37 52 36 50 NA NA 36 NA NA 15 53
[393] NA 48 36 36 16 50 36 NA NA NA NA 37 NA NA 50 9 36 53 36 52 49 34 36 NA NA NA NA 38
[421] 51 37 36 37 36 36 37 33 NA 15 NA NA 38 16 NA 37 37 36 36 16 NA NA 35 38 NA NA 37 51
[449] 35 33 51 NA 36 36 NA NA NA NA NA NA 36 36 49 49 NA 51 32 35 NA 15 NA 49 NA 15 NA NA
[477] 47 NA 36 47 35 37 49 37 52 NA 38 37 NA 37 48 49 49 NA NA 36 NA 16 51 NA NA 36 NA NA
[505] 36 NA 35 36 NA 16 36 48 NA 16 NA 38 26 36 38 15 36 37 36 NA 34 51 37 51 36 37 NA NA
[533] NA NA NA NA NA 36 34 47 36 36 49 NA 38 NA NA 53 38 36 50 51 50 NA 48 36 37 49 36 50
[561] 53 NA NA 51 NA 36 36 NA NA NA NA 38 NA 34 38 36 NA NA NA NA NA NA NA NA 53 35 NA NA
[589] 39 16 38 39 38 38 37 NA 33 NA 15 37 36 37 NA 22 NA NA 36 34 49 37 20 NA 49 49 49 36
[617] 38 47 39 49 NA 46 NA 35 35 20 NA NA NA 18 20 NA 55 NA 38 36 38 37 35 50 38 36 35 38
[645] 38 32 52 37 35 15 38 14 47 37 32 49 54 21 20 37 NA 15 18 35 35 21 35 21 50 20 20 15
[673] NA 49 NA NA 52 49 49 NA NA NA NA 35 NA NA NA 48 53 36 50 NA NA 37 NA 37 36 NA 36 37
[701] NA 35 NA 35 NA NA NA 20 NA 49 37 36 15 NA 36 35 NA 35 37 47 37 NA 16 37 NA NA NA NA
[729] 36 NA 47 19 33 36 18 NA 36 NA 19 NA 48 18 36 21 37 20 35 NA NA NA 35 49 36 37 32 20
[757] NA 34 49 48 NA NA 50 35 36 35 NA NA NA 37 35 NA 37 36 37 37 20 36 NA NA 37 37 47 NA
[785] 39 52 50 36 52 50 53 34 49 35 33 50 50 48 36 35 46 NA 48 34 47 47 35 51 47 NA NA NA
[813] NA 55 38 46 37 49 NA 35 34 48 33 38 NA 20 36 NA 48 38 NA NA 14 38 53 49 48 37 51 37
[841] NA 38 50 46 32 NA 35 32 36 48 46 36 49 36 NA 37 NA NA 13 NA 19 52 48 NA 37 37 37 50
[869] 33 49 NA 52 50 37 33 36 50 NA NA NA 35 52 20 49 36 NA 34 34 50 49 35 50 48 35 50 50
[897] 21 35 33 37 48 NA 35 52 33 34 36 33 36 37 48 NA 37 NA 18 50 52 21 35 15 48 50 34 37
[925] 36 35 48 NA 48 36 46 37 20 47 NA NA NA 38 51 51 36 32 NA 50 49 36 33 37 49 34 21 34
[953] 52 36 47 49 33 36 47 NA 36 47 50 36 NA 35 49 13 15 NA 32 NA 47 49 35 15 34 18 NA 51
[981] 21 51 NA NA NA NA 36 NA 50 NA 23 38 54 33 16 48 49 51 34 46
[ reached getOption("max.print") -- omitted 34549 entries ]
Explaination: