Install and load the necessary packages to reproduce the report here:
library(readr)
library(tidyr)
library(dplyr)
library(kableExtra)
library(knitr)
library(Hmisc)
library(outliers)
Read the WHO data using an appropriate function.
WHO <- read_csv("WHO.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## country = col_character(),
## iso2 = col_character(),
## iso3 = col_character()
## )
## See spec(...) for full column specifications.
WHO <- WHO %>% gather(code, value, 5:60)
WHO <- WHO %>% separate(code, into = c("new", "var", "sex"), sep = "_")
WHO <- WHO %>% separate(sex, into = c("sex","age"), sep = 1)
WHO <- WHO %>% spread(var, value)
WHO <- WHO %>% mutate(age = factor(age, levels=c("014","1524","2534","3544","4554","5564","65"), labels=c("<15","15-24","25-34","35-44","45-54","55-64","65>="), ordered=TRUE))
WHO <- WHO %>% mutate(sex = factor(sex))
Drop the redundant columns iso2
and new
, and filter any three countries from the tidy version of the WHO data set. Name this subset of the data frame as WHO_subset
.
WHO_subset <- WHO %>% filter(country %in% c("Afghanistan", "Albania", "Algeria")) %>% select(-(iso2),-(new))
Read the Species and Surveys data sets using an appropriate function. Name these data frames as species
and surveys
, respectively.
# This is an R chunk for reading the Species and Surveys data sets. Provide your R codes here:
species <- read_csv("species.csv")
surveys <- read_csv("surveys.csv")
str(species)
## Classes 'tbl_df', 'tbl' and 'data.frame': 54 obs. of 4 variables:
## $ species_id: chr "AB" "AH" "AS" "BA" ...
## $ genus : chr "Amphispiza" "Ammospermophilus" "Ammodramus" "Baiomys" ...
## $ species : chr "bilineata" "harrisi" "savannarum" "taylori" ...
## $ taxa : chr "Bird" "Rodent" "Bird" "Rodent" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 4
## .. ..$ species_id: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ genus : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ species : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ taxa : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
summary(species)
## species_id genus species
## Length:54 Length:54 Length:54
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## taxa
## Length:54
## Class :character
## Mode :character
str(surveys)
## Classes 'tbl_df', 'tbl' and 'data.frame': 35549 obs. of 8 variables:
## $ record_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ month : int 7 7 7 7 7 7 7 7 7 7 ...
## $ day : int 16 16 16 16 16 16 16 16 16 16 ...
## $ year : int 1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 ...
## $ species_id : chr "NL" "NL" "DM" "DM" ...
## $ sex : chr "M" "M" "F" "M" ...
## $ hindfoot_length: int 32 33 37 36 35 14 NA 37 34 20 ...
## $ weight : int NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 8
## .. ..$ record_id : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ month : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ day : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ species_id : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ sex : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ hindfoot_length: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ weight : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
summary(surveys)
## record_id month day year
## Min. : 1 Min. : 1.000 Min. : 1.00 Min. :1977
## 1st Qu.: 8888 1st Qu.: 4.000 1st Qu.: 9.00 1st Qu.:1984
## Median :17775 Median : 6.000 Median :16.00 Median :1990
## Mean :17775 Mean : 6.478 Mean :15.99 Mean :1990
## 3rd Qu.:26662 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1997
## Max. :35549 Max. :12.000 Max. :31.00 Max. :2002
##
## species_id sex hindfoot_length weight
## Length:35549 Length:35549 Min. : 2.00 Min. : 4.00
## Class :character Class :character 1st Qu.:21.00 1st Qu.: 20.00
## Mode :character Mode :character Median :32.00 Median : 37.00
## Mean :29.29 Mean : 42.67
## 3rd Qu.:36.00 3rd Qu.: 48.00
## Max. :70.00 Max. :280.00
## NA's :4111 NA's :3266
Combine surveys
and species
data frames using the key variable species_id
. For this task, you need to add the species information (genus
, species
, taxa
) to the surveys
data. Rename the combined data frame as surveys_combined
.
# This is a chunk for Task 6. Provide your R codes here:
surveys_combined <- full_join(species, surveys, by = "species_id")
Using the surveys_combined
data frame, calculate the average weight and hindfoot length of one of the species observed in each month (irrespective of the year). Make sure to exclude missing values while calculating the average.
# Check the species ID
unique(surveys_combined$species_id)
## [1] "AB" "AH" "AS" "BA" "CB" "CM" "CQ" "CS" "CT" "CU" "CV" "DM" "DO" "DS"
## [15] "DX" "EO" "GS" "NL" "NX" "OL" "OT" "OX" "PB" "PC" "PE" "PF" "PG" "PH"
## [29] "PI" "PL" "PM" "PP" "PU" "PX" "RF" "RM" "RO" "RX" "SA" "SB" "SC" "SF"
## [43] "SH" "SO" "SS" "ST" "SU" "SX" "UL" "UP" "UR" "US" "ZL" "ZM" NA
Species ID ‘DS’ was randomely selected for the next part.
# Make a new data set by filtering for DS
DS <- subset(surveys_combined, surveys_combined$species_id == "DS")
# Check the structure
str(DS)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2504 obs. of 11 variables:
## $ species_id : chr "DS" "DS" "DS" "DS" ...
## $ genus : chr "Dipodomys" "Dipodomys" "Dipodomys" "Dipodomys" ...
## $ species : chr "spectabilis" "spectabilis" "spectabilis" "spectabilis" ...
## $ taxa : chr "Rodent" "Rodent" "Rodent" "Rodent" ...
## $ record_id : int 11 17 20 30 42 58 73 76 80 91 ...
## $ month : int 7 7 7 7 7 7 8 8 8 8 ...
## $ day : int 16 16 17 17 18 18 19 19 19 20 ...
## $ year : int 1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 ...
## $ sex : chr "F" "F" "F" "F" ...
## $ hindfoot_length: int 53 48 48 52 46 45 44 47 48 50 ...
## $ weight : int NA NA NA NA NA NA NA NA NA NA ...
Convert month into a factor.
# Converting into a factor
DS$month <- factor(DS$month)
# Checking the conversion
str(DS$month)
## Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 8 8 8 8 ...
# Calculate the averge weight of DS in each month, excluding NA
DS_WeightByMonth <- aggregate(DS$weight ~ month, DS, mean, na.action = na.omit)
# Output the results
kable(DS_WeightByMonth, col.names = c('Month', 'Average Weight of DS'), align = rep('c')) %>%
kable_styling(bootstrap_options = "striped", full_width = F, "condensed") %>%
column_spec(1, bold = TRUE, border_right = TRUE, width = "5em") %>%
column_spec(2, width = "10em")
Month | Average Weight of DS |
---|---|
1 | 130.7521 |
2 | 126.0698 |
3 | 121.4336 |
4 | 116.1994 |
5 | 112.7483 |
6 | 111.6020 |
7 | 112.4064 |
8 | 120.1447 |
9 | 124.9539 |
10 | 126.9412 |
11 | 126.7778 |
12 | 128.7634 |
# Calculate the averge weight of DS in each month, excluding NA
DS_HFLengthByMonth <- aggregate(DS$hindfoot_length ~ month, DS, mean, na.action = na.omit)
# Output the results
kable(DS_HFLengthByMonth, col.names = c('Month', 'Average Hindfoot Length of DS'),
align = rep('c')) %>%
kable_styling(bootstrap_options = "striped", full_width = F, "condensed") %>%
column_spec(1, bold = TRUE, border_right = TRUE, width = "5em") %>%
column_spec(2, width = "10em")
Month | Average Hindfoot Length of DS |
---|---|
1 | 49.86087 |
2 | 50.36842 |
3 | 49.93103 |
4 | 49.64444 |
5 | 49.60211 |
6 | 50.21687 |
7 | 49.90286 |
8 | 49.82609 |
9 | 50.20000 |
10 | 50.17808 |
11 | 50.06044 |
12 | 49.90625 |
Select one of the years in the surveys_combined
data frame, rename this data set as surveys_combined_year
. Using surveys_combined_year
data frame, find the total missing values in weight
column grouped by species
. Replace the missing values in weight
column with the mean values of each species. Save this imputed data as surveys_weight_imputed
.
# Converting into a factor
surveys_combined$year <- factor(surveys_combined$year)
# Checking the conversion
str(surveys_combined$year)
## Factor w/ 26 levels "1977","1978",..: 4 4 4 4 4 5 5 5 5 5 ...
# This is a chunk for Task 8. Provide your R codes here:
# Choose 1988 (my year of birth)
surveys_combined_year <- subset(surveys_combined, surveys_combined$year == "1988")
# Checking the structure
str(surveys_combined_year)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1469 obs. of 11 variables:
## $ species_id : chr "AB" "AB" "AB" "AB" ...
## $ genus : chr "Amphispiza" "Amphispiza" "Amphispiza" "Amphispiza" ...
## $ species : chr "bilineata" "bilineata" "bilineata" "bilineata" ...
## $ taxa : chr "Bird" "Bird" "Bird" "Bird" ...
## $ record_id : int 13856 13897 13901 13923 13979 14008 14010 14016 14017 14034 ...
## $ month : int 1 1 1 1 1 2 2 2 2 2 ...
## $ day : int 23 23 23 24 24 21 21 21 21 21 ...
## $ year : Factor w/ 26 levels "1977","1978",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ sex : chr NA NA NA NA ...
## $ hindfoot_length: int NA NA NA NA NA NA NA NA NA NA ...
## $ weight : int NA NA NA NA NA NA NA NA NA NA ...
# Check the species ID (those which were surveyed in 1988)
unique(surveys_combined_year$species_id)
## [1] "AB" "AH" "CB" "DM" "DO" "DS" "DX" "NL" "OL" "OT" "PE" "PF" "PG" "PH"
## [15] "PM" "PP" "RF" "RM" "SA" "SH" "SS" "UL" "UP" "UR" "ZL" NA
# Creating a new list value for a count of NA value (from weight)
na_count <- sapply(surveys_combined_year$weight, function(y) sum(length(which(is.na(y)))))
# Creating a new column from the list value in surveys_combined_year
surveys_combined_year$WeightNA <- na_count
# Calculate the sum of NA
weight_by_species <- aggregate(WeightNA ~ species_id, surveys_combined_year, FUN = length)
# Output the results
kable(weight_by_species, col.names = c('Species ID (1988)', 'Number of NA Weight Values'),
align = rep('c')) %>%
kable_styling(bootstrap_options = "striped", full_width = F, "condensed") %>%
column_spec(1, bold = TRUE, border_right = TRUE, width = "8em") %>%
column_spec(2, width = "8em")
Species ID (1988) | Number of NA Weight Values |
---|---|
AB | 39 |
AH | 26 |
CB | 6 |
DM | 365 |
DO | 144 |
DS | 54 |
DX | 1 |
NL | 102 |
OL | 51 |
OT | 86 |
PE | 190 |
PF | 2 |
PG | 2 |
PH | 1 |
PM | 54 |
PP | 53 |
RF | 11 |
RM | 211 |
SA | 1 |
SH | 31 |
SS | 20 |
UL | 1 |
UP | 3 |
UR | 1 |
ZL | 2 |
# Drop the WeightNA column
surveys_combined_year <- surveys_combined_year[-c(12)]
# Determining the mean values
average_by_species_year <- aggregate(surveys_combined_year$weight ~ species_id,
surveys_combined_year, mean, na.action = na.omit)
# Checking the output
average_by_species_year
# Column rename (data cleaning) to perform next function
names(average_by_species_year) <- c("species_id", "weight")
# Creation of surveys_weight_imputed
surveys_weight_imputed <- left_join(surveys_combined_year, average_by_species_year, by = "species_id") %>%
mutate(weight = ifelse(is.na(weight.x), weight.y, weight.x)) %>%
select(-weight.y, -weight.x)
Inspect the weight
column in surveys_weight_imputed
data frame for any further inconsistencies or special values (i.e., NaN, Inf, -Inf) . Trace back and explain briefly why you got such a value.
# Checking surveys_combined_year
sum(is.na(surveys_combined_year$weight))
## [1] 142
# Checking surveys_weight_imputed
sum(is.na(surveys_weight_imputed$weight))
## [1] 114
# Select species ID 'AB' from list above
na_check <- subset(surveys_combined_year, surveys_combined_year$species_id == "AB")
dim(na_check)
## [1] 39 11
# Checking surveys_weight_imputed
sum(is.na(na_check$weight))
## [1] 39
# Not a Number Count
sum(is.nan(surveys_weight_imputed$weight))
## [1] 0
# Infinite Count
sum(is.infinite(surveys_weight_imputed$weight))
## [1] 0
# Checking for finite values
sum(is.finite(surveys_weight_imputed$weight))
## [1] 1355
# Checking the structure
str(surveys_weight_imputed$weight)
## num [1:1469] NA NA NA NA NA NA NA NA NA NA ...
The surveys_weight_imputed
still contains NA values because for these species every value for weight
from the selected year was NA to begin with. This means that they couldn’t be filled with a mean, as a mean could not be generated. All the other checks above gave expected results.
Using the surveys_combined
data frame, inspect the variable hindfoot length for possible univariate outliers. If you detect any outliers use any of the methods outlined in the Module 6 notes to deal with them. Explain briefly the actions that you take to handle outliers.
# This is a chunk for Task 10. Provide your R codes here:
unique(surveys_combined$species_id)
## [1] "AB" "AH" "AS" "BA" "CB" "CM" "CQ" "CS" "CT" "CU" "CV" "DM" "DO" "DS"
## [15] "DX" "EO" "GS" "NL" "NX" "OL" "OT" "OX" "PB" "PC" "PE" "PF" "PG" "PH"
## [29] "PI" "PL" "PM" "PP" "PU" "PX" "RF" "RM" "RO" "RX" "SA" "SB" "SC" "SF"
## [43] "SH" "SO" "SS" "ST" "SU" "SX" "UL" "UP" "UR" "US" "ZL" "ZM" NA
surveys_combined$hindfoot_length %>% boxplot(main="Box Plot of Hindfoot Length",
ylab="Hindfoot Length", col = "grey")
# Checking the summary stats
summary(surveys_combined$hindfoot_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.00 21.00 32.00 29.29 36.00 70.00 4117
# Drop all Na
surveys_combined <- dplyr::filter(surveys_combined, !is.na(hindfoot_length))
# Checking the summary stats
summary(surveys_combined$hindfoot_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 21.00 32.00 29.29 36.00 70.00
# Checking the z score summary stats
z.scores <- surveys_combined$hindfoot_length %>% scores(type = "z")
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.8530 -0.8665 0.2835 0.0000 0.7017 4.2565
# Checking the z score values
surveys_combined$hindfoot_length[ which( abs(z.scores) >3 )]
## [1] 58 64 58 70
# Imputing the outliers
surveys_combined$hindfoot_length[ which( abs(z.scores) >3 )] <- mean(surveys_combined$hindfoot_length,
na.rm = TRUE)
# Checking the results
surveys_combined$hindfoot_length %>% boxplot(main="Box Plot of Hindfoot Length",
ylab="Hindfoot Length", col = "grey")
Outliers were removed by imputation. The method of outlier removal in the case of these data does not require a large amount of consideration. This is because the hindfoot length is being taken over a huge range of different species, so this statistical evaluation has minimal applicable value. Additionally the outliers reprented only four values out of over 30,000, so any method do deal with the outliers will have minimal effect.