MATH2349 Semester 1, 2018

Setup

Install and load the necessary packages to reproduce the report here:

library(readr)
library(tidyr)
library(dplyr)
library(kableExtra)
library(knitr)
library(Hmisc)
library(outliers)

Read WHO Data

Read the WHO data using an appropriate function.

WHO <- read_csv("WHO.csv")

## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   country = col_character(),
##   iso2 = col_character(),
##   iso3 = col_character()
## )

## See spec(...) for full column specifications.

Tidy Task 1:

WHO <- WHO %>% gather(code, value, 5:60)

Tidy Task 2:

WHO <- WHO %>% separate(code, into = c("new", "var", "sex"), sep = "_")
WHO <- WHO %>% separate(sex, into = c("sex","age"), sep = 1)

Tidy Task 3:

WHO <- WHO %>% spread(var, value)

Tidy Task 4:

WHO <- WHO %>% mutate(age = factor(age, levels=c("014","1524","2534","3544","4554","5564","65"), labels=c("<15","15-24","25-34","35-44","45-54","55-64","65>="), ordered=TRUE))
WHO <- WHO %>% mutate(sex = factor(sex))

Task 5: Filter & Select

Drop the redundant columns iso2 and new, and filter any three countries from the tidy version of the WHO data set. Name this subset of the data frame as WHO_subset.

WHO_subset <- WHO %>% filter(country %in% c("Afghanistan", "Albania", "Algeria")) %>% select(-(iso2),-(new))

Read Species and Surveys data sets

Read the Species and Surveys data sets using an appropriate function. Name these data frames as species and surveys, respectively.

# This is an R chunk for reading the Species and Surveys data sets. Provide your R codes here:
species <- read_csv("species.csv")
surveys <- read_csv("surveys.csv")

Task 6: Join

Checking the imported data

str(species)

## Classes 'tbl_df', 'tbl' and 'data.frame':    54 obs. of  4 variables:
##  $ species_id: chr  "AB" "AH" "AS" "BA" ...
##  $ genus     : chr  "Amphispiza" "Ammospermophilus" "Ammodramus" "Baiomys" ...
##  $ species   : chr  "bilineata" "harrisi" "savannarum" "taylori" ...
##  $ taxa      : chr  "Bird" "Rodent" "Bird" "Rodent" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 4
##   .. ..$ species_id: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ genus     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ species   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ taxa      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

summary(species)

##   species_id           genus             species         
##  Length:54          Length:54          Length:54         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      taxa          
##  Length:54         
##  Class :character  
##  Mode  :character

str(surveys)

## Classes 'tbl_df', 'tbl' and 'data.frame':    35549 obs. of  8 variables:
##  $ record_id      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ month          : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ day            : int  16 16 16 16 16 16 16 16 16 16 ...
##  $ year           : int  1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 ...
##  $ species_id     : chr  "NL" "NL" "DM" "DM" ...
##  $ sex            : chr  "M" "M" "F" "M" ...
##  $ hindfoot_length: int  32 33 37 36 35 14 NA 37 34 20 ...
##  $ weight         : int  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 8
##   .. ..$ record_id      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ month          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ day            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ year           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ species_id     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ sex            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ hindfoot_length: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ weight         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

summary(surveys)

##    record_id         month             day             year     
##  Min.   :    1   Min.   : 1.000   Min.   : 1.00   Min.   :1977  
##  1st Qu.: 8888   1st Qu.: 4.000   1st Qu.: 9.00   1st Qu.:1984  
##  Median :17775   Median : 6.000   Median :16.00   Median :1990  
##  Mean   :17775   Mean   : 6.478   Mean   :15.99   Mean   :1990  
##  3rd Qu.:26662   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1997  
##  Max.   :35549   Max.   :12.000   Max.   :31.00   Max.   :2002  
##                                                                 
##   species_id            sex            hindfoot_length     weight      
##  Length:35549       Length:35549       Min.   : 2.00   Min.   :  4.00  
##  Class :character   Class :character   1st Qu.:21.00   1st Qu.: 20.00  
##  Mode  :character   Mode  :character   Median :32.00   Median : 37.00  
##                                        Mean   :29.29   Mean   : 42.67  
##                                        3rd Qu.:36.00   3rd Qu.: 48.00  
##                                        Max.   :70.00   Max.   :280.00  
##                                        NA's   :4111    NA's   :3266

Combine surveys and species data frames using the key variable species_id. For this task, you need to add the species information (genus, species, taxa) to the surveys data. Rename the combined data frame as surveys_combined.

# This is a chunk for Task 6. Provide your R codes here: 
surveys_combined <- full_join(species, surveys, by = "species_id")

Task 7: Calculate

Using the surveys_combined data frame, calculate the average weight and hindfoot length of one of the species observed in each month (irrespective of the year). Make sure to exclude missing values while calculating the average.

# Check the species ID
unique(surveys_combined$species_id)

##  [1] "AB" "AH" "AS" "BA" "CB" "CM" "CQ" "CS" "CT" "CU" "CV" "DM" "DO" "DS"
## [15] "DX" "EO" "GS" "NL" "NX" "OL" "OT" "OX" "PB" "PC" "PE" "PF" "PG" "PH"
## [29] "PI" "PL" "PM" "PP" "PU" "PX" "RF" "RM" "RO" "RX" "SA" "SB" "SC" "SF"
## [43] "SH" "SO" "SS" "ST" "SU" "SX" "UL" "UP" "UR" "US" "ZL" "ZM" NA

Species ID ‘DS’ was randomely selected for the next part.

# Make a new data set by filtering for DS
DS <- subset(surveys_combined, surveys_combined$species_id == "DS")

# Check the structure
str(DS)

## Classes 'tbl_df', 'tbl' and 'data.frame':    2504 obs. of  11 variables:
##  $ species_id     : chr  "DS" "DS" "DS" "DS" ...
##  $ genus          : chr  "Dipodomys" "Dipodomys" "Dipodomys" "Dipodomys" ...
##  $ species        : chr  "spectabilis" "spectabilis" "spectabilis" "spectabilis" ...
##  $ taxa           : chr  "Rodent" "Rodent" "Rodent" "Rodent" ...
##  $ record_id      : int  11 17 20 30 42 58 73 76 80 91 ...
##  $ month          : int  7 7 7 7 7 7 8 8 8 8 ...
##  $ day            : int  16 16 17 17 18 18 19 19 19 20 ...
##  $ year           : int  1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 ...
##  $ sex            : chr  "F" "F" "F" "F" ...
##  $ hindfoot_length: int  53 48 48 52 46 45 44 47 48 50 ...
##  $ weight         : int  NA NA NA NA NA NA NA NA NA NA ...

Convert month into a factor.

# Converting into a factor
DS$month <- factor(DS$month)

# Checking the conversion
str(DS$month)

##  Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 8 8 8 8 ...

# Calculate the averge weight of DS in each month, excluding NA
DS_WeightByMonth <- aggregate(DS$weight ~ month, DS, mean, na.action = na.omit)

# Output the results
kable(DS_WeightByMonth, col.names = c('Month', 'Average Weight of DS'), align = rep('c')) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, "condensed") %>%
  column_spec(1, bold = TRUE, border_right = TRUE, width = "5em") %>%
  column_spec(2, width = "10em")

Month	Average Weight of DS
1	130.7521
2	126.0698
3	121.4336
4	116.1994
5	112.7483
6	111.6020
7	112.4064
8	120.1447
9	124.9539
10	126.9412
11	126.7778
12	128.7634

# Calculate the averge weight of DS in each month, excluding NA
DS_HFLengthByMonth <- aggregate(DS$hindfoot_length ~ month, DS, mean, na.action = na.omit)

# Output the results
kable(DS_HFLengthByMonth, col.names = c('Month', 'Average Hindfoot Length of DS'), 
      align = rep('c')) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, "condensed") %>%
  column_spec(1, bold = TRUE, border_right = TRUE, width = "5em") %>%
  column_spec(2, width = "10em")

Month	Average Hindfoot Length of DS
1	49.86087
2	50.36842
3	49.93103
4	49.64444
5	49.60211
6	50.21687
7	49.90286
8	49.82609
9	50.20000
10	50.17808
11	50.06044
12	49.90625

Task 8: Missing Values

Select one of the years in the surveys_combined data frame, rename this data set as surveys_combined_year. Using surveys_combined_year data frame, find the total missing values in weight column grouped by species. Replace the missing values in weight column with the mean values of each species. Save this imputed data as surveys_weight_imputed.

# Converting into a factor
surveys_combined$year <- factor(surveys_combined$year)

# Checking the conversion
str(surveys_combined$year)

##  Factor w/ 26 levels "1977","1978",..: 4 4 4 4 4 5 5 5 5 5 ...

# This is a chunk for Task 8. Provide your R codes here: 
# Choose 1988 (my year of birth)
surveys_combined_year <- subset(surveys_combined, surveys_combined$year == "1988")

# Checking the structure
str(surveys_combined_year)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1469 obs. of  11 variables:
##  $ species_id     : chr  "AB" "AB" "AB" "AB" ...
##  $ genus          : chr  "Amphispiza" "Amphispiza" "Amphispiza" "Amphispiza" ...
##  $ species        : chr  "bilineata" "bilineata" "bilineata" "bilineata" ...
##  $ taxa           : chr  "Bird" "Bird" "Bird" "Bird" ...
##  $ record_id      : int  13856 13897 13901 13923 13979 14008 14010 14016 14017 14034 ...
##  $ month          : int  1 1 1 1 1 2 2 2 2 2 ...
##  $ day            : int  23 23 23 24 24 21 21 21 21 21 ...
##  $ year           : Factor w/ 26 levels "1977","1978",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ sex            : chr  NA NA NA NA ...
##  $ hindfoot_length: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ weight         : int  NA NA NA NA NA NA NA NA NA NA ...

# Check the species ID (those which were surveyed in 1988)
unique(surveys_combined_year$species_id)

##  [1] "AB" "AH" "CB" "DM" "DO" "DS" "DX" "NL" "OL" "OT" "PE" "PF" "PG" "PH"
## [15] "PM" "PP" "RF" "RM" "SA" "SH" "SS" "UL" "UP" "UR" "ZL" NA

# Creating a new list value for a count of NA value (from weight)
na_count <- sapply(surveys_combined_year$weight, function(y) sum(length(which(is.na(y)))))
# Creating a new column from the list value in surveys_combined_year
surveys_combined_year$WeightNA <- na_count

# Calculate the sum of NA 
weight_by_species <- aggregate(WeightNA ~ species_id, surveys_combined_year, FUN = length)

The total missing values in “weight” column grouped by species (in the year 1988)

# Output the results
kable(weight_by_species, col.names = c('Species ID (1988)', 'Number of NA Weight Values'), 
      align = rep('c')) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, "condensed") %>%
  column_spec(1, bold = TRUE, border_right = TRUE, width = "8em") %>%
  column_spec(2, width = "8em")

Species ID (1988)	Number of NA Weight Values
AB	39
AH	26
CB	6
DM	365
DO	144
DS	54
DX	1
NL	102
OL	51
OT	86
PE	190
PF	2
PG	2
PH	1
PM	54
PP	53
RF	11
RM	211
SA	1
SH	31
SS	20
UL	1
UP	3
UR	1
ZL	2

# Drop the WeightNA column
surveys_combined_year <- surveys_combined_year[-c(12)]

Determine the mean values

# Determining the mean values
average_by_species_year <- aggregate(surveys_combined_year$weight ~ species_id, 
                                     surveys_combined_year, mean, na.action = na.omit)

# Checking the output
average_by_species_year

# Column rename (data cleaning) to perform next function
names(average_by_species_year) <- c("species_id", "weight")

# Creation of surveys_weight_imputed
surveys_weight_imputed <- left_join(surveys_combined_year, average_by_species_year, by = "species_id") %>%
  mutate(weight = ifelse(is.na(weight.x), weight.y, weight.x)) %>% 
  select(-weight.y, -weight.x)

Task 9: Inconsistencies or Special Values

Inspect the weight column in surveys_weight_imputed data frame for any further inconsistencies or special values (i.e., NaN, Inf, -Inf) . Trace back and explain briefly why you got such a value.

# Checking surveys_combined_year
sum(is.na(surveys_combined_year$weight))

## [1] 142

# Checking surveys_weight_imputed
sum(is.na(surveys_weight_imputed$weight))

## [1] 114

# Select species ID 'AB' from list above
na_check <- subset(surveys_combined_year, surveys_combined_year$species_id == "AB")

dim(na_check)

## [1] 39 11

# Checking surveys_weight_imputed
sum(is.na(na_check$weight))

## [1] 39

# Not a Number Count
sum(is.nan(surveys_weight_imputed$weight))

## [1] 0

# Infinite Count
sum(is.infinite(surveys_weight_imputed$weight))

## [1] 0

# Checking for finite values
sum(is.finite(surveys_weight_imputed$weight))

## [1] 1355

# Checking the structure
str(surveys_weight_imputed$weight)

##  num [1:1469] NA NA NA NA NA NA NA NA NA NA ...

The surveys_weight_imputed still contains NA values because for these species every value for weight from the selected year was NA to begin with. This means that they couldn’t be filled with a mean, as a mean could not be generated. All the other checks above gave expected results.

Task 10: Outliers

Using the surveys_combined data frame, inspect the variable hindfoot length for possible univariate outliers. If you detect any outliers use any of the methods outlined in the Module 6 notes to deal with them. Explain briefly the actions that you take to handle outliers.

# This is a chunk for Task 10. Provide your R codes here: 
unique(surveys_combined$species_id)

##  [1] "AB" "AH" "AS" "BA" "CB" "CM" "CQ" "CS" "CT" "CU" "CV" "DM" "DO" "DS"
## [15] "DX" "EO" "GS" "NL" "NX" "OL" "OT" "OX" "PB" "PC" "PE" "PF" "PG" "PH"
## [29] "PI" "PL" "PM" "PP" "PU" "PX" "RF" "RM" "RO" "RX" "SA" "SB" "SC" "SF"
## [43] "SH" "SO" "SS" "ST" "SU" "SX" "UL" "UP" "UR" "US" "ZL" "ZM" NA

surveys_combined$hindfoot_length %>%  boxplot(main="Box Plot of Hindfoot Length", 
                                              ylab="Hindfoot Length", col = "grey")

# Checking the summary stats
summary(surveys_combined$hindfoot_length)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.00   21.00   32.00   29.29   36.00   70.00    4117

# Drop all Na
surveys_combined <- dplyr::filter(surveys_combined,  !is.na(hindfoot_length))

# Checking the summary stats
summary(surveys_combined$hindfoot_length)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   21.00   32.00   29.29   36.00   70.00

# Checking the z score summary stats
z.scores <- surveys_combined$hindfoot_length %>%  scores(type = "z")
z.scores %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.8530 -0.8665  0.2835  0.0000  0.7017  4.2565

# Checking the z score values
surveys_combined$hindfoot_length[ which( abs(z.scores) >3 )]

## [1] 58 64 58 70

# Imputing the outliers
surveys_combined$hindfoot_length[ which( abs(z.scores) >3 )] <- mean(surveys_combined$hindfoot_length, 
                                                                     na.rm = TRUE)

# Checking the results
surveys_combined$hindfoot_length %>%  boxplot(main="Box Plot of Hindfoot Length", 
                                              ylab="Hindfoot Length", col = "grey")

Outliers were removed by imputation. The method of outlier removal in the case of these data does not require a large amount of consideration. This is because the hindfoot length is being taken over a huge range of different species, so this statistical evaluation has minimal applicable value. Additionally the outliers reprented only four values out of over 30,000, so any method do deal with the outliers will have minimal effect.