Week 13 HLT

Kaycee 2/1/2022

R Markdown

This is an R Markdown document.

Task: Use a meaningful data visualisation using this data.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(palmerpenguins)

The palmerpenguins data contains size measurements, clutch observations, and blood isotope ratios for three penguin species observed on three islands in the Palmer Archipelago, Antarctica over a study period of three years. You can read more about the dataset on https://education.rstudio.com/blog/2020/07/palmerpenguins-cran/

# import the dataset 
penguins = penguins

# view structure of the data
str(penguins)
## tibble [344 x 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
glimpse(penguins)
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel~
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse~
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, ~
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, ~
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186~
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, ~
## $ sex               <fct> male, female, female, NA, female, male, female, male~
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007~
# dimention of dataset
dim(penguins)
## [1] 344   8
# Data Wrangling
# summary statistics 
library(skimr)
skim(penguins)
Name penguins
Number of rows 344
Number of columns 8
_______________________
Column type frequency:
factor 3
numeric 5
________________________
Group variables None

Data summary

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
species 0 1.00 FALSE 3 Ade: 152, Gen: 124, Chi: 68
island 0 1.00 FALSE 3 Bis: 168, Dre: 124, Tor: 52
sex 11 0.97 FALSE 2 mal: 168, fem: 165

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
bill_length_mm 2 0.99 43.92 5.46 32.1 39.23 44.45 48.5 59.6 ▃▇▇▆▁
bill_depth_mm 2 0.99 17.15 1.97 13.1 15.60 17.30 18.7 21.5 ▅▅▇▇▂
flipper_length_mm 2 0.99 200.92 14.06 172.0 190.00 197.00 213.0 231.0 ▂▇▃▅▂
body_mass_g 2 0.99 4201.75 801.95 2700.0 3550.00 4050.00 4750.0 6300.0 ▃▇▆▃▂
year 0 1.00 2008.03 0.82 2007.0 2007.00 2008.00 2009.0 2009.0 ▇▁▇▁▇
# check for missing values 
sum(is.na(penguins)) # 19 missing values 
## [1] 19
# show number of missing values in each column
colSums(is.na(penguins))  # the sex variables have 11 mising values 
##           species            island    bill_length_mm     bill_depth_mm 
##                 0                 0                 2                 2 
## flipper_length_mm       body_mass_g               sex              year 
##                 2                 2                11                 0
# lets remove null values 
penguins = na.omit(penguins)

# check to see if null values removed 
colSums(is.na(penguins))
##           species            island    bill_length_mm     bill_depth_mm 
##                 0                 0                 0                 0 
## flipper_length_mm       body_mass_g               sex              year 
##                 0                 0                 0                 0

Data Visualisation

# Scatter plot of bill length vs flipper length
penguins %>% 
  ggplot(aes(bill_length_mm, flipper_length_mm, col="red"))+
  geom_point(show.legend = FALSE)+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Length (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()

# Appears that there is positive relationship between Bill length and flipper length 

cor(penguins$bill_length_mm, penguins$flipper_length_mm) # correlation coefficient of 0.65
## [1] 0.6530956
# Scatter plot of bill depth vs flipper length
penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, color = "red"))+
  geom_point(show.legend = FALSE)+
  ggtitle("Scatter plot of Penguin's Bill Depth vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()

# Appears that as Bill depth increases, flipper length reduces. Notice the existence of two different clusters

cor(penguins$bill_depth_mm, penguins$flipper_length_mm) # correlation coefficient of -0.58
## [1] -0.5777917

map color by sex

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, col = sex))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()

# graph shows that male penguins higher flipper length and bill depth 

map color by species

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, color = species))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()

# Graph shows that Adelie species have much higher flipper length than the other species 

map color by the island that the penguins live

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, color = island))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()

# Graph shows that penguins that live in Biscoe island have higher flipper length than the other species 

facet by species & gender

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, color = sex))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  facet_grid(~species)+
  theme_bw()

# Graph shows that male penguins have longer flipper than female and Gentoo species have longer flipper 

facetting by sex and islands

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, color = sex))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  facet_grid(~island)+
  theme_bw()

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, col = species, size = body_mass_g))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()+
  labs(size = "Body Mass (grams)")

# graph shows that Adelie species have higher body mass and flipper length 

is there a difference in the body mass of the penguins

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, col = species, size = body_mass_g))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()

# graph shows that gentor penguins larger body mass

Map by island and body mass

penguins %>% 
  ggplot(aes(bill_depth_mm, flipper_length_mm, col = island, size = body_mass_g))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()+
  labs(size = "Body Mass (grams)")

# graph shows that  penguins that lives in Biscoe islands have larger body mass

Is there a difference in size of the body mass by gender

penguins %>% 
  ggplot(aes(bill_length_mm, flipper_length_mm, col = sex, size = body_mass_g))+
  geom_point()+
  ggtitle("Scatter plot of Penguin's Bill length vs Flipper length")+
  xlab("Bill Depth (mm)")+
  ylab("Flipper Length (mm)")+
  theme_bw()+
  facet_wrap(~species)+
  labs(size = "Body Mass (grams)")

# Appears that male penguins are larger

lets build a classification model to predict the gender of the penguins

# drop column year- wont be used for our modelling 

penguins = penguins %>% select(-year)

# import tidymodel library (for machine learning)
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## -- Attaching packages -------------------------------------- tidymodels 0.1.4 --

## v broom        0.7.10     v rsample      0.1.1 
## v dials        0.0.10     v tune         0.1.6 
## v infer        1.0.0      v workflows    0.2.4 
## v modeldata    0.1.1      v workflowsets 0.1.0 
## v parsnip      0.1.7      v yardstick    0.0.9 
## v recipes      0.1.17

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
# split dataset into training and testing set
set.seed(100) # ensures reproducibility
penguins_split = initial_split(penguins, prop = 0.75, strata = sex)
penguins_train = training(penguins_split)
penguins_test = testing(penguins_split)

Create 10 K fold resampling

set.seed(100)
kfolds = vfold_cv(penguins_train, v = 10)
kfolds
## #  10-fold cross-validation 
## # A tibble: 10 x 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [224/25]> Fold01
##  2 <split [224/25]> Fold02
##  3 <split [224/25]> Fold03
##  4 <split [224/25]> Fold04
##  5 <split [224/25]> Fold05
##  6 <split [224/25]> Fold06
##  7 <split [224/25]> Fold07
##  8 <split [224/25]> Fold08
##  9 <split [224/25]> Fold09
## 10 <split [225/24]> Fold10

Lets compare two models: A simple Logistic Regression and Random Forest model using Todymodel workflow

Step 1: Specify the models

# logistic reg
glm_spec = logistic_reg() %>% 
  set_mode("classification") %>% 
  set_engine("glm")

# Random Forest Model
rand_spec = rand_forest() %>% 
  set_mode("classification") %>% 
  set_engine("ranger")

Step 2: Set workflow pipeline

penguins_wf = workflow() %>% 
  add_formula(sex~.) # we will model sex against all other variables 

Step 3: Fit the Models against each of the resamples

Fit Logistic model

glm_fit = penguins_wf %>% 
  add_model(glm_spec) %>% 
  fit_resamples(
    resamples = kfolds,
    control = control_resamples(save_pred = TRUE)
  )
glm_fit
## # Resampling results
## # 10-fold cross-validation 
## # A tibble: 10 x 5
##    splits           id     .metrics         .notes           .predictions     
##    <list>           <chr>  <list>           <list>           <list>           
##  1 <split [224/25]> Fold01 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  2 <split [224/25]> Fold02 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  3 <split [224/25]> Fold03 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  4 <split [224/25]> Fold04 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  5 <split [224/25]> Fold05 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  6 <split [224/25]> Fold06 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  7 <split [224/25]> Fold07 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  8 <split [224/25]> Fold08 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  9 <split [224/25]> Fold09 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
## 10 <split [225/24]> Fold10 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [24 x 6]>

Fit Random Forest model

rand_fit = penguins_wf %>% 
  add_model(rand_spec) %>% 
  fit_resamples(
    resamples = kfolds,
    control = control_resamples(save_pred = TRUE)
  )
rand_fit
## # Resampling results
## # 10-fold cross-validation 
## # A tibble: 10 x 5
##    splits           id     .metrics         .notes           .predictions     
##    <list>           <chr>  <list>           <list>           <list>           
##  1 <split [224/25]> Fold01 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  2 <split [224/25]> Fold02 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  3 <split [224/25]> Fold03 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  4 <split [224/25]> Fold04 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  5 <split [224/25]> Fold05 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  6 <split [224/25]> Fold06 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  7 <split [224/25]> Fold07 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  8 <split [224/25]> Fold08 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
##  9 <split [224/25]> Fold09 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [25 x 6]>
## 10 <split [225/24]> Fold10 <tibble [2 x 4]> <tibble [0 x 1]> <tibble [24 x 6]>

Evaluate the results

# Performance metrics of Logistic model 
glm_fit %>% collect_metrics() # 92% Training Accuracy
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.920    10 0.0133  Preprocessor1_Model1
## 2 roc_auc  binary     0.981    10 0.00574 Preprocessor1_Model1
# Performance metrics of Random Forest model 
rand_fit %>% collect_metrics() # 92% Training Accuracy 
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.920    10 0.0169  Preprocessor1_Model1
## 2 roc_auc  binary     0.972    10 0.00953 Preprocessor1_Model1

Both models have similar accuracy. Folowing Ocam Razor Principle, we will pick the logistic model which is simpler and more explainable. We will fit on the training data and evaluate on the testing data using last_fit

penguins_final = penguins_wf %>% 
  add_model(glm_spec) %>% 
  last_fit(penguins_split)
penguins_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits           id               .metrics   .notes    .predictions  .workflow
##   <list>           <chr>            <list>     <list>    <list>        <list>   
## 1 <split [249/84]> train/test split <tibble [~ <tibble ~ <tibble [84 ~ <workflo~
penguins_final %>% collect_metrics()
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.905 Preprocessor1_Model1
## 2 roc_auc  binary         0.976 Preprocessor1_Model1

The Logistic model has an accuracy of 91% on the test dataset. It is not overfitting

# get the confusion matrix 
penguins_final %>% 
  collect_predictions() %>% 
  conf_mat(sex, .pred_class)
##           Truth
## Prediction female male
##     female     39    5
##     male        3   37

Lets get the odd ratios of the predictors to see which variables were significant in predicting the gender of the penguins

penguins_final$.workflow[[1]] %>% 
  tidy(exponentiate = TRUE)
## # A tibble: 9 x 5
##   term              estimate std.error statistic      p.value
##   <chr>                <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept)       1.31e-35  14.3        -5.61  0.0000000200
## 2 speciesChinstrap  3.27e- 4   1.99       -4.03  0.0000550   
## 3 speciesGentoo     3.14e- 4   2.95       -2.73  0.00631     
## 4 islandDream       2.71e+ 0   0.930       1.07  0.284       
## 5 islandTorgersen   6.58e- 1   0.996      -0.420 0.674       
## 6 bill_length_mm    1.88e+ 0   0.158       3.99  0.0000664   
## 7 bill_depth_mm     5.10e+ 0   0.373       4.37  0.0000125   
## 8 flipper_length_mm 1.03e+ 0   0.0542      0.463 0.643       
## 9 body_mass_g       1.01e+ 0   0.00121     4.70  0.00000259

The largest odds ratio is for islandTorgersen, with the second largest for bill depth. An increase of 1 mm in bill depth corresponds to 5x higher odds of being male. The characteristics of a penguin’s bill must be associated with their sex.

penguins %>% 
  ggplot(aes(bill_depth_mm, bill_length_mm, color = sex, size = body_mass_g))+
  geom_point()+
  facet_wrap(~species)+
  labs(color = "Sex", size = "Body Mass (grams)")+
  theme_bw()

Female and male penguins are much more seperated.