library(palmerpenguins)
library(tidyr)
library(dplyr)
library(ggplot2)
library(mlbench)
library(MASS)
library(pROC)
library(stringr)
library(ggplot2)

1. Logistic Regression with a binary outcome. (40)

a.

The penguin dataset has ‘species’ column. Please check how many categories you have in the species column. Conduct whatever data manipulation you need to do to be able to build a logistic regression with binary outcome. Please explain your reasoning behind your decision as you manipulate the outcome/dependent variable (species).

There are three categories in the species column. From the summary statistic, there are missing values in the dataset, being displayed as NA’s and should be removed. Species is the binary dependent variable in this dataset with categories Adelie, Chinstrap and Gentoo. When the dependent variable has more than two categories, then it is a multinomial logistic regression. In this case we are trying to build a logistic regression with binary outcome therefore one category has to be removed.

summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2

I will remove the species with the lowest quantity which is Chinstrap.

newdata <-na.omit(penguins)
newdatat <- newdata %>% filter(species !="Chinstrap")
glimpse(newdatat)
## Rows: 265
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Ade…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgers…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.…
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.…
## $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 1…
## $ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 380…
## $ sex               <fct> male, female, female, female, male, female, male, f…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 200…

b.

Please make sure you are evaluating the independent variables appropriately in deciding which ones should be in the model.

Non-numeric columns, Island and Sex were removed along with Year. I reordered the columns to have the binary dependent variable, species as the last column.

newdata2 <-newdatat [, c(3,4,5,6,1)]
#newdata2 <-newdatat [, c(3,6,1)]

pnewdata <-newdata2 %>%
  mutate_all(str_trim) %>%
  mutate(bill_length_mm = bill_length_mm %>% as.numeric()) %>%
  mutate(bill_depth_mm = bill_depth_mm %>% as.numeric()) %>%
  mutate(flipper_length_mm= flipper_length_mm %>% as.numeric()) %>%
  mutate(body_mass_g= body_mass_g %>% as.numeric()) %>%
  arrange(body_mass_g) %>%
  mutate_if(is.character,as.factor)
glimpse(pnewdata)
## Rows: 265
## Columns: 5
## $ bill_length_mm    <dbl> 36.5, 36.4, 34.5, 33.1, 38.6, 37.9, 37.0, 37.3, 35.…
## $ bill_depth_mm     <dbl> 16.6, 17.1, 18.1, 16.1, 17.0, 18.6, 16.9, 16.8, 16.…
## $ flipper_length_mm <dbl> 181, 184, 187, 178, 188, 193, 185, 192, 190, 186, 1…
## $ body_mass_g       <dbl> 2850, 2850, 2900, 2900, 2900, 2925, 3000, 3000, 305…
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Ade…

There are now four independent variables as indicated in the Descriptive statistics.

Changed data frame to matrix to handle the following error: Error in hist.default(newdata2[, i], main = colnames(newdata2)[i], xlab = colnames(newdata2)[i], : ‘x’ must be numeric

#newdata3 <-as.matrix(sapply(newdata2, as.numeric))
#head(newdata3)
pnewdata2 <-as.matrix(sapply(pnewdata, as.numeric))

Analyze the distribution of each independent variable

par(mfrow = c(2,2))
for( i in 1:4){
  hist(pnewdata2[,i], main = colnames(pnewdata)[i],xlab=colnames(pnewdata)[i], col = 'yellow')
}

For continuous independent variables, we can get more clarity on the distribution by analyzing it w.r.t. dependent variable.

par(mfrow = c(2,2))
boxplot(bill_length_mm~species, ylab="Bill Length (mm)", xlab= "Species", col="light blue",data = pnewdata)
boxplot(bill_depth_mm~species, ylab="Bill Depth (mm)", xlab= "Species", col="light blue",data = pnewdata)
boxplot(flipper_length_mm~species, ylab="Flipper Length (mm)", xlab= "Species", col="light blue",data = pnewdata)
boxplot(body_mass_g~species, ylab="Body Mass (g)", xlab= "Species", col="light blue",data = pnewdata)

## c. Provide variable interpretations in your model.

logit_1 <- glm(species~., family = binomial,data = pnewdata)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Analysis of Model Summary

summary(logit_1)
## 
## Call:
## glm(formula = species ~ ., family = binomial, data = pnewdata)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -3.173e-05  -2.100e-08  -2.100e-08   2.100e-08   3.746e-05  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)       -1.435e+02  3.546e+05   0.000    1.000
## bill_length_mm    -1.706e-01  5.586e+03   0.000    1.000
## bill_depth_mm     -1.167e+01  1.048e+04  -0.001    0.999
## flipper_length_mm  1.265e+00  1.689e+03   0.001    0.999
## body_mass_g        1.848e-02  2.833e+01   0.001    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3.6461e+02  on 264  degrees of freedom
## Residual deviance: 5.8112e-09  on 260  degrees of freedom
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25

For continuous variables, the interpretation is as follows: For every one unit increase in bill length(mm), the log odds of being species ‘Adelie’(versus being Species ‘Gentoo’) decreases by 1.706. Similarly, for one unit increase in bill depth(mm), the log odds of being species ‘Adelie’(versus being Species ‘Gentoo’) decreases by 1.167. For every one unit increase in flipper_length(mm), the log odds of being species ‘Adelie’(versus being Species ‘Gentoo’) increases by 1.265. Similarly, for one unit increase in body_mass(g), the log odds of being species ‘Adelie’(versus being Species ‘Gentoo’) increases by 1.848.

The model ‘logit_1’, might not be the best model with the given set of independent variables. There are multiple methodologies for variable selection. I will explore only the ‘stepAIC’ function. The ‘stepAIC’ function in R performs a stepwise model selection with an objective to minimize the AIC value.

logit_2 <- stepAIC(logit_1)
## Start:  AIC=10
## species ~ bill_length_mm + bill_depth_mm + flipper_length_mm + 
##     body_mass_g
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                     Df Deviance    AIC
## - bill_length_mm     1   0.0000  8.000
## - body_mass_g        1   0.0000  8.000
## - flipper_length_mm  1   0.0000  8.000
## <none>                   0.0000 10.000
## - bill_depth_mm      1   9.9544 17.954
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=8
## species ~ bill_depth_mm + flipper_length_mm + body_mass_g
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                     Df Deviance    AIC
## - body_mass_g        1    0.000  6.000
## - flipper_length_mm  1    0.000  6.000
## <none>                    0.000  8.000
## - bill_depth_mm      1   15.388 21.388
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=6
## species ~ bill_depth_mm + flipper_length_mm
## 
##                     Df Deviance    AIC
## <none>                    0.000  6.000
## - bill_depth_mm      1   20.887 24.887
## - flipper_length_mm  1   79.317 83.317
summary(logit_2)
## 
## Call:
## glm(formula = species ~ bill_depth_mm + flipper_length_mm, family = binomial, 
##     data = pnewdata)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -7.598e-05  -2.100e-08  -2.100e-08   2.100e-08   7.092e-05  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -242.039 383506.606  -0.001    0.999
## bill_depth_mm        -20.343  12032.046  -0.002    0.999
## flipper_length_mm      2.803   1489.552   0.002    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3.6461e+02  on 264  degrees of freedom
## Residual deviance: 1.1987e-08  on 262  degrees of freedom
## AIC: 6
## 
## Number of Fisher Scoring iterations: 25

After implementing ‘stepAIC’ function, I am now left with two independent variables — bill depth(mm) and flipper length(mm). Of all the possible models, this model (logit_2) has the minimum AIC value and these variables are highly significant.

summary(logit_2$fitted.values)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4491  1.0000  1.0000
newdata2$Predict <- ifelse(logit_2$fitted.values >0.5,"pos","neg")
logit_1$aic
## [1] 10
logit_2$aic
## [1] 6

2. Provide: AUC, Accuracy, TPR, FPR, TNR, FNR (20)

Gentoo Confusion Matrix was zero so I will adjust the data set.

pennewdata <-na.omit(penguins)%>% filter(species !="Chinstrap")
#newdata2 <-newdatat [, c(3,4,5,6,1)]
pennewdata2 <-pennewdata [, c(3,6,1)]
pennewdata3 <-pennewdata2 %>%
  mutate_all(str_trim) %>%
  mutate(bill_length_mm = bill_length_mm %>% as.numeric()) %>%
  #mutate(bill_depth_mm = bill_depth_mm %>% as.numeric()) %>%
  #mutate(flipper_length_mm= flipper_length_mm %>% as.numeric()) %>%
  mutate(body_mass_g= body_mass_g %>% as.numeric()) %>%
  #arrange(body_mass_g) %>%
  mutate_if(is.character,as.factor)
glimpse(pennewdata3)
## Rows: 265
## Columns: 3
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6, …
## $ body_mass_g    <dbl> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800, …
## $ species        <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adelie…

Train/Test Split

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✓ broom     0.7.4      ✓ recipes   0.1.15
## ✓ dials     0.0.9      ✓ rsample   0.0.8 
## ✓ infer     0.5.4      ✓ tibble    3.0.6 
## ✓ modeldata 0.1.0      ✓ tune      0.1.2 
## ✓ parsnip   0.1.5      ✓ workflows 0.2.1 
## ✓ purrr     0.3.4      ✓ yardstick 0.0.7
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag()     masks stats::lag()
## x MASS::select()   masks dplyr::select()
## x recipes::step()  masks stats::step()
df_split <- initial_split(pennewdata3, prop=0.7)
df_train <- training(df_split)
df_test <- testing(df_split)

Logistic regression model

lr_model <- logistic_reg() %>%
  # using model classification
  set_mode('classification') %>%
  # use glm function
  set_engine('glm') %>%
  #fit training data
  fit(species ~ ., df_train)

Prediction on Training Data

lr_train_pred <- lr_model %>% 
  predict(df_train) %>%
  # rename the prediction column
  mutate(prediction = `.pred_class`) %>%
  # merge the prediction result back to training data set
  bind_cols(df_train) 

Confusion Matrix is a tabular representation of Observed vs Predicted values. It helps to quantify the efficiency (or accuracy) of the model.

lr_train_cm <- lr_train_pred %>% 
  #use only prediction values and actual label
  dplyr::select(prediction, species) %>%
  #construct confusion matrix
  table() %>%
  #display as matrix
  as.matrix()
lr_train_cm
##           species
## prediction Adelie Gentoo
##     Adelie    101      4
##     Gentoo      4     77

Accuracy is .946.

lr_train_pred %>%
  metrics(truth = species, estimate = prediction)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.957
## 2 kap      binary         0.913
tp<-lr_train_cm[1,1]
fp<-lr_train_cm[1,2]
fn<-lr_train_cm[2,1]
tn<-lr_train_cm[2,2]

accuracy <- (tp+tn)/(tp+tn+fp+fn) #accuracy=(TP+TN/P+N)
tpr <- tp/(tp+fn) #TPR=TP/(TP+FN)
fpr <- fp/(fp+tn) #FPR=FP/(FP+TN)
tnr <- tn/(tn+fp) #TNR=TN/(TN+FP)
fnr <- fn/(fn+tp) #FNR=FN/(FN+TP)


lr_pred_prob_tr <- lr_train_pred %>%
  cbind(predict(lr_model, df_train, type='prob'))
lr_pred_prob_tr
##     .pred_class prediction bill_length_mm body_mass_g species .pred_Adelie
## 1        Adelie     Adelie           39.1        3750  Adelie 9.993429e-01
## 2        Adelie     Adelie           39.5        3800  Adelie 9.987914e-01
## 3        Adelie     Adelie           40.3        3250  Adelie 9.999346e-01
## 4        Adelie     Adelie           36.7        3450  Adelie 9.999831e-01
## 5        Adelie     Adelie           39.3        3650  Adelie 9.995977e-01
## 6        Adelie     Adelie           39.2        4675  Adelie 7.965891e-01
## 7        Adelie     Adelie           38.7        3450  Adelie 9.999273e-01
## 8        Adelie     Adelie           42.5        4500  Adelie 5.184427e-01
## 9        Adelie     Adelie           34.4        3325  Adelie 9.999986e-01
## 10       Adelie     Adelie           37.7        3600  Adelie 9.999088e-01
## 11       Adelie     Adelie           35.9        3800  Adelie 9.999123e-01
## 12       Adelie     Adelie           38.2        3950  Adelie 9.987824e-01
## 13       Adelie     Adelie           35.3        3800  Adelie 9.999433e-01
## 14       Adelie     Adelie           40.6        3550  Adelie 9.994510e-01
## 15       Adelie     Adelie           40.5        3200  Adelie 9.999450e-01
## 16       Adelie     Adelie           37.9        3150  Adelie 9.999940e-01
## 17       Adelie     Adelie           40.5        3950  Adelie 9.935242e-01
## 18       Adelie     Adelie           39.5        3250  Adelie 9.999635e-01
## 19       Adelie     Adelie           37.2        3900  Adelie 9.995724e-01
## 20       Adelie     Adelie           39.5        3300  Adelie 9.999498e-01
## 21       Adelie     Adelie           40.9        3900  Adelie 9.936940e-01
## 22       Adelie     Adelie           36.4        3325  Adelie 9.999939e-01
## 23       Adelie     Adelie           39.2        4150  Adelie 9.910533e-01
## 24       Adelie     Adelie           42.2        3550  Adelie 9.982398e-01
## 25       Adelie     Adelie           37.6        3300  Adelie 9.999874e-01
## 26       Adelie     Adelie           36.5        3150  Adelie 9.999978e-01
## 27       Adelie     Adelie           40.8        3900  Adelie 9.941347e-01
## 28       Adelie     Adelie           36.0        3100  Adelie 9.999989e-01
## 29       Gentoo     Gentoo           44.1        4400  Adelie 3.879773e-01
## 30       Adelie     Adelie           37.0        3000  Adelie 9.999988e-01
## 31       Adelie     Adelie           39.6        4600  Adelie 8.250618e-01
## 32       Adelie     Adelie           42.3        4150  Adelie 9.204034e-01
## 33       Adelie     Adelie           39.6        3500  Adelie 9.998073e-01
## 34       Adelie     Adelie           35.0        3450  Adelie 9.999951e-01
## 35       Adelie     Adelie           42.0        4050  Adelie 9.645370e-01
## 36       Adelie     Adelie           34.5        2900  Adelie 9.999999e-01
## 37       Adelie     Adelie           39.0        3550  Adelie 9.998289e-01
## 38       Adelie     Adelie           40.6        3800  Adelie 9.973093e-01
## 39       Adelie     Adelie           37.6        3750  Adelie 9.997797e-01
## 40       Adelie     Adelie           35.7        3150  Adelie 9.999988e-01
## 41       Adelie     Adelie           37.6        3600  Adelie 9.999152e-01
## 42       Adelie     Adelie           36.4        2850  Adelie 9.999997e-01
## 43       Adelie     Adelie           41.1        4100  Adelie 9.744407e-01
## 44       Adelie     Adelie           41.8        4450  Adelie 7.114362e-01
## 45       Adelie     Adelie           33.5        3600  Adelie 9.999957e-01
## 46       Adelie     Adelie           39.7        3900  Adelie 9.973608e-01
## 47       Adelie     Adelie           39.6        3550  Adelie 9.997351e-01
## 48       Gentoo     Gentoo           45.8        4150  Adelie 4.741790e-01
## 49       Adelie     Adelie           40.9        3700  Adelie 9.982268e-01
## 50       Adelie     Adelie           37.2        3900  Adelie 9.995724e-01
## 51       Adelie     Adelie           36.2        3550  Adelie 9.999778e-01
## 52       Adelie     Adelie           42.1        4000  Adelie 9.720391e-01
## 53       Adelie     Adelie           34.6        3200  Adelie 9.999993e-01
## 54       Gentoo     Gentoo           42.9        4700  Adelie 1.837642e-01
## 55       Adelie     Adelie           36.7        3800  Adelie 9.998428e-01
## 56       Adelie     Adelie           35.1        4200  Adelie 9.993754e-01
## 57       Adelie     Adelie           37.3        3350  Adelie 9.999861e-01
## 58       Adelie     Adelie           41.3        3550  Adelie 9.990859e-01
## 59       Adelie     Adelie           36.3        3800  Adelie 9.998826e-01
## 60       Adelie     Adelie           36.9        3500  Adelie 9.999731e-01
## 61       Adelie     Adelie           38.3        3950  Adelie 9.986905e-01
## 62       Adelie     Adelie           35.7        3550  Adelie 9.999846e-01
## 63       Adelie     Adelie           41.1        4300  Adelie 9.143213e-01
## 64       Adelie     Adelie           36.2        3300  Adelie 9.999955e-01
## 65       Adelie     Adelie           40.8        4300  Adelie 9.299716e-01
## 66       Adelie     Adelie           38.1        3700  Adelie 9.997693e-01
## 67       Adelie     Adelie           40.3        4350  Adelie 9.329173e-01
## 68       Adelie     Adelie           33.1        2900  Adelie 1.000000e+00
## 69       Adelie     Adelie           43.2        4100  Adelie 8.918843e-01
## 70       Adelie     Adelie           35.0        3725  Adelie 9.999718e-01
## 71       Adelie     Adelie           37.7        3075  Adelie 9.999968e-01
## 72       Adelie     Adelie           37.8        4250  Adelie 9.938878e-01
## 73       Adelie     Adelie           37.9        2925  Adelie 9.999986e-01
## 74       Adelie     Adelie           38.6        3750  Adelie 9.995435e-01
## 75       Adelie     Adelie           38.1        3825  Adelie 9.994889e-01
## 76       Gentoo     Gentoo           45.6        4600  Adelie 5.612102e-02
## 77       Adelie     Adelie           39.7        3200  Adelie 9.999693e-01
## 78       Adelie     Adelie           39.6        3900  Adelie 9.975459e-01
## 79       Adelie     Adelie           42.7        4075  Adelie 9.330027e-01
## 80       Adelie     Adelie           38.6        2900  Adelie 9.999980e-01
## 81       Adelie     Adelie           35.7        3350  Adelie 9.999957e-01
## 82       Adelie     Adelie           36.2        3150  Adelie 9.999983e-01
## 83       Adelie     Adelie           37.7        3500  Adelie 9.999517e-01
## 84       Adelie     Adelie           40.2        3450  Adelie 9.997829e-01
## 85       Adelie     Adelie           41.4        3875  Adelie 9.922681e-01
## 86       Adelie     Adelie           35.2        3050  Adelie 9.999996e-01
## 87       Adelie     Adelie           38.8        3275  Adelie 9.999743e-01
## 88       Adelie     Adelie           41.5        4300  Adelie 8.885495e-01
## 89       Adelie     Adelie           44.1        4000  Adelie 8.900010e-01
## 90       Adelie     Adelie           38.5        3325  Adelie 9.999716e-01
## 91       Adelie     Adelie           43.1        3500  Adelie 9.975345e-01
## 92       Adelie     Adelie           36.8        3500  Adelie 9.999750e-01
## 93       Adelie     Adelie           37.5        4475  Adelie 9.797186e-01
## 94       Adelie     Adelie           38.1        3425  Adelie 9.999599e-01
## 95       Adelie     Adelie           41.1        3900  Adelie 9.927116e-01
## 96       Adelie     Adelie           35.6        3175  Adelie 9.999987e-01
## 97       Adelie     Adelie           40.2        3975  Adelie 9.938960e-01
## 98       Adelie     Adelie           37.0        3400  Adelie 9.999847e-01
## 99       Adelie     Adelie           39.7        4250  Adelie 9.760229e-01
## 100      Adelie     Adelie           32.1        3050  Adelie 1.000000e+00
## 101      Adelie     Adelie           40.7        3725  Adelie 9.982030e-01
## 102      Adelie     Adelie           39.0        3650  Adelie 9.996767e-01
## 103      Adelie     Adelie           39.2        4250  Adelie 9.832233e-01
## 104      Adelie     Adelie           36.6        3475  Adelie 9.999815e-01
## 105      Adelie     Adelie           36.0        3450  Adelie 9.999898e-01
## 106      Gentoo     Gentoo           46.1        4500  Gentoo 7.240609e-02
## 107      Gentoo     Gentoo           48.7        4450  Gentoo 1.587244e-02
## 108      Gentoo     Gentoo           50.0        5700  Gentoo 2.187371e-06
## 109      Gentoo     Gentoo           47.6        5400  Gentoo 8.494069e-05
## 110      Gentoo     Gentoo           46.5        4550  Gentoo 4.069152e-02
## 111      Gentoo     Gentoo           45.4        4800  Gentoo 1.889114e-02
## 112      Gentoo     Gentoo           46.7        5200  Gentoo 5.845061e-04
## 113      Adelie     Adelie           43.3        4400  Gentoo 5.317852e-01
## 114      Gentoo     Gentoo           46.8        5150  Gentoo 7.469771e-04
## 115      Adelie     Adelie           40.9        4650  Gentoo 5.707985e-01
## 116      Gentoo     Gentoo           49.0        5550  Gentoo 1.178202e-05
## 117      Gentoo     Gentoo           45.5        4650  Gentoo 4.445020e-02
## 118      Gentoo     Gentoo           48.4        5850  Gentoo 2.702035e-06
## 119      Gentoo     Gentoo           45.8        4200  Gentoo 3.961106e-01
## 120      Adelie     Adelie           42.0        4150  Gentoo 9.350217e-01
## 121      Gentoo     Gentoo           49.2        6300  Gentoo 8.594716e-08
## 122      Gentoo     Gentoo           46.5        4400  Gentoo 9.928201e-02
## 123      Gentoo     Gentoo           42.9        5000  Gentoo 3.226485e-02
## 124      Gentoo     Gentoo           46.1        5100  Gentoo 1.708953e-03
## 125      Gentoo     Gentoo           48.2        4600  Gentoo 8.856772e-03
## 126      Gentoo     Gentoo           42.8        4700  Gentoo 1.949505e-01
## 127      Gentoo     Gentoo           45.1        5050  Gentoo 4.854788e-03
## 128      Gentoo     Gentoo           49.1        5150  Gentoo 1.397882e-04
## 129      Gentoo     Gentoo           42.6        4950  Gentoo 5.396302e-02
## 130      Gentoo     Gentoo           44.4        5250  Gentoo 2.269402e-03
## 131      Gentoo     Gentoo           48.7        5350  Gentoo 5.237852e-05
## 132      Gentoo     Gentoo           49.6        5700  Gentoo 2.927847e-06
## 133      Gentoo     Gentoo           49.6        4750  Gentoo 1.237865e-03
## 134      Gentoo     Gentoo           50.5        5550  Gentoo 3.947981e-06
## 135      Gentoo     Gentoo           43.6        4900  Gentoo 3.645362e-02
## 136      Gentoo     Gentoo           45.2        5300  Gentoo 9.225706e-04
## 137      Gentoo     Gentoo           46.6        4850  Gentoo 5.806148e-03
## 138      Gentoo     Gentoo           45.1        4400  Gentoo 2.342008e-01
## 139      Gentoo     Gentoo           45.0        5050  Gentoo 5.219962e-03
## 140      Adelie     Adelie           43.8        4300  Gentoo 5.985667e-01
## 141      Gentoo     Gentoo           50.4        5550  Gentoo 4.246503e-06
## 142      Gentoo     Gentoo           54.3        5650  Gentoo 1.308993e-07
## 143      Gentoo     Gentoo           49.8        5700  Gentoo 2.530670e-06
## 144      Gentoo     Gentoo           49.5        5800  Gentoo 1.666152e-06
## 145      Gentoo     Gentoo           43.5        4700  Gentoo 1.269276e-01
## 146      Gentoo     Gentoo           50.7        5550  Gentoo 3.412418e-06
## 147      Gentoo     Gentoo           46.4        5000  Gentoo 2.593381e-03
## 148      Gentoo     Gentoo           46.5        5200  Gentoo 6.761800e-04
## 149      Gentoo     Gentoo           46.4        4700  Gentoo 1.725469e-02
## 150      Gentoo     Gentoo           48.6        5800  Gentoo 3.210872e-06
## 151      Gentoo     Gentoo           47.5        4600  Gentoo 1.466622e-02
## 152      Gentoo     Gentoo           51.1        6000  Gentoo 1.452878e-07
## 153      Gentoo     Gentoo           45.2        4750  Gentoo 2.971652e-02
## 154      Gentoo     Gentoo           49.1        4625  Gentoo 3.939067e-03
## 155      Gentoo     Gentoo           52.5        5450  Gentoo 1.736744e-06
## 156      Gentoo     Gentoo           47.4        4725  Gentoo 7.172193e-03
## 157      Gentoo     Gentoo           50.0        5350  Gentoo 2.030636e-05
## 158      Gentoo     Gentoo           44.9        4750  Gentoo 3.671339e-02
## 159      Gentoo     Gentoo           50.8        5600  Gentoo 2.307601e-06
## 160      Gentoo     Gentoo           51.3        5300  Gentoo 1.082297e-05
## 161      Gentoo     Gentoo           47.5        4875  Gentoo 2.577890e-03
## 162      Gentoo     Gentoo           52.1        5550  Gentoo 1.229905e-06
## 163      Gentoo     Gentoo           47.5        4950  Gentoo 1.600743e-03
## 164      Gentoo     Gentoo           52.2        5400  Gentoo 2.971326e-06
## 165      Gentoo     Gentoo           45.5        4750  Gentoo 2.401987e-02
## 166      Gentoo     Gentoo           49.5        5650  Gentoo 4.329619e-06
## 167      Gentoo     Gentoo           50.8        5200  Gentoo 2.945180e-05
## 168      Gentoo     Gentoo           51.1        5250  Gentoo 1.721481e-05
## 169      Gentoo     Gentoo           48.5        4850  Gentoo 1.459850e-03
## 170      Gentoo     Gentoo           55.9        5600  Gentoo 5.606321e-08
## 171      Gentoo     Gentoo           47.2        4975  Gentoo 1.698736e-03
## 172      Gentoo     Gentoo           49.1        5500  Gentoo 1.505938e-05
## 173      Gentoo     Gentoo           46.8        5500  Gentoo 8.051547e-05
## 174      Gentoo     Gentoo           41.7        4700  Gentoo 3.506127e-01
## 175      Gentoo     Gentoo           53.4        5500  Gentoo 6.555141e-07
## 176      Gentoo     Gentoo           43.3        4575  Gentoo 2.715420e-01
## 177      Gentoo     Gentoo           48.1        5500  Gentoo 3.121517e-05
## 178      Gentoo     Gentoo           50.5        5000  Gentoo 1.309239e-04
## 179      Gentoo     Gentoo           43.5        4650  Gentoo 1.665776e-01
## 180      Gentoo     Gentoo           51.5        5500  Gentoo 2.618518e-06
## 181      Gentoo     Gentoo           55.1        5850  Gentoo 2.045048e-08
## 182      Gentoo     Gentoo           48.8        6000  Gentoo 7.768363e-07
## 183      Gentoo     Gentoo           47.2        4925  Gentoo 2.333968e-03
## 184      Gentoo     Gentoo           46.8        4850  Gentoo 5.022468e-03
## 185      Gentoo     Gentoo           50.4        5750  Gentoo 1.188643e-06
## 186      Gentoo     Gentoo           45.2        5200  Gentoo 1.742344e-03
##     .pred_Gentoo
## 1   6.571414e-04
## 2   1.208622e-03
## 3   6.536434e-05
## 4   1.693198e-05
## 5   4.023388e-04
## 6   2.034109e-01
## 7   7.274688e-05
## 8   4.815573e-01
## 9   1.428896e-06
## 10  9.119625e-05
## 11  8.772878e-05
## 12  1.217553e-03
## 13  5.665214e-05
## 14  5.490003e-04
## 15  5.500639e-05
## 16  6.013383e-06
## 17  6.475835e-03
## 18  3.648389e-05
## 19  4.275828e-04
## 20  5.015803e-05
## 21  6.305955e-03
## 22  6.139449e-06
## 23  8.946655e-03
## 24  1.760154e-03
## 25  1.255690e-05
## 26  2.167349e-06
## 27  5.865255e-03
## 28  1.094968e-06
## 29  6.120227e-01
## 30  1.200815e-06
## 31  1.749382e-01
## 32  7.959665e-02
## 33  1.927155e-04
## 34  4.904014e-06
## 35  3.546297e-02
## 36  1.027004e-07
## 37  1.710929e-04
## 38  2.690715e-03
## 39  2.202929e-04
## 40  1.209698e-06
## 41  8.478582e-05
## 42  2.984015e-07
## 43  2.555925e-02
## 44  2.885638e-01
## 45  4.270118e-06
## 46  2.639198e-03
## 47  2.649296e-04
## 48  5.258210e-01
## 49  1.773152e-03
## 50  4.275828e-04
## 51  2.222867e-05
## 52  2.796090e-02
## 53  7.459342e-07
## 54  8.162358e-01
## 55  1.571681e-04
## 56  6.246249e-04
## 57  1.387259e-05
## 58  9.141336e-04
## 59  1.174236e-04
## 60  2.693158e-05
## 61  1.309496e-03
## 62  1.543951e-05
## 63  8.567871e-02
## 64  4.525789e-06
## 65  7.002839e-02
## 66  2.306924e-04
## 67  6.708266e-02
## 68  3.701524e-08
## 69  1.081157e-01
## 70  2.824171e-05
## 71  3.224326e-06
## 72  6.112236e-03
## 73  1.435545e-06
## 74  4.565236e-04
## 75  5.111223e-04
## 76  9.438790e-01
## 77  3.070233e-05
## 78  2.454121e-03
## 79  6.699735e-02
## 80  2.039340e-06
## 81  4.321724e-06
## 82  1.741644e-06
## 83  4.825085e-05
## 84  2.170700e-04
## 85  7.731918e-03
## 86  4.445329e-07
## 87  2.568222e-05
## 88  1.114505e-01
## 89  1.099990e-01
## 90  2.837313e-05
## 91  2.465513e-03
## 92  2.503837e-05
## 93  2.028143e-02
## 94  4.006514e-05
## 95  7.288436e-03
## 96  1.318691e-06
## 97  6.103955e-03
## 98  1.532614e-05
## 99  2.397713e-02
## 100 4.640354e-08
## 101 1.796988e-03
## 102 3.233379e-04
## 103 1.677670e-02
## 104 1.845751e-05
## 105 1.016517e-05
## 106 9.275939e-01
## 107 9.841276e-01
## 108 9.999978e-01
## 109 9.999151e-01
## 110 9.593085e-01
## 111 9.811089e-01
## 112 9.994155e-01
## 113 4.682148e-01
## 114 9.992530e-01
## 115 4.292015e-01
## 116 9.999882e-01
## 117 9.555498e-01
## 118 9.999973e-01
## 119 6.038894e-01
## 120 6.497834e-02
## 121 9.999999e-01
## 122 9.007180e-01
## 123 9.677352e-01
## 124 9.982910e-01
## 125 9.911432e-01
## 126 8.050495e-01
## 127 9.951452e-01
## 128 9.998602e-01
## 129 9.460370e-01
## 130 9.977306e-01
## 131 9.999476e-01
## 132 9.999971e-01
## 133 9.987621e-01
## 134 9.999961e-01
## 135 9.635464e-01
## 136 9.990774e-01
## 137 9.941939e-01
## 138 7.657992e-01
## 139 9.947800e-01
## 140 4.014333e-01
## 141 9.999958e-01
## 142 9.999999e-01
## 143 9.999975e-01
## 144 9.999983e-01
## 145 8.730724e-01
## 146 9.999966e-01
## 147 9.974066e-01
## 148 9.993238e-01
## 149 9.827453e-01
## 150 9.999968e-01
## 151 9.853338e-01
## 152 9.999999e-01
## 153 9.702835e-01
## 154 9.960609e-01
## 155 9.999983e-01
## 156 9.928278e-01
## 157 9.999797e-01
## 158 9.632866e-01
## 159 9.999977e-01
## 160 9.999892e-01
## 161 9.974221e-01
## 162 9.999988e-01
## 163 9.983993e-01
## 164 9.999970e-01
## 165 9.759801e-01
## 166 9.999957e-01
## 167 9.999705e-01
## 168 9.999828e-01
## 169 9.985402e-01
## 170 9.999999e-01
## 171 9.983013e-01
## 172 9.999849e-01
## 173 9.999195e-01
## 174 6.493873e-01
## 175 9.999993e-01
## 176 7.284580e-01
## 177 9.999688e-01
## 178 9.998691e-01
## 179 8.334224e-01
## 180 9.999974e-01
## 181 1.000000e+00
## 182 9.999992e-01
## 183 9.976660e-01
## 184 9.949775e-01
## 185 9.999988e-01
## 186 9.982577e-01
lr_pred_prob_tr %>%
  roc_curve(species, c(.pred_Adelie)) %>%
  autoplot()

auc <- lr_pred_prob_tr %>%
  roc_auc(species, c(.pred_Adelie)) %>%
  .[1,3] %>%
  as.numeric()

train_r1 <- data.frame(AUC = auc, 
                       ACCURACY = accuracy, 
                       TPR = tpr, 
                       FPR = fpr, 
                       TNR = tnr, 
                       FNR = fnr) 
 
library(kableExtra)
train_r1 %>%
  kable(caption = 'Training')
Training
AUC ACCURACY TPR FPR TNR FNR
0.9945914 0.9569892 0.9619048 0.0493827 0.9506173 0.0380952
lr_test_pred <- lr_model %>% 
  #make prediction on testing data
  predict(df_test) %>%
  # rename the prediction column
  mutate(prediction = `.pred_class`) %>%
  # merge the prediction result back to training data set
  bind_cols(df_test)
lr_test_cm <- lr_test_pred %>% 
  #use only prediction values and actual label
  dplyr::select(prediction, species) %>%
  #construct confusion matrix
  table() %>%
  #display as matrix
  as.matrix()
lr_test_cm
##           species
## prediction Adelie Gentoo
##     Adelie     38      1
##     Gentoo      3     37
lr_test_pred %>%
  metrics(truth = species, estimate = prediction)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.949
## 2 kap      binary         0.899
tp<-lr_test_cm[1,1]
fp<-lr_test_cm[1,2]
fn<-lr_test_cm[2,1]
tn<-lr_test_cm[2,2]

accuracy <- (tp+tn)/(tp+tn+fp+fn) #accuracy=(TP+TN/P+N)
tpr <- tp/(tp+fn) #TPR=TP/(TP+FN)
fpr <- fp/(fp+tn) #FPR=FP/(FP+TN)
tnr <- tn/(tn+fp) #TNR=TN/(TN+FP)
fnr <- fn/(fn+tp) #FNR=FN/(FN+TP)

lr_pred_prob_te <- lr_test_pred %>%
  cbind(predict(lr_model, df_test, type='prob'))
lr_pred_prob_te %>%
  roc_curve(species, c(.pred_Adelie)) %>%
  autoplot()

auc <- lr_pred_prob_te %>%
  roc_auc(species, c(.pred_Adelie)) %>%
  .[1,3] %>%
  as.numeric()

test_r1 <- data.frame(AUC = auc, 
                       ACCURACY = accuracy, 
                       TPR = tpr, 
                       FPR = fpr, 
                       TNR = tnr, 
                       FNR = fnr)
test_r1 %>%
  kable(caption = 'Testing')
Testing
AUC ACCURACY TPR FPR TNR FNR
0.9839538 0.9493671 0.9268293 0.0263158 0.9736842 0.0731707

3. Multinomial Logistic Regression. (40)

unique(penguins[c("species")])
## # A tibble: 3 x 1
##   species  
##   <fct>    
## 1 Adelie   
## 2 Gentoo   
## 3 Chinstrap

a.

Please fit it a multinomial logistic regression where your outcome variable is ‘species’.

From the summary statistic, there are missing values in the dataset, being displayed as NA’s and should be removed. Species has categories Adelie, Chinstrap and Gentoo. When the dependent variable has more than two categories, then it is a multinomial logistic regression.

summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2
mnewdata <-na.omit(penguins)
summary(mnewdata)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

Non-numeric columns, Island and Sex were removed along with Year. I reordered the columns to have the binary dependent variable, species as the last column.

mnewdata1 <-mnewdata[, c(3,4,5,6,1)]

There are now four independent variables as indicated in the Descriptive statistics.

summary(mnewdata1)
##  bill_length_mm  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :32.10   Min.   :13.10   Min.   :172       Min.   :2700  
##  1st Qu.:39.50   1st Qu.:15.60   1st Qu.:190       1st Qu.:3550  
##  Median :44.50   Median :17.30   Median :197       Median :4050  
##  Mean   :43.99   Mean   :17.16   Mean   :201       Mean   :4207  
##  3rd Qu.:48.60   3rd Qu.:18.70   3rd Qu.:213       3rd Qu.:4775  
##  Max.   :59.60   Max.   :21.50   Max.   :231       Max.   :6300  
##       species   
##  Adelie   :146  
##  Chinstrap: 68  
##  Gentoo   :119  
##                 
##                 
## 

Changed data frame to matrix to handle the following error: Error in hist.default(newdata2[, i], main = colnames(newdata2)[i], xlab = colnames(newdata2)[i], : ‘x’ must be numeric

mnewdata2 <-as.matrix(sapply(mnewdata1, as.numeric))

Analyze the distribution of each independent variable

par(mfrow = c(2,2))
for( i in 1:4){
  hist(mnewdata2[,i], main = colnames(mnewdata2)[i],xlab=colnames(mnewdata2)[i], col = 'yellow')
}

For continuous independent variables, we can get more clarity on the distribution by analyzing it w.r.t. dependent variable.

par(mfrow = c(2,2))
boxplot(bill_length_mm~species, ylab="Bill Length (mm)", xlab= "Species", col="light blue",data = mnewdata1)
boxplot(bill_depth_mm~species, ylab="Bill Depth (mm)", xlab= "Species", col="light blue",data = mnewdata1)
boxplot(flipper_length_mm~species, ylab="Flipper Length (mm)", xlab= "Species", col="light blue",data = mnewdata1)
boxplot(body_mass_g~species, ylab="Body Mass (g)", xlab= "Species", col="light blue",data = mnewdata1)

New data frame of relevant modeling variables.

mnewdata4 <- mnewdata2[,c("species", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")]

#b. Please be sure to evaluate the independent variables appropriately to fit your best parsimonious model.

library(nnet)

Implementation of Logistic Regression to predict the binary outcome — species in the dataset “newdata4”.

multimodel<- multinom(species~.,data = mnewdata1)
## # weights:  18 (10 variable)
## initial  value 365.837892 
## iter  10 value 16.321214
## iter  20 value 3.754897
## iter  30 value 1.631859
## iter  40 value 0.012427
## iter  50 value 0.001125
## iter  60 value 0.001108
## iter  70 value 0.001006
## iter  80 value 0.000906
## iter  90 value 0.000498
## iter 100 value 0.000498
## final  value 0.000498 
## stopped after 100 iterations

c.

Please be sure to interpret your variables in the model.

Analysis of Model Summary

summary(multimodel)
## Call:
## multinom(formula = species ~ ., data = mnewdata1)
## 
## Coefficients:
##           (Intercept) bill_length_mm bill_depth_mm flipper_length_mm
## Chinstrap   -34.60273       58.94543     -84.81399         -2.643720
## Gentoo       -4.70502       43.75912     -91.60364         -1.639715
##            body_mass_g
## Chinstrap -0.132491128
## Gentoo     0.007448619
## 
## Std. Errors:
##            (Intercept) bill_length_mm bill_depth_mm flipper_length_mm
## Chinstrap 4.4008386318    73.52088725  51.084116136       15.71457598
## Gentoo    0.0003319148     0.01602115   0.006439479        0.06591798
##           body_mass_g
## Chinstrap   0.3479725
## Gentoo      1.4739115
## 
## Residual Deviance: 0.0009955954 
## AIC: 20.001

A one-unit increase in the variable body mass(g) is associated with the decrease in the log odds of being Chinstrap species vs. Adelie in the amount of .013. A one-unit increase in the variable body mass(g) is associated with the increase in the log odds of being Gentoo species vs. Adelie in the amount of .001. A one-unit increase in the variable flipper length(mm) is associated with the decrease in the log odds of being Chinstrap species vs. Adelie in the amount of 2.644. A one-unit increase in the variable body mass(g) is associated with the decrease in the log odds of being Gentoo species vs. Adelie in the amount of 1.640. A one-unit increase in the variable bill depth(mm) is associated with the decrease in the log odds of being Chinstrap species vs. Adelie in the amount of 84.814. A one-unit increase in the variable bill depth(mm) is associated with the decrease in the log odds of being Gentoo species vs. Adelie in the amount of 91.604. A one-unit increase in the variable bill length(mm) is associated with the increase in the log odds of being Chinstrap species vs. Adelie in the amount of 58.945. A one-unit increase in the variable bill length(mm) is associated with the increase in the log odds of being Gentoo species vs. Adelie in the amount of 43.759.

#4. Extra credit

What would be some of the fit statistics you would want to evaluate for your model in question #3? Feel free to share whatever you can provide. (10)

Naive Bayes computes the conditional a-posterior probabilities of a categorical class variable given independent predictor variables using the Bayes rule. Naive Bayes is a Supervised Machine Learning algorithm based on the Bayes Theorem that is used to solve classification problems by following a probabilistic approach.

library(naivebayes)
library(e1071)
nb_model <- naiveBayes(species ~ ., df_train)
nb_model
FALSE 
FALSE Naive Bayes Classifier for Discrete Predictors
FALSE 
FALSE Call:
FALSE naiveBayes.default(x = X, y = Y, laplace = laplace)
FALSE 
FALSE A-priori probabilities:
FALSE Y
FALSE    Adelie    Gentoo 
FALSE 0.5645161 0.4354839 
FALSE 
FALSE Conditional probabilities:
FALSE         bill_length_mm
FALSE Y            [,1]     [,2]
FALSE   Adelie 38.68762 2.698137
FALSE   Gentoo 47.68395 3.167430
FALSE 
FALSE         body_mass_g
FALSE Y            [,1]     [,2]
FALSE   Adelie 3681.667 446.5179
FALSE   Gentoo 5119.753 480.0019
nb_train_pred <- nb_model %>% 
  #make prediction on training data
  predict(df_train) %>%
  data.frame(prediction = .) %>%
  # merge the prediction result back to training data set
  bind_cols(df_train)
nb_train_pred
##     prediction bill_length_mm body_mass_g species
## 1       Adelie           39.1        3750  Adelie
## 2       Adelie           39.5        3800  Adelie
## 3       Adelie           40.3        3250  Adelie
## 4       Adelie           36.7        3450  Adelie
## 5       Adelie           39.3        3650  Adelie
## 6       Adelie           39.2        4675  Adelie
## 7       Adelie           38.7        3450  Adelie
## 8       Gentoo           42.5        4500  Adelie
## 9       Adelie           34.4        3325  Adelie
## 10      Adelie           37.7        3600  Adelie
## 11      Adelie           35.9        3800  Adelie
## 12      Adelie           38.2        3950  Adelie
## 13      Adelie           35.3        3800  Adelie
## 14      Adelie           40.6        3550  Adelie
## 15      Adelie           40.5        3200  Adelie
## 16      Adelie           37.9        3150  Adelie
## 17      Adelie           40.5        3950  Adelie
## 18      Adelie           39.5        3250  Adelie
## 19      Adelie           37.2        3900  Adelie
## 20      Adelie           39.5        3300  Adelie
## 21      Adelie           40.9        3900  Adelie
## 22      Adelie           36.4        3325  Adelie
## 23      Adelie           39.2        4150  Adelie
## 24      Adelie           42.2        3550  Adelie
## 25      Adelie           37.6        3300  Adelie
## 26      Adelie           36.5        3150  Adelie
## 27      Adelie           40.8        3900  Adelie
## 28      Adelie           36.0        3100  Adelie
## 29      Gentoo           44.1        4400  Adelie
## 30      Adelie           37.0        3000  Adelie
## 31      Adelie           39.6        4600  Adelie
## 32      Adelie           42.3        4150  Adelie
## 33      Adelie           39.6        3500  Adelie
## 34      Adelie           35.0        3450  Adelie
## 35      Adelie           42.0        4050  Adelie
## 36      Adelie           34.5        2900  Adelie
## 37      Adelie           39.0        3550  Adelie
## 38      Adelie           40.6        3800  Adelie
## 39      Adelie           37.6        3750  Adelie
## 40      Adelie           35.7        3150  Adelie
## 41      Adelie           37.6        3600  Adelie
## 42      Adelie           36.4        2850  Adelie
## 43      Adelie           41.1        4100  Adelie
## 44      Adelie           41.8        4450  Adelie
## 45      Adelie           33.5        3600  Adelie
## 46      Adelie           39.7        3900  Adelie
## 47      Adelie           39.6        3550  Adelie
## 48      Gentoo           45.8        4150  Adelie
## 49      Adelie           40.9        3700  Adelie
## 50      Adelie           37.2        3900  Adelie
## 51      Adelie           36.2        3550  Adelie
## 52      Adelie           42.1        4000  Adelie
## 53      Adelie           34.6        3200  Adelie
## 54      Gentoo           42.9        4700  Adelie
## 55      Adelie           36.7        3800  Adelie
## 56      Adelie           35.1        4200  Adelie
## 57      Adelie           37.3        3350  Adelie
## 58      Adelie           41.3        3550  Adelie
## 59      Adelie           36.3        3800  Adelie
## 60      Adelie           36.9        3500  Adelie
## 61      Adelie           38.3        3950  Adelie
## 62      Adelie           35.7        3550  Adelie
## 63      Adelie           41.1        4300  Adelie
## 64      Adelie           36.2        3300  Adelie
## 65      Adelie           40.8        4300  Adelie
## 66      Adelie           38.1        3700  Adelie
## 67      Adelie           40.3        4350  Adelie
## 68      Adelie           33.1        2900  Adelie
## 69      Adelie           43.2        4100  Adelie
## 70      Adelie           35.0        3725  Adelie
## 71      Adelie           37.7        3075  Adelie
## 72      Adelie           37.8        4250  Adelie
## 73      Adelie           37.9        2925  Adelie
## 74      Adelie           38.6        3750  Adelie
## 75      Adelie           38.1        3825  Adelie
## 76      Gentoo           45.6        4600  Adelie
## 77      Adelie           39.7        3200  Adelie
## 78      Adelie           39.6        3900  Adelie
## 79      Adelie           42.7        4075  Adelie
## 80      Adelie           38.6        2900  Adelie
## 81      Adelie           35.7        3350  Adelie
## 82      Adelie           36.2        3150  Adelie
## 83      Adelie           37.7        3500  Adelie
## 84      Adelie           40.2        3450  Adelie
## 85      Adelie           41.4        3875  Adelie
## 86      Adelie           35.2        3050  Adelie
## 87      Adelie           38.8        3275  Adelie
## 88      Adelie           41.5        4300  Adelie
## 89      Adelie           44.1        4000  Adelie
## 90      Adelie           38.5        3325  Adelie
## 91      Adelie           43.1        3500  Adelie
## 92      Adelie           36.8        3500  Adelie
## 93      Adelie           37.5        4475  Adelie
## 94      Adelie           38.1        3425  Adelie
## 95      Adelie           41.1        3900  Adelie
## 96      Adelie           35.6        3175  Adelie
## 97      Adelie           40.2        3975  Adelie
## 98      Adelie           37.0        3400  Adelie
## 99      Adelie           39.7        4250  Adelie
## 100     Adelie           32.1        3050  Adelie
## 101     Adelie           40.7        3725  Adelie
## 102     Adelie           39.0        3650  Adelie
## 103     Adelie           39.2        4250  Adelie
## 104     Adelie           36.6        3475  Adelie
## 105     Adelie           36.0        3450  Adelie
## 106     Gentoo           46.1        4500  Gentoo
## 107     Gentoo           48.7        4450  Gentoo
## 108     Gentoo           50.0        5700  Gentoo
## 109     Gentoo           47.6        5400  Gentoo
## 110     Gentoo           46.5        4550  Gentoo
## 111     Gentoo           45.4        4800  Gentoo
## 112     Gentoo           46.7        5200  Gentoo
## 113     Gentoo           43.3        4400  Gentoo
## 114     Gentoo           46.8        5150  Gentoo
## 115     Adelie           40.9        4650  Gentoo
## 116     Gentoo           49.0        5550  Gentoo
## 117     Gentoo           45.5        4650  Gentoo
## 118     Gentoo           48.4        5850  Gentoo
## 119     Gentoo           45.8        4200  Gentoo
## 120     Adelie           42.0        4150  Gentoo
## 121     Gentoo           49.2        6300  Gentoo
## 122     Gentoo           46.5        4400  Gentoo
## 123     Gentoo           42.9        5000  Gentoo
## 124     Gentoo           46.1        5100  Gentoo
## 125     Gentoo           48.2        4600  Gentoo
## 126     Gentoo           42.8        4700  Gentoo
## 127     Gentoo           45.1        5050  Gentoo
## 128     Gentoo           49.1        5150  Gentoo
## 129     Gentoo           42.6        4950  Gentoo
## 130     Gentoo           44.4        5250  Gentoo
## 131     Gentoo           48.7        5350  Gentoo
## 132     Gentoo           49.6        5700  Gentoo
## 133     Gentoo           49.6        4750  Gentoo
## 134     Gentoo           50.5        5550  Gentoo
## 135     Gentoo           43.6        4900  Gentoo
## 136     Gentoo           45.2        5300  Gentoo
## 137     Gentoo           46.6        4850  Gentoo
## 138     Gentoo           45.1        4400  Gentoo
## 139     Gentoo           45.0        5050  Gentoo
## 140     Gentoo           43.8        4300  Gentoo
## 141     Gentoo           50.4        5550  Gentoo
## 142     Gentoo           54.3        5650  Gentoo
## 143     Gentoo           49.8        5700  Gentoo
## 144     Gentoo           49.5        5800  Gentoo
## 145     Gentoo           43.5        4700  Gentoo
## 146     Gentoo           50.7        5550  Gentoo
## 147     Gentoo           46.4        5000  Gentoo
## 148     Gentoo           46.5        5200  Gentoo
## 149     Gentoo           46.4        4700  Gentoo
## 150     Gentoo           48.6        5800  Gentoo
## 151     Gentoo           47.5        4600  Gentoo
## 152     Gentoo           51.1        6000  Gentoo
## 153     Gentoo           45.2        4750  Gentoo
## 154     Gentoo           49.1        4625  Gentoo
## 155     Gentoo           52.5        5450  Gentoo
## 156     Gentoo           47.4        4725  Gentoo
## 157     Gentoo           50.0        5350  Gentoo
## 158     Gentoo           44.9        4750  Gentoo
## 159     Gentoo           50.8        5600  Gentoo
## 160     Gentoo           51.3        5300  Gentoo
## 161     Gentoo           47.5        4875  Gentoo
## 162     Gentoo           52.1        5550  Gentoo
## 163     Gentoo           47.5        4950  Gentoo
## 164     Gentoo           52.2        5400  Gentoo
## 165     Gentoo           45.5        4750  Gentoo
## 166     Gentoo           49.5        5650  Gentoo
## 167     Gentoo           50.8        5200  Gentoo
## 168     Gentoo           51.1        5250  Gentoo
## 169     Gentoo           48.5        4850  Gentoo
## 170     Gentoo           55.9        5600  Gentoo
## 171     Gentoo           47.2        4975  Gentoo
## 172     Gentoo           49.1        5500  Gentoo
## 173     Gentoo           46.8        5500  Gentoo
## 174     Gentoo           41.7        4700  Gentoo
## 175     Gentoo           53.4        5500  Gentoo
## 176     Gentoo           43.3        4575  Gentoo
## 177     Gentoo           48.1        5500  Gentoo
## 178     Gentoo           50.5        5000  Gentoo
## 179     Gentoo           43.5        4650  Gentoo
## 180     Gentoo           51.5        5500  Gentoo
## 181     Gentoo           55.1        5850  Gentoo
## 182     Gentoo           48.8        6000  Gentoo
## 183     Gentoo           47.2        4925  Gentoo
## 184     Gentoo           46.8        4850  Gentoo
## 185     Gentoo           50.4        5750  Gentoo
## 186     Gentoo           45.2        5200  Gentoo
nb_train_cm <- nb_train_pred %>% 
  #use only prediction values and actual label
  dplyr::select(prediction, species) %>%
  #construct confusion matrix
  table() %>%
  #display as matrix
  as.matrix()
nb_train_cm
##           species
## prediction Adelie Gentoo
##     Adelie    100      2
##     Gentoo      5     79
nb_train_pred %>%
  metrics(truth = species, estimate = prediction)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.962
## 2 kap      binary         0.924

kNN k-nearest neighbour classification for test set from training set. For each row of the test set, the k nearest (in Euclidean distance) training set vectors are found, and the classification is decided by majority vote, with ties broken at random. If there are ties for the kth nearest vector, all candidates are included in the vote.

# kNN model with k=3
knn3_model <- nearest_neighbor(neighbors = 3) %>%
  set_mode('classification') %>%
  set_engine('kknn') %>%
  fit(species ~ ., df_train)

knn3_model
## parsnip model object
## 
## Fit time:  46ms 
## 
## Call:
## kknn::train.kknn(formula = species ~ ., data = data, ks = min_rows(3,     data, 5))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.06451613
## Best kernel: optimal
## Best k: 3
knn3_train_pred <- knn3_model %>% 
  #make prediction on training data
  predict(df_train) %>%
  rename(prediction = `.pred_class`) %>%
  # merge the prediction result back to training data set
  bind_cols(df_train)
knn3_train_cm <- knn3_train_pred %>% 
  #use only prediction values and actual label
  dplyr::select(prediction, species) %>%
  #construct confusion matrix
  table() %>%
  #display as matrix
  as.matrix()
knn3_train_cm
##           species
## prediction Adelie Gentoo
##     Adelie    105      0
##     Gentoo      0     81
knn3_train_pred %>%
  metrics(truth = species, estimate = prediction)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary             1
## 2 kap      binary             1