library(tidyverse)
library(caret)
library(palmerpenguins)
library(e1071)
library(caTools)
library(ggplot2)
library(GGally)
library(MASS)
library(mvtnorm)
library(tidyr)
library(dplyr)

I chose to evaluate the following within the data frame: species, bill_length_mm, bill_depth_mm, flipper_length_mm, and body_mass_g.
penguin_measurements <- penguins %>% drop_na() %>% dplyr::select(species, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
head(penguin_measurements %>% as.data.frame())
##   species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 1  Adelie           39.1          18.7               181        3750
## 2  Adelie           39.5          17.4               186        3800
## 3  Adelie           40.3          18.0               195        3250
## 4  Adelie           36.7          19.3               193        3450
## 5  Adelie           39.3          20.6               190        3650
## 6  Adelie           38.9          17.8               181        3625
glimpse(penguin_measurements)
## Rows: 333
## Columns: 5
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel~
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6~
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2~
## $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18~
## $ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800~
summary(penguin_measurements)
## species bill_length_mm bill_depth_mm flipper_length_mm
## Adelie :146 Min. :32.10 Min. :13.10 Min. :172
## Chinstrap: 68 1st Qu.:39.50 1st Qu.:15.60 1st Qu.:190
## Gentoo :119 Median :44.50 Median :17.30 Median :197
## Mean :43.99 Mean :17.16 Mean :201
## 3rd Qu.:48.60 3rd Qu.:18.70 3rd Qu.:213
## Max. :59.60 Max. :21.50 Max. :231
## body_mass_g
## Min. :2700
## 1st Qu.:3550
## Median :4050
## Mean :4207
## 3rd Qu.:4775
## Max. :6300
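As an optional exploratory step (a sketch, not part of the original output), the already-loaded GGally package can display all pairwise relationships among the four measurements, colored by species:

# Pairwise scatterplots, densities and correlations by species (assumed sketch)
ggpairs(penguin_measurements,
        mapping = aes(color = species, alpha = 0.6),
        columns = 2:5)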
ggplot(data = penguin_measurements,
aes(x = flipper_length_mm,
y = body_mass_g)) +
geom_point(aes(color = species,
shape = species),
size = 3,
alpha = 0.8) +
scale_color_manual(values = c("darkorange","purple","cyan4")) +
labs(title = "Penguin size, Palmer Station LTER",
subtitle = "Flipper length and body mass for Adelie, Chinstrap and Gentoo Penguins",
x = "Flipper length (mm)",
y = "Body mass (g)",
color = "Penguin species",
shape = "Penguin species") +
theme_minimal()

# Split the data into training (80%) and test set (20%)
set.seed(123)
training.individuals <- penguin_measurements$species %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- penguin_measurements[training.individuals, ]
test.data <- penguin_measurements[-training.individuals, ]
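A quick optional check (assumed, not in the original output): createDataPartition() stratifies on the species factor, so the class proportions in the training and test sets should be close to those of the full data.

# Class proportions in the two partitions (assumed sanity check)
prop.table(table(train.data$species))
prop.table(table(test.data$species))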
# Estimate preprocessing parameters
preproc.parameter <- train.data %>% preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transform <- preproc.parameter %>% predict(train.data)
test.transform <- preproc.parameter %>% predict(test.data)
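Another optional sanity check (a sketch, not part of the original output): after centering and scaling with the training-set parameters, each numeric predictor in the training data should have mean roughly 0 and standard deviation roughly 1.

# Means and standard deviations of the standardized predictors (assumed check)
round(colMeans(train.transform[, 2:5]), 3)
round(sapply(train.transform[, 2:5], sd), 3)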
# Fit the model
model <- lda(species~., data = train.transform)
# Make predictions
predictions <- model %>% predict(test.transform)
# Model accuracy
mean(predictions$class == test.transform$species)
## [1] 0.9692308
model
## Call:
## lda(species ~ ., data = train.transform)
##
## Prior probabilities of groups:
##    Adelie Chinstrap    Gentoo
## 0.4365672 0.2052239 0.3582090
##
## Group means:
##           bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Adelie        -0.9555507     0.6009140        -0.7707877  -0.6092842
## Chinstrap      0.9088301     0.6564651        -0.3716006  -0.5901947
## Gentoo         0.6438935    -1.1084638         1.1522937   1.0806975
##
## Coefficients of linear discriminants:
##                          LD1          LD2
## bill_length_mm     0.5415983 -2.408094172
## bill_depth_mm     -2.1000729  0.004472972
## flipper_length_mm  1.0940111  0.404513143
## body_mass_g        1.0280418  1.274291326
##
## Proportion of trace:
##    LD1    LD2
## 0.8462 0.1538
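To see the separation implied by the two discriminants, one possible visualization (assumed here, not part of the original output) projects the standardized training data onto LD1 and LD2:

# Project the training data onto the linear discriminants (assumed sketch)
lda_scores <- data.frame(predict(model, train.transform)$x,
                         species = train.transform$species)
ggplot(lda_scores, aes(x = LD1, y = LD2, color = species)) +
  geom_point(alpha = 0.8) +
  theme_minimal()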
# Graphical illustration of class separation with simulated bivariate Gaussian data
# Variance Covariance matrix for random bivariate gaussian sample
var_covar = matrix(data = c(1.5, 0.4, 0.4, 1.5), nrow = 2)
# Random bivariate Gaussian samples for class +1
Xplus1 <- rmvnorm(400, mean = c(5, 5), sigma = var_covar)
# Random bivariate Gaussian samples for class -1
Xminus1 <- rmvnorm(600, mean = c(3, 3), sigma = var_covar)
# Samples for the dependent variable
Y_samples <- c(rep(1, 400), rep(-1, 600))
# Combining the independent and dependent variables into a dataframe
dataset <- as.data.frame(cbind(rbind(Xplus1, Xminus1), Y_samples))
colnames(dataset) <- c("X1", "X2", "Y")
dataset$Y <- as.character(dataset$Y)
# Plot the above samples and color by class labels
ggplot(data = dataset) + geom_point(aes(X1, X2, color = Y))
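To connect this simulation back to LDA, a possible extension (not part of the original code) fits an LDA to the simulated samples and overlays the linear decision boundary, drawn where the posterior probability of class +1 equals 0.5:

# Fit LDA to the simulated data and draw the boundary over a prediction grid (assumed sketch)
synth_lda <- lda(Y ~ X1 + X2, data = dataset)
grid <- expand.grid(X1 = seq(min(dataset$X1), max(dataset$X1), length.out = 200),
                    X2 = seq(min(dataset$X2), max(dataset$X2), length.out = 200))
grid$post_plus1 <- predict(synth_lda, grid)$posterior[, "1"]
ggplot(dataset) +
  geom_point(aes(X1, X2, color = Y)) +
  geom_contour(data = grid, aes(X1, X2, z = post_plus1),
               breaks = 0.5, colour = "black")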
# Cross-validated LDA on the penguin data with caret
lda_penguins <- train(species ~ ., data = penguin_measurements, method = "lda",
                      trControl = trainControl(method = "cv", number = 10))
lda_penguins
## Linear Discriminant Analysis
##
## 333 samples
## 4 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 301, 299, 300, 299, 299, 299, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9881462 0.9812566
lda_penguins$results
##   parameter  Accuracy     Kappa AccuracySD    KappaSD
## 1 none 0.9881462 0.9812566 0.02065161 0.03283826
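For a view of where the cross-validated errors occur, caret can also report the averaged resampled confusion matrix (a sketch, not part of the original output; cell values are percentages of the data):

# Cross-validated confusion matrix for the caret LDA model (assumed sketch)
confusionMatrix(lda_penguins)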
# Split the data into training (80%) and test set (20%)
set.seed(1)
qtraining.individuals <- penguin_measurements$species %>% createDataPartition(p = 0.8, list = FALSE)
qtrain.data <- penguin_measurements[qtraining.individuals, ]
qtest.data <- penguin_measurements[-qtraining.individuals, ]
# Estimate preprocessing parameters
qpreproc.parameter <- qtrain.data %>% preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
qtrain.transform <- qpreproc.parameter %>% predict(qtrain.data)
qtest.transform <- qpreproc.parameter %>% predict(qtest.data)
# Fit the model
qmodel <- qda(species~., data = qtrain.transform)
# Make predictions
qpredictions <- qmodel %>% predict(qtest.transform)
# Model accuracy
mean(qpredictions$class == qtest.transform$species)
## [1] 0.9846154
qmodel
## Call:
## qda(species ~ ., data = qtrain.transform)
##
## Prior probabilities of groups:
## Adelie Chinstrap Gentoo
## 0.4365672 0.2052239 0.3582090
##
## Group means:
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Adelie -0.9497803 0.5740255 -0.7917055 -0.6342784
## Chinstrap 0.8764768 0.6775769 -0.3401823 -0.5738502
## Gentoo 0.6553966 -1.0877887 1.1597872 1.1017952
# Cross-validated QDA on the penguin data with caret
qda_penguins <- train(species ~ ., data = penguin_measurements, method = "qda",
                      trControl = trainControl(method = "cv", number = 10))
qda_penguins
## Quadratic Discriminant Analysis
##
## 333 samples
## 4 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 301, 299, 299, 299, 300, 299, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9879623 0.9811449
qda_penguins$results
##   parameter  Accuracy     Kappa AccuracySD    KappaSD
## 1 none 0.9879623 0.9811449 0.01554881 0.02435484
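Since both caret models were fit with 10-fold cross-validation, their resampled accuracies can be compared side by side. This is a sketch, not part of the original analysis; for a strictly fair comparison the two models should be fit on identical folds (for example, by supplying the same index to trainControl).

# Side-by-side comparison of the resampled LDA and QDA results (assumed sketch)
resamps <- resamples(list(LDA = lda_penguins, QDA = qda_penguins))
summary(resamps)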
# Splitting data into train and test data
split <- sample.split(penguin_measurements$species, SplitRatio = 0.8)
train_cl <- subset(penguin_measurements%>% as.data.frame(), split == "TRUE")
test_cl <- subset(penguin_measurements%>% as.data.frame(), split == "FALSE")
# Feature Scaling
train_scale <- scale(train_cl[, 2:5])
test_scale <- scale(test_cl[, 2:5])
# Fitting Naive Bayes Model to training dataset
set.seed(120) # Setting Seed
classifier_cl <- naiveBayes(species ~ ., data = train_cl)
classifier_cl
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Adelie Chinstrap Gentoo
## 0.4377510 0.2048193 0.3574297
##
## Conditional probabilities:
## bill_length_mm
## Y [,1] [,2]
## Adelie 39.22294 2.626173
## Chinstrap 48.38431 3.352514
## Gentoo 47.56742 3.245198
##
## bill_depth_mm
## Y [,1] [,2]
## Adelie 18.47064 1.122027
## Chinstrap 18.22353 1.101742
## Gentoo 15.02360 1.039780
##
## flipper_length_mm
## Y [,1] [,2]
## Adelie 190.8349 6.583528
## Chinstrap 194.8627 7.345800
## Gentoo 217.7753 6.661888
##
## body_mass_g
## Y [,1] [,2]
## Adelie 3742.431 475.3350
## Chinstrap 3677.451 389.4308
## Gentoo 5122.472 527.9819
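To make the independence assumption concrete, the following sketch (assumed, not part of the original analysis) reproduces a posterior for the first test penguin by hand: the class prior is multiplied by one Gaussian density per feature, using the means and standard deviations tabled above, and the products are normalized.

# Manual Gaussian Naive Bayes posterior for one test observation (assumed sketch)
x <- test_cl[1, 2:5]
priors <- classifier_cl$apriori / sum(classifier_cl$apriori)
unnorm <- sapply(names(priors), function(cl) {
  # one dnorm() per feature, evaluated at this penguin's measurements
  dens <- mapply(function(feature, tab) dnorm(x[[feature]], tab[cl, 1], tab[cl, 2]),
                 names(classifier_cl$tables), classifier_cl$tables)
  as.numeric(priors[cl]) * prod(dens)
})
round(unnorm / sum(unnorm), 4)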
# Predicting on the test data
y_pred <- predict(classifier_cl, newdata = test_cl)
# Confusion matrix and accuracy statistics
cm <- table(test_cl$species, y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## y_pred
## Adelie Chinstrap Gentoo
## Adelie 37 0 0
## Chinstrap 0 17 0
## Gentoo 0 0 30
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.957, 1)
## No Information Rate : 0.4405
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.4405 0.2024 0.3571
## Detection Rate 0.4405 0.2024 0.3571
## Detection Prevalence 0.4405 0.2024 0.3571
## Balanced Accuracy 1.0000 1.0000 1.0000
Comment on the model fits, strengths, weaknesses, and accuracy for the three models used in this analysis.
Generative methods learn the true input distribution (the joint probability), can generate data, and use Bayes' rule to obtain the posterior for classification. LDA is a strong method in general. It assumes that all classes share a common covariance matrix and uses the class means and variances to construct a linear boundary between them; this boundary is determined by the coefficients. LDA estimates the group means and computes, for each individual, the probability of belonging to each group; the individual is then assigned to the group with the highest probability. In the output above, the prior probabilities of groups are the proportions of training observations in each group, the group means are the group centers of gravity (the mean of each variable in each group), and the coefficients of linear discriminants give the linear combination of predictor variables that forms the LDA decision rule. On the held-out test set this model reached an accuracy of about 96.9%, and its 10-fold cross-validated accuracy was about 98.8%.
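The test-set posterior probabilities behind those predictions can be inspected directly (a quick look, assumed rather than part of the original output); each test penguin is assigned to the class with the largest posterior:

# Posterior probabilities for the first few test observations (assumed sketch)
head(round(predictions$posterior, 3))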
QDA is also a strong method, but it has no real advantage over LDA unless the class covariances really are different, and because it requires estimating many more parameters it can run into rank-deficiency and related problems when the model is constructed. QDA is slightly more flexible than LDA in that it does not assume equality of the variance/covariance matrices; in other words, the covariance matrix may differ for each class. QDA is recommended when the training set is very large, so that the variance of the classifier is not a major issue, or when the assumption of a common covariance matrix for the K classes is clearly untenable. On the held-out test set this model reached an accuracy of about 98.5%, and its 10-fold cross-validated accuracy was about 98.8%.
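A rough way to probe the common-covariance assumption (a sketch, not from the original analysis) is to compare the per-species covariance matrices of the standardized predictors; clearly different matrices would favor QDA over LDA.

# Per-species covariance matrices of the standardized training predictors (assumed sketch)
by(train.transform[, 2:5], train.transform$species, function(d) round(cov(d), 2))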
Naive Bayes places a stronger assumption on the features than LDA: it assumes the features are conditionally independent within each class, i.e., that the presence (or value) of one feature in a class is completely unrelated to all other features, whereas LDA does not require independence. In reality the features may well be dependent in some way, but when the independence assumption roughly holds, Naive Bayes performs extremely well and often better than other models. Naive Bayes can be used with continuous features, as here, but it is especially well suited to categorical variables; if all of the input features are categorical, Naive Bayes is a natural choice. Given its 100% accuracy on the held-out test set, Naive Bayes is the recommended model here.
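The within-species correlations can be inspected directly (a sketch, not part of the original output); flipper length and body mass, for example, remain positively correlated within each species, so the independence assumption is only an approximation for these data.

# Within-species correlation matrices of the four predictors (assumed sketch)
by(penguin_measurements[, 2:5], penguin_measurements$species,
   function(d) round(cor(d), 2))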