library(tidyverse)
library(caret)
library(palmerpenguins)
library(e1071)
library(caTools)
library(ggplot2)
library(GGally)
library(MASS)
library(mvtnorm)
library(tidyr)
library(dplyr)

I chose to evaluate the following within the data frame: species, bill_length_mm, bill_depth_mm, flipper_length_mm, and body_mass_g.
penguin_measurements <- penguins %>% drop_na() %>% dplyr::select(species, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
head(penguin_measurements %>% as.data.frame())
##   species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 1  Adelie           39.1          18.7               181        3750
## 2  Adelie           39.5          17.4               186        3800
## 3  Adelie           40.3          18.0               195        3250
## 4  Adelie           36.7          19.3               193        3450
## 5  Adelie           39.3          20.6               190        3650
## 6  Adelie           38.9          17.8               181        3625
glimpse(penguin_measurements)
## Rows: 333
## Columns: 5
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel~
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6~
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2~
## $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18~
## $ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800~
summary(penguin_measurements)
## species bill_length_mm bill_depth_mm flipper_length_mm
## Adelie :146 Min. :32.10 Min. :13.10 Min. :172
## Chinstrap: 68 1st Qu.:39.50 1st Qu.:15.60 1st Qu.:190
## Gentoo :119 Median :44.50 Median :17.30 Median :197
## Mean :43.99 Mean :17.16 Mean :201
## 3rd Qu.:48.60 3rd Qu.:18.70 3rd Qu.:213
## Max. :59.60 Max. :21.50 Max. :231
## body_mass_g
## Min. :2700
## 1st Qu.:3550
## Median :4050
## Mean :4207
## 3rd Qu.:4775
## Max. :6300
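As an optional exploratory step (a sketch, not part of the original output), the already-loaded GGally package can display all pairwise relationships among the four measurements, colored by species:

# Pairwise scatterplots, densities and correlations by species (assumed sketch)
ggpairs(penguin_measurements,
        mapping = aes(color = species, alpha = 0.6),
        columns = 2:5)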
ggplot(data = penguin_measurements,
aes(x = flipper_length_mm,
y = body_mass_g)) +
geom_point(aes(color = species,
shape = species),
size = 3,
alpha = 0.8) +
scale_color_manual(values = c("darkorange","purple","cyan4")) +
labs(title = "Penguin size, Palmer Station LTER",
subtitle = "Flipper length and body mass for Adelie, Chinstrap and Gentoo Penguins",
x = "Flipper length (mm)",
y = "Body mass (g)",
color = "Penguin species",
shape = "Penguin species") +
theme_minimal()

# Split the data into training (80%) and test set (20%)
set.seed(123)
training.individuals <- penguin_measurements$species %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- penguin_measurements[training.individuals, ]
test.data <- penguin_measurements[-training.individuals, ]
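A quick optional check (assumed, not in the original output): createDataPartition() stratifies on the species factor, so the class proportions in the training and test sets should be close to those of the full data.

# Class proportions in the two partitions (assumed sanity check)
prop.table(table(train.data$species))
prop.table(table(test.data$species))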
# Estimate preprocessing parameters
preproc.parameter <- train.data %>% preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transform <- preproc.parameter %>% predict(train.data)
test.transform <- preproc.parameter %>% predict(test.data)
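Another optional sanity check (a sketch, not part of the original output): after centering and scaling with the training-set parameters, each numeric predictor in the training data should have mean roughly 0 and standard deviation roughly 1.

# Means and standard deviations of the standardized predictors (assumed check)
round(colMeans(train.transform[, 2:5]), 3)
round(sapply(train.transform[, 2:5], sd), 3)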
# Fit the model
model <- lda(species~., data = train.transform)
# Make predictions
predictions <- model %>% predict(test.transform)
# Model accuracy
mean(predictions$class == test.transform$species)
## [1] 0.9692308
model
## Call:
## lda(species ~ ., data = train.transform)
##
## Prior probabilities of groups:
##    Adelie Chinstrap    Gentoo
## 0.4365672 0.2052239 0.3582090
##
## Group means:
##           bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Adelie        -0.9555507     0.6009140        -0.7707877  -0.6092842
## Chinstrap      0.9088301     0.6564651        -0.3716006  -0.5901947
## Gentoo         0.6438935    -1.1084638         1.1522937   1.0806975
##
## Coefficients of linear discriminants:
##                          LD1          LD2
## bill_length_mm     0.5415983 -2.408094172
## bill_depth_mm     -2.1000729  0.004472972
## flipper_length_mm  1.0940111  0.404513143
## body_mass_g        1.0280418  1.274291326
##
## Proportion of trace:
##    LD1    LD2
## 0.8462 0.1538
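To see the separation implied by the two discriminants, one possible visualization (assumed here, not part of the original output) projects the standardized training data onto LD1 and LD2:

# Project the training data onto the linear discriminants (assumed sketch)
lda_scores <- data.frame(predict(model, train.transform)$x,
                         species = train.transform$species)
ggplot(lda_scores, aes(x = LD1, y = LD2, color = species)) +
  geom_point(alpha = 0.8) +
  theme_minimal()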
# Graphical illustration of class separation with simulated bivariate Gaussian data
# Variance Covariance matrix for random bivariate gaussian sample
var_covar = matrix(data = c(1.5, 0.4, 0.4, 1.5), nrow = 2)
# Random bivariate Gaussian samples for class +1
Xplus1 <- rmvnorm(400, mean = c(5, 5), sigma = var_covar)
# Random bivariate Gaussian samples for class -1
Xminus1 <- rmvnorm(600, mean = c(3, 3), sigma = var_covar)
# Samples for the dependent variable
Y_samples <- c(rep(1, 400), rep(-1, 600))
# Combining the independent and dependent variables into a dataframe
dataset <- as.data.frame(cbind(rbind(Xplus1, Xminus1), Y_samples))
colnames(dataset) <- c("X1", "X2", "Y")
dataset$Y <- as.character(dataset$Y)
# Plot the above samples and color by class labels
ggplot(data = dataset) + geom_point(aes(X1, X2, color = Y))
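To connect this simulation back to LDA, a possible extension (not part of the original code) fits an LDA to the simulated samples and overlays the linear decision boundary, drawn where the posterior probability of class +1 equals 0.5:

# Fit LDA to the simulated data and draw the boundary over a prediction grid (assumed sketch)
synth_lda <- lda(Y ~ X1 + X2, data = dataset)
grid <- expand.grid(X1 = seq(min(dataset$X1), max(dataset$X1), length.out = 200),
                    X2 = seq(min(dataset$X2), max(dataset$X2), length.out = 200))
grid$post_plus1 <- predict(synth_lda, grid)$posterior[, "1"]
ggplot(dataset) +
  geom_point(aes(X1, X2, color = Y)) +
  geom_contour(data = grid, aes(X1, X2, z = post_plus1),
               breaks = 0.5, colour = "black")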
# Cross-validated LDA on the penguin data with caret
lda_penguins <- train(species ~ ., data = penguin_measurements, method = "lda",
                      trControl = trainControl(method = "cv", number = 10))
lda_penguins
## Linear Discriminant Analysis
##
## 333 samples
## 4 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 301, 299, 300, 299, 299, 299, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9881462 0.9812566
lda_penguins$results
##   parameter  Accuracy     Kappa AccuracySD    KappaSD
## 1 none 0.9881462 0.9812566 0.02065161 0.03283826
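For a view of where the cross-validated errors occur, caret can also report the averaged resampled confusion matrix (a sketch, not part of the original output; cell values are percentages of the data):

# Cross-validated confusion matrix for the caret LDA model (assumed sketch)
confusionMatrix(lda_penguins)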
# Split the data into training (80%) and test set (20%)
set.seed(1)
qtraining.individuals <- penguin_measurements$species %>% createDataPartition(p = 0.8, list = FALSE)
qtrain.data <- penguin_measurements[qtraining.individuals, ]
qtest.data <- penguin_measurements[-qtraining.individuals, ]
# Estimate preprocessing parameters
qpreproc.parameter <- qtrain.data %>% preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
qtrain.transform <- qpreproc.parameter %>% predict(qtrain.data)
qtest.transform <- qpreproc.parameter %>% predict(qtest.data)
# Fit the model
qmodel <- qda(species~., data = qtrain.transform)
# Make predictions
qpredictions <- qmodel %>% predict(qtest.transform)
# Model accuracy
mean(qpredictions$class == qtest.transform$species)
## [1] 0.9846154
qmodel
## Call:
## qda(species ~ ., data = qtrain.transform)
##
## Prior probabilities of groups:
## Adelie Chinstrap Gentoo
## 0.4365672 0.2052239 0.3582090
##
## Group means:
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Adelie -0.9497803 0.5740255 -0.7917055 -0.6342784
## Chinstrap 0.8764768 0.6775769 -0.3401823 -0.5738502
## Gentoo 0.6553966 -1.0877887 1.1597872 1.1017952
# Cross-validated QDA on the penguin data with caret
qda_penguins <- train(species ~ ., data = penguin_measurements, method = "qda",
                      trControl = trainControl(method = "cv", number = 10))
qda_penguins
## Quadratic Discriminant Analysis
##
## 333 samples
## 4 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 301, 299, 299, 299, 300, 299, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9879623 0.9811449
qda_penguins$results
##   parameter  Accuracy     Kappa AccuracySD    KappaSD
## 1 none 0.9879623 0.9811449 0.01554881 0.02435484
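Since both caret models were fit with 10-fold cross-validation, their resampled accuracies can be compared side by side. This is a sketch, not part of the original analysis; for a strictly fair comparison the two models should be fit on identical folds (for example, by supplying the same index to trainControl).

# Side-by-side comparison of the resampled LDA and QDA results (assumed sketch)
resamps <- resamples(list(LDA = lda_penguins, QDA = qda_penguins))
summary(resamps)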
# Splitting data into train and test data
split <- sample.split(penguin_measurements$species, SplitRatio = 0.8)
train_cl <- subset(penguin_measurements%>% as.data.frame(), split == "TRUE")
test_cl <- subset(penguin_measurements%>% as.data.frame(), split == "FALSE")
# Feature Scaling
train_scale <- scale(train_cl[, 2:5])
test_scale <- scale(test_cl[, 2:5])
# Fitting Naive Bayes Model to training dataset
set.seed(120) # Setting Seed
classifier_cl <- naiveBayes(species ~ ., data = train_cl)
classifier_cl
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Adelie Chinstrap Gentoo
## 0.4377510 0.2048193 0.3574297
##
## Conditional probabilities:
## bill_length_mm
## Y [,1] [,2]
## Adelie 39.22294 2.626173
## Chinstrap 48.38431 3.352514
## Gentoo 47.56742 3.245198
##
## bill_depth_mm
## Y [,1] [,2]
## Adelie 18.47064 1.122027
## Chinstrap 18.22353 1.101742
## Gentoo 15.02360 1.039780
##
## flipper_length_mm
## Y [,1] [,2]
## Adelie 190.8349 6.583528
## Chinstrap 194.8627 7.345800
## Gentoo 217.7753 6.661888
##
## body_mass_g
## Y [,1] [,2]
## Adelie 3742.431 475.3350
## Chinstrap 3677.451 389.4308
## Gentoo 5122.472 527.9819
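To make the independence assumption concrete, the following sketch (assumed, not part of the original analysis) reproduces a posterior for the first test penguin by hand: the class prior is multiplied by one Gaussian density per feature, using the means and standard deviations tabled above, and the products are normalized.

# Manual Gaussian Naive Bayes posterior for one test observation (assumed sketch)
x <- test_cl[1, 2:5]
priors <- classifier_cl$apriori / sum(classifier_cl$apriori)
unnorm <- sapply(names(priors), function(cl) {
  # one dnorm() per feature, evaluated at this penguin's measurements
  dens <- mapply(function(feature, tab) dnorm(x[[feature]], tab[cl, 1], tab[cl, 2]),
                 names(classifier_cl$tables), classifier_cl$tables)
  as.numeric(priors[cl]) * prod(dens)
})
round(unnorm / sum(unnorm), 4)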
# Predicting on the test data
y_pred <- predict(classifier_cl, newdata = test_cl)
# Confusion matrix and accuracy statistics
cm <- table(test_cl$species, y_pred)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## y_pred
## Adelie Chinstrap Gentoo
## Adelie 37 0 0
## Chinstrap 0 17 0
## Gentoo 0 0 30
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.957, 1)
## No Information Rate : 0.4405
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.4405 0.2024 0.3571
## Detection Rate 0.4405 0.2024 0.3571
## Detection Prevalence 0.4405 0.2024 0.3571
## Balanced Accuracy 1.0000 1.0000 1.0000
Comment on the model fits, strengths, weaknesses, and accuracy for the three models used in this analysis.
Generative methods learn the true input distribution (the joint probability), can generate data, and use Bayes' rule to obtain the posterior for classification. LDA is a strong method in general. It assumes that all classes share a common covariance matrix and uses the class means and variances to construct a linear boundary between them; this boundary is determined by the coefficients. LDA estimates the group means and computes, for each individual, the probability of belonging to each group; the individual is then assigned to the group with the highest probability. In the output above, the prior probabilities of groups are the proportions of training observations in each group, the group means are the group centers of gravity (the mean of each variable in each group), and the coefficients of linear discriminants give the linear combination of predictor variables that forms the LDA decision rule. On the held-out test set this model reached an accuracy of about 96.9%, and its 10-fold cross-validated accuracy was about 98.8%.
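The test-set posterior probabilities behind those predictions can be inspected directly (a quick look, assumed rather than part of the original output); each test penguin is assigned to the class with the largest posterior:

# Posterior probabilities for the first few test observations (assumed sketch)
head(round(predictions$posterior, 3))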
QDA is also a strong method, but it has no real advantage over LDA unless the class covariances really are different, and because it requires estimating many more parameters it can run into rank-deficiency and related problems when the model is constructed. QDA is slightly more flexible than LDA in that it does not assume equality of the variance/covariance matrices; in other words, the covariance matrix may differ for each class. QDA is recommended when the training set is very large, so that the variance of the classifier is not a major issue, or when the assumption of a common covariance matrix for the K classes is clearly untenable. On the held-out test set this model reached an accuracy of about 98.5%, and its 10-fold cross-validated accuracy was about 98.8%.
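A rough way to probe the common-covariance assumption (a sketch, not from the original analysis) is to compare the per-species covariance matrices of the standardized predictors; clearly different matrices would favor QDA over LDA.

# Per-species covariance matrices of the standardized training predictors (assumed sketch)
by(train.transform[, 2:5], train.transform$species, function(d) round(cov(d), 2))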
Naive Bayes places a stronger assumption on the features than LDA: it assumes the features are conditionally independent within each class, i.e., that the presence (or value) of one feature in a class is completely unrelated to all other features, whereas LDA does not require independence. In reality the features may well be dependent in some way, but when the independence assumption roughly holds, Naive Bayes performs extremely well and often better than other models. Naive Bayes can be used with continuous features, as here, but it is especially well suited to categorical variables; if all of the input features are categorical, Naive Bayes is a natural choice. Given its 100% accuracy on the held-out test set, Naive Bayes is the recommended model here.
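The within-species correlations can be inspected directly (a sketch, not part of the original output); flipper length and body mass, for example, remain positively correlated within each species, so the independence assumption is only an approximation for these data.

# Within-species correlation matrices of the four predictors (assumed sketch)
by(penguin_measurements[, 2:5], penguin_measurements$species,
   function(d) round(cor(d), 2))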