##1. Installing packages in CONSOLE
##2. Loading packages: ranger is for random forest algorithm
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ranger)
## Warning: package 'ranger' was built under R version 4.4.3
##3. Importing data
## Read the Sleep Health & Lifestyle CSV into a data frame.
## NOTE(review): the absolute path below is specific to the author's machine;
## it will need adjusting before the script can run elsewhere.
sleephealth <- read.csv(
  file = "C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Exercise Files/Week 9/Sleep Health & Lifestyle Dataset.csv"
)
##4. Exploring data
## 374-row dataset; each row corresponds to one person. The goal is to predict
## whether a person has a sleeping disorder (1) or not (0). The dataset is small
## so that the code runs faster.
head(sleephealth)
## Is_Male Age Salary Sleep.Duration Quality.of.Sleep Physical.Activity.Level
## 1 1 32 111283 7.2 8 50
## 2 0 35 113170 7.2 8 60
## 3 0 36 123298 7.2 8 60
## 4 0 36 107043 7.1 8 60
## 5 0 36 97889 7.2 8 60
## 6 0 36 121403 7.1 8 60
## Stress.Level BMI.Category Systolic.BP Diastolic.BP Heart.Rate Daily.Steps
## 1 6 1 118 76 68 7000
## 2 4 1 115 75 68 7000
## 3 4 1 115 75 68 7000
## 4 4 1 115 75 68 7000
## 5 4 1 115 75 68 7000
## 6 4 1 115 75 68 7000
## Sleep.Disorder
## 1 0
## 2 0
## 3 1
## 4 0
## 5 0
## 6 0
## Count the records in the imported dataset.
nrow(sleephealth)
## [1] 374
##5. Setting random seed
## Fix the RNG state so the random partition and resampling below are repeatable.
set.seed(seed = 1234)
##6. Pre-process data for binary classification
## The outcome must be categorical for classification, so the 0/1 column is
## recoded as a factor with levels 0 and 1.
sleephealth[["Sleep.Disorder"]] <- factor(
  x = sleephealth[["Sleep.Disorder"]],
  levels = c(0, 1)
)
##7. Partition training and validation sets
## Randomly assign 80% of the records to a training partition and the remaining
## 20% to a validation partition. For kNN the training data serves as a
## "look-up repository": to classify a record as Sleep.Disorder = 1 or
## Sleep.Disorder = 0, the k most similar training records are identified and
## their Sleep.Disorder status informs the classification.
##
## nrow * 0.8 is not a whole number here, so floor() states the intended integer
## sample size explicitly. The index vector is named train_idx so that it does
## not mask base::sample(), and FALSE is spelled out because F can be reassigned.
train_idx <- sample.int(
  n = nrow(sleephealth),
  size = floor(nrow(sleephealth) * 0.8),
  replace = FALSE
)
sleephealth_training <- sleephealth[train_idx, ] ## Rows drawn for the training partition
sleephealth_validation <- sleephealth[-train_idx, ] ## All remaining rows form the validation partition
##8. Normalizing data partitions for k-nearest neighbors
## Min-max normalization: the minimum and maximum of each variable are taken
## from the full dataset (before partitioning) and then used to rescale both the
## training and the validation partition, so that every variable takes values
## between 0 and 1.
## NOTE(review): estimating the ranges from all rows lets validation records
## influence the preprocessing. Consider fitting preProcess() on
## sleephealth_training only -- confirm against the exercise requirements, since
## the recorded outputs below were produced with the full-data ranges.
preProcValues <- preProcess(x = sleephealth, method = "range") ## Learns the per-column minimum and maximum
sleephealth_training_norm <- predict(object = preProcValues, newdata = sleephealth_training) ## Rescaled copy of the training partition
sleephealth_validation_norm <- predict(object = preProcValues, newdata = sleephealth_validation) ## Rescaled copy of the validation partition
## Normalized data (for both partitions) has variables that all use the same scale (0-1); lower values indicate that a value is closer to the sample minimum, higher values indicate that it is closer to the sample maximum.
## Preview the first rows of the normalized training partition.
head(sleephealth_training_norm)
## Is_Male Age Salary Sleep.Duration Quality.of.Sleep
## 284 0 1.0000 0.07532685 0.81481481 1.0
## 336 1 0.2500 0.12841569 0.33333333 0.6
## 101 1 0.1875 0.37024971 0.07407407 0.4
## 111 1 0.2500 0.37262339 0.51851852 0.8
## 133 1 0.5000 0.49893747 0.74074074 0.8
## 98 1 0.1875 0.61298441 0.07407407 0.4
## Physical.Activity.Level Stress.Level BMI.Category Systolic.BP Diastolic.BP
## 284 0.7500000 0.0 0.5 0.9259259 1.00
## 336 0.1666667 0.4 0.5 0.4814815 0.45
## 101 0.0000000 1.0 0.0 0.3703704 0.25
## 111 0.5000000 0.2 0.0 0.3703704 0.25
## 133 1.0000000 0.4 0.0 0.5555556 0.50
## 98 0.0000000 1.0 0.0 0.3703704 0.25
## Heart.Rate Daily.Steps Sleep.Disorder
## 284 0.1428571 0.5714286 1
## 336 0.2380952 0.3714286 0
## 101 0.3333333 0.2857143 0
## 111 0.0000000 0.2857143 0
## 133 0.2380952 0.7142857 0
## 98 0.3333333 0.2857143 0
## Preview the first rows of the normalized validation partition.
head(sleephealth_validation_norm)
## Is_Male Age Salary Sleep.Duration Quality.of.Sleep
## 1 1 0.15625 0.2122873 0.5185185 0.8
## 3 0 0.28125 0.2404689 0.5185185 0.8
## 5 0 0.28125 0.1808712 0.5185185 0.8
## 8 0 0.31250 0.2400655 0.5185185 0.8
## 9 0 0.31250 0.2475384 0.5185185 0.8
## 11 0 0.31250 0.2064352 0.5185185 0.8
## Physical.Activity.Level Stress.Level BMI.Category Systolic.BP Diastolic.BP
## 1 0.3333333 0.6 0 0.1111111 0.05
## 3 0.5000000 0.2 0 0.0000000 0.00
## 5 0.5000000 0.2 0 0.0000000 0.00
## 8 0.5000000 0.2 0 0.0000000 0.00
## 9 0.5000000 0.2 0 0.0000000 0.00
## 11 0.5000000 0.2 0 0.0000000 0.00
## Heart.Rate Daily.Steps Sleep.Disorder
## 1 0.1428571 0.5714286 0
## 3 0.1428571 0.5714286 1
## 5 0.1428571 0.5714286 0
## 8 0.1428571 0.5714286 0
## 9 0.1428571 0.5714286 0
## 11 0.1428571 0.5714286 0
##9. Train kNN model on normalized data & select optimal k
## The selected value is k = 7: for every record to be classified, the 7 most
## similar training records are looked up and the majority class of their
## Sleep.Disorder values becomes the prediction ("majority rules").
## k = 7 is chosen because it has the highest resampled accuracy (see the
## output below).
## Resampling scheme: 10-fold cross-validation, repeated 3 times.
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
## Evaluate 20 candidate values of k on the normalized training partition.
knnFit <- train(
  Sleep.Disorder ~ .,
  data = sleephealth_training_norm,
  method = "knn",
  trControl = ctrl,
  tuneLength = 20
)
## Display the relative performance of the different values of k.
knnFit
## k-Nearest Neighbors
##
## 299 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 269, 269, 269, 270, 269, 268, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9217402 0.8390083
## 7 0.9229663 0.8422585
## 9 0.9207440 0.8379920
## 11 0.9163330 0.8293468
## 13 0.9174824 0.8316874
## 15 0.9164071 0.8289728
## 17 0.9130738 0.8218019
## 19 0.9130738 0.8218019
## 21 0.9141849 0.8241484
## 23 0.9130738 0.8218019
## 25 0.9130738 0.8218019
## 27 0.9085910 0.8122398
## 29 0.9074799 0.8098950
## 31 0.9085910 0.8123034
## 33 0.9107774 0.8168779
## 35 0.9163713 0.8287820
## 37 0.9185935 0.8335439
## 39 0.9185935 0.8335439
## 41 0.9197046 0.8359538
## 43 0.9197046 0.8359538
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
## Plot the resampled accuracy against the candidate values of k.
plot(x = knnFit)
##10. Generate kNN predictions for normalized validation data
## Classify every record of the normalized validation partition with the tuned model.
knn_validation_predictions <- predict(
  object = knnFit,
  newdata = sleephealth_validation_norm
)
##11. Train random forest model - trained on original data NOT normalized data (data doesn’t need to be normalized for random forest models - it does for knn)
## Each decision tree in the forest is constructed from a bootstrapped sample of the 299 training records. Each DT takes in values…
## Fit a random forest (ranger) on the original, un-normalized training
## partition; no trControl is supplied, so caret's default resampling is used.
rf_fit <- train(
  Sleep.Disorder ~ .,
  data = sleephealth_training,
  method = "ranger"
)
## Display the resampled performance of each tuning-parameter combination.
rf_fit
## Random Forest
##
## 299 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 299, 299, 299, 299, 299, 299, ...
## Resampling results across tuning parameters:
##
## mtry splitrule Accuracy Kappa
## 2 gini 0.9476576 0.8912613
## 2 extratrees 0.9468566 0.8900889
## 7 gini 0.9219968 0.8378307
## 7 extratrees 0.9295087 0.8539901
## 12 gini 0.9053584 0.8036056
## 12 extratrees 0.9156190 0.8248558
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = gini
## and min.node.size = 1.
##12. Generate random forest predictions on validation data
## The un-normalized validation partition is used, matching the un-normalized
## data the forest was trained on.
rf_validation_predictions <- predict(
  object = rf_fit,
  newdata = sleephealth_validation
)
##13. Compare kNN and random forest performance on validation partitions
## RF model: accuracy (93.33%), sensitivity / recall / true positive rate (87.88%), specificity / true negative rate (97.62%), precision / positive predictive value (96.67%)
## kNN model: accuracy (92%), sensitivity / recall / true positive rate (87.88%), specificity / true negative rate (95.24%), precision / positive predictive value (93.55%)
## Depending on which metric you care about (sensitivity, precision or accuracy), you will choose one model over the other. The two are very close, and both are very good models.
## Random forest: validation predictions versus the observed outcome, with "1"
## (sleep disorder) treated as the positive class.
confusionMatrix(
  data = rf_validation_predictions,
  reference = sleephealth_validation$Sleep.Disorder,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 4
## 1 1 29
##
## Accuracy : 0.9333
## 95% CI : (0.8512, 0.978)
## No Information Rate : 0.56
## P-Value [Acc > NIR] : 7.369e-13
##
## Kappa : 0.8634
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.8788
## Specificity : 0.9762
## Pos Pred Value : 0.9667
## Neg Pred Value : 0.9111
## Prevalence : 0.4400
## Detection Rate : 0.3867
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.9275
##
## 'Positive' Class : 1
##
## kNN: validation predictions versus the observed outcome, with "1"
## (sleep disorder) treated as the positive class.
confusionMatrix(
  data = knn_validation_predictions,
  reference = sleephealth_validation_norm$Sleep.Disorder,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 40 4
## 1 2 29
##
## Accuracy : 0.92
## 95% CI : (0.834, 0.9701)
## No Information Rate : 0.56
## P-Value [Acc > NIR] : 6.898e-12
##
## Kappa : 0.8366
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.8788
## Specificity : 0.9524
## Pos Pred Value : 0.9355
## Neg Pred Value : 0.9091
## Prevalence : 0.4400
## Detection Rate : 0.3867
## Detection Prevalence : 0.4133
## Balanced Accuracy : 0.9156
##
## 'Positive' Class : 1
##