##1. Installing packages in CONSOLE

##2. Loading packages: ranger is for random forest algorithm

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ranger)
## Warning: package 'ranger' was built under R version 4.4.3

##3. Importing data

# Read the raw sleep-health survey CSV into a data frame.
# NOTE(review): the path is absolute and machine-specific; adjust when running elsewhere.
sleephealth <- read.csv(
  file = "C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Exercise Files/Week 9/Sleep Health & Lifestyle Dataset.csv"
)

##4. Exploring data: the dataset has 374 rows, and each row corresponds to one person. We are trying to predict whether a person has a sleep disorder (1) or not (0). The dataset is small, which makes the code run faster.

# Preview the first six records to inspect column names and values.
head(sleephealth, n = 6L)
##   Is_Male Age Salary Sleep.Duration Quality.of.Sleep Physical.Activity.Level
## 1       1  32 111283            7.2                8                      50
## 2       0  35 113170            7.2                8                      60
## 3       0  36 123298            7.2                8                      60
## 4       0  36 107043            7.1                8                      60
## 5       0  36  97889            7.2                8                      60
## 6       0  36 121403            7.1                8                      60
##   Stress.Level BMI.Category Systolic.BP Diastolic.BP Heart.Rate Daily.Steps
## 1            6            1         118           76         68        7000
## 2            4            1         115           75         68        7000
## 3            4            1         115           75         68        7000
## 4            4            1         115           75         68        7000
## 5            4            1         115           75         68        7000
## 6            4            1         115           75         68        7000
##   Sleep.Disorder
## 1              0
## 2              0
## 3              1
## 4              0
## 5              0
## 6              0
# Report the number of records (rows) in the data set.
dim(sleephealth)[1L]
## [1] 374

##5. Setting random seed

# Fix the random number generator state so the random draws that follow
# (such as the train/validation split) are reproducible between runs.
set.seed(seed = 1234)

##6. Pre-processing data for binary classification: the outcome variable must be converted to a categorical variable (factor) to enable classification.

# Recode the 0/1 outcome column as a two-level factor so it is handled as a
# categorical (classification) target rather than a number.
sleephealth[["Sleep.Disorder"]] <- factor(
  sleephealth[["Sleep.Disorder"]],
  levels = c(0, 1)
)

##7. Partitioning training and validation sets: randomly put 80% of our data records into a training partition and the rest (20%) into a validation partition. The training data will serve as a “look-up repository” for every data record that we’re trying to classify. So, for every record that we try to classify as Sleep Disorder = 1 or Sleep Disorder = 0, we will look to our training data to identify the k most similar records and use their Sleep Disorder status to inform our classification.

# Draw an 80/20 train/validation split by sampling row indices without
# replacement.
# - floor() makes the training-set size an explicit whole number; the product
#   nrow * 0.8 is generally fractional (374 * 0.8 = 299.2) and was previously
#   truncated implicitly by sample.int().
# - FALSE is spelled out because the shorthand F is an ordinary variable that
#   can be reassigned.
# The index vector keeps the name `sample`; calls to sample() still resolve to
# the base function because R skips non-function objects when looking up a call.
sample <- sample.int(
  n = nrow(sleephealth),
  size = floor(nrow(sleephealth) * 0.8),
  replace = FALSE
)
sleephealth_training <- sleephealth[sample, ] # rows drawn for training (80%)
sleephealth_validation <- sleephealth[-sample, ] # all remaining rows (20%)

##8. Normalizing data partitions for k-nearest neighbors: using all of our data (the data before partitioning), we identify the minimum and maximum values of each variable. These are then used to normalize the data in both our training and validation partitions. We are running min-max normalization (other normalization methods exist…)

# Fit a min-max ("range") scaler on the complete, unpartitioned data set; the
# previews of the scaled partitions show values rescaled onto [0, 1].
# NOTE(review): the scaler is fitted on validation rows as well as training
# rows; fitting on the training partition only would avoid information
# leakage — confirm the current choice is intended.
preProcValues <- preProcess(sleephealth, method = "range")

# Apply the fitted scaler to each partition, storing the results as new
# data frames so the unscaled partitions remain available.
sleephealth_training_norm <- predict(
  preProcValues,
  newdata = sleephealth_training
)
sleephealth_validation_norm <- predict(
  preProcValues,
  newdata = sleephealth_validation
)

# Preview the first six scaled training records.
head(sleephealth_training_norm, n = 6L)
##     Is_Male    Age     Salary Sleep.Duration Quality.of.Sleep
## 284       0 1.0000 0.07532685     0.81481481              1.0
## 336       1 0.2500 0.12841569     0.33333333              0.6
## 101       1 0.1875 0.37024971     0.07407407              0.4
## 111       1 0.2500 0.37262339     0.51851852              0.8
## 133       1 0.5000 0.49893747     0.74074074              0.8
## 98        1 0.1875 0.61298441     0.07407407              0.4
##     Physical.Activity.Level Stress.Level BMI.Category Systolic.BP Diastolic.BP
## 284               0.7500000          0.0          0.5   0.9259259         1.00
## 336               0.1666667          0.4          0.5   0.4814815         0.45
## 101               0.0000000          1.0          0.0   0.3703704         0.25
## 111               0.5000000          0.2          0.0   0.3703704         0.25
## 133               1.0000000          0.4          0.0   0.5555556         0.50
## 98                0.0000000          1.0          0.0   0.3703704         0.25
##     Heart.Rate Daily.Steps Sleep.Disorder
## 284  0.1428571   0.5714286              1
## 336  0.2380952   0.3714286              0
## 101  0.3333333   0.2857143              0
## 111  0.0000000   0.2857143              0
## 133  0.2380952   0.7142857              0
## 98   0.3333333   0.2857143              0
# Preview the first six scaled validation records.
head(sleephealth_validation_norm, n = 6L)
##    Is_Male     Age    Salary Sleep.Duration Quality.of.Sleep
## 1        1 0.15625 0.2122873      0.5185185              0.8
## 3        0 0.28125 0.2404689      0.5185185              0.8
## 5        0 0.28125 0.1808712      0.5185185              0.8
## 8        0 0.31250 0.2400655      0.5185185              0.8
## 9        0 0.31250 0.2475384      0.5185185              0.8
## 11       0 0.31250 0.2064352      0.5185185              0.8
##    Physical.Activity.Level Stress.Level BMI.Category Systolic.BP Diastolic.BP
## 1                0.3333333          0.6            0   0.1111111         0.05
## 3                0.5000000          0.2            0   0.0000000         0.00
## 5                0.5000000          0.2            0   0.0000000         0.00
## 8                0.5000000          0.2            0   0.0000000         0.00
## 9                0.5000000          0.2            0   0.0000000         0.00
## 11               0.5000000          0.2            0   0.0000000         0.00
##    Heart.Rate Daily.Steps Sleep.Disorder
## 1   0.1428571   0.5714286              0
## 3   0.1428571   0.5714286              1
## 5   0.1428571   0.5714286              0
## 8   0.1428571   0.5714286              0
## 9   0.1428571   0.5714286              0
## 11  0.1428571   0.5714286              0