Project 5

Libraries needed

library(caretEnsemble)
library(caret)
library(ISLR)
library(elasticnet)
library(AppliedPredictiveModeling)
library(randomForest)
library(gbm)
library(readxl)
library(neuralnet)
library(MASS)
library(HMM)
library(ggplot2)
library(usmap)

Problem 1

1A

# Reads the ionosphere data, drops the near-zero-variance columns flagged by
# nearZeroVar(), converts the first column to numeric, and makes the class
# label (V35) a factor.
setwd("C:/Users/joshr/Documents/Machine Learning R")
sphereData <- read.csv(file = 'ionosphere.data', header = FALSE)

sphereZeroValues <- nearZeroVar(sphereData)
# Negative indexing with an empty vector would select zero columns, so only
# subset when at least one column was flagged.
if (length(sphereZeroValues) > 0) {
  sphereData <- sphereData[, -sphereZeroValues, drop = FALSE]
}

sphereData$V1 <- as.numeric(sphereData$V1)
# read.csv() only creates factors by default before R 4.0; confusionMatrix()
# needs factor labels, so convert explicitly (a no-op if V35 is already a factor).
sphereData$V35 <- as.factor(sphereData$V35)
# Sets the seed to 12345 so the partition is reproducible
set.seed(12345)
# Partitions the data into Training (70%) and Testing (remaining rows) sets
sphereIndex <- createDataPartition(sphereData$V35, p = 0.7, list = FALSE)
Trainingset <- sphereData[sphereIndex, ]
Testingset <- sphereData[-sphereIndex, ]

1B

# caretList() (caretEnsemble) fits every method in methodList with the same
# trainControl settings: here rf, gbm, rpart (CART) and svmLinear (SVM),
# each under 10-fold cross-validation with saved predictions and class probabilities.
models1 <- caretList(
  V35 ~ .,
  data = Trainingset,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("rf", "gbm", "rpart", "svmLinear")
)

# Collects the resampling results of all four models
results <- resamples(models1)
# Prints the numerical summary of the resampled metrics
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: rf, gbm, rpart, svmLinear 
## Number of resamples: 10 
## 
## Accuracy 
##                Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## rf        0.7916667 0.8862500 0.9591667 0.9305000    0.99 1.00    0
## gbm       0.7916667 0.9166667 0.9200000 0.9265000    0.96 1.00    0
## rpart     0.7083333 0.8350000 0.8575000 0.8576667    0.91 0.96    0
## svmLinear 0.7083333 0.8762500 0.8800000 0.8700000    0.88 0.96    0
## 
## Kappa 
##                Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
## rf        0.5454545 0.7587274 0.9071289 0.8462430 0.9777580 1.000000    0
## gbm       0.5652174 0.8043796 0.8198702 0.8367863 0.9141987 1.000000    0
## rpart     0.3913043 0.6185152 0.6812276 0.6861784 0.8062323 0.911032    0
## svmLinear 0.3333333 0.6970167 0.7191011 0.7010252 0.7295973 0.911032    0

# Dot plot of the resampled metrics, one row per model
dotplot(results)

The dotplot compares the gbm1, rf1, CART, and SVM models. The models have comparable accuracy, each somewhat close to 0.9. The gbm1 and rf1 models, though, generally have higher accuracies and kappas than the CART and SVM models: gbm1 and rf1 each have higher first-quartile and third-quartile values than SVM and CART for both accuracy and kappa. The rf1 model has the highest mean accuracy with 93.05% and the highest mean Kappa value of 0.8462. The gbm1 model has a mean accuracy of 92.65% and a mean Kappa value of 0.8368. The SVM model has a mean accuracy of 87.00% and a mean Kappa of 0.701. The CART model has the lowest mean accuracy with 85.77% and the lowest mean Kappa value of 0.6862.

1C

# rf1 on the training data: predict, then compare against the true V35 labels
predictedtrainingrf1 <- predict(models1[["rf"]], newdata = Trainingset)
confusionMatrix(data = predictedtrainingrf1, reference = Trainingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  89   0
##          g   0 158
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9852, 1)
##     No Information Rate : 0.6397     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3603     
##          Detection Rate : 0.3603     
##    Detection Prevalence : 0.3603     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : b          
##

# CART on the training data: predict, then compare against the true V35 labels
predictedtrainingCART <- predict(models1[["rpart"]], newdata = Trainingset)
confusionMatrix(data = predictedtrainingCART, reference = Trainingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  73  10
##          g  16 148
##                                           
##                Accuracy : 0.8947          
##                  95% CI : (0.8496, 0.9301)
##     No Information Rate : 0.6397          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7682          
##                                           
##  Mcnemar's Test P-Value : 0.3268          
##                                           
##             Sensitivity : 0.8202          
##             Specificity : 0.9367          
##          Pos Pred Value : 0.8795          
##          Neg Pred Value : 0.9024          
##              Prevalence : 0.3603          
##          Detection Rate : 0.2955          
##    Detection Prevalence : 0.3360          
##       Balanced Accuracy : 0.8785          
##                                           
##        'Positive' Class : b               
##

# SVM on the training data: predict, then compare against the true V35 labels
predictedtrainingSVM <- predict(models1[["svmLinear"]], newdata = Trainingset)
confusionMatrix(data = predictedtrainingSVM, reference = Trainingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  78   1
##          g  11 157
##                                           
##                Accuracy : 0.9514          
##                  95% CI : (0.9167, 0.9746)
##     No Information Rate : 0.6397          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.892           
##                                           
##  Mcnemar's Test P-Value : 0.009375        
##                                           
##             Sensitivity : 0.8764          
##             Specificity : 0.9937          
##          Pos Pred Value : 0.9873          
##          Neg Pred Value : 0.9345          
##              Prevalence : 0.3603          
##          Detection Rate : 0.3158          
##    Detection Prevalence : 0.3198          
##       Balanced Accuracy : 0.9350          
##                                           
##        'Positive' Class : b               
##

# gbm1 on the training data: predict, then compare against the true V35 labels
predictedtraininggbm1 <- predict(models1[["gbm"]], newdata = Trainingset)
confusionMatrix(data = predictedtraininggbm1, reference = Trainingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  89   0
##          g   0 158
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9852, 1)
##     No Information Rate : 0.6397     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3603     
##          Detection Rate : 0.3603     
##    Detection Prevalence : 0.3603     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : b          
##

Using the training data, the rf1 model has an accuracy of 1, the CART model has an accuracy of 0.8947, the SVM model has an accuracy of 0.9514, and the gbm1 model has an accuracy of 1. It is evident that rf1 and gbm1 are the most accurate models on the Training data. It is also evident that CART is the least accurate model on the Training data.

1D

# rf1 on the held-out testing data: predict, then compare against the true V35 labels
predictedtestingrf1 <- predict(models1[["rf"]], newdata = Testingset)
confusionMatrix(data = predictedtestingrf1, reference = Testingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 33  1
##          g  4 66
##                                           
##                Accuracy : 0.9519          
##                  95% CI : (0.8914, 0.9842)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 7.16e-14        
##                                           
##                   Kappa : 0.8932          
##                                           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.8919          
##             Specificity : 0.9851          
##          Pos Pred Value : 0.9706          
##          Neg Pred Value : 0.9429          
##              Prevalence : 0.3558          
##          Detection Rate : 0.3173          
##    Detection Prevalence : 0.3269          
##       Balanced Accuracy : 0.9385          
##                                           
##        'Positive' Class : b               
##

# CART on the held-out testing data: predict, then compare against the true V35 labels
predictedtestingCART <- predict(models1[["rpart"]], newdata = Testingset)
confusionMatrix(data = predictedtestingCART, reference = Testingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 34  3
##          g  3 64
##                                           
##                Accuracy : 0.9423          
##                  95% CI : (0.8787, 0.9785)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 6.661e-13       
##                                           
##                   Kappa : 0.8741          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9189          
##             Specificity : 0.9552          
##          Pos Pred Value : 0.9189          
##          Neg Pred Value : 0.9552          
##              Prevalence : 0.3558          
##          Detection Rate : 0.3269          
##    Detection Prevalence : 0.3558          
##       Balanced Accuracy : 0.9371          
##                                           
##        'Positive' Class : b               
##

# SVM on the held-out testing data: predict, then compare against the true V35 labels
predictedtestingSVM <- predict(models1[["svmLinear"]], newdata = Testingset)
confusionMatrix(data = predictedtestingSVM, reference = Testingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 25  1
##          g 12 66
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.7957, 0.9317)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 9.994e-08       
##                                           
##                   Kappa : 0.7079          
##                                           
##  Mcnemar's Test P-Value : 0.005546        
##                                           
##             Sensitivity : 0.6757          
##             Specificity : 0.9851          
##          Pos Pred Value : 0.9615          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.3558          
##          Detection Rate : 0.2404          
##    Detection Prevalence : 0.2500          
##       Balanced Accuracy : 0.8304          
##                                           
##        'Positive' Class : b               
##

# gbm1 on the held-out testing data: predict, then compare against the true V35 labels
predictedtestinggbm1 <- predict(models1[["gbm"]], newdata = Testingset)
confusionMatrix(data = predictedtestinggbm1, reference = Testingset$V35)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 34  1
##          g  3 66
##                                           
##                Accuracy : 0.9615          
##                  95% CI : (0.9044, 0.9894)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 6.354e-15       
##                                           
##                   Kappa : 0.9151          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9189          
##             Specificity : 0.9851          
##          Pos Pred Value : 0.9714          
##          Neg Pred Value : 0.9565          
##              Prevalence : 0.3558          
##          Detection Rate : 0.3269          
##    Detection Prevalence : 0.3365          
##       Balanced Accuracy : 0.9520          
##                                           
##        'Positive' Class : b               
##

Using the testing data, the rf1 model has an accuracy of 0.9519, the CART model has an accuracy of 0.9423, the SVM model has an accuracy of 0.875, and the gbm1 model has an accuracy of 0.9615. It is evident that gbm1 is the most accurate model on the Testing data. Also, SVM is the least accurate model on the Testing data.

Problem 2

2A

data(Khan)

# Build one data frame per split: the class label becomes factor column R,
# followed by the predictor matrix columns.

khantraining <- data.frame(R = as.factor(Khan$ytrain), Khan$xtrain)
khantesting <- data.frame(R = as.factor(Khan$ytest), Khan$xtest)

# Fix the RNG state before any model is trained
set.seed(12345)

2B

# All four models below use 10-fold cross-validation and center/scale the
# predictors before fitting; only the method differs.

# CART (rpart)
CART2 <- train(
  R ~ .,
  data = khantraining,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Random forest (rf)
RF2 <- train(
  R ~ .,
  data = khantraining,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Gradient boosting machine (gbm)
GBM2 <- train(
  R ~ .,
  data = khantraining,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

# Linear support vector machine (svmLinear)
SVM2 <- train(
  R ~ .,
  data = khantraining,
  method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center", "scale")
)

2C

# CART2 on the training set: predicted vs. true class
confusionMatrix(
  data = predict(CART2, newdata = khantraining),
  reference = khantraining$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  0  0  0  0
##          2  0 22  0  0
##          3  7  1 12  0
##          4  1  0  0 20
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8571          
##                  95% CI : (0.7461, 0.9325)
##     No Information Rate : 0.3651          
##     P-Value [Acc > NIR] : 1.023e-15       
##                                           
##                   Kappa : 0.7977          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             0.000   0.9565   1.0000   1.0000
## Specificity             1.000   1.0000   0.8431   0.9767
## Pos Pred Value            NaN   1.0000   0.6000   0.9524
## Neg Pred Value          0.873   0.9756   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.000   0.3492   0.1905   0.3175
## Detection Prevalence    0.000   0.3492   0.3175   0.3333
## Balanced Accuracy       0.500   0.9783   0.9216   0.9884

# RF2 on the training set: predicted vs. true class
confusionMatrix(
  data = predict(RF2, newdata = khantraining),
  reference = khantraining$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000

# GBM2 on the training set: predicted vs. true class
confusionMatrix(
  data = predict(GBM2, newdata = khantraining),
  reference = khantraining$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000

# SVM2 on the training set: predicted vs. true class
confusionMatrix(
  data = predict(SVM2, newdata = khantraining),
  reference = khantraining$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000

The four different models built using the training data had strong accuracies. The CART2 model had a strong accuracy of 85.71%. The RF2, GBM2, and SVM2 models each had accuracies of 100%. The CART2 model is the only model that does not have 100% prediction accuracy on the training dataset.

2D

# CART2 on the testing set: predicted vs. true class
confusionMatrix(
  data = predict(CART2, newdata = khantesting),
  reference = khantesting$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 0 0 0 0
##          2 0 4 0 1
##          3 3 1 5 1
##          4 0 1 1 3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.3605, 0.8088)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 0.005138        
##                                           
##                   Kappa : 0.4386          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              0.00   0.6667   0.8333   0.6000
## Specificity              1.00   0.9286   0.6429   0.8667
## Pos Pred Value            NaN   0.8000   0.5000   0.6000
## Neg Pred Value           0.85   0.8667   0.9000   0.8667
## Prevalence               0.15   0.3000   0.3000   0.2500
## Detection Rate           0.00   0.2000   0.2500   0.1500
## Detection Prevalence     0.00   0.2500   0.5000   0.2500
## Balanced Accuracy        0.50   0.7976   0.7381   0.7333

# RF2 on the testing set: predicted vs. true class
confusionMatrix(
  data = predict(RF2, newdata = khantesting),
  reference = khantesting$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 0 0
##          3 0 0 5 0
##          4 0 0 1 5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.7513, 0.9987)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 1.662e-09       
##                                           
##                   Kappa : 0.9322          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00      1.0   0.8333   1.0000
## Specificity              1.00      1.0   1.0000   0.9333
## Pos Pred Value           1.00      1.0   1.0000   0.8333
## Neg Pred Value           1.00      1.0   0.9333   1.0000
## Prevalence               0.15      0.3   0.3000   0.2500
## Detection Rate           0.15      0.3   0.2500   0.2500
## Detection Prevalence     0.15      0.3   0.2500   0.3000
## Balanced Accuracy        1.00      1.0   0.9167   0.9667

# GBM2 on the testing set: predicted vs. true class
confusionMatrix(
  data = predict(GBM2, newdata = khantesting),
  reference = khantesting$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 5 0 1
##          3 0 0 6 0
##          4 0 1 0 4
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.683, 0.9877)
##     No Information Rate : 0.3            
##     P-Value [Acc > NIR] : 3.773e-08      
##                                          
##                   Kappa : 0.8639         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00   0.8333      1.0   0.8000
## Specificity              1.00   0.9286      1.0   0.9333
## Pos Pred Value           1.00   0.8333      1.0   0.8000
## Neg Pred Value           1.00   0.9286      1.0   0.9333
## Prevalence               0.15   0.3000      0.3   0.2500
## Detection Rate           0.15   0.2500      0.3   0.2000
## Detection Prevalence     0.15   0.3000      0.3   0.2500
## Balanced Accuracy        1.00   0.8810      1.0   0.8667

# SVM2 on the testing set: predicted vs. true class
confusionMatrix(
  data = predict(SVM2, newdata = khantesting),
  reference = khantesting$R
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 2 0
##          3 0 0 4 0
##          4 0 0 0 5
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.683, 0.9877)
##     No Information Rate : 0.3            
##     P-Value [Acc > NIR] : 3.773e-08      
##                                          
##                   Kappa : 0.8639         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00   1.0000   0.6667     1.00
## Specificity              1.00   0.8571   1.0000     1.00
## Pos Pred Value           1.00   0.7500   1.0000     1.00
## Neg Pred Value           1.00   1.0000   0.8750     1.00
## Prevalence               0.15   0.3000   0.3000     0.25
## Detection Rate           0.15   0.3000   0.2000     0.25
## Detection Prevalence     0.15   0.4000   0.2000     0.25
## Balanced Accuracy        1.00   0.9286   0.8333     1.00

The gbm and SVM models each had 90% accuracy. The rf model had an accuracy of 95%. The CART model had a much lower accuracy of only 60%. Therefore, it seems as though the CART model does not do a great job of predicting on the testing data, but every other model does rather well.

Problem 3

3A

# Load the energy-efficiency workbook from the project folder
setwd("C:/Users/joshr/Documents/Machine Learning R")
EnergyData <- read_excel(path = "ENB2012_data.xlsx")
#Normalizes the data
# Min-max rescales a numeric vector x to the range [0, 1].
# Returns a vector of the same length as x. Any NA in x makes the whole
# result NA, because range() is taken without na.rm.
normalize <- function(x){
  rng <- range(x)
  span <- rng[2] - rng[1]
  # A constant vector has zero span; dividing would give 0/0 = NaN for every
  # element, so map it to all zeros instead.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Apply normalize() to every column and reassemble the result as a data frame
normalizeddata <- as.data.frame(lapply(X = EnergyData, FUN = normalize))
# Fix the RNG state so the split below is reproducible
set.seed(12345)
# 70% of the rows (selected on Y1) go to training; the rest form the testing set
index <- createDataPartition(y = normalizeddata$Y1, p = 0.7, list = FALSE)
energytraining <- normalizeddata[index, ]
energytesting <- normalizeddata[-index, ]

3B

# Fits a caret "nnet" model for Y1 on X1..X8 using the training rows;
# trace = FALSE silences the optimizer's progress output.
NN3b <- train(
  Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  data = energytraining,
  method = "nnet",
  trace = FALSE
)

3C

# R^2 between NN3b's predictions and the observed Y1 on the testing rows
NN3bR2 <- R2(
  pred = predict(NN3b, newdata = energytesting),
  obs = energytesting$Y1
)
NN3bR2

## [1] 0.9895002

R^2 on the testing data is 0.9895002.

3D

#Fits and plots a neural network called NN3d that has one hidden unit in one hidden layer.
NN3d <- neuralnet(Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = energytraining, hidden = 1)
plot(NN3d, rep="best")

#Using NN3d, computes r^2 for Y1 and Y2 using testing data.
# net.result holds one column per response, in formula order (Y1, then Y2).
# Each response must be scored against its own column; passing the whole
# matrix and reading element [1,1] would compare Y2 with the Y1 predictions.
# drop = FALSE keeps each R2() result a 1x1 matrix so [1,1] indexing still works.
NN3dpred <- compute(NN3d, energytesting[,1:8])$net.result
Y1r2 <- R2(NN3dpred[, 1, drop = FALSE], energytesting$Y1)
Y2r2 <- R2(NN3dpred[, 2, drop = FALSE], energytesting$Y2)

# R^2 of NN3d for Y1 on the testing data
Y1r2[1, 1]

## [1] 0.9246202

# First element of the Y2 R^2 result for NN3d
Y2r2[1, 1]

## [1] 0.8964148

Y1’s r^2 is 0.9246202 and Y2’s r^2 is 0.8964148. The high r^2 value of 0.9246202 exhibits that this neural network model (NN3d) is quite effective at predicting Y1 (heating load). The Y2 r^2 value of 0.8964148 is high and exhibits that this neural network model (NN3d) is fairly effective at predicting Y2 (cooling load).

3E

#Fits and plots a neural network called NN3e that has two hidden layers. The first layer has 2 nodes while the second layer has 1 node.

NN3e <- neuralnet(Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = energytraining, hidden = c(2,1), stepmax = 225000)
plot(NN3e, rep="best")

# Using NN3e, computes r^2 for Y1 and Y2 using the testing data.
# net.result holds one column per response, in formula order (Y1, then Y2).
# Each response must be scored against its own column; passing the whole
# matrix and reading element [1,1] would compare Y2 with the Y1 predictions.
# drop = FALSE keeps each R2() result a 1x1 matrix so [1,1] indexing still works.
NN3epred <- compute(NN3e, energytesting[,1:8])$net.result
Y1r2e <- R2(NN3epred[, 1, drop = FALSE], energytesting$Y1)
Y2r2e <- R2(NN3epred[, 2, drop = FALSE], energytesting$Y2)

# R^2 of NN3e for Y1 on the testing data
Y1r2e[1, 1]

## [1] 0.9761593

# First element of the Y2 R^2 result for NN3e
Y2r2e[1, 1]

## [1] 0.9512389

Y1’s r^2 is 0.9761593 and Y2’s r^2 is 0.9512389. The Y1 r^2 value of 0.9761593 is high and exhibits that this neural network model (NN3e) is very effective at predicting Y1 (heating load). The Y2 r^2 value of 0.9512389 is high and exhibits that this neural network model (NN3e) is also very effective at predicting Y2 (cooling load).

Problem 4

4A

# Fix the RNG state, then fit NN4b: medv as a function of lstat with a single
# hidden layer containing one unit, on the raw (un-normalized) Boston data.
set.seed(12345)
NN4b <- neuralnet(medv ~ lstat, data = Boston, hidden = 1)
# Scatter the observations, then overlay the model's output on an lstat grid 0..50
plot(Boston$lstat, Boston$medv, pch = 19, xlab = "Lstat Observed", ylab = "Medv Observed")
BostonData <- data.frame(BostonData = seq(0, 50, .25))
lines(BostonData$BostonData, compute(NN4b, BostonData)$net.result, col = "purple")

As we can see from the plot, the fit is not very good since it is simply a straight line. A curved line would fit the data much better.

4B

#Constructing a new dataframe with a normalized version of the Boston data
# Min-max rescales a numeric vector x to the range [0, 1].
# Returns a vector of the same length as x. Any NA in x makes the whole
# result NA, because range() is taken without na.rm.
normalize <- function(x){
  rng <- range(x)
  span <- rng[2] - rng[1]
  # A constant vector has zero span; dividing would give 0/0 = NaN for every
  # element, so map it to all zeros instead.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Apply normalize() to every Boston column and reassemble as a data frame
BostonNormalized <- as.data.frame(lapply(X = Boston, FUN = normalize))

4C

# Fit NN4d: same architecture as before (one hidden layer, one unit), but on
# the normalized Boston data.
NN4d <- neuralnet(medv ~ lstat, data = BostonNormalized, hidden = 1)
# Scatter the normalized observations, then overlay the model on an lstat grid 0..1
plot(BostonNormalized$lstat, BostonNormalized$medv, pch = 19, xlab = "Lstat Normalized", ylab = "Medv Normalized")
BostonData1 <- data.frame(BostonData1 = seq(0, 1, .05))
lines(BostonData1$BostonData1, compute(NN4d, BostonData1)$net.result, lwd = 2, col = "red")

The quality of the fit on the normalized data is much better than on the non-normalized data in 4A. The model is now fit with a curve instead of simply a straight line.

4D

# Draw the fitted NN4d network (weights and biases) for its best repetition
plot(NN4d, rep = "best")

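The corresponding equation is: $\widehat{\text{medv}} = 2.70587 - 2.53167\,S(0.99799 + 6.03666\,\text{lstat})$, where $S(z) = 1/(1 + e^{-z})$ is the logistic (sigmoid) activation function and medv and lstat are the normalized values.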

4E

# Fit NN4f on the normalized Boston data: two hidden layers with two units each
NN4f <- neuralnet(medv ~ lstat, data = BostonNormalized, hidden = c(2, 2))
# Scatter the normalized observations, then overlay the model on an lstat grid 0..1
plot(BostonNormalized$lstat, BostonNormalized$medv, pch = 19, xlab = "Lstat Normalized", ylab = "Medv Normalized")
BostonData2 <- data.frame(BostonData2 = seq(0, 1, .05))
lines(BostonData2$BostonData2, compute(NN4f, BostonData2)$net.result, lwd = 2, col = "blue")

This model, like the one in 4C, seems to fit the data rather well. It has a curved shape, which fits better than the straight line in 4A. However, the model has a slant at the top that makes it look S shaped.

Problem 5

5A

setwd("C:/Users/joshr/Documents/Machine Learning R")
# Load the casino roll data from the project folder
Casino <- read.csv(file = "Casino.csv")
# Rescales a non-negative vector so that its entries sum to 1
normalizeProbabilities <- function (x) {x/sum(x)}
set.seed(6789)
#Initializing the Hidden Markov Model
# Transition matrix: row i holds the probabilities of moving FROM state i, so
# each row must sum to 1. matrix() fills column-wise by default, which would
# give rows (.99, .02) and (.01, .98); byrow = TRUE yields the intended
# rows (.99, .01) and (.02, .98), i.e. cbind(c(.99, .02), c(.01, .98)).
TransitionMatrix <- matrix(c(.99,.01,.02,.98), 2, byrow = TRUE)
# Random starting-state distribution
PIprobs <- normalizeProbabilities(runif(2))
# Random emission matrix: one row per state, one column per die face (2 x 6).
# Normalize within each state so every row is a distribution over the six faces.
Bprobs <- t(apply(matrix(runif(12), 2), 1, normalizeProbabilities))

5B

# Assemble the initial two-state HMM over die faces 1..6, then refine its
# parameters on the observed rolls with Baum-Welch (at most 50 iterations).
hmm <- initHMM(
  States = c("fair die", "unfair die"),
  Symbols = 1:6,
  startProbs = PIprobs,
  transProbs = TransitionMatrix,
  emissionProbs = Bprobs
)
CasinoModel <- baumWelch(hmm, observation = Casino$Roll, maxIterations = 50)
# Estimated emission probabilities per state
CasinoModel$hmm$emissionProbs

##             symbols
## states               1          2          3         4         5         6
##   fair die   0.1666314 0.16706238 0.16771742 0.1667201 0.1672893 0.1645794
##   unfair die 0.1009619 0.09892086 0.09879078 0.1005153 0.1015320 0.4992791

The model estimates that the unfair die has a probability of about 0.10 for each of the faces 1-5 and about 0.5 for the face 6. Therefore, according to the model, the unfair die is much more likely to roll a 6 than any number from 1-5.

Problem 6

6A

setwd("C:/Users/joshr/Documents/Machine Learning R")
# Load the survey; empty strings, single spaces and literal "NA" are read as missing
KaggleSurvey <- read.csv(file =  'KaggleSurvey.csv', na.strings = c("", " ", "NA"))
# Drop original columns 3 and 5 in one step (the salary and country columns,
# per the original author's note; the positions are not verifiable from here)
KaggleSurvey <- KaggleSurvey[, -c(3, 5)]
# Keep only rows with no missing values
KaggleSurvey <- na.omit(KaggleSurvey)
# Treat the survey response as a categorical variable
KaggleSurvey$Response <- as.factor(KaggleSurvey$Response)

6B

# Fit ORD, an ordinal regression of Response on Gender, Age and Student, with polr()
ORD <- polr(formula = Response ~ Gender + Age + Student, data = KaggleSurvey)
# Coefficients, intercepts (cut points) and fit statistics
summary(ORD)

## Call:
## polr(formula = Response ~ Gender + Age + Student, data = KaggleSurvey)
## 
## Coefficients:
##              Value Std. Error t value
## GenderMale  0.1511   0.038853   3.890
## Age        -0.0100   0.001627  -6.147
## Student     0.1400   0.038908   3.597
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -3.4791   0.0737   -47.1938
## 2|3  -1.9297   0.0636   -30.3541
## 3|4  -0.2651   0.0612    -4.3330
## 4|5   0.9878   0.0616    16.0235
## 
## Residual Deviance: 44823.35 
## AIC: 44837.35

# Exponentiate the coefficients to express them as odds ratios
exp(coef(ORD))

## GenderMale        Age    Student 
##  1.1631549  0.9900479  1.1502335

6C

# Four 25-year-old profiles: male non-student, male student, female non-student,
# female student. Predict the probability of each response level for each.
testing <- data.frame(
  Student = c(0, 1, 0, 1),
  Gender = c("Male", "Male", "Female", "Female"),
  Age = c(25, 25, 25, 25)
)
predict(ORD, newdata = testing, type = "probs")

##            1          2         3         4         5
## 1 0.03292027 0.10522049 0.3204161 0.2892042 0.2522389
## 2 0.02874410 0.09356062 0.3017577 0.2963971 0.2795405
## 3 0.03808680 0.11905067 0.3391082 0.2789427 0.2248117
## 4 0.03327778 0.10619837 0.3218555 0.2885306 0.2501377

From the results shown above, we can see that group 2 (row 2 of the table: male students) has the greatest probability of responding “Much better” to the survey question. Their probability is 0.2795405, while the next highest probability is group 1 (row 1: male non-students) with 0.2522389.

6D

We can determine the effect of Age on the model from the code in 6B. From the summary, we can see that Age appears to be rather significant, based on its t-value of -6.147, which is far from zero. The coefficient for Age is negative (-0.0100), and its exponentiated value is about 0.99: each additional year of age multiplies the odds of responding more positively about the quality of online learning platforms and MOOCs, as compared to traditional education, by 0.99, i.e. it decreases those odds by about 1%.

Problem 7

7A

# Load the South America data from the project folder
setwd("C:/Users/joshr/Documents/Machine Learning R")
southAmerica <- read.csv(file = "SouthAmerica.csv")
# Use the country names as row labels, then drop the Country column itself
rownames(southAmerica) <- southAmerica$Country
southAmerica <- southAmerica[, names(southAmerica) != "Country"]
# Center and scale each column, compute pairwise distances, cluster
# hierarchically, and draw the dendrogram.
# NOTE(review): the variable `dist` shadows the base function of the same name;
# the call still resolves to the function, and the name is kept as-is.
dist <- dist(scale(southAmerica))
clust <- hclust(d = dist)
plot(clust, main = "Countries")

7B

The countries that are most like Colombia, according to the dendrogram, are Ecuador and Peru.

7C

If we choose a height so that there are only two clusters, the clusters would be (at a height of 6): Cluster 1: Bolivia, Cluster 2: Chile, Ecuador, Colombia, Peru, Venezuela, Argentina, Uruguay, Brazil, Paraguay

Problem 8

8A

# Load the education data from the project folder and discard its first data row
setwd("C:/Users/joshr/Documents/Machine Learning R")
education <- read.csv(file = "EducationLevel.csv")
education <- education[-1, ]
#A cursory glance at the data shows 10 N/A fields in the last four columns of the data that we will have to replace with a value (we will choose -1 for this value)
#The columns holding those N/A values are the ones whose names contain 'Percent'
percentCols <- grep("Percent", names(education))
# Replace every missing percentage cell with -1, cell by cell. Keying the
# replacement on the first Percent column alone would overwrite valid values
# in that row's other Percent columns and would miss an NA that appears only
# in a later Percent column.
education[percentCols][is.na(education[percentCols])] <- -1
# Keep just the percentage columns for clustering
percentage <- as.data.frame(education[percentCols])

8B

# Fix the RNG state (k-means starts from random centers), then split the
# percentage data into 2 clusters
set.seed(1234)
clusters <- kmeans(percentage, centers = 2)

8C

# Pair each county's FIPS code with its assigned cluster in a data frame named codes
codes <- data.frame(fips = education$FIPS.Code, cluster = clusters$cluster)

8D

# US map filled by each county's cluster value (red = low, green = high),
# with white labels and the legend hidden
plot_usmap(
  data = codes,
  labels = TRUE,
  values = 'cluster',
  label_color = 'white'
) +
  scale_fill_continuous(low = "red", high = "green") +
  theme(legend.position = "none")