Problem 1

A.

# Load the ionosphere data; the file has no header row, so read.csv() names
# the columns V1, V2, ... automatically.
iono <- read.csv(file = "/Users/alex/Dropbox/College/4-Senior/Machine Learning/Project5/ionospheredata.csv", header = FALSE, sep = ",")

# Drop near-zero-variance columns. nearZeroVar() returns integer(0) when no
# column qualifies, and iono[-integer(0)] would select *zero* columns, so only
# subset when there is something to remove.
cols <- nearZeroVar(iono)
if (length(cols) > 0) {
  iono <- iono[-cols]
}
iono$V1 <- as.numeric(as.character(iono$V1))

# Numeric seed: avoids relying on implicit string-to-integer coercion.
set.seed(12345)

# Partition on the class column V35: p = 0.7 of the rows go to training,
# the remaining rows to testing.
dp <- createDataPartition(iono$V35, p = 0.7, list = FALSE)
training <- iono[dp, ]
testing <- iono[-dp, ]

Read in data, removed any columns with near zero variance, set the first variable to numeric, set the seed, and created training and testing partitions.

B.

# Fit kNN, LDA and CART on every remaining predictor with 10-fold CV, keeping
# the hold-out predictions and class probabilities of each fit.
models1 <- caretList(
  V35 ~ .,
  data = training,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("knn", "lda", "rpart")
)

# Gather the per-fold metrics of the three fits and print their summary.
results <- resamples(models1)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: knn, lda, rpart 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## knn   0.7600000 0.8100000 0.8575000 0.8501667 0.87875 0.92    0
## lda   0.7600000 0.8350000 0.8750000 0.8703333 0.91000 1.00    0
## rpart 0.7083333 0.8891667 0.9183333 0.8941667 0.92000 0.96    0
## 
## Kappa 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## knn   0.4230769 0.5467033 0.6525199 0.6455658 0.7283935 0.8175182    0
## lda   0.3902439 0.6111632 0.7019704 0.6890054 0.7929140 1.0000000    0
## rpart 0.3913043 0.7624470 0.8157359 0.7747473 0.8324250 0.9110320    0

Accuracies of the three models used in models1 are all relatively similar: the mean cross-validated accuracy is about 85% for kNN, 87% for LDA, and 89% for rpart.

C.

# Pairwise correlation between the models, computed from the resampling
# results collected in `results`.
modelCor(results)
##             knn       lda     rpart
## knn   1.0000000 0.7927249 0.5740615
## lda   0.7927249 1.0000000 0.2232201
## rpart 0.5740615 0.2232201 1.0000000
# Refit the model list with kNN and CART only (no LDA); this overwrites the
# earlier three-model `models1`.
models1 <- caretList(
  V35 ~ .,
  data = training,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("knn", "rpart")
)

Model correlations show that the LDA and kNN models are highly correlated (0.79), whereas rpart is only weakly correlated with LDA (0.22). Reran models1 to exclude the LDA model.

D.

# Stack the models in models1 with a glm meta-model, selected on Accuracy
# under 10-fold CV.
stack1 <- caretStack(models1, method = "glm", metric = "Accuracy", trControl = trainControl(method = "cv", number = 10, savePredictions = TRUE, classProbs = TRUE))
# Stand-alone kNN and CART fits, for comparison with the stack.
kNN1 <- train(V35 ~ ., data = training, method = "knn", trControl = trainControl(method = "cv", number = 10))
CART1 <- train(V35 ~ ., data = training, method = "rpart", trControl = trainControl(method = "cv", number = 10))

# confusionMatrix() takes the predicted classes first (data) and the observed
# classes second (reference). The call used to pass them the other way round:
# Accuracy and Kappa are unaffected by that, but the table is transposed and
# the per-class statistics refer to the wrong axis, so the output recorded
# below must be regenerated.
confusionMatrix(data = predict(stack1, training), reference = training$V35)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  26  63
##          g 157   1
##                                          
##                Accuracy : 0.1093         
##                  95% CI : (0.0733, 0.155)
##     No Information Rate : 0.7409         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.5701        
##  Mcnemar's Test P-Value : 3.609e-10      
##                                          
##             Sensitivity : 0.142077       
##             Specificity : 0.015625       
##          Pos Pred Value : 0.292135       
##          Neg Pred Value : 0.006329       
##              Prevalence : 0.740891       
##          Detection Rate : 0.105263       
##    Detection Prevalence : 0.360324       
##       Balanced Accuracy : 0.078851       
##                                          
##        'Positive' Class : b              
## 
# Predicted classes go first (data), observed classes second (reference).
# The output recorded below was produced with the two swapped, so its table
# is transposed and its per-class statistics need regenerating.
confusionMatrix(data = predict(kNN1, training), reference = training$V35)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  64  25
##          g   3 155
##                                           
##                Accuracy : 0.8866          
##                  95% CI : (0.8403, 0.9233)
##     No Information Rate : 0.7287          
##     P-Value [Acc > NIR] : 1.088e-09       
##                                           
##                   Kappa : 0.7401          
##  Mcnemar's Test P-Value : 7.229e-05       
##                                           
##             Sensitivity : 0.9552          
##             Specificity : 0.8611          
##          Pos Pred Value : 0.7191          
##          Neg Pred Value : 0.9810          
##              Prevalence : 0.2713          
##          Detection Rate : 0.2591          
##    Detection Prevalence : 0.3603          
##       Balanced Accuracy : 0.9082          
##                                           
##        'Positive' Class : b               
## 
# Predicted classes go first (data), observed classes second (reference).
# The output recorded below was produced with the two swapped, so its table
# is transposed and its per-class statistics need regenerating.
confusionMatrix(data = predict(CART1, training), reference = training$V35)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  80   9
##          g  11 147
##                                           
##                Accuracy : 0.919           
##                  95% CI : (0.8777, 0.9498)
##     No Information Rate : 0.6316          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8252          
##  Mcnemar's Test P-Value : 0.8231          
##                                           
##             Sensitivity : 0.8791          
##             Specificity : 0.9423          
##          Pos Pred Value : 0.8989          
##          Neg Pred Value : 0.9304          
##              Prevalence : 0.3684          
##          Detection Rate : 0.3239          
##    Detection Prevalence : 0.3603          
##       Balanced Accuracy : 0.9107          
##                                           
##        'Positive' Class : b               
## 

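The accuracy of the stacked model (stack1) is very low at 10.93%, while kNN1 and CART1 both have high accuracies at 88.66% and 91.9% respectively. Because this is a two-class problem, an accuracy this far below 50% (Kappa = -0.57) means the stack's predicted labels are almost perfectly inverted relative to the truth; this points to a class-label mix-up in the stacked predictions rather than a genuinely weak model, since flipping the predicted labels would give roughly 89% accuracy.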

E.

# Score the stacked model on the held-out rows and compare the predicted
# classes (data) with the observed classes (reference).
predicted.stack1 <- predict(stack1, newdata = testing)
confusionMatrix(data = predicted.stack1, reference = testing$V35)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 14 67
##          g 23  0
##                                           
##                Accuracy : 0.1346          
##                  95% CI : (0.0756, 0.2155)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.4909         
##  Mcnemar's Test P-Value : 5.826e-06       
##                                           
##             Sensitivity : 0.3784          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.1728          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.3558          
##          Detection Rate : 0.1346          
##    Detection Prevalence : 0.7788          
##       Balanced Accuracy : 0.1892          
##                                           
##        'Positive' Class : b               
## 
# Score the stand-alone kNN fit on the held-out rows and compare the predicted
# classes (data) with the observed classes (reference).
predicted.kNN1 <- predict(kNN1, newdata = testing)
confusionMatrix(data = predicted.kNN1, reference = testing$V35)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 20  2
##          g 17 65
##                                           
##                Accuracy : 0.8173          
##                  95% CI : (0.7295, 0.8863)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 8.509e-05       
##                                           
##                   Kappa : 0.5617          
##  Mcnemar's Test P-Value : 0.001319        
##                                           
##             Sensitivity : 0.5405          
##             Specificity : 0.9701          
##          Pos Pred Value : 0.9091          
##          Neg Pred Value : 0.7927          
##              Prevalence : 0.3558          
##          Detection Rate : 0.1923          
##    Detection Prevalence : 0.2115          
##       Balanced Accuracy : 0.7553          
##                                           
##        'Positive' Class : b               
## 
# Score the stand-alone CART fit on the held-out rows and compare the
# predicted classes (data) with the observed classes (reference).
predicted.CART1 <- predict(CART1, newdata = testing)
confusionMatrix(data = predicted.CART1, reference = testing$V35)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 30  5
##          g  7 62
##                                           
##                Accuracy : 0.8846          
##                  95% CI : (0.8071, 0.9389)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 2.487e-08       
##                                           
##                   Kappa : 0.7452          
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.8108          
##             Specificity : 0.9254          
##          Pos Pred Value : 0.8571          
##          Neg Pred Value : 0.8986          
##              Prevalence : 0.3558          
##          Detection Rate : 0.2885          
##    Detection Prevalence : 0.3365          
##       Balanced Accuracy : 0.8681          
##                                           
##        'Positive' Class : b               
## 

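When the stack1, kNN1, and CART1 models are applied to the testing data we see similar results: stack1's accuracy is again the lowest at 13.46%, while kNN1's and CART1's accuracies dropped slightly to 81.73% and 88.46% respectively. As on the training data, a two-class accuracy this far below 50% indicates that stack1's predicted labels are inverted (flipping them would give about 86.5%), not that the stacked model carries no signal.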

Problem 2

A.

# Load the Khan data set, which supplies xtrain/ytrain and xtest/ytest.
data("Khan")

# Training frame: the predictor matrix plus the class labels stored as a
# factor column named `response`.
training <- data.frame(Khan$xtrain)
training[["response"]] <- as.factor(Khan$ytrain)

# Testing frame, built the same way from the test split.
testing <- data.frame(Khan$xtest)
testing[["response"]] <- as.factor(Khan$ytest)

set.seed(12345)

Read in the data from ISLR package, added the response variables to the training and testing partitions and set the seed.

B.

# Four classifiers on the same formula, each tuned with its own 10-fold CV.
# The fits stay in this order because they draw from the same random stream.

# Classification tree.
CART2 <- train(
  response ~ .,
  data = training,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)

# Random forest.
RF2 <- train(
  response ~ .,
  data = training,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)

# Gradient boosting machine.
GBM2 <- train(
  response ~ .,
  data = training,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 10)
)

# Linear-kernel support vector machine.
SVM2 <- train(
  response ~ .,
  data = training,
  method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10)
)

Built four different models: CART, random forest, gradient boosting machine, and SVM.

C.

# Predicted classes go first (data), observed classes second (reference).
# The output recorded below was produced with the two swapped, which is why
# it reports a prevalence of 0 and NA sensitivity for class 1; it needs
# regenerating.
confusionMatrix(data = predict(CART2, training), reference = training$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  0  0  7  1
##          2  0 22  1  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8571          
##                  95% CI : (0.7461, 0.9325)
##     No Information Rate : 0.3492          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7977          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity                NA   1.0000   0.6000   0.9524
## Specificity             0.873   0.9756   1.0000   1.0000
## Pos Pred Value             NA   0.9565   1.0000   1.0000
## Neg Pred Value             NA   1.0000   0.8431   0.9767
## Prevalence              0.000   0.3492   0.3175   0.3333
## Detection Rate          0.000   0.3492   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy          NA   0.9878   0.8000   0.9762
# Predicted classes go first (data), observed classes second (reference);
# the earlier call had them swapped.
confusionMatrix(data = predict(RF2, training), reference = training$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000
# Predicted classes go first (data), observed classes second (reference);
# the earlier call had them swapped.
confusionMatrix(data = predict(GBM2, training), reference = training$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000
# Predicted classes go first (data), observed classes second (reference);
# the earlier call had them swapped.
confusionMatrix(data = predict(SVM2, training), reference = training$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000

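The CART2 model is the only one of the four models that did not have an accuracy of 100% when applied to the training data. CART2 had an accuracy of 85.71%; it never predicts class 1, assigning all eight class-1 samples to class 3 (seven) or class 4 (one).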

D.

# Score the CART fit on the test split: predicted classes (data) against
# observed classes (reference).
predicted.CART2 <- predict(CART2, newdata = testing)
confusionMatrix(data = predicted.CART2, reference = testing$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 0 0 0 0
##          2 0 4 0 1
##          3 3 1 5 1
##          4 0 1 1 3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.3605, 0.8088)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 0.005138        
##                                           
##                   Kappa : 0.4386          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              0.00   0.6667   0.8333   0.6000
## Specificity              1.00   0.9286   0.6429   0.8667
## Pos Pred Value            NaN   0.8000   0.5000   0.6000
## Neg Pred Value           0.85   0.8667   0.9000   0.8667
## Prevalence               0.15   0.3000   0.3000   0.2500
## Detection Rate           0.00   0.2000   0.2500   0.1500
## Detection Prevalence     0.00   0.2500   0.5000   0.2500
## Balanced Accuracy        0.50   0.7976   0.7381   0.7333
# Score the random forest on the test split: predicted classes (data) against
# observed classes (reference).
predicted.RF2 <- predict(RF2, newdata = testing)
confusionMatrix(data = predicted.RF2, reference = testing$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 1 0
##          3 0 0 5 0
##          4 0 0 0 5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.7513, 0.9987)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 1.662e-09       
##                                           
##                   Kappa : 0.932           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00   1.0000   0.8333     1.00
## Specificity              1.00   0.9286   1.0000     1.00
## Pos Pred Value           1.00   0.8571   1.0000     1.00
## Neg Pred Value           1.00   1.0000   0.9333     1.00
## Prevalence               0.15   0.3000   0.3000     0.25
## Detection Rate           0.15   0.3000   0.2500     0.25
## Detection Prevalence     0.15   0.3500   0.2500     0.25
## Balanced Accuracy        1.00   0.9643   0.9167     1.00
# Score the boosted model on the test split: predicted classes (data) against
# observed classes (reference).
predicted.GBM2 <- predict(GBM2, newdata = testing)
confusionMatrix(data = predicted.GBM2, reference = testing$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 0 0
##          3 0 0 6 0
##          4 0 0 0 5
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8316, 1)
##     No Information Rate : 0.3        
##     P-Value [Acc > NIR] : 3.487e-11  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00      1.0      1.0     1.00
## Specificity              1.00      1.0      1.0     1.00
## Pos Pred Value           1.00      1.0      1.0     1.00
## Neg Pred Value           1.00      1.0      1.0     1.00
## Prevalence               0.15      0.3      0.3     0.25
## Detection Rate           0.15      0.3      0.3     0.25
## Detection Prevalence     0.15      0.3      0.3     0.25
## Balanced Accuracy        1.00      1.0      1.0     1.00
# Score the linear SVM on the test split: predicted classes (data) against
# observed classes (reference).
predicted.SVM2 <- predict(SVM2, newdata = testing)
confusionMatrix(data = predicted.SVM2, reference = testing$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 2 0
##          3 0 0 4 0
##          4 0 0 0 5
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.683, 0.9877)
##     No Information Rate : 0.3            
##     P-Value [Acc > NIR] : 3.773e-08      
##                                          
##                   Kappa : 0.8639         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00   1.0000   0.6667     1.00
## Specificity              1.00   0.8571   1.0000     1.00
## Pos Pred Value           1.00   0.7500   1.0000     1.00
## Neg Pred Value           1.00   1.0000   0.8750     1.00
## Prevalence               0.15   0.3000   0.3000     0.25
## Detection Rate           0.15   0.3000   0.2000     0.25
## Detection Prevalence     0.15   0.4000   0.2000     0.25
## Balanced Accuracy        1.00   0.9286   0.8333     1.00

The CART2 model once again had the lowest accuracy of the four when applied to the testing data. CART2’s accuracy was 60% while RF2’s was 95%, GBM2’s was 100%, and SVM2’s was 90%.

Problem 3

A.

# Min-max rescale a numeric vector onto [0, 1].
#
# x      Numeric vector.
# na.rm  If TRUE, NA values are ignored when finding the range and stay NA in
#        the result. The default (FALSE) keeps the previous behaviour, where
#        any NA makes the whole result NA.
#
# Returns a vector the same length as x. A vector whose values are all equal
# (zero range) is mapped to 0 rather than the NaN that 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # isTRUE() keeps an NA/NaN span on the ordinary path below.
  if (isTRUE(span == 0)) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

# Read the energy data (first row is the header) and rescale every column,
# predictors and responses alike, with normalize().
energy <- read.csv(file = "/Users/alex/Dropbox/College/4-Senior/Machine Learning/Project5/ENB2012_data.csv", header = TRUE, sep = ",")
energy <- as.data.frame(lapply(energy, normalize))

# Numeric seed, consistent with the set.seed(12345) call used for the Khan data.
set.seed(12345)

# Partition on Y1: p = 0.7 of the rows go to training, the rest to testing.
dp <- createDataPartition(energy$Y1, p = 0.7, list = FALSE)
training <- energy[dp, ]
testing <- energy[-dp, ]

Created a normalize function, read in the data and normalized it. Set the seed and created training and testing partitions.

B.

# Neural network (caret method "nnet") for Y1 on X1..X8; trace = FALSE is
# forwarded to the fitting routine to silence its progress output.
NN3b <- train(
  Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  data = training,
  method = "nnet",
  trace = FALSE
)

Created a neural network model using Y1 as the response and X1 through X8 as the predictors.

C.

# Squared correlation between NN3b's predictions on the test rows and the
# observed Y1, reported as R^2.
cor(predict(NN3b, testing), testing$Y1)^2
##           [,1]
## [1,] 0.9932561

Calculated R² (the squared correlation between predicted and observed Y1) for NN3b on the testing data; as seen above, it is 0.9932561.

D.

# One network with two outputs (Y1 and Y2), X1..X8 as inputs and a single
# hidden unit (hidden = 1).
NN3d <- neuralnet(Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = training, hidden = 1)
plot(NN3d, rep = "best")

# Squared correlation on the test rows, one output at a time: column 1 of
# net.result is compared with Y1, column 2 with Y2.
# NOTE(review): testing[, 1:8] is presumably X1..X8 -- confirm the column order.
cor(compute(NN3d, testing[, 1:8])$net.result[ , 1], testing$Y1)^2
## [1] 0.9076677614
cor(compute(NN3d, testing[, 1:8])$net.result[ , 2], testing$Y2)^2
## [1] 0.8716707806

R2 for Y1 on NN3d with the testing data was 0.9076678
R2 for Y2 on NN3d with the testing data was 0.8716708

E.

# Same two-output model with hidden = c(2, 1), i.e. two hidden layers of 2 and
# 1 units; stepmax is set explicitly to 225000 training steps.
NN3e <- neuralnet(Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = training, hidden = c(2, 1), stepmax = 225000)
plot(NN3e, rep = "best")

# Squared correlation on the test rows, one output at a time: column 1 of
# net.result is compared with Y1, column 2 with Y2.
# NOTE(review): testing[, 1:8] is presumably X1..X8 -- confirm the column order.
cor(compute(NN3e, testing[, 1:8])$net.result[ , 1], testing$Y1)^2
## [1] 0.9759201642
cor(compute(NN3e, testing[, 1:8])$net.result[ , 2], testing$Y2)^2
## [1] 0.9429601469

R2 for Y1 on NN3e with the testing data was 0.9759202
R2 for Y2 on NN3e with the testing data was 0.9429601

Problem 4

A.

# Load the Boston housing data and fix the random stream for the fits below.
data("Boston")
# Numeric seed, consistent with the set.seed(12345) call used for the Khan data.
set.seed(12345)

Read in data and set the seed.

B.

# Network for medv on lstat, fitted to the raw (unscaled) Boston data with
# neuralnet's default settings.
NN4b <- neuralnet(medv ~ lstat, data = Boston)

# Evaluate the fitted network on an even grid of lstat values from 0 to 40.
xList <- seq(from = 0, to = 40, by = 0.2)
predicted.NN4b <- compute(NN4b, xList)

# Scatter of the data with the fitted curve drawn on top.
plot(x = Boston$lstat, y = Boston$medv, pch = 20)
lines(x = xList, y = predicted.NN4b$net.result, col = "green")

The fit of the line is poor: trained on the raw, unnormalized data, the network's curve does not follow the shape of the data at all.

C.

# Apply normalize() to every column of Boston and reassemble the results
# into a data frame.
normalBoston <- data.frame(lapply(Boston, normalize))

Normalized the data.

D.

# Same medv ~ lstat network, now fitted to the normalized data.
NN4d <- neuralnet(medv ~ lstat, data = normalBoston)

# Evaluate the fitted network on an even grid of lstat values from 0 to 1.
xList <- seq(from = 0, to = 1, by = 0.02)
predicted.NN4d <- compute(NN4d, xList)

# Scatter of the normalized data with the fitted curve drawn on top.
plot(x = normalBoston$lstat, y = normalBoston$medv, pch = 20)
lines(x = xList, y = predicted.NN4d$net.result, col = "red")

This curve for NN4d fits the data much better, taking a similar shape as the data shown in the graph.

E.

# Draw the NN4d network diagram, which shows the weights used in the equation below.
plot(NN4d, rep = "best")

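\(y = 2.70587 - 2.53167\,S(0.99799 + 6.0366\,x)\)
y = medv (min-max normalized)
x = lstat (min-max normalized)
S = activation function of the hidden unit (neuralnet's default is the logistic function, \(S(z) = 1/(1 + e^{-z})\))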

F.

# medv ~ lstat on the normalized data with hidden = c(2, 2), i.e. two hidden
# layers of two units each.
NN4f <- neuralnet(medv ~ lstat, data = normalBoston, hidden = c(2, 2))

# Evaluate the fitted network on an even grid of lstat values from 0 to 1.
xList <- seq(from = 0, to = 1, by = 0.02)
predicted.NN4f <- compute(NN4f, xList)

# Scatter of the normalized data with the fitted curve drawn on top.
plot(x = normalBoston$lstat, y = normalBoston$medv, pch = 20)
lines(x = xList, y = predicted.NN4f$net.result, col = "blue")

The fit of the line for NN4f appears rather similar to that of NN4d. It is not necessarily better or worse, but it is a bit different, most notably in the curl to the left as medv approaches 1.