library(caret)
library(caretEnsemble)
library(ISLR)
library(randomForest)
library(gbm)
library(e1071)
library(nnet)
library(neuralnet)
library(MASS)
library(ggplot2)

Question 1

A)

# Load the UCI ionosphere data. The file has no header row, so read.csv()
# names the columns V1..V35; V35 is the class label.
ionosphere <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
  header = FALSE
)

# Drop near-zero-variance predictors. nearZeroVar() returns integer(0) when no
# column qualifies, and indexing with -integer(0) selects zero columns, so only
# subset when there is something to remove. drop = FALSE keeps a data frame
# even if a single column survives.
nzv_cols <- nearZeroVar(ionosphere)
if (length(nzv_cols) > 0) {
  ionosphere <- ionosphere[, -nzv_cols, drop = FALSE]
}

ionosphere$V1 <- as.numeric(ionosphere$V1)

# The class label has to be a factor for classification and for
# confusionMatrix(); read.csv() no longer converts strings to factors by
# default (R >= 4.0). This is a no-op when V35 is already a factor.
ionosphere$V35 <- as.factor(ionosphere$V35)

# Reproducible 70/30 train/test split of the ionosphere data, partitioned on
# the class label V35.
set.seed(12345)
partition <- createDataPartition(
  y = ionosphere$V35,
  p = 0.7,
  list = FALSE
)
trainingSet1 <- ionosphere[partition, ]
testingSet1 <- ionosphere[-partition, ]

B)

# Fit knn, lda and rpart on the training set with 10-fold cross-validation,
# keeping the held-out predictions and class probabilities, then summarise the
# resampled performance of the three models.
models1 <- caretList(
  V35 ~ .,
  data = trainingSet1,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("knn", "lda", "rpart")
)
results1 <- resamples(models1)
summary(results1)
## 
## Call:
## summary.resamples(object = results1)
## 
## Models: knn, lda, rpart 
## Number of resamples: 10 
## 
## Accuracy 
##       Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## knn   0.80  0.8000 0.8542 0.8545  0.9075 0.92    0
## lda   0.75  0.8400 0.8800 0.8663  0.9075 0.96    0
## rpart 0.80  0.8762 0.9200 0.9067  0.9596 0.96    0
## 
## Kappa 
##         Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## knn   0.5059  0.5318 0.6497 0.6569  0.7902 0.8175    0
## lda   0.3846  0.6154 0.7261 0.6832  0.7833 0.9110    0
## rpart 0.5763  0.7287 0.8304 0.7992  0.9110 0.9153    0
# Plot the resampling results collected in results1 for the fitted models.
dotplot(results1)

C)

# Pairwise correlation between the models' resampled results.
modelCor(results1)
##             knn       lda     rpart
## knn   1.0000000 0.5603657 0.3114024
## lda   0.5603657 1.0000000 0.4902437
## rpart 0.3114024 0.4902437 1.0000000
# Refit the model list with knn and rpart only (lda is left out); this
# overwrites the three-model list fitted earlier.
models1 <- caretList(
  V35 ~ .,
  data = trainingSet1,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("knn", "rpart")
)

D)

# Stack the base learners in models1 with a glm meta-model, resampled with
# 10-fold cross-validation and selected on Accuracy.
stack1 <- caretStack(
  models1,
  method = "glm",
  metric = "Accuracy",
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  )
)

# Stand-alone knn fit on the same training data, for comparison with the stack.
knn1 <- train(
  V35 ~ .,
  data = trainingSet1,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)

# Stand-alone CART (rpart) fit on the same training data.
cart1 <- train(
  V35 ~ .,
  data = trainingSet1,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
# Training-set confusion matrix for the stacked model.
# NOTE(review): confusionMatrix() takes the predictions as `data` and the true
# labels as `reference`. Here the true labels are passed first, so the
# "Prediction" and "Reference" axes of the printed table are swapped compared
# with the test-set calls in part E. Accuracy and Kappa are unaffected, but the
# per-class rates are not directly comparable. The knn1 and cart1 calls that
# follow use the same argument order.
confusionMatrix(
  data = trainingSet1$V35,
  reference = predict(stack1, trainingSet1)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  82   7
##          g   3 155
##                                           
##                Accuracy : 0.9595          
##                  95% CI : (0.9268, 0.9804)
##     No Information Rate : 0.6559          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9113          
##  Mcnemar's Test P-Value : 0.3428          
##                                           
##             Sensitivity : 0.9647          
##             Specificity : 0.9568          
##          Pos Pred Value : 0.9213          
##          Neg Pred Value : 0.9810          
##              Prevalence : 0.3441          
##          Detection Rate : 0.3320          
##    Detection Prevalence : 0.3603          
##       Balanced Accuracy : 0.9607          
##                                           
##        'Positive' Class : b               
## 
# Training-set confusion matrix for knn1 (true labels are passed as `data`,
# the model's predictions as `reference`).
confusionMatrix(
  data = trainingSet1$V35,
  reference = predict(knn1, trainingSet1)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  59  30
##          g   3 155
##                                           
##                Accuracy : 0.8664          
##                  95% CI : (0.8175, 0.9062)
##     No Information Rate : 0.749           
##     P-Value [Acc > NIR] : 4.287e-06       
##                                           
##                   Kappa : 0.6896          
##  Mcnemar's Test P-Value : 6.011e-06       
##                                           
##             Sensitivity : 0.9516          
##             Specificity : 0.8378          
##          Pos Pred Value : 0.6629          
##          Neg Pred Value : 0.9810          
##              Prevalence : 0.2510          
##          Detection Rate : 0.2389          
##    Detection Prevalence : 0.3603          
##       Balanced Accuracy : 0.8947          
##                                           
##        'Positive' Class : b               
## 
# Training-set confusion matrix for cart1 (true labels are passed as `data`,
# the model's predictions as `reference`).
confusionMatrix(
  data = trainingSet1$V35,
  reference = predict(cart1, trainingSet1)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   b   g
##          b  80   9
##          g  11 147
##                                           
##                Accuracy : 0.919           
##                  95% CI : (0.8777, 0.9498)
##     No Information Rate : 0.6316          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8252          
##  Mcnemar's Test P-Value : 0.8231          
##                                           
##             Sensitivity : 0.8791          
##             Specificity : 0.9423          
##          Pos Pred Value : 0.8989          
##          Neg Pred Value : 0.9304          
##              Prevalence : 0.3684          
##          Detection Rate : 0.3239          
##    Detection Prevalence : 0.3603          
##       Balanced Accuracy : 0.9107          
##                                           
##        'Positive' Class : b               
## 

On the training data, the stacked model has a higher accuracy (0.9595) than either the knn model (0.8664) or the CART model (0.9190).

E)

# Held-out performance of the stacked model: predict on the test set and
# compare against the true class labels.
predicted_stack1 <- predict(stack1, newdata = testingSet1)
confusionMatrix(
  data = predicted_stack1,
  reference = testingSet1$V35
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 29  3
##          g  8 64
##                                          
##                Accuracy : 0.8942         
##                  95% CI : (0.8186, 0.946)
##     No Information Rate : 0.6442         
##     P-Value [Acc > NIR] : 5.657e-09      
##                                          
##                   Kappa : 0.7621         
##  Mcnemar's Test P-Value : 0.2278         
##                                          
##             Sensitivity : 0.7838         
##             Specificity : 0.9552         
##          Pos Pred Value : 0.9062         
##          Neg Pred Value : 0.8889         
##              Prevalence : 0.3558         
##          Detection Rate : 0.2788         
##    Detection Prevalence : 0.3077         
##       Balanced Accuracy : 0.8695         
##                                          
##        'Positive' Class : b              
## 
# Held-out performance of the stand-alone knn model.
predicted_knn1 <- predict(knn1, newdata = testingSet1)
confusionMatrix(
  data = predicted_knn1,
  reference = testingSet1$V35
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 19  2
##          g 18 65
##                                           
##                Accuracy : 0.8077          
##                  95% CI : (0.7187, 0.8784)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 0.0002079       
##                                           
##                   Kappa : 0.5355          
##  Mcnemar's Test P-Value : 0.0007962       
##                                           
##             Sensitivity : 0.5135          
##             Specificity : 0.9701          
##          Pos Pred Value : 0.9048          
##          Neg Pred Value : 0.7831          
##              Prevalence : 0.3558          
##          Detection Rate : 0.1827          
##    Detection Prevalence : 0.2019          
##       Balanced Accuracy : 0.7418          
##                                           
##        'Positive' Class : b               
## 
# Held-out performance of the stand-alone CART model.
predicted_cart1 <- predict(cart1, newdata = testingSet1)
confusionMatrix(
  data = predicted_cart1,
  reference = testingSet1$V35
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  b  g
##          b 30  5
##          g  7 62
##                                           
##                Accuracy : 0.8846          
##                  95% CI : (0.8071, 0.9389)
##     No Information Rate : 0.6442          
##     P-Value [Acc > NIR] : 2.487e-08       
##                                           
##                   Kappa : 0.7452          
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.8108          
##             Specificity : 0.9254          
##          Pos Pred Value : 0.8571          
##          Neg Pred Value : 0.8986          
##              Prevalence : 0.3558          
##          Detection Rate : 0.2885          
##    Detection Prevalence : 0.3365          
##       Balanced Accuracy : 0.8681          
##                                           
##        'Positive' Class : b               
## 

On the testing data, the stacked model's accuracy (0.8942) is still better than that of the other two models (knn: 0.8077, CART: 0.8846).

Question 2

A)

# Build training and testing data frames from the Khan data set: the
# predictor matrix becomes the columns and the class labels are appended as
# a factor column named `response`.
data("Khan")
trainingSet2 <- data.frame(Khan$xtrain, response = as.factor(Khan$ytrain))
testingSet2 <- data.frame(Khan$xtest, response = as.factor(Khan$ytest))

# Seed the RNG once before the models in part B are trained.
set.seed(12345)

B)

# Train four classifiers on the Khan training set, each resampled with its own
# 10-fold cross-validation: CART, random forest, gradient boosting and a
# linear-kernel SVM.
cart2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
RF2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
GBM2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 10)
)
SVM2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10)
)

C)

# Training-set confusion matrix for cart2.
# NOTE(review): the true labels are passed as `data` and the predictions as
# `reference`, the reverse of the test-set calls in part D. cart2 never
# predicts class 1 on the training data, so with the predictions in the
# `reference` slot the class-1 prevalence is 0 and its sensitivity is NA.
# The RF2, GBM2 and SVM2 calls that follow use the same argument order.
confusionMatrix(
  data = trainingSet2$response,
  reference = predict(cart2, trainingSet2)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  0  0  7  1
##          2  0 22  1  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8571          
##                  95% CI : (0.7461, 0.9325)
##     No Information Rate : 0.3492          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7977          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity                NA   1.0000   0.6000   0.9524
## Specificity             0.873   0.9756   1.0000   1.0000
## Pos Pred Value             NA   0.9565   1.0000   1.0000
## Neg Pred Value             NA   1.0000   0.8431   0.9767
## Prevalence              0.000   0.3492   0.3175   0.3333
## Detection Rate          0.000   0.3492   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy          NA   0.9878   0.8000   0.9762
# Training-set confusion matrix for RF2 (true labels are passed as `data`,
# the model's predictions as `reference`).
confusionMatrix(
  data = trainingSet2$response,
  reference = predict(RF2, trainingSet2)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000
# Training-set confusion matrix for GBM2 (true labels are passed as `data`,
# the model's predictions as `reference`).
confusionMatrix(
  data = trainingSet2$response,
  reference = predict(GBM2, trainingSet2)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000
# Training-set confusion matrix for SVM2 (true labels are passed as `data`,
# the model's predictions as `reference`).
confusionMatrix(
  data = trainingSet2$response,
  reference = predict(SVM2, trainingSet2)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  8  0  0  0
##          2  0 23  0  0
##          3  0  0 12  0
##          4  0  0  0 20
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.3651     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity             1.000   1.0000   1.0000   1.0000
## Specificity             1.000   1.0000   1.0000   1.0000
## Pos Pred Value          1.000   1.0000   1.0000   1.0000
## Neg Pred Value          1.000   1.0000   1.0000   1.0000
## Prevalence              0.127   0.3651   0.1905   0.3175
## Detection Rate          0.127   0.3651   0.1905   0.3175
## Detection Prevalence    0.127   0.3651   0.1905   0.3175
## Balanced Accuracy       1.000   1.0000   1.0000   1.0000

The accuracy for each model is 1, except for CART which has an accuracy of 0.8571. We expect a high accuracy since this is the training data. The accuracy measure for the testing data will be a better indicator of how well the models work.

D)

# Held-out performance of the CART model on the Khan test set.
predicted_cart2 <- predict(cart2, newdata = testingSet2)
confusionMatrix(
  data = predicted_cart2,
  reference = testingSet2$response
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 0 0 0 0
##          2 0 4 0 1
##          3 3 1 5 1
##          4 0 1 1 3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.3605, 0.8088)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 0.005138        
##                                           
##                   Kappa : 0.4386          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              0.00   0.6667   0.8333   0.6000
## Specificity              1.00   0.9286   0.6429   0.8667
## Pos Pred Value            NaN   0.8000   0.5000   0.6000
## Neg Pred Value           0.85   0.8667   0.9000   0.8667
## Prevalence               0.15   0.3000   0.3000   0.2500
## Detection Rate           0.00   0.2000   0.2500   0.1500
## Detection Prevalence     0.00   0.2500   0.5000   0.2500
## Balanced Accuracy        0.50   0.7976   0.7381   0.7333
# Held-out performance of the random forest on the Khan test set.
predicted_RF2 <- predict(RF2, newdata = testingSet2)
confusionMatrix(
  data = predicted_RF2,
  reference = testingSet2$response
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 0 0
##          3 0 0 5 0
##          4 0 0 1 5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.7513, 0.9987)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 1.662e-09       
##                                           
##                   Kappa : 0.9322          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00      1.0   0.8333   1.0000
## Specificity              1.00      1.0   1.0000   0.9333
## Pos Pred Value           1.00      1.0   1.0000   0.8333
## Neg Pred Value           1.00      1.0   0.9333   1.0000
## Prevalence               0.15      0.3   0.3000   0.2500
## Detection Rate           0.15      0.3   0.2500   0.2500
## Detection Prevalence     0.15      0.3   0.2500   0.3000
## Balanced Accuracy        1.00      1.0   0.9167   0.9667
# Held-out performance of the gradient boosting model on the Khan test set.
predicted_GBM2 <- predict(GBM2, newdata = testingSet2)
confusionMatrix(
  data = predicted_GBM2,
  reference = testingSet2$response
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 5 0 0
##          3 0 0 6 0
##          4 0 1 0 5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.7513, 0.9987)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 1.662e-09       
##                                           
##                   Kappa : 0.9322          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00   0.8333      1.0   1.0000
## Specificity              1.00   1.0000      1.0   0.9333
## Pos Pred Value           1.00   1.0000      1.0   0.8333
## Neg Pred Value           1.00   0.9333      1.0   1.0000
## Prevalence               0.15   0.3000      0.3   0.2500
## Detection Rate           0.15   0.2500      0.3   0.2500
## Detection Prevalence     0.15   0.2500      0.3   0.3000
## Balanced Accuracy        1.00   0.9167      1.0   0.9667
# Held-out performance of the linear SVM on the Khan test set.
predicted_SVM2 <- predict(SVM2, newdata = testingSet2)
confusionMatrix(
  data = predicted_SVM2,
  reference = testingSet2$response
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 1 2 3 4
##          1 3 0 0 0
##          2 0 6 2 0
##          3 0 0 4 0
##          4 0 0 0 5
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.683, 0.9877)
##     No Information Rate : 0.3            
##     P-Value [Acc > NIR] : 3.773e-08      
##                                          
##                   Kappa : 0.8639         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity              1.00   1.0000   0.6667     1.00
## Specificity              1.00   0.8571   1.0000     1.00
## Pos Pred Value           1.00   0.7500   1.0000     1.00
## Neg Pred Value           1.00   1.0000   0.8750     1.00
## Prevalence               0.15   0.3000   0.3000     0.25
## Detection Rate           0.15   0.3000   0.2000     0.25
## Detection Prevalence     0.15   0.4000   0.2000     0.25
## Balanced Accuracy        1.00   0.9286   0.8333     1.00

Each of the models is a fairly good predictor except CART, which only has an accuracy of 0.6. The random forest and gradient boosting machine models were the best predictors, each with an accuracy of 0.95.

Question 3

A)

# Read the buildings data from a local CSV with a header row.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where that file exists.
buildings <- read.csv(
  "C:\\Users\\Denise\\Documents/buildings.csv",
  header = TRUE
)

# Remove the X and X.1 columns.
buildings[["X"]] <- NULL
buildings[["X.1"]] <- NULL

# Drop rows 769 through 1296 by position.
buildings <- buildings[-(769:1296), ]
# Min-max rescale a numeric vector to the interval [0, 1].
#
# Args:
#   x:     numeric vector to rescale.
#   na.rm: if TRUE, ignore NA values when finding the minimum and maximum
#          (NA entries stay NA in the result). Defaults to FALSE, in which
#          case any NA in `x` makes the whole result NA.
#
# Returns:
#   A vector the same length as `x`. When every value of `x` is identical the
#   range is zero; the result is then all zeros rather than 0/0 = NaN.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Constant input: avoid dividing by zero.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Apply normalize() to every column of buildings and collect the results in a
# new data frame.
newbuildings <- data.frame(lapply(buildings, normalize))

# Reproducible 70/30 train/test split, partitioned on the response Y1.
set.seed(12345)
partition <- createDataPartition(
  y = newbuildings$Y1,
  p = 0.7,
  list = FALSE
)
trainingSet3 <- newbuildings[partition, ]
testingSet3 <- newbuildings[-partition, ]

B)

# Neural network (nnet) for Y1 using every column except Y2 as a predictor,
# resampled with 10-fold cross-validation.
NN3b <- train(
  Y1 ~ . - Y2,
  data = trainingSet3,
  method = "nnet",
  trControl = trainControl(method = "cv", number = 10)
)

C)

# Squared correlation between the test-set predictions and the observed Y1.
predicted_NN3b <- predict(NN3b, newdata = testingSet3)
cor(x = predicted_NN3b, y = testingSet3$Y1)^2
##          [,1]
## [1,] 0.986851

D)

# Two-output network (Y1 and Y2) on the eight predictors, with neuralnet's
# default hidden layout.
NN3d <- neuralnet(
  Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  data = trainingSet3
)
plot(NN3d, rep = "best")

# Predict on the first eight columns of the test set.
predicted_NN3d <- compute(NN3d, testingSet3[, 1:8])
# NOTE(review): net.result holds one column per output, so this correlates
# both columns with Y1 (hence two rows in the printed result). Unlike parts C
# and E, the correlation is not squared here.
cor(x = predicted_NN3d$net.result, y = testingSet3$Y1)
##              [,1]
## [1,] 0.9529890362
## [2,] 0.9529890362
# Correlation of both network output columns with the observed Y2 (not squared).
cor(x = predicted_NN3d$net.result, y = testingSet3$Y2)
##              [,1]
## [1,] 0.9340293044
## [2,] 0.9340293044

E)

# Same two-output network, now with two hidden layers (2 neurons, then 1).
NN3e <- neuralnet(
  Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  data = trainingSet3,
  hidden = c(2, 1)
)
plot(NN3e, rep = "best")

# Predict on the first eight columns of the test set and report the squared
# correlation of the network outputs with the observed Y1.
predicted_NN3e <- compute(NN3e, testingSet3[, 1:8])
cor(x = predicted_NN3e$net.result, y = testingSet3$Y1)^2
##              [,1]
## [1,] 0.9765887975
## [2,] 0.9765887975
# Squared correlation of the network outputs with the observed Y2.
cor(x = predicted_NN3e$net.result, y = testingSet3$Y2)^2
##              [,1]
## [1,] 0.9438385522
## [2,] 0.9438385522

Question 4

A)

# Load the Boston data set and fix the RNG seed before the networks below are
# fitted.
data("Boston")
set.seed(12345)

B)

# Network for medv ~ lstat on the raw (unscaled) Boston data, with
# neuralnet's default hidden layout.
NN4b <- neuralnet(medv ~ lstat, data = Boston)

# Grid of lstat values over which the fitted curve is drawn.
xlist <- seq(0, 40, 0.2)

# compute() is documented to take a data frame or matrix of covariates. A bare
# vector has no columns and may be rejected by newer neuralnet releases, so
# the grid is wrapped in a one-column data frame named after the predictor.
predicted_NN4b <- compute(NN4b, data.frame(lstat = xlist))

# Scatter of the data with the fitted curve overlaid in red.
plot(Boston$lstat, Boston$medv)
lines(xlist, predicted_NN4b$net.result, col = "red")

The fit of this model is terrible. The model appears linear while the data clearly is not.

C)

# Apply normalize() to every column of Boston and collect the results in a new
# data frame.
normalBoston<- data.frame(lapply(Boston, normalize))

D)

# Same medv ~ lstat network, fitted on the normalized Boston data.
NN4d <- neuralnet(medv ~ lstat, data = normalBoston)

# Grid of normalized lstat values over which the fitted curve is drawn.
xlist <- seq(0, 1, 0.02)

# compute() is documented to take a data frame or matrix of covariates. A bare
# vector has no columns and may be rejected by newer neuralnet releases, so
# the grid is wrapped in a one-column data frame named after the predictor.
predicted_NN4d <- compute(NN4d, data.frame(lstat = xlist))

# Scatter of the normalized data with the fitted curve overlaid in red.
plot(normalBoston$lstat, normalBoston$medv)
lines(xlist, predicted_NN4d$net.result, col = "red")

The model is much better after we normalized the data. It appears to be a good fit.

E)

# Draw the fitted network NN4d; the equation below is read off this plot.
plot(NN4d, rep="best")

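y = 0.99799 + 6.0366 * S(2.70587 - 2.53167x)
where y = medv and x = lstat (both normalized to [0, 1]), and S is the hidden unit's activation function, i.e. the logistic sigmoid S(z) = 1 / (1 + e^(-z)), which is neuralnet's default.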

F)

# medv ~ lstat on the normalized data with two hidden layers of two neurons.
NN4f <- neuralnet(medv ~ lstat, data = normalBoston, hidden = c(2, 2))

# Reuses the xlist grid defined in part D. compute() is documented to take a
# data frame or matrix of covariates, so the grid is wrapped in a one-column
# data frame named after the predictor instead of being passed as a bare vector.
predicted_NN4f <- compute(NN4f, data.frame(lstat = xlist))

# Scatter of the normalized data with the fitted curve overlaid in blue.
plot(normalBoston$lstat, normalBoston$medv)
lines(xlist, predicted_NN4f$net.result, col = "blue")

The fit of this model is also very good. The fitted curve differs slightly in shape from that of the model with only 1 hidden layer.