library(caret)
library(caretEnsemble)
library(ISLR)
library(randomForest)
library(gbm)
library(e1071)
library(nnet)
library(neuralnet)
library(MASS)
library(ggplot2)
# Load the UCI ionosphere data. The file has no header row, so read.csv()
# names the columns V1..V35; V35 is the class label used as the outcome below.
ionosphere <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
  header = FALSE
)

# Drop near-zero-variance columns. Guard the empty case: negating an empty
# index vector selects zero columns and would wipe out the whole data frame.
nzv_cols <- nearZeroVar(ionosphere)
if (length(nzv_cols) > 0) {
  ionosphere <- ionosphere[, -nzv_cols, drop = FALSE]
}
ionosphere$V1 <- as.numeric(ionosphere$V1)

# Since R 4.0 read.csv() keeps strings as character; the classification
# models and confusionMatrix() calls below need a factor outcome.
ionosphere$V35 <- factor(ionosphere$V35)

# 70/30 train/test split on the class label, seeded for reproducibility.
set.seed(12345)
partition <- createDataPartition(y = ionosphere$V35, p = 0.7, list = FALSE)
trainingSet1 <- ionosphere[partition, ]
testingSet1 <- ionosphere[-partition, ]
# Fit knn, LDA and CART on the training set with 10-fold cross-validation,
# keeping hold-out predictions and class probabilities, then summarise the
# resampling results of the three models.
models1 <- caretList(
  V35 ~ .,
  data = trainingSet1,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("knn", "lda", "rpart")
)
results1 <- resamples(models1)
summary(results1)
##
## Call:
## summary.resamples(object = results1)
##
## Models: knn, lda, rpart
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.80 0.8000 0.8542 0.8545 0.9075 0.92 0
## lda 0.75 0.8400 0.8800 0.8663 0.9075 0.96 0
## rpart 0.80 0.8762 0.9200 0.9067 0.9596 0.96 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.5059 0.5318 0.6497 0.6569 0.7902 0.8175 0
## lda 0.3846 0.6154 0.7261 0.6832 0.7833 0.9110 0
## rpart 0.5763 0.7287 0.8304 0.7992 0.9110 0.9153 0
# Plot the resampled metrics per model, then print the between-model
# correlation of the resampling results.
dotplot(x = results1)
modelCor(x = results1)
## knn lda rpart
## knn 1.0000000 0.5603657 0.3114024
## lda 0.5603657 1.0000000 0.4902437
## rpart 0.3114024 0.4902437 1.0000000
# Rebuild the model list with only knn and CART and stack the two with a GLM
# meta-learner (10-fold CV, optimising Accuracy).
models1 <- caretList(
  V35 ~ .,
  data = trainingSet1,
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  ),
  methodList = c("knn", "rpart")
)
stack1 <- caretStack(
  models1,
  method = "glm",
  metric = "Accuracy",
  trControl = trainControl(
    method = "cv",
    number = 10,
    savePredictions = TRUE,
    classProbs = TRUE
  )
)

# Stand-alone knn and CART fits, used as baselines for the stacked model.
knn1 <- train(
  V35 ~ .,
  data = trainingSet1,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)
cart1 <- train(
  V35 ~ .,
  data = trainingSet1,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)

# Training-set confusion matrix for the stack. confusionMatrix() expects the
# predictions first (data) and the true labels second (reference).
# NOTE: the printed output that follows was generated with the two arguments
# swapped, so its class-wise statistics and No Information Rate reflect the
# swapped roles; Accuracy and Kappa are unaffected. Re-knit to refresh it.
confusionMatrix(
  data = predict(stack1, trainingSet1),
  reference = trainingSet1$V35
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction b g
## b 82 7
## g 3 155
##
## Accuracy : 0.9595
## 95% CI : (0.9268, 0.9804)
## No Information Rate : 0.6559
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9113
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.9647
## Specificity : 0.9568
## Pos Pred Value : 0.9213
## Neg Pred Value : 0.9810
## Prevalence : 0.3441
## Detection Rate : 0.3320
## Detection Prevalence : 0.3603
## Balanced Accuracy : 0.9607
##
## 'Positive' Class : b
##
# Training-set confusion matrix for knn: predictions go first (data), true
# labels second (reference).
# NOTE: the printed output that follows was generated with the two arguments
# swapped; Accuracy and Kappa are unaffected, the class-wise statistics are
# not. Re-knit to refresh it.
confusionMatrix(
  data = predict(knn1, trainingSet1),
  reference = trainingSet1$V35
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction b g
## b 59 30
## g 3 155
##
## Accuracy : 0.8664
## 95% CI : (0.8175, 0.9062)
## No Information Rate : 0.749
## P-Value [Acc > NIR] : 4.287e-06
##
## Kappa : 0.6896
## Mcnemar's Test P-Value : 6.011e-06
##
## Sensitivity : 0.9516
## Specificity : 0.8378
## Pos Pred Value : 0.6629
## Neg Pred Value : 0.9810
## Prevalence : 0.2510
## Detection Rate : 0.2389
## Detection Prevalence : 0.3603
## Balanced Accuracy : 0.8947
##
## 'Positive' Class : b
##
# Training-set confusion matrix for CART: predictions go first (data), true
# labels second (reference).
# NOTE: the printed output that follows was generated with the two arguments
# swapped; Accuracy and Kappa are unaffected, the class-wise statistics are
# not. Re-knit to refresh it.
confusionMatrix(
  data = predict(cart1, trainingSet1),
  reference = trainingSet1$V35
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction b g
## b 80 9
## g 11 147
##
## Accuracy : 0.919
## 95% CI : (0.8777, 0.9498)
## No Information Rate : 0.6316
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8252
## Mcnemar's Test P-Value : 0.8231
##
## Sensitivity : 0.8791
## Specificity : 0.9423
## Pos Pred Value : 0.8989
## Neg Pred Value : 0.9304
## Prevalence : 0.3684
## Detection Rate : 0.3239
## Detection Prevalence : 0.3603
## Balanced Accuracy : 0.9107
##
## 'Positive' Class : b
##
On the training data, the stacked model has a higher accuracy (0.9595) than either the knn model (0.8664) or the CART model (0.9190).
# Evaluate the stacked model on the held-out test set.
predicted_stack1 <- predict(object = stack1, newdata = testingSet1)
confusionMatrix(data = predicted_stack1, reference = testingSet1$V35)
## Confusion Matrix and Statistics
##
## Reference
## Prediction b g
## b 29 3
## g 8 64
##
## Accuracy : 0.8942
## 95% CI : (0.8186, 0.946)
## No Information Rate : 0.6442
## P-Value [Acc > NIR] : 5.657e-09
##
## Kappa : 0.7621
## Mcnemar's Test P-Value : 0.2278
##
## Sensitivity : 0.7838
## Specificity : 0.9552
## Pos Pred Value : 0.9062
## Neg Pred Value : 0.8889
## Prevalence : 0.3558
## Detection Rate : 0.2788
## Detection Prevalence : 0.3077
## Balanced Accuracy : 0.8695
##
## 'Positive' Class : b
##
# Evaluate the stand-alone knn model on the held-out test set.
predicted_knn1 <- predict(object = knn1, newdata = testingSet1)
confusionMatrix(data = predicted_knn1, reference = testingSet1$V35)
## Confusion Matrix and Statistics
##
## Reference
## Prediction b g
## b 19 2
## g 18 65
##
## Accuracy : 0.8077
## 95% CI : (0.7187, 0.8784)
## No Information Rate : 0.6442
## P-Value [Acc > NIR] : 0.0002079
##
## Kappa : 0.5355
## Mcnemar's Test P-Value : 0.0007962
##
## Sensitivity : 0.5135
## Specificity : 0.9701
## Pos Pred Value : 0.9048
## Neg Pred Value : 0.7831
## Prevalence : 0.3558
## Detection Rate : 0.1827
## Detection Prevalence : 0.2019
## Balanced Accuracy : 0.7418
##
## 'Positive' Class : b
##
# Evaluate the stand-alone CART model on the held-out test set.
predicted_cart1 <- predict(object = cart1, newdata = testingSet1)
confusionMatrix(data = predicted_cart1, reference = testingSet1$V35)
## Confusion Matrix and Statistics
##
## Reference
## Prediction b g
## b 30 5
## g 7 62
##
## Accuracy : 0.8846
## 95% CI : (0.8071, 0.9389)
## No Information Rate : 0.6442
## P-Value [Acc > NIR] : 2.487e-08
##
## Kappa : 0.7452
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.8108
## Specificity : 0.9254
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.8986
## Prevalence : 0.3558
## Detection Rate : 0.2885
## Detection Prevalence : 0.3365
## Balanced Accuracy : 0.8681
##
## 'Positive' Class : b
##
When using the testing data, the accuracy for the stacked model (0.8942) is still better than that of the other two models (0.8077 for knn and 0.8846 for CART).
# Build train/test data frames from the Khan list: predictors come from
# xtrain/xtest and the class label is stored as a factor column `response`.
data("Khan")
trainingSet2 <- data.frame(Khan$xtrain)
trainingSet2$response <- as.factor(Khan$ytrain)
testingSet2 <- data.frame(Khan$xtest)
testingSet2$response <- as.factor(Khan$ytest)

# Fit CART, random forest, GBM and a linear SVM, each with 10-fold CV.
set.seed(12345)
cart2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
RF2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
GBM2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 10)
)
SVM2 <- train(
  response ~ .,
  data = trainingSet2,
  method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10)
)

# Training-set confusion matrix for CART. confusionMatrix() expects the
# predictions first (data) and the true labels second (reference).
# NOTE: the printed output that follows was generated with the two arguments
# swapped, so its per-class statistics reflect the swapped roles; Accuracy
# and Kappa are unaffected. Re-knit to refresh it.
confusionMatrix(
  data = predict(cart2, trainingSet2),
  reference = trainingSet2$response
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 0 0 7 1
## 2 0 22 1 0
## 3 0 0 12 0
## 4 0 0 0 20
##
## Overall Statistics
##
## Accuracy : 0.8571
## 95% CI : (0.7461, 0.9325)
## No Information Rate : 0.3492
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7977
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity NA 1.0000 0.6000 0.9524
## Specificity 0.873 0.9756 1.0000 1.0000
## Pos Pred Value NA 0.9565 1.0000 1.0000
## Neg Pred Value NA 1.0000 0.8431 0.9767
## Prevalence 0.000 0.3492 0.3175 0.3333
## Detection Rate 0.000 0.3492 0.1905 0.3175
## Detection Prevalence 0.127 0.3651 0.1905 0.3175
## Balanced Accuracy NA 0.9878 0.8000 0.9762
# Training-set confusion matrix for the random forest: predictions first
# (data), true labels second (reference).
# NOTE: the printed output that follows was generated with the arguments
# swapped; re-knit to refresh it.
confusionMatrix(
  data = predict(RF2, trainingSet2),
  reference = trainingSet2$response
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 8 0 0 0
## 2 0 23 0 0
## 3 0 0 12 0
## 4 0 0 0 20
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9431, 1)
## No Information Rate : 0.3651
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.000 1.0000 1.0000 1.0000
## Specificity 1.000 1.0000 1.0000 1.0000
## Pos Pred Value 1.000 1.0000 1.0000 1.0000
## Neg Pred Value 1.000 1.0000 1.0000 1.0000
## Prevalence 0.127 0.3651 0.1905 0.3175
## Detection Rate 0.127 0.3651 0.1905 0.3175
## Detection Prevalence 0.127 0.3651 0.1905 0.3175
## Balanced Accuracy 1.000 1.0000 1.0000 1.0000
# Training-set confusion matrix for the GBM: predictions first (data), true
# labels second (reference).
# NOTE: the printed output that follows was generated with the arguments
# swapped; re-knit to refresh it.
confusionMatrix(
  data = predict(GBM2, trainingSet2),
  reference = trainingSet2$response
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 8 0 0 0
## 2 0 23 0 0
## 3 0 0 12 0
## 4 0 0 0 20
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9431, 1)
## No Information Rate : 0.3651
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.000 1.0000 1.0000 1.0000
## Specificity 1.000 1.0000 1.0000 1.0000
## Pos Pred Value 1.000 1.0000 1.0000 1.0000
## Neg Pred Value 1.000 1.0000 1.0000 1.0000
## Prevalence 0.127 0.3651 0.1905 0.3175
## Detection Rate 0.127 0.3651 0.1905 0.3175
## Detection Prevalence 0.127 0.3651 0.1905 0.3175
## Balanced Accuracy 1.000 1.0000 1.0000 1.0000
# Training-set confusion matrix for the linear SVM: predictions first (data),
# true labels second (reference).
# NOTE: the printed output that follows was generated with the arguments
# swapped; re-knit to refresh it.
confusionMatrix(
  data = predict(SVM2, trainingSet2),
  reference = trainingSet2$response
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 8 0 0 0
## 2 0 23 0 0
## 3 0 0 12 0
## 4 0 0 0 20
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9431, 1)
## No Information Rate : 0.3651
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.000 1.0000 1.0000 1.0000
## Specificity 1.000 1.0000 1.0000 1.0000
## Pos Pred Value 1.000 1.0000 1.0000 1.0000
## Neg Pred Value 1.000 1.0000 1.0000 1.0000
## Prevalence 0.127 0.3651 0.1905 0.3175
## Detection Rate 0.127 0.3651 0.1905 0.3175
## Detection Prevalence 0.127 0.3651 0.1905 0.3175
## Balanced Accuracy 1.000 1.0000 1.0000 1.0000
The accuracy for each model is 1, except for CART which has an accuracy of 0.8571. We expect a high accuracy since this is the training data. The accuracy measure for the testing data will be a better indicator of how well the models work.
# Evaluate the CART model on the Khan test set.
predicted_cart2 <- predict(object = cart2, newdata = testingSet2)
confusionMatrix(data = predicted_cart2, reference = testingSet2$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 0 0 0 0
## 2 0 4 0 1
## 3 3 1 5 1
## 4 0 1 1 3
##
## Overall Statistics
##
## Accuracy : 0.6
## 95% CI : (0.3605, 0.8088)
## No Information Rate : 0.3
## P-Value [Acc > NIR] : 0.005138
##
## Kappa : 0.4386
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.00 0.6667 0.8333 0.6000
## Specificity 1.00 0.9286 0.6429 0.8667
## Pos Pred Value NaN 0.8000 0.5000 0.6000
## Neg Pred Value 0.85 0.8667 0.9000 0.8667
## Prevalence 0.15 0.3000 0.3000 0.2500
## Detection Rate 0.00 0.2000 0.2500 0.1500
## Detection Prevalence 0.00 0.2500 0.5000 0.2500
## Balanced Accuracy 0.50 0.7976 0.7381 0.7333
# Evaluate the random forest model on the Khan test set.
predicted_RF2 <- predict(object = RF2, newdata = testingSet2)
confusionMatrix(data = predicted_RF2, reference = testingSet2$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 3 0 0 0
## 2 0 6 0 0
## 3 0 0 5 0
## 4 0 0 1 5
##
## Overall Statistics
##
## Accuracy : 0.95
## 95% CI : (0.7513, 0.9987)
## No Information Rate : 0.3
## P-Value [Acc > NIR] : 1.662e-09
##
## Kappa : 0.9322
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.00 1.0 0.8333 1.0000
## Specificity 1.00 1.0 1.0000 0.9333
## Pos Pred Value 1.00 1.0 1.0000 0.8333
## Neg Pred Value 1.00 1.0 0.9333 1.0000
## Prevalence 0.15 0.3 0.3000 0.2500
## Detection Rate 0.15 0.3 0.2500 0.2500
## Detection Prevalence 0.15 0.3 0.2500 0.3000
## Balanced Accuracy 1.00 1.0 0.9167 0.9667
# Evaluate the gradient boosting model on the Khan test set.
predicted_GBM2 <- predict(object = GBM2, newdata = testingSet2)
confusionMatrix(data = predicted_GBM2, reference = testingSet2$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 3 0 0 0
## 2 0 5 0 0
## 3 0 0 6 0
## 4 0 1 0 5
##
## Overall Statistics
##
## Accuracy : 0.95
## 95% CI : (0.7513, 0.9987)
## No Information Rate : 0.3
## P-Value [Acc > NIR] : 1.662e-09
##
## Kappa : 0.9322
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.00 0.8333 1.0 1.0000
## Specificity 1.00 1.0000 1.0 0.9333
## Pos Pred Value 1.00 1.0000 1.0 0.8333
## Neg Pred Value 1.00 0.9333 1.0 1.0000
## Prevalence 0.15 0.3000 0.3 0.2500
## Detection Rate 0.15 0.2500 0.3 0.2500
## Detection Prevalence 0.15 0.2500 0.3 0.3000
## Balanced Accuracy 1.00 0.9167 1.0 0.9667
# Evaluate the linear SVM model on the Khan test set.
predicted_SVM2 <- predict(object = SVM2, newdata = testingSet2)
confusionMatrix(data = predicted_SVM2, reference = testingSet2$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 3 0 0 0
## 2 0 6 2 0
## 3 0 0 4 0
## 4 0 0 0 5
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.683, 0.9877)
## No Information Rate : 0.3
## P-Value [Acc > NIR] : 3.773e-08
##
## Kappa : 0.8639
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.00 1.0000 0.6667 1.00
## Specificity 1.00 0.8571 1.0000 1.00
## Pos Pred Value 1.00 0.7500 1.0000 1.00
## Neg Pred Value 1.00 1.0000 0.8750 1.00
## Prevalence 0.15 0.3000 0.3000 0.25
## Detection Rate 0.15 0.3000 0.2000 0.25
## Detection Prevalence 0.15 0.4000 0.2000 0.25
## Balanced Accuracy 1.00 0.9286 0.8333 1.00
Each of the models is a fairly good predictor except CART, which only has an accuracy of 0.6. The random forest and gradient boosting machine models were the best predictors, each with an accuracy of 0.95.
# Read the buildings data from a local CSV, remove the columns X and X.1,
# and drop rows 769 through 1296.
# NOTE(review): the absolute path only resolves on the original author's
# machine; adjust it before running elsewhere.
buildings <- read.csv(
  "C:\\Users\\Denise\\Documents/buildings.csv",
  header = TRUE
)
buildings[["X"]] <- NULL
buildings[["X.1"]] <- NULL
buildings <- buildings[-(769:1296), ]
# Min-max rescale a numeric vector to the [0, 1] interval.
#
# x:     numeric vector.
# na.rm: if TRUE, NA values are ignored when computing the range and stay NA
#        in the result. Defaults to FALSE, which keeps the earlier behaviour
#        where a single NA makes the whole result NA.
#
# Returns a numeric vector of the same length as x. Empty input yields an
# empty result without warnings; a constant vector maps to all zeros rather
# than NaN (0 / 0).
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(as.numeric(x))
  }
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  if (!is.na(span) && span == 0) {
    # All values identical: dividing by the zero range would give NaN.
    return(x - lo)
  }
  (x - lo) / span
}
# Rescale every column with normalize(), split 70/30 on Y1, fit a caret
# "nnet" model for Y1 on all columns except Y2 (10-fold CV), and report the
# squared correlation between test-set predictions and the observed Y1.
newbuildings <- data.frame(lapply(buildings, normalize))
set.seed(12345)
partition <- createDataPartition(y = newbuildings$Y1, p = 0.7, list = FALSE)
trainingSet3 <- newbuildings[partition, ]
testingSet3 <- newbuildings[-partition, ]
NN3b <- train(
  Y1 ~ . - Y2,
  data = trainingSet3,
  method = "nnet",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_NN3b <- predict(object = NN3b, newdata = testingSet3)
cor(predicted_NN3b, testingSet3$Y1)^2
## [,1]
## [1,] 0.986851
# Two-response network (Y1 and Y2) on X1..X8 with neuralnet's default hidden
# layout; plot it, predict on the first eight test-set columns, and correlate
# the predictions with the observed Y1.
NN3d <- neuralnet(
  Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  data = trainingSet3
)
plot(NN3d, rep = "best")
predicted_NN3d <- compute(NN3d, testingSet3[, 1:8])
cor(x = predicted_NN3d$net.result, y = testingSet3$Y1)
## [,1]
## [1,] 0.9529890362
## [2,] 0.9529890362
# Correlation of the NN3d predictions with the observed Y2.
cor(x = predicted_NN3d$net.result, y = testingSet3$Y2)
## [,1]
## [1,] 0.9340293044
## [2,] 0.9340293044
# Same two-response network, now with hidden = c(2, 1); plot it, predict on
# the first eight test-set columns, and report the squared correlation with
# the observed Y1.
NN3e <- neuralnet(
  Y1 + Y2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8,
  data = trainingSet3,
  hidden = c(2, 1)
)
plot(NN3e, rep = "best")
predicted_NN3e <- compute(NN3e, testingSet3[, 1:8])
cor(x = predicted_NN3e$net.result, y = testingSet3$Y1)^2
## [,1]
## [1,] 0.9765887975
## [2,] 0.9765887975
# Squared correlation of the NN3e predictions with the observed Y2.
cor(x = predicted_NN3e$net.result, y = testingSet3$Y2)^2
## [,1]
## [1,] 0.9438385522
## [2,] 0.9438385522
# Fit medv ~ lstat on the raw Boston data with neuralnet's defaults, then
# overlay the fitted curve (red) on the scatter plot over lstat in [0, 40].
data("Boston")
set.seed(12345)
NN4b <- neuralnet(medv ~ lstat, data = Boston)
xlist <- seq(from = 0, to = 40, by = 0.2)
predicted_NN4b <- compute(NN4b, xlist)
plot(x = Boston$lstat, y = Boston$medv)
lines(x = xlist, y = predicted_NN4b$net.result, col = "red")
The fit of this model is terrible. The model appears linear while the data clearly is not.
# Repeat the medv ~ lstat fit after rescaling every Boston column with
# normalize(), and overlay the fitted curve (red) over lstat in [0, 1].
normalBoston <- data.frame(lapply(Boston, normalize))
NN4d <- neuralnet(medv ~ lstat, data = normalBoston)
xlist <- seq(from = 0, to = 1, by = 0.02)
predicted_NN4d <- compute(NN4d, xlist)
plot(x = normalBoston$lstat, y = normalBoston$medv)
lines(x = xlist, y = predicted_NN4d$net.result, col = "red")
The model is much better after we normalized the data. It appears to be a good fit.
# Draw the fitted NN4d network so its weights can be read off the diagram.
plot(x = NN4d, rep = "best")
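$y = 0.99799 + 6.0366 \cdot S(2.70587 - 2.53167x)$
where $y$ = medv and $x$ = lstat (both normalized), and $S$ is the activation function of the hidden neuron (the logistic function $S(z) = 1/(1 + e^{-z})$ by default in neuralnet).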
# Fit medv ~ lstat on the normalized data with hidden = c(2, 2) and overlay
# the fitted curve (blue) on the scatter plot.
NN4f <- neuralnet(
  medv ~ lstat,
  data = normalBoston,
  hidden = c(2, 2)
)
predicted_NN4f <- compute(NN4f, xlist)
plot(x = normalBoston$lstat, y = normalBoston$medv)
lines(x = xlist, y = predicted_NN4f$net.result, col = "blue")
The fit of this model is also very good. The shape of its curve differs slightly from that of the model with only one hidden layer.