Michal Greenwood, Ashley Mata, Andrew Padro, John Thomas and Manuel Valles
May 13th, 2019
# Congressional Voting Records data: V1 is party, V2-V17 are 16 votes (y/n/?)
v <- read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header = FALSE)
# Recode party as a factor: democrat = 0, republican = 1
v$V1 <- as.factor(ifelse(v$V1 == 'democrat', 0, 1))
# 80/20 train/test split
set.seed(1)
train <- sample(nrow(v), nrow(v) * .8)
v.train <- v[train, ]
v.test <- v[-train, ]
library(caret)
library(dplyr)
library(gbm)
library(randomForest)
library(MASS)
There are more concise ways to build these summaries (a compact alternative is sketched after the table below), but this is what we did: for each bill we tally the 'no', 'yes', and '?' votes and the share of each party within them.
# Number of '?' (undecided/unrecorded) votes per column
amount_undecided <- numeric(ncol(v))
for (i in 1:ncol(v)) { amount_undecided[i] <- sum(v[[i]] == '?') }
# Percentage of each party among the '?' votes, per bill
perc_und_party <- list()
for (i in 1:ncol(v)) { perc_und_party[[i]] <- as.matrix(memisc::percent(v[v[i] == '?', 1])) }
perc_und_party <- as.data.frame(perc_und_party)
names(perc_und_party) <- names(v)
rownames(perc_und_party) <- c('Dem?_percent', 'Rep?_percent', 'n?')
# Percentage of each party among the 'n' votes, per bill
perc_no_party <- list()
for (i in 1:ncol(v)) { perc_no_party[[i]] <- as.matrix(memisc::percent(v[v[i] == 'n', 1])) }
perc_no_party <- as.data.frame(perc_no_party)
names(perc_no_party) <- names(v)
rownames(perc_no_party) <- c('DemNO_percent', 'RepNO_percent', 'nNO')
# Percentage of each party among the 'y' votes, per bill
perc_yes_party <- list()
for (i in 1:ncol(v)) { perc_yes_party[[i]] <- as.matrix(memisc::percent(v[v[i] == 'y', 1])) }
perc_yes_party <- as.data.frame(perc_yes_party)
names(perc_yes_party) <- names(v)
rownames(perc_yes_party) <- c('DemYES_percent', 'RepYES_percent', 'nYES')
# Raw counts of each party among the '?' votes, per bill
num_und_party <- list()
for (i in 1:ncol(v)) { num_und_party[[i]] <- as.matrix(table(v[v[i] == '?', 1])) }
num_und_party <- as.data.frame(num_und_party)
names(num_und_party) <- names(v)
rownames(num_und_party) <- c('Dem?_#', 'Rep?_#')
# Raw counts of each party among the 'n' votes, per bill
num_no_party <- list()
for (i in 1:ncol(v)) { num_no_party[[i]] <- as.matrix(table(v[v[i] == 'n', 1])) }
num_no_party <- as.data.frame(num_no_party)
names(num_no_party) <- names(v)
rownames(num_no_party) <- c('DemNO_#', 'RepNO_#')
# Raw counts of each party among the 'y' votes, per bill
num_yes_party <- list()
for (i in 1:ncol(v)) { num_yes_party[[i]] <- as.matrix(table(v[v[i] == 'y', 1])) }
num_yes_party <- as.data.frame(num_yes_party)
names(num_yes_party) <- names(v)
rownames(num_yes_party) <- c('DemYES_#', 'RepYES_#')
# Did the bill pass? (218 'yes' votes is a House majority)
bill_pass <- character(ncol(v))
for (i in 1:ncol(v)) {
  bill_pass[i] <- ifelse(nrow(v[v[i] == 'y', ]) >= 218, 'pass', 'no_pass')
}
bill_pass <- t(as.data.frame(bill_pass))
# Stack the summaries into one table; everything is coerced to character
# along the way, which is why the mutate() below returns NA for PercentDem_No
vote_party <- as.matrix(rbind(num_no_party, perc_no_party, num_yes_party, perc_yes_party, num_und_party, perc_und_party))
vote_party <- as.data.frame(rbind(bill_pass, vote_party))
vote_party_pretty <- as.data.frame(t(vote_party[-1]))
n_dem <- NROW(v[v$V1 == 0, ])   # number of Democrats
n_rep <- NROW(v[v$V1 == 1, ])   # number of Republicans
vote_party_pretty %>% mutate(PercentDem_No = vote_party_pretty$`DemNO_#` / n_dem)
bill_pass DemNO_# RepNO_# DemNO_percent RepNO_percent nNO
1 no_pass 102 134 43.2203389830508 56.7796610169492 236
2 no_pass 119 73 61.9791666666667 38.0208333333333 192
3 pass 29 142 16.9590643274854 83.0409356725146 171
4 no_pass 245 2 99.1902834008097 0.809716599190283 247
5 no_pass 200 8 96.1538461538462 3.84615384615385 208
6 pass 135 17 88.8157894736842 11.1842105263158 152
7 pass 59 123 32.4175824175824 67.5824175824176 182
8 pass 45 133 25.2808988764045 74.7191011235955 178
9 no_pass 60 146 29.126213592233 70.873786407767 206
10 no_pass 139 73 65.5660377358491 34.4339622641509 212
11 no_pass 126 138 47.7272727272727 52.2727272727273 264
12 no_pass 213 20 91.4163090128755 8.58369098712446 233
13 no_pass 179 22 89.0547263681592 10.9452736318408 201
14 pass 167 3 98.2352941176471 1.76470588235294 170
15 no_pass 91 142 39.0557939914163 60.9442060085837 233
16 pass 12 50 19.3548387096774 80.6451612903226 62
DemYES_# RepYES_# DemYES_percent RepYES_percent nYES Dem?_# Rep?_#
1 156 31 83.4224598930481 16.5775401069519 187 9 3
2 120 75 61.5384615384615 38.4615384615385 195 28 20
3 231 22 91.304347826087 8.69565217391304 253 7 4
4 14 163 7.90960451977401 92.090395480226 177 8 3
5 55 157 25.9433962264151 74.0566037735849 212 12 3
6 123 149 45.2205882352941 54.7794117647059 272 9 2
7 200 39 83.6820083682008 16.3179916317992 239 8 6
8 218 24 90.0826446280992 9.91735537190083 242 4 11
9 188 19 90.8212560386473 9.17874396135266 207 19 3
10 124 92 57.4074074074074 42.5925925925926 216 4 3
11 129 21 86 14 150 12 9
12 36 135 21.0526315789474 78.9473684210526 171 18 13
13 73 136 34.9282296650718 65.0717703349282 209 15 10
14 90 158 36.2903225806452 63.7096774193548 248 10 7
15 160 14 91.9540229885057 8.04597701149425 174 16 12
16 173 96 64.3122676579926 35.6877323420074 269 82 22
Dem?_percent Rep?_percent n? PercentDem_No
1 75 25 12 NA
2 58.3333333333333 41.6666666666667 48 NA
3 63.6363636363636 36.3636363636364 11 NA
4 72.7272727272727 27.2727272727273 11 NA
5 80 20 15 NA
6 81.8181818181818 18.1818181818182 11 NA
7 57.1428571428571 42.8571428571429 14 NA
8 26.6666666666667 73.3333333333333 15 NA
9 86.3636363636364 13.6363636363636 22 NA
10 57.1428571428571 42.8571428571429 7 NA
11 57.1428571428571 42.8571428571429 21 NA
12 58.0645161290323 41.9354838709677 31 NA
13 60 40 25 NA
14 58.8235294117647 41.1764705882353 17 NA
15 57.1428571428571 42.8571428571429 28 NA
16 78.8461538461538 21.1538461538462 104 NA
#View(vote_party_pretty)
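The same per-bill tallies can be computed more compactly. A minimal sketch using sapply and prop.table (equivalent counts and percentages, not the code that produced the table above):
# Sketch: counts of each vote type ('n', 'y', '?') by party, per bill
vote_counts <- lapply(c('n', 'y', '?'), function(vote) {
  sapply(v[-1], function(bill) table(factor(v$V1[bill == vote], levels = c(0, 1))))
})
names(vote_counts) <- c('no', 'yes', 'und')
# Party percentages within each vote type, e.g. among the 'no' votes
round(prop.table(vote_counts$no, margin = 2) * 100, 1)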
# 10-fold repeated cross-validation via caret. No `method` is supplied, so
# train() falls back to its default model; an explicit logistic regression
# would set method = "glm" (family = "binomial" alone does not)
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
mod_fit <- train(V1 ~ ., family = "binomial", data = v.train, trControl = ctrl, tuneLength = 5)
pred <- predict(mod_fit, newdata = v.test)
confusionMatrix(data = pred, v.test$V1)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 50 3
1 3 31
Accuracy : 0.931
95% CI : (0.8559, 0.9743)
No Information Rate : 0.6092
P-Value [Acc > NIR] : 7.442e-12
Kappa : 0.8552
Mcnemar's Test P-Value : 1
Sensitivity : 0.9434
Specificity : 0.9118
Pos Pred Value : 0.9434
Neg Pred Value : 0.9118
Prevalence : 0.6092
Detection Rate : 0.5747
Detection Prevalence : 0.6092
Balanced Accuracy : 0.9276
'Positive' Class : 0
varImp(mod_fit$finalModel)
Overall
V2n 0.3076245
V2y 0.1935486
V3n 0.8183339
V3y 0.7024232
V4n 3.0291660
V4y 3.1391322
V5n 37.3727408
V5y 92.4662184
V6n 0.3873454
V6y 0.9557746
V7n 0.3690561
V7y 0.3656961
V8n 0.7579766
V8y 0.6636729
V9n 0.4885352
V9y 0.2193060
V10n 0.4371934
V10y 1.0515425
V11n 0.7635286
V11y 0.8758000
V12n 1.8533688
V12y 4.4335176
V13n 2.6145978
V13y 2.4972471
V14n 0.3765825
V14y 0.7019656
V15n 0.4292976
V15y 0.3709025
V16n 0.6463404
V16y 1.2821558
V17n 0.4721428
V17y 3.2667538
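If logistic regression was the intent (the family = "binomial" argument suggests so), an explicit call would set method = "glm". A minimal sketch under that assumption, not the model that produced the output above:
# Sketch: explicit logistic regression under the same CV control
mod_glm <- train(V1 ~ ., method = "glm", family = "binomial",
                 data = v.train, trControl = ctrl)
confusionMatrix(predict(mod_glm, newdata = v.test), v.test$V1)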
set.seed(1)
library(MASS)
# Linear discriminant analysis on the training set
v.lda.train <- lda(V1 ~ ., data = v.train, cv = FALSE)
testpredz <- predict(v.lda.train, v.test)
Test_Predz <- testpredz$class
# Test error rate; the "Accuracy" label below is inherited from
# confusionMatrix, but the value printed is 1 - accuracy
1 - caret::confusionMatrix(table(Test_Predz, Test_Party = v.test$V1))$overall[1]
Accuracy
0.03448276
# Fit a random forest for every possible mtry (1-16) and record test accuracy
rf.v <- list()
yhat.rf <- list()
rf.error <- list()
for (i in 1:16) {
  set.seed(24)
  rf.v[[i]] <- randomForest(V1 ~ ., data = v.train, mtry = i, importance = TRUE)
  yhat.rf[[i]] <- predict(rf.v[[i]], newdata = v.test)
  rf.error[[i]] <- caret::confusionMatrix(yhat.rf[[i]], v.test$V1)$overall[1]
}
rf.airz <- data.frame(accuracy = unlist(rf.error))
rownames(rf.airz) <- paste0("mtry=", 1:16)
par(mfrow = c(1, 1))
matplot(1:16, unlist(rf.error), xlab = 'mtry', ylab = 'Accuracy', main = "What # of predictors sampled at each split \n gives the highest accuracy?")
lines(1:16, unlist(rf.error), type = "o")
# Average the scaled variable importances from four well-performing forests
rf_var_imps <- data.frame(
  scale(varImp(rf.v[[2]])[1]),
  scale(varImp(rf.v[[3]])[1]),
  scale(varImp(rf.v[[4]])[1]),
  scale(varImp(rf.v[[8]])[1])
)
names(rf_var_imps) <- paste0("mtry_", c(2, 3, 4, 8))
avg_var_imp <- apply(rf_var_imps, 1, mean)
best_vars <- names(avg_var_imp[order(-avg_var_imp)])  # variables, most important first
best_n <- list()  # best_n[[i]] holds the i most important variables
for (i in 1:16) {
  best_n[[i]] <- best_vars[1:i]
}
best_n
[[1]]
[1] "V5"
[[2]]
[1] "V5" "V4"
[[3]]
[1] "V5" "V4" "V12"
[[4]]
[1] "V5" "V4" "V12" "V13"
[[5]]
[1] "V5" "V4" "V12" "V13" "V6"
[[6]]
[1] "V5" "V4" "V12" "V13" "V6" "V15"
[[7]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10"
[[8]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16"
[[9]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17"
[[10]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9"
[[11]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
[[12]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
[12] "V14"
[[13]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
[12] "V14" "V3"
[[14]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
[12] "V14" "V3" "V7"
[[15]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
[12] "V14" "V3" "V7" "V11"
[[16]]
[1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
[12] "V14" "V3" "V7" "V11" "V2"
Once again, there is probably a shorter way to do this. We repeated the function below for every mtry from 1 to 16; since the code is nearly identical each time, we only show how mtry = 16 was done (a parameterized version is sketched after the output below).
# Error rate of an mtry = 16 forest on the x most important predictors,
# averaged over 10 random train/test splits
mtry_16 <- function(x) {
  errors <- list()
  for (seed in 1:10) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    errors[[seed]] <- 1 - confusionMatrix(predict(randomForest(V1 ~ ., data = v.tr[, c("V1", best_n[[x]])], mtry = 16), newdata = v.te), v.te$V1)$overall[1]
  }
  unlist(errors)
}
# Only x = 16 applies when mtry = 16 (mtry cannot exceed the number of
# predictors), so elements 1-15 of the list stay NULL
mtry_16_preds <- list()
for (i in 16:16) {
  mtry_16_preds[[i]] <- mean(mtry_16(i))
}
mtry_16_preds
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
[1] 0.04137931
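A parameterized version would replace the 16 copy-pasted functions. A minimal sketch, assuming the same v, best_n, and 10-seed scheme (mtry_n is a hypothetical helper, not code we ran):
# Sketch: mean error over 10 splits for a forest on the x best
# predictors with m variables sampled at each split (needs m <= x)
mtry_n <- function(x, m) {
  errors <- sapply(1:10, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    fit <- randomForest(V1 ~ ., data = v[tr, c("V1", best_n[[x]])], mtry = m)
    1 - confusionMatrix(predict(fit, newdata = v[-tr, ]), v$V1[-tr])$overall[1]
  })
  mean(errors)
}
# e.g. mtry_n(16, 16) should reproduce mtry_16_preds[[16]]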
#list of models
mtry_n_preds <- list(mtry_1_preds, mtry_2_preds, mtry_3_preds,
                     mtry_4_preds, mtry_5_preds, mtry_6_preds, mtry_7_preds,
                     mtry_8_preds, mtry_9_preds, mtry_10_preds, mtry_11_preds,
                     mtry_12_preds, mtry_13_preds, mtry_14_preds, mtry_15_preds,
                     mtry_16_preds)
# Assemble the error grid; combinations with mtry > nvar were never run,
# so those cells are padded with the placeholder value 1000
best_mtry_10_seeds <- data.frame(
  mtry1 = unlist(mtry_1_preds), mtry2 = c(1000, unlist(mtry_2_preds)), mtry3 = c(rep(1000, 2), unlist(mtry_3_preds)),
  mtry4 = c(rep(1000, 3), unlist(mtry_4_preds)), mtry5 = c(rep(1000, 4), unlist(mtry_5_preds)),
  mtry6 = c(rep(1000, 5), unlist(mtry_6_preds)), mtry7 = c(rep(1000, 6), unlist(mtry_7_preds)),
  mtry8 = c(rep(1000, 7), unlist(mtry_8_preds)), mtry9 = c(rep(1000, 8), unlist(mtry_9_preds)),
  mtry10 = c(rep(1000, 9), unlist(mtry_10_preds)), mtry11 = c(rep(1000, 10), unlist(mtry_11_preds)),
  mtry12 = c(rep(1000, 11), unlist(mtry_12_preds)), mtry13 = c(rep(1000, 12), unlist(mtry_13_preds)),
  mtry14 = c(rep(1000, 13), unlist(mtry_14_preds)), mtry15 = c(rep(1000, 14), unlist(mtry_15_preds)),
  mtry16 = c(rep(1000, 15), unlist(mtry_16_preds))
)
rownames(best_mtry_10_seeds)<-paste0("nvar_",1:16)
best_mtry_10_seeds
mtry1 mtry2 mtry3 mtry4 mtry5
nvar_1 0.03563218 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_2 0.03908046 4.137931e-02 1.000000e+03 1.000000e+03 1.000000e+03
nvar_3 0.03678161 3.678161e-02 2.988506e-02 1.000000e+03 1.000000e+03
nvar_4 0.03908046 4.137931e-02 3.793103e-02 3.218391e-02 1.000000e+03
nvar_5 0.03218391 3.448276e-02 3.448276e-02 3.333333e-02 3.333333e-02
nvar_6 0.03678161 4.022989e-02 3.678161e-02 3.678161e-02 3.333333e-02
nvar_7 0.03793103 3.908046e-02 4.022989e-02 3.793103e-02 3.793103e-02
nvar_8 0.04022989 4.022989e-02 4.827586e-02 4.482759e-02 4.367816e-02
nvar_9 0.04252874 4.022989e-02 4.137931e-02 4.482759e-02 4.827586e-02
nvar_10 0.04482759 3.793103e-02 4.252874e-02 4.482759e-02 4.252874e-02
nvar_11 0.04252874 3.448276e-02 3.333333e-02 3.448276e-02 3.333333e-02
nvar_12 0.04137931 3.678161e-02 3.448276e-02 3.448276e-02 3.908046e-02
nvar_13 0.03908046 3.678161e-02 3.678161e-02 3.678161e-02 3.563218e-02
nvar_14 0.04597701 3.678161e-02 3.793103e-02 3.563218e-02 3.678161e-02
nvar_15 0.04942529 3.448276e-02 3.448276e-02 3.563218e-02 3.678161e-02
nvar_16 0.05057471 3.908046e-02 3.678161e-02 3.793103e-02 3.678161e-02
mtry6 mtry7 mtry8 mtry9 mtry10
nvar_1 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_2 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_3 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_4 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_5 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_6 3.218391e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_7 3.793103e-02 3.563218e-02 1.000000e+03 1.000000e+03 1.000000e+03
nvar_8 4.482759e-02 4.597701e-02 4.597701e-02 1.000000e+03 1.000000e+03
nvar_9 4.942529e-02 5.057471e-02 5.057471e-02 5.172414e-02 1.000000e+03
nvar_10 4.482759e-02 4.712644e-02 4.827586e-02 5.057471e-02 5.057471e-02
nvar_11 3.908046e-02 3.908046e-02 4.137931e-02 3.908046e-02 4.252874e-02
nvar_12 3.908046e-02 3.793103e-02 3.908046e-02 4.022989e-02 4.252874e-02
nvar_13 3.678161e-02 3.908046e-02 3.793103e-02 3.908046e-02 4.252874e-02
nvar_14 3.678161e-02 3.678161e-02 3.793103e-02 3.793103e-02 3.908046e-02
nvar_15 3.678161e-02 3.908046e-02 3.678161e-02 3.678161e-02 3.678161e-02
nvar_16 3.563218e-02 3.563218e-02 3.678161e-02 3.563218e-02 3.678161e-02
mtry11 mtry12 mtry13 mtry14 mtry15
nvar_1 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_2 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_3 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_4 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_5 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_6 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_7 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_8 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_9 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_10 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_11 4.482759e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_12 4.712644e-02 4.712644e-02 1.000000e+03 1.000000e+03 1.000000e+03
nvar_13 4.137931e-02 4.137931e-02 4.712644e-02 1.000000e+03 1.000000e+03
nvar_14 3.793103e-02 4.022989e-02 4.482759e-02 4.367816e-02 1.000000e+03
nvar_15 3.793103e-02 3.793103e-02 4.137931e-02 4.137931e-02 4.022989e-02
nvar_16 3.678161e-02 3.563218e-02 3.908046e-02 4.022989e-02 4.137931e-02
mtry16
nvar_1 1.000000e+03
nvar_2 1.000000e+03
nvar_3 1.000000e+03
nvar_4 1.000000e+03
nvar_5 1.000000e+03
nvar_6 1.000000e+03
nvar_7 1.000000e+03
nvar_8 1.000000e+03
nvar_9 1.000000e+03
nvar_10 1.000000e+03
nvar_11 1.000000e+03
nvar_12 1.000000e+03
nvar_13 1.000000e+03
nvar_14 1.000000e+03
nvar_15 1.000000e+03
nvar_16 4.137931e-02
# Locate the (nvar, mtry) combination with the lowest mean error
best_mtry_matrix <- as.matrix(best_mtry_10_seeds)
best_rf_model <- data.frame(which(best_mtry_matrix == min(best_mtry_matrix), arr.ind = TRUE), min(best_mtry_matrix))
names(best_rf_model) <- c('nvar', 'mtry', 'error')
best_rf_model
nvar mtry error
nvar_3 3 3 0.02988506
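The winning configuration pairs the three most important predictors (best_n[[3]], i.e. V5, V4, V12) with mtry = 3. Refitting that final model on our original split would look like this (a sketch; final_rf is a hypothetical name and we did not refit it in the report):
# Sketch: refit the best random forest configuration
set.seed(24)
final_rf <- randomForest(V1 ~ ., data = v.train[, c("V1", best_n[[3]])], mtry = 3)
confusionMatrix(predict(final_rf, newdata = v.test), v.test$V1)$overall[1]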
# Error rate of LDA on the x most important predictors, over 10 splits
npreds_lda <- function(x) {
  errors <- list()
  for (seed in 1:10) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    errors[[seed]] <- 1 - confusionMatrix(predict(lda(V1 ~ ., data = v.tr[, c("V1", best_n[[x]])], cv = FALSE), v.te)$class, v.te$V1)$overall[1]
  }
  unlist(errors)
}
best_npreds_lda <- list()
for (i in 1:16) {
  best_npreds_lda[[i]] <- mean(npreds_lda(i))
}
matplot(1:16, unlist(best_npreds_lda), xlab = "best n predictors", ylab = "error", main = 'Best n Predictors for LDA', pch = 19)
lines(1:16, unlist(best_npreds_lda))
# Reference line: minimum error plus 0.2 standard deviations
abline(h = min(unlist(best_npreds_lda)) + .2 * sd(unlist(best_npreds_lda)))
min(unlist(best_npreds_lda))
[1] 0.03678161
data.frame(n_preds = which(as.matrix(best_npreds_lda)== min(unlist(best_npreds_lda)), arr.ind = TRUE)[,1], error = min(unlist(best_npreds_lda)) )
n_preds error
row 3 0.03678161
# Boosting with a Gaussian loss: the factor V1 is effectively treated as
# numeric (1 = democrat, 2 = republican), so predictions land between 1 and 2
# and we search for a classification cutoff in that range
v.boost <- gbm(V1 ~ ., data = v.train, distribution = 'gaussian', n.trees = 5000, interaction.depth = 4)
yhat.boost <- predict(v.boost, newdata = v.test, n.trees = 5000)
# Test accuracy when predictions above cutoff x are labeled republican (1)
v.boostn <- function(x) {
  as.numeric(caret::confusionMatrix(as.factor(ifelse(yhat.boost > x, 1, 0)), v.test$V1)[[3]][1])
}
cutoffs <- seq(1, 2, .05)
bbb <- as.data.frame(cutoffs)
bbb$accuracy <- apply(bbb, 1, v.boostn)
bbb
cutoffs accuracy
1 1.00 0.6436782
2 1.05 0.8160920
3 1.10 0.9080460
4 1.15 0.9310345
5 1.20 0.9310345
6 1.25 0.9310345
7 1.30 0.9425287
8 1.35 0.9425287
9 1.40 0.9425287
10 1.45 0.9425287
11 1.50 0.9080460
12 1.55 0.9080460
13 1.60 0.9080460
14 1.65 0.9080460
15 1.70 0.9080460
16 1.75 0.9080460
17 1.80 0.9080460
18 1.85 0.8735632
19 1.90 0.8735632
20 1.95 0.8505747
21 2.00 0.7471264
plot(bbb)
lines(bbb[,1],bbb[,2])
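Fitting a Gaussian gbm to a two-level factor works but is unconventional. A more standard alternative (a sketch we did not run) uses a Bernoulli loss on a 0/1 numeric response, whose predictions are probabilities with a natural 0.5 cutoff:
# Sketch: Bernoulli boosting on a 0/1 numeric response
v.num <- v.train
v.num$V1 <- as.numeric(as.character(v.num$V1))  # factor "0"/"1" -> numeric 0/1
v.boost.b <- gbm(V1 ~ ., data = v.num, distribution = 'bernoulli',
                 n.trees = 5000, interaction.depth = 4)
phat <- predict(v.boost.b, newdata = v.test, n.trees = 5000, type = 'response')
confusionMatrix(as.factor(ifelse(phat > 0.5, 1, 0)), v.test$V1)$overall[1]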
We tested the best n predictors over just 5 seeds here; it takes too long to run 100.
# Error rate of the boosted model on the x most important predictors,
# averaged over 5 splits, using the 1.35 cutoff found above
npreds_gbm <- function(x) {
  errors <- list()
  for (seed in 1:5) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, c("V1", best_n[[x]])]
    v.te <- v[-tr, c("V1", best_n[[x]])]
    errors[[seed]] <- 1 - (confusionMatrix(as.factor(ifelse(predict(gbm(V1 ~ ., data = v.tr, distribution = 'gaussian', n.trees = 5000, interaction.depth = 4), newdata = v.te, n.trees = 5000) > 1.35, 1, 0)), v.te$V1)[[3]][1])
  }
  unlist(errors)
}
best_gbm <- data.frame(best_n = 1:16,
                       error = sapply(1:16, function(i) mean(npreds_gbm(i))))
matplot(best_gbm$best_n, best_gbm$error, xlab = "best n predictors", ylab ="error", main = 'Best n Predictors for GBM' , pch = 19)
lines(best_gbm$best_n, best_gbm$error)
abline(h = min(unlist( best_gbm[2]))+.2*sd(unlist( best_gbm[2])))
best_gbm[which(best_gbm$error== min(best_gbm$error), arr.ind = TRUE),]
best_n error
2 2 0.04597701
3 3 0.04597701
#best svm
library(e1071)
# One helper covers all three kernels: error rate of a tuned SVM on the x
# most important predictors, averaged over 10 splits. Note that gamma is
# passed outside `ranges`, so only cost is actually part of the tuning grid.
npreds_svm <- function(x, kern) {
  svm.tune <- list()
  predinhos <- list()
  svm.errors <- list()
  for (seed in 1:10) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    svm.tune[[seed]] <- tune(svm, V1 ~ ., data = v.tr[, c("V1", best_n[[x]])],
                             kernel = kern,
                             ranges = list(cost = c(0.1, 1, 10, 100, 1000)),
                             gamma = c(0.5, 1, 2, 3, 4))
    predinhos[[seed]] <- predict(svm.tune[[seed]]$best.model, v.te)
    svm.errors[[seed]] <- 1 - confusionMatrix(data = predinhos[[seed]], v.te$V1)$overall[1]
  }
  unlist(svm.errors)
}
#svm linear
best_npreds_svm_lin <- list()
for (i in 1:16) {
  best_npreds_svm_lin[[i]] <- mean(npreds_svm(i, "linear"))
}
#svm radial
best_npreds_svm_rad <- list()
for (i in 1:16) {
  best_npreds_svm_rad[[i]] <- mean(npreds_svm(i, "radial"))
}
#svm polynomial
best_npreds_svm_pol <- list()
for (i in 1:16) {
  best_npreds_svm_pol[[i]] <- mean(npreds_svm(i, "polynomial"))
}
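As the comment in the helper notes, gamma sits outside `ranges` and is therefore not tuned. To tune it jointly with cost, it would go inside `ranges` (a sketch, not what we ran):
# Sketch: tune cost and gamma together for the radial kernel
svm.tune2 <- tune(svm, V1 ~ ., data = v.train, kernel = "radial",
                  ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                gamma = c(0.5, 1, 2, 3, 4)))
svm.tune2$best.parameters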
best_svm<-data.frame( n_var = 1:16,
linear = unlist(best_npreds_svm_lin),
radial = unlist(best_npreds_svm_rad),
polynomial = unlist(best_npreds_svm_pol)
)
matplot(as.matrix(best_svm)[,2:4], type = c('b'), pch = 20, col = c(1,2,4), xlab='best n variables', ylab = 'error', main = "Best SVM" )
abline(h=min(as.matrix(best_svm)[,2:4]) + .2*sd(as.matrix(best_svm)[,2:4]))
best_svm_model= data.frame(which(as.matrix(best_svm)==min(as.matrix(best_svm)), arr.ind = TRUE), best_svm[which(as.matrix(best_svm)==min(as.matrix(best_svm)), arr.ind = TRUE)])
names(best_svm_model)<-c("best_n", "model","error")
best_svm_model[1,2]<-names(best_svm)[best_svm_model[1,2]]
best_svm_model
best_n model error
1 7 radial 0.03448276
Ultimately, the randomForest model with mtry = 3 and the best 3 predictors gave us the lowest error rate, 2.99%. It outperformed every other model we tried and was the best at predicting which political party a member of Congress belonged to based on their voting record.