# Load the 1984 House voting records: column V1 is the party, V2..V17 are the
# votes ("y", "n" or "?").
# The script was written under R 3.5 (see the package warnings in the output
# below), where read.csv() returned factor columns by default.  Request
# factors explicitly so newer R versions build the same data frame.
v <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Recode the party as a two-level factor: 0 = democrat, 1 = anything else.
v[[1]] <- factor(ifelse(v[[1]] == "democrat", 0, 1))
# Fixed-seed 80/20 train/test split.
set.seed(1)
train <- sample(nrow(v), nrow(v) * .8)
v.train <- v[train, ]
v.test <- v[-train, ]
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
library(dplyr)
library(gbm)
library(randomForest)
library(MASS)
# Looking at the votes across parties: for every column of `v`, take the
# members who cast a given vote ("?", "n" or "y") and break them down by
# party (column 1 of `v`).

# Number of rows holding a "?" entry, per column of `v`.
amount_undecided <- vapply(
  seq_along(v),
  function(i) nrow(v[v[[i]] == "?", ]),
  integer(1)
)

# Build one summary table for a given vote value.
#   vote       - the vote value to select ("?", "n" or "y")
#   summarise  - function applied to the party labels of the selected rows
#                (memisc::percent or table)
#   row_labels - row names for the resulting data frame
# Returns a data frame with one column per column of `v`.
party_breakdown <- function(vote, summarise, row_labels) {
  per_column <- lapply(seq_along(v), function(i) {
    as.matrix(summarise(v[v[[i]] == vote, 1]))
  })
  out <- as.data.frame(per_column)
  names(out) <- names(v)
  rownames(out) <- row_labels
  out
}

# Party shares (percent) plus the group size for each vote value.
perc_und_party <- party_breakdown(
  "?", memisc::percent, c('Dem?_percent', 'Rep?_percent', 'n?')
)
perc_no_party <- party_breakdown(
  "n", memisc::percent, c('DemNO_percent', 'RepNO_percent', 'nNO')
)
perc_yes_party <- party_breakdown(
  "y", memisc::percent, c('DemYES_percent', 'RepYES_percent', 'nYES')
)

# Raw party counts for each vote value.
num_und_party <- party_breakdown("?", table, c('Dem?_#', 'Rep?_#'))
num_no_party <- party_breakdown("n", table, c('DemNO_#', 'RepNO_#'))
num_yes_party <- party_breakdown("y", table, c('DemYES_#', 'RepYES_#'))

# A bill is labelled "pass" when it collects at least 218 "y" votes
# (presumably a simple majority of the 435-seat House -- confirm).
majority_threshold <- 218
bill_pass <- vapply(
  seq_along(v),
  function(i) {
    if (nrow(v[v[[i]] == "y", ]) >= majority_threshold) "pass" else "no_pass"
  },
  character(1)
)
# One-row character matrix; the row is named "bill_pass".
bill_pass <- t(as.data.frame(bill_pass))

# Stack everything, then transpose so that each bill is a row.  The first
# column (the party column itself) is dropped before transposing.
vote_party <- as.matrix(rbind(
  num_no_party, perc_no_party,
  num_yes_party, perc_yes_party,
  num_und_party, perc_und_party
))
vote_party <- as.data.frame(rbind(bill_pass, vote_party))
vote_party_pretty <- as.data.frame(t(vote_party[-1]))
vote_party_pretty
## bill_pass DemNO_# RepNO_# DemNO_percent RepNO_percent nNO
## V2 no_pass 102 134 43.2203389830508 56.7796610169492 236
## V3 no_pass 119 73 61.9791666666667 38.0208333333333 192
## V4 pass 29 142 16.9590643274854 83.0409356725146 171
## V5 no_pass 245 2 99.1902834008097 0.809716599190283 247
## V6 no_pass 200 8 96.1538461538462 3.84615384615385 208
## V7 pass 135 17 88.8157894736842 11.1842105263158 152
## V8 pass 59 123 32.4175824175824 67.5824175824176 182
## V9 pass 45 133 25.2808988764045 74.7191011235955 178
## V10 no_pass 60 146 29.126213592233 70.873786407767 206
## V11 no_pass 139 73 65.5660377358491 34.4339622641509 212
## V12 no_pass 126 138 47.7272727272727 52.2727272727273 264
## V13 no_pass 213 20 91.4163090128755 8.58369098712446 233
## V14 no_pass 179 22 89.0547263681592 10.9452736318408 201
## V15 pass 167 3 98.2352941176471 1.76470588235294 170
## V16 no_pass 91 142 39.0557939914163 60.9442060085837 233
## V17 pass 12 50 19.3548387096774 80.6451612903226 62
## DemYES_# RepYES_# DemYES_percent RepYES_percent nYES Dem?_# Rep?_#
## V2 156 31 83.4224598930481 16.5775401069519 187 9 3
## V3 120 75 61.5384615384615 38.4615384615385 195 28 20
## V4 231 22 91.304347826087 8.69565217391304 253 7 4
## V5 14 163 7.90960451977401 92.090395480226 177 8 3
## V6 55 157 25.9433962264151 74.0566037735849 212 12 3
## V7 123 149 45.2205882352941 54.7794117647059 272 9 2
## V8 200 39 83.6820083682008 16.3179916317992 239 8 6
## V9 218 24 90.0826446280992 9.91735537190083 242 4 11
## V10 188 19 90.8212560386473 9.17874396135266 207 19 3
## V11 124 92 57.4074074074074 42.5925925925926 216 4 3
## V12 129 21 86 14 150 12 9
## V13 36 135 21.0526315789474 78.9473684210526 171 18 13
## V14 73 136 34.9282296650718 65.0717703349282 209 15 10
## V15 90 158 36.2903225806452 63.7096774193548 248 10 7
## V16 160 14 91.9540229885057 8.04597701149425 174 16 12
## V17 173 96 64.3122676579926 35.6877323420074 269 82 22
## Dem?_percent Rep?_percent n?
## V2 75 25 12
## V3 58.3333333333333 41.6666666666667 48
## V4 63.6363636363636 36.3636363636364 11
## V5 72.7272727272727 27.2727272727273 11
## V6 80 20 15
## V7 81.8181818181818 18.1818181818182 11
## V8 57.1428571428571 42.8571428571429 14
## V9 26.6666666666667 73.3333333333333 15
## V10 86.3636363636364 13.6363636363636 22
## V11 57.1428571428571 42.8571428571429 7
## V12 57.1428571428571 42.8571428571429 21
## V13 58.0645161290323 41.9354838709677 31
## V14 60 40 25
## V15 58.8235294117647 41.1764705882353 17
## V16 57.1428571428571 42.8571428571429 28
## V17 78.8461538461538 21.1538461538462 104
# Cross-validated caret model of party (V1) on all vote columns.
# train() was originally called without `method`, so caret fitted its default
# random forest (the summary printed below lists randomForest components such
# as ntree/mtry/forest).  The model type is now stated explicitly, and the
# `family = "binomial"` argument, which suggested a logistic regression that
# was never fitted, is dropped.
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
mod_fit <- train(
  V1 ~ .,
  data = v.train,
  method = "rf",
  trControl = ctrl,
  tuneLength = 5
)
summary(mod_fit)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 348 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 696 matrix numeric
## oob.times 348 -none- numeric
## classes 2 -none- character
## importance 32 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 348 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 32 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 1 -none- list
# Score the held-out rows with the caret model, then tabulate predicted
# against actual party.
pred <- predict(mod_fit, newdata = v.test)
confusionMatrix(data = pred, reference = v.test$V1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 50 3
## 1 3 31
##
## Accuracy : 0.931
## 95% CI : (0.8559, 0.9743)
## No Information Rate : 0.6092
## P-Value [Acc > NIR] : 7.442e-12
##
## Kappa : 0.8552
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9434
## Specificity : 0.9118
## Pos Pred Value : 0.9434
## Neg Pred Value : 0.9118
## Prevalence : 0.6092
## Detection Rate : 0.5747
## Detection Prevalence : 0.6092
## Balanced Accuracy : 0.9276
##
## 'Positive' Class : 0
##
set.seed(1)
library(MASS)
# Linear discriminant analysis of party on all votes.
# The cross-validation switch of lda() is spelled `CV`; the original
# lower-case `cv` was not a recognised argument.  predict() for an lda fit
# has no `type` argument either, so it is omitted.
v.lda.train <- lda(V1 ~ ., data = v.train, CV = FALSE)
testpredz <- predict(v.lda.train, v.test)
Test_Predz <- testpredz$class
# Test-set misclassification rate (1 - accuracy), labelled as an error so the
# printed value is not mistaken for an accuracy.
lda_cm <- caret::confusionMatrix(table(Test_Predz, Test_Party = v.test$V1))
c(Error = 1 - unname(lda_cm$overall["Accuracy"]))
## Accuracy
## 0.03448276
# Random forests for every mtry from 1 to 16, each grown from the same seed,
# scored by test-set accuracy.
n_mtry <- 16
rf.v <- vector("list", n_mtry)      # fitted forests
yhat.rf <- vector("list", n_mtry)   # test-set class predictions
rf.error <- vector("list", n_mtry)  # NOTE: holds accuracy, not error
for (i in seq_len(n_mtry)) {
  set.seed(24)
  rf.v[[i]] <- randomForest(V1 ~ ., data = v.train, mtry = i, importance = TRUE)
  yhat.rf[[i]] <- predict(rf.v[[i]], newdata = v.test)
  rf.error[[i]] <- caret::confusionMatrix(yhat.rf[[i]], v.test$V1)$overall[1]
}
rf_accuracy <- unlist(rf.error)
rf.airz <- data.frame(accuracy = rf_accuracy)
rownames(rf.airz) <- paste0("mtry=", seq_len(n_mtry))

# Accuracy against mtry.  The plotted quantity is accuracy, so the title asks
# for the highest value (it previously asked for the lowest error).
par(mfrow = c(1, 1))
matplot(
  seq_len(n_mtry), rf_accuracy,
  xlab = 'mtry', ylab = 'Accuracy',
  main = "What # of predictors sampled at each split \n gives the highest accuracy?"
)
lines(seq_len(n_mtry), rf_accuracy, type = "o")

# Standardised importance (first column returned by varImp) from the forests
# with mtry = 2, 3, 4 and 8, averaged per variable.
rf_var_imps <- data.frame(
  scale(varImp(rf.v[[2]])[1]),
  scale(varImp(rf.v[[3]])[1]),
  scale(varImp(rf.v[[4]])[1]),
  scale(varImp(rf.v[[8]])[1])
)
names(rf_var_imps) <- paste0("mtry_", c(2, 3, 4, 8))
avg_var_imp <- apply(as.matrix(rf_var_imps), 1, mean)
best_vars <- names(avg_var_imp[order(-avg_var_imp)])

# best_n[[k]] holds the k most important variables, in descending importance.
best_n <- lapply(seq_along(best_vars), function(k) best_vars[seq_len(k)])
best_n
## [[1]]
## [1] "V5"
##
## [[2]]
## [1] "V5" "V4"
##
## [[3]]
## [1] "V5" "V4" "V12"
##
## [[4]]
## [1] "V5" "V4" "V12" "V13"
##
## [[5]]
## [1] "V5" "V4" "V12" "V13" "V6"
##
## [[6]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15"
##
## [[7]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10"
##
## [[8]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16"
##
## [[9]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17"
##
## [[10]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9"
##
## [[11]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
##
## [[12]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
## [12] "V14"
##
## [[13]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
## [12] "V14" "V3"
##
## [[14]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
## [12] "V14" "V3" "V7"
##
## [[15]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
## [12] "V14" "V3" "V7" "V11"
##
## [[16]]
## [1] "V5" "V4" "V12" "V13" "V6" "V15" "V10" "V16" "V17" "V9" "V8"
## [12] "V14" "V3" "V7" "V11" "V2"
## [[1]]
## [1] 0.0437931
##
## [[2]]
## [1] 0.04551724
##
## [[3]]
## [1] 0.04528736
##
## [[4]]
## [1] 0.04586207
##
## [[5]]
## [1] 0.04126437
##
## [[6]]
## [1] 0.04586207
##
## [[7]]
## [1] 0.04655172
##
## [[8]]
## [1] 0.05114943
##
## [[9]]
## [1] 0.05275862
##
## [[10]]
## [1] 0.05252874
##
## [[11]]
## [1] 0.05298851
##
## [[12]]
## [1] 0.05609195
##
## [[13]]
## [1] 0.05712644
##
## [[14]]
## [1] 0.05908046
##
## [[15]]
## [1] 0.0637931
##
## [[16]]
## [1] 0.06494253
## [[1]]
## NULL
##
## [[2]]
## [1] 0.04885057
##
## [[3]]
## [1] 0.04609195
##
## [[4]]
## [1] 0.04885057
##
## [[5]]
## [1] 0.0437931
##
## [[6]]
## [1] 0.04804598
##
## [[7]]
## [1] 0.04724138
##
## [[8]]
## [1] 0.04873563
##
## [[9]]
## [1] 0.04448276
##
## [[10]]
## [1] 0.04298851
##
## [[11]]
## [1] 0.03988506
##
## [[12]]
## [1] 0.04068966
##
## [[13]]
## [1] 0.04091954
##
## [[14]]
## [1] 0.0408046
##
## [[15]]
## [1] 0.04045977
##
## [[16]]
## [1] 0.04505747
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## [1] 0.04137931
##
## [[4]]
## [1] 0.04632184
##
## [[5]]
## [1] 0.04126437
##
## [[6]]
## [1] 0.0462069
##
## [[7]]
## [1] 0.04471264
##
## [[8]]
## [1] 0.04896552
##
## [[9]]
## [1] 0.04678161
##
## [[10]]
## [1] 0.04321839
##
## [[11]]
## [1] 0.03758621
##
## [[12]]
## [1] 0.03862069
##
## [[13]]
## [1] 0.03816092
##
## [[14]]
## [1] 0.03781609
##
## [[15]]
## [1] 0.03862069
##
## [[16]]
## [1] 0.04114943
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## [1] 0.04310345
##
## [[5]]
## [1] 0.03965517
##
## [[6]]
## [1] 0.04344828
##
## [[7]]
## [1] 0.04321839
##
## [[8]]
## [1] 0.04781609
##
## [[9]]
## [1] 0.04770115
##
## [[10]]
## [1] 0.04471264
##
## [[11]]
## [1] 0.03827586
##
## [[12]]
## [1] 0.03850575
##
## [[13]]
## [1] 0.03747126
##
## [[14]]
## [1] 0.03666667
##
## [[15]]
## [1] 0.03862069
##
## [[16]]
## [1] 0.04103448
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## [1] 0.03908046
##
## [[6]]
## [1] 0.04206897
##
## [[7]]
## [1] 0.04321839
##
## [[8]]
## [1] 0.04655172
##
## [[9]]
## [1] 0.04977011
##
## [[10]]
## [1] 0.04551724
##
## [[11]]
## [1] 0.03896552
##
## [[12]]
## [1] 0.03988506
##
## [[13]]
## [1] 0.03632184
##
## [[14]]
## [1] 0.0362069
##
## [[15]]
## [1] 0.0383908
##
## [[16]]
## [1] 0.04022989
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## [1] 0.04218391
##
## [[7]]
## [1] 0.04321839
##
## [[8]]
## [1] 0.04678161
##
## [[9]]
## [1] 0.04931034
##
## [[10]]
## [1] 0.04655172
##
## [[11]]
## [1] 0.04034483
##
## [[12]]
## [1] 0.03942529
##
## [[13]]
## [1] 0.03689655
##
## [[14]]
## [1] 0.03735632
##
## [[15]]
## [1] 0.0383908
##
## [[16]]
## [1] 0.04011494
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## [1] 0.04264368
##
## [[8]]
## [1] 0.04597701
##
## [[9]]
## [1] 0.05011494
##
## [[10]]
## [1] 0.04816092
##
## [[11]]
## [1] 0.04057471
##
## [[12]]
## [1] 0.04045977
##
## [[13]]
## [1] 0.0391954
##
## [[14]]
## [1] 0.03770115
##
## [[15]]
## [1] 0.04103448
##
## [[16]]
## [1] 0.03965517
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## [1] 0.04632184
##
## [[9]]
## [1] 0.05
##
## [[10]]
## [1] 0.04977011
##
## [[11]]
## [1] 0.04045977
##
## [[12]]
## [1] 0.04045977
##
## [[13]]
## [1] 0.04068966
##
## [[14]]
## [1] 0.03896552
##
## [[15]]
## [1] 0.04
##
## [[16]]
## [1] 0.04091954
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## [1] 0.05011494
##
## [[10]]
## [1] 0.05
##
## [[11]]
## [1] 0.04114943
##
## [[12]]
## [1] 0.04137931
##
## [[13]]
## [1] 0.04218391
##
## [[14]]
## [1] 0.03942529
##
## [[15]]
## [1] 0.0408046
##
## [[16]]
## [1] 0.0408046
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## [1] 0.05045977
##
## [[11]]
## [1] 0.04310345
##
## [[12]]
## [1] 0.04183908
##
## [[13]]
## [1] 0.04264368
##
## [[14]]
## [1] 0.04091954
##
## [[15]]
## [1] 0.0408046
##
## [[16]]
## [1] 0.04241379
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## [1] 0.04413793
##
## [[12]]
## [1] 0.04333333
##
## [[13]]
## [1] 0.04390805
##
## [[14]]
## [1] 0.04114943
##
## [[15]]
## [1] 0.04172414
##
## [[16]]
## [1] 0.04195402
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## [1] 0.04356322
##
## [[13]]
## [1] 0.04367816
##
## [[14]]
## [1] 0.0416092
##
## [[15]]
## [1] 0.04275862
##
## [[16]]
## [1] 0.04218391
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## [1] 0.04528736
##
## [[14]]
## [1] 0.04264368
##
## [[15]]
## [1] 0.04367816
##
## [[16]]
## [1] 0.04229885
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## [1] 0.04356322
##
## [[15]]
## [1] 0.04390805
##
## [[16]]
## [1] 0.04310345
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## [1] 0.04494253
##
## [[16]]
## [1] 0.04425287
# Test error of a random forest restricted to the top-x predictors, over
# repeated random 80/20 splits of `v`.
#   x     - index into `best_n`: the forest uses the variables best_n[[x]]
#   mtry  - number of variables tried at each split (default 16, the value
#           that used to be hard-coded)
#   seeds - one split and one forest per seed (default 1:100, as before)
# Returns a numeric vector of misclassification rates, one per seed.
mtry_16 <- function(x, mtry = 16, seeds = 1:100) {
  keep <- c("V1", best_n[[x]])
  errors <- vector("list", length(seeds))
  for (k in seq_along(seeds)) {
    set.seed(seeds[k])
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    fit <- randomForest(V1 ~ ., data = v.tr[, keep], mtry = mtry)
    accuracy <- confusionMatrix(predict(fit, newdata = v.te), v.te$V1)$overall[1]
    errors[[k]] <- 1 - accuracy
  }
  unlist(errors)
}
# Mean error for mtry = 16.  Only slot 16 is filled; slots 1..15 stay NULL.
mtry_16_preds <- vector("list", 16)
mtry_16_preds[[16]] <- mean(mtry_16(16))
mtry_16_preds
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## [1] 0.0445977
# List of the per-mtry result lists (mtry_1_preds .. mtry_16_preds).
mtry_n_preds <- list(
  mtry_1_preds, mtry_2_preds, mtry_3_preds, mtry_4_preds,
  mtry_5_preds, mtry_6_preds, mtry_7_preds, mtry_8_preds,
  mtry_9_preds, mtry_10_preds, mtry_11_preds, mtry_12_preds,
  mtry_13_preds, mtry_14_preds, mtry_15_preds, mtry_16_preds
)

# One column per mtry, one row per number of predictors.  The column for
# mtry = k is padded with k - 1 leading 1000s, a sentinel that marks the
# combinations with no result and keeps them out of the minimum search.
padded_errors <- lapply(seq_along(mtry_n_preds), function(k) {
  c(rep(1000, k - 1), unlist(mtry_n_preds[[k]]))
})
names(padded_errors) <- paste0("mtry", seq_along(padded_errors))
best_mtry_100_seeds <- data.frame(padded_errors)
rownames(best_mtry_100_seeds) <- paste0("nvar_", 1:16)
best_mtry_100_seeds
## mtry1 mtry2 mtry3 mtry4 mtry5
## nvar_1 0.04379310 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2 0.04551724 4.885057e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3 0.04528736 4.609195e-02 4.137931e-02 1.000000e+03 1.000000e+03
## nvar_4 0.04586207 4.885057e-02 4.632184e-02 4.310345e-02 1.000000e+03
## nvar_5 0.04126437 4.379310e-02 4.126437e-02 3.965517e-02 3.908046e-02
## nvar_6 0.04586207 4.804598e-02 4.620690e-02 4.344828e-02 4.206897e-02
## nvar_7 0.04655172 4.724138e-02 4.471264e-02 4.321839e-02 4.321839e-02
## nvar_8 0.05114943 4.873563e-02 4.896552e-02 4.781609e-02 4.655172e-02
## nvar_9 0.05275862 4.448276e-02 4.678161e-02 4.770115e-02 4.977011e-02
## nvar_10 0.05252874 4.298851e-02 4.321839e-02 4.471264e-02 4.551724e-02
## nvar_11 0.05298851 3.988506e-02 3.758621e-02 3.827586e-02 3.896552e-02
## nvar_12 0.05609195 4.068966e-02 3.862069e-02 3.850575e-02 3.988506e-02
## nvar_13 0.05712644 4.091954e-02 3.816092e-02 3.747126e-02 3.632184e-02
## nvar_14 0.05908046 4.080460e-02 3.781609e-02 3.666667e-02 3.620690e-02
## nvar_15 0.06379310 4.045977e-02 3.862069e-02 3.862069e-02 3.839080e-02
## nvar_16 0.06494253 4.505747e-02 4.114943e-02 4.103448e-02 4.022989e-02
## mtry6 mtry7 mtry8 mtry9 mtry10
## nvar_1 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_4 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_5 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_6 4.218391e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_7 4.321839e-02 4.264368e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_8 4.678161e-02 4.597701e-02 4.632184e-02 1.000000e+03 1.000000e+03
## nvar_9 4.931034e-02 5.011494e-02 5.000000e-02 5.011494e-02 1.000000e+03
## nvar_10 4.655172e-02 4.816092e-02 4.977011e-02 5.000000e-02 5.045977e-02
## nvar_11 4.034483e-02 4.057471e-02 4.045977e-02 4.114943e-02 4.310345e-02
## nvar_12 3.942529e-02 4.045977e-02 4.045977e-02 4.137931e-02 4.183908e-02
## nvar_13 3.689655e-02 3.919540e-02 4.068966e-02 4.218391e-02 4.264368e-02
## nvar_14 3.735632e-02 3.770115e-02 3.896552e-02 3.942529e-02 4.091954e-02
## nvar_15 3.839080e-02 4.103448e-02 4.000000e-02 4.080460e-02 4.080460e-02
## nvar_16 4.011494e-02 3.965517e-02 4.091954e-02 4.080460e-02 4.241379e-02
## mtry11 mtry12 mtry13 mtry14 mtry15
## nvar_1 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_4 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_5 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_6 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_7 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_8 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_9 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_10 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_11 4.413793e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_12 4.333333e-02 4.356322e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_13 4.390805e-02 4.367816e-02 4.528736e-02 1.000000e+03 1.000000e+03
## nvar_14 4.114943e-02 4.160920e-02 4.264368e-02 4.356322e-02 1.000000e+03
## nvar_15 4.172414e-02 4.275862e-02 4.367816e-02 4.390805e-02 4.494253e-02
## nvar_16 4.195402e-02 4.218391e-02 4.229885e-02 4.310345e-02 4.425287e-02
## mtry16
## nvar_1 1.00000e+03
## nvar_2 1.00000e+03
## nvar_3 1.00000e+03
## nvar_4 1.00000e+03
## nvar_5 1.00000e+03
## nvar_6 1.00000e+03
## nvar_7 1.00000e+03
## nvar_8 1.00000e+03
## nvar_9 1.00000e+03
## nvar_10 1.00000e+03
## nvar_11 1.00000e+03
## nvar_12 1.00000e+03
## nvar_13 1.00000e+03
## nvar_14 1.00000e+03
## nvar_15 1.00000e+03
## nvar_16 4.45977e-02
# Locate the smallest error in the nvar x mtry table and report its position
# (row = number of predictors, column = mtry) together with the error value.
best_mtry_matrix <- as.matrix(best_mtry_100_seeds)
lowest_rf_error <- min(best_mtry_matrix)
best_rf_model <- data.frame(
  which(best_mtry_matrix == lowest_rf_error, arr.ind = TRUE),
  lowest_rf_error
)
names(best_rf_model) <- c('nvar', 'mtry', 'error')
best_rf_model
## nvar mtry error
## nvar_14 14 5 0.0362069
# Test error of an LDA classifier restricted to the top-x predictors, over
# repeated random 80/20 splits of `v`.
#   x     - index into `best_n`: the model uses the variables best_n[[x]]
#   seeds - one split and one fit per seed (default 1:100, as before)
# Returns a numeric vector of misclassification rates, one per seed.
npreds_lda <- function(x, seeds = 1:100) {
  keep <- c("V1", best_n[[x]])
  errors <- vector("list", length(seeds))
  for (k in seq_along(seeds)) {
    set.seed(seeds[k])
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    # lda() spells its cross-validation switch `CV`, and predict() on an lda
    # fit takes no `type` argument; the original `cv` / `type` had no effect.
    fit <- lda(V1 ~ ., data = v.tr[, keep], CV = FALSE)
    predicted <- predict(fit, v.te)$class
    errors[[k]] <- 1 - confusionMatrix(predicted, v.te$V1)$overall[1]
  }
  unlist(errors)
}
# Mean LDA error for each of the 16 nested predictor sets.
best_npreds_lda <- lapply(1:16, function(n_best) mean(npreds_lda(n_best)))
lda_errors <- unlist(best_npreds_lda)
# Error curve with a reference line 0.2 standard deviations above the minimum.
matplot(1:16, lda_errors, xlab = "best n predictors", ylab ="error", main = 'Best n Predictors for LDA' , pch = 19)
lines(1:16, lda_errors)
abline(h = min(lda_errors) + .2 * sd(lda_errors))
min(lda_errors)
## [1] 0.04413793
# Every predictor-set size that attains the minimum error.
data.frame(n_preds = which(lda_errors == min(lda_errors)), error = min(lda_errors))
## n_preds error
## 1 2 0.04413793
## 2 4 0.04413793
# Boosted trees on the training split.
# NOTE(review): V1 is a factor and the model is fitted with
# distribution = 'gaussian'; the predictions land on a 1-2 scale, which is
# why the cutoffs scanned below run from 1 to 2.  A 0/1 numeric response with
# distribution = 'bernoulli' would be the conventional setup -- confirm
# before changing, since the 1.35 cutoff used later depends on this scale.
v.boost <- gbm(V1 ~ ., data = v.train, distribution = 'gaussian', n.trees = 5000, interaction.depth = 4)
yhat.boost <- predict(v.boost, newdata = v.test, n.trees = 5000)

# Test-set accuracy when predictions above cutoff `x` are labelled class 1.
# The factor is built with the reference levels so that both classes are
# present even when every prediction falls on one side of the cutoff.
v.boostn <- function(x) {
  predicted <- factor(ifelse(yhat.boost > x, 1, 0), levels = levels(v.test$V1))
  as.numeric(caret::confusionMatrix(predicted, v.test$V1)$overall["Accuracy"])
}

# Accuracy for every cutoff from 1 to 2 in steps of 0.05.
cutoffs <- seq(1, 2, .05)
bbb <- as.data.frame(cutoffs)
bbb$accuracy <- vapply(cutoffs, v.boostn, numeric(1))
bbb
## cutoffs accuracy
## 1 1.00 0.7011494
## 2 1.05 0.8160920
## 3 1.10 0.8850575
## 4 1.15 0.9080460
## 5 1.20 0.9195402
## 6 1.25 0.9425287
## 7 1.30 0.9425287
## 8 1.35 0.9425287
## 9 1.40 0.9425287
## 10 1.45 0.9425287
## 11 1.50 0.9310345
## 12 1.55 0.9195402
## 13 1.60 0.9080460
## 14 1.65 0.9080460
## 15 1.70 0.9080460
## 16 1.75 0.9080460
## 17 1.80 0.8965517
## 18 1.85 0.8735632
## 19 1.90 0.8620690
## 20 1.95 0.8505747
## 21 2.00 0.7586207
# Scatter accuracy against cutoff, then join the points with a line.
plot(bbb)
lines(bbb$cutoffs, bbb$accuracy)
# Try gbm across the best-n groups of predictors.  The original comment spoke
# of 5 seeds, but the loop has always run seeds 1:100; that remains the
# default, and a shorter run is available through `seeds` (e.g. seeds = 1:5).
#
# Test error of a gbm model restricted to the top-x predictors, over repeated
# random 80/20 splits of `v`.
#   x      - index into `best_n`: the model uses the variables best_n[[x]]
#   seeds  - one split and one fit per seed
#   cutoff - predictions above this value are labelled class 1 (default 1.35,
#            the value that used to be hard-coded)
# Returns a numeric vector of misclassification rates, one per seed.
npreds_gbm <- function(x, seeds = 1:100, cutoff = 1.35) {
  keep <- c("V1", best_n[[x]])
  errors <- vector("list", length(seeds))
  for (k in seq_along(seeds)) {
    set.seed(seeds[k])
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, keep]
    v.te <- v[-tr, keep]
    fit <- gbm(V1 ~ ., data = v.tr, distribution = 'gaussian', n.trees = 5000, interaction.depth = 4)
    score <- predict(fit, newdata = v.te, n.trees = 5000)
    # Use the reference levels so both classes exist even when every score
    # falls on the same side of the cutoff.
    predicted <- factor(ifelse(score > cutoff, 1, 0), levels = levels(v.te$V1))
    errors[[k]] <- 1 - confusionMatrix(predicted, v.te$V1)$overall[1]
  }
  unlist(errors)
}
# Mean gbm error for each of the 16 nested predictor sets.
best_gbm <- data.frame(
  best_n = 1:16,
  error = vapply(1:16, function(n_best) mean(npreds_gbm(n_best)), numeric(1))
)
# Error curve with a reference line 0.2 standard deviations above the minimum.
matplot(best_gbm$best_n, best_gbm$error, xlab = "best n predictors", ylab ="error", main = 'Best n Predictors for GBM' , pch = 19)
lines(best_gbm$best_n, best_gbm$error)
abline(h = min(best_gbm$error) + .2 * sd(best_gbm$error))
# Row(s) of the table with the smallest error.
best_gbm[which(best_gbm$error == min(best_gbm$error)), ]
## best_n error
## 2 2 0.04678161
#best svm
library(e1071)
# svm linear
# Test error of a cost-tuned linear-kernel SVM restricted to the top-x
# predictors, over repeated random 80/20 splits of `v`.
#   x     - index into `best_n`: the model uses the variables best_n[[x]]
#   seeds - one split and one tuning run per seed (default 1:10, as before)
# Returns a numeric vector of misclassification rates, one per seed.
# The original call also passed a `gamma` vector outside `ranges`; a linear
# kernel has no gamma parameter, so the argument is dropped.
npreds_svm_lin <- function(x, seeds = 1:10) {
  keep <- c("V1", best_n[[x]])
  svm.errors <- vector("list", length(seeds))
  for (k in seq_along(seeds)) {
    set.seed(seeds[k])
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    tuned <- tune(
      svm, V1 ~ .,
      data = v.tr[, keep],
      kernel = "linear",
      ranges = list(cost = c(0.1, 1, 10, 100, 1000))
    )
    predicted <- predict(tuned$best.model, v.te)
    svm.errors[[k]] <- 1 - confusionMatrix(data = predicted, v.te$V1)$overall[1]
  }
  unlist(svm.errors)
}
# Mean linear-SVM error for each of the 16 nested predictor sets.
best_npreds_svm_lin <- lapply(1:16, function(n_best) mean(npreds_svm_lin(n_best)))
# svm radial
# Test error of a tuned radial-kernel SVM restricted to the top-x predictors,
# over repeated random 80/20 splits of `v`.
#   x     - index into `best_n`: the model uses the variables best_n[[x]]
#   seeds - one split and one tuning run per seed (default 1:10, as before)
# Returns a numeric vector of misclassification rates, one per seed.
# The gamma grid used to sit outside `ranges`, so tune() forwarded it
# unchanged to svm() and never searched over it.  It now lives inside
# `ranges`, so cost and gamma are tuned jointly.
npreds_svm_rad <- function(x, seeds = 1:10) {
  keep <- c("V1", best_n[[x]])
  svm.errors <- vector("list", length(seeds))
  for (k in seq_along(seeds)) {
    set.seed(seeds[k])
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    tuned <- tune(
      svm, V1 ~ .,
      data = v.tr[, keep],
      kernel = "radial",
      ranges = list(
        cost = c(0.1, 1, 10, 100, 1000),
        gamma = c(0.5, 1, 2, 3, 4)
      )
    )
    predicted <- predict(tuned$best.model, v.te)
    svm.errors[[k]] <- 1 - confusionMatrix(data = predicted, v.te$V1)$overall[1]
  }
  unlist(svm.errors)
}
# Mean radial-SVM error for each of the 16 nested predictor sets.
best_npreds_svm_rad <- lapply(1:16, function(n_best) mean(npreds_svm_rad(n_best)))
# svm polynomial
# Test error of a tuned polynomial-kernel SVM restricted to the top-x
# predictors, over repeated random 80/20 splits of `v`.
#   x     - index into `best_n`: the model uses the variables best_n[[x]]
#   seeds - one split and one tuning run per seed (default 1:10, as before)
# Returns a numeric vector of misclassification rates, one per seed.
# The gamma grid used to sit outside `ranges`, so tune() forwarded it
# unchanged to svm() and never searched over it.  It now lives inside
# `ranges`, so cost and gamma are tuned jointly.
npreds_svm_pol <- function(x, seeds = 1:10) {
  keep <- c("V1", best_n[[x]])
  svm.errors <- vector("list", length(seeds))
  for (k in seq_along(seeds)) {
    set.seed(seeds[k])
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    tuned <- tune(
      svm, V1 ~ .,
      data = v.tr[, keep],
      kernel = "polynomial",
      ranges = list(
        cost = c(0.1, 1, 10, 100, 1000),
        gamma = c(0.5, 1, 2, 3, 4)
      )
    )
    predicted <- predict(tuned$best.model, v.te)
    svm.errors[[k]] <- 1 - confusionMatrix(data = predicted, v.te$V1)$overall[1]
  }
  unlist(svm.errors)
}
# Mean polynomial-SVM error for each of the 16 nested predictor sets.
best_npreds_svm_pol <- lapply(1:16, function(n_best) mean(npreds_svm_pol(n_best)))
# Mean error per kernel (columns) and number of predictors (rows).
best_svm <- data.frame(
  n_var = 1:16,
  linear = unlist(best_npreds_svm_lin),
  radial = unlist(best_npreds_svm_rad),
  polynomial = unlist(best_npreds_svm_pol)
)

# Work on the three error columns only.  The minimum was previously searched
# over the whole table, n_var column included.
svm_errors <- as.matrix(best_svm[, c("linear", "radial", "polynomial")])
matplot(svm_errors, type = c('b'), pch = 20, col = c(1,2,4), xlab='best n variables', ylab = 'error', main = "Best SVM" )
# Reference line 0.2 standard deviations above the minimum error.
abline(h = min(svm_errors) + .2 * sd(svm_errors))

# Every (n_var, kernel) pair that attains the minimum error.  All tied rows
# receive a kernel name; previously only the first row was translated.
best_pos <- which(svm_errors == min(svm_errors), arr.ind = TRUE)
best_svm_model <- data.frame(
  best_n = best_svm$n_var[best_pos[, "row"]],
  model = colnames(svm_errors)[best_pos[, "col"]],
  error = svm_errors[best_pos],
  stringsAsFactors = FALSE
)
best_svm_model
## best_n model error
## 1 7 radial 0.03448276