getting and fixing data set

# Download the congressional voting records. The file has no header row, so
# read.csv() auto-names the columns (V1 is the party label used below).
# stringsAsFactors = TRUE keeps every column a factor on R >= 4.0 as well, where
# read.csv() would otherwise return character columns; the models further down
# are fitted on factor columns.
v <- read.csv(
  'https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data',
  header = FALSE,
  stringsAsFactors = TRUE
)
# Recode the party label as a two-level factor: democrat -> 0, anything else -> 1.
v$V1 <- as.factor(ifelse(v$V1 == 'democrat', 0, 1))
# Reproducible 80/20 train/test split.
set.seed(1)
train <- sample(nrow(v), nrow(v) * .8)
v.train <- v[train, ]
v.test <- v[-train, ]

libraries

## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
library(dplyr)
library(gbm)
library(randomForest)
library(MASS)

looking at the votes across parties

There are better ways to do this, but this is what I did.

#looking at the votes across parties
# For every column of `v`, summarise the party labels (column 1) of the members
# whose entry in that column is "?", "n" or "y", both as percentages and as raw
# counts, then stack everything into one table with a row per column of `v`.

# Apply `stat_fun` to the party labels of the rows whose entry in column i
# equals `vote`, for every column i of `v`.
#   vote       - value to match in each column ("?", "n" or "y")
#   stat_fun   - summary function applied to the selected party labels
#   row_labels - row names for the resulting data frame
# Returns a data frame with one column per column of `v`.
tabulate_by_party <- function(vote, stat_fun, row_labels) {
  per_column <- lapply(seq_len(ncol(v)), function(i) {
    as.matrix(stat_fun(v[v[i] == vote, 1]))
  })
  out <- as.data.frame(per_column)
  names(out) <- names(v)
  rownames(out) <- row_labels
  out
}

# Number of "?" entries in each column.
amount_undecided <- vapply(
  seq_len(ncol(v)),
  function(i) nrow(v[v[i] == "?", ]),
  integer(1)
)

perc_und_party <- tabulate_by_party("?", memisc::percent, c("Dem?_percent", "Rep?_percent", "n?"))
perc_no_party <- tabulate_by_party("n", memisc::percent, c("DemNO_percent", "RepNO_percent", "nNO"))
perc_yes_party <- tabulate_by_party("y", memisc::percent, c("DemYES_percent", "RepYES_percent", "nYES"))

num_und_party <- tabulate_by_party("?", table, c("Dem?_#", "Rep?_#"))
num_no_party <- tabulate_by_party("n", table, c("DemNO_#", "RepNO_#"))
num_yes_party <- tabulate_by_party("y", table, c("DemYES_#", "RepYES_#"))

# A column is labelled "pass" when it has at least 218 "y" entries.
# NOTE(review): 218 is hard-coded; presumably a simple majority of the chamber.
bill_pass <- vapply(
  seq_len(ncol(v)),
  function(i) if (nrow(v[v[i] == "y", ]) >= 218) "pass" else "no_pass",
  character(1)
)
bill_pass <- t(as.data.frame(bill_pass))

# Stack all summaries, put the pass/no_pass row on top, then drop the first
# column (the party label itself) and transpose so each bill is one row.
vote_party <- as.matrix(rbind(num_no_party, perc_no_party, num_yes_party, perc_yes_party, num_und_party, perc_und_party))
vote_party <- as.data.frame(rbind(bill_pass, vote_party))
vote_party_pretty <- as.data.frame(t(vote_party[-1]))
vote_party_pretty
##     bill_pass DemNO_# RepNO_#    DemNO_percent     RepNO_percent nNO
## V2    no_pass     102     134 43.2203389830508  56.7796610169492 236
## V3    no_pass     119      73 61.9791666666667  38.0208333333333 192
## V4       pass      29     142 16.9590643274854  83.0409356725146 171
## V5    no_pass     245       2 99.1902834008097 0.809716599190283 247
## V6    no_pass     200       8 96.1538461538462  3.84615384615385 208
## V7       pass     135      17 88.8157894736842  11.1842105263158 152
## V8       pass      59     123 32.4175824175824  67.5824175824176 182
## V9       pass      45     133 25.2808988764045  74.7191011235955 178
## V10   no_pass      60     146  29.126213592233   70.873786407767 206
## V11   no_pass     139      73 65.5660377358491  34.4339622641509 212
## V12   no_pass     126     138 47.7272727272727  52.2727272727273 264
## V13   no_pass     213      20 91.4163090128755  8.58369098712446 233
## V14   no_pass     179      22 89.0547263681592  10.9452736318408 201
## V15      pass     167       3 98.2352941176471  1.76470588235294 170
## V16   no_pass      91     142 39.0557939914163  60.9442060085837 233
## V17      pass      12      50 19.3548387096774  80.6451612903226  62
##     DemYES_# RepYES_#   DemYES_percent   RepYES_percent nYES Dem?_# Rep?_#
## V2       156       31 83.4224598930481 16.5775401069519  187      9      3
## V3       120       75 61.5384615384615 38.4615384615385  195     28     20
## V4       231       22  91.304347826087 8.69565217391304  253      7      4
## V5        14      163 7.90960451977401  92.090395480226  177      8      3
## V6        55      157 25.9433962264151 74.0566037735849  212     12      3
## V7       123      149 45.2205882352941 54.7794117647059  272      9      2
## V8       200       39 83.6820083682008 16.3179916317992  239      8      6
## V9       218       24 90.0826446280992 9.91735537190083  242      4     11
## V10      188       19 90.8212560386473 9.17874396135266  207     19      3
## V11      124       92 57.4074074074074 42.5925925925926  216      4      3
## V12      129       21               86               14  150     12      9
## V13       36      135 21.0526315789474 78.9473684210526  171     18     13
## V14       73      136 34.9282296650718 65.0717703349282  209     15     10
## V15       90      158 36.2903225806452 63.7096774193548  248     10      7
## V16      160       14 91.9540229885057 8.04597701149425  174     16     12
## V17      173       96 64.3122676579926 35.6877323420074  269     82     22
##         Dem?_percent     Rep?_percent  n?
## V2                75               25  12
## V3  58.3333333333333 41.6666666666667  48
## V4  63.6363636363636 36.3636363636364  11
## V5  72.7272727272727 27.2727272727273  11
## V6                80               20  15
## V7  81.8181818181818 18.1818181818182  11
## V8  57.1428571428571 42.8571428571429  14
## V9  26.6666666666667 73.3333333333333  15
## V10 86.3636363636364 13.6363636363636  22
## V11 57.1428571428571 42.8571428571429   7
## V12 57.1428571428571 42.8571428571429  21
## V13 58.0645161290323 41.9354838709677  31
## V14               60               40  25
## V15 58.8235294117647 41.1764705882353  17
## V16 57.1428571428571 42.8571428571429  28
## V17 78.8461538461538 21.1538461538462 104

cross validated logistic regression

# Cross-validated logistic regression of party (V1) on all vote columns.
# method = "glm" is required: without an explicit method caret::train() fits
# its default model (a random forest) rather than a binomial GLM, which is why
# the recorded summary below lists randomForest components.
# tuneLength was dropped because a GLM has no tuning parameters.
# NOTE(review): re-knit to refresh the printed output that follows this chunk.
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
mod_fit <- train(
  V1 ~ .,
  data = v.train,
  method = "glm",
  family = "binomial",
  trControl = ctrl
)
summary(mod_fit)
##                 Length Class      Mode     
## call               5   -none-     call     
## type               1   -none-     character
## predicted        348   factor     numeric  
## err.rate        1500   -none-     numeric  
## confusion          6   -none-     numeric  
## votes            696   matrix     numeric  
## oob.times        348   -none-     numeric  
## classes            2   -none-     character
## importance        32   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                348   factor     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            32   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          2   -none-     character
## param              1   -none-     list
# Score the held-out rows with the fitted caret model, then cross-tabulate the
# predicted labels against the true party labels.
pred <- predict(mod_fit, newdata = v.test)
confusionMatrix(data = pred, reference = v.test$V1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 50  3
##          1  3 31
##                                           
##                Accuracy : 0.931           
##                  95% CI : (0.8559, 0.9743)
##     No Information Rate : 0.6092          
##     P-Value [Acc > NIR] : 7.442e-12       
##                                           
##                   Kappa : 0.8552          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9434          
##             Specificity : 0.9118          
##          Pos Pred Value : 0.9434          
##          Neg Pred Value : 0.9118          
##              Prevalence : 0.6092          
##          Detection Rate : 0.5747          
##    Detection Prevalence : 0.6092          
##       Balanced Accuracy : 0.9276          
##                                           
##        'Positive' Class : 0               
## 

LDA on a single train/test split (no k-fold cross-validation is actually performed here)

set.seed(1)
library(MASS)
# Fit LDA on the training split and score the held-out split once.
# The former `cv = FALSE` and `type = 'response'` arguments were removed:
# lda()'s leave-one-out switch is spelled `CV`, and predict() for an lda fit
# has no `type` argument, so neither had any effect.
v.lda.train <- lda(V1 ~ ., data = v.train)
testpredz <- predict(v.lda.train, v.test)
Test_Predz <- testpredz$class
lda_cm <- caret::confusionMatrix(table(Test_Predz, Test_Party = v.test$V1))
# Report the misclassification rate under its own name; previously the value
# 1 - accuracy was printed with the label "Accuracy".
c(Error = 1 - unname(lda_cm$overall["Accuracy"]))
##   Accuracy 
## 0.03448276

random forest across different mtry, all 16 variables

# Fit one random forest per mtry value (1..16) on the training split and record
# its accuracy on the held-out split. The seed is reset before every fit so the
# forests differ only in mtry.
mtry_values <- 1:16
rf.v <- vector("list", length(mtry_values))
yhat.rf <- vector("list", length(mtry_values))
# NOTE: despite its name, rf.error holds accuracy (overall[1] of the confusion
# matrix). The name is kept so existing references to it keep working.
rf.error <- vector("list", length(mtry_values))
for (i in mtry_values) {
  set.seed(24)
  rf.v[[i]] <- randomForest(V1 ~ ., data = v.train, mtry = i, importance = TRUE)
  yhat.rf[[i]] <- predict(rf.v[[i]], newdata = v.test)
  rf.error[[i]] <- caret::confusionMatrix(yhat.rf[[i]], v.test$V1)$overall[1]
}

rf.airz <- data.frame(accuracy = unlist(rf.error))
rownames(rf.airz) <- paste0("mtry=", mtry_values)

# Plot accuracy against mtry. The title now asks about the highest accuracy,
# matching the plotted quantity (it previously asked about the lowest error).
par(mfrow = c(1, 1))
matplot(mtry_values, rf.airz$accuracy, xlab = 'mtry', ylab = 'Accuracy', main = "What # of predictors sampled at each split \n gives the highest accuracy?")
lines(mtry_values, rf.airz$accuracy, type = "o")

A closer look at the most important variables, averaged across the best mtry values (2, 3, 4 and 8)

# Take the first importance column reported by varImp() for the forests fitted
# with mtry 2, 3, 4 and 8, standardise each with scale(), and place them side
# by side (one column per forest, one row per predictor).
top_mtry <- c(2, 3, 4, 8)
scaled_importance <- lapply(top_mtry, function(m) scale(varImp(rf.v[[m]])[1]))
rf_var_imps <- do.call(data.frame, scaled_importance)
names(rf_var_imps) <- paste0("mtry_", top_mtry)

# Mean standardised importance of each predictor across the four forests.
avg_var_imp <- apply(rf_var_imps, 1, mean)

# Predictor names ranked from most to least important.
importance_rank <- order(-avg_var_imp)
best_vars <- names(avg_var_imp)[importance_rank]

# best_n[[k]] holds the k most important predictors, for k = 1..16.
best_n <- lapply(1:16, function(k) best_vars[1:k])

best_n
## [[1]]
## [1] "V5"
## 
## [[2]]
## [1] "V5" "V4"
## 
## [[3]]
## [1] "V5"  "V4"  "V12"
## 
## [[4]]
## [1] "V5"  "V4"  "V12" "V13"
## 
## [[5]]
## [1] "V5"  "V4"  "V12" "V13" "V6" 
## 
## [[6]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15"
## 
## [[7]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10"
## 
## [[8]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16"
## 
## [[9]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17"
## 
## [[10]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9" 
## 
## [[11]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## 
## [[12]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14"
## 
## [[13]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3" 
## 
## [[14]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3"  "V7" 
## 
## [[15]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3"  "V7"  "V11"
## 
## [[16]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3"  "V7"  "V11" "V2"

do random forest across all 16 possible mtrys, using best n (1:16) predictors, WITH 100 different seeds

## [[1]]
## [1] 0.0437931
## 
## [[2]]
## [1] 0.04551724
## 
## [[3]]
## [1] 0.04528736
## 
## [[4]]
## [1] 0.04586207
## 
## [[5]]
## [1] 0.04126437
## 
## [[6]]
## [1] 0.04586207
## 
## [[7]]
## [1] 0.04655172
## 
## [[8]]
## [1] 0.05114943
## 
## [[9]]
## [1] 0.05275862
## 
## [[10]]
## [1] 0.05252874
## 
## [[11]]
## [1] 0.05298851
## 
## [[12]]
## [1] 0.05609195
## 
## [[13]]
## [1] 0.05712644
## 
## [[14]]
## [1] 0.05908046
## 
## [[15]]
## [1] 0.0637931
## 
## [[16]]
## [1] 0.06494253
## [[1]]
## NULL
## 
## [[2]]
## [1] 0.04885057
## 
## [[3]]
## [1] 0.04609195
## 
## [[4]]
## [1] 0.04885057
## 
## [[5]]
## [1] 0.0437931
## 
## [[6]]
## [1] 0.04804598
## 
## [[7]]
## [1] 0.04724138
## 
## [[8]]
## [1] 0.04873563
## 
## [[9]]
## [1] 0.04448276
## 
## [[10]]
## [1] 0.04298851
## 
## [[11]]
## [1] 0.03988506
## 
## [[12]]
## [1] 0.04068966
## 
## [[13]]
## [1] 0.04091954
## 
## [[14]]
## [1] 0.0408046
## 
## [[15]]
## [1] 0.04045977
## 
## [[16]]
## [1] 0.04505747
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## [1] 0.04137931
## 
## [[4]]
## [1] 0.04632184
## 
## [[5]]
## [1] 0.04126437
## 
## [[6]]
## [1] 0.0462069
## 
## [[7]]
## [1] 0.04471264
## 
## [[8]]
## [1] 0.04896552
## 
## [[9]]
## [1] 0.04678161
## 
## [[10]]
## [1] 0.04321839
## 
## [[11]]
## [1] 0.03758621
## 
## [[12]]
## [1] 0.03862069
## 
## [[13]]
## [1] 0.03816092
## 
## [[14]]
## [1] 0.03781609
## 
## [[15]]
## [1] 0.03862069
## 
## [[16]]
## [1] 0.04114943
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## [1] 0.04310345
## 
## [[5]]
## [1] 0.03965517
## 
## [[6]]
## [1] 0.04344828
## 
## [[7]]
## [1] 0.04321839
## 
## [[8]]
## [1] 0.04781609
## 
## [[9]]
## [1] 0.04770115
## 
## [[10]]
## [1] 0.04471264
## 
## [[11]]
## [1] 0.03827586
## 
## [[12]]
## [1] 0.03850575
## 
## [[13]]
## [1] 0.03747126
## 
## [[14]]
## [1] 0.03666667
## 
## [[15]]
## [1] 0.03862069
## 
## [[16]]
## [1] 0.04103448
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## [1] 0.03908046
## 
## [[6]]
## [1] 0.04206897
## 
## [[7]]
## [1] 0.04321839
## 
## [[8]]
## [1] 0.04655172
## 
## [[9]]
## [1] 0.04977011
## 
## [[10]]
## [1] 0.04551724
## 
## [[11]]
## [1] 0.03896552
## 
## [[12]]
## [1] 0.03988506
## 
## [[13]]
## [1] 0.03632184
## 
## [[14]]
## [1] 0.0362069
## 
## [[15]]
## [1] 0.0383908
## 
## [[16]]
## [1] 0.04022989
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## [1] 0.04218391
## 
## [[7]]
## [1] 0.04321839
## 
## [[8]]
## [1] 0.04678161
## 
## [[9]]
## [1] 0.04931034
## 
## [[10]]
## [1] 0.04655172
## 
## [[11]]
## [1] 0.04034483
## 
## [[12]]
## [1] 0.03942529
## 
## [[13]]
## [1] 0.03689655
## 
## [[14]]
## [1] 0.03735632
## 
## [[15]]
## [1] 0.0383908
## 
## [[16]]
## [1] 0.04011494
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## [1] 0.04264368
## 
## [[8]]
## [1] 0.04597701
## 
## [[9]]
## [1] 0.05011494
## 
## [[10]]
## [1] 0.04816092
## 
## [[11]]
## [1] 0.04057471
## 
## [[12]]
## [1] 0.04045977
## 
## [[13]]
## [1] 0.0391954
## 
## [[14]]
## [1] 0.03770115
## 
## [[15]]
## [1] 0.04103448
## 
## [[16]]
## [1] 0.03965517
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## [1] 0.04632184
## 
## [[9]]
## [1] 0.05
## 
## [[10]]
## [1] 0.04977011
## 
## [[11]]
## [1] 0.04045977
## 
## [[12]]
## [1] 0.04045977
## 
## [[13]]
## [1] 0.04068966
## 
## [[14]]
## [1] 0.03896552
## 
## [[15]]
## [1] 0.04
## 
## [[16]]
## [1] 0.04091954
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## [1] 0.05011494
## 
## [[10]]
## [1] 0.05
## 
## [[11]]
## [1] 0.04114943
## 
## [[12]]
## [1] 0.04137931
## 
## [[13]]
## [1] 0.04218391
## 
## [[14]]
## [1] 0.03942529
## 
## [[15]]
## [1] 0.0408046
## 
## [[16]]
## [1] 0.0408046
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## [1] 0.05045977
## 
## [[11]]
## [1] 0.04310345
## 
## [[12]]
## [1] 0.04183908
## 
## [[13]]
## [1] 0.04264368
## 
## [[14]]
## [1] 0.04091954
## 
## [[15]]
## [1] 0.0408046
## 
## [[16]]
## [1] 0.04241379
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## [1] 0.04413793
## 
## [[12]]
## [1] 0.04333333
## 
## [[13]]
## [1] 0.04390805
## 
## [[14]]
## [1] 0.04114943
## 
## [[15]]
## [1] 0.04172414
## 
## [[16]]
## [1] 0.04195402
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## [1] 0.04356322
## 
## [[13]]
## [1] 0.04367816
## 
## [[14]]
## [1] 0.0416092
## 
## [[15]]
## [1] 0.04275862
## 
## [[16]]
## [1] 0.04218391
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## [1] 0.04528736
## 
## [[14]]
## [1] 0.04264368
## 
## [[15]]
## [1] 0.04367816
## 
## [[16]]
## [1] 0.04229885
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## [1] 0.04356322
## 
## [[15]]
## [1] 0.04390805
## 
## [[16]]
## [1] 0.04310345
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## [1] 0.04494253
## 
## [[16]]
## [1] 0.04425287

Once again, there was probably a shorter way to do this, but I'm not going to spend time fixing it right now. It is a lot of code, so I'm only going to show how mtry = 16 was done.

# Test error of a random forest restricted to the x most important predictors
# (best_n[[x]]), evaluated on a fresh 80/20 split for every seed.
#   x     - index into best_n, i.e. how many top predictors to use
#   mtry  - predictors sampled at each split (default 16, the original value)
#   seeds - seeds to iterate over; each seed fixes both the split and the forest
# Returns an unnamed numeric vector of misclassification rates, one per seed.
mtry_16 <- function(x, mtry = 16, seeds = 1:100) {
  vapply(seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]

    fit <- randomForest(V1 ~ ., data = v.tr[, c("V1", best_n[[x]])], mtry = mtry)
    cm <- confusionMatrix(predict(fit, newdata = v.te), v.te$V1)
    # overall[1] is the accuracy; drop its name so the error is not mislabelled.
    1 - unname(cm$overall[1])
  }, numeric(1))
}

# Only slot 16 is filled (mtry = 16 needs all 16 predictors); slots 1..15 stay
# NULL so the list lines up by index with the other mtry_*_preds lists.
mtry_16_preds <- list()
mtry_16_preds[[16]] <- mean(mtry_16(16))

mtry_16_preds
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## [1] 0.0445977

Organizing best rf models

# Collect the per-mtry result lists; element k holds the results for mtry = k.
mtry_n_preds <- list(
  mtry_1_preds, mtry_2_preds, mtry_3_preds, mtry_4_preds,
  mtry_5_preds, mtry_6_preds, mtry_7_preds, mtry_8_preds,
  mtry_9_preds, mtry_10_preds, mtry_11_preds, mtry_12_preds,
  mtry_13_preds, mtry_14_preds, mtry_15_preds, mtry_16_preds
)

# Build a 16 x 16 table: rows = number of predictors used, columns = mtry.
# The list for mtry = k only has values from slot k onwards, so each column is
# front-padded with k - 1 placeholder values of 1000 (large enough that they
# can never be picked as the minimum error).
placeholder <- 1000
error_columns <- lapply(seq_along(mtry_n_preds), function(k) {
  c(rep(placeholder, k - 1), unlist(mtry_n_preds[[k]]))
})
names(error_columns) <- paste0("mtry", seq_along(mtry_n_preds))
best_mtry_100_seeds <- do.call(data.frame, error_columns)

rownames(best_mtry_100_seeds) <- paste0("nvar_", 1:16)

best_mtry_100_seeds
##              mtry1        mtry2        mtry3        mtry4        mtry5
## nvar_1  0.04379310 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2  0.04551724 4.885057e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3  0.04528736 4.609195e-02 4.137931e-02 1.000000e+03 1.000000e+03
## nvar_4  0.04586207 4.885057e-02 4.632184e-02 4.310345e-02 1.000000e+03
## nvar_5  0.04126437 4.379310e-02 4.126437e-02 3.965517e-02 3.908046e-02
## nvar_6  0.04586207 4.804598e-02 4.620690e-02 4.344828e-02 4.206897e-02
## nvar_7  0.04655172 4.724138e-02 4.471264e-02 4.321839e-02 4.321839e-02
## nvar_8  0.05114943 4.873563e-02 4.896552e-02 4.781609e-02 4.655172e-02
## nvar_9  0.05275862 4.448276e-02 4.678161e-02 4.770115e-02 4.977011e-02
## nvar_10 0.05252874 4.298851e-02 4.321839e-02 4.471264e-02 4.551724e-02
## nvar_11 0.05298851 3.988506e-02 3.758621e-02 3.827586e-02 3.896552e-02
## nvar_12 0.05609195 4.068966e-02 3.862069e-02 3.850575e-02 3.988506e-02
## nvar_13 0.05712644 4.091954e-02 3.816092e-02 3.747126e-02 3.632184e-02
## nvar_14 0.05908046 4.080460e-02 3.781609e-02 3.666667e-02 3.620690e-02
## nvar_15 0.06379310 4.045977e-02 3.862069e-02 3.862069e-02 3.839080e-02
## nvar_16 0.06494253 4.505747e-02 4.114943e-02 4.103448e-02 4.022989e-02
##                mtry6        mtry7        mtry8        mtry9       mtry10
## nvar_1  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_4  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_5  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_6  4.218391e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_7  4.321839e-02 4.264368e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_8  4.678161e-02 4.597701e-02 4.632184e-02 1.000000e+03 1.000000e+03
## nvar_9  4.931034e-02 5.011494e-02 5.000000e-02 5.011494e-02 1.000000e+03
## nvar_10 4.655172e-02 4.816092e-02 4.977011e-02 5.000000e-02 5.045977e-02
## nvar_11 4.034483e-02 4.057471e-02 4.045977e-02 4.114943e-02 4.310345e-02
## nvar_12 3.942529e-02 4.045977e-02 4.045977e-02 4.137931e-02 4.183908e-02
## nvar_13 3.689655e-02 3.919540e-02 4.068966e-02 4.218391e-02 4.264368e-02
## nvar_14 3.735632e-02 3.770115e-02 3.896552e-02 3.942529e-02 4.091954e-02
## nvar_15 3.839080e-02 4.103448e-02 4.000000e-02 4.080460e-02 4.080460e-02
## nvar_16 4.011494e-02 3.965517e-02 4.091954e-02 4.080460e-02 4.241379e-02
##               mtry11       mtry12       mtry13       mtry14       mtry15
## nvar_1  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_4  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_5  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_6  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_7  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_8  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_9  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_10 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_11 4.413793e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_12 4.333333e-02 4.356322e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_13 4.390805e-02 4.367816e-02 4.528736e-02 1.000000e+03 1.000000e+03
## nvar_14 4.114943e-02 4.160920e-02 4.264368e-02 4.356322e-02 1.000000e+03
## nvar_15 4.172414e-02 4.275862e-02 4.367816e-02 4.390805e-02 4.494253e-02
## nvar_16 4.195402e-02 4.218391e-02 4.229885e-02 4.310345e-02 4.425287e-02
##              mtry16
## nvar_1  1.00000e+03
## nvar_2  1.00000e+03
## nvar_3  1.00000e+03
## nvar_4  1.00000e+03
## nvar_5  1.00000e+03
## nvar_6  1.00000e+03
## nvar_7  1.00000e+03
## nvar_8  1.00000e+03
## nvar_9  1.00000e+03
## nvar_10 1.00000e+03
## nvar_11 1.00000e+03
## nvar_12 1.00000e+03
## nvar_13 1.00000e+03
## nvar_14 1.00000e+03
## nvar_15 1.00000e+03
## nvar_16 4.45977e-02
# Locate the smallest mean error in the nvar x mtry table and report its
# position (row = number of predictors, column = mtry) alongside the value.
best_mtry_matrix <- as.matrix(best_mtry_100_seeds)
lowest_rf_error <- min(best_mtry_matrix)
best_cell <- which(best_mtry_matrix == lowest_rf_error, arr.ind = TRUE)
best_rf_model <- data.frame(best_cell, lowest_rf_error)
names(best_rf_model) <- c('nvar', 'mtry', 'error')

And the best random forest goes to:

# Print the winning combination: number of predictors, mtry, and its mean error.
best_rf_model
##         nvar mtry     error
## nvar_14   14    5 0.0362069

best n preds lda across different predictors, through 100 seeds

# Test error of LDA restricted to the x most important predictors
# (best_n[[x]]), evaluated on a fresh 80/20 split for every seed.
#   x     - index into best_n, i.e. how many top predictors to use
#   seeds - seeds to iterate over; each seed fixes the train/test split
# Returns an unnamed numeric vector of misclassification rates, one per seed.
# The former `cv = FALSE` and `type = 'response'` arguments were removed: they
# are not arguments of lda() / predict() for lda fits and had no effect.
npreds_lda <- function(x, seeds = 1:100) {
  vapply(seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]

    fit <- lda(V1 ~ ., data = v.tr[, c("V1", best_n[[x]])])
    cm <- confusionMatrix(predict(fit, v.te)$class, v.te$V1)
    # overall[1] is the accuracy; drop its name so the error is not mislabelled.
    1 - unname(cm$overall[1])
  }, numeric(1))
}

# Mean error for each of the 16 predictor sets (kept as a list, as before).
best_npreds_lda <- lapply(1:16, function(i) mean(npreds_lda(i)))

# Plot from a plain numeric vector rather than handing the list to the
# plotting functions.
lda_mean_errors <- unlist(best_npreds_lda)
matplot(1:16, lda_mean_errors, xlab = "best n predictors", ylab = "error", main = 'Best n Predictors for LDA', pch = 19)
lines(1:16, lda_mean_errors)
# Reference line: minimum error plus 0.2 standard deviations.
abline(h = min(lda_mean_errors) + .2 * sd(lda_mean_errors))

min(lda_mean_errors)
## [1] 0.04413793
# Every predictor-set size that attains the minimum mean LDA error. Works on
# the unlisted numeric vector directly instead of routing the list through
# as.matrix() and array indices.
lda_error_by_n <- unlist(best_npreds_lda)
data.frame(n_preds = which(lda_error_by_n == min(lda_error_by_n)), error = min(lda_error_by_n))
##   n_preds      error
## 1       2 0.04413793
## 2       4 0.04413793

Let's try gradient boosting.

# Boosted trees for party (V1) on all vote columns, scored on the held-out split.
# NOTE(review): V1 is a two-level factor fitted with distribution = 'gaussian';
# the 1..2 cutoff grid below suggests gbm regresses on the factor's integer
# codes. distribution = 'bernoulli' on a numeric 0/1 response would be the
# conventional choice. It is left unchanged here because the cutoff scale used
# elsewhere in this file (1.35) depends on the current behaviour.
v.boost <- gbm(V1 ~ ., data = v.train, distribution = 'gaussian', n.trees = 5000, interaction.depth = 4)
yhat.boost <- predict(v.boost, newdata = v.test, n.trees = 5000)

# Test accuracy when predictions above cutoff `x` are labelled 1, else 0.
# The factor is built with the reference's levels so a cutoff that sends every
# prediction to one class still yields a valid two-level factor.
v.boostn <- function(x) {
  predicted <- factor(ifelse(yhat.boost > x, 1, 0), levels = levels(v.test$V1))
  as.numeric(caret::confusionMatrix(predicted, v.test$V1)$overall[1])
}

# Accuracy for each candidate cutoff between 1 and 2.
# NOTE(review): the cutoff is chosen on the same test split used to report
# accuracy, so the reported figure is optimistic.
cutoffs <- seq(1, 2, .05)
bbb <- data.frame(cutoffs = cutoffs)
bbb$accuracy <- vapply(bbb$cutoffs, v.boostn, numeric(1))
bbb
##    cutoffs  accuracy
## 1     1.00 0.7011494
## 2     1.05 0.8160920
## 3     1.10 0.8850575
## 4     1.15 0.9080460
## 5     1.20 0.9195402
## 6     1.25 0.9425287
## 7     1.30 0.9425287
## 8     1.35 0.9425287
## 9     1.40 0.9425287
## 10    1.45 0.9425287
## 11    1.50 0.9310345
## 12    1.55 0.9195402
## 13    1.60 0.9080460
## 14    1.65 0.9080460
## 15    1.70 0.9080460
## 16    1.75 0.9080460
## 17    1.80 0.8965517
## 18    1.85 0.8735632
## 19    1.90 0.8620690
## 20    1.95 0.8505747
## 21    2.00 0.7586207
# Scatter plot of accuracy against cutoff, with the points joined by a line.
plot(bbb[[1]], bbb[[2]], xlab = names(bbb)[1], ylab = names(bbb)[2])
lines(bbb[[1]], bbb[[2]])

# Try boosting across the best-n groups of predictors, averaging the test error
# over repeated 80/20 splits.
# NOTE(review): the original comment said 5 seeds because 100 took too long,
# but the code ran seeds 1:100. The code's behaviour is kept as the default;
# pass `seeds = 1:5` for a faster run.

# Test error of the boosted model restricted to best_n[[x]].
#   x      - index into best_n, i.e. how many top predictors to use
#   seeds  - seeds to iterate over; each seed fixes the split and the fit
#   cutoff - predictions above this value are labelled 1, else 0
# Returns an unnamed numeric vector of misclassification rates, one per seed.
npreds_gbm <- function(x, seeds = 1:100, cutoff = 1.35) {
  keep <- c("V1", best_n[[x]])
  vapply(seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, keep]
    v.te <- v[-tr, keep]
    fit <- gbm(V1 ~ ., data = v.tr, distribution = 'gaussian', n.trees = 5000, interaction.depth = 4)
    score <- predict(fit, newdata = v.te, n.trees = 5000)
    # Use the reference's levels so a one-sided prediction still gives a valid
    # two-level factor.
    predicted <- factor(ifelse(score > cutoff, 1, 0), levels = levels(v.te$V1))
    1 - as.numeric(confusionMatrix(predicted, v.te$V1)$overall[1])
  }, numeric(1))
}

# Mean error for each of the 16 predictor sets.
best_gbm <- data.frame(
  best_n = 1:16,
  error = vapply(1:16, function(k) mean(npreds_gbm(k)), numeric(1))
)

matplot(best_gbm$best_n, best_gbm$error, xlab = "best n predictors", ylab = "error", main = 'Best n Predictors for GBM', pch = 19)
lines(best_gbm$best_n, best_gbm$error)
# Reference line: minimum error plus 0.2 standard deviations.
abline(h = min(best_gbm$error) + .2 * sd(best_gbm$error))

# Row(s) with the lowest mean error.
best_gbm[best_gbm$error == min(best_gbm$error), ]
##   best_n      error
## 2      2 0.04678161

best svm

#best svm

library(e1071)
#svm linear
# Test error of a cost-tuned linear SVM restricted to best_n[[x]], evaluated on
# a fresh 80/20 split for every seed.
#   x     - index into best_n, i.e. how many top predictors to use
#   seeds - seeds to iterate over; each seed fixes the split and the tuning folds
# Returns an unnamed numeric vector of misclassification rates, one per seed.
# The former `gamma = c(...)` argument was removed: it sat outside `ranges`, so
# it was never part of the tuning grid, and a linear kernel does not use gamma.
npreds_svm_lin <- function(x, seeds = 1:10) {
  vapply(seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    tuned <- tune(svm, V1 ~ ., data = v.tr[, c("V1", best_n[[x]])],
                  kernel = "linear",
                  ranges = list(cost = c(0.1, 1, 10, 100, 1000)))
    predicted <- predict(tuned$best.model, v.te)
    # overall[1] is the accuracy; drop its name so the error is not mislabelled.
    1 - unname(confusionMatrix(data = predicted, v.te$V1)$overall[1])
  }, numeric(1))
}

# Mean error for each of the 16 predictor sets (kept as a list, as before).
best_npreds_svm_lin <- lapply(1:16, function(i) mean(npreds_svm_lin(i)))

#svm radial
# Test error of a tuned radial-kernel SVM restricted to best_n[[x]], evaluated
# on a fresh 80/20 split for every seed.
#   x     - index into best_n, i.e. how many top predictors to use
#   seeds - seeds to iterate over; each seed fixes the split and the tuning folds
# Returns an unnamed numeric vector of misclassification rates, one per seed.
# `gamma` now lives inside `ranges`, so tune() searches the cost x gamma grid.
# Previously it was passed outside `ranges`, which hands the whole vector to
# svm() untuned.
# NOTE(review): this enlarges the grid from 5 to 25 candidates, so runtime
# grows and results will differ from previously recorded output; re-knit.
npreds_svm_rad <- function(x, seeds = 1:10) {
  vapply(seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    tuned <- tune(svm, V1 ~ ., data = v.tr[, c("V1", best_n[[x]])],
                  kernel = "radial",
                  ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                gamma = c(0.5, 1, 2, 3, 4)))
    predicted <- predict(tuned$best.model, v.te)
    # overall[1] is the accuracy; drop its name so the error is not mislabelled.
    1 - unname(confusionMatrix(data = predicted, v.te$V1)$overall[1])
  }, numeric(1))
}

# Mean error for each of the 16 predictor sets (kept as a list, as before).
best_npreds_svm_rad <- lapply(1:16, function(i) mean(npreds_svm_rad(i)))


#svm polynomial
# Test error of a tuned polynomial-kernel SVM restricted to best_n[[x]],
# evaluated on a fresh 80/20 split for every seed.
#   x     - index into best_n, i.e. how many top predictors to use
#   seeds - seeds to iterate over; each seed fixes the split and the tuning folds
# Returns an unnamed numeric vector of misclassification rates, one per seed.
# `gamma` now lives inside `ranges`, so tune() searches the cost x gamma grid.
# Previously it was passed outside `ranges`, which hands the whole vector to
# svm() untuned.
# NOTE(review): this enlarges the grid from 5 to 25 candidates, so runtime
# grows and results will differ from previously recorded output; re-knit.
npreds_svm_pol <- function(x, seeds = 1:10) {
  vapply(seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, ]
    v.te <- v[-tr, ]
    tuned <- tune(svm, V1 ~ ., data = v.tr[, c("V1", best_n[[x]])],
                  kernel = "polynomial",
                  ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                gamma = c(0.5, 1, 2, 3, 4)))
    predicted <- predict(tuned$best.model, v.te)
    # overall[1] is the accuracy; drop its name so the error is not mislabelled.
    1 - unname(confusionMatrix(data = predicted, v.te$V1)$overall[1])
  }, numeric(1))
}

# Mean error for each of the 16 predictor sets (kept as a list, as before).
best_npreds_svm_pol <- lapply(1:16, function(i) mean(npreds_svm_pol(i)))


# Mean test error per kernel (columns) and per number of predictors (rows).
best_svm <- data.frame(
  n_var = 1:16,
  linear = unlist(best_npreds_svm_lin),
  radial = unlist(best_npreds_svm_rad),
  polynomial = unlist(best_npreds_svm_pol)
)

# Work on the three error columns only. The minimum was previously taken over
# the whole table, n_var included, which only worked because every error is
# smaller than 1.
svm_errors <- as.matrix(best_svm[, c("linear", "radial", "polynomial")])

matplot(svm_errors, type = c('b'), pch = 20, col = c(1, 2, 4), xlab = 'best n variables', ylab = 'error', main = "Best SVM")
# Reference line: minimum error plus 0.2 standard deviations.
abline(h = min(svm_errors) + .2 * sd(svm_errors))

# Every (n, kernel) cell that attains the minimum error. The kernel name is
# filled in for all tied rows, not just the first one.
best_cells <- which(svm_errors == min(svm_errors), arr.ind = TRUE)
best_svm_model <- data.frame(
  best_n = best_svm$n_var[best_cells[, "row"]],
  model = colnames(svm_errors)[best_cells[, "col"]],
  error = svm_errors[best_cells],
  stringsAsFactors = FALSE
)

best_svm_model
##   best_n  model      error
## 1      7 radial 0.03448276