getting and fixing data set

# Load the 1984 House voting records: column V1 is the party label, the
# remaining columns hold the votes coded 'y', 'n' or '?'.
# stringsAsFactors = TRUE keeps the vote columns as factors on R >= 4.0, where
# read.csv() otherwise returns character columns; the randomForest() and gbm()
# fits later in this file do not accept character predictors.
v <- read.csv(
  'https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data',
  header = FALSE,
  stringsAsFactors = TRUE
)
# Recode the party label as a two-level factor: 'democrat' -> 0, otherwise 1.
v$V1 <- as.factor(ifelse(v$V1 == 'democrat', 0, 1))

# Reproducible 80/20 train/test split on row indices.
set.seed(1)
train <- sample(nrow(v), floor(nrow(v) * .8))
v.train <- v[train, ]
v.test <- v[-train, ]

libraries

## Warning: package 'caret' was built under R version 3.5.3

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: package 'gbm' was built under R version 3.5.3

## Loaded gbm 2.1.5

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(caret)
library(dplyr)
library(gbm)
library(randomForest)
library(MASS)

looking at the votes across parties

There are superior ways to do this, but this is what I did.

# Looking at the votes across parties.
# For every column of v, the rows are filtered by vote code ('?', 'n', 'y')
# and the party labels (column 1: 0 = democrat, 1 = other) of the matching rows
# are summarised either as percentages (memisc::percent) or as counts (table).

# Number of '?' entries per column.
amount_undecided <- vapply(
  seq_len(ncol(v)),
  function(j) nrow(v[v[j] == '?', ]),
  integer(1)
)

# Party percentages among '?' votes.
perc_und_party <- as.data.frame(lapply(
  seq_len(ncol(v)),
  function(j) as.matrix(memisc::percent(v[v[j] == '?', 1]))
))
names(perc_und_party) <- names(v)
rownames(perc_und_party) <- c('Dem?_percent', 'Rep?_percent', 'n?')

# Party percentages among 'n' votes.
perc_no_party <- as.data.frame(lapply(
  seq_len(ncol(v)),
  function(j) as.matrix(memisc::percent(v[v[j] == 'n', 1]))
))
names(perc_no_party) <- names(v)
rownames(perc_no_party) <- c('DemNO_percent', 'RepNO_percent', 'nNO')

# Party percentages among 'y' votes.
perc_yes_party <- as.data.frame(lapply(
  seq_len(ncol(v)),
  function(j) as.matrix(memisc::percent(v[v[j] == 'y', 1]))
))
names(perc_yes_party) <- names(v)
rownames(perc_yes_party) <- c('DemYES_percent', 'RepYES_percent', 'nYES')

# Party counts among '?' votes.
num_und_party <- as.data.frame(lapply(
  seq_len(ncol(v)),
  function(j) as.matrix(table(v[v[j] == '?', 1]))
))
names(num_und_party) <- names(v)
rownames(num_und_party) <- c('Dem?_#', 'Rep?_#')

# Party counts among 'n' votes.
num_no_party <- as.data.frame(lapply(
  seq_len(ncol(v)),
  function(j) as.matrix(table(v[v[j] == 'n', 1]))
))
names(num_no_party) <- names(v)
rownames(num_no_party) <- c('DemNO_#', 'RepNO_#')

# Party counts among 'y' votes.
num_yes_party <- as.data.frame(lapply(
  seq_len(ncol(v)),
  function(j) as.matrix(table(v[v[j] == 'y', 1]))
))
names(num_yes_party) <- names(v)
rownames(num_yes_party) <- c('DemYES_#', 'RepYES_#')

# Label each column 'pass' when it has at least 218 'y' rows, else 'no_pass'.
# (218 is presumably the simple majority of the full House - confirm.)
# The vector is preallocated as character; the stray `ncol` token that sat at
# the top of the original loop body was a no-op and has been removed.
bill_pass <- character(ncol(v))
for (i in seq_len(ncol(v))) {
  bill_pass[i] <- ifelse(nrow(v[v[i] == 'y', ]) >= 218, 'pass', 'no_pass')
}
bill_pass <- t(as.data.frame(bill_pass))

# Stack the count/percentage tables under the pass flag, then transpose so that
# each vote column becomes a row; the first column (the party label) is dropped.
vote_party <- as.matrix(rbind(
  num_no_party, perc_no_party,
  num_yes_party, perc_yes_party,
  num_und_party, perc_und_party
))
vote_party <- as.data.frame(rbind(bill_pass, vote_party))
vote_party_pretty <- as.data.frame(t(vote_party[-1]))
vote_party_pretty

##     bill_pass DemNO_# RepNO_#    DemNO_percent     RepNO_percent nNO
## V2    no_pass     102     134 43.2203389830508  56.7796610169492 236
## V3    no_pass     119      73 61.9791666666667  38.0208333333333 192
## V4       pass      29     142 16.9590643274854  83.0409356725146 171
## V5    no_pass     245       2 99.1902834008097 0.809716599190283 247
## V6    no_pass     200       8 96.1538461538462  3.84615384615385 208
## V7       pass     135      17 88.8157894736842  11.1842105263158 152
## V8       pass      59     123 32.4175824175824  67.5824175824176 182
## V9       pass      45     133 25.2808988764045  74.7191011235955 178
## V10   no_pass      60     146  29.126213592233   70.873786407767 206
## V11   no_pass     139      73 65.5660377358491  34.4339622641509 212
## V12   no_pass     126     138 47.7272727272727  52.2727272727273 264
## V13   no_pass     213      20 91.4163090128755  8.58369098712446 233
## V14   no_pass     179      22 89.0547263681592  10.9452736318408 201
## V15      pass     167       3 98.2352941176471  1.76470588235294 170
## V16   no_pass      91     142 39.0557939914163  60.9442060085837 233
## V17      pass      12      50 19.3548387096774  80.6451612903226  62
##     DemYES_# RepYES_#   DemYES_percent   RepYES_percent nYES Dem?_# Rep?_#
## V2       156       31 83.4224598930481 16.5775401069519  187      9      3
## V3       120       75 61.5384615384615 38.4615384615385  195     28     20
## V4       231       22  91.304347826087 8.69565217391304  253      7      4
## V5        14      163 7.90960451977401  92.090395480226  177      8      3
## V6        55      157 25.9433962264151 74.0566037735849  212     12      3
## V7       123      149 45.2205882352941 54.7794117647059  272      9      2
## V8       200       39 83.6820083682008 16.3179916317992  239      8      6
## V9       218       24 90.0826446280992 9.91735537190083  242      4     11
## V10      188       19 90.8212560386473 9.17874396135266  207     19      3
## V11      124       92 57.4074074074074 42.5925925925926  216      4      3
## V12      129       21               86               14  150     12      9
## V13       36      135 21.0526315789474 78.9473684210526  171     18     13
## V14       73      136 34.9282296650718 65.0717703349282  209     15     10
## V15       90      158 36.2903225806452 63.7096774193548  248     10      7
## V16      160       14 91.9540229885057 8.04597701149425  174     16     12
## V17      173       96 64.3122676579926 35.6877323420074  269     82     22
##         Dem?_percent     Rep?_percent  n?
## V2                75               25  12
## V3  58.3333333333333 41.6666666666667  48
## V4  63.6363636363636 36.3636363636364  11
## V5  72.7272727272727 27.2727272727273  11
## V6                80               20  15
## V7  81.8181818181818 18.1818181818182  11
## V8  57.1428571428571 42.8571428571429  14
## V9  26.6666666666667 73.3333333333333  15
## V10 86.3636363636364 13.6363636363636  22
## V11 57.1428571428571 42.8571428571429   7
## V12 57.1428571428571 42.8571428571429  21
## V13 58.0645161290323 41.9354838709677  31
## V14               60               40  25
## V15 58.8235294117647 41.1764705882353  17
## V16 57.1428571428571 42.8571428571429  28
## V17 78.8461538461538 21.1538461538462 104

cross validated logistic regression

# Cross-validated logistic regression fitted through caret.
# method = "glm" has to be given explicitly: caret::train() defaults to
# method = "rf", so without it this call fitted a random forest and the
# `family` argument played no part in a logistic fit.
# tuneLength is omitted because the glm model has no tuning parameters.
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
mod_fit <- train(
  V1 ~ .,
  data = v.train,
  method = "glm",
  family = "binomial",
  trControl = ctrl
)
summary(mod_fit)

##                 Length Class      Mode     
## call               5   -none-     call     
## type               1   -none-     character
## predicted        348   factor     numeric  
## err.rate        1500   -none-     numeric  
## confusion          6   -none-     numeric  
## votes            696   matrix     numeric  
## oob.times        348   -none-     numeric  
## classes            2   -none-     character
## importance        32   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                348   factor     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            32   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          2   -none-     character
## param              1   -none-     list

# Score the held-out rows with the fitted model and cross-tabulate the
# predictions against the true party labels.
pred <- predict(mod_fit, newdata = v.test)
confusionMatrix(data = pred, reference = v.test$V1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 50  3
##          1  3 31
##                                           
##                Accuracy : 0.931           
##                  95% CI : (0.8559, 0.9743)
##     No Information Rate : 0.6092          
##     P-Value [Acc > NIR] : 7.442e-12       
##                                           
##                   Kappa : 0.8552          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9434          
##             Specificity : 0.9118          
##          Pos Pred Value : 0.9434          
##          Neg Pred Value : 0.9118          
##              Prevalence : 0.6092          
##          Detection Rate : 0.5747          
##    Detection Prevalence : 0.6092          
##       Balanced Accuracy : 0.9276          
##                                           
##        'Positive' Class : 0               
##

k fold LDA

# LDA fitted once on the training split and scored on the held-out split
# (no cross-validation is performed here).
# The original passed `cv = FALSE` to lda() and `type = 'response'` to
# predict(); neither function has such an argument (lda()'s leave-one-out
# switch is the upper-case `CV`), so both were silently swallowed by `...`
# and are dropped.
set.seed(1)
library(MASS)
v.lda.train <- lda(V1 ~ ., data = v.train)
testpredz <- predict(v.lda.train, v.test)
Test_Predz <- testpredz$class
# Test error rate = 1 - accuracy. The value is relabelled so that it no longer
# prints under the misleading name "Accuracy".
c(error = 1 - unname(
  caret::confusionMatrix(table(Test_Predz, Test_Party = v.test$V1))$overall[1]
))

##   Accuracy 
## 0.03448276

random forest across different mtry, all 16 variables

# Fit one random forest per mtry value (1..16) on the training split and record
# its accuracy on the test split. The seed is reset before every fit so the
# forests differ only in mtry.
rf.v <- vector("list", 16)
yhat.rf <- vector("list", 16)
rf.error <- vector("list", 16)  # NOTE: holds accuracy, not error, despite the name
for (i in 1:16) {
  set.seed(24)
  rf.v[[i]] <- randomForest(V1 ~ ., data = v.train, mtry = i, importance = TRUE)
  yhat.rf[[i]] <- predict(rf.v[[i]], newdata = v.test)
  rf.error[[i]] <- caret::confusionMatrix(yhat.rf[[i]], v.test$V1)$overall[1]
}

rf.airz <- data.frame(accuracy = unlist(rf.error))
rownames(rf.airz) <- paste0("mtry=", 1:16)

# Plot the numeric accuracy vector rather than the list, and word the title to
# match the plotted quantity (accuracy, where higher is better).
par(mfrow = c(1, 1))
matplot(
  1:16, rf.airz$accuracy,
  xlab = 'mtry', ylab = 'Accuracy',
  main = "What # of predictors sampled at each split \n gives the highest accuracy?"
)
lines(1:16, rf.airz$accuracy, type = "o")

closer look at most important variables, across best mtrys, 2:4,8

# Standardised variable importance (first column of varImp()) for the forests
# fitted with mtry = 2, 3, 4 and 8, one column per forest.
rf_var_imps <- do.call(
  data.frame,
  lapply(c(2, 3, 4, 8), function(m) scale(varImp(rf.v[[m]])[1]))
)
names(rf_var_imps) <- paste0("mtry_", c(2, 3, 4, 8))

# Average the standardised importances per predictor and rank them, largest first.
avg_var_imp <- apply(rf_var_imps, 1, mean)
best_vars <- names(avg_var_imp)[order(-avg_var_imp)]

# best_n[[k]] holds the names of the k most important predictors.
best_n <- lapply(1:16, function(k) best_vars[1:k])

best_n

## [[1]]
## [1] "V5"
## 
## [[2]]
## [1] "V5" "V4"
## 
## [[3]]
## [1] "V5"  "V4"  "V12"
## 
## [[4]]
## [1] "V5"  "V4"  "V12" "V13"
## 
## [[5]]
## [1] "V5"  "V4"  "V12" "V13" "V6" 
## 
## [[6]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15"
## 
## [[7]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10"
## 
## [[8]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16"
## 
## [[9]]
## [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17"
## 
## [[10]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9" 
## 
## [[11]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## 
## [[12]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14"
## 
## [[13]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3" 
## 
## [[14]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3"  "V7" 
## 
## [[15]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3"  "V7"  "V11"
## 
## [[16]]
##  [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
## [12] "V14" "V3"  "V7"  "V11" "V2"

do random forest across all 16 possible mtrys, using best n (1:16) predictors, WITH 100 different seeds

## [[1]]
## [1] 0.0437931
## 
## [[2]]
## [1] 0.04551724
## 
## [[3]]
## [1] 0.04528736
## 
## [[4]]
## [1] 0.04586207
## 
## [[5]]
## [1] 0.04126437
## 
## [[6]]
## [1] 0.04586207
## 
## [[7]]
## [1] 0.04655172
## 
## [[8]]
## [1] 0.05114943
## 
## [[9]]
## [1] 0.05275862
## 
## [[10]]
## [1] 0.05252874
## 
## [[11]]
## [1] 0.05298851
## 
## [[12]]
## [1] 0.05609195
## 
## [[13]]
## [1] 0.05712644
## 
## [[14]]
## [1] 0.05908046
## 
## [[15]]
## [1] 0.0637931
## 
## [[16]]
## [1] 0.06494253

## [[1]]
## NULL
## 
## [[2]]
## [1] 0.04885057
## 
## [[3]]
## [1] 0.04609195
## 
## [[4]]
## [1] 0.04885057
## 
## [[5]]
## [1] 0.0437931
## 
## [[6]]
## [1] 0.04804598
## 
## [[7]]
## [1] 0.04724138
## 
## [[8]]
## [1] 0.04873563
## 
## [[9]]
## [1] 0.04448276
## 
## [[10]]
## [1] 0.04298851
## 
## [[11]]
## [1] 0.03988506
## 
## [[12]]
## [1] 0.04068966
## 
## [[13]]
## [1] 0.04091954
## 
## [[14]]
## [1] 0.0408046
## 
## [[15]]
## [1] 0.04045977
## 
## [[16]]
## [1] 0.04505747

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## [1] 0.04137931
## 
## [[4]]
## [1] 0.04632184
## 
## [[5]]
## [1] 0.04126437
## 
## [[6]]
## [1] 0.0462069
## 
## [[7]]
## [1] 0.04471264
## 
## [[8]]
## [1] 0.04896552
## 
## [[9]]
## [1] 0.04678161
## 
## [[10]]
## [1] 0.04321839
## 
## [[11]]
## [1] 0.03758621
## 
## [[12]]
## [1] 0.03862069
## 
## [[13]]
## [1] 0.03816092
## 
## [[14]]
## [1] 0.03781609
## 
## [[15]]
## [1] 0.03862069
## 
## [[16]]
## [1] 0.04114943

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## [1] 0.04310345
## 
## [[5]]
## [1] 0.03965517
## 
## [[6]]
## [1] 0.04344828
## 
## [[7]]
## [1] 0.04321839
## 
## [[8]]
## [1] 0.04781609
## 
## [[9]]
## [1] 0.04770115
## 
## [[10]]
## [1] 0.04471264
## 
## [[11]]
## [1] 0.03827586
## 
## [[12]]
## [1] 0.03850575
## 
## [[13]]
## [1] 0.03747126
## 
## [[14]]
## [1] 0.03666667
## 
## [[15]]
## [1] 0.03862069
## 
## [[16]]
## [1] 0.04103448

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## [1] 0.03908046
## 
## [[6]]
## [1] 0.04206897
## 
## [[7]]
## [1] 0.04321839
## 
## [[8]]
## [1] 0.04655172
## 
## [[9]]
## [1] 0.04977011
## 
## [[10]]
## [1] 0.04551724
## 
## [[11]]
## [1] 0.03896552
## 
## [[12]]
## [1] 0.03988506
## 
## [[13]]
## [1] 0.03632184
## 
## [[14]]
## [1] 0.0362069
## 
## [[15]]
## [1] 0.0383908
## 
## [[16]]
## [1] 0.04022989

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## [1] 0.04218391
## 
## [[7]]
## [1] 0.04321839
## 
## [[8]]
## [1] 0.04678161
## 
## [[9]]
## [1] 0.04931034
## 
## [[10]]
## [1] 0.04655172
## 
## [[11]]
## [1] 0.04034483
## 
## [[12]]
## [1] 0.03942529
## 
## [[13]]
## [1] 0.03689655
## 
## [[14]]
## [1] 0.03735632
## 
## [[15]]
## [1] 0.0383908
## 
## [[16]]
## [1] 0.04011494

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## [1] 0.04264368
## 
## [[8]]
## [1] 0.04597701
## 
## [[9]]
## [1] 0.05011494
## 
## [[10]]
## [1] 0.04816092
## 
## [[11]]
## [1] 0.04057471
## 
## [[12]]
## [1] 0.04045977
## 
## [[13]]
## [1] 0.0391954
## 
## [[14]]
## [1] 0.03770115
## 
## [[15]]
## [1] 0.04103448
## 
## [[16]]
## [1] 0.03965517

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## [1] 0.04632184
## 
## [[9]]
## [1] 0.05
## 
## [[10]]
## [1] 0.04977011
## 
## [[11]]
## [1] 0.04045977
## 
## [[12]]
## [1] 0.04045977
## 
## [[13]]
## [1] 0.04068966
## 
## [[14]]
## [1] 0.03896552
## 
## [[15]]
## [1] 0.04
## 
## [[16]]
## [1] 0.04091954

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## [1] 0.05011494
## 
## [[10]]
## [1] 0.05
## 
## [[11]]
## [1] 0.04114943
## 
## [[12]]
## [1] 0.04137931
## 
## [[13]]
## [1] 0.04218391
## 
## [[14]]
## [1] 0.03942529
## 
## [[15]]
## [1] 0.0408046
## 
## [[16]]
## [1] 0.0408046

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## [1] 0.05045977
## 
## [[11]]
## [1] 0.04310345
## 
## [[12]]
## [1] 0.04183908
## 
## [[13]]
## [1] 0.04264368
## 
## [[14]]
## [1] 0.04091954
## 
## [[15]]
## [1] 0.0408046
## 
## [[16]]
## [1] 0.04241379

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## [1] 0.04413793
## 
## [[12]]
## [1] 0.04333333
## 
## [[13]]
## [1] 0.04390805
## 
## [[14]]
## [1] 0.04114943
## 
## [[15]]
## [1] 0.04172414
## 
## [[16]]
## [1] 0.04195402

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## [1] 0.04356322
## 
## [[13]]
## [1] 0.04367816
## 
## [[14]]
## [1] 0.0416092
## 
## [[15]]
## [1] 0.04275862
## 
## [[16]]
## [1] 0.04218391

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## [1] 0.04528736
## 
## [[14]]
## [1] 0.04264368
## 
## [[15]]
## [1] 0.04367816
## 
## [[16]]
## [1] 0.04229885

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## [1] 0.04356322
## 
## [[15]]
## [1] 0.04390805
## 
## [[16]]
## [1] 0.04310345

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## [1] 0.04494253
## 
## [[16]]
## [1] 0.04425287

Once again, there was probably a shorter way to do this, but I'm not going to spend time fixing it right now. It is a lot of code, so I'm just going to show how mtry = 16 was done.

# Test error of a random forest restricted to the top-x predictors
# (best_n[[x]]), one value per seed.
#
# x        index into best_n, i.e. how many of the top-ranked predictors to use
# mtry     predictors sampled at each split (default 16, the original
#          hard-coded value); exposing it lets this one function stand in for
#          per-mtry copies
# n_seeds  number of random 80/20 re-splits of v, seeded 1..n_seeds (default 100)
#
# Returns a numeric vector of length n_seeds holding 1 - accuracy per split.
# Relies on the globals v and best_n.
mtry_16 <- function(x, mtry = 16, n_seeds = 100) {
  errors <- vector("list", n_seeds)
  for (seed in seq_len(n_seeds)) {
    # The seed governs both the split and the forest fitted right after it.
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, c("V1", best_n[[x]])]
    v.te <- v[-tr, ]

    fit <- randomForest(V1 ~ ., data = v.tr, mtry = mtry)
    errors[[seed]] <- 1 - confusionMatrix(predict(fit, newdata = v.te), v.te$V1)$overall[1]
  }

  unlist(errors)
}

# Only the 16-predictor set is evaluated for mtry = 16, so entries 1..15 of the
# list stay NULL.
mtry_16_preds <- list()
for (i in 16:16) {
  mtry_16_preds[[i]] <- mean(mtry_16(i))
}

mtry_16_preds

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## [1] 0.0445977

Organizing best rf models

# Per-mtry lists of mean errors, in mtry order (1..16).
mtry_n_preds <- list(
  mtry_1_preds, mtry_2_preds, mtry_3_preds, mtry_4_preds,
  mtry_5_preds, mtry_6_preds, mtry_7_preds, mtry_8_preds,
  mtry_9_preds, mtry_10_preds, mtry_11_preds, mtry_12_preds,
  mtry_13_preds, mtry_14_preds, mtry_15_preds, mtry_16_preds
)

# One column per mtry, one row per predictor-set size. The first (mtry - 1)
# cells of each column were never evaluated and are filled with the sentinel
# 1000 so that they can never be picked as the minimum.
best_mtry_100_seeds <- do.call(
  data.frame,
  setNames(
    Map(
      function(preds, n_missing) c(rep(1000, n_missing), unlist(preds)),
      mtry_n_preds,
      0:15
    ),
    paste0("mtry", 1:16)
  )
)

rownames(best_mtry_100_seeds) <- paste0("nvar_", 1:16)

best_mtry_100_seeds

##              mtry1        mtry2        mtry3        mtry4        mtry5
## nvar_1  0.04379310 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2  0.04551724 4.885057e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3  0.04528736 4.609195e-02 4.137931e-02 1.000000e+03 1.000000e+03
## nvar_4  0.04586207 4.885057e-02 4.632184e-02 4.310345e-02 1.000000e+03
## nvar_5  0.04126437 4.379310e-02 4.126437e-02 3.965517e-02 3.908046e-02
## nvar_6  0.04586207 4.804598e-02 4.620690e-02 4.344828e-02 4.206897e-02
## nvar_7  0.04655172 4.724138e-02 4.471264e-02 4.321839e-02 4.321839e-02
## nvar_8  0.05114943 4.873563e-02 4.896552e-02 4.781609e-02 4.655172e-02
## nvar_9  0.05275862 4.448276e-02 4.678161e-02 4.770115e-02 4.977011e-02
## nvar_10 0.05252874 4.298851e-02 4.321839e-02 4.471264e-02 4.551724e-02
## nvar_11 0.05298851 3.988506e-02 3.758621e-02 3.827586e-02 3.896552e-02
## nvar_12 0.05609195 4.068966e-02 3.862069e-02 3.850575e-02 3.988506e-02
## nvar_13 0.05712644 4.091954e-02 3.816092e-02 3.747126e-02 3.632184e-02
## nvar_14 0.05908046 4.080460e-02 3.781609e-02 3.666667e-02 3.620690e-02
## nvar_15 0.06379310 4.045977e-02 3.862069e-02 3.862069e-02 3.839080e-02
## nvar_16 0.06494253 4.505747e-02 4.114943e-02 4.103448e-02 4.022989e-02
##                mtry6        mtry7        mtry8        mtry9       mtry10
## nvar_1  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_4  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_5  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_6  4.218391e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_7  4.321839e-02 4.264368e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_8  4.678161e-02 4.597701e-02 4.632184e-02 1.000000e+03 1.000000e+03
## nvar_9  4.931034e-02 5.011494e-02 5.000000e-02 5.011494e-02 1.000000e+03
## nvar_10 4.655172e-02 4.816092e-02 4.977011e-02 5.000000e-02 5.045977e-02
## nvar_11 4.034483e-02 4.057471e-02 4.045977e-02 4.114943e-02 4.310345e-02
## nvar_12 3.942529e-02 4.045977e-02 4.045977e-02 4.137931e-02 4.183908e-02
## nvar_13 3.689655e-02 3.919540e-02 4.068966e-02 4.218391e-02 4.264368e-02
## nvar_14 3.735632e-02 3.770115e-02 3.896552e-02 3.942529e-02 4.091954e-02
## nvar_15 3.839080e-02 4.103448e-02 4.000000e-02 4.080460e-02 4.080460e-02
## nvar_16 4.011494e-02 3.965517e-02 4.091954e-02 4.080460e-02 4.241379e-02
##               mtry11       mtry12       mtry13       mtry14       mtry15
## nvar_1  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_2  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_3  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_4  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_5  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_6  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_7  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_8  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_9  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_10 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_11 4.413793e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_12 4.333333e-02 4.356322e-02 1.000000e+03 1.000000e+03 1.000000e+03
## nvar_13 4.390805e-02 4.367816e-02 4.528736e-02 1.000000e+03 1.000000e+03
## nvar_14 4.114943e-02 4.160920e-02 4.264368e-02 4.356322e-02 1.000000e+03
## nvar_15 4.172414e-02 4.275862e-02 4.367816e-02 4.390805e-02 4.494253e-02
## nvar_16 4.195402e-02 4.218391e-02 4.229885e-02 4.310345e-02 4.425287e-02
##              mtry16
## nvar_1  1.00000e+03
## nvar_2  1.00000e+03
## nvar_3  1.00000e+03
## nvar_4  1.00000e+03
## nvar_5  1.00000e+03
## nvar_6  1.00000e+03
## nvar_7  1.00000e+03
## nvar_8  1.00000e+03
## nvar_9  1.00000e+03
## nvar_10 1.00000e+03
## nvar_11 1.00000e+03
## nvar_12 1.00000e+03
## nvar_13 1.00000e+03
## nvar_14 1.00000e+03
## nvar_15 1.00000e+03
## nvar_16 4.45977e-02

# Locate the smallest mean error in the nvar x mtry grid and report its
# row index (nvar), column index (mtry) and value.
best_mtry_matrix <- as.matrix(best_mtry_100_seeds)
best_rf_model <- setNames(
  data.frame(
    which(best_mtry_matrix == min(best_mtry_matrix), arr.ind = TRUE),
    min(best_mtry_matrix)
  ),
  c('nvar', 'mtry', 'error')
)

and the best Random Forest goes to

best_rf_model

##         nvar mtry     error
## nvar_14   14    5 0.0362069

best n preds lda across different predictors, through 100 seeds

# Test error of LDA restricted to the top-x predictors (best_n[[x]]), one
# value per seed.
#
# x        index into best_n, i.e. how many of the top-ranked predictors to use
# n_seeds  number of random 80/20 re-splits of v, seeded 1..n_seeds (default 100)
#
# Returns a numeric vector of length n_seeds holding 1 - accuracy per split.
# Relies on the globals v and best_n.
#
# The original passed `cv = FALSE` to lda() and `type = 'response'` to
# predict(); neither is an argument of those functions, so both were ignored
# and are dropped.
npreds_lda <- function(x, n_seeds = 100) {
  errors <- vector("list", n_seeds)
  for (seed in seq_len(n_seeds)) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, c("V1", best_n[[x]])]
    v.te <- v[-tr, ]

    fit <- lda(V1 ~ ., data = v.tr)
    errors[[seed]] <- 1 - confusionMatrix(predict(fit, v.te)$class, v.te$V1)$overall[1]
  }

  unlist(errors)
}

# Mean error over all seeds for each predictor-set size 1..16.
best_npreds_lda <- lapply(1:16, function(i) mean(npreds_lda(i)))


# Mean LDA error against the number of top-ranked predictors used.
matplot(
  x = 1:16,
  y = best_npreds_lda,
  xlab = "best n predictors",
  ylab = "error",
  main = 'Best n Predictors for LDA',
  pch = 19
)
lines(x = 1:16, y = best_npreds_lda)
# Reference line at the minimum error plus 0.2 standard deviations.
abline(h = min(unlist(best_npreds_lda)) + .2 * sd(unlist(best_npreds_lda)))

# Smallest mean error over the 16 predictor-set sizes.
min(unlist(best_npreds_lda))

## [1] 0.04413793

# Every predictor-set size whose mean error equals the minimum (ties give
# several rows), shown next to that minimum error.
data.frame(n_preds = which(as.matrix(best_npreds_lda)== min(unlist(best_npreds_lda)), arr.ind = TRUE)[,1], error =  min(unlist(best_npreds_lda)) )

##   n_preds      error
## 1       2 0.04413793
## 2       4 0.04413793

lets try gradient boosting

# Gradient boosting on the training split, scored on the test split.
# NOTE(review): V1 is a factor but distribution = 'gaussian' is a regression
# loss; the predictions presumably sit on the factor's 1/2 coding, which would
# explain why the cutoffs below run from 1 to 2 - confirm.
# NOTE(review): no seed is set before this fit, and the cutoff is selected on
# the test split itself.
v.boost <- gbm(
  V1 ~ .,
  data = v.train,
  distribution = 'gaussian',
  n.trees = 5000,
  interaction.depth = 4
)
yhat.boost <- predict(v.boost, newdata = v.test, n.trees = 5000)

# Test accuracy when predictions above cutoff x are labelled 1, otherwise 0.
v.boostn <- function(x) {
  predicted_party <- as.factor(ifelse(yhat.boost > x, 1, 0))
  cm <- caret::confusionMatrix(predicted_party, v.test$V1)
  as.numeric(cm$overall[1])
}

# Sweep the cutoff and tabulate the resulting accuracy.
cutoffs <- seq(1, 2, .05)
bbb <- as.data.frame(cutoffs)
bbb$accuracy <- vapply(bbb$cutoffs, v.boostn, numeric(1))
bbb

##    cutoffs  accuracy
## 1     1.00 0.7011494
## 2     1.05 0.8160920
## 3     1.10 0.8850575
## 4     1.15 0.9080460
## 5     1.20 0.9195402
## 6     1.25 0.9425287
## 7     1.30 0.9425287
## 8     1.35 0.9425287
## 9     1.40 0.9425287
## 10    1.45 0.9425287
## 11    1.50 0.9310345
## 12    1.55 0.9195402
## 13    1.60 0.9080460
## 14    1.65 0.9080460
## 15    1.70 0.9080460
## 16    1.75 0.9080460
## 17    1.80 0.8965517
## 18    1.85 0.8735632
## 19    1.90 0.8620690
## 20    1.95 0.8505747
## 21    2.00 0.7586207

# Accuracy against cutoff: points first, then a connecting line.
plot(bbb)
lines(bbb$cutoffs, bbb$accuracy)

# Test error of GBM restricted to the top-x predictors (best_n[[x]]), one
# value per seed. An earlier note on this function said 5 seeds, but the loop
# has always run 100; the count is now an explicit argument so a shorter run
# can be requested without editing the body.
#
# x        index into best_n, i.e. how many of the top-ranked predictors to use
# n_seeds  number of random 80/20 re-splits of v, seeded 1..n_seeds (default 100)
# cutoff   predictions above this value are labelled 1, otherwise 0
#          (default 1.35, the original hard-coded value)
#
# Returns a numeric vector of length n_seeds holding 1 - accuracy per split.
# Relies on the globals v and best_n.
npreds_gbm <- function(x, n_seeds = 100, cutoff = 1.35) {
  errors <- vector("list", n_seeds)
  for (seed in seq_len(n_seeds)) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, c("V1", best_n[[x]])]
    v.te <- v[-tr, c("V1", best_n[[x]])]

    fit <- gbm(
      V1 ~ .,
      data = v.tr,
      distribution = 'gaussian',
      n.trees = 5000,
      interaction.depth = 4
    )
    score <- predict(fit, newdata = v.te, n.trees = 5000)
    predicted <- as.factor(ifelse(score > cutoff, 1, 0))
    errors[[seed]] <- 1 - (confusionMatrix(predicted, v.te$V1)[[3]][1])
  }
  unlist(errors)
}



# Mean GBM error over all seeds for each predictor-set size 1..16.
best_gbm <- data.frame(
  best_n = 1:16,
  error = vapply(1:16, function(k) mean(npreds_gbm(k)), numeric(1))
)

# Mean GBM error against the number of top-ranked predictors used.
matplot(
  best_gbm$best_n, best_gbm$error,
  xlab = "best n predictors",
  ylab = "error",
  main = 'Best n Predictors for GBM',
  pch = 19
)
lines(best_gbm$best_n, best_gbm$error)
# Reference line at the minimum error plus 0.2 standard deviations.
abline(h = min(best_gbm$error) + .2 * sd(best_gbm$error))

# Row(s) of best_gbm with the smallest mean error.
best_gbm[which(best_gbm$error == min(best_gbm$error)), ]

##   best_n      error
## 2      2 0.04678161

best svm

#best svm

library(e1071)
#svm linear
# Shared worker for the three kernel-specific functions below.
#
# For seeds 1..n_seeds: re-split v 80/20, tune an SVM with e1071::tune() on the
# top-x predictors (best_n[[x]]) over the grid in `ranges`, and score the best
# model on the held-out rows.
#
# x        index into best_n
# kernel   kernel name passed on to svm()
# ranges   named list of hyper-parameter grids searched by tune()
# n_seeds  number of re-splits (default 10, the original hard-coded value)
#
# Returns a numeric vector of length n_seeds holding 1 - accuracy per split.
# Relies on the globals v and best_n.
.npreds_svm <- function(x, kernel, ranges, n_seeds = 10) {
  svm.errors <- vector("list", n_seeds)
  for (seed in seq_len(n_seeds)) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, c("V1", best_n[[x]])]
    v.te <- v[-tr, ]

    tuned <- tune(svm, V1 ~ ., data = v.tr, kernel = kernel, ranges = ranges)
    predinhos <- predict(tuned$best.model, v.te)
    svm.errors[[seed]] <- 1 - confusionMatrix(data = predinhos, v.te$V1)$overall[1]
  }
  unlist(svm.errors)
}

# The original calls passed `gamma = c(0.5,1,2,3,4)` next to `ranges` instead
# of inside it, so the vector was handed straight to svm() and gamma was never
# part of the tuning grid. gamma now sits inside `ranges` for the kernels that
# use it; the linear kernel has no gamma, so only cost is tuned there.
# NOTE: tuning gamma multiplies the grid size (and run time) by five for the
# radial and polynomial kernels.

#svm linear
npreds_svm_lin <- function(x) {
  .npreds_svm(
    x,
    kernel = "linear",
    ranges = list(cost = c(0.1, 1, 10, 100, 1000))
  )
}

best_npreds_svm_lin <- lapply(1:16, function(i) mean(npreds_svm_lin(i)))

#svm radial
npreds_svm_rad <- function(x) {
  .npreds_svm(
    x,
    kernel = "radial",
    ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4))
  )
}

best_npreds_svm_rad <- lapply(1:16, function(i) mean(npreds_svm_rad(i)))

#svm polynomial
npreds_svm_pol <- function(x) {
  .npreds_svm(
    x,
    kernel = "polynomial",
    ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4))
  )
}

best_npreds_svm_pol <- lapply(1:16, function(i) mean(npreds_svm_pol(i)))


# Mean SVM error per predictor-set size (rows) and kernel (columns).
best_svm <- data.frame(
  n_var = 1:16,
  linear = unlist(best_npreds_svm_lin),
  radial = unlist(best_npreds_svm_rad),
  polynomial = unlist(best_npreds_svm_pol)
)

# One line per kernel, plus a reference line at min error + 0.2 sd.
matplot(
  as.matrix(best_svm)[, 2:4],
  type = c('b'), pch = 20, col = c(1, 2, 4),
  xlab = 'best n variables', ylab = 'error', main = "Best SVM"
)
abline(h = min(as.matrix(best_svm)[, 2:4]) + .2 * sd(as.matrix(best_svm)[, 2:4]))

# Find the best (n, kernel) combination(s).
# The original took the minimum over the whole data frame, n_var column
# included, and only translated the kernel index to a name for the first row.
# Here the search is restricted to the three error columns and every tied row
# gets its kernel name.
best_svm_model <- local({
  svm_errors <- as.matrix(best_svm[, c("linear", "radial", "polynomial")])
  hits <- which(svm_errors == min(svm_errors), arr.ind = TRUE)
  data.frame(
    best_n = best_svm$n_var[hits[, "row"]],
    model = colnames(svm_errors)[hits[, "col"]],
    error = svm_errors[hits],
    stringsAsFactors = FALSE
  )
})

best_svm_model

##   best_n  model      error
## 1      7 radial 0.03448276

Final - Congressional Voting Data

Group Project

May 13, 2019

getting and fixing data set

libraries

looking at the votes across parties

There are superior ways to do this, but this is what I did.

cross validated logistic regression

k fold LDA

random forest across different mtry, all 16 variables

closer look at most important variables, across best mtrys, 2:4,8

do random forest across all 16 possible mtrys, using best n (1:16) predictors, WITH 100 different seeds

Once again, there was probably a shorter way to do this, but I'm not going to spend time fixing it right now. It is a lot of code, so I'm just going to show how mtry = 16 was done.

Organizing best rf models

and the best Random Forest goes to

best n preds lda across different predictors, through 100 seeds

lets try gradient boosting

best svm