Data Mining Final Project

Michal Greenwood, Ashley Mata, Andrew Padro, John Thomas and Manuel Valles
May 13th, 2019

Getting and Preparing the Data Set

# Load the 1984 congressional voting records; V1 is the party, V2-V17 are the 16 votes ('y', 'n', '?')
v<-data.frame(read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header = FALSE))
# Recode the party as a factor: 0 = democrat, 1 = republican
v[1]<-as.factor(ifelse(v[1]=='democrat',0,1))
# 80/20 train/test split
set.seed(1)
train<-sample(nrow(v),nrow(v)*.8)
v.train<-v[train,]
v.test<-v[-train,]
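
A quick sanity check on the split is to compare the party balance in the full data, the training set, and the test set; for example:

# Illustrative check of the party balance (0 = democrat, 1 = republican)
prop.table(table(v$V1))
table(v.train$V1)
table(v.test$V1)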

Libraries

library(caret)
library(dplyr)
library(gbm)
library(randomForest)
library(MASS)

Looking at the Votes Across Parties

There are more concise ways to build these summaries; one compact alternative is sketched just below, followed by the code we actually used.
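
For example, a more compact route to the raw counts is a single sapply over the vote columns. A minimal sketch (party and vote_counts are illustrative names, and each column is coerced to a factor with levels '?', 'n', 'y'):

# Counts of each vote type by party for every bill, in one pass (illustrative sketch)
party <- factor(v$V1, levels = c(0, 1), labels = c("dem", "rep"))
vote_counts <- sapply(v[-1], function(col)
  as.vector(table(party, factor(col, levels = c('?', 'n', 'y')))))
rownames(vote_counts) <- c("dem_?", "rep_?", "dem_n", "rep_n", "dem_y", "rep_y")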

# Number of '?' votes in each column
amount_undecided<-numeric(ncol(v))
for (i in 1:ncol(v)){ amount_undecided[i]<-nrow(v[v[i]=='?',])}

# Of the members who voted '?' on each bill, what percent were from each party (memisc::percent also returns the count)
perc_und_party<- list()
for (i in 1:ncol(v)){ perc_und_party[[i]]<-as.matrix(memisc::percent(v[v[i]=='?',1]))}
perc_und_party<-as.data.frame(perc_und_party)
names(perc_und_party)<-names(v)
rownames(perc_und_party)<-c('Dem?_percent','Rep?_percent','n?')

perc_no_party<- list()
for (i in 1:ncol(v)){ perc_no_party[[i]]<-as.matrix(memisc::percent(v[v[i]=='n',1]))}
perc_no_party<-as.data.frame(perc_no_party)
names(perc_no_party)<-names(v)
rownames(perc_no_party)<-c('DemNO_percent','RepNO_percent','nNO')
perc_yes_party<- list()
for (i in 1:ncol(v)){ perc_yes_party[[i]]<-as.matrix(memisc::percent(v[v[i]=='y',1]))}
perc_yes_party<-as.data.frame(perc_yes_party)
names(perc_yes_party)<-names(v)
rownames(perc_yes_party)<-c('DemYES_percent','RepYES_percent','nYES')

num_und_party<- list()
for (i in 1:ncol(v)){ num_und_party[[i]]<-as.matrix(table(v[v[i]=='?',1]))}
num_und_party<-as.data.frame(num_und_party)
names(num_und_party)<-names(v)
rownames(num_und_party)<-c('Dem?_#','Rep?_#')

num_no_party<- list()
for (i in 1:ncol(v)){ num_no_party[[i]]<-as.matrix(table(v[v[i]=='n',1]))}
num_no_party<-as.data.frame(num_no_party)
names(num_no_party)<-names(v)
rownames(num_no_party)<-c('DemNO_#','RepNO_#')
num_yes_party<- list()
for (i in 1:ncol(v)){ num_yes_party[[i]]<-as.matrix(table(v[v[i]=='y',1]))}
num_yes_party<-as.data.frame(num_yes_party)
names(num_yes_party)<-names(v)
rownames(num_yes_party)<-c('DemYES_#','RepYES_#')

# Did the bill 'pass'? (218 'yes' votes is a majority of the House's 435 seats)
bill_pass<-character(ncol(v))
for (i in 1:ncol(v)) {
  bill_pass[i]<-ifelse(nrow(v[v[i]=='y',])>=218,'pass','no_pass')
}
bill_pass<-t(as.data.frame(bill_pass))
vote_party<-as.matrix(rbind(num_no_party,perc_no_party, num_yes_party, perc_yes_party, num_und_party, perc_und_party))
vote_party<-as.data.frame(rbind(bill_pass,vote_party))
vote_party_pretty<-as.data.frame(t(vote_party[-1]))

n_dem<-NROW(v[v$V1==0,])
n_rep<-NROW(v[v$V1==1,])
# Note: after rbinding the character bill_pass row and transposing, the count columns are factors,
# so this division returns NA (see the PercentDem_No column in the output below)
vote_party_pretty %>% mutate(PercentDem_No = vote_party_pretty$`DemNO_#`/n_dem  )
   bill_pass DemNO_# RepNO_#    DemNO_percent     RepNO_percent nNO
1    no_pass     102     134 43.2203389830508  56.7796610169492 236
2    no_pass     119      73 61.9791666666667  38.0208333333333 192
3       pass      29     142 16.9590643274854  83.0409356725146 171
4    no_pass     245       2 99.1902834008097 0.809716599190283 247
5    no_pass     200       8 96.1538461538462  3.84615384615385 208
6       pass     135      17 88.8157894736842  11.1842105263158 152
7       pass      59     123 32.4175824175824  67.5824175824176 182
8       pass      45     133 25.2808988764045  74.7191011235955 178
9    no_pass      60     146  29.126213592233   70.873786407767 206
10   no_pass     139      73 65.5660377358491  34.4339622641509 212
11   no_pass     126     138 47.7272727272727  52.2727272727273 264
12   no_pass     213      20 91.4163090128755  8.58369098712446 233
13   no_pass     179      22 89.0547263681592  10.9452736318408 201
14      pass     167       3 98.2352941176471  1.76470588235294 170
15   no_pass      91     142 39.0557939914163  60.9442060085837 233
16      pass      12      50 19.3548387096774  80.6451612903226  62
   DemYES_# RepYES_#   DemYES_percent   RepYES_percent nYES Dem?_# Rep?_#
1       156       31 83.4224598930481 16.5775401069519  187      9      3
2       120       75 61.5384615384615 38.4615384615385  195     28     20
3       231       22  91.304347826087 8.69565217391304  253      7      4
4        14      163 7.90960451977401  92.090395480226  177      8      3
5        55      157 25.9433962264151 74.0566037735849  212     12      3
6       123      149 45.2205882352941 54.7794117647059  272      9      2
7       200       39 83.6820083682008 16.3179916317992  239      8      6
8       218       24 90.0826446280992 9.91735537190083  242      4     11
9       188       19 90.8212560386473 9.17874396135266  207     19      3
10      124       92 57.4074074074074 42.5925925925926  216      4      3
11      129       21               86               14  150     12      9
12       36      135 21.0526315789474 78.9473684210526  171     18     13
13       73      136 34.9282296650718 65.0717703349282  209     15     10
14       90      158 36.2903225806452 63.7096774193548  248     10      7
15      160       14 91.9540229885057 8.04597701149425  174     16     12
16      173       96 64.3122676579926 35.6877323420074  269     82     22
       Dem?_percent     Rep?_percent  n? PercentDem_No
1                75               25  12            NA
2  58.3333333333333 41.6666666666667  48            NA
3  63.6363636363636 36.3636363636364  11            NA
4  72.7272727272727 27.2727272727273  11            NA
5                80               20  15            NA
6  81.8181818181818 18.1818181818182  11            NA
7  57.1428571428571 42.8571428571429  14            NA
8  26.6666666666667 73.3333333333333  15            NA
9  86.3636363636364 13.6363636363636  22            NA
10 57.1428571428571 42.8571428571429   7            NA
11 57.1428571428571 42.8571428571429  21            NA
12 58.0645161290323 41.9354838709677  31            NA
13               60               40  25            NA
14 58.8235294117647 41.1764705882353  17            NA
15 57.1428571428571 42.8571428571429  28            NA
16 78.8461538461538 21.1538461538462 104            NA
#View(vote_party_pretty)

Cross-Validated Logistic Regression

ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
# Note: caret::train's method argument defaults to "rf"; to fit a logistic regression explicitly,
# see the method = "glm" sketch below
mod_fit <- train(V1 ~ ., family="binomial", data = v.train, trControl = ctrl, tuneLength = 5)
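
Since the goal here is logistic regression, an explicit specification would look like the sketch below (mod_fit_glm is an illustrative name; the results reported in this section come from the call above as written):

# Explicit cross-validated logistic regression via caret (sketch)
mod_fit_glm <- train(V1 ~ ., data = v.train, method = "glm",
                     family = "binomial", trControl = ctrl)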

pred = predict(mod_fit, newdata=v.test)
confusionMatrix(data=pred, v.test$V1)
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 50  3
         1  3 31

               Accuracy : 0.931           
                 95% CI : (0.8559, 0.9743)
    No Information Rate : 0.6092          
    P-Value [Acc > NIR] : 7.442e-12       

                  Kappa : 0.8552          

 Mcnemar's Test P-Value : 1               

            Sensitivity : 0.9434          
            Specificity : 0.9118          
         Pos Pred Value : 0.9434          
         Neg Pred Value : 0.9118          
             Prevalence : 0.6092          
         Detection Rate : 0.5747          
   Detection Prevalence : 0.6092          
      Balanced Accuracy : 0.9276          

       'Positive' Class : 0               

varImp(mod_fit$finalModel)
        Overall
V2n   0.3076245
V2y   0.1935486
V3n   0.8183339
V3y   0.7024232
V4n   3.0291660
V4y   3.1391322
V5n  37.3727408
V5y  92.4662184
V6n   0.3873454
V6y   0.9557746
V7n   0.3690561
V7y   0.3656961
V8n   0.7579766
V8y   0.6636729
V9n   0.4885352
V9y   0.2193060
V10n  0.4371934
V10y  1.0515425
V11n  0.7635286
V11y  0.8758000
V12n  1.8533688
V12y  4.4335176
V13n  2.6145978
V13y  2.4972471
V14n  0.3765825
V14y  0.7019656
V15n  0.4292976
V15y  0.3709025
V16n  0.6463404
V16y  1.2821558
V17n  0.4721428
V17y  3.2667538

LDA on the Train/Test Split

set.seed(1)
library(MASS)
# Note: MASS::lda has no 'cv' argument (leave-one-out CV is spelled 'CV'), so cv = FALSE is silently ignored
v.lda.train<-lda(V1~., data = v.train, cv = FALSE)
testpredz<-predict(v.lda.train, v.test,type='response')
Test_Predz<-testpredz$class
# Test error rate (1 - accuracy); the 'Accuracy' label printed below is inherited from the named element
1-caret::confusionMatrix(table(Test_Predz, Test_Party = v.test$V1))$overall[1]
  Accuracy 
0.03448276 
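
A k-fold estimate of the same model can also be obtained through caret; a minimal sketch, assuming caret's built-in "lda" method (lda_cv is an illustrative name):

# 10-fold cross-validated LDA accuracy (sketch)
lda_cv <- train(V1 ~ ., data = v.train, method = "lda",
                trControl = trainControl(method = "cv", number = 10))
lda_cv$results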

Random Forest Across Different mtry, all 16 Variables

rf.v<- list()
yhat.rf<-list()
rf.error<-list()   # note: despite the name, this stores test-set accuracy
# Fit a random forest for every possible mtry (1 to 16) and record test-set accuracy
for ( i in 1:16 ) {
  set.seed(24)
  rf.v[[i]]<-randomForest(V1 ~ ., data = v.train, mtry = i, importance = TRUE)
  yhat.rf[[i]]<-predict(rf.v[[i]], newdata = v.test)
  rf.error[[i]]<-caret::confusionMatrix(yhat.rf[[i]], v.test$V1)$overall[1]
}

rf.airz<-data.frame(accuracy=unlist(rf.error))
rownames(rf.airz)<-paste0("mtry=",1:16)

par(mfrow=c(1,1))
matplot(1:16, rf.error, xlab = 'mtry', ylab = 'Accuracy', main = "What # of predictors sampled at each split \n gives the lowest error?")
lines(1:16, rf.error, type = "o")

[Plot: test-set accuracy vs. mtry ("What # of predictors sampled at each split gives the lowest error?")]
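
To pull the winning value out programmatically rather than reading it off the plot, something like the following works (best_mtry is an illustrative name):

# mtry with the highest test-set accuracy
best_mtry <- which.max(unlist(rf.error))
rf.airz[best_mtry, , drop = FALSE]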

Closer Look at the Most Important Variables Across the Best mtry Values (2, 3, 4, and 8)

# Standardize each forest's importance scores so they are comparable, then average across mtry = 2, 3, 4, 8
rf_var_imps<-data.frame(
  scale(varImp(rf.v[[2]])[1]),
  scale(varImp(rf.v[[3]])[1]),
  scale(varImp(rf.v[[4]])[1]),
  scale(varImp(rf.v[[8]])[1])
)

names(rf_var_imps)<-paste0("mtry_",c(2,3,4,8))
avg_var_imp<-apply(rf_var_imps,1,mean)

best_vars<-names(avg_var_imp[order(-avg_var_imp)])

best_n<-list()  #most important variable by descending importances
for(i in 1:16){
  best_n[[i]]<-best_vars[1:i]
}

best_n
[[1]]
[1] "V5"

[[2]]
[1] "V5" "V4"

[[3]]
[1] "V5"  "V4"  "V12"

[[4]]
[1] "V5"  "V4"  "V12" "V13"

[[5]]
[1] "V5"  "V4"  "V12" "V13" "V6" 

[[6]]
[1] "V5"  "V4"  "V12" "V13" "V6"  "V15"

[[7]]
[1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10"

[[8]]
[1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16"

[[9]]
[1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17"

[[10]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9" 

[[11]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 

[[12]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
[12] "V14"

[[13]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
[12] "V14" "V3" 

[[14]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
[12] "V14" "V3"  "V7" 

[[15]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
[12] "V14" "V3"  "V7"  "V11"

[[16]]
 [1] "V5"  "V4"  "V12" "V13" "V6"  "V15" "V10" "V16" "V17" "V9"  "V8" 
[12] "V14" "V3"  "V7"  "V11" "V2" 

Random Forest Across All 16 Possible mtry Values, Using the Best n (1:16) Predictors, With 10 Different Seeds

Once again, there is probably a shorter way to do this, but it involves a lot of repetitive code, so we only show how mtry = 16 was done here; a more general sketch follows the output below.

# Average test error over 10 random 80/20 splits, using the best x predictors and mtry = 16
mtry_16<- function(x){
    errors<-list()
    for( seed in 1:10){
      set.seed(seed)
      tr<-sample(nrow(v),nrow(v)*.8)
      v.tr<-v[tr,]
      v.te<-v[-tr,]

      errors[[seed]]<-1-confusionMatrix(predict(randomForest(V1~., data = v.tr[,c("V1",best_n[[x]])], mtry = 16), newdata=v.te), v.te$V1)$overall[1]

    }

    unlist(errors)
  }
mtry_16_preds<-list()
for( i in 16:16){
  mtry_16_preds[[i]]<-mean(mtry_16(i))
}

mtry_16_preds
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

[[16]]
[1] 0.04137931
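
For reference, the sixteen near-identical mtry_k functions can be collapsed into one helper that takes both the mtry value and the number of predictors. A minimal sketch (mtry_n is an illustrative name, not part of the project code):

# Average test error over 10 random 80/20 splits for a given mtry and the best x predictors (sketch)
mtry_n <- function(mtry, x, n_seeds = 10) {
  errors <- sapply(1:n_seeds, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    v.tr <- v[tr, c("V1", best_n[[x]])]
    v.te <- v[-tr, c("V1", best_n[[x]])]
    fit <- randomForest(V1 ~ ., data = v.tr, mtry = mtry)
    1 - confusionMatrix(predict(fit, newdata = v.te), v.te$V1)$overall[1]
  })
  mean(errors)
}
# e.g. mtry_n(16, 16) corresponds to mean(mtry_16(16)) above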

Organizing the Best rf models

#list of models
mtry_n_preds<-list(mtry_1_preds,mtry_2_preds,mtry_3_preds,
  mtry_4_preds,mtry_5_preds,mtry_6_preds,mtry_7_preds,
  mtry_8_preds,mtry_9_preds,mtry_10_preds,mtry_11_preds,
  mtry_12_preds,mtry_13_preds,mtry_14_preds,mtry_15_preds,
  mtry_16_preds)

# 1000 is a filler value for combinations that were not run (mtry larger than the number of predictors)
best_mtry_10_seeds<-data.frame(
  mtry1=unlist(mtry_1_preds),mtry2=c(1000, unlist(mtry_2_preds)),mtry3=c(rep(1000,2), unlist(mtry_3_preds)),
  mtry4=c(rep(1000,3), unlist(mtry_4_preds)),mtry5=c(rep(1000,4), unlist(mtry_5_preds)),
  mtry6=c(rep(1000,5), unlist(mtry_6_preds)),mtry7=c(rep(1000,6), unlist(mtry_7_preds)),
  mtry8=c(rep(1000,7), unlist(mtry_8_preds)),mtry9=c(rep(1000,8), unlist(mtry_9_preds)),
  mtry10=c(rep(1000,9), unlist(mtry_10_preds)),mtry11=c(rep(1000,10), unlist(mtry_11_preds)),
  mtry12=c(rep(1000,11), unlist(mtry_12_preds)),mtry13=c(rep(1000,12), unlist(mtry_13_preds)),
  mtry14=c(rep(1000,13), unlist(mtry_14_preds)),mtry15=c(rep(1000,14), unlist(mtry_15_preds)),
  mtry16=c(rep(1000,15), unlist(mtry_16_preds))
)

rownames(best_mtry_10_seeds)<-paste0("nvar_",1:16)

best_mtry_10_seeds
             mtry1        mtry2        mtry3        mtry4        mtry5
nvar_1  0.03563218 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_2  0.03908046 4.137931e-02 1.000000e+03 1.000000e+03 1.000000e+03
nvar_3  0.03678161 3.678161e-02 2.988506e-02 1.000000e+03 1.000000e+03
nvar_4  0.03908046 4.137931e-02 3.793103e-02 3.218391e-02 1.000000e+03
nvar_5  0.03218391 3.448276e-02 3.448276e-02 3.333333e-02 3.333333e-02
nvar_6  0.03678161 4.022989e-02 3.678161e-02 3.678161e-02 3.333333e-02
nvar_7  0.03793103 3.908046e-02 4.022989e-02 3.793103e-02 3.793103e-02
nvar_8  0.04022989 4.022989e-02 4.827586e-02 4.482759e-02 4.367816e-02
nvar_9  0.04252874 4.022989e-02 4.137931e-02 4.482759e-02 4.827586e-02
nvar_10 0.04482759 3.793103e-02 4.252874e-02 4.482759e-02 4.252874e-02
nvar_11 0.04252874 3.448276e-02 3.333333e-02 3.448276e-02 3.333333e-02
nvar_12 0.04137931 3.678161e-02 3.448276e-02 3.448276e-02 3.908046e-02
nvar_13 0.03908046 3.678161e-02 3.678161e-02 3.678161e-02 3.563218e-02
nvar_14 0.04597701 3.678161e-02 3.793103e-02 3.563218e-02 3.678161e-02
nvar_15 0.04942529 3.448276e-02 3.448276e-02 3.563218e-02 3.678161e-02
nvar_16 0.05057471 3.908046e-02 3.678161e-02 3.793103e-02 3.678161e-02
               mtry6        mtry7        mtry8        mtry9       mtry10
nvar_1  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_2  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_3  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_4  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_5  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_6  3.218391e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_7  3.793103e-02 3.563218e-02 1.000000e+03 1.000000e+03 1.000000e+03
nvar_8  4.482759e-02 4.597701e-02 4.597701e-02 1.000000e+03 1.000000e+03
nvar_9  4.942529e-02 5.057471e-02 5.057471e-02 5.172414e-02 1.000000e+03
nvar_10 4.482759e-02 4.712644e-02 4.827586e-02 5.057471e-02 5.057471e-02
nvar_11 3.908046e-02 3.908046e-02 4.137931e-02 3.908046e-02 4.252874e-02
nvar_12 3.908046e-02 3.793103e-02 3.908046e-02 4.022989e-02 4.252874e-02
nvar_13 3.678161e-02 3.908046e-02 3.793103e-02 3.908046e-02 4.252874e-02
nvar_14 3.678161e-02 3.678161e-02 3.793103e-02 3.793103e-02 3.908046e-02
nvar_15 3.678161e-02 3.908046e-02 3.678161e-02 3.678161e-02 3.678161e-02
nvar_16 3.563218e-02 3.563218e-02 3.678161e-02 3.563218e-02 3.678161e-02
              mtry11       mtry12       mtry13       mtry14       mtry15
nvar_1  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_2  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_3  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_4  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_5  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_6  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_7  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_8  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_9  1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_10 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_11 4.482759e-02 1.000000e+03 1.000000e+03 1.000000e+03 1.000000e+03
nvar_12 4.712644e-02 4.712644e-02 1.000000e+03 1.000000e+03 1.000000e+03
nvar_13 4.137931e-02 4.137931e-02 4.712644e-02 1.000000e+03 1.000000e+03
nvar_14 3.793103e-02 4.022989e-02 4.482759e-02 4.367816e-02 1.000000e+03
nvar_15 3.793103e-02 3.793103e-02 4.137931e-02 4.137931e-02 4.022989e-02
nvar_16 3.678161e-02 3.563218e-02 3.908046e-02 4.022989e-02 4.137931e-02
              mtry16
nvar_1  1.000000e+03
nvar_2  1.000000e+03
nvar_3  1.000000e+03
nvar_4  1.000000e+03
nvar_5  1.000000e+03
nvar_6  1.000000e+03
nvar_7  1.000000e+03
nvar_8  1.000000e+03
nvar_9  1.000000e+03
nvar_10 1.000000e+03
nvar_11 1.000000e+03
nvar_12 1.000000e+03
nvar_13 1.000000e+03
nvar_14 1.000000e+03
nvar_15 1.000000e+03
nvar_16 4.137931e-02
best_mtry_matrix<-as.matrix(best_mtry_10_seeds)
best_rf_model<-data.frame(which(best_mtry_matrix == min(best_mtry_matrix), arr.ind = TRUE), min(best_mtry_matrix))
names(best_rf_model)<-c('nvar','mtry','error')

And our best random forest goes to:

best_rf_model
       nvar mtry      error
nvar_3    3    3 0.02988506

Now, using 10 different seeds, we evaluate LDA across the best n predictors.

# Average test error of LDA over 10 random 80/20 splits, using the best x predictors
npreds_lda<-
  function(x){
    errors<-list()
    for( seed in 1:10){
      set.seed(seed)
      tr<-sample(nrow(v),nrow(v)*.8)
      v.tr<-v[tr,]
      v.te<-v[-tr,]


      errors[[seed]]<-1-confusionMatrix(predict(lda(V1~., data = v.tr[,c("V1",best_n[[x]])] , cv = FALSE), v.te, type='response')$class, v.te$V1)$overall[1]

    }

    unlist(errors)
  }
best_npreds_lda<-list()
for( i in 1:16){
  best_npreds_lda[[i]]<-mean(npreds_lda(i))
}


matplot(1:16, best_npreds_lda, xlab = "best n predictors", ylab ="error", main = 'Best n Predictors for LDA' , pch = 19)
lines(1:16, best_npreds_lda)
abline(h = min(unlist(best_npreds_lda))+.2*sd(unlist(best_npreds_lda)))

[Plot: Best n Predictors for LDA (error vs. number of best predictors)]

min(unlist(best_npreds_lda))
[1] 0.03678161
data.frame(n_preds = which(as.matrix(best_npreds_lda)== min(unlist(best_npreds_lda)), arr.ind = TRUE)[,1], error =  min(unlist(best_npreds_lda)) )
    n_preds      error
row       3 0.03678161

Now let's attempt Gradient Boosting

# With distribution = 'gaussian', the 0/1 party factor ends up modelled on its underlying codes
# (1 and 2), so the fitted values fall roughly between 1 and 2; the cutoff grid below reflects that.
v.boost<-gbm(V1~., data = v.train, distribution = 'gaussian',n.trees=5000,interaction.depth=4)
yhat.boost=predict(v.boost ,newdata =v.test, n.trees=5000)

# Accuracy of classifying as republican (1) whenever the boosted prediction exceeds the cutoff x
v.boostn<-function(x) {
  as.numeric(caret::confusionMatrix(as.factor(ifelse(yhat.boost> x , 1, 0)), v.test$V1)[[3]][1])
}

cutoffs<-seq(1,2,.05)
bbb<-as.data.frame(cutoffs)
bbb$accuracy<-apply(bbb,1,v.boostn)
bbb
   cutoffs  accuracy
1     1.00 0.6436782
2     1.05 0.8160920
3     1.10 0.9080460
4     1.15 0.9310345
5     1.20 0.9310345
6     1.25 0.9310345
7     1.30 0.9425287
8     1.35 0.9425287
9     1.40 0.9425287
10    1.45 0.9425287
11    1.50 0.9080460
12    1.55 0.9080460
13    1.60 0.9080460
14    1.65 0.9080460
15    1.70 0.9080460
16    1.75 0.9080460
17    1.80 0.9080460
18    1.85 0.8735632
19    1.90 0.8735632
20    1.95 0.8505747
21    2.00 0.7471264
plot(bbb)
lines(bbb[,1],bbb[,2])

[Plot: GBM test accuracy vs. classification cutoff]
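
Since the response is binary, a more natural gbm setup would be distribution = 'bernoulli' on a numeric 0/1 response, which returns predicted probabilities and makes a 0.5 cutoff the obvious default. A minimal sketch (not what was run above; v.train.num, v.boost.b, and phat are illustrative names):

# Bernoulli boosting on the party coded as numeric 0/1 (sketch)
v.train.num <- transform(v.train, V1 = as.numeric(as.character(V1)))
v.boost.b <- gbm(V1 ~ ., data = v.train.num, distribution = "bernoulli",
                 n.trees = 5000, interaction.depth = 4)
phat <- predict(v.boost.b, newdata = v.test, n.trees = 5000, type = "response")
confusionMatrix(as.factor(ifelse(phat > 0.5, 1, 0)), v.test$V1)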

Best n predictors for GBM, averaged over 5 seeds (100 seeds would take too long to run).

# Average test error of GBM over 5 random 80/20 splits, using the best x predictors and a 1.35 cutoff
npreds_gbm<-
  function(x){
    errors<-list()
    for( seed in 1:5){
      set.seed(seed)
      tr<-sample(nrow(v),nrow(v)*.8)
      v.tr<-v[tr,c("V1",best_n[[x]])]
      v.te<-v[-tr,c("V1",best_n[[x]])]
      errors[[seed]]<- 1-(confusionMatrix(as.factor(ifelse(predict(gbm(V1~., data = v.tr, distribution = 'gaussian', n.trees=5000, interaction.depth = 4 ) ,newdata = v.te, n.trees=5000)> 1.35 , 1, 0)), v.te$V1)[[3]][1])
    }
    unlist(errors)
  }
best_gbm<-data.frame(best_n = 1:16,
                     error = sapply(1:16, function(i) mean(npreds_gbm(i))))

matplot(best_gbm$best_n, best_gbm$error, xlab = "best n predictors", ylab ="error", main = 'Best n Predictors for GBM' , pch = 19)
lines(best_gbm$best_n, best_gbm$error)
abline(h = min(unlist( best_gbm[2]))+.2*sd(unlist( best_gbm[2])))

[Plot: Best n Predictors for GBM (error vs. number of best predictors)]

best_gbm[which(best_gbm$error== min(best_gbm$error), arr.ind = TRUE),]
  best_n      error
2      2 0.04597701
3      3 0.04597701

Our Best SVM is as follows:

#best svm

library(e1071)
#svm linear
npreds_svm_lin<-
  function(x){
    svm.tune<-list()
    predinhos<-list()
    svm.errors<-list()
    for( seed in 1:10){
      set.seed(seed)
      tr<-sample(nrow(v),nrow(v)*.8)
      v.tr<-v[tr,]
      v.te<-v[-tr,]
      svm.tune[[seed]] <- tune(svm, V1~., data = v.tr[,c("V1",best_n[[x]])] ,
                       kernel = "linear",
                       ranges = list(cost = c(0.1,1,10,100,1000)),
                       gamma = c(0.5,1,2,3,4))
      predinhos[[seed]]<-predict(svm.tune[[seed]]$best.model, v.te)
      svm.errors[[seed]]<- 1-confusionMatrix(data = predinhos[[seed]], v.te$V1)$overall[1]
    }
    unlist(svm.errors)
  }

best_npreds_svm_lin<-list()
for( i in 1:16){
    best_npreds_svm_lin[[i]]<-mean(npreds_svm_lin(i))
}
#svm radial
npreds_svm_rad<-
  function(x){
    svm.tune<-list()
    predinhos<-list()
    svm.errors<-list()
    for( seed in 1:10){
      set.seed(seed)
      tr<-sample(nrow(v),nrow(v)*.8)
      v.tr<-v[tr,]
      v.te<-v[-tr,]
      # Note: gamma sits outside 'ranges', so it is not part of the tuned grid;
      # move it inside ranges = list(...) to actually search over gamma
      svm.tune[[seed]] <- tune(svm, V1~., data = v.tr[,c("V1",best_n[[x]])] ,
                       kernel = "radial",
                       ranges = list(cost = c(0.1,1,10,100,1000)),
                       gamma = c(0.5,1,2,3,4))
      predinhos[[seed]]<-predict(svm.tune[[seed]]$best.model, v.te)
      svm.errors[[seed]]<- 1-confusionMatrix(data = predinhos[[seed]], v.te$V1)$overall[1]
    }
    unlist(svm.errors)
  }

best_npreds_svm_rad<-list()
for( i in 1:16){
    best_npreds_svm_rad[[i]]<-mean(npreds_svm_rad(i))
}
#svm polynomial

npreds_svm_pol<-
  function(x){
    svm.tune<-list()
    predinhos<-list()
    svm.errors<-list()
    for( seed in 1:10){
      set.seed(seed)
      tr<-sample(nrow(v),nrow(v)*.8)
      v.tr<-v[tr,]
      v.te<-v[-tr,]
      svm.tune[[seed]] <- tune(svm, V1~., data = v.tr[,c("V1",best_n[[x]])] ,
                       kernel = "polynomial",
                       ranges = list(cost = c(0.1,1,10,100,1000)),
                       gamma = c(0.5,1,2,3,4))
      predinhos[[seed]]<-predict(svm.tune[[seed]]$best.model, v.te)
      svm.errors[[seed]]<- 1-confusionMatrix(data = predinhos[[seed]], v.te$V1)$overall[1]
    }
    unlist(svm.errors)
  }

best_npreds_svm_pol<-list()
for( i in 1:16){
    best_npreds_svm_pol[[i]]<-mean(npreds_svm_pol(i))
}
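
The three kernel-specific functions above differ only in the kernel argument, so a single helper could replace them. A minimal sketch (npreds_svm is an illustrative name; gamma is left at its e1071 default here, or could be added to ranges to tune it):

# One tuning/evaluation helper parameterized by kernel (sketch)
npreds_svm <- function(x, kernel) {
  errors <- sapply(1:10, function(seed) {
    set.seed(seed)
    tr <- sample(nrow(v), nrow(v) * .8)
    fit <- tune(svm, V1 ~ ., data = v[tr, c("V1", best_n[[x]])], kernel = kernel,
                ranges = list(cost = c(0.1, 1, 10, 100, 1000)))
    1 - confusionMatrix(predict(fit$best.model, v[-tr, ]), v[-tr, "V1"])$overall[1]
  })
  mean(errors)
}
# e.g. sapply(1:16, npreds_svm, kernel = "radial")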
best_svm<-data.frame( n_var = 1:16,
linear = unlist(best_npreds_svm_lin),
radial = unlist(best_npreds_svm_rad),
polynomial = unlist(best_npreds_svm_pol)
)

matplot(as.matrix(best_svm)[,2:4], type = c('b'), pch = 20, col = c(1,2,4), xlab='best n variables', ylab = 'error', main = "Best SVM" )
abline(h=min(as.matrix(best_svm)[,2:4]) + .2*sd(as.matrix(best_svm)[,2:4]))

[Plot: Best SVM (error vs. number of best variables for the linear, radial, and polynomial kernels)]

best_svm_model= data.frame(which(as.matrix(best_svm)==min(as.matrix(best_svm)), arr.ind = TRUE), best_svm[which(as.matrix(best_svm)==min(as.matrix(best_svm)), arr.ind = TRUE)])

names(best_svm_model)<-c("best_n", "model","error")
best_svm_model[1,2]<-names(best_svm)[best_svm_model[1,2]]

best_svm_model
  best_n  model      error
1      7 radial 0.03448276

Conclusion

Ultimately, the random forest model with mtry = 3 and the best 3 predictors gave us the lowest error rate, 2.99%. It outperformed all the other models we tried and was the best at predicting which political party a member of Congress belonged to, based on their voting record.