BB Week 10

Split/Subset Data

# Load the survey sample. stringsAsFactors = TRUE keeps the answers as factors
# (the <fct> columns shown by glimpse()) on R >= 4.0, where read.csv() would
# otherwise return character columns.
covid <- read.csv("covid-19-survey-responses-sample.csv", header = TRUE,
                  stringsAsFactors = TRUE)
# Keep columns 10 through ncol - 2. This is exactly what the earlier
# `12:ncol(covid)-2` selected, because `:` binds tighter than `-`.
covid <- covid[10:(ncol(covid) - 2)]
# Seed the RNG so the 70/30 partition on q03_symptoms is reproducible.
set.seed(1)
intrain <- createDataPartition(y = covid$q03_symptoms, p = 0.7, list = FALSE)
training <- covid[intrain, ]
test <- covid[-intrain, ]
# Row/column counts of each partition.
dim(training)
## [1] 15 11
dim(test)
## [1]  5 11
# TRUE only if some cell is NA. Blank answers (see q05_denied_testing in the
# glimpse() output) appear to be stored as empty strings, so they do not count.
anyNA(covid)
## [1] FALSE
# Compact per-column preview of the retained survey questions.
glimpse(covid)
## Observations: 20
## Variables: 11
## $ q01_exposed               <fct> f, f, f, f, f, f, f, f, f, f, f, f, f, f,...
## $ q02_quarantine            <fct> t, t, t, t, t, f, t, f, t, f, t, f, t, t,...
## $ q03_symptoms              <fct> f, f, f, f, f, f, f, f, f, f, f, f, f, t,...
## $ q04_tested                <fct> no_i_never_needed_it, no_i_never_needed_i...
## $ q05_denied_testing        <fct> , , , , , , , , , , , , , , , i_had_no_kn...
## $ q06_family_positive       <fct> no, no, no, no, no, no, no, no, no, no, n...
## $ q07_keep_med_appt         <fct> no_appointments_were_cancelled, not_appli...
## $ q08_public_transportation <fct> no_never_used_it, no_never_used_it, no_ne...
## $ q09_perfer_med_appts      <fct> at_home_phone_call_video_chat, at_home_ph...
## $ q10_prefer_med_treatments <fct> at_home, at_a_hospital, at_home, at_a_hos...
## $ q11_prefer_labs           <fct> at_home, at_a_hospital, at_home, at_a_hos...

Train/Test/Predict

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
trctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Linear-kernel SVM for q03_symptoms using every other column as a predictor;
# predictors are centred and scaled before fitting.
svm_lin <- caret::train(
  q03_symptoms ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)
svm_lin
## Support Vector Machines with Linear Kernel 
## 
## 15 samples
## 10 predictors
##  2 classes: 'f', 't' 
## 
## Pre-processing: centered (26), scaled (26) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 13, 14, 14, 12, 14, 14, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.8666667  0.5  
## 
## Tuning parameter 'C' was held constant at a value of 1
# Predict the held-out rows with the fitted linear SVM.
test_pred<- predict(svm_lin, newdata=test)
test_pred
## [1] f f f f f
## Levels: f t
# Cross-tabulate predictions (rows) against observed labels (columns).
# NOTE(review): the output reports 'f' as the positive class, so Sensitivity
# describes the 'f' (presumably no-symptom) class; consider positive = "t".
confusionMatrix(table(test_pred,test$q03_symptoms))
## Confusion Matrix and Statistics
## 
##          
## test_pred f t
##         f 4 1
##         t 0 0
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.2836, 0.9949)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.7373          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 1.0             
##             Specificity : 0.0             
##          Pos Pred Value : 0.8             
##          Neg Pred Value : NaN             
##              Prevalence : 0.8             
##          Detection Rate : 0.8             
##    Detection Prevalence : 1.0             
##       Balanced Accuracy : 0.5             
##                                           
##        'Positive' Class : f               
## 

Tune for best model

# Fix the RNG seed before tune() runs its 10-fold cross-validation.
set.seed(1)
# Grid-search cost (and gamma) for a linear-kernel SVM.
# NOTE(review): this tunes on the full `covid` data rather than `training`,
# so the held-out `test` rows take part in model selection - confirm intent.
# NOTE(review): the error is identical across all gamma values in the results
# table; gamma appears to have no effect with kernel="linear".
tune.out<-tune(svm,q03_symptoms~.,data=covid, kernel="linear",
               ranges=list(cost=c(.00001,.0001, .001, .01, 1,10), 
                           gamma=c(1,2,3,4,5))) #get best model with tune
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1     1
## 
## - best performance: 0.15 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-05     1  0.20  0.2581989
## 2  1e-04     1  0.20  0.2581989
## 3  1e-03     1  0.20  0.2581989
## 4  1e-02     1  0.20  0.2581989
## 5  1e+00     1  0.15  0.2415229
## 6  1e+01     1  0.15  0.2415229
## 7  1e-05     2  0.20  0.2581989
## 8  1e-04     2  0.20  0.2581989
## 9  1e-03     2  0.20  0.2581989
## 10 1e-02     2  0.20  0.2581989
## 11 1e+00     2  0.15  0.2415229
## 12 1e+01     2  0.15  0.2415229
## 13 1e-05     3  0.20  0.2581989
## 14 1e-04     3  0.20  0.2581989
## 15 1e-03     3  0.20  0.2581989
## 16 1e-02     3  0.20  0.2581989
## 17 1e+00     3  0.15  0.2415229
## 18 1e+01     3  0.15  0.2415229
## 19 1e-05     4  0.20  0.2581989
## 20 1e-04     4  0.20  0.2581989
## 21 1e-03     4  0.20  0.2581989
## 22 1e-02     4  0.20  0.2581989
## 23 1e+00     4  0.15  0.2415229
## 24 1e+01     4  0.15  0.2415229
## 25 1e-05     5  0.20  0.2581989
## 26 1e-04     5  0.20  0.2581989
## 27 1e-03     5  0.20  0.2581989
## 28 1e-02     5  0.20  0.2581989
## 29 1e+00     5  0.15  0.2415229
## 30 1e+01     5  0.15  0.2415229
# Extract the best model stored in the tune() result and summarise it.
bestmod<-tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = q03_symptoms ~ ., data = covid, 
##     ranges = list(cost = c(1e-05, 1e-04, 0.001, 0.01, 1, 10), gamma = c(1, 
##         2, 3, 4, 5)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  12
## 
##  ( 8 4 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  f t

Fit Cost Grid and Plot Tuning Results

# Candidate cost values for the linear SVM.
grid <- expand.grid(C = c(1, 10, 100, 1000, 10000))
# Refit over the explicit grid. tuneLength is dropped because caret ignores it
# once tuneGrid is supplied, and cost/gamma are dropped because they were only
# forwarded through `...` to the underlying fitter, while svmLinear takes its
# cost from the grid column C.
svm_Linear_Grid <- caret::train(
  q03_symptoms ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneGrid = grid
)

svm_Linear_Grid
## Support Vector Machines with Linear Kernel 
## 
## 15 samples
## 10 predictors
##  2 classes: 'f', 't' 
## 
## Pre-processing: centered (26), scaled (26) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 14, 14, 14, 13, 13, 13, ... 
## Resampling results across tuning parameters:
## 
##   C      Accuracy   Kappa
##       1  0.9166667  0.5  
##      10  0.9166667  0.5  
##     100  0.9166667  0.5  
##    1000  0.9166667  0.5  
##   10000  0.9166667  0.5  
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1.
# Plot the resampling results of the caret fit across the C grid.
plot(svm_Linear_Grid)

APPENDIX

Code used in analysis

# Chunk defaults: echo the code, hide package messages and warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
#knitr::opts_chunk$set(echo = TRUE)
require(knitr)
library(ggplot2)
library(tidyr)
library(MASS)
library(psych)
library(kableExtra)
library(dplyr)
library(faraway)
library(gridExtra)
library(reshape2)
library(leaps)
library(pROC)
library(caret)
library(naniar)
library(pander)
library(pROC)
library(mlbench)
library(e1071)
library(fpp2)
library(mlr)
# Load the survey sample. stringsAsFactors = TRUE keeps the answers as factors
# (the <fct> columns shown by glimpse()) on R >= 4.0, where read.csv() would
# otherwise return character columns.
covid <- read.csv("covid-19-survey-responses-sample.csv", header = TRUE,
                  stringsAsFactors = TRUE)
# Keep columns 10 through ncol - 2. This is exactly what the earlier
# `12:ncol(covid)-2` selected, because `:` binds tighter than `-`.
covid <- covid[10:(ncol(covid) - 2)]
# Seed the RNG so the 70/30 partition on q03_symptoms is reproducible.
set.seed(1)
intrain <- createDataPartition(y = covid$q03_symptoms, p = 0.7, list = FALSE)
training <- covid[intrain, ]
test <- covid[-intrain, ]
# Partition sizes, NA check, and a per-column preview of the retained data.
dim(training)
dim(test)
anyNA(covid)
glimpse(covid)

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
trctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Linear-kernel SVM for q03_symptoms using every other column as a predictor;
# predictors are centred and scaled before fitting.
svm_lin <- caret::train(
  q03_symptoms ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)
svm_lin

# Predict the held-out rows with the fitted linear SVM.
test_pred<- predict(svm_lin, newdata=test)
test_pred
# Cross-tabulate predictions (rows) against observed labels (columns).
# NOTE(review): the rendered output reports 'f' as the positive class, so
# Sensitivity describes the 'f' (presumably no-symptom) class; consider
# positive = "t".
confusionMatrix(table(test_pred,test$q03_symptoms))


# Fix the RNG seed before tune() runs its cross-validation.
set.seed(1)
# Grid-search cost (and gamma) for a linear-kernel SVM.
# NOTE(review): this tunes on the full `covid` data rather than `training`,
# so the held-out `test` rows take part in model selection - confirm intent.
# NOTE(review): the rendered results show identical error across all gamma
# values; gamma appears to have no effect with kernel="linear".
tune.out<-tune(svm,q03_symptoms~.,data=covid, kernel="linear",
               ranges=list(cost=c(.00001,.0001, .001, .01, 1,10), 
                           gamma=c(1,2,3,4,5))) #get best model with tune
summary(tune.out)
# Extract the best model stored in the tune() result and summarise it.
bestmod<-tune.out$best.model
summary(bestmod)

# Candidate cost values for the linear SVM.
grid <- expand.grid(C = c(1, 10, 100, 1000, 10000))
# Refit over the explicit grid. tuneLength is dropped because caret ignores it
# once tuneGrid is supplied, and cost/gamma are dropped because they were only
# forwarded through `...` to the underlying fitter, while svmLinear takes its
# cost from the grid column C.
svm_Linear_Grid <- caret::train(
  q03_symptoms ~ .,
  data = training,
  method = "svmLinear",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneGrid = grid
)

svm_Linear_Grid

# Plot the resampling results of the caret fit across the C grid.
plot(svm_Linear_Grid)