used to create csv subset files for all airlines

import the Cheapseats dataset

# Load the Cheapseats dataset, keeping text columns as character vectors.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
ksvm_df <- read.csv("Cheapseats.csv", stringsAsFactors = FALSE)

label Cheapseats airline satisfaction

# Derive the two-level outcome used by the models: a satisfaction rating
# above 3 is labelled "happy", anything else "unhappy" (a missing rating
# stays NA). The result is stored as a factor.
ksvm_df$satisfactionLabel <- factor(
  ifelse(ksvm_df$satisfaction > 3, "happy", "unhappy")
)

create test/train data

#' Split a data frame into random test and training subsets.
#'
#' @param data Data frame to split. Defaults to the global `ksvm_df`, so
#'   existing zero-argument calls keep working.
#' @param train_frac Fraction of rows (between 0 and 1) sampled into the
#'   training subset; the resulting row count is rounded down.
#' @return A list whose first element (`test`) holds the rows that were not
#'   sampled and whose second element (`train`) holds the sampled rows.
createTestTrain <- function(data = ksvm_df, train_frac = 0.7) {
  stopifnot(
    is.data.frame(data),
    is.numeric(train_frac),
    length(train_frac) == 1,
    train_frac >= 0,
    train_frac <= 1
  )

  n <- nrow(data)

  # Sample row positions without replacement.
  train_index <- sample.int(n, size = floor(train_frac * n))

  # Pick the test rows positively. `data[-train_index, ]` returns zero rows
  # when train_index is empty, which would silently drop every observation.
  test_index <- setdiff(seq_len(n), train_index)

  list(
    test = data[test_index, , drop = FALSE],
    train = data[train_index, , drop = FALSE]
  )
}

build the svm model

# Fit the first model: a C-SVM on all six candidate predictors.
library(kernlab)  # provides ksvm(); harmless if it is already attached

# createTestTrain() returns the test subset first and the training subset
# second.
test_train <- createTestTrain()
test <- test_train[[1]]
train <- test_train[[2]]

# `kernel = "rbfdot"` replaces the original `model = "rfbfot"`, which is not
# a ksvm() argument name; the Gaussian RBF kernel was only being used because
# it is ksvm()'s default kernel.
svm_output <- ksvm(
  satisfactionLabel ~ airline_status + age + gender + type_of_travel +
    arrival_delay_greater_5_mins + num_flights,
  data = train,
  kernel = "rbfdot",
  kpar = "automatic",  # let kernlab choose the kernel hyperparameter
  C = 10,              # cost of constraint violation
  cross = 3,           # 3-fold cross-validation on the training data
  prob.model = TRUE    # also fit a class-probability model
)
svm_output
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 10 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.260139334106138 
## 
## Number of Support Vectors : 7938 
## 
## Objective Function Value : -75056.9 
## Training error : 0.200781 
## Cross validation error : 0.204962 
## Probability model included.
# Score the held-out rows. type = "votes" yields a matrix of class votes;
# its first row is paired with the test rows below (in the recorded output
# a vote of 1 lines up with the "happy" label).
svm_predict <- predict(svm_output, test, type = "votes")

# Cross-tabulate the actual label against that vote. The label is looked up
# by name rather than by the hard-coded column position 29, so the table
# stays correct if columns are added to or removed from the data.
compTable <- data.frame(
  actual = test$satisfactionLabel,
  happy_vote = svm_predict[1, ]
)
table(compTable)
##           svm_predict.1...
## test...29.    0    1
##    happy    224 3637
##    unhappy 2562 1369

The SVM produces an error rate of about 20%. Let’s see if we can reduce that by removing the variables that are less correlated with satisfaction.

# Fit the second model: four predictors, with age and gender dropped.
# NOTE(review): createTestTrain() is called again here, so this model is
# trained and tested on a different random split than the previous one.
test_train <- createTestTrain()
test <- test_train[[1]]
train <- test_train[[2]]

# `kernel = "rbfdot"` replaces the original `model = "rfbfot"`, which is not
# a ksvm() argument name; the Gaussian RBF kernel was only being used because
# it is ksvm()'s default kernel.
svm_output2 <- ksvm(
  satisfactionLabel ~ airline_status + type_of_travel +
    arrival_delay_greater_5_mins + num_flights,
  data = train,
  kernel = "rbfdot",
  kpar = "automatic",  # let kernlab choose the kernel hyperparameter
  C = 10,              # cost of constraint violation
  cross = 3,           # 3-fold cross-validation on the training data
  prob.model = TRUE    # also fit a class-probability model
)
svm_output2
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 10 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.543045651199313 
## 
## Number of Support Vectors : 8986 
## 
## Objective Function Value : -81666.39 
## Training error : 0.224105 
## Cross validation error : 0.22581 
## Probability model included.
# Score the held-out rows with the four-predictor model. type = "votes"
# yields a matrix of class votes; its first row is paired with the test rows
# below (in the recorded output a vote of 1 lines up with the "happy" label).
svm_predict2 <- predict(svm_output2, test, type = "votes")

# Cross-tabulate the actual label against that vote. The label is looked up
# by name rather than by the hard-coded column position 29, so the table
# stays correct if columns are added to or removed from the data.
compTable2 <- data.frame(
  actual = test$satisfactionLabel,
  happy_vote = svm_predict2[1, ]
)
table(compTable2)
##           svm_predict2.1...
## test...29.    0    1
##    happy    208 3672
##    unhappy 2353 1559

The prediction rate does not seem to improve when variables are removed (the cross-validation error rose from about 20% to about 23%); nevertheless, let’s remove all but the strongest predictors.

# Fit the third model: only airline_status and type_of_travel are kept.
# NOTE(review): createTestTrain() is called again here, so this model is
# trained and tested on a different random split than the earlier ones.
test_train <- createTestTrain()
test <- test_train[[1]]
train <- test_train[[2]]

# `kernel = "rbfdot"` replaces the original `model = "rfbfot"`, which is not
# a ksvm() argument name; the Gaussian RBF kernel was only being used because
# it is ksvm()'s default kernel.
svm_output3 <- ksvm(
  satisfactionLabel ~ airline_status + type_of_travel,
  data = train,
  kernel = "rbfdot",
  kpar = "automatic",  # let kernlab choose the kernel hyperparameter
  C = 10,              # cost of constraint violation
  cross = 3,           # 3-fold cross-validation on the training data
  prob.model = TRUE    # also fit a class-probability model
)
svm_output3
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 10 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.666666666666667 
## 
## Number of Support Vectors : 8843 
## 
## Objective Function Value : -88308.6 
## Training error : 0.242863 
## Cross validation error : 0.243138 
## Probability model included.
# Score the held-out rows with the two-predictor model. type = "votes"
# yields a matrix of class votes; its first row is paired with the test rows
# below (in the recorded output a vote of 1 lines up with the "happy" label).
svm_predict3 <- predict(svm_output3, test, type = "votes")

# Cross-tabulate the actual label against that vote. The label is looked up
# by name rather than by the hard-coded column position 29, so the table
# stays correct if columns are added to or removed from the data.
compTable3 <- data.frame(
  actual = test$satisfactionLabel,
  happy_vote = svm_predict3[1, ]
)
table(compTable3)
##           svm_predict3.1...
## test...29.    0    1
##    happy    446 3433
##    unhappy 2411 1502