Linear Kernel
library(e1071)
## Loading required package: class
# Use tune() to do 10-fold CV
tune.out = tune(svm, Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
SibSp + Parch + Family + Mother, data = trainData, kernel = "linear", ranges = list(cost = c(0.001,
0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.001
##
## - best performance: 0.1829
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.1829 0.02866
## 2 1e-02 0.1889 0.03953
## 3 1e-01 0.1891 0.04010
## 4 1e+00 0.1847 0.04064
## 5 5e+00 0.1842 0.03982
## 6 1e+01 0.1842 0.03984
## 7 1e+02 0.1840 0.03952
bestmod = tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = Survived ~ Pclass + Sex + Age +
## Child + Sex * Pclass + SibSp + Parch + Family + Mother, data = trainData,
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)),
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 0.001
## gamma: 0.1111
## epsilon: 0.1
##
##
## Number of Support Vectors: 691
yhat.svm.linear = predict(bestmod, testData)
# Creating CSV for Kaggle Submission
# eps-regression returns numeric predictions, so threshold at 0.5 for 0/1 labels
survival.svm.linear <- ifelse(yhat.svm.linear > 0.5, 1, 0)
kaggle.sub <- cbind(PassengerId, survival.svm.linear)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_svm_linear.csv",
row.names = FALSE)
The SVM is not an improvement; it may be overfitting. Perhaps a higher cost (which yields fewer support vectors) would help.
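One caveat worth noting: the summary above reports SVM-Type: eps-regression, meaning svm() treated the numeric Survived column as a regression target, which is why the predictions have to be thresholded at 0.5. A minimal sketch of both ideas (hypothetical names svm.class and yhat.class; it assumes the same trainData and testData), coding the response as a factor so svm() does C-classification, and raising the cost:

svm.class <- svm(as.factor(Survived) ~ Pclass + Sex + Age + Child + Sex * Pclass +
    SibSp + Parch + Family + Mother, data = trainData, kernel = "linear",
    cost = 10)  # higher cost narrows the margin, which should cut the support-vector count
summary(svm.class)  # compare 'Number of Support Vectors' against the 691 above
yhat.class <- predict(svm.class, testData)  # factor of 0/1 labels; no threshold needed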
Radial Kernel
set.seed(200)
tune.out = tune(svm, Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
SibSp + Parch + Family + Mother, data = trainData, kernel = "radial", ranges = list(cost = c(0.1,
1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 0.1 0.5
##
## - best performance: 0.1574
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.1574 0.02676
## 2 1e+00 0.5 0.1630 0.02832
## 3 1e+01 0.5 0.1801 0.02790
## 4 1e+02 0.5 0.1981 0.03260
## 5 1e+03 0.5 0.2943 0.13300
## 6 1e-01 1.0 0.1651 0.02737
## 7 1e+00 1.0 0.1672 0.02826
## 8 1e+01 1.0 0.1780 0.02834
## 9 1e+02 1.0 0.2135 0.05587
## 10 1e+03 1.0 0.3082 0.13626
## 11 1e-01 2.0 0.1764 0.03087
## 12 1e+00 2.0 0.1737 0.03063
## 13 1e+01 2.0 0.1846 0.02813
## 14 1e+02 2.0 0.2079 0.04415
## 15 1e+03 2.0 0.3766 0.23220
## 16 1e-01 3.0 0.1835 0.03121
## 17 1e+00 3.0 0.1754 0.03072
## 18 1e+01 3.0 0.1843 0.03137
## 19 1e+02 3.0 0.2140 0.03367
## 20 1e+03 3.0 0.3712 0.19206
## 21 1e-01 4.0 0.1887 0.03156
## 22 1e+00 4.0 0.1763 0.02978
## 23 1e+01 4.0 0.1816 0.03367
## 24 1e+02 4.0 0.2127 0.03828
## 25 1e+03 4.0 0.2841 0.08356
bestmod = tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = Survived ~ Pclass + Sex + Age +
## Child + Sex * Pclass + SibSp + Parch + Family + Mother, data = trainData,
## ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5,
## 1, 2, 3, 4)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 0.1
## gamma: 0.5
## epsilon: 0.1
##
##
## Number of Support Vectors: 548
yhat.svm.radial = predict(bestmod, testData)
# Creating CSV for Kaggle Submission
# eps-regression returns numeric predictions, so threshold at 0.5 for 0/1 labels
survival.svm.radial <- ifelse(yhat.svm.radial > 0.5, 1, 0)
kaggle.sub <- cbind(PassengerId, survival.svm.radial)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_svm_radial.csv",
row.names = FALSE)
The SVM with a radial kernel also did terribly (0.39 on Kaggle; it can hardly get much worse). The linear kernel scored 0.76, which suggests the true decision boundary is probably close to linear.
Let's try a linear kernel with a cost of 1 (higher than the CV-chosen 0.001, so fewer support vectors) and fewer predictors.
svm.titanic = svm(Survived ~ Pclass + Sex + Age + Child + Sex * Pclass + Family +
Mother, data = trainData, kernel = "linear", cost = 1)
summary(svm.titanic)
##
## Call:
## svm(formula = Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
## Family + Mother, data = trainData, kernel = "linear", cost = 1)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.1429
## epsilon: 0.1
##
##
## Number of Support Vectors: 470
yhat.svm.linear2 = predict(svm.titanic, testData)
# Creating CSV for Kaggle Submission
# eps-regression returns numeric predictions, so threshold at 0.5 for 0/1 labels
survival.svm.linear2 <- ifelse(yhat.svm.linear2 > 0.5, 1, 0)
kaggle.sub <- cbind(PassengerId, survival.svm.linear2)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_svm_linear2.csv",
row.names = FALSE)
This did not improve the results (still around 0.76). Next, try logistic regression and LDA, along with other classification methods; a minimal sketch of the first two follows.
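The sketch below (hypothetical object names; it assumes the same trainData and testData, plus the MASS package for lda()) shows how both models could be fit with the same formula:

# Logistic regression: family = binomial; type = "response" gives P(Survived = 1)
fit.logit <- glm(Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
    SibSp + Parch + Family + Mother, data = trainData, family = binomial)
survival.logit <- ifelse(predict(fit.logit, testData, type = "response") > 0.5, 1, 0)

# Linear discriminant analysis: predict() returns a list; $class holds the labels
library(MASS)
fit.lda <- lda(Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
    SibSp + Parch + Family + Mother, data = trainData)
survival.lda <- predict(fit.lda, testData)$class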