Regularization Methods for Kaggle Titanic Competition

Ridge Regression

library(glmnet)
## Loading required package: Matrix
## Loading required package: lattice
## Loaded glmnet 1.9-5

# Candidate lambda values: 100 points, log-spaced from 1e10 down to 1e-2.
# Note: the ridge cross-validation call below is not given a `lambda`
# argument, so `grid` is not used by it.
grid <- 10^seq(10, -2, length.out = 100)

# Predictor matrix: all columns of trainData except the 1st and the 3rd.
x <- as.matrix(trainData[, c(-1, -3)])
# Response vector.
y <- trainData$Survived

# Cross-validate ridge regression (alpha = 0 selects the ridge penalty).
# The seed is set immediately before the call so the run is repeatable.
set.seed(200)
cv.out <- cv.glmnet(x, y, alpha = 0)
plot(cv.out)

plot of chunk unnamed-chunk-2

# Keep the `lambda.min` value reported by the ridge cross-validation and show it.
bestlam <- cv.out$lambda.min
print(bestlam)
## [1] 0.02642

# Fit the ridge model (alpha = 0) on the training data, then display its
# coefficients evaluated at the lambda chosen by cross-validation.
out <- glmnet(x, y, alpha = 0)
predict(out, s = bestlam, type = "coefficients")
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                      1
## (Intercept)  1.4785905
## Pclass      -0.1556062
## Sex          0.4669367
## Age         -0.0037361
## SibSp       -0.0289564
## Parch       -0.0335638
## Fare         0.0005504
## Embarked_C   0.0616297
## Embarked_Q   0.0581361
## Child       -0.2592628
## Family      -0.0214440
## Mother      -0.1327676

# Replace missing 'Fare' values in the test set with the mean of the
# non-missing fares.
testData$Fare[is.na(testData$Fare)] <- mean(testData$Fare, na.rm = TRUE)

# Predict at the cross-validated lambda; column 2 of testData is dropped
# to build the predictor matrix.
ridge.pred <- predict(out, newx = as.matrix(testData[, -2]), s = bestlam)
# Threshold the fitted values at 0.5 to obtain 0/1 survival labels.
survival <- ifelse(ridge.pred > 0.5, 1, 0)

# Build the two-column Kaggle submission and write it to disk.
# NOTE(review): PassengerId is not defined in this section; presumably the
# test-set passenger IDs created earlier -- confirm.
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(
  kaggle.sub,
  file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ridge.csv",
  row.names = FALSE
)

This ridge submission was not an improvement on straight logistic regression with 6 variables.

Lasso

# Fit the lasso (alpha = 1) over the predefined lambda grid and plot the
# fitted glmnet object.
lasso.mod <- glmnet(x, y, alpha = 1, lambda = grid)
plot(lasso.mod)

plot of chunk unnamed-chunk-3


# Cross-validate the lasso (alpha = 1) to choose lambda. The seed is reset
# to the same value used for the ridge cross-validation.
set.seed(200)
cv.out <- cv.glmnet(x, y, alpha = 1)
plot(cv.out)

plot of chunk unnamed-chunk-3

# Keep the `lambda.min` value reported by the lasso cross-validation.
bestlam <- cv.out$lambda.min

# Refit the lasso (alpha = 1) on the lambda grid and display the coefficients
# at bestlam. No test-set prediction is made here: `lasso.pred` is computed
# later from `out`, so a predict() call at this point would only be
# overwritten without ever being read.
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)
lasso.coef
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                      1
## (Intercept)  1.1811209
## Pclass      -0.1539891
## Sex          0.4894363
## Age         -0.0028576
## SibSp       -0.0100730
## Parch        .        
## Fare         0.0002719
## Embarked_C   0.0436608
## Embarked_Q   0.0151964
## Child       -0.2113539
## Family      -0.0320038
## Mother      -0.0391918

# Replace missing 'Fare' values in the test set with the mean of the
# non-missing fares.
testData$Fare[is.na(testData$Fare)] <- mean(testData$Fare, na.rm = TRUE)

# Predict at the cross-validated lambda; column 2 of testData is dropped
# to build the predictor matrix.
lasso.pred <- predict(out, newx = as.matrix(testData[, -2]), s = bestlam)
# Threshold the fitted values at 0.5 to obtain 0/1 survival labels.
survival <- ifelse(lasso.pred > 0.5, 1, 0)

# Build the two-column Kaggle submission and write it to disk.
# NOTE(review): PassengerId is not defined in this section; presumably the
# test-set passenger IDs created earlier -- confirm.
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(
  kaggle.sub,
  file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_lasso.csv",
  row.names = FALSE
)

The lasso submission was not an improvement either.