library(glmnet)
## Loading required package: Matrix
## Loading required package: lattice
## Loaded glmnet 1.9-5
# Candidate 'lambda' values: 100 points, log-spaced from 10^10 down to 10^-2
grid <- 10^seq(10, -2, length.out = 100)
# Predictor matrix (trainData without columns 1 and 3) and response vector
x <- as.matrix(trainData[, -c(1, 3)])
y <- trainData$Survived
# Ridge regression: pick lambda by cross-validation (seed fixed so the
# fold assignment is reproducible)
set.seed(200)
cv.out <- cv.glmnet(x, y, alpha = 0) # alpha = 0 selects the ridge penalty
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 0.02642
# Fit the ridge path on the training data and read the coefficients at bestlam
out <- glmnet(x, y, alpha = 0)
predict(out, type = "coefficients", s = bestlam)
## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.4785905
## Pclass -0.1556062
## Sex 0.4669367
## Age -0.0037361
## SibSp -0.0289564
## Parch -0.0335638
## Fare 0.0005504
## Embarked_C 0.0616297
## Embarked_Q 0.0581361
## Child -0.2592628
## Family -0.0214440
## Mother -0.1327676
# Replace missing 'Fare' values in the test set with the mean of the observed ones
testData$Fare[is.na(testData$Fare)] <- mean(testData$Fare, na.rm = TRUE)
# Predict on the test set (column 2 dropped) at the cross-validated lambda
ridge.pred <- predict(out, s = bestlam, newx = as.matrix(testData[, -2]))
# Threshold the numeric prediction at 0.5 to obtain a 0/1 label
survival <- ifelse(ridge.pred > 0.5, 1, 0)
# Assemble the two-column submission table and write it out for Kaggle
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(
  kaggle.sub,
  file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ridge.csv",
  row.names = FALSE
)
Ridge regression is not an improvement over plain logistic regression with 6 variables.
# LASSO: fit the regularization path once over the 'grid' of lambda values
lasso.mod <- glmnet(x, y, alpha = 1, lambda = grid) # alpha = 1 means LASSO
plot(lasso.mod)
# Cross validation to find best lambda (seed fixed so the folds are reproducible)
set.seed(200)
cv.out <- cv.glmnet(x, y, alpha = 1) # alpha = 1 means LASSO
plot(cv.out)
bestlam <- cv.out$lambda.min
# Reuse the path fitted above instead of refitting with identical arguments;
# 'out' is kept as an alias of lasso.mod so the name stays available.
out <- lasso.mod
lasso.coef <- predict(out, type = "coefficients", s = bestlam)
lasso.coef
## 12 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.1811209
## Pclass -0.1539891
## Sex 0.4894363
## Age -0.0028576
## SibSp -0.0100730
## Parch .
## Fare 0.0002719
## Embarked_C 0.0436608
## Embarked_Q 0.0151964
## Child -0.2113539
## Family -0.0320038
## Mother -0.0391918
# Fix NA in 'Fare' before predicting, so the matrix passed as newx has no
# missing Fare values even when this section is run on its own
testData$Fare[is.na(testData$Fare)] <- mean(testData$Fare, na.rm = TRUE)
# Make prediction using best lambda (single prediction; test column 2 dropped)
lasso.pred <- predict(out, s = bestlam, newx = as.matrix(testData[, -2]))
# Threshold the numeric prediction at 0.5 to obtain a 0/1 label
survival <- ifelse(lasso.pred > 0.5, 1, 0)
# Creating CSV for Kaggle Submission
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(
  kaggle.sub,
  file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_lasso.csv",
  row.names = FALSE
)
The LASSO is not an improvement either.