# Predictor matrix: all columns of trainData except the 1st and the 3rd.
x <- as.matrix(trainData[, c(-1, -3)])
# Response vector: the Survived column of the training data.
y <- trainData[["Survived"]]
library(pls)
##
## Attaching package: 'pls'
##
## The following object is masked from 'package:stats':
##
## loadings
# Seed the RNG before the cross-validated fit so the random CV segments
# are reproducible.
set.seed(200)
# Principal components regression (PCR) of Survived on the passenger
# predictors. Predictors are standardised (scale = TRUE) and
# cross-validation is requested via validation = "CV".
pcr.fit <- pcr(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked_C +
    Embarked_Q + Child + Mother + Sex * Pclass,
  data = trainData,
  scale = TRUE, # TRUE rather than T: T is a plain variable and can be reassigned
  validation = "CV"
)
# Print the cross-validated RMSEP and the % variance explained for each
# number of components.
summary(pcr.fit)
## Data: X dimension: 891 11
## Y dimension: 891 1
## Fit method: svdpc
## Number of components considered: 11
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.4869 0.4585 0.4274 0.4254 0.4017 0.4020 0.3992
## adjCV 0.4869 0.4584 0.4272 0.4253 0.4015 0.4018 0.3991
## 7 comps 8 comps 9 comps 10 comps 11 comps
## CV 0.3863 0.3844 0.3805 0.3812 0.3718
## adjCV 0.3861 0.3842 0.3803 0.3810 0.3715
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 23.85 42.75 57.92 68.21 76.65 83.68 89.50
## Survived 11.71 23.51 24.31 32.84 32.85 33.78 37.92
## 8 comps 9 comps 10 comps 11 comps
## X 93.77 96.91 99.48 100.00
## Survived 38.70 39.86 39.86 42.86
# Plot the cross-validated MSEP against the number of components.
validationplot(pcr.fit, val.type = "MSEP")
# Refit on the training data keeping 8 components (no cross-validation here).
# NOTE(review): the CV table printed above reports its lowest RMSEP at
# 11 components (0.3718), not at 8 (0.3844) -- confirm that 8 is the
# intended choice.
pcr.fit <- pcr(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked_C +
    Embarked_Q + Child + Mother + Sex * Pclass,
  data = trainData,
  scale = TRUE, # TRUE rather than T: T is a plain variable and can be reassigned
  ncomp = 8
)
# Predict on the test set with the 8-component model.
pcr.pred <- predict(pcr.fit, testData, ncomp = 8)
# Label a passenger as survived (1) when the prediction exceeds 0.5, else 0.
survival <- ifelse(pcr.pred > 0.5, 1, 0)
# Create the CSV for the Kaggle submission: one row per PassengerId with
# the predicted Survived label.
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(
  kaggle.sub,
  file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_pcr.csv",
  row.names = FALSE
)
# Result: PCR is also not an improvement on straight logistic regression with 6 variables.
# Partial least squares (PLS) regression: almost the same procedure as PCR.
# Seed the RNG before the cross-validated fit so the random CV segments
# are reproducible.
set.seed(200)
# Find the best number of components using the training data: fit with
# standardised predictors (scale = TRUE) and cross-validation
# (validation = "CV").
pls.fit <- plsr(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked_C +
    Embarked_Q + Child + Mother + Sex * Pclass,
  data = trainData,
  scale = TRUE, # TRUE rather than T: T is a plain variable and can be reassigned
  validation = "CV"
)
# Print the cross-validated RMSEP and the % variance explained for each
# number of components.
summary(pls.fit)
## Data: X dimension: 891 11
## Y dimension: 891 1
## Fit method: kernelpls
## Number of components considered: 11
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 0.4869 0.3986 0.3834 0.3797 0.3776 0.3754 0.3739
## adjCV 0.4869 0.3984 0.3832 0.3795 0.3774 0.3753 0.3737
## 7 comps 8 comps 9 comps 10 comps 11 comps
## CV 0.3722 0.3719 0.3718 0.3718 0.3718
## adjCV 0.3719 0.3716 0.3716 0.3716 0.3715
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 20.69 34.58 49.28 57.88 65.9 75.12 78.78
## Survived 33.79 38.95 40.22 40.97 41.7 42.29 42.81
## 8 comps 9 comps 10 comps 11 comps
## X 84.42 89.18 97.34 100.00
## Survived 42.86 42.86 42.86 42.86
# Plot the cross-validated MSEP against the number of components.
validationplot(pls.fit, val.type = "MSEP")
# Refit on the training data keeping 4 components (no cross-validation here)
# and print the training % variance explained. No test-set MSE is computed
# at this point.
# NOTE(review): the CV table above gives RMSEP 0.3776 at 4 components versus
# a minimum of 0.3718 at 9-11 -- presumably 4 was picked for parsimony;
# confirm.
pls.fit <- plsr(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked_C +
    Embarked_Q + Child + Mother + Sex * Pclass,
  data = trainData,
  scale = TRUE, # TRUE rather than T: T is a plain variable and can be reassigned
  ncomp = 4
)
summary(pls.fit)
## Data: X dimension: 891 11
## Y dimension: 891 1
## Fit method: kernelpls
## Number of components considered: 4
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps
## X 20.69 34.58 49.28 57.88
## Survived 33.79 38.95 40.22 40.97
# Predict on the test set with the 4-component PLS model.
pls.pred <- predict(pls.fit, newdata = testData, ncomp = 4)
# Label a passenger as survived (1) when the prediction exceeds 0.5, else 0.
survival <- ifelse(pls.pred > 0.5, 1, 0)
# Assemble the Kaggle submission (PassengerId, Survived) and write it to CSV.
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(
  kaggle.sub,
  file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_pls.csv",
  row.names = FALSE
)
# Result: PLS is also not an improvement on straight logistic regression with 6 variables.