rm(list = ls())                     # clear the workspace
setwd("Z:/vmlxU2_azhang/Titanic")   # directory containing train.csv and test.csv
train <- read.csv("train.csv")
head(train)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
# Read the test data:
test <- read.csv("test.csv", sep=",",header=TRUE)
require(Amelia)
## Loading required package: Amelia
## Warning: package 'Amelia' was built under R version 3.1.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.1.2
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.3, built: 2014-11-14)
## ## Copyright (C) 2005-2015 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(train, main="Titanic Training Data - Missings Map",
col=c("black", "yellow"), legend=TRUE)
About 20 percent of the Age values are missing. The map is not perfect, though: many Cabin values are also missing, but because they were read in as empty strings rather than NA they do not show up here.
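As a quick numeric check of what the map suggests (a simple sketch, assuming the data were read with the read.csv defaults above):
colSums(is.na(train))    # NA counts per column; only Age contains NAs
mean(is.na(train$Age))   # 177/891, i.e. roughly 20% of Age is missing
sum(train$Cabin == "")   # 687 Cabin entries are empty strings, which missmap does not flag as missing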
Let’s use some simple data visualizations to help us understand what might influence the outcome we’re trying to predict later with machine learning algorithms.
barplot(table(train$Survived),
names.arg = c("Perished", "Survived"),
main="Survived (passenger fate)", col="black")
barplot(table(train$Pclass),
names.arg = c("first", "second", "third"),
main="Pclass (passenger traveling class)", col="firebrick")
# About half are in the third class
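A proportion table makes that claim concrete (a minimal check):
prop.table(table(train$Pclass))   # third class holds a bit over half of the passengers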
barplot(table(train$Sex), main="Sex (gender)", col="darkviolet")
hist(train$Age, main="Age", xlab = NULL, col="brown")
barplot(table(train$SibSp), main="SibSp (siblings + spouse aboard)",
col="darkblue")
barplot(table(train$Parch), main="Parch (parents + kids aboard)",
col="gray50")
hist(train$Fare, main="Fare (fee paid for ticket[s])", xlab = NULL,
col="darkgreen")
barplot(table(train$Embarked), main="Embarked (port of embarkation)")
mosaicplot(train$Pclass ~ train$Survived,
main="Passenger Fate by Traveling Class", shade=FALSE,
color=TRUE, xlab="Pclass", ylab="Survived")
mosaicplot(train$Sex ~ train$Survived,
main="Passenger Fate by Gender", shade=FALSE, color=TRUE,
xlab="Sex", ylab="Survived")
require("dplyr")
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.1.2
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require("gbm")
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.1.3
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1.1
# Check the basic statistics of the train data:
summary(train)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
Some of the variables, such as Name, are not helpful for prediction and can be eliminated.
survived = train$Survived
head(survived)
## [1] 0 1 1 1 0 0
train = select(train, -Survived) # Eliminate the column of "Survived" from train.
end_trn = nrow(train)
end_trn
## [1] 891
# Combine the training and test predictors into one data set,
# so that any variable manipulation (creating new variables, capping and flooring) is applied identically to both.
dim(test)
## [1] 418 11
all = rbind(train,test)
dim(all)
## [1] 1309 11
end = nrow(all)
# To obtain a better fit with gbm, we need to drop unhelpful columns, especially factor variables with many levels
str(train)
## 'data.frame': 891 obs. of 11 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
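The factor columns with many levels are easy to spot programmatically; a quick sketch counting levels per column:
sapply(train, function(x) if (is.factor(x)) nlevels(x) else NA)
# Name, Ticket and Cabin have hundreds of levels and are dropped below;
# Sex and Embarked have only a few levels and are kept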
all = select(all
, Pclass
, Sex
, Age
, SibSp
, Parch
, Fare
, Embarked
)
# Not many variables to choose from; variable selection can be performed later
head(all)
## Pclass Sex Age SibSp Parch Fare Embarked
## 1 3 male 22 1 0 7.2500 S
## 2 1 female 38 1 0 71.2833 C
## 3 3 female 26 0 0 7.9250 S
## 4 1 female 35 1 0 53.1000 S
## 5 3 male 35 0 0 8.0500 S
## 6 3 male NA 0 0 8.4583 Q
str(all)
## 'data.frame': 1309 obs. of 7 variables:
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Select a high guess of how many trees we'll need
ntrees = 5000 # kaggle test score: 0.76077
Model = gbm.fit(
x = all[1:end_trn,] # dataframe of features
, y = survived #dependent variable
, distribution = "bernoulli"
#use bernoulli for binary outcomes
, n.trees = ntrees
#choose this value to be large; the optimal number of trees
#is selected after the model has been run
, shrinkage = 0.01 # learning rate; might not be very important here
, interaction.depth = 3
#use cross-validation to choose interaction.depth (see the sketch after gbm.perf() below)!!
, n.minobsinnode = 10
#n.minobsinnode has an important effect on overfitting!
#decreasing this parameter increases the in-sample fit,
#but can result in overfitting
, nTrain = round(end_trn * 0.8)
#use this so that you can select the number of trees at the end
, verbose = TRUE #print the preliminary output
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3310 1.2974 0.0100 0.0039
## 2 1.3237 1.2896 0.0100 0.0035
## 3 1.3164 1.2817 0.0100 0.0032
## 4 1.3087 1.2740 0.0100 0.0035
## 5 1.3016 1.2662 0.0100 0.0035
## 6 1.2947 1.2585 0.0100 0.0034
## 7 1.2879 1.2510 0.0100 0.0034
## 8 1.2815 1.2446 0.0100 0.0034
## 9 1.2748 1.2373 0.0100 0.0034
## 10 1.2686 1.2305 0.0100 0.0030
## 20 1.2116 1.1695 0.0100 0.0026
## 40 1.1239 1.0750 0.0100 0.0015
## 60 1.0623 1.0085 0.0100 0.0012
## 80 1.0154 0.9578 0.0100 0.0008
## 100 0.9792 0.9157 0.0100 0.0007
## 120 0.9536 0.8858 0.0100 0.0005
## 140 0.9316 0.8582 0.0100 0.0002
## 160 0.9127 0.8347 0.0100 0.0004
## 180 0.8968 0.8157 0.0100 0.0001
## 200 0.8837 0.7983 0.0100 0.0002
## 220 0.8713 0.7828 0.0100 0.0001
## 240 0.8602 0.7697 0.0100 0.0001
## 260 0.8509 0.7621 0.0100 0.0001
## 280 0.8429 0.7535 0.0100 0.0002
## 300 0.8351 0.7442 0.0100 -0.0001
## 320 0.8283 0.7357 0.0100 -0.0000
## 340 0.8216 0.7318 0.0100 -0.0001
## 360 0.8163 0.7275 0.0100 -0.0000
## 380 0.8115 0.7223 0.0100 -0.0002
## 400 0.8062 0.7177 0.0100 -0.0001
## 420 0.8009 0.7143 0.0100 -0.0000
## 440 0.7964 0.7117 0.0100 0.0000
## 460 0.7920 0.7086 0.0100 0.0000
## 480 0.7880 0.7066 0.0100 -0.0002
## 500 0.7833 0.7026 0.0100 -0.0001
## 520 0.7796 0.7006 0.0100 -0.0001
## 540 0.7756 0.6990 0.0100 -0.0001
## 560 0.7721 0.6978 0.0100 -0.0003
## 580 0.7687 0.6971 0.0100 -0.0000
## 600 0.7657 0.6959 0.0100 -0.0001
## 620 0.7625 0.6942 0.0100 -0.0001
## 640 0.7591 0.6937 0.0100 -0.0001
## 660 0.7560 0.6928 0.0100 -0.0000
## 680 0.7526 0.6902 0.0100 -0.0001
## 700 0.7493 0.6883 0.0100 -0.0001
## 720 0.7463 0.6877 0.0100 -0.0002
## 740 0.7430 0.6877 0.0100 -0.0001
## 760 0.7405 0.6884 0.0100 -0.0001
## 780 0.7375 0.6874 0.0100 -0.0002
## 800 0.7347 0.6871 0.0100 -0.0001
## 820 0.7323 0.6854 0.0100 -0.0003
## 840 0.7296 0.6864 0.0100 -0.0001
## 860 0.7278 0.6848 0.0100 -0.0001
## 880 0.7252 0.6833 0.0100 0.0000
## 900 0.7231 0.6818 0.0100 -0.0002
## 920 0.7209 0.6816 0.0100 -0.0001
## 940 0.7188 0.6800 0.0100 -0.0002
## 960 0.7162 0.6803 0.0100 -0.0001
## 980 0.7140 0.6808 0.0100 -0.0001
## 1000 0.7116 0.6809 0.0100 -0.0001
## 1020 0.7094 0.6812 0.0100 -0.0001
## 1040 0.7074 0.6811 0.0100 -0.0001
## 1060 0.7053 0.6816 0.0100 -0.0002
## 1080 0.7036 0.6814 0.0100 -0.0002
## 1100 0.7017 0.6815 0.0100 -0.0001
## 1120 0.6992 0.6839 0.0100 -0.0001
## 1140 0.6969 0.6841 0.0100 -0.0002
## 1160 0.6950 0.6834 0.0100 -0.0002
## 1180 0.6931 0.6851 0.0100 -0.0003
## 1200 0.6907 0.6851 0.0100 -0.0000
## 1220 0.6886 0.6855 0.0100 -0.0001
## 1240 0.6865 0.6865 0.0100 -0.0003
## 1260 0.6840 0.6845 0.0100 -0.0002
## 1280 0.6823 0.6847 0.0100 -0.0002
## 1300 0.6800 0.6836 0.0100 -0.0001
## 1320 0.6783 0.6845 0.0100 -0.0000
## 1340 0.6764 0.6850 0.0100 -0.0001
## 1360 0.6745 0.6865 0.0100 -0.0002
## 1380 0.6726 0.6873 0.0100 -0.0001
## 1400 0.6704 0.6865 0.0100 -0.0002
## 1420 0.6686 0.6864 0.0100 -0.0002
## 1440 0.6665 0.6867 0.0100 -0.0002
## 1460 0.6648 0.6854 0.0100 -0.0002
## 1480 0.6634 0.6837 0.0100 -0.0001
## 1500 0.6615 0.6833 0.0100 -0.0001
## 1520 0.6598 0.6832 0.0100 -0.0001
## 1540 0.6580 0.6833 0.0100 -0.0001
## 1560 0.6563 0.6824 0.0100 -0.0001
## 1580 0.6543 0.6820 0.0100 -0.0000
## 1600 0.6521 0.6837 0.0100 -0.0002
## 1620 0.6501 0.6832 0.0100 -0.0002
## 1640 0.6481 0.6822 0.0100 -0.0001
## 1660 0.6464 0.6804 0.0100 -0.0002
## 1680 0.6446 0.6827 0.0100 -0.0001
## 1700 0.6427 0.6816 0.0100 -0.0002
## 1720 0.6414 0.6825 0.0100 -0.0002
## 1740 0.6402 0.6833 0.0100 -0.0003
## 1760 0.6387 0.6832 0.0100 -0.0002
## 1780 0.6373 0.6823 0.0100 -0.0001
## 1800 0.6357 0.6809 0.0100 -0.0002
## 1820 0.6341 0.6813 0.0100 -0.0002
## 1840 0.6323 0.6814 0.0100 -0.0001
## 1860 0.6306 0.6811 0.0100 -0.0002
## 1880 0.6289 0.6803 0.0100 -0.0001
## 1900 0.6272 0.6801 0.0100 -0.0001
## 1920 0.6259 0.6812 0.0100 -0.0002
## 1940 0.6246 0.6818 0.0100 -0.0001
## 1960 0.6229 0.6803 0.0100 -0.0001
## 1980 0.6215 0.6812 0.0100 -0.0002
## 2000 0.6199 0.6824 0.0100 -0.0002
## 2020 0.6185 0.6806 0.0100 -0.0003
## 2040 0.6172 0.6799 0.0100 -0.0001
## 2060 0.6158 0.6798 0.0100 -0.0001
## 2080 0.6147 0.6800 0.0100 -0.0002
## 2100 0.6127 0.6790 0.0100 -0.0002
## 2120 0.6113 0.6792 0.0100 -0.0002
## 2140 0.6101 0.6785 0.0100 -0.0001
## 2160 0.6088 0.6792 0.0100 -0.0001
## 2180 0.6077 0.6795 0.0100 -0.0001
## 2200 0.6065 0.6787 0.0100 -0.0000
## 2220 0.6053 0.6799 0.0100 -0.0001
## 2240 0.6041 0.6795 0.0100 -0.0002
## 2260 0.6027 0.6787 0.0100 -0.0002
## 2280 0.6014 0.6805 0.0100 -0.0001
## 2300 0.6003 0.6811 0.0100 -0.0002
## 2320 0.5990 0.6810 0.0100 -0.0002
## 2340 0.5978 0.6811 0.0100 -0.0002
## 2360 0.5965 0.6825 0.0100 -0.0002
## 2380 0.5951 0.6835 0.0100 -0.0001
## 2400 0.5941 0.6831 0.0100 -0.0003
## 2420 0.5931 0.6824 0.0100 -0.0001
## 2440 0.5919 0.6831 0.0100 -0.0001
## 2460 0.5908 0.6835 0.0100 -0.0001
## 2480 0.5892 0.6838 0.0100 -0.0000
## 2500 0.5877 0.6836 0.0100 -0.0001
## 2520 0.5863 0.6850 0.0100 -0.0003
## 2540 0.5850 0.6870 0.0100 -0.0001
## 2560 0.5839 0.6874 0.0100 -0.0002
## 2580 0.5827 0.6890 0.0100 -0.0001
## 2600 0.5812 0.6894 0.0100 -0.0002
## 2620 0.5799 0.6880 0.0100 -0.0002
## 2640 0.5787 0.6891 0.0100 -0.0002
## 2660 0.5775 0.6893 0.0100 -0.0002
## 2680 0.5762 0.6891 0.0100 -0.0002
## 2700 0.5751 0.6896 0.0100 -0.0001
## 2720 0.5740 0.6876 0.0100 -0.0002
## 2740 0.5729 0.6874 0.0100 -0.0001
## 2760 0.5719 0.6888 0.0100 -0.0002
## 2780 0.5710 0.6901 0.0100 -0.0001
## 2800 0.5697 0.6916 0.0100 -0.0000
## 2820 0.5686 0.6920 0.0100 -0.0001
## 2840 0.5678 0.6920 0.0100 -0.0003
## 2860 0.5667 0.6918 0.0100 -0.0001
## 2880 0.5657 0.6931 0.0100 -0.0001
## 2900 0.5649 0.6934 0.0100 -0.0002
## 2920 0.5636 0.6941 0.0100 -0.0002
## 2940 0.5625 0.6944 0.0100 -0.0001
## 2960 0.5615 0.6942 0.0100 -0.0003
## 2980 0.5604 0.6955 0.0100 -0.0002
## 3000 0.5594 0.6949 0.0100 -0.0001
## 3020 0.5586 0.6946 0.0100 -0.0002
## 3040 0.5576 0.6947 0.0100 -0.0001
## 3060 0.5563 0.6951 0.0100 -0.0002
## 3080 0.5551 0.6945 0.0100 -0.0001
## 3100 0.5539 0.6938 0.0100 -0.0001
## 3120 0.5529 0.6930 0.0100 -0.0001
## 3140 0.5518 0.6926 0.0100 -0.0000
## 3160 0.5510 0.6924 0.0100 -0.0002
## 3180 0.5501 0.6925 0.0100 -0.0001
## 3200 0.5491 0.6923 0.0100 -0.0001
## 3220 0.5479 0.6921 0.0100 -0.0001
## 3240 0.5468 0.6916 0.0100 -0.0002
## 3260 0.5458 0.6916 0.0100 -0.0002
## 3280 0.5451 0.6911 0.0100 -0.0001
## 3300 0.5442 0.6903 0.0100 -0.0001
## 3320 0.5432 0.6905 0.0100 -0.0001
## 3340 0.5424 0.6912 0.0100 -0.0002
## 3360 0.5417 0.6930 0.0100 -0.0002
## 3380 0.5409 0.6925 0.0100 -0.0001
## 3400 0.5399 0.6919 0.0100 -0.0000
## 3420 0.5388 0.6916 0.0100 -0.0000
## 3440 0.5381 0.6927 0.0100 -0.0001
## 3460 0.5371 0.6934 0.0100 -0.0002
## 3480 0.5363 0.6945 0.0100 -0.0001
## 3500 0.5352 0.6945 0.0100 -0.0002
## 3520 0.5344 0.6949 0.0100 -0.0001
## 3540 0.5337 0.6971 0.0100 -0.0001
## 3560 0.5329 0.6956 0.0100 -0.0001
## 3580 0.5320 0.6944 0.0100 -0.0001
## 3600 0.5311 0.6941 0.0100 -0.0003
## 3620 0.5304 0.6937 0.0100 -0.0002
## 3640 0.5296 0.6936 0.0100 -0.0001
## 3660 0.5285 0.6935 0.0100 -0.0001
## 3680 0.5276 0.6929 0.0100 -0.0002
## 3700 0.5268 0.6921 0.0100 -0.0001
## 3720 0.5257 0.6916 0.0100 -0.0002
## 3740 0.5246 0.6927 0.0100 -0.0001
## 3760 0.5237 0.6930 0.0100 -0.0001
## 3780 0.5230 0.6941 0.0100 -0.0002
## 3800 0.5221 0.6925 0.0100 -0.0001
## 3820 0.5212 0.6929 0.0100 -0.0001
## 3840 0.5202 0.6942 0.0100 -0.0002
## 3860 0.5195 0.6953 0.0100 -0.0001
## 3880 0.5186 0.6950 0.0100 -0.0001
## 3900 0.5178 0.6961 0.0100 -0.0001
## 3920 0.5169 0.6975 0.0100 -0.0001
## 3940 0.5161 0.6985 0.0100 -0.0003
## 3960 0.5154 0.6998 0.0100 -0.0001
## 3980 0.5144 0.6985 0.0100 -0.0001
## 4000 0.5136 0.6991 0.0100 -0.0002
## 4020 0.5129 0.6993 0.0100 -0.0002
## 4040 0.5121 0.6998 0.0100 -0.0001
## 4060 0.5110 0.7001 0.0100 -0.0001
## 4080 0.5101 0.6999 0.0100 -0.0001
## 4100 0.5091 0.7003 0.0100 -0.0002
## 4120 0.5084 0.7019 0.0100 -0.0001
## 4140 0.5075 0.7018 0.0100 -0.0001
## 4160 0.5068 0.7014 0.0100 -0.0002
## 4180 0.5061 0.7012 0.0100 -0.0001
## 4200 0.5054 0.7020 0.0100 -0.0002
## 4220 0.5048 0.7014 0.0100 -0.0002
## 4240 0.5041 0.7026 0.0100 -0.0002
## 4260 0.5034 0.7033 0.0100 -0.0001
## 4280 0.5024 0.7037 0.0100 -0.0002
## 4300 0.5016 0.7045 0.0100 -0.0002
## 4320 0.5009 0.7040 0.0100 -0.0001
## 4340 0.5004 0.7049 0.0100 -0.0001
## 4360 0.4999 0.7036 0.0100 -0.0002
## 4380 0.4992 0.7038 0.0100 -0.0000
## 4400 0.4982 0.7043 0.0100 -0.0003
## 4420 0.4975 0.7035 0.0100 -0.0002
## 4440 0.4969 0.7045 0.0100 -0.0002
## 4460 0.4961 0.7042 0.0100 -0.0001
## 4480 0.4953 0.7044 0.0100 -0.0002
## 4500 0.4946 0.7043 0.0100 -0.0001
## 4520 0.4940 0.7032 0.0100 -0.0002
## 4540 0.4932 0.7031 0.0100 -0.0002
## 4560 0.4925 0.7027 0.0100 -0.0001
## 4580 0.4917 0.7039 0.0100 -0.0001
## 4600 0.4912 0.7044 0.0100 -0.0002
## 4620 0.4904 0.7054 0.0100 -0.0002
## 4640 0.4897 0.7063 0.0100 -0.0002
## 4660 0.4889 0.7073 0.0100 -0.0002
## 4680 0.4882 0.7073 0.0100 -0.0002
## 4700 0.4875 0.7057 0.0100 -0.0001
## 4720 0.4870 0.7068 0.0100 -0.0002
## 4740 0.4862 0.7066 0.0100 -0.0002
## 4760 0.4858 0.7073 0.0100 -0.0001
## 4780 0.4851 0.7079 0.0100 -0.0002
## 4800 0.4843 0.7090 0.0100 -0.0001
## 4820 0.4837 0.7091 0.0100 -0.0002
## 4840 0.4831 0.7096 0.0100 -0.0002
## 4860 0.4823 0.7099 0.0100 -0.0001
## 4880 0.4815 0.7092 0.0100 -0.0000
## 4900 0.4808 0.7079 0.0100 -0.0001
## 4920 0.4801 0.7090 0.0100 -0.0001
## 4940 0.4794 0.7094 0.0100 -0.0001
## 4960 0.4788 0.7099 0.0100 -0.0001
## 4980 0.4780 0.7095 0.0100 -0.0001
## 5000 0.4773 0.7113 0.0100 -0.0002
#look at the last model built
#Relative influence among the variables can be used in variable selection
summary(Model)
## var rel.inf
## Fare Fare 32.436103
## Age Age 28.226271
## Sex Sex 21.180985
## Pclass Pclass 8.517800
## Embarked Embarked 4.171794
## SibSp SibSp 3.670808
## Parch Parch 1.796239
#If you see one variable that's much more important than all of the rest,
#that could be evidence of overfitting.
#optimal number of trees, chosen here on the held-out validation set
#(gbm's "test method", set up via nTrain above)
gbm.perf(Model)
## Using test method...
## [1] 2128
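gbm.fit() itself does no cross-validation; the interaction.depth comment above could be acted on with the formula interface and cv.folds. A hedged sketch, not run in this document (cv_data and cv_model are names introduced here):
cv_data  <- data.frame(all[1:end_trn, ], Survived = survived)
cv_model <- gbm(Survived ~ ., data = cv_data,
                distribution = "bernoulli", n.trees = ntrees,
                shrinkage = 0.01, interaction.depth = 3,
                n.minobsinnode = 10, cv.folds = 5)
gbm.perf(cv_model, method = "cv")   # optimal tree count from 5-fold CV
Repeating this over a small grid of interaction.depth values and keeping the best would implement the suggestion above.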
#partial dependence plot of each predictor
for (i in 1:length(Model$var.names)) {
  plot(Model, i.var = i
       , n.trees = gbm.perf(Model, plot.it = FALSE) #optimal number of trees
       , type = "response" #to get fitted probabilities
  )
}
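Partial dependence on pairs of variables can hint at interactions as well; for example (a sketch, using the same optimal tree count):
plot(Model, i.var = c("Age", "Sex"),
     n.trees = gbm.perf(Model, plot.it = FALSE),
     type = "response")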
#test set predictions
TestPredictions = predict(object = Model,newdata =all[(end_trn+1):end,]
, n.trees = gbm.perf(Model, plot.it = FALSE)
, type = "response") #to output a probability
## Using test method...
#training set predictions
TrainPredictions = predict(object = Model,newdata =all[1:end_trn,]
, n.trees = gbm.perf(Model, plot.it = FALSE)
, type = "response")
## Using test method...
head(TrainPredictions)
## [1] 0.07820954 0.97460715 0.80044339 0.98858516 0.21980217 0.06847401
#round the predictions to zero or one
#in general, don't do this!
#it is only because the answers in the competition had to be 0 or 1
TestPredictions = round(TestPredictions)
TrainPredictions = round(TrainPredictions)
head(TrainPredictions)
## [1] 0 1 1 1 0 0
#could also mess around with different cutoff values
#(see the sketch after the accuracy calculation below);
#choosing the best one would need CV
head(TrainPredictions, n = 20)
## [1] 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1
head(survived, n = 20)
## [1] 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1
#in sample classification accuracy
accuracy <- 1-sum(abs(survived-TrainPredictions)) / length(TrainPredictions)
accuracy
## [1] 0.8776655
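A confusion matrix breaks the same in-sample fit down by class (a minimal sketch):
table(actual = survived, predicted = TrainPredictions)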
#depending upon the tuning parameters,
#I've gotten this as high as 99%, but that model
#resulted in lower test set scores
error <- 1 - accuracy
error
## [1] 0.1223345
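To act on the cutoff idea mentioned above, one could scan cutoffs over the unrounded training probabilities. A hedged sketch (TrainProbs is a new name introduced here, and a proper choice would use cross-validation rather than in-sample accuracy):
TrainProbs <- predict(Model, newdata = all[1:end_trn, ],
                      n.trees = gbm.perf(Model, plot.it = FALSE),
                      type = "response")
cutoffs <- seq(0.3, 0.7, by = 0.05)
data.frame(cutoff = cutoffs,
           accuracy = sapply(cutoffs, function(ct) mean((TrainProbs > ct) == survived)))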
#to get predicted out of sample accuracy
#need to set aside a testing data set
#write the submission
head(test)
## PassengerId Pclass Name Sex
## 1 892 3 Kelly, Mr. James male
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female
## 3 894 2 Myles, Mr. Thomas Francis male
## 4 895 3 Wirz, Mr. Albert male
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6 897 3 Svensson, Mr. Johan Cervin male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 34.5 0 0 330911 7.8292 Q
## 2 47.0 1 0 363272 7.0000 S
## 3 62.0 0 0 240276 9.6875 Q
## 4 27.0 0 0 315154 8.6625 S
## 5 22.0 1 1 3101298 12.2875 S
## 6 14.0 0 0 7538 9.2250 S
nrow(test)
## [1] 418
head(test[,1])
## [1] 892 893 894 895 896 897
# submission = data.frame(PassengerId = 1:nrow(test), survived = TestPredictions)
submission = data.frame(PassengerId = test[,1], survived = TestPredictions)
head(submission)
## PassengerId survived
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 0
## 6 897 0
write.csv(submission, file = "submission_gbm.csv", row.names = FALSE)
#####################################################
submission <- read.csv("submission_gbm.csv")
dim(submission) # 418 rows and 2 columns (PassengerId, survived)
## [1] 418 2
head(submission[,1])
## [1] 892 893 894 895 896 897