wd = "/Users/aarontomat/Desktop/Boston College Fall 2021/Big Data Econometrics/"
rfile1=paste0(wd,"train (Titanic).csv")
tf = read.csv(rfile1,na.strings=c("NA",""))
str(tf)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr NA "C85" NA "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
tf$Embarked = as.factor(tf$Embarked)
tf$Name = as.factor(tf$Name)
tf$Sex = as.factor(tf$Sex)
tf$Ticket = as.factor(tf$Ticket)
tf$Cabin = as.factor(tf$Cabin)
tf$Survived = as.factor(tf$Survived)
attach(tf)
str(tf)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 147 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
sapply(tf, function(col){100*mean(is.na(col))}) # get a % of NA for all variables
## PassengerId Survived Pclass Name Sex Age
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 19.8653199
## SibSp Parch Ticket Fare Cabin Embarked
## 0.0000000 0.0000000 0.0000000 0.0000000 77.1043771 0.2244669
The features with NA’s are Age, Cabin, and Embarked. Rather than simply dropping the affected observations, we will try to approximate the missing values in order to keep the data set as complete as possible.
table(tf$Embarked, useNA="always") # to show number of NA values in the dataset
##
## C Q S <NA>
## 168 77 644 2
tf$Embarked[which(is.na(tf$Embarked))]='S' # assign missing values to the most frequent port, Southampton
table(tf$Embarked, useNA="always") # no NA and S went up by two
##
## C Q S <NA>
## 168 77 646 0
For Embarked there are only three possible values: Cherbourg, Queenstown, and Southampton, each represented by the first letter of its name. Only 0.22% of the Embarked observations are NA, which comes out to exactly two passengers. To fill these two NA’s we simply assign them to the most frequent port, Southampton. Southampton accounts for 72% of the observations, so this rule has roughly a 72% chance of being correct for each of the two passengers, which is better than random guessing.
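As an aside, the same rule can be written without hard-coding 'S', so it stays correct even if the modal level changes; a minimal sketch, where most_common is a throwaway name:
# Impute NAs with the modal level instead of hard-coding the port
most_common <- names(which.max(table(tf$Embarked)))
tf$Embarked[is.na(tf$Embarked)] <- most_common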
sum(is.na(tf$Age)==TRUE) # 177 out of 891 are NA
## [1] 177
sum(is.na(tf$Age)==TRUE)/length(tf$Age)
## [1] 0.1986532
Age has almost 20% of its observations listed as NA, so more work is needed to get a fair approximation. We will use the Name variable to estimate age, specifically the title given to each passenger. Titles range over Mr., Mrs., Miss., Ms., Master., Dr., etc., and we will assume they are a good proxy for a passenger’s age. We will extract each passenger’s title, group passengers by title, and compute the mean age of each title group. Those means then serve as the approximate ages assigned to the NA values; for example, a passenger with the title Mr. and a missing age is assigned the average age of all other passengers titled Mr.
tf$Name = as.character(tf$Name)
# Missing ages can be imputed from passenger titles, assuming each title's average age is a good proxy
table_words = table(unlist(strsplit(tf$Name, "\\s+")))
sort(table_words[grep('\\.', names(table_words))], decreasing=TRUE)
##
## Mr. Miss. Mrs. Master. Dr. Rev. Col. Major.
## 517 182 125 40 7 6 2 2
## Mlle. Capt. Countess. Don. Jonkheer. L. Lady. Mme.
## 2 1 1 1 1 1 1 1
## Ms. Sir.
## 1 1
library(stringr)
# Extract each passenger's title (the substring ending in a period) and bind it to Age
tb=cbind(tf$Age, str_match(tf$Name, "[a-zA-Z]+\\."))
# Count how many missing ages fall under each title
table(tb[is.na(tb[,1]),2])
##
## Dr. Master. Miss. Mr. Mrs.
## 1 4 36 119 17
# Calculate the mean of each group based on title
mean.mr=mean(tf$Age[grepl(" Mr\\.", tf$Name) & !is.na(tf$Age)])
mean.mrs=mean(tf$Age[grepl(" Mrs\\.", tf$Name) & !is.na(tf$Age)])
mean.dr=mean(tf$Age[grepl(" Dr\\.", tf$Name) & !is.na(tf$Age)])
mean.miss=mean(tf$Age[grepl(" Miss\\.", tf$Name) & !is.na(tf$Age)])
mean.master=mean(tf$Age[grepl(" Master\\.", tf$Name) & !is.na(tf$Age)])
# Assign the missing values of each title to the mean estimated above
tf$Age[grepl(" Mr\\.", tf$Name) & is.na(tf$Age)]=mean.mr
tf$Age[grepl(" Mrs\\.", tf$Name) & is.na(tf$Age)]=mean.mrs
tf$Age[grepl(" Dr\\.", tf$Name) & is.na(tf$Age)]=mean.dr
tf$Age[grepl(" Miss\\.", tf$Name) & is.na(tf$Age)]=mean.miss
tf$Age[grepl(" Master\\.", tf$Name) & is.na(tf$Age)]=mean.master
# Verify that Age has no more N/A by running the sapply function
NaAge <- sapply(tf, function(col){100*mean(is.na(col))})
NaAge["Age"] # Age has 0% of NA's
## Age
## 0
sapply(tf, function(col){100*mean(is.na(col))})
## PassengerId Survived Pclass Name Sex Age
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
## SibSp Parch Ticket Fare Cabin Embarked
## 0.00000 0.00000 0.00000 0.00000 77.10438 0.00000
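As an aside, the same title-based imputation can be written in one pass with dplyr and stringr (both loaded above). This is only a sketch, and tf_alt is a throwaway copy rather than a modification of tf:
# Extract the title, then fill missing ages with each title group's mean
tf_alt <- tf %>%
  mutate(Title = str_extract(Name, "[A-Za-z]+\\.")) %>%
  group_by(Title) %>%
  mutate(Age = ifelse(is.na(Age), mean(Age, na.rm = TRUE), Age)) %>%
  ungroup()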
The last variable with NA’s is Cabin. Over 77% of its observations are NA, many of them belonging to second- and third-class passengers. With so many missing values there is no reliable way to estimate them all. One could assume that passengers who share a last name are likely to share a cabin; however, many second- and third-class passengers shared cabins across multiple families, and many first-class cabins held only one person. With so many unknowns the task is effectively impossible.
hist(tf$Age, main="Passenger Age", xlab = "Age")
The distribution of passenger ages is fairly normal, ranging from under 1 year old up to 80, with the majority of passengers younger than 40.
# Percentage of people under the age of 40
nrow(filter(tf, Age < 40))/nrow(tf)*100
## [1] 81.59371
barplot(table(tf$SibSp),main="Passenger Siblings/Spouses")
The distribution of passenger siblings/spouses is highly skewed to the right, indicating that most passengers traveled with few or no siblings or spouses. The later histograms comparing survivors to those who perished show the same strong right skew, with passengers who had more siblings aboard being more likely to perish.
barplot(table(tf$Parch),main="Passenger Parch")
The distribution of the number of parents and children is again highly skewed to the right, as most passengers had no parents or children on board the Titanic with them.
hist(tf$Fare, main="Passenger Fare", xlab="Fare")
The histogram of passenger fares is also highly skewed to the right, with many fares below $10. There is also a clear separation between fares across classes: 1st class spent the most on average by a large margin, followed by 2nd and then 3rd class.
tfC1 <- tf %>% filter(Pclass == 1)
mean(tfC1$Fare) # Average fare paid for a 1st class ticket
## [1] 84.15469
tfC2 <- tf %>% filter(Pclass == 2)
mean(tfC2$Fare) # Average fare paid for a 2nd class ticket
## [1] 20.66218
tfC3 <- tf %>% filter(Pclass == 3)
mean(tfC3$Fare) # Average fare paid for a 3rd class ticket
## [1] 13.67555
barplot(table(tf$Embarked)/nrow(tf),main="Port of Embarkation")
The plot of passengers by port of embarkation shows that over 70% of observations are associated with Southampton. The next most frequent port is Cherbourg with just under 20% of observations, followed by Queenstown with just under 10%.
barplot(table(tf$Sex))
The plot that shows the distribution of passengers by Sex indicates that there were more males on board the ship than females.
counts = table(tf$Survived,tf$Sex)
barplot(counts, col=c("darkblue","red"), legend=c("Perished","Survived"), main = "Passenger Survival by Sex")
The protocol at the time of the Titanic was that women and children had priority over men when loading the lifeboats. Because of that policy, Sex should be a good predictor of who survived and who perished: a fitted model, applied to new data it has not seen, would give female passengers a higher estimated chance of survival than male ones, which matches what actually happened. The plot of survival by Sex shows that more females survived than perished, while more males perished than survived.
countsC = table(tf$Survived,tf$Pclass)
barplot(countsC, col=c("darkblue","red"), legend=c("Perished","Survived"), main = "Titanic Class Bar Plot")
The bar plot of survival by class shows that far more third-class passengers perished than survived, that the split leaned slightly toward perishing in second class, and that a majority of first-class passengers survived.
hist(tf$Age[which(tf$Survived=="0")], main = "Passenger Age Histogram", xlab="Age",ylab="Count", col="blue", breaks=seq(0,80, by=2))
hist(tf$Age[which(tf$Survived=="1")], col="red", add=T,breaks=seq(0,80, by=2)) # Red is survived, blue is died
boxplot(tf$Age ~ tf$Survived, main="Passenger Survival by Age",xlab="Survived",ylab="Age")
# Survival Rate of....
# Passengers less than 13 years old
tf.child=tf$Survived[tf$Age<13]
length(tf.child[which(tf.child==1)])/length(tf.child)
## [1] 0.5753425
# Passengers between 13 and 24 years old
tf.youth=tf$Survived[tf$Age>=13 & tf$Age<25]
length(tf.youth[which(tf.youth==1)])/length(tf.youth)
## [1] 0.4081633
# Passengers between 25 and 64 years old
tf.adult=tf$Survived[tf$Age>=25 & tf$Age<65]
length(tf.adult[which(tf.adult==1)])/length(tf.adult)
## [1] 0.3540925
# Passengers greater than or equal to 65 years old
tf.senior=tf$Survived[tf$Age>=65]
length(tf.senior[which(tf.senior==1)])/length(tf.senior)
## [1] 0.09090909
The graph comparing age to survival shows that a majority of passengers perished across almost all age groups. The greatest gap between the number who survived and perished occurs between the ages of 20 and 40. The boxplot shows that the range of ages is smaller, and the average age higher, among passengers who perished than among those who survived. Dividing the ages into groups, only about 9% of passengers 65 or older survived, while more than half of passengers under 13 survived. The more populous 13-to-24 and 25-to-64 age groups had survival rates of roughly 41% and 35% respectively.
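The same group survival rates can be computed in one pass with cut() and tapply(); a sketch, with bin edges mirroring the groups above:
# Survival rate within each age bin: [0,13), [13,25), [25,65), [65,Inf)
age_bins <- cut(tf$Age, breaks = c(0, 13, 25, 65, Inf), right = FALSE)
tapply(tf$Survived == "1", age_bins, mean)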
We are going to ignore the Cabin feature of this data set: as discussed earlier, its many NA’s would be hard or near impossible to estimate, rendering the variable useless.
# We will separate out the variables that are not going to be used in the models
tf <- tf %>% dplyr::select(-PassengerId,-Name,-Ticket,-Cabin)
# PassengerID is unimportant,
# Name has too many unique categorical values,
# Ticket tells us nothing about the passengers,
# Cabin has too many NA's
# Then we will then split the data set into train and test.
set.seed(10)
tftrain <- sample(nrow(tf),round(.7*nrow(tf)))
tf.train <- tf[tftrain,]
tf.test <- tf[-tftrain,]
We will train the models using 70% of the original data set; the remaining 30% will be used to validate them.
set.seed(10)
train_control <- trainControl(method = "cv",
number = 10)
KCV <- function(method){
  model <- train(Survived ~ ., data = tf.train,
                 method = method, # pass the requested method through to caret
                 trControl = train_control)
  print(model)
}
We use two resampling methods when modelling our data: the validation set approach and K-fold cross-validation. The validation set approach splits the data into two subsets, a training set used to fit the model and a validation set on which the fitted model makes predictions. K-fold cross-validation is used to compare models and to estimate their accuracy, or test error rate. It also splits the data, but into K folds that take turns serving as the validation set, with the model fit on the remaining observations; the process repeats once per fold, each iteration holding out a different subset. K-fold cross-validation can therefore be considered more reliable, since multiple validation samples are taken and averaged, compared to the single split of the validation set approach.
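To make the K-fold mechanics concrete, below is a minimal manual 10-fold loop for the logistic model. This is only a sketch of what caret::train does internally; the fold assignment and bookkeeping are simplified, and the fold indices here are not caret's:
# Manual 10-fold cross-validation for the logistic model
set.seed(10)
folds <- sample(rep(1:10, length.out = nrow(tf.train)))
acc <- sapply(1:10, function(k){
  fit <- glm(Survived ~ ., data = tf.train[folds != k, ], family = binomial)
  prob <- predict(fit, tf.train[folds == k, ], type = "response")
  mean(ifelse(prob > 0.5, "1", "0") == tf.train[folds == k, "Survived"])
})
mean(acc) # average held-out accuracy across the 10 folds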
tf.Corr <- tf %>% dplyr::select(-Survived,-Sex,-Embarked)
cor(tf.Corr)
## Pclass Age SibSp Parch Fare
## Pclass 1.00000000 -0.34356704 0.08308136 0.01844267 -0.54949962
## Age -0.34356704 1.00000000 -0.26766027 -0.19680171 0.09109524
## SibSp 0.08308136 -0.26766027 1.00000000 0.41483770 0.15965104
## Parch 0.01844267 -0.19680171 0.41483770 1.00000000 0.21622494
## Fare -0.54949962 0.09109524 0.15965104 0.21622494 1.00000000
The table above displays a correlation matrix of the numerical variables, so Survived, Sex, and Embarked are not included. There is a fairly strong negative correlation between Fare and Pclass: the higher the fare, the lower (better) the class number, consistent with first-class tickets being the most expensive on average. There is also a moderately strong positive correlation between SibSp and Parch, which makes sense: people travelling with their entire family have both their siblings/spouses and their parents/children aboard. Neither correlation is excessively high, however, so no variables are removed for multicollinearity.
x <- model.matrix(Survived ~ ., data = tf.train)[, -1]
y <- tf.train$Survived
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
set.seed(10)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)
cv.lasso$lambda.min
## [1] 0.006865961
bestlam <- cv.lasso$lambda.min
lasso.model <- glmnet(x, y, alpha = 1, family = "binomial",
lambda = cv.lasso$lambda.min)
x.test <- model.matrix(Survived ~., tf.test)[,-1]
probabilities <- lasso.model %>% predict(newx = x.test, type = "response") # "response" returns probabilities rather than log-odds
predicted.classes <- ifelse(probabilities > 0.5, "1", "0")
observed.classes <- tf.test$Survived
mean(predicted.classes == observed.classes)
## [1] 0.8089888
lasso.coef <- predict(lasso.model, type = "coefficients", s = bestlam)[1:9, ]
lasso.coef
## (Intercept) Pclass Sexmale Age SibSp Parch
## 4.867290955 -1.029054324 -2.434691442 -0.038420946 -0.273092246 0.000000000
## Fare EmbarkedQ EmbarkedS
## 0.001507473 0.000000000 -0.422965650
The variables whose coefficients have been shrunk to 0 are Parch and EmbarkedQ. EmbarkedQ likely went to 0 because it is closely related to EmbarkedS; since Embarked is a single variable in our data set, we leave it alone. Parch most likely went to 0 because of its relationship with SibSp, so even though we kept it after inspecting the correlation matrix, we remove it now.
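To visualize the shrinkage directly, the full lasso path can be plotted against log(lambda); a sketch reusing the x and y matrices built above:
# Coefficient paths: each curve is one predictor shrinking toward zero
lasso.path <- glmnet(x, y, alpha = 1, family = "binomial")
plot(lasso.path, xvar = "lambda", label = TRUE)
abline(v = log(cv.lasso$lambda.min), lty = 2) # lambda chosen by cross-validation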
# Remove Parch
tf <- tf %>% dplyr::select(-Parch)
# Again split the data set now with Parch removed
set.seed(10)
tftrain <- sample(nrow(tf),round(.7*nrow(tf)))
tf.train <- tf[tftrain,]
tf.test <- tf[-tftrain,]
Logistic regression coefficient estimates are calculated by maximum likelihood, and the fitted model returns a predicted probability between 0 and 1. We set a threshold, in this case 0.5, that classifies each observation into one of our two classes, 1 (Survived) or 0 (Perished). Whenever an observation has a predicted probability greater than 0.5 it is classified as Survived; otherwise it is classified as Perished.

### Model
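For intuition, the predicted probability is the logistic transform of the linear predictor, so thresholding the probability at 0.5 is equivalent to thresholding the linear predictor at 0. A small sketch (p_hat and the example values are illustrative only):
# p = 1 / (1 + exp(-eta)), where eta = X %*% beta is the linear predictor
p_hat <- function(eta) 1 / (1 + exp(-eta))
p_hat(0) # 0.5: exactly on the decision boundary
p_hat(2.3) # ~0.91: classified as Survived
p_hat(-1) # ~0.27: classified as Perished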
logit <- glm(Survived ~., data = tf.train, family = binomial)
summary(logit)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = tf.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7414 -0.6298 -0.4050 0.6548 2.4880
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.577101 0.684335 8.150 3.65e-16 ***
## Pclass -1.121692 0.171441 -6.543 6.04e-11 ***
## Sexmale -2.606145 0.232870 -11.191 < 2e-16 ***
## Age -0.046647 0.009386 -4.970 6.69e-07 ***
## SibSp -0.368101 0.119905 -3.070 0.00214 **
## Fare 0.002059 0.002707 0.760 0.44701
## EmbarkedQ -0.240631 0.461013 -0.522 0.60170
## EmbarkedS -0.572027 0.278699 -2.052 0.04012 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 834.27 on 623 degrees of freedom
## Residual deviance: 555.09 on 616 degrees of freedom
## AIC: 571.09
##
## Number of Fisher Scoring iterations: 5
# Set Up
library(boot)
boot.logit <- function(data, index){
  coef(glm(Survived ~ ., data = data, family = binomial, subset = index))
}
set.seed(10)
# Bootstrap standard errors and comparison to standard errors of model
boot(tf.train, boot.logit, 100);summary(logit)$coef[,2]
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = tf.train, statistic = boot.logit, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 5.577100665 0.2471798044 0.759885111
## t2* -1.121692310 -0.0289017346 0.197807325
## t3* -2.606145166 -0.0566792743 0.258989873
## t4* -0.046647413 -0.0041587067 0.011192669
## t5* -0.368101292 -0.0307326391 0.117744760
## t6* 0.002058502 0.0004717611 0.004289385
## t7* -0.240631328 -0.0743754430 0.491316097
## t8* -0.572026562 -0.0006975694 0.285581809
## (Intercept) Pclass Sexmale Age SibSp Fare
## 0.68433495 0.17144070 0.23286980 0.00938551 0.11990462 0.00270711
## EmbarkedQ EmbarkedS
## 0.46101267 0.27869886
The bootstrap standard-error estimates are fairly similar to those from the original model, though not identical. Because the bootstrap does not rely on the model’s distributional assumptions, it likely gives better estimates of the standard errors than the model-based formulas do.
logit.Prob = predict(logit, tf.test, type = "response")
logit.Pred = rep("Perished", nrow(tf.test))
logit.Pred[logit.Prob > .5] = "Survived"
logit.table <- table(logit.Pred, tf.test$Survived)
logit.table
##
## logit.Pred 0 1
## Perished 149 30
## Survived 19 69
(logit.table[1,1]+logit.table[2,2])/sum(logit.table)
## [1] 0.8164794
In this case the accuracy of the model was 0.8165, indicating that it correctly classified 81.65% of the test observations.
KCV("glm")
## Generalized Linear Model
##
## 624 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 561, 561, 562, 562, 562, 561, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7884281 0.5505109
Linear Discriminant Analysis and Quadratic Discriminant Analysis both make predictions by approximating the Bayes classifier, which estimates the probability that an observation belongs to a given class conditional on its predictor values. To approximate the Bayes classifier we must model the distribution of the predictors within each class, which requires assumptions. LDA assumes the observations in each class come from a normal distribution with a class-specific mean and a common covariance; QDA allows each class its own covariance. LDA is most effective when the boundary between classes is linear, and QDA when the boundary is non-linear, hence the difference in their names.
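A quick, informal check on which assumption is closer to the truth is to compare the class-specific spreads of the numeric predictors: similar spreads favor LDA's common-covariance assumption, very different spreads favor QDA. A sketch:
# Class-specific standard deviations of two numeric predictors
tapply(tf.train$Fare, tf.train$Survived, sd)
tapply(tf.train$Age, tf.train$Survived, sd)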
library(MASS)
lda<- lda(Survived ~ ., data = tf.train)
lda
## Call:
## lda(Survived ~ ., data = tf.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6105769 0.3894231
##
## Group means:
## Pclass Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 0 2.511811 0.8398950 30.83962 0.5879265 22.52568 0.08398950 0.7847769
## 1 1.975309 0.3168724 27.89318 0.5308642 46.71536 0.09053498 0.6337449
##
## Coefficients of linear discriminants:
## LD1
## Pclass -0.726791654
## Sexmale -2.026371036
## Age -0.029546762
## SibSp -0.219282355
## Fare 0.001746876
## EmbarkedQ -0.094815273
## EmbarkedS -0.371887242
# Set Up
boot.lda <- function(data, index){
  coef(lda(Survived ~ ., data = data, subset = index))
}
set.seed(10)
# Bootstrap standard errors
boot(tf.train, boot.lda, 100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = tf.train, statistic = boot.lda, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -0.726791654 -0.0145468291 0.117838613
## t2* -2.026371036 0.0042513619 0.142862914
## t3* -0.029546762 -0.0018296591 0.006303598
## t4* -0.219282355 -0.0080691844 0.058764993
## t5* 0.001746876 -0.0002369459 0.001806019
## t6* -0.094815273 -0.0257751954 0.291298110
## t7* -0.371887242 0.0087493157 0.175850016
lda.Pred <- predict(lda, tf.test)
lda.Class <- lda.Pred$class
table(lda.Class,tf.test$Survived)
##
## lda.Class 0 1
## 0 145 30
## 1 23 69
# Overall accuracy on the test set
mean(lda.Class == tf.test$Survived)
## [1] 0.8014981
The model correctly classified 80% of the test observations.
KCV("lda")
## Generalized Linear Model
##
## 624 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 561, 561, 562, 562, 562, 561, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7884281 0.5505109
qda <- qda(Survived ~ ., data = tf.train)
qda
## Call:
## qda(Survived ~ ., data = tf.train)
##
## Prior probabilities of groups:
## 0 1
## 0.6105769 0.3894231
##
## Group means:
## Pclass Sexmale Age SibSp Fare EmbarkedQ EmbarkedS
## 0 2.511811 0.8398950 30.83962 0.5879265 22.52568 0.08398950 0.7847769
## 1 1.975309 0.3168724 27.89318 0.5308642 46.71536 0.09053498 0.6337449
qda.Pred <- predict(qda, tf.test)
qda.Class <- qda.Pred$class
table(qda.Class,tf.test$Survived)
##
## qda.Class 0 1
## 0 146 27
## 1 22 72
mean(qda.Class == tf.test$Survived)
## [1] 0.8164794
QDA correctly classified 81.6% of the test observations, an improvement over LDA.
KCV("qda")
## Generalized Linear Model
##
## 624 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 561, 562, 562, 561, 562, 561, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7963902 0.5654695
Naïve Bayes also follows Bayes’ theorem; its main assumption is that, within each class, the predictors are independent of one another when estimating the probability of X given that class. This reduces the need to worry about predictors being associated with each other. Naïve Bayes is most useful for data sets with a high number of predictors relative to the number of observations, which is not our situation; hence its accuracy does not outperform LDA or QDA.
library(e1071)
nb <- naiveBayes(Survived ~ ., data = tf.train)
nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.6105769 0.3894231
##
## Conditional probabilities:
## Pclass
## Y [,1] [,2]
## 0 2.511811 0.7594958
## 1 1.975309 0.8572777
##
## Sex
## Y female male
## 0 0.1601050 0.8398950
## 1 0.6831276 0.3168724
##
## Age
## Y [,1] [,2]
## 0 30.83962 12.95806
## 1 27.89318 13.92785
##
## SibSp
## Y [,1] [,2]
## 0 0.5879265 1.3555970
## 1 0.5308642 0.7287443
##
## Fare
## Y [,1] [,2]
## 0 22.52568 31.59516
## 1 46.71536 64.04710
##
## Embarked
## Y C Q S
## 0 0.13123360 0.08398950 0.78477690
## 1 0.27572016 0.09053498 0.63374486
nb.Pred <- predict(nb, tf.test)
table.nb <- table(nb.Pred, tf.test$Survived)
table.nb
##
## nb.Pred 0 1
## 0 150 37
## 1 18 62
(table.nb[1,1]+table.nb[2,2])/(sum(table.nb))
## [1] 0.7940075
The Naïve Bayes model correctly classified about 79% of the test observations.
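To make the independence assumption concrete, the posterior for a single test passenger can be rebuilt by hand from the class-conditional tables printed above, multiplying one likelihood term per feature. This is only a sketch and may differ from predict()'s output in small numerical details:
# Rebuild P(class | x) for the first test passenger under independence
obs <- tf.test[1, ]
prior <- nb$apriori / sum(nb$apriori)
lik <- function(cls){
  prior[cls] *
    dnorm(obs$Pclass, nb$tables$Pclass[cls, 1], nb$tables$Pclass[cls, 2]) *
    nb$tables$Sex[cls, as.character(obs$Sex)] *
    dnorm(obs$Age, nb$tables$Age[cls, 1], nb$tables$Age[cls, 2]) *
    dnorm(obs$SibSp, nb$tables$SibSp[cls, 1], nb$tables$SibSp[cls, 2]) *
    dnorm(obs$Fare, nb$tables$Fare[cls, 1], nb$tables$Fare[cls, 2]) *
    nb$tables$Embarked[cls, as.character(obs$Embarked)]
}
post <- c(lik("0"), lik("1"))
post / sum(post) # manual posterior
predict(nb, obs, type = "raw") # should agree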
KCV("nb")
## Generalized Linear Model
##
## 624 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 562, 562, 561, 562, 561, 562, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7915771 0.555903
KNN predicts the class of an observation X by finding the K training observations closest to it and taking a majority vote among their classes. When the decision boundary is highly non-linear, KNN can therefore be quite accurate. The practical difference from QDA is that KNN, being fully non-parametric, works best when there are many observations relative to the number of predictors.
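Because KNN relies on Euclidean distance, unscaled features let the variable with the largest spread, here Fare, dominate the distance calculation. A quick look at the raw standard deviations makes the point (a sketch):
# Raw spreads of the numeric predictors: Fare dwarfs the rest,
# which is why we standardize before running knn()
apply(tf[, c("Pclass", "Age", "SibSp", "Fare")], 2, sd)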
library(class)
library(caTools)
# sample.split stratifies on the response so both subsets keep the class ratio;
# the categorical variables are dropped because knn() needs numeric inputs
split <- sample.split(tf$Survived, SplitRatio = 0.3)
train_X <- subset(tf %>% dplyr::select(-Sex, -Embarked), split == TRUE)
test_X <- subset(tf %>% dplyr::select(-Sex, -Embarked), split == FALSE)
# Scale the data; the test set uses the training set's centering and scaling
train_scale <- scale(train_X[, 2:5])
test_scale <- scale(test_X[, 2:5],
                    center = attr(train_scale, "scaled:center"),
                    scale = attr(train_scale, "scaled:scale"))
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_X$Survived,
k = 1)
cm <- table(classifier_knn, test_X$Survived)
cm
##
## classifier_knn 0 1
## 0 288 111
## 1 113 124
mean(classifier_knn == test_X$Survived) # Proportion of observations correctly predicted
## [1] 0.6477987
misClassError <- mean(classifier_knn != test_X$Survived)
print(paste('KNN Error Rate =', misClassError, "for k = 1")) # KNN Error Rate
## [1] "KNN Error Rate = 0.352201257861635 for k = 1"
print(paste('Accuracy =', 1-misClassError, "for k = 1")) # KNN Accuracy
## [1] "Accuracy = 0.647798742138365 for k = 1"
knn.table <- table(classifier_knn, test_X$Survived)
knn.table[2,2]/(knn.table[2,1]+knn.table[2,2]) # precision (positive predictive value) for class 1
## [1] 0.5232068
KNN.level <- function(k){
  classifier_knn <- knn(train = train_scale,
                        test = test_scale,
                        cl = train_X$Survived,
                        k = k)
  misClassError <- mean(classifier_knn != test_X$Survived)
  print(paste('KNN Error Rate =', misClassError, "for k =", k))
  print(paste('Accuracy =', 1-misClassError, "for k =", k))
  knn.table <- table(classifier_knn, test_X$Survived)
  knn.table[2,2]/(knn.table[2,1]+knn.table[2,2]) # precision for class 1
}
KNN.level(5)
## [1] "KNN Error Rate = 0.377358490566038 for k = 5"
## [1] "Accuracy = 0.622641509433962 for k = 5"
## [1] 0.4898785
KNN.level(10)
## [1] "KNN Error Rate = 0.284591194968553 for k = 10"
## [1] "Accuracy = 0.715408805031447 for k = 10"
## [1] 0.6377551
library(pROC)
# Sampling
set.seed(300)
KNNTrain <- createDataPartition(y = tf$Survived,p = 0.70,list = FALSE)
training <- tf[KNNTrain,]
testing <- tf[-KNNTrain,]
prop.table(table(training$Survived)) * 100
##
## 0 1
## 61.6 38.4
prop.table(table(testing$Survived)) * 100
##
## 0 1
## 61.65414 38.34586
prop.table(table(tf$Survived)) * 100
##
## 0 1
## 61.61616 38.38384
# Processing: Center and Scale the variables
trainX <- training[,names(training) != "Survived"]
preProcValues <- preProcess(x = trainX,method = c("center", "scale"))
preProcValues
## Created from 625 samples and 6 variables
##
## Pre-processing:
## - centered (4)
## - ignored (2)
## - scaled (4)
set.seed(400)
ctrl <- trainControl(method="repeatedcv",repeats = 3) #,classProbs=TRUE,summaryFunction = twoClassSummary)
knnFit <- train(Survived ~ ., data = training, method = "knn", trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20)
knnFit
## k-Nearest Neighbors
##
## 625 samples
## 6 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (7), scaled (7)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 562, 563, 562, 562, 563, 563, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8031149 0.5783337
## 7 0.7988735 0.5672480
## 9 0.7994880 0.5660426
## 11 0.7940519 0.5549203
## 13 0.7860642 0.5361481
## 15 0.7801502 0.5233693
## 17 0.7796211 0.5193393
## 19 0.7759345 0.5070154
## 21 0.7711043 0.4908997
## 23 0.7593958 0.4579110
## 25 0.7604625 0.4577570
## 27 0.7604796 0.4555512
## 29 0.7647465 0.4655693
## 31 0.7663509 0.4674006
## 33 0.7673664 0.4701700
## 35 0.7716334 0.4812650
## 37 0.7689708 0.4753459
## 39 0.7689452 0.4760922
## 41 0.7657450 0.4700016
## 43 0.7668203 0.4758555
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
plot(knnFit)
knnPredict <- predict(knnFit,newdata = testing)
table(knnPredict, testing$Survived)
##
## knnPredict 0 1
## 0 144 30
## 1 20 72
KCV("knn")
## Generalized Linear Model
##
## 624 samples
## 6 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 561, 561, 562, 562, 561, 562, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7962878 0.5663878
The K-fold accuracy for logistic regression was 0.788, which is close to the 0.8165 accuracy obtained from the validation set approach. Similarly, the K-fold cross-validation accuracy was slightly below the validation set accuracy for the other models as well, with no model showing a wide discrepancy between the two estimates.
logit.PredC = as.factor(ifelse(logit.Prob>0.5,1,0))
confusionMatrix(data=logit.PredC,reference=tf.test$Survived, positive="1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 149 30
## 1 19 69
##
## Accuracy : 0.8165
## 95% CI : (0.7647, 0.861)
## No Information Rate : 0.6292
## P-Value [Acc > NIR] : 2.009e-11
##
## Kappa : 0.5975
##
## Mcnemar's Test P-Value : 0.1531
##
## Sensitivity : 0.6970
## Specificity : 0.8869
## Pos Pred Value : 0.7841
## Neg Pred Value : 0.8324
## Prevalence : 0.3708
## Detection Rate : 0.2584
## Detection Prevalence : 0.3296
## Balanced Accuracy : 0.7919
##
## 'Positive' Class : 1
##
logit.prediction = prediction(logit.Prob,tf.test$Survived)
logit.roc = performance(logit.prediction,measure="tpr", x.measure="fpr")
logit.auc = performance(logit.prediction,measure="auc",x.measure="cutoff")
plot(logit.roc, colorize = TRUE, lwd=3,main=paste("AUC:",logit.auc@y.values[[1]]))
The ROC curve, which shows the trade-off between the two types of errors the model can make, has an area under the curve (AUC) of 0.856. This is quite good, as the ideal curve would reach a 100% true positive rate at a 0% false positive rate.
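The AUC also has a direct probabilistic reading: it is the probability that a randomly chosen survivor receives a higher predicted score than a randomly chosen non-survivor. A sketch checking that interpretation against the ROCR value:
# AUC as P(score of a random survivor > score of a random non-survivor)
pos <- logit.Prob[tf.test$Survived == "1"]
neg <- logit.Prob[tf.test$Survived == "0"]
mean(outer(pos, neg, ">") + 0.5 * outer(pos, neg, "==")) # ties count half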
confusionMatrix(lda.Class,tf.test$Survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 145 30
## 1 23 69
##
## Accuracy : 0.8015
## 95% CI : (0.7485, 0.8476)
## No Information Rate : 0.6292
## P-Value [Acc > NIR] : 8.057e-10
##
## Kappa : 0.5683
##
## Mcnemar's Test P-Value : 0.4098
##
## Sensitivity : 0.6970
## Specificity : 0.8631
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8286
## Prevalence : 0.3708
## Detection Rate : 0.2584
## Detection Prevalence : 0.3446
## Balanced Accuracy : 0.7800
##
## 'Positive' Class : 1
##
tf.lda.cm <- train(Survived ~., method = "lda", data = tf.test)
confusionMatrix(data=tf.lda.cm,reference=tf.test$Survived, positive="1")
## Bootstrapped (25 reps) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 0 1
## 0 53.3 11.8
## 1 9.5 25.4
##
## Accuracy (average) : 0.7871
lda.predict <- prediction(lda.Pred$posterior[,2], tf.test$Survived)
lda.roc <- performance(lda.predict,measure="tpr", x.measure="fpr")
lda.auc <- performance(lda.predict,measure="auc",x.measure="cutoff")
plot(lda.roc,colorize=TRUE,lwd=3,main=paste("AUC:",lda.auc@y.values[[1]]))
confusionMatrix(qda.Class,tf.test$Survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 146 27
## 1 22 72
##
## Accuracy : 0.8165
## 95% CI : (0.7647, 0.861)
## No Information Rate : 0.6292
## P-Value [Acc > NIR] : 2.009e-11
##
## Kappa : 0.6026
##
## Mcnemar's Test P-Value : 0.5677
##
## Sensitivity : 0.7273
## Specificity : 0.8690
## Pos Pred Value : 0.7660
## Neg Pred Value : 0.8439
## Prevalence : 0.3708
## Detection Rate : 0.2697
## Detection Prevalence : 0.3521
## Balanced Accuracy : 0.7982
##
## 'Positive' Class : 1
##
tf.qda.cm <- train(Survived ~., method = "qda", data = tf.test)
confusionMatrix(data=tf.qda.cm,reference=tf.test$Survived, positive="1")
## Bootstrapped (25 reps) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 0 1
## 0 50.8 12.5
## 1 11.3 25.4
##
## Accuracy (average) : 0.7622
qda.predict <- prediction(qda.Pred$posterior[,2], tf.test$Survived)
qda.roc <- performance(qda.predict,measure="tpr", x.measure="fpr")
qda.auc <- performance(qda.predict,measure="auc",x.measure="cutoff")
plot(qda.roc,colorize=TRUE,lwd=3,main=paste("AUC:",qda.auc@y.values[[1]]))
The accuracy of these models shows that QDA did a better job of classifying each observation into the correct class, suggesting the boundary between Survived and Perished is non-linear. However, the bootstrapped confusion matrix showed better accuracy for LDA, meaning that across repeated resamples the LDA classifier holds up better. The ROC curve for LDA displays an AUC of 0.857, better than the logistic regression model and better than QDA, whose AUC was very similar to logistic regression’s. LDA is therefore able to achieve a high true positive rate without incurring too high a false positive rate.
confusionMatrix(nb.Pred,tf.test$Survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 150 37
## 1 18 62
##
## Accuracy : 0.794
## 95% CI : (0.7405, 0.8409)
## No Information Rate : 0.6292
## P-Value [Acc > NIR] : 4.439e-09
##
## Kappa : 0.5404
##
## Mcnemar's Test P-Value : 0.01522
##
## Sensitivity : 0.6263
## Specificity : 0.8929
## Pos Pred Value : 0.7750
## Neg Pred Value : 0.8021
## Prevalence : 0.3708
## Detection Rate : 0.2322
## Detection Prevalence : 0.2996
## Balanced Accuracy : 0.7596
##
## 'Positive' Class : 1
##
tf.nb.cm <- train(Survived ~., method = "nb", data = tf.test)
confusionMatrix(data=tf.nb.cm,reference=tf.test$Survived, positive="1")
## Bootstrapped (25 reps) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 0 1
## 0 51.4 10.6
## 1 11.4 26.6
##
## Accuracy (average) : 0.7798
nb.pred.prob <- predict(nb, newdata = tf.test, type = "raw")
nb.pred.prob <- as.data.frame(nb.pred.prob)
nb.predict <- prediction(nb.pred.prob[,2], tf.test$Survived)
nb.roc <- performance(nb.predict,measure="tpr", x.measure="fpr")
nb.auc <- performance(nb.predict,measure="auc",x.measure="cutoff")
plot(nb.roc,colorize=TRUE,lwd=3,main=paste("AUC:",nb.auc@y.values[[1]]))
The ROC curve for Naïve Bayes displays the lowest area under the curve, indicating a higher false positive rate at any given true positive rate compared to the other models.
confusionMatrix(knnPredict, testing$Survived,positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 144 30
## 1 20 72
##
## Accuracy : 0.812
## 95% CI : (0.7598, 0.8571)
## No Information Rate : 0.6165
## P-Value [Acc > NIR] : 4.357e-12
##
## Kappa : 0.595
##
## Mcnemar's Test P-Value : 0.2031
##
## Sensitivity : 0.7059
## Specificity : 0.8780
## Pos Pred Value : 0.7826
## Neg Pred Value : 0.8276
## Prevalence : 0.3835
## Detection Rate : 0.2707
## Detection Prevalence : 0.3459
## Balanced Accuracy : 0.7920
##
## 'Positive' Class : 1
##
knnProb <- predict(knnFit, newdata = testing, type = "prob")
knn.predict <- prediction(knnProb[,2], testing$Survived)
knn.roc <- performance(knn.predict, measure = "tpr", x.measure = "fpr")
knn.auc <- performance(knn.predict, measure = "auc", x.measure = "cutoff")
plot(knn.roc, colorize = TRUE, lwd = 3, main = paste("AUC:", knn.auc@y.values[[1]]))
The ROC curve for KNN displays the highest area under the curve of all the models, meaning KNN has the best trade-off between the true positive and false positive rates: it achieves a high true positive rate without simultaneously suffering a high false positive rate.
wd = "/Users/aarontomat/Desktop/Boston College Fall 2021/Big Data Econometrics/"
rfile1=paste0(wd,"test (Titanic).csv")
tf2 = read.csv(rfile1,na.strings=c("NA",""))
tf2$Embarked = as.factor(tf2$Embarked)
tf2$Name = as.factor(tf2$Name)
tf2$Sex = as.factor(tf2$Sex)
tf2$Ticket = as.factor(tf2$Ticket)
tf2$Cabin = as.factor(tf2$Cabin)
attach(tf2)
## The following objects are masked from tf:
##
## Age, Cabin, Embarked, Fare, Name, Parch, PassengerId, Pclass, Sex,
## SibSp, Ticket
sapply(tf2, function(col){100*mean(is.na(col))})
## PassengerId Pclass Name Sex Age SibSp
## 0.0000000 0.0000000 0.0000000 0.0000000 20.5741627 0.0000000
## Parch Ticket Fare Cabin Embarked
## 0.0000000 0.0000000 0.2392344 78.2296651 0.0000000
tf2$Name = as.character(tf2$Name)
# Missing ages can be imputed from passenger titles, assuming each title's average age is a good proxy
table_words2 = table(unlist(strsplit(tf2$Name, "\\s+")))
sort(table_words2[grep('\\.', names(table_words2))], decreasing=TRUE)
##
## Mr. Miss. Mrs. Master. Col. Rev. Dona. Dr. Ms.
## 240 78 72 21 2 2 1 1 1
library(stringr)
# Extract each passenger's title (the substring ending in a period) and bind it to Age
tb2=cbind(tf2$Age, str_match(tf2$Name, "[a-zA-Z]+\\."))
# Count how many missing ages fall under each title
table(tb2[is.na(tb2[,1]),2])
##
## Master. Miss. Mr. Mrs. Ms.
## 4 14 57 10 1
# Calculate the mean of each group based on title
mean.mr=mean(tf2$Age[grepl(" Mr\\.", tf2$Name) & !is.na(tf2$Age)])
mean.mrs=mean(tf2$Age[grepl(" Mrs\\.", tf2$Name) & !is.na(tf2$Age)])
mean.dr=mean(tf2$Age[grepl(" Dr\\.", tf2$Name) & !is.na(tf2$Age)])
mean.miss=mean(tf2$Age[grepl(" Miss\\.", tf2$Name) & !is.na(tf2$Age)])
mean.master=mean(tf2$Age[grepl(" Master\\.", tf2$Name) & !is.na(tf2$Age)])
# Only one passenger holds the title Ms. and her age is missing, so we proxy Ms. with the Miss. group mean
mean.ms=mean(tf2$Age[grepl(" Miss\\.", tf2$Name) & !is.na(tf2$Age)])
# Assign the missing values of each title to the mean estimated above
tf2$Age[grepl(" Mr\\.", tf2$Name) & is.na(tf2$Age)]=mean.mr
tf2$Age[grepl(" Mrs\\.", tf2$Name) & is.na(tf2$Age)]=mean.mrs
tf2$Age[grepl(" Dr\\.", tf2$Name) & is.na(tf2$Age)]=mean.dr
tf2$Age[grepl(" Miss\\.", tf2$Name) & is.na(tf2$Age)]=mean.miss
tf2$Age[grepl(" Master\\.", tf2$Name) & is.na(tf2$Age)]=mean.master
tf2$Age[grepl(" Ms\\.", tf2$Name) & is.na(tf2$Age)]=mean.ms
# Verify that Age has no more N/A by running the sapply function
NaAge <- sapply(tf2, function(col){100*mean(is.na(col))})
NaAge["Age"] # Age has 0% of NA's
## Age
## 0
# The fare can be approximated using pclass as a proxy
tb3 <- cbind(tf2$Pclass,tf2$Fare)
table(tb3[is.na(tb3[,2]),1])
##
## 3
## 1
# Mean fare among 3rd-class passengers with a known, non-zero fare
MFD <- tf2 %>% dplyr::select(Pclass,Fare) %>% filter(Pclass == 3, !is.na(Fare), Fare > 0)
mean.C3 <- mean(MFD$Fare)
tf2$Fare[is.na(tf2$Fare)] = mean.C3
sapply(tf2, function(col){100*mean(is.na(col))})["Fare"]
## Fare
## 0
tf2 <- tf2 %>% dplyr::select(-Cabin,-PassengerId,-Name,-Parch,-Ticket)
# Logit
logit <- glm(Survived ~., data = tf, family = binomial)
logit.Prob = predict(logit, tf2, type = "response")
logit.Pred = rep("0", nrow(tf2))
logit.Pred[logit.Prob > .5] = "1"
logit.Predictions <- data.frame(logit.Pred)
# LDA
lda<- lda(Survived ~ ., data = tf)
lda.Pred <- predict(lda, tf2)
lda.Class <- lda.Pred$class
lda.Predictions <- as.data.frame(lda.Class)
# QDA
qda<- qda(Survived ~ ., data = tf)
qda.Pred <- predict(qda, tf2)
qda.Class <- qda.Pred$class
qda.Predictions <- as.data.frame(qda.Class)
# Naïve Bayes
nb <- naiveBayes(Survived ~ ., data = tf)
nb.Pred <- predict(nb, tf2)
nb.Predictions <- as.data.frame(nb.Pred)
# KNN
knnFit <- train(Survived ~ ., data = tf, method = "knn", trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20)
knnPredict <- predict(knnFit,newdata = tf2)
knnPredictions <- as.data.frame(knnPredict)
The logistic model had the highest accuracy under the validation set approach and a relatively high accuracy under K-fold cross-validation. Its AUC was middling but still indicated a good balance of true positive and false positive results. Of all the models, the logistic model scored the most consistently, whereas the others sometimes did well and other times performed poorly.