Question 1
Load the Titanic passenger list excel file into R, convert the
dataset into a data frame, and convert all non-numeric columns of data
into factors.
# Data file loaded and changed to a data frame.
# read_excel() is namespace-qualified so this step does not rely on a
# library(readxl) call made somewhere else in the script.
titanic_path <- "G:/Other computers/My Laptop/Documents/Richard 621/Week 9/Titanic passenger list.xlsx"
Titanic <- readxl::read_excel(titanic_path)
# Convert the object returned by read_excel() into a plain data.frame
Titanic <- as.data.frame(Titanic)
# Quick overview of every column (also reveals NA counts for numeric columns)
summary(Titanic)
## Passenger_Class Survived Sex Sibling_or_Spouse
## Min. :1.000 Length:1309 Length:1309 Min. :0.0000
## 1st Qu.:2.000 Class :character Class :character 1st Qu.:0.0000
## Median :3.000 Mode :character Mode :character Median :0.0000
## Mean :2.295 Mean :0.4989
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :8.0000
##
## Parent_or_Child Fare Embarked
## Min. :0.000 Min. : 0.000 Length:1309
## 1st Qu.:0.000 1st Qu.: 7.896 Class :character
## Median :0.000 Median : 14.454 Mode :character
## Mean :0.385 Mean : 33.295
## 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :9.000 Max. :512.329
## NA's :1
# Inspect the column types before any conversion to factors
str(object = Titanic)
## 'data.frame': 1309 obs. of 7 variables:
## $ Passenger_Class : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Survived : chr "Lived" "Lived" "Died" "Died" ...
## $ Sex : chr "female" "male" "female" "male" ...
## $ Sibling_or_Spouse: num 0 1 1 1 1 0 1 0 2 0 ...
## $ Parent_or_Child : num 0 2 2 2 2 0 0 0 0 0 ...
## $ Fare : num 211 152 152 152 152 ...
## $ Embarked : chr "Southhampton" "Southhampton" "Southhampton" "Southhampton" ...
# Changed character variables to factor variables.
# Columns are selected by type rather than by name, so every character
# column is converted, not only Survived, Sex and Embarked.
is_chr_col <- vapply(Titanic, is.character, logical(1))
Titanic[is_chr_col] <- lapply(Titanic[is_chr_col], as.factor)
# Confirm the new column types
str(Titanic)
## 'data.frame': 1309 obs. of 7 variables:
## $ Passenger_Class : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Survived : Factor w/ 2 levels "Died","Lived": 2 2 1 1 1 2 2 1 2 1 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ Sibling_or_Spouse: num 0 1 1 1 1 0 1 0 2 0 ...
## $ Parent_or_Child : num 0 2 2 2 2 0 0 0 0 0 ...
## $ Fare : num 211 152 152 152 152 ...
## $ Embarked : Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
Question 2
Check for missing values in the dataset
# Fare has one missing value and Embarked has two missing values
# Total number of missing cells in the whole data frame
sum(is.na(Titanic))
## [1] 3
# Missing values per column. vapply() declares the result type (one integer
# per column), unlike sapply(), whose return type depends on its input.
vapply(Titanic, function(column) sum(is.na(column)), integer(1))
## Passenger_Class Survived Sex Sibling_or_Spouse
## 0 0 0 0
## Parent_or_Child Fare Embarked
## 0 1 2
a) Create a table and/or graph displaying the number of passengers
who embarked from each location (i.e., the number of passengers who
embarked from Cherbourg, the number who embarked from Queenstown, etc.).
From which location did the largest number of passengers embark? Impute
all missing values in the Embarked column with this location.
# Bar plot of the number of passengers who embarked from each location
embarked_counts <- table(Titanic$Embarked)
barplot(
  embarked_counts,
  col = "green",
  ylab = "Number of Passengers",
  xlab = "Embarked Location"
)

# Southhampton (as spelled in the data) has the largest number of passengers, at over 800, so the code below imputes the missing Embarked values with Southhampton
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Impute missing Embarked values with the most frequent embarkation location.
# The level is looked up from the data (table() leaves NA out of the counts by
# default) instead of being typed as a string literal, so it always matches an
# existing level exactly.
most_common_port <- names(which.max(table(Titanic$Embarked)))
Titanic$Embarked <- impute(Titanic$Embarked, most_common_port)
b) For any missing value(s) in a numeric column(s) of data, impute
the average of the column.
# Fare had a missing value, so it is imputed with the mean of the observed fares.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Titanic$Fare[is.na(Titanic$Fare)] <- mean(Titanic$Fare, na.rm = TRUE)
sum(is.na(Titanic)) # no more missing values in any column
## [1] 0
Question 4
Using 300 bootstrapped sets, develop a bagging model on your
training set to predict whether or not a passenger survived the sinking
of the Titanic.
#install.packages('ipred')
library(ipred)
## Warning: package 'ipred' was built under R version 4.2.1
# Bagging model with 300 bootstrap replications predicting Survived from all
# other columns of the training set.
# Bootstrap resampling is random; fixing the seed makes this fit repeatable
# from one run of the script to the next.
set.seed(123)
Titanic_bag <- bagging(
  formula = Survived ~ .,
  data = Titanic_train,
  nbagg = 300
)
Titanic_bag
##
## Bagging classification trees with 300 bootstrap replications
##
## Call: bagging.data.frame(formula = Survived ~ ., data = Titanic_train,
## nbagg = 300)
a) What is the out-of-bag error for your model?
# Same bagging model, refitted with coob = TRUE so that the out-of-bag
# misclassification error is computed and printed with the model.
# The seed is fixed so the reported out-of-bag error is repeatable.
set.seed(123)
Titanic_bag_oob <- bagging(
  formula = Survived ~ .,
  data = Titanic_train,
  coob = TRUE,
  nbagg = 300
)
Titanic_bag_oob
##
## Bagging classification trees with 300 bootstrap replications
##
## Call: bagging.data.frame(formula = Survived ~ ., data = Titanic_train,
## coob = T, nbagg = 300)
##
## Out-of-bag estimate of misclassification error: 0.208
# Out-of-bag estimate of misclassification error was 0.2037 when first run (0.208 in the run shown above)
b) Use your model to make predictions for the observations in your
testing set.
# Predicted class for each passenger in the testing set from the bagging model
Titanic_bag_pred <- predict(
  Titanic_bag,
  newdata = Titanic_test,
  type = "class"
)
# Confusion matrix: rows are the observed outcome, columns the prediction
table(
  Titanic_test$Survived,
  Titanic_bag_pred,
  dnn = c("Truth", "Predicted")
)
## Predicted
## Truth Died Lived
## Died 70 8
## Lived 18 35
# below table obtained when first sampled
#Predicted
#Truth Died Lived
#Died 76 15
#Lived 14 26
Question 5
Using 300 trees, develop a random forest model on your training set
to predict whether or not a passenger survived the sinking of the
Titanic.
#300 trees used to make random forest model
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest with 300 trees predicting Survived from all other columns of
# the training set; importance = TRUE also stores variable importance measures.
# Tree construction is random; fixing the seed makes the OOB error and the
# confusion matrix repeatable from one run of the script to the next.
set.seed(123)
Titanic_rf <- randomForest(
  Survived ~ .,
  data = Titanic_train,
  importance = TRUE,
  ntree = 300
)
Titanic_rf
##
## Call:
## randomForest(formula = Survived ~ ., data = Titanic_train, importance = TRUE, ntree = 300)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 19.44%
## Confusion matrix:
## Died Lived class.error
## Died 671 60 0.08207934
## Lived 169 278 0.37807606
#when first ran below was what was returned
#Call:
#randomForest(formula = Survived ~ ., data = Titanic_train, importance = TRUE, ntree = 300)
#Type of random forest: classification
#Number of trees: 300
#No. of variables tried at each split: 2
#OOB estimate of error rate: 19.61%
#Confusion matrix:
#Died Lived class.error
#Died 669 66 0.08979592
#Lived 165 278 0.37246050
a) What is the out-of-bag error for your model?
OOB estimate of error rate: 19.61% (value from the first run; the run shown above gives 19.44%)
c) What is the false positive rate (for your random forest model
developed with 300 trees)?
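False positive rate = 0.08979592 (the class.error for "Died" in the first run: 66 of the 735 passengers who died were predicted "Lived"; the run shown above gives 0.08207934)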
d) What is the false negative rate (for your random forest model
developed with 300 trees)?
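False negative rate = 0.37246050 (the class.error for "Lived" in the first run: 165 of the 443 passengers who lived were predicted "Died"; the run shown above gives 0.37807606)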
e) Create a plot comparing the out-of-bag error, the false positive
rate, the false negative rate, and the number of trees. Based on your
plot, does it appear that a large number of trees are needed to develop
a fairly accurate random forest model?
# No, fewer than 50 trees would be fine, as the error rates change very little beyond about 35 trees
# Error rates against the number of trees: one line for the OOB error and one
# per class (Died, Lived). Treating "Lived" as the positive class, the Died
# class error is the false positive rate and the Lived class error is the
# false negative rate.
plot(Titanic_rf, lwd = 2)
# The legend uses palette indices 1:3 and line types 1:3, the defaults the
# plot cycles through, so every legend entry matches its line exactly. The
# named colours "red" and "green" are not the same as palette colours 2 and 3.
legend(
  "right",
  legend = c("OOB Error", "FPR", "FNR"),
  lwd = 2,
  lty = 1:3,
  col = 1:3
)

f) Use your random forest model to make predictions for the
observations in your testing set.
# Predicted class for each passenger in the testing set from the random forest
Titanic_rf_pred <- predict(
  Titanic_rf,
  newdata = Titanic_test,
  type = "class"
)
# Confusion matrix: rows are the observed outcome, columns the prediction
table(
  Titanic_test$Survived,
  Titanic_rf_pred,
  dnn = c("Truth", "Predicted")
)
## Predicted
## Truth Died Lived
## Died 71 7
## Lived 20 33
#Results obtained when first ran
#Predicted
#Truth Died Lived
#Died 79 12
#Lived 16 24