# Start from a clean slate: remove every visible object from the workspace,
# run the garbage collector, then emit a form feed to clear the console.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 505233 27.0 1121400 59.9 644245 34.5
## Vcells 894352 6.9 8388608 64.0 1635533 12.5
cat("\f")

# Working directory (left disabled; "train.csv" is resolved relative to the
# current directory).
#setwd("/Users/Ryan McNulty/Documents/Data Analysis/DATA/titanic")

# Load the training data. Both empty strings and the literal "NA" are read
# as missing values (this setting was added after the first run of the Q2
# colSums check).
df <- read.csv("train.csv", na.strings = c("", "NA"))
PassengerId is a qualitative nominal variable.
Age is a quantitative ratio variable (it has a true zero, so ratios of ages are meaningful).
# Count the missing (NA) values in each column of df.
colSums(is.na(df))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
Cabin has the most missing observations, with 687 blanks (Age has 177 and Embarked has 2).
Create a function to find the mode of a variable.
# Return the most frequent value (the mode) of a vector.
#
# Args:
#   v:     A vector whose values are to be counted.
#   na.rm: If TRUE, missing values are dropped before counting, so NA is not
#          a candidate for the mode. Defaults to FALSE, which counts NA like
#          any other value (the original behaviour).
#
# Returns:
#   The most frequent value of `v`. Ties are resolved in favour of the value
#   that occurs first in `v`.
getmode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps each element to its position in uniqv, tabulate() counts the
  # occurrences of each position, and which.max() picks the first maximum.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
Replace missing observations: Age with its median, and SibSp and Parch with their mode.
# Fill in missing values column by column: Age receives its median, while
# SibSp and Parch receive their most frequent value from getmode().
df$Age <- replace(df$Age, is.na(df$Age), median(df$Age, na.rm = TRUE))
df$SibSp <- replace(df$SibSp, is.na(df$SibSp), getmode(df$SibSp))
df$Parch <- replace(df$Parch, is.na(df$Parch), getmode(df$Parch))

# Count the missing values per column again after the replacement.
colSums(is.na(df))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
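Of the three variables that were imputed (Age, SibSp and Parch), only Age had missing observations, so only its count of NA values changed (from 177 to 0). Cabin (687) and Embarked (2) were not imputed and still contain missing values.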
Statistical Description of Age Variable
# Summary statistics for the Age column, using describe() from psych.
library(psych)
describe(df[["Age"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 891 29.36 13.02 28 28.83 8.9 0.42 80 79.58 0.51 0.97 0.44
Statistical Description of SibSp Variable
# Summary statistics for the SibSp column, using describe() from psych.
library(psych)
describe(df[["SibSp"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 891 0.52 1.1 0 0.27 0 0 8 8 3.68 17.73 0.04
Statistical Description of Parch Variable
# Summary statistics for the Parch column, using describe() from psych.
library(psych)
describe(df[["Parch"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 891 0.38 0.81 0 0.18 0 0 6 6 2.74 9.69 0.03
# Cross-tabulate Survived (rows) against Sex (columns).
table(df[["Survived"]], df[["Sex"]])
##
## female male
## 0 81 468
## 1 233 109
Males who did not survive form the largest group in this table (468 passengers).
# Notched, horizontal box plots of Age, one box per value of Survived.
boxplot(
  df$Age ~ df$Survived,
  notch = TRUE,
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned,
  # whereas TRUE is a reserved word.
  horizontal = TRUE
)
The median ages of surviving and non-surviving passengers are the same or very close. However, the passengers who did not survive tended to be slightly older.