Titanic Analysis

library(readr)
At238b <- read_csv("https://goo.gl/At238b")

## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

View(At238b)

#load the packages
library(mlbench)

## Warning: package 'mlbench' was built under R version 4.3.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.92 loaded

library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: ggplot2
## Loading required package: lattice

library(rpart)
library(rattle)

## Warning: package 'rattle' was built under R version 4.3.3

## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.3.3

library(RColorBrewer)

T3 <- At238b
titanic <- T3[, c("survived", "embarked", "age", "sex", "sibsp", "parch", "fare")]
View(titanic)

#Statistical Analysis
#convert embarked and sex column to number in order to perform some statistical analysis
titanic$embarked <- as.numeric(factor(titanic$embarked, levels = c("S", "C")))
titanic$sex <- as.numeric(factor(titanic$sex, levels = c("male", "female")))
str(titanic)

## tibble [1,309 × 7] (S3: tbl_df/tbl/data.frame)
##  $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: num [1:1309] 1 1 1 1 1 1 1 1 1 2 ...
##  $ age     : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sex     : num [1:1309] 2 1 2 1 2 1 2 1 2 1 ...
##  $ sibsp   : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:1309] 211 152 152 152 152 ...

#summarize the dataset
summary(titanic)

##     survived        embarked          age             sex       
##  Min.   :0.000   Min.   :1.000   Min.   : 0.17   Min.   :1.000  
##  1st Qu.:0.000   1st Qu.:1.000   1st Qu.:21.00   1st Qu.:1.000  
##  Median :0.000   Median :1.000   Median :28.00   Median :1.000  
##  Mean   :0.382   Mean   :1.228   Mean   :29.88   Mean   :1.356  
##  3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:39.00   3rd Qu.:2.000  
##  Max.   :1.000   Max.   :2.000   Max.   :80.00   Max.   :2.000  
##                  NA's   :125     NA's   :263                    
##      sibsp            parch            fare        
##  Min.   :0.0000   Min.   :0.000   Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385   Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000   Max.   :512.329  
##                                   NA's   :1

#display first 20 rows of data
head(titanic, n=20)

## # A tibble: 20 × 7
##    survived embarked   age   sex sibsp parch  fare
##       <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1        1        1 29        2     0     0 211. 
##  2        1        1  0.92     1     1     2 152. 
##  3        0        1  2        2     1     2 152. 
##  4        0        1 30        1     1     2 152. 
##  5        0        1 25        2     1     2 152. 
##  6        1        1 48        1     0     0  26.6
##  7        1        1 63        2     1     0  78.0
##  8        0        1 39        1     0     0   0  
##  9        1        1 53        2     2     0  51.5
## 10        0        2 71        1     0     0  49.5
## 11        0        2 47        1     1     0 228. 
## 12        1        2 18        2     1     0 228. 
## 13        1        2 24        2     0     0  69.3
## 14        1        1 26        2     0     0  78.8
## 15        1        1 80        1     0     0  30  
## 16        0        1 NA        1     0     0  25.9
## 17        0        2 24        1     0     1 248. 
## 18        1        2 50        2     0     1 248. 
## 19        1        2 32        2     0     0  76.3
## 20        0        2 36        1     0     0  75.2

#display the dimensions of the dataset
dim(titanic)

## [1] 1309    7

#list types for each attribute
sapply(titanic, class)

##  survived  embarked       age       sex     sibsp     parch      fare 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"

# distribution of class variable
y <- titanic$survived
cbind(freq=table(y), percentage=prop.table(table(y))*100)

##   freq percentage
## 0  809    61.8029
## 1  500    38.1971

#calculate standard deviation for all attributes
sapply(titanic[,1:7], sd)

##  survived  embarked       age       sex     sibsp     parch      fare 
## 0.4860552        NA        NA 0.4789973 1.0416584 0.8655603        NA

#calculate skewness for each variable
skew <- apply(titanic[,1:7], 2, skewness)
# display skewness, larger/smaller deviations from 0 show more skew
print(skew)

##  survived  embarked       age       sex     sibsp     parch      fare 
## 0.4852900        NA        NA 0.6008094 3.8354145 3.6606736        NA

#calculate proportion of survived passengers
survived_proportion <- mean(titanic$survived)
#print the proportion
print(survived_proportion)

## [1] 0.381971

#removing null values
any(is.na(titanic))

## [1] TRUE

titanic<-na.omit(titanic)
str(titanic)

## tibble [993 × 7] (S3: tbl_df/tbl/data.frame)
##  $ survived: num [1:993] 1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: num [1:993] 1 1 1 1 1 1 1 1 1 2 ...
##  $ age     : num [1:993] 29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sex     : num [1:993] 2 1 2 1 2 1 2 1 2 1 ...
##  $ sibsp   : num [1:993] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:993] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:993] 211 152 152 152 152 ...
##  - attr(*, "na.action")= 'omit' Named int [1:316] 16 38 41 47 60 70 71 75 81 107 ...
##   ..- attr(*, "names")= chr [1:316] "16" "38" "41" "47" ...

View(titanic)

#calculate correlation matrix
correlation_matrix <- cor(titanic)
#extract correlation of "survived" with other variables
survived_correlation <- correlation_matrix["survived", ]
#print correlation of "survived" with other variables
print(survived_correlation)

##     survived     embarked          age          sex        sibsp        parch 
##  1.000000000  0.219669225 -0.053274961  0.546984591 -0.001033379  0.124687212 
##         fare 
##  0.246971525

#display correlation plot
corrplot(correlation_matrix, method="circle")

#convert survived, embarked, and sex columns to factors
titanic$survived <- factor(titanic$survived)
titanic$embarked <- factor(titanic$embarked)
titanic$sex <- factor(titanic$sex)
#check the structure of the dataset to confirm the changes
str(titanic)

## tibble [993 × 7] (S3: tbl_df/tbl/data.frame)
##  $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
##  $ embarked: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 2 ...
##  $ age     : num [1:993] 29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sex     : Factor w/ 2 levels "1","2": 2 1 2 1 2 1 2 1 2 1 ...
##  $ sibsp   : num [1:993] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:993] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:993] 211 152 152 152 152 ...
##  - attr(*, "na.action")= 'omit' Named int [1:316] 16 38 41 47 60 70 71 75 81 107 ...
##   ..- attr(*, "names")= chr [1:316] "16" "38" "41" "47" ...

#set seed for reproducibility
set.seed(1000)
#split the dataset into training and testing based on indices
train_indices <- 1:1046
test_indices <- 1047:1309
#create training and testing datasets
training_data <- titanic[train_indices, ]
testing_data <- titanic[test_indices, ]

#train your learner on the training dataset and save your model in a variable  fit
fit <- rpart(survived ~ sex + age + sibsp + parch + fare + embarked,
             data=training_data, method="class")
plot(fit)

#save the plot as a PNG file
png("regression_tree.png")
plot(fit)
dev.off()

## png 
##   2

#obtain a more readable plot
fancyRpartPlot(fit)

#examine the tree
summary(fit)

## Call:
## rpart(formula = survived ~ sex + age + sibsp + parch + fare + 
##     embarked, data = training_data, method = "class")
##   n=993 (53 observations deleted due to missingness)
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.47572816      0 1.0000000 1.0000000 0.03768468
## 2 0.02669903      1 0.5242718 0.5242718 0.03155481
## 3 0.02427184      2 0.4975728 0.5218447 0.03150193
## 4 0.01000000      3 0.4733010 0.4830097 0.03061716
## 
## Variable importance
##   sex   age parch  fare sibsp 
##    72     8     8     7     6 
## 
## Node number 1: 993 observations,    complexity param=0.4757282
##   predicted class=0  expected loss=0.4149043  P(node) =1
##     class counts:   581   412
##    probabilities: 0.585 0.415 
##   left son=2 (631 obs) right son=3 (362 obs)
##   Primary splits:
##       sex      splits as  LR,           improve=144.246200, (0 missing)
##       fare     < 15.1729  to the left,  improve= 39.594810, (0 missing)
##       embarked splits as  LR,           improve= 23.264440, (0 missing)
##       parch    < 0.5      to the left,  improve= 20.579540, (0 missing)
##       age      < 8.5      to the right, improve=  9.988721, (0 missing)
##   Surrogate splits:
##       parch < 0.5      to the left,  agree=0.674, adj=0.105, (0 split)
##       fare  < 77.6229  to the left,  agree=0.665, adj=0.080, (0 split)
##       age   < 5.5      to the right, agree=0.637, adj=0.006, (0 split)
## 
## Node number 2: 631 observations,    complexity param=0.02669903
##   predicted class=0  expected loss=0.2107765  P(node) =0.6354481
##     class counts:   498   133
##    probabilities: 0.789 0.211 
##   left son=4 (592 obs) right son=5 (39 obs)
##   Primary splits:
##       age      < 9.5      to the right, improve=15.390130, (0 missing)
##       fare     < 26.26875 to the left,  improve= 6.785901, (0 missing)
##       embarked splits as  LR,           improve= 6.167414, (0 missing)
##       parch    < 0.5      to the left,  improve= 5.107995, (0 missing)
##       sibsp    < 0.5      to the left,  improve= 0.824624, (0 missing)
##   Surrogate splits:
##       sibsp < 3.5      to the left,  agree=0.943, adj=0.077, (0 split)
## 
## Node number 3: 362 observations
##   predicted class=1  expected loss=0.2292818  P(node) =0.3645519
##     class counts:    83   279
##    probabilities: 0.229 0.771 
## 
## Node number 4: 592 observations
##   predicted class=0  expected loss=0.1824324  P(node) =0.5961732
##     class counts:   484   108
##    probabilities: 0.818 0.182 
## 
## Node number 5: 39 observations,    complexity param=0.02427184
##   predicted class=1  expected loss=0.3589744  P(node) =0.03927492
##     class counts:    14    25
##    probabilities: 0.359 0.641 
##   left son=10 (12 obs) right son=11 (27 obs)
##   Primary splits:
##       sibsp < 2.5      to the right, improve=10.7820500, (0 missing)
##       fare  < 20.825   to the right, improve= 1.5954570, (0 missing)
##       age   < 3.5      to the right, improve= 0.4884005, (0 missing)
##       parch < 1.5      to the right, improve= 0.3345875, (0 missing)
##   Surrogate splits:
##       fare < 26.95    to the right, agree=0.744, adj=0.167, (0 split)
## 
## Node number 10: 12 observations
##   predicted class=0  expected loss=0.08333333  P(node) =0.01208459
##     class counts:    11     1
##    probabilities: 0.917 0.083 
## 
## Node number 11: 27 observations
##   predicted class=1  expected loss=0.1111111  P(node) =0.02719033
##     class counts:     3    24
##    probabilities: 0.111 0.889

#What is the most “important” feature over which the tree first split?
      #sex
#Do you agree with the Titanic slogan “ Women and children first!”
      #yes, with sex being the most important feature and age being the next most 
      #important, it is suggested that gender and age play a vital role in survival

#make predictions on the test dataset
Prediction <- predict(fit, testing_data, type = "class")

#create a data frame with PassengerSex and Survived columns
Results <- data.frame(PassengerSex = testing_data$sex,
                      Survived = Prediction)

#save the data frame as a CSV file
write.csv(Results, file = "Titanicdtree.csv", row.names = FALSE)

Titanic Analysis

Destinee Redfearn

2024-04-12