Import the Titanic Data Set.
# Load the raw Titanic passenger data into the data frame T3.
# NOTE(review): the path is an absolute, machine-specific location; the script
# only runs as-is on a machine where this exact file exists.
T3 <- read.csv("C:/Users/Ozili Nwokobia/OneDrive/Desktop/PROJECT DATASET.txt")
# View() opens a GUI data viewer, so only call it when a user is present;
# unguarded, it can fail when the script is run non-interactively or headless.
if (interactive()) {
  View(T3)
}
# Print the column names, types and leading values of the loaded data.
str(T3)
## 'data.frame': 1309 obs. of 14 variables:
## $ pclass : chr "1st" "1st" "1st" "1st" ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : chr "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
## $ sex : chr "female" "male" "female" "male" ...
## $ age : num 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : chr "24160" "113781" "113781" "113781" ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : chr "B5" "C22 C26" "C22 C26" "C22 C26" ...
## $ embarked : chr "S" "S" "S" "S" ...
## $ boat : chr "2" "11" NA NA ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: chr "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
Building a new Data set named titanic, using the columns survived, embarked, sex, sibsp, parch & fare.
# Keep only the outcome (survived) and the candidate predictors used by the
# rest of the analysis.
titanic <- T3[, c("survived", "embarked", "sex", "sibsp", "parch", "fare")]
# View() opens a GUI data viewer, so only call it when a user is present;
# unguarded, it can fail when the script is run non-interactively or headless.
if (interactive()) {
  View(titanic)
}
# Print the structure of the reduced data frame.
str(titanic)
## 'data.frame': 1309 obs. of 6 variables:
## $ survived: int 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: chr "S" "S" "S" "S" ...
## $ sex : chr "female" "male" "female" "male" ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num 211 152 152 152 152 ...
Perform a Statistical Analysis of the titanic data set
# Per-column summary of titanic: quartiles and mean for the numeric columns,
# length/class/mode for the character columns, plus NA counts where present.
summary(titanic)
## survived embarked sex sibsp
## Min. :0.000 Length:1309 Length:1309 Min. :0.0000
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:0.0000
## Median :0.000 Mode :character Mode :character Median :0.0000
## Mean :0.382 Mean :0.4989
## 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.000 Max. :8.0000
##
## parch fare
## Min. :0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 7.896
## Median :0.000 Median : 14.454
## Mean :0.385 Mean : 33.295
## 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :9.000 Max. :512.329
## NA's :1
# Passenger counts for each value of sex.
table(titanic$sex)
##
## female male
## 466 843
# Passenger counts for each embarkation code (C, Q, S).
table(titanic$embarked)
##
## C Q S
## 270 123 914
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Survival rate (mean of the 0/1 survived column) and passenger count,
# computed separately for each sex.
survival_by_sex <- summarise(
  group_by(titanic, sex),
  Survival_Rate = mean(survived),
  Total = n()
)

# Same two statistics, computed separately for each embarkation code.
survival_by_embarked <- summarise(
  group_by(titanic, embarked),
  Survival_Rate = mean(survived),
  Total = n()
)
# Chi-squared test of independence between survival and sex, run on the
# survived-by-sex contingency table.
chisq.test(table(titanic$survived, titanic$sex))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(titanic$survived, titanic$sex)
## X-squared = 363.62, df = 1, p-value < 2.2e-16
# Chi-squared test of independence between survival and embarkation code, run
# on the survived-by-embarked contingency table.
chisq.test(table(titanic$survived, titanic$embarked))
##
## Pearson's Chi-squared test
##
## data: table(titanic$survived, titanic$embarked)
## X-squared = 44.242, df = 2, p-value = 2.472e-10
Displaying the overall proportion of passengers who survived, using the survived variable
# `survived` is coded 0/1 in `titanic`, so its mean is the share of passengers
# who survived; any missing values are ignored.
proportion_survived1 <- mean(titanic[["survived"]], na.rm = TRUE)
# Show the computed proportion.
print(proportion_survived1)
## [1] 0.381971
# The same proportion computed without NA removal, stored under a second name
# and printed.
survival_proportion <- mean(titanic[["survived"]])
print(survival_proportion)
## [1] 0.381971
Removing all rows that contain NA values
# Copy of titanic with every row that has at least one NA removed.
# NOTE(review): the later steps visible in this file keep using `titanic`,
# not `titanicNA` -- confirm which data set is intended downstream.
titanicNA<- na.omit(titanic)
# Print the structure of the NA-free copy, including the dropped row indices.
str(titanicNA)
## 'data.frame': 1306 obs. of 6 variables:
## $ survived: int 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: chr "S" "S" "S" "S" ...
## $ sex : chr "female" "male" "female" "male" ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num 211 152 152 152 152 ...
## - attr(*, "na.action")= 'omit' Named int [1:3] 169 285 1226
## ..- attr(*, "names")= chr [1:3] "169" "285" "1226"
Convert survived, embarked and sex to factors
# Recode the 0/1 outcome as a two-level factor labelled "No"/"Yes".
titanic[["survived"]] <- factor(
  titanic[["survived"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
# Replace the one-letter embarkation codes with the full port names.
titanic[["embarked"]] <- factor(
  titanic[["embarked"]],
  levels = c("C", "Q", "S"),
  labels = c("Cherbourg", "Queenstown", "Southampton")
)
# Sex becomes a factor with its existing values as levels.
titanic[["sex"]] <- factor(titanic[["sex"]])
# Confirm the new column types.
str(titanic)
## 'data.frame': 1309 obs. of 6 variables:
## $ survived: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 1 2 1 ...
## $ embarked: Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num 211 152 152 152 152 ...
Find the correlation Matrix between survival and the other features
# cor() needs numeric input: as.numeric() on the survived factor yields its
# level codes (1 = "No", 2 = "Yes"), so subtracting 1 restores a 0/1 coding.
titanic[["survived_numeric"]] <- as.numeric(titanic[["survived"]]) - 1
# Collect the numeric outcome together with the numeric predictors.
numeric_data <- titanic[, c("survived_numeric", "fare", "sibsp", "parch")]
# Pairwise correlations, computed only on rows with no missing values.
correlation_matrix <- cor(x = numeric_data, use = "complete.obs")
print(correlation_matrix)
## survived_numeric fare sibsp parch
## survived_numeric 1.00000000 0.2442655 -0.02812218 0.08241782
## fare 0.24426547 1.0000000 0.16023826 0.22153866
## sibsp -0.02812218 0.1602383 1.00000000 0.37348524
## parch 0.08241782 0.2215387 0.37348524 1.00000000
Plot survival with other features to see if any correlations exist
library(ggplot2)
# Bars filled to equal height: share of survivors vs. non-survivors per sex.
ggplot(data = titanic, mapping = aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  labs(
    title = "Survival by Sex",
    y = "Proportion of Total"
  )

# Same proportional bar chart, split by port of embarkation.
ggplot(data = titanic, mapping = aes(x = embarked, fill = survived)) +
  geom_bar(position = "fill") +
  labs(
    title = "Survival by Embarkation Point",
    y = "Proportion of Total"
  )

# Box plots comparing the fare distribution of the two survival groups.
ggplot(data = titanic, mapping = aes(x = survived, y = fare, fill = survived)) +
  geom_boxplot() +
  labs(
    title = "Fare Distribution by Survival Status",
    y = "Fare"
  )
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
Set the seed to 1000 and split titanic into 80% training data and 20%
testing data
# Fix the random seed before the partition is drawn so the split can be
# reproduced; keep this call ahead of createDataPartition().
set.seed(1000)
library(caret)
## Loading required package: lattice
# Draw the training-row indices from the survived column with p = 0.8;
# list = FALSE so the result can be used directly as a row index below.
split <- createDataPartition(titanic$survived, p = 0.8, list = FALSE)
# Rows selected by the partition form the training set ...
training_data <- titanic[split, ]
# ... and all remaining rows form the testing set.
testing_data <- titanic[-split, ]
Using Rpart package
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
# Fit a classification tree for survived on the training rows, using sex,
# sibsp, parch, fare and embarked as predictors.
fit <- rpart(
  formula = survived ~ sex + sibsp + parch + fare + embarked,
  data = training_data,
  method = "class"
)
# Draw the fitted tree.
fancyRpartPlot(fit)
The tree splits on sex, which is consistent with the slogan “Women and children
first” (age is not a predictor in this model, so the tree only speaks to the “women” part).
Prediction
# Predicted survival class ("No"/"Yes") for every row of the testing set,
# taken from the fitted tree.
Prediction <- predict(object = fit, newdata = testing_data, type = "class")
# Print the predictions, labelled by the original row names.
print(Prediction)
## 1 7 9 21 30 33 34 35 36 50 52 53 54 60 64 69
## Yes Yes Yes No No Yes Yes No Yes No No No No Yes Yes No
## 86 87 92 93 98 99 101 102 103 110 114 115 116 128 133 137
## Yes No No Yes Yes Yes No No Yes No Yes No No Yes No No
## 151 160 175 176 182 188 195 199 202 211 214 216 228 229 242 244
## No Yes No No Yes Yes No Yes No No Yes No Yes No No No
## 249 252 253 257 271 272 277 280 286 287 288 289 297 298 308 316
## No Yes No No Yes No No No No Yes No Yes Yes Yes No Yes
## 321 322 328 329 335 339 349 350 352 359 365 370 377 379 381 382
## No No No No No No No Yes No Yes No Yes No No Yes Yes
## 388 394 396 401 408 409 418 423 424 431 436 441 451 456 460 461
## Yes No Yes Yes Yes No No No No Yes No Yes No No No Yes
## 466 470 480 486 490 494 501 506 513 516 517 520 530 539 551 558
## Yes Yes Yes No Yes No No No No No No No Yes No Yes Yes
## 568 577 582 586 592 593 597 603 607 616 617 622 638 643 646 647
## No No No No Yes No No No No No No Yes No No No Yes
## 656 659 676 679 694 698 703 705 708 709 710 712 725 729 731 734
## No Yes No No No Yes Yes No No No Yes No No No No No
## 735 738 739 742 745 753 754 759 762 779 811 814 818 827 833 835
## No No Yes No No No No No No Yes No No No No Yes No
## 840 843 844 858 859 862 867 881 882 884 885 893 896 897 900 901
## No No No No No Yes Yes No No No No No Yes No Yes No
## 902 907 912 913 914 917 921 929 931 935 939 940 943 957 959 960
## Yes No No No No Yes No Yes No Yes No Yes No Yes Yes No
## 961 963 964 966 967 969 993 1003 1011 1013 1015 1025 1028 1030 1040 1041
## No No No No Yes No Yes Yes No Yes Yes No No No Yes Yes
## 1043 1062 1072 1073 1074 1084 1095 1096 1098 1099 1105 1108 1110 1117 1121 1139
## Yes Yes Yes No No No Yes Yes No Yes No No No No No No
## 1141 1154 1156 1168 1172 1174 1181 1183 1185 1189 1196 1197 1201 1202 1223 1229
## No Yes No No No Yes Yes Yes No Yes No No No No No No
## 1234 1238 1239 1242 1251 1255 1257 1264 1267 1271 1273 1278 1280 1282 1283 1284
## No No No No No No No No No No No No Yes No No No
## 1288 1290 1292 1294 1308
## No No No No No
## Levels: No Yes