Import the Titanic Data Set.

# Load the raw Titanic passenger data into a data frame.
# NOTE: the path below is specific to the author's machine and has to be
# adjusted before the script can run anywhere else.
T3 <- read.csv("C:/Users/Ozili Nwokobia/OneDrive/Desktop/PROJECT DATASET.txt")
# View() opens a GUI data viewer, so only call it from an interactive session;
# in a batch or knitted run it adds nothing and may fail without a display.
if (interactive()) {
  View(T3)
}
# Print the column types and a preview of the values.
str(T3)
## 'data.frame':    1309 obs. of  14 variables:
##  $ pclass   : chr  "1st" "1st" "1st" "1st" ...
##  $ survived : int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name     : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ sex      : chr  "female" "male" "female" "male" ...
##  $ age      : num  29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : chr  "24160" "113781" "113781" "113781" ...
##  $ fare     : num  211 152 152 152 152 ...
##  $ cabin    : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked : chr  "S" "S" "S" "S" ...
##  $ boat     : chr  "2" "11" NA NA ...
##  $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: chr  "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...

Build a new data set named titanic using the columns survived, embarked, sex, sibsp, parch, and fare.

# Keep only the outcome (survived) and the predictors used in this analysis.
titanic <- T3[, c("survived", "embarked", "sex", "sibsp", "parch", "fare")]
# View() opens a GUI data viewer, so only call it from an interactive session;
# in a batch or knitted run it adds nothing and may fail without a display.
if (interactive()) {
  View(titanic)
}
# Confirm the structure of the reduced data frame.
str(titanic)
## 'data.frame':    1309 obs. of  6 variables:
##  $ survived: int  1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: chr  "S" "S" "S" "S" ...
##  $ sex     : chr  "female" "male" "female" "male" ...
##  $ sibsp   : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num  211 152 152 152 152 ...

Perform a Statistical Analysis of the titanic data set

# Per-column summary: quartiles, mean and NA count for the numeric columns;
# character columns only report their length and class.
summary(titanic)
##     survived       embarked             sex                sibsp       
##  Min.   :0.000   Length:1309        Length:1309        Min.   :0.0000  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:0.0000  
##  Median :0.000   Mode  :character   Mode  :character   Median :0.0000  
##  Mean   :0.382                                         Mean   :0.4989  
##  3rd Qu.:1.000                                         3rd Qu.:1.0000  
##  Max.   :1.000                                         Max.   :8.0000  
##                                                                        
##      parch            fare        
##  Min.   :0.000   Min.   :  0.000  
##  1st Qu.:0.000   1st Qu.:  7.896  
##  Median :0.000   Median : 14.454  
##  Mean   :0.385   Mean   : 33.295  
##  3rd Qu.:0.000   3rd Qu.: 31.275  
##  Max.   :9.000   Max.   :512.329  
##                  NA's   :1
# Passenger counts by sex.
table(titanic$sex)
## 
## female   male 
##    466    843
# Passenger counts by port of embarkation. table() leaves out NA by default,
# so rows with a missing embarked value are not counted here.
table(titanic$embarked)
## 
##   C   Q   S 
## 270 123 914
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Survival rate and passenger count for each sex.
survival_by_sex <- summarise(
  group_by(titanic, sex),
  Survival_Rate = mean(survived),
  Total = n()
)

# Survival rate and passenger count for each embarkation port.
survival_by_embarked <- summarise(
  group_by(titanic, embarked),
  Survival_Rate = mean(survived),
  Total = n()
)

# Chi-squared test of independence between survival and sex, run on the
# survived-by-sex contingency table.
chisq.test(table(titanic$survived, titanic$sex))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(titanic$survived, titanic$sex)
## X-squared = 363.62, df = 1, p-value < 2.2e-16
# Chi-squared test of independence between survival and port of embarkation.
# table() leaves out rows whose embarked value is NA.
chisq.test(table(titanic$survived, titanic$embarked))
## 
##  Pearson's Chi-squared test
## 
## data:  table(titanic$survived, titanic$embarked)
## X-squared = 44.242, df = 2, p-value = 2.472e-10

Display the overall proportion of passengers who survived, using the survived variable

# Overall survival proportion: `survived` is coded 0/1 in `titanic`, so its
# mean is the share of passengers who survived (any NA values are ignored).
proportion_survived1 <- mean(titanic$survived, na.rm = TRUE)

# Print the proportion
print(proportion_survived1)
## [1] 0.381971
# Share of passengers who survived: the mean of the 0/1 `survived` column,
# computed without any special NA handling.
survival_proportion <- with(titanic, mean(survived))

print(survival_proportion)
## [1] 0.381971

Remove all rows that contain missing values (NAs)

# Drop every row that has a missing value in any column and store the result
# in a separate data frame; `titanic` itself is left unchanged.
# NOTE(review): the later steps visible in this script keep using `titanic`,
# not `titanicNA` -- confirm whether that is intended.
titanicNA<- na.omit(titanic)
# Show the structure of the cleaned copy, including which rows were dropped.
str(titanicNA)
## 'data.frame':    1306 obs. of  6 variables:
##  $ survived: int  1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: chr  "S" "S" "S" "S" ...
##  $ sex     : chr  "female" "male" "female" "male" ...
##  $ sibsp   : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num  211 152 152 152 152 ...
##  - attr(*, "na.action")= 'omit' Named int [1:3] 169 285 1226
##   ..- attr(*, "names")= chr [1:3] "169" "285" "1226"

Convert survived, embarked, and sex to factors

# Recode the outcome and the categorical predictors as factors.
# survived: 0/1 becomes "No"/"Yes".
titanic[["survived"]] <- factor(
  titanic[["survived"]],
  levels = 0:1,
  labels = c("No", "Yes")
)
# embarked: single-letter port codes become full port names.
titanic[["embarked"]] <- factor(
  titanic[["embarked"]],
  levels = c("C", "Q", "S"),
  labels = c("Cherbourg", "Queenstown", "Southampton")
)
# sex: levels are taken from the values present in the column.
titanic[["sex"]] <- as.factor(titanic[["sex"]])

str(titanic)
## 'data.frame':    1309 obs. of  6 variables:
##  $ survived: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 1 2 1 ...
##  $ embarked: Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ sibsp   : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num  211 152 152 152 152 ...

Compute the correlation matrix between survival and the other numeric features

# Add a 0/1 numeric copy of the survival factor (its integer codes are 1/2,
# hence the "- 1") so it can be correlated with the numeric features.
titanic[["survived_numeric"]] <- as.numeric(titanic[["survived"]]) - 1
# Select the numeric columns that go into the correlation matrix.
numeric_data <- titanic[, c("survived_numeric", "fare", "sibsp", "parch")]
# Rows containing any missing value are left out of the computation.
correlation_matrix <- cor(x = numeric_data, use = "complete.obs")
print(correlation_matrix)
##                  survived_numeric      fare       sibsp      parch
## survived_numeric       1.00000000 0.2442655 -0.02812218 0.08241782
## fare                   0.24426547 1.0000000  0.16023826 0.22153866
## sibsp                 -0.02812218 0.1602383  1.00000000 0.37348524
## parch                  0.08241782 0.2215387  0.37348524 1.00000000

Plot survival against the other features to see whether any relationships exist

library(ggplot2)
# Stacked bars scaled to 1: share of survivors within each sex.
ggplot(data = titanic, mapping = aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  ggtitle("Survival by Sex") +
  ylab("Proportion of Total")

# Stacked bars scaled to 1: share of survivors within each embarkation port.
ggplot(data = titanic, mapping = aes(x = embarked, fill = survived)) +
  geom_bar(position = "fill") +
  ggtitle("Survival by Embarkation Point") +
  ylab("Proportion of Total")

# Box plots comparing the fare distribution of survivors and non-survivors.
ggplot(data = titanic, mapping = aes(x = survived, y = fare, fill = survived)) +
  geom_boxplot() +
  ggtitle("Fare Distribution by Survival Status") +
  ylab("Fare")
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

Set the seed to 1000 and split titanic into 80% training and 20% testing data

# Fix the random seed so the random train/test partition is reproducible.
set.seed(1000)
library(caret)
## Loading required package: lattice
# Row indices for the training sample (80% of the rows); createDataPartition()
# samples within the levels of the outcome factor.
split <- createDataPartition(y = titanic$survived, p = 0.8, list = FALSE)

# Rows not drawn for training form the test set; the drawn rows form the
# training set.
testing_data <- titanic[-split, ]
training_data <- titanic[split, ]

Fit a classification tree using the rpart package

library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
# Fit a classification tree for survival on the training data, using sex,
# family-size counts, fare and embarkation port as predictors.
fit <- rpart(
  formula = survived ~ sex + sibsp + parch + fare + embarked,
  data = training_data,
  method = "class"
)
# Draw the fitted tree.
fancyRpartPlot(fit)

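The tree splits on sex, which is consistent with women having been given priority under the slogan “Women and children first”. Age was not included as a predictor, so the tree says nothing about children.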

Prediction

# Predict the survival class ("No"/"Yes") for every row of the test set.
Prediction <- predict(object = fit, newdata = testing_data, type = "class")
print(Prediction)
##    1    7    9   21   30   33   34   35   36   50   52   53   54   60   64   69 
##  Yes  Yes  Yes   No   No  Yes  Yes   No  Yes   No   No   No   No  Yes  Yes   No 
##   86   87   92   93   98   99  101  102  103  110  114  115  116  128  133  137 
##  Yes   No   No  Yes  Yes  Yes   No   No  Yes   No  Yes   No   No  Yes   No   No 
##  151  160  175  176  182  188  195  199  202  211  214  216  228  229  242  244 
##   No  Yes   No   No  Yes  Yes   No  Yes   No   No  Yes   No  Yes   No   No   No 
##  249  252  253  257  271  272  277  280  286  287  288  289  297  298  308  316 
##   No  Yes   No   No  Yes   No   No   No   No  Yes   No  Yes  Yes  Yes   No  Yes 
##  321  322  328  329  335  339  349  350  352  359  365  370  377  379  381  382 
##   No   No   No   No   No   No   No  Yes   No  Yes   No  Yes   No   No  Yes  Yes 
##  388  394  396  401  408  409  418  423  424  431  436  441  451  456  460  461 
##  Yes   No  Yes  Yes  Yes   No   No   No   No  Yes   No  Yes   No   No   No  Yes 
##  466  470  480  486  490  494  501  506  513  516  517  520  530  539  551  558 
##  Yes  Yes  Yes   No  Yes   No   No   No   No   No   No   No  Yes   No  Yes  Yes 
##  568  577  582  586  592  593  597  603  607  616  617  622  638  643  646  647 
##   No   No   No   No  Yes   No   No   No   No   No   No  Yes   No   No   No  Yes 
##  656  659  676  679  694  698  703  705  708  709  710  712  725  729  731  734 
##   No  Yes   No   No   No  Yes  Yes   No   No   No  Yes   No   No   No   No   No 
##  735  738  739  742  745  753  754  759  762  779  811  814  818  827  833  835 
##   No   No  Yes   No   No   No   No   No   No  Yes   No   No   No   No  Yes   No 
##  840  843  844  858  859  862  867  881  882  884  885  893  896  897  900  901 
##   No   No   No   No   No  Yes  Yes   No   No   No   No   No  Yes   No  Yes   No 
##  902  907  912  913  914  917  921  929  931  935  939  940  943  957  959  960 
##  Yes   No   No   No   No  Yes   No  Yes   No  Yes   No  Yes   No  Yes  Yes   No 
##  961  963  964  966  967  969  993 1003 1011 1013 1015 1025 1028 1030 1040 1041 
##   No   No   No   No  Yes   No  Yes  Yes   No  Yes  Yes   No   No   No  Yes  Yes 
## 1043 1062 1072 1073 1074 1084 1095 1096 1098 1099 1105 1108 1110 1117 1121 1139 
##  Yes  Yes  Yes   No   No   No  Yes  Yes   No  Yes   No   No   No   No   No   No 
## 1141 1154 1156 1168 1172 1174 1181 1183 1185 1189 1196 1197 1201 1202 1223 1229 
##   No  Yes   No   No   No  Yes  Yes  Yes   No  Yes   No   No   No   No   No   No 
## 1234 1238 1239 1242 1251 1255 1257 1264 1267 1271 1273 1278 1280 1282 1283 1284 
##   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No   No   No 
## 1288 1290 1292 1294 1308 
##   No   No   No   No   No 
## Levels: No Yes