library(readr)
# Fetch the passenger CSV behind the short link and parse it with readr.
At238b <- readr::read_csv(file = "https://goo.gl/At238b")
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the raw table in the data viewer only when someone is at the console.
# View() needs a GUI, so an unguarded call can abort a batch or headless run.
if (interactive()) {
  View(At238b)
}
#load the packages
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(RColorBrewer)
# Keep the outcome column (survived) plus the six predictors that the tree
# formula further down uses; every other column of the raw table is dropped.
T3 <- At238b
titanic <- T3[, c(
  "survived", "embarked", "age", "sex", "sibsp", "parch", "fare"
)]
# View() needs a GUI, so only open the viewer in an interactive session;
# an unguarded call can abort a batch or headless run.
if (interactive()) {
  View(titanic)
}
#Statistical Analysis
# Encode embarked and sex as numeric codes so that sd(), skewness() and cor()
# can be applied to every column.
#
# factor() turns any value that is not listed in `levels` into NA. With only
# "S" and "C" listed, every passenger who embarked at the third port ("Q")
# became NA here and was then discarded by na.omit() further down, so "Q" has
# to be part of the level vector.
# Resulting codes: embarked S = 1, C = 2, Q = 3; sex male = 1, female = 2.
# NOTE: the console output recorded in this file was captured before "Q" was
# added; NA counts and everything derived from them will differ on a re-run.
titanic$embarked <- as.numeric(
  factor(titanic$embarked, levels = c("S", "C", "Q"))
)
titanic$sex <- as.numeric(factor(titanic$sex, levels = c("male", "female")))
str(titanic)
## tibble [1,309 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: num [1:1309] 1 1 1 1 1 1 1 1 1 2 ...
## $ age : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sex : num [1:1309] 2 1 2 1 2 1 2 1 2 1 ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
# Per-column summary: quartiles, mean, and NA counts where present
summary(object = titanic)
## survived embarked age sex
## Min. :0.000 Min. :1.000 Min. : 0.17 Min. :1.000
## 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:21.00 1st Qu.:1.000
## Median :0.000 Median :1.000 Median :28.00 Median :1.000
## Mean :0.382 Mean :1.228 Mean :29.88 Mean :1.356
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:39.00 3rd Qu.:2.000
## Max. :1.000 Max. :2.000 Max. :80.00 Max. :2.000
## NA's :125 NA's :263
## sibsp parch fare
## Min. :0.0000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
# Preview the first 20 rows of the table
utils::head(titanic, n = 20L)
## # A tibble: 20 × 7
## survived embarked age sex sibsp parch fare
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 29 2 0 0 211.
## 2 1 1 0.92 1 1 2 152.
## 3 0 1 2 2 1 2 152.
## 4 0 1 30 1 1 2 152.
## 5 0 1 25 2 1 2 152.
## 6 1 1 48 1 0 0 26.6
## 7 1 1 63 2 1 0 78.0
## 8 0 1 39 1 0 0 0
## 9 1 1 53 2 2 0 51.5
## 10 0 2 71 1 0 0 49.5
## 11 0 2 47 1 1 0 228.
## 12 1 2 18 2 1 0 228.
## 13 1 2 24 2 0 0 69.3
## 14 1 1 26 2 0 0 78.8
## 15 1 1 80 1 0 0 30
## 16 0 1 NA 1 0 0 25.9
## 17 0 2 24 1 0 1 248.
## 18 1 2 50 2 0 1 248.
## 19 1 2 32 2 0 0 76.3
## 20 0 2 36 1 0 0 75.2
# Number of rows, then number of columns
c(nrow(titanic), ncol(titanic))
## [1] 1309 7
# List the (primary) class of each column.
# vapply() pins the result to one string per column, whereas sapply() would
# silently switch to a list if any column carried more than one class.
vapply(titanic, function(column) class(column)[1L], character(1))
## survived embarked age sex sibsp parch fare
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Frequency and percentage of each outcome class (0 / 1) in survived
y <- titanic$survived
local({
  # Tabulate once and reuse the counts for both columns
  class_counts <- table(y)
  cbind(freq = class_counts, percentage = 100 * prop.table(class_counts))
})
## freq percentage
## 0 809 61.8029
## 1 500 38.1971
# Standard deviation of every column.
# na.rm = TRUE is forwarded to sd() so that columns containing missing values
# report the spread of their observed values instead of NA.
# vapply() guarantees exactly one numeric value per column.
# NOTE: the output recorded below was captured before na.rm = TRUE was added.
vapply(titanic[, 1:7], sd, numeric(1), na.rm = TRUE)
## survived embarked age sex sibsp parch fare
## 0.4860552 NA NA 0.4789973 1.0416584 0.8655603 NA
# Skewness of every column, computed with skewness() from e1071.
# Iterating over the columns with vapply() avoids the data-frame-to-matrix
# coercion that apply() performs, and na.rm = TRUE lets columns with missing
# values report a skewness for their observed values instead of NA.
# NOTE: the output recorded below was captured before na.rm = TRUE was added.
skew <- vapply(titanic[, 1:7], skewness, numeric(1), na.rm = TRUE)
# display skewness, larger/smaller deviations from 0 show more skew
print(skew)
## survived embarked age sex sibsp parch fare
## 0.4852900 NA NA 0.6008094 3.8354145 3.6606736 NA
# Share of passengers who survived: the mean of the 0/1 outcome column
survived_proportion <- mean(titanic[["survived"]])
# Show the share
print(survived_proportion)
## [1] 0.381971
# Report whether any cell is missing, then drop every row that has at least
# one missing value and inspect what is left
anyNA(titanic)
titanic <- stats::na.omit(titanic)
str(titanic)
## tibble [993 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: num [1:993] 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: num [1:993] 1 1 1 1 1 1 1 1 1 2 ...
## $ age : num [1:993] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sex : num [1:993] 2 1 2 1 2 1 2 1 2 1 ...
## $ sibsp : num [1:993] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:993] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:993] 211 152 152 152 152 ...
## - attr(*, "na.action")= 'omit' Named int [1:316] 16 38 41 47 60 70 71 75 81 107 ...
## ..- attr(*, "names")= chr [1:316] "16" "38" "41" "47" ...
# View() needs a GUI, so only open the viewer in an interactive session;
# an unguarded call can abort a batch or headless run.
if (interactive()) {
  View(titanic)
}
# Pairwise correlations between all columns (cor() defaults to Pearson).
# Every column is numeric at this point, which cor() requires.
correlation_matrix <- cor(titanic)
# Keep only the row that relates "survived" to each column
survived_correlation <- correlation_matrix["survived", ]
# Show the correlations of "survived" with the other variables
print(survived_correlation)
## survived embarked age sex sibsp parch
## 1.000000000 0.219669225 -0.053274961 0.546984591 -0.001033379 0.124687212
## fare
## 0.246971525
# Draw the correlation matrix, one circle per pair of variables
corrplot(corr = correlation_matrix, method = "circle")

# Turn the outcome and the two coded categorical predictors into factors
titanic[["survived"]] <- as.factor(titanic[["survived"]])
titanic[["embarked"]] <- as.factor(titanic[["embarked"]])
titanic[["sex"]] <- as.factor(titanic[["sex"]])
# Inspect the structure to confirm the three columns are now factors
str(titanic)
## tibble [993 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
## $ embarked: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 2 ...
## $ age : num [1:993] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 2 1 2 1 2 1 2 1 ...
## $ sibsp : num [1:993] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:993] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:993] 211 152 152 152 152 ...
## - attr(*, "na.action")= 'omit' Named int [1:316] 16 38 41 47 60 70 71 75 81 107 ...
## ..- attr(*, "names")= chr [1:316] "16" "38" "41" "47" ...
# Fix the random seed so the random split below is reproducible
set.seed(1000)
# Split the rows at random into roughly 80% training and 20% testing.
# The indices are drawn from the rows titanic actually has at this point: the
# table shrinks when incomplete rows are removed, so fixed ranges sized for the
# raw 1309-row file (1:1046 / 1047:1309) run past its end. That padded the
# training set with all-NA rows and left the test set with nothing but NA rows.
# NOTE: the tree output recorded further down was produced with the old split.
n_rows <- nrow(titanic)
train_indices <- sort(sample(seq_len(n_rows), size = floor(0.8 * n_rows)))
test_indices <- setdiff(seq_len(n_rows), train_indices)
# Create the training and testing datasets from the disjoint index sets
training_data <- titanic[train_indices, ]
testing_data <- titanic[test_indices, ]
# Train a classification tree (method = "class") that predicts survived from
# the six predictors, and keep the fitted model in `fit`
fit <- rpart(survived ~ sex + age + sibsp + parch + fare + embarked,
             data = training_data, method = "class")
# plot() on an rpart fit draws only the branches; text() adds the split rules
# and leaf labels (use.n = TRUE also prints the class counts in each leaf).
# The margin leaves room so the labels are not clipped at the plot border.
plot(fit, margin = 0.1)
text(fit, use.n = TRUE)

# Save the same labelled plot as a PNG file
png("regression_tree.png")
plot(fit, margin = 0.1)
text(fit, use.n = TRUE)
dev.off()
## png
## 2
# Draw the fitted tree again with rattle's more readable layout
fancyRpartPlot(model = fit)

# Print the detailed report for the fitted tree: CP table, variable
# importance, and the splits considered at each node
summary(object = fit)
## Call:
## rpart(formula = survived ~ sex + age + sibsp + parch + fare +
## embarked, data = training_data, method = "class")
## n=993 (53 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.47572816 0 1.0000000 1.0000000 0.03768468
## 2 0.02669903 1 0.5242718 0.5242718 0.03155481
## 3 0.02427184 2 0.4975728 0.5218447 0.03150193
## 4 0.01000000 3 0.4733010 0.4830097 0.03061716
##
## Variable importance
## sex age parch fare sibsp
## 72 8 8 7 6
##
## Node number 1: 993 observations, complexity param=0.4757282
## predicted class=0 expected loss=0.4149043 P(node) =1
## class counts: 581 412
## probabilities: 0.585 0.415
## left son=2 (631 obs) right son=3 (362 obs)
## Primary splits:
## sex splits as LR, improve=144.246200, (0 missing)
## fare < 15.1729 to the left, improve= 39.594810, (0 missing)
## embarked splits as LR, improve= 23.264440, (0 missing)
## parch < 0.5 to the left, improve= 20.579540, (0 missing)
## age < 8.5 to the right, improve= 9.988721, (0 missing)
## Surrogate splits:
## parch < 0.5 to the left, agree=0.674, adj=0.105, (0 split)
## fare < 77.6229 to the left, agree=0.665, adj=0.080, (0 split)
## age < 5.5 to the right, agree=0.637, adj=0.006, (0 split)
##
## Node number 2: 631 observations, complexity param=0.02669903
## predicted class=0 expected loss=0.2107765 P(node) =0.6354481
## class counts: 498 133
## probabilities: 0.789 0.211
## left son=4 (592 obs) right son=5 (39 obs)
## Primary splits:
## age < 9.5 to the right, improve=15.390130, (0 missing)
## fare < 26.26875 to the left, improve= 6.785901, (0 missing)
## embarked splits as LR, improve= 6.167414, (0 missing)
## parch < 0.5 to the left, improve= 5.107995, (0 missing)
## sibsp < 0.5 to the left, improve= 0.824624, (0 missing)
## Surrogate splits:
## sibsp < 3.5 to the left, agree=0.943, adj=0.077, (0 split)
##
## Node number 3: 362 observations
## predicted class=1 expected loss=0.2292818 P(node) =0.3645519
## class counts: 83 279
## probabilities: 0.229 0.771
##
## Node number 4: 592 observations
## predicted class=0 expected loss=0.1824324 P(node) =0.5961732
## class counts: 484 108
## probabilities: 0.818 0.182
##
## Node number 5: 39 observations, complexity param=0.02427184
## predicted class=1 expected loss=0.3589744 P(node) =0.03927492
## class counts: 14 25
## probabilities: 0.359 0.641
## left son=10 (12 obs) right son=11 (27 obs)
## Primary splits:
## sibsp < 2.5 to the right, improve=10.7820500, (0 missing)
## fare < 20.825 to the right, improve= 1.5954570, (0 missing)
## age < 3.5 to the right, improve= 0.4884005, (0 missing)
## parch < 1.5 to the right, improve= 0.3345875, (0 missing)
## Surrogate splits:
## fare < 26.95 to the right, agree=0.744, adj=0.167, (0 split)
##
## Node number 10: 12 observations
## predicted class=0 expected loss=0.08333333 P(node) =0.01208459
## class counts: 11 1
## probabilities: 0.917 0.083
##
## Node number 11: 27 observations
## predicted class=1 expected loss=0.1111111 P(node) =0.02719033
## class counts: 3 24
## probabilities: 0.111 0.889
#What is the most “important” feature, i.e. the one on which the tree splits first?
#sex
#Do you agree with the Titanic slogan “Women and children first!”?
#yes, with sex being the most important feature and age being the next most
#important, it is suggested that gender and age play a vital role in survival
# Predict a class label for every row of the test dataset
Prediction <- predict(object = fit, newdata = testing_data, type = "class")
# Pair each test passenger's sex code with the predicted outcome
Results <- data.frame(
  PassengerSex = testing_data[["sex"]],
  Survived = Prediction
)
# Write the table to a CSV file, leaving out the row-name column
write.csv(x = Results, file = "Titanicdtree.csv", row.names = FALSE)