Project 3: Using Regression Trees: Titanic Data Analysis

# Load required library
library(readr)
# Steps 1-2: download the Titanic CSV from the course short link and keep the
# parsed tibble in T3 (readr guesses the column types and reports them).
T3 <- read_csv(file = "https://goo.gl/At238b")

## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Inspect the column names and parsed types of the raw import.
str(object = T3)

## spc_tbl_ [1,309 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ pclass   : chr [1:1309] "1st" "1st" "1st" "1st" ...
##  $ survived : num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
##  $ name     : chr [1:1309] "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ sex      : chr [1:1309] "female" "male" "female" "male" ...
##  $ age      : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sibsp    : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : chr [1:1309] "24160" "113781" "113781" "113781" ...
##  $ fare     : num [1:1309] 211 152 152 152 152 ...
##  $ cabin    : chr [1:1309] "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked : chr [1:1309] "S" "S" "S" "S" ...
##  $ boat     : chr [1:1309] "2" "11" NA NA ...
##  $ body     : num [1:1309] NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: chr [1:1309] "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   pclass = col_character(),
##   ..   survived = col_double(),
##   ..   name = col_character(),
##   ..   sex = col_character(),
##   ..   age = col_double(),
##   ..   sibsp = col_double(),
##   ..   parch = col_double(),
##   ..   ticket = col_character(),
##   ..   fare = col_double(),
##   ..   cabin = col_character(),
##   ..   embarked = col_character(),
##   ..   boat = col_character(),
##   ..   body = col_double(),
##   ..   home.dest = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Preview the first six rows of the raw import.
print(head(T3, n = 6L))

## # A tibble: 6 × 14
##   pclass survived name       sex     age sibsp parch ticket  fare cabin embarked
##   <chr>     <dbl> <chr>      <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr> <chr>   
## 1 1st           1 Allen, Mi… fema… 29        0     0 24160  211.  B5    S       
## 2 1st           1 Allison, … male   0.92     1     2 113781 152.  C22 … S       
## 3 1st           0 Allison, … fema…  2        1     2 113781 152.  C22 … S       
## 4 1st           0 Allison, … male  30        1     2 113781 152.  C22 … S       
## 5 1st           0 Allison, … fema… 25        1     2 113781 152.  C22 … S       
## 6 1st           1 Anderson,… male  48        0     0 19952   26.6 E12   S       
## # ℹ 3 more variables: boat <chr>, body <dbl>, home.dest <chr>

# Step 3: keep only the outcome (survived) and the six predictors used below.
titanic <- T3[c("survived", "embarked", "age", "sex", "sibsp", "parch", "fare")]

# Step 4: descriptive statistics of the reduced dataset.
# Column-by-column structure of `titanic`.
str(object = titanic)

## tibble [1,309 × 7] (S3: tbl_df/tbl/data.frame)
##  $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: chr [1:1309] "S" "S" "S" "S" ...
##  $ age     : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sex     : chr [1:1309] "female" "male" "female" "male" ...
##  $ sibsp   : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:1309] 211 152 152 152 152 ...

# Number of rows and columns in the reduced dataset.
print(dim(titanic))

## [1] 1309    7

# Storage class of every column, returned as a named character vector.
# vapply() guarantees a character result whatever the column types are
# (sapply() would silently switch to a list for a multi-class column);
# a column with several classes is collapsed into a single "a/b" label.
vapply(
  titanic,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)

##    survived    embarked         age         sex       sibsp       parch 
##   "numeric" "character"   "numeric" "character"   "numeric"   "numeric" 
##        fare 
##   "numeric"

# Per-column summary: quartiles, mean and NA count for the numeric columns,
# length and mode for the character columns.
print(summary(titanic))

##     survived       embarked              age            sex           
##  Min.   :0.000   Length:1309        Min.   : 0.17   Length:1309       
##  1st Qu.:0.000   Class :character   1st Qu.:21.00   Class :character  
##  Median :0.000   Mode  :character   Median :28.00   Mode  :character  
##  Mean   :0.382                      Mean   :29.88                     
##  3rd Qu.:1.000                      3rd Qu.:39.00                     
##  Max.   :1.000                      Max.   :80.00                     
##                                     NA's   :263                       
##      sibsp            parch            fare        
##  Min.   :0.0000   Min.   :0.000   Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385   Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000   Max.   :512.329  
##                                   NA's   :1

# Load required library
library(ggplot2)


# Bar chart: number of passengers of each sex.
ggplot(data = titanic, mapping = aes(x = sex)) +
  geom_bar(fill = "blue") +
  labs(title = "Gender Distribution", x = "Gender", y = "Count")

# Histogram of passenger age. Age is continuous (fractional values occur for
# infants), so it is binned into 5-year intervals instead of drawing one bar
# per distinct value. The 263 passengers with a missing age are dropped
# explicitly via na.rm = TRUE, so no "removed rows" warning is emitted.
ggplot(titanic, aes(x = age)) +
  geom_histogram(binwidth = 5, boundary = 0, fill = "blue", na.rm = TRUE) +
  labs(title = "Age Distribution", x = "Age", y = "Count")

# Step 5: number of passengers who died (0) and survived (1), shown both as
# raw counts and as percentages of all passengers.
S <- titanic[["survived"]]
cbind(freq = table(S), percentage = 100 * proportions(table(S)))

##   freq percentage
## 0  809    61.8029
## 1  500    38.1971

# survived is coded 0/1, so its mean is the overall survival rate.
survived_proportion <- mean(x = titanic[["survived"]])
# Show the rate
print(x = survived_proportion)

## [1] 0.381971

# Stacked bar chart of passenger counts per sex, coloured by outcome
# (red = did not survive, blue = survived).
ggplot(data = titanic, mapping = aes(x = sex, fill = factor(survived))) +
  geom_bar() +
  labs(title = "Survivorship: Gender ", x = "Gender", y = "Count") +
  scale_fill_manual(
    values = c("red", "blue"),
    labels = c("Not Survived", "Survived")
  )

# Stacked bar chart of passenger counts per port of embarkation, coloured by
# outcome (red = did not survive, blue = survived).
ggplot(data = titanic, mapping = aes(x = embarked, fill = factor(survived))) +
  geom_bar() +
  labs(
    title = "Survivorship: on the basis of Port of Embarkation ",
    x = "Port of Embarkation",
    y = "Count"
  ) +
  scale_fill_manual(
    values = c("red", "blue"),
    labels = c("Not Survived", "Survived")
  )

# Box plot of fare for each outcome group; outline = FALSE hides the outlier
# points so the boxes stay readable.
boxplot(
  fare ~ survived,
  data = titanic,
  names = c("No", "Yes"),
  col = c("red", "blue"),
  outline = FALSE,
  main = "Survivorship: on the basis of Fare",
  xlab = "Survived",
  ylab = "Fare"
)

# Count the missing values in each column.
titanic_na <- colSums(x = is.na(titanic))
print(titanic_na)

## survived embarked      age      sex    sibsp    parch     fare 
##        0        2      263        0        0        0        1

# Step 6: drop every row that has at least one missing value
# (266 of the 1309 rows are removed, leaving 1043).
titanic <- stats::na.omit(titanic)
print(summary(titanic))

##     survived        embarked              age            sex           
##  Min.   :0.0000   Length:1043        Min.   : 0.17   Length:1043       
##  1st Qu.:0.0000   Class :character   1st Qu.:21.00   Class :character  
##  Median :0.0000   Mode  :character   Median :28.00   Mode  :character  
##  Mean   :0.4075                      Mean   :29.81                     
##  3rd Qu.:1.0000                      3rd Qu.:39.00                     
##  Max.   :1.0000                      Max.   :80.00                     
##      sibsp            parch             fare       
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  8.05  
##  Median :0.0000   Median :0.0000   Median : 15.75  
##  Mean   :0.5043   Mean   :0.4219   Mean   : 36.60  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 35.08  
##  Max.   :8.0000   Max.   :6.0000   Max.   :512.33

# Step 7: store the outcome and the two categorical predictors as factors.
titanic[c("survived", "embarked", "sex")] <-
  lapply(titanic[c("survived", "embarked", "sex")], factor)

# Step 8: correlation matrix between survival and the other features.

# cor() needs numeric input, so expose the level codes of the factors.
titanic[["Sex_num"]] <- as.numeric(titanic[["sex"]]) # female = 1, male = 2
titanic[["Embarked_num"]] <- as.numeric(titanic[["embarked"]])
print(head(titanic[["sex"]]))

## [1] female male   female male   female male  
## Levels: female male

print(head(titanic[["Sex_num"]]))

## [1] 1 2 1 2 1 2

print(head(titanic[["embarked"]]))

## [1] S S S S S S
## Levels: C Q S

print(head(titanic[["Embarked_num"]])) # level codes: C = 1, Q = 2, S = 3

## [1] 3 3 3 3 3 3

# Numeric copies of the columns used by cor(). as.numeric() on a factor returns
# its level codes, so Survived_num is 1 (died) / 2 (survived) rather than 0/1;
# that shift does not change any correlation. age is already numeric.
titanic[["Survived_num"]] <- as.numeric(titanic[["survived"]])
titanic[["Age_num"]] <- as.numeric(titanic[["age"]])
titanic[["Sex_num"]] <- as.numeric(titanic[["sex"]])

# load package
library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.92 loaded

# Pairwise correlations between the numeric outcome and the numeric features.
correlations <- cor(
  x = titanic[, c(
    "Survived_num", "Age_num", "sibsp", "parch",
    "fare", "Sex_num", "Embarked_num"
  )]
)
print(correlations)

##              Survived_num     Age_num       sibsp       parch       fare
## Survived_num   1.00000000 -0.05741486 -0.01140343  0.11543601  0.2478576
## Age_num       -0.05741486  1.00000000 -0.24234489 -0.14931063  0.1772057
## sibsp         -0.01140343 -0.24234489  1.00000000  0.37395967  0.1421305
## parch          0.11543601 -0.14931063  0.37395967  1.00000000  0.2176495
## fare           0.24785762  0.17720569  0.14213054  0.21764954  1.0000000
## Sex_num       -0.53633212  0.06600630 -0.09646420 -0.22253083 -0.1864003
## Embarked_num  -0.20225751 -0.08326914  0.04550984  0.01122982 -0.3014545
##                 Sex_num Embarked_num
## Survived_num -0.5363321  -0.20225751
## Age_num       0.0660063  -0.08326914
## sibsp        -0.0964642   0.04550984
## parch        -0.2225308   0.01122982
## fare         -0.1864003  -0.30145454
## Sex_num       1.0000000   0.10942541
## Embarked_num  0.1094254   1.00000000

# Correlation of survival with every other variable: the "Survived_num" row of
# the correlation matrix.
S_correlation <- correlations["Survived_num", ]
# Reading the signs: a higher Survived_num means the passenger survived, and
# Sex_num is coded female = 1, male = 2.
#   Age_num (about -0.057): very weak negative correlation; younger passengers
#     were only marginally more likely to survive.
#   sibsp (about -0.011): essentially no linear relationship between survival
#     and the number of siblings/spouses aboard.
#   parch (about 0.115): weak positive correlation; passengers travelling with
#     more parents/children were slightly more likely to survive.
#   fare (about 0.248): weak positive correlation; passengers who paid higher
#     fares were somewhat more likely to survive.
#   Sex_num (about -0.536): moderate negative correlation; females (code 1)
#     were more likely to survive than males (code 2).
#   Embarked_num (about -0.202): weak negative correlation; survival differed
#     somewhat by port. The codes (C = 1, Q = 2, S = 3) are arbitrary labels,
#     so only the existence of an association should be read from this value.
print(S_correlation)

## Survived_num      Age_num        sibsp        parch         fare      Sex_num 
##   1.00000000  -0.05741486  -0.01140343   0.11543601   0.24785762  -0.53633212 
## Embarked_num 
##  -0.20225751

# Step 9: draw the correlation matrix, one circle per pair of variables.
corrplot(correlations, method = "circle")

# Step 10: fix the random seed so the random split below is reproducible.
set.seed(seed = 1000)

# Summary of the cleaned dataset, including the derived numeric columns.
print(summary(titanic))

##  survived embarked      age            sex          sibsp       
##  0:618    C:212    Min.   : 0.17   female:386   Min.   :0.0000  
##  1:425    Q: 50    1st Qu.:21.00   male  :657   1st Qu.:0.0000  
##           S:781    Median :28.00                Median :0.0000  
##                    Mean   :29.81                Mean   :0.5043  
##                    3rd Qu.:39.00                3rd Qu.:1.0000  
##                    Max.   :80.00                Max.   :8.0000  
##      parch             fare           Sex_num      Embarked_num  
##  Min.   :0.0000   Min.   :  0.00   Min.   :1.00   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:  8.05   1st Qu.:1.00   1st Qu.:2.000  
##  Median :0.0000   Median : 15.75   Median :2.00   Median :3.000  
##  Mean   :0.4219   Mean   : 36.60   Mean   :1.63   Mean   :2.546  
##  3rd Qu.:1.0000   3rd Qu.: 35.08   3rd Qu.:2.00   3rd Qu.:3.000  
##  Max.   :6.0000   Max.   :512.33   Max.   :2.00   Max.   :3.000  
##   Survived_num      Age_num     
##  Min.   :1.000   Min.   : 0.17  
##  1st Qu.:1.000   1st Qu.:21.00  
##  Median :1.000   Median :28.00  
##  Mean   :1.407   Mean   :29.81  
##  3rd Qu.:2.000   3rd Qu.:39.00  
##  Max.   :2.000   Max.   :80.00

# Random split of the dataset into 80% training and 20% testing rows.
# sample.int() draws row numbers from 1..n directly, which avoids the 1:n
# pitfall for an empty table, and floor() makes the training size an explicit
# whole number instead of relying on sample() truncating 0.8 * n silently.
# For a given seed this selects the same rows as sample(1:n, 0.8 * n).
train_indices <- sample.int(nrow(titanic), size = floor(0.8 * nrow(titanic)))
train <- titanic[train_indices, ]
test <- titanic[-train_indices, ]


library(rpart)

## Warning: package 'rpart' was built under R version 4.3.3

# Fit a classification tree (method = "class") for survival on the random
# training split and keep the model in `fit`.
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = train,
  method = "class"
)
library(rattle)

## Warning: package 'rattle' was built under R version 4.3.3

## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.3.3

library(RColorBrewer)
# Draw the fitted tree with rpart.plot, then again with rattle's layout.
rpart.plot(
  fit,
  nn = TRUE,            # display the node numbers
  shadow.col = "gray",  # shadows under the node boxes
  branch.lty = 3,       # dotted branch lines
  box.palette = "GnBu"  # color scheme
)

fancyRpartPlot(fit)

# Step 11: fixed split required by the assignment -- rows 1..834 for training
# and rows 835..1043 for testing. This replaces any earlier train/test objects.
# NOTE(review): rows are taken in file order, so this is not a random sample;
# confirm the file order does not bias the test set.
train <- titanic[seq_len(834), ]
test <- titanic[seq.int(from = 835, to = 1043), ]

#12: Load the rpart package (it must already be installed; this line does not install it)
library(rpart)


# Step 13: fit a classification tree (method = "class") for survival on the
# training rows and keep the model in `fit`.
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = train,
  method = "class"
)
# Steps 14-15: draw the fitted tree on the current graphics device. The model
# was fitted with method = "class", so this is a classification tree.

rpart.plot(
  fit,
  nn = TRUE,            # display the node numbers
  shadow.col = "gray",  # shadows under the node boxes
  branch.lty = 3,       # dotted branch lines
  box.palette = "GnBu"  # color scheme
)

# Save the same tree plot as an 800x600 PNG. The file goes to the working
# directory (relative path, like the CSV written later in this script) rather
# than to a machine-specific absolute path, so the script runs on any computer.
png(filename = "regression_tree.png", width = 800, height = 600, res = 120)
rpart.plot(fit,
           box.palette = "GnBu", # color scheme
           branch.lty = 3, # dotted branch lines
           shadow.col = "gray", # shadows under the node boxes
           nn = TRUE) # display the node numbers
# Close the png device so the image file is finalised on disk
dev.off()

## png 
##   2

# Step 16: fancyRpartPlot() draws the same tree in a more readable layout.
fancyRpartPlot(fit)

# summary() lists the candidate splits at every node and the variable
# importance; the primary split of node 1 is the feature the tree splits on first.
summary(object = fit)

## Call:
## rpart(formula = survived ~ sex + age + sibsp + parch + fare + 
##     embarked, data = train, method = "class")
##   n= 834 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.54423592      0 1.0000000 1.0000000 0.03849577
## 2 0.03753351      1 0.4557641 0.4557641 0.03119009
## 3 0.01340483      2 0.4182306 0.4235925 0.03033961
## 4 0.01000000      5 0.3780161 0.4182306 0.03019154
## 
## Variable importance
##      sex     fare      age    parch    sibsp embarked 
##       65       12        9        7        5        2 
## 
## Node number 1: 834 observations,    complexity param=0.5442359
##   predicted class=0  expected loss=0.4472422  P(node) =1
##     class counts:   461   373
##    probabilities: 0.553 0.447 
##   left son=2 (511 obs) right son=3 (323 obs)
##   Primary splits:
##       sex      splits as  RL,           improve=142.00650, (0 missing)
##       fare     < 15.625   to the left,  improve= 44.08751, (0 missing)
##       parch    < 0.5      to the left,  improve= 21.33507, (0 missing)
##       embarked splits as  RLL,          improve= 15.82247, (0 missing)
##       age      < 8.5      to the right, improve= 14.18318, (0 missing)
##   Surrogate splits:
##       parch < 0.5      to the left,  agree=0.656, adj=0.111, (0 split)
##       fare  < 77.6229  to the left,  agree=0.649, adj=0.093, (0 split)
##       sibsp < 2.5      to the left,  agree=0.616, adj=0.009, (0 split)
##       age   < 16.5     to the right, agree=0.615, adj=0.006, (0 split)
## 
## Node number 2: 511 observations,    complexity param=0.03753351
##   predicted class=0  expected loss=0.2152642  P(node) =0.6127098
##     class counts:   401   110
##    probabilities: 0.785 0.215 
##   left son=4 (483 obs) right son=5 (28 obs)
##   Primary splits:
##       age      < 10       to the right, improve=16.941050, (0 missing)
##       fare     < 26.26875 to the left,  improve= 8.516230, (0 missing)
##       parch    < 0.5      to the left,  improve= 5.474622, (0 missing)
##       embarked splits as  RLL,          improve= 4.881588, (0 missing)
##       sibsp    < 0.5      to the left,  improve= 1.056154, (0 missing)
##   Surrogate splits:
##       sibsp < 3.5      to the left,  agree=0.951, adj=0.107, (0 split)
## 
## Node number 3: 323 observations,    complexity param=0.01340483
##   predicted class=1  expected loss=0.1857585  P(node) =0.3872902
##     class counts:    60   263
##    probabilities: 0.186 0.814 
##   left son=6 (42 obs) right son=7 (281 obs)
##   Primary splits:
##       fare     < 10.1625  to the left,  improve=12.6432300, (0 missing)
##       sibsp    < 3.5      to the right, improve= 6.4895870, (0 missing)
##       embarked splits as  RLR,          improve= 4.6826630, (0 missing)
##       age      < 28.5     to the left,  improve= 1.8840550, (0 missing)
##       parch    < 1.5      to the right, improve= 0.5440428, (0 missing)
##   Surrogate splits:
##       embarked splits as  RLR, agree=0.904, adj=0.262, (0 split)
## 
## Node number 4: 483 observations
##   predicted class=0  expected loss=0.184265  P(node) =0.5791367
##     class counts:   394    89
##    probabilities: 0.816 0.184 
## 
## Node number 5: 28 observations
##   predicted class=1  expected loss=0.25  P(node) =0.03357314
##     class counts:     7    21
##    probabilities: 0.250 0.750 
## 
## Node number 6: 42 observations,    complexity param=0.01340483
##   predicted class=0  expected loss=0.452381  P(node) =0.05035971
##     class counts:    23    19
##    probabilities: 0.548 0.452 
##   left son=12 (28 obs) right son=13 (14 obs)
##   Primary splits:
##       age      < 19.5     to the right, improve=2.8809520, (0 missing)
##       fare     < 8.35625  to the right, improve=1.6720240, (0 missing)
##       sibsp    < 0.5      to the right, improve=1.2135640, (0 missing)
##       embarked splits as  RLR,          improve=0.1280423, (0 missing)
##   Surrogate splits:
##       fare     < 7.74165  to the right, agree=0.714, adj=0.143, (0 split)
##       parch    < 0.5      to the left,  agree=0.690, adj=0.071, (0 split)
##       embarked splits as  RLL,          agree=0.690, adj=0.071, (0 split)
## 
## Node number 7: 281 observations,    complexity param=0.01340483
##   predicted class=1  expected loss=0.1316726  P(node) =0.3369305
##     class counts:    37   244
##    probabilities: 0.132 0.868 
##   left son=14 (7 obs) right son=15 (274 obs)
##   Primary splits:
##       sibsp    < 3.5      to the right, improve=7.556541, (0 missing)
##       fare     < 48.2     to the left,  improve=4.423521, (0 missing)
##       age      < 11.5     to the left,  improve=1.903240, (0 missing)
##       parch    < 1.5      to the right, improve=1.666061, (0 missing)
##       embarked splits as  RLL,          improve=0.909409, (0 missing)
## 
## Node number 12: 28 observations
##   predicted class=0  expected loss=0.3214286  P(node) =0.03357314
##     class counts:    19     9
##    probabilities: 0.679 0.321 
## 
## Node number 13: 14 observations
##   predicted class=1  expected loss=0.2857143  P(node) =0.01678657
##     class counts:     4    10
##    probabilities: 0.286 0.714 
## 
## Node number 14: 7 observations
##   predicted class=0  expected loss=0.1428571  P(node) =0.008393285
##     class counts:     6     1
##    probabilities: 0.857 0.143 
## 
## Node number 15: 274 observations
##   predicted class=1  expected loss=0.1131387  P(node) =0.3285372
##     class counts:    31   243
##    probabilities: 0.113 0.887

# The feature used for the first split (the root node) is typically regarded as the most important one for separating the data.
# Here it is 'sex', which is consistent with the "women and children first" principle observed in the Titanic disaster.
# The "Variable importance" section agrees: 'sex' has the highest importance value (65), which is in line with that motto.

# Predict the survival class (0 or 1) of every row in the test set.
Prediction <- predict(object = fit, newdata = test, type = "class")
print(Prediction)

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   0   0   0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0   1   1   0   0   0 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   0   1   1   0   1   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   0   1   0   0   0   0   0   0   0   0   0   0   0   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   0   0   1   0   0   0   1   1   1   0   0   0   0   0   0   0   0   1   0 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   0   0   0   0   0   1   1   1   1   1   1   0   0   1   1   0   1   0   0   0 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   0   0   0   0   0   1   1   1   0   0   0   0   0   0   0   1   1   0   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   0   1   0   0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0   0 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   0   0   1   1   0   0   0   0   1   1   1   0   1   0   0   0   1   0   1   0 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   0   0   0   1   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0 
## 201 202 203 204 205 206 207 208 209 
##   0   0   0   1   0   1   0   0   0 
## Levels: 0 1

# Pair each test passenger's sex with the class the tree predicted for them.
Results <- data.frame(
  PassengerSex = test[["sex"]],
  Survived = Prediction
)
print(Results)

##     PassengerSex Survived
## 1           male        0
## 2           male        0
## 3           male        0
## 4           male        0
## 5           male        0
## 6           male        0
## 7         female        0
## 8         female        0
## 9           male        0
## 10        female        1
## 11          male        0
## 12          male        0
## 13          male        0
## 14        female        1
## 15          male        0
## 16          male        0
## 17          male        0
## 18        female        0
## 19          male        0
## 20          male        0
## 21          male        0
## 22          male        0
## 23        female        0
## 24          male        0
## 25          male        0
## 26        female        1
## 27        female        0
## 28          male        0
## 29        female        1
## 30          male        0
## 31          male        0
## 32          male        0
## 33          male        0
## 34          male        0
## 35          male        0
## 36          male        1
## 37        female        1
## 38          male        0
## 39        female        0
## 40          male        0
## 41          male        0
## 42        female        1
## 43        female        1
## 44          male        0
## 45        female        1
## 46          male        0
## 47          male        0
## 48        female        1
## 49        female        0
## 50          male        0
## 51        female        1
## 52        female        0
## 53          male        0
## 54          male        0
## 55          male        0
## 56          male        0
## 57          male        0
## 58        female        0
## 59          male        0
## 60          male        0
## 61        female        0
## 62          male        1
## 63          male        0
## 64          male        0
## 65        female        0
## 66          male        0
## 67          male        0
## 68          male        0
## 69        female        0
## 70        female        0
## 71          male        0
## 72          male        0
## 73        female        0
## 74          male        1
## 75          male        1
## 76        female        1
## 77        female        1
## 78        female        1
## 79          male        1
## 80          male        1
## 81          male        1
## 82          male        0
## 83          male        0
## 84        female        1
## 85          male        0
## 86          male        0
## 87          male        0
## 88          male        1
## 89        female        1
## 90        female        1
## 91          male        0
## 92          male        0
## 93          male        0
## 94          male        0
## 95          male        0
## 96        female        0
## 97          male        0
## 98          male        0
## 99        female        1
## 100         male        0
## 101         male        0
## 102         male        0
## 103         male        0
## 104         male        0
## 105         male        0
## 106         male        1
## 107         male        1
## 108         male        1
## 109         male        1
## 110       female        1
## 111       female        1
## 112         male        0
## 113         male        0
## 114       female        1
## 115       female        1
## 116         male        0
## 117       female        1
## 118         male        0
## 119         male        0
## 120         male        0
## 121         male        0
## 122         male        0
## 123         male        0
## 124       female        0
## 125         male        0
## 126       female        1
## 127       female        1
## 128       female        1
## 129         male        0
## 130         male        0
## 131         male        0
## 132         male        0
## 133         male        0
## 134         male        0
## 135         male        0
## 136       female        1
## 137         male        1
## 138         male        0
## 139       female        1
## 140       female        1
## 141         male        0
## 142       female        1
## 143         male        0
## 144         male        0
## 145         male        0
## 146         male        0
## 147       female        0
## 148         male        0
## 149         male        0
## 150       female        0
## 151         male        0
## 152         male        0
## 153       female        1
## 154       female        1
## 155         male        0
## 156         male        0
## 157         male        0
## 158         male        0
## 159         male        0
## 160         male        0
## 161         male        0
## 162         male        0
## 163         male        1
## 164       female        1
## 165         male        0
## 166         male        0
## 167         male        0
## 168         male        0
## 169         male        1
## 170       female        1
## 171       female        1
## 172         male        0
## 173       female        1
## 174       female        0
## 175         male        0
## 176         male        0
## 177       female        1
## 178         male        0
## 179       female        1
## 180         male        0
## 181         male        0
## 182         male        0
## 183         male        0
## 184       female        1
## 185         male        0
## 186         male        0
## 187       female        1
## 188         male        0
## 189         male        0
## 190       female        1
## 191         male        0
## 192         male        0
## 193         male        0
## 194       female        0
## 195         male        0
## 196         male        0
## 197         male        0
## 198       female        0
## 199         male        0
## 200         male        0
## 201         male        0
## 202         male        0
## 203         male        0
## 204       female        1
## 205         male        0
## 206       female        1
## 207         male        0
## 208         male        0
## 209         male        0

# Write the predictions to a CSV file in the working directory, then read the
# file back to check what was stored.
write.csv(x = Results, file = "Titanicdtree.csv", row.names = FALSE)
TCT <- read.csv(file = "Titanicdtree.csv")
# NOTE(review): lowercase view() is not base R; presumably it resolves to
# tibble::view(), since tibble is attached as a dependency of rattle -- confirm.
view(TCT)
#uploaded on Teams  #https://tigernet365.sharepoint.com/:x:/s/CS583Spring24/EVGbP-FDTbVPvUwfrUTBykMB-Mxkl-qfDCMHh2oTBRVLZQ?e=xZ1iKW
# Cross-tabulate the true outcome (rows) against the predicted class (columns).
conf_matrix <- table(Actual = test[["survived"]], Predicted = Prediction)
print(x = "Confusion Matrix:")

## [1] "Confusion Matrix:"

print(x = conf_matrix)

##       Predicted
## Actual   0   1
##      0 120  37
##      1  32  20

library(ggplot2)

# Long-format copy of the confusion matrix for ggplot: one row per cell, with
# columns Actual, Predicted (taken from the table's dimension names) and Count.
conf_matrix_df <- as.data.frame(conf_matrix, responseName = "Count")
# Reading the cells with class 1 (survived) as the positive class:
#   true positives  (TP) = 20
#   true negatives  (TN) = 120
#   false positives (FP) = 37
#   false negatives (FN) = 32

# Plot the confusion matrix using ggplot
ggplot(data = conf_matrix_df, aes(x = Predicted, y = Actual, fill = Count)) +
  geom_tile(color = "white") +  # Add tiles
  geom_text(aes(label = Count)) +  # Add text labels
  scale_fill_gradient(low = "lightblue", high = "darkblue") +  # Define color gradient
  labs(title = "Confusion Matrix", x = "Predicted", y = "Actual") +  # Add titles
  theme_minimal()  # Set theme

# Accuracy: share of test observations whose predicted class equals the
# actual outcome
accuracy <- mean(Prediction == test$survived)
print(sprintf("Accuracy: %s", accuracy))

## [1] "Accuracy: 0.669856459330144"

# Accuracy via an agreement vector: start by cross-tabulating the predicted
# classes (rows) against the actual outcomes (columns)
table(Prediction, test$survived)

##           
## Prediction   0   1
##          0 120  32
##          1  37  20

# Show the predicted class for every test observation
print(Prediction)

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   0   0   0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0   1   1   0   0   0 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   0   1   1   0   1   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   0   1   0   0   0   0   0   0   0   0   0   0   0   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   0   0   1   0   0   0   1   1   1   0   0   0   0   0   0   0   0   1   0 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   0   0   0   0   0   1   1   1   1   1   1   0   0   1   1   0   1   0   0   0 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   0   0   0   0   0   1   1   1   0   0   0   0   0   0   0   1   1   0   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   0   1   0   0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0   0 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   0   0   1   1   0   0   0   0   1   1   1   0   1   0   0   0   1   0   1   0 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   0   0   0   1   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0 
## 201 202 203 204 205 206 207 208 209 
##   0   0   0   1   0   1   0   0   0 
## Levels: 0 1

# Element-wise flag: TRUE where the predicted class equals the actual outcome
agreement <- Prediction == test$survived
print(agreement)

##   [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
##  [13]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
##  [25]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
##  [49]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
##  [61] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
##  [85]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
##  [97]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [133]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [145]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [157]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [169]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [181]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [193]  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [205]  TRUE FALSE  TRUE  TRUE  TRUE

# Count how many predictions disagree (FALSE) and agree (TRUE) with the actuals
table(agreement)

## agreement
## FALSE  TRUE 
##    69   140

# Print the agreement vector again
agreement

##   [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
##  [13]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
##  [25]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
##  [49]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
##  [61] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
##  [85]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
##  [97]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [133]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [145]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [157]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [169]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [181]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [193]  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [205]  TRUE FALSE  TRUE  TRUE  TRUE

# Proportion of mismatches (FALSE) vs. matches (TRUE): about 33% of the
# predictions are incorrect
prop.table(table(agreement))

## agreement
##     FALSE      TRUE 
## 0.3301435 0.6698565

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Example chunk: print summary statistics for the cars data set
summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Project 3: Using Regression Trees: Titanic Data Analysis

Tooba Maryam

2024-04-13

R Markdown

Including Plots