# Load required library
library(readr)
# 1,2: Fetch the Titanic passenger CSV from the given URL and store it in T3
T3 <- read_csv(file = "https://goo.gl/At238b")
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the column types and a preview of the values in the raw import
str(object = T3)
## spc_tbl_ [1,309 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ pclass : chr [1:1309] "1st" "1st" "1st" "1st" ...
## $ survived : num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
## $ name : chr [1:1309] "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
## $ sex : chr [1:1309] "female" "male" "female" "male" ...
## $ age : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : chr [1:1309] "24160" "113781" "113781" "113781" ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
## $ cabin : chr [1:1309] "B5" "C22 C26" "C22 C26" "C22 C26" ...
## $ embarked : chr [1:1309] "S" "S" "S" "S" ...
## $ boat : chr [1:1309] "2" "11" NA NA ...
## $ body : num [1:1309] NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: chr [1:1309] "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
## - attr(*, "spec")=
## .. cols(
## .. pclass = col_character(),
## .. survived = col_double(),
## .. name = col_character(),
## .. sex = col_character(),
## .. age = col_double(),
## .. sibsp = col_double(),
## .. parch = col_double(),
## .. ticket = col_character(),
## .. fare = col_double(),
## .. cabin = col_character(),
## .. embarked = col_character(),
## .. boat = col_character(),
## .. body = col_double(),
## .. home.dest = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Preview the leading rows of the raw import
head(x = T3)
## # A tibble: 6 × 14
## pclass survived name sex age sibsp parch ticket fare cabin embarked
## <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 1st 1 Allen, Mi… fema… 29 0 0 24160 211. B5 S
## 2 1st 1 Allison, … male 0.92 1 2 113781 152. C22 … S
## 3 1st 0 Allison, … fema… 2 1 2 113781 152. C22 … S
## 4 1st 0 Allison, … male 30 1 2 113781 152. C22 … S
## 5 1st 0 Allison, … fema… 25 1 2 113781 152. C22 … S
## 6 1st 1 Anderson,… male 48 0 0 19952 26.6 E12 S
## # ℹ 3 more variables: boat <chr>, body <dbl>, home.dest <chr>
# 3: Keep only the seven columns used for modelling in a new dataset, titanic
titanic <- T3[
  c("survived", "embarked", "age", "sex", "sibsp", "parch", "fare")
]
# 4: Descriptive statistics of the titanic dataset
# Inspect the column types of the reduced dataset
str(object = titanic)
## tibble [1,309 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: chr [1:1309] "S" "S" "S" "S" ...
## $ age : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sex : chr [1:1309] "female" "male" "female" "male" ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
# Number of rows and columns in the dataset
dim(x = titanic)
## [1] 1309 7
# Storage class of every column
sapply(X = titanic, FUN = class)
## survived embarked age sex sibsp parch
## "numeric" "character" "numeric" "character" "numeric" "numeric"
## fare
## "numeric"
# Per-column summary statistics of the titanic dataset
summary(object = titanic)
## survived embarked age sex
## Min. :0.000 Length:1309 Min. : 0.17 Length:1309
## 1st Qu.:0.000 Class :character 1st Qu.:21.00 Class :character
## Median :0.000 Mode :character Median :28.00 Mode :character
## Mean :0.382 Mean :29.88
## 3rd Qu.:1.000 3rd Qu.:39.00
## Max. :1.000 Max. :80.00
## NA's :263
## sibsp parch fare
## Min. :0.0000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
# Load the plotting library
library(ggplot2)
# Bar chart: number of passengers of each gender
ggplot(data = titanic, mapping = aes(x = sex)) +
  geom_bar(fill = "blue") +
  labs(title = "Gender Distribution", x = "Gender", y = "Count")
# Bar chart: number of passengers at each recorded age
ggplot(data = titanic, mapping = aes(x = age)) +
  geom_bar(fill = "blue") +
  labs(title = "Age Distribution", x = "Age", y = "Count")
## Warning: Removed 263 rows containing non-finite values (`stat_count()`).
# 5: Share of survivors in the dataset
S <- titanic[["survived"]]
# Absolute counts next to the same counts expressed as percentages
cbind(freq = table(S), percentage = 100 * prop.table(table(S)))
## freq percentage
## 0 809 61.8029
## 1 500 38.1971
# survived is coded 0/1, so its mean is the proportion of survivors
survived_proportion <- mean(titanic[["survived"]])
# Print the proportion
print(survived_proportion)
## [1] 0.381971
# Stacked bar chart: survival outcome within each gender
ggplot(data = titanic, mapping = aes(x = sex, fill = factor(survived))) +
  geom_bar() +
  labs(title = "Survivorship: Gender ", x = "Gender", y = "Count") +
  scale_fill_manual(
    values = c("red", "blue"),
    labels = c("Not Survived", "Survived")
  )
# Stacked bar chart: survival outcome within each port of embarkation
ggplot(
  data = titanic,
  mapping = aes(x = embarked, fill = factor(survived))
) +
  geom_bar() +
  labs(
    title = "Survivorship: on the basis of Port of Embarkation ",
    x = "Port of Embarkation",
    y = "Count"
  ) +
  scale_fill_manual(
    values = c("red", "blue"),
    labels = c("Not Survived", "Survived")
  )
# Box plot: fare paid, split by survival outcome (outliers are not drawn)
boxplot(
  fare ~ survived,
  data = titanic,
  main = "Survivorship: on the basis of Fare",
  xlab = "Survived",
  ylab = "Fare",
  col = c("red", "blue"),
  names = c("No", "Yes"),
  outline = FALSE
)
# Count the missing values in each column
titanic_na <- colSums(is.na(titanic))
print(titanic_na)
## survived embarked age sex sibsp parch fare
## 0 2 263 0 0 0 1
# 6: Drop every row that has a missing value in any column
titanic <- na.omit(object = titanic) # 266 incomplete records are removed
summary(object = titanic) # 1043 complete records remain
## survived embarked age sex
## Min. :0.0000 Length:1043 Min. : 0.17 Length:1043
## 1st Qu.:0.0000 Class :character 1st Qu.:21.00 Class :character
## Median :0.0000 Mode :character Median :28.00 Mode :character
## Mean :0.4075 Mean :29.81
## 3rd Qu.:1.0000 3rd Qu.:39.00
## Max. :1.0000 Max. :80.00
## sibsp parch fare
## Min. :0.0000 Min. :0.0000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 8.05
## Median :0.0000 Median :0.0000 Median : 15.75
## Mean :0.5043 Mean :0.4219 Mean : 36.60
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 35.08
## Max. :8.0000 Max. :6.0000 Max. :512.33
# 7: Turn the survived, embarked and sex columns into factors
titanic[["survived"]] <- factor(titanic[["survived"]])
titanic[["embarked"]] <- factor(titanic[["embarked"]])
titanic[["sex"]] <- factor(titanic[["sex"]])
# 8: Correlation matrix between survival and the other features
# The integer level codes of the factors stand in for sex and embarked
titanic[["Sex_num"]] <- as.numeric(titanic[["sex"]]) # female: 1, male: 2
titanic[["Embarked_num"]] <- as.numeric(titanic[["embarked"]])
head(titanic[["sex"]])
## [1] female male female male female male
## Levels: female male
head(titanic[["Sex_num"]])
## [1] 1 2 1 2 1 2
head(titanic[["embarked"]])
## [1] S S S S S S
## Levels: C Q S
head(titanic[["Embarked_num"]]) # C: 1, Q: 2, S: 3
## [1] 3 3 3 3 3 3
# Numeric versions of the remaining columns for the correlation calculation.
# as.numeric() on a factor returns its level codes, so Survived_num holds 1/2
# rather than 0/1; the Pearson correlation is unaffected by that shift.
titanic[["Survived_num"]] <- as.numeric(titanic[["survived"]])
titanic[["Age_num"]] <- as.numeric(titanic[["age"]])
titanic[["Sex_num"]] <- as.numeric(titanic[["sex"]])
# Load the correlation plotting package
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# Pairwise correlations between all numeric features
correlations <- cor(
  titanic[
    c(
      "Survived_num", "Age_num", "sibsp", "parch", "fare",
      "Sex_num", "Embarked_num"
    )
  ]
)
print(correlations)
## Survived_num Age_num sibsp parch fare
## Survived_num 1.00000000 -0.05741486 -0.01140343 0.11543601 0.2478576
## Age_num -0.05741486 1.00000000 -0.24234489 -0.14931063 0.1772057
## sibsp -0.01140343 -0.24234489 1.00000000 0.37395967 0.1421305
## parch 0.11543601 -0.14931063 0.37395967 1.00000000 0.2176495
## fare 0.24785762 0.17720569 0.14213054 0.21764954 1.0000000
## Sex_num -0.53633212 0.06600630 -0.09646420 -0.22253083 -0.1864003
## Embarked_num -0.20225751 -0.08326914 0.04550984 0.01122982 -0.3014545
## Sex_num Embarked_num
## Survived_num -0.5363321 -0.20225751
## Age_num 0.0660063 -0.08326914
## sibsp -0.0964642 0.04550984
## parch -0.2225308 0.01122982
## fare -0.1864003 -0.30145454
## Sex_num 1.0000000 0.10942541
## Embarked_num 0.1094254 1.00000000
# Correlation of survival with every other variable (one row of the matrix)
S_correlation <- correlations["Survived_num", ]
# Reading the coefficients:
# - Age_num (about -0.057): very weak negative correlation; younger
#   passengers had a slightly higher chance of survival.
# - sibsp (about -0.011): very weak negative correlation; passengers with
#   fewer siblings or spouses aboard had a marginally higher chance of survival.
# - parch (about 0.115): weak positive correlation; passengers with more
#   parents or children aboard had a slightly higher chance of survival.
# - fare (about 0.248): weak positive correlation; passengers who paid
#   higher fares had a slightly higher chance of survival.
# - Sex_num (about -0.536): moderate negative correlation; with female = 1
#   and male = 2, females had a higher chance of survival than males.
# - Embarked_num (about -0.202): weak negative correlation; passengers who
#   embarked from certain ports had a slightly higher chance of survival.
print(S_correlation)
## Survived_num Age_num sibsp parch fare Sex_num
## 1.00000000 -0.05741486 -0.01140343 0.11543601 0.24785762 -0.53633212
## Embarked_num
## -0.20225751
# 9: Visualise the correlation matrix
corrplot(corr = correlations, method = "circle")
# 10: Fix the random seed at 1000
set.seed(seed = 1000)
summary(object = titanic)
## survived embarked age sex sibsp
## 0:618 C:212 Min. : 0.17 female:386 Min. :0.0000
## 1:425 Q: 50 1st Qu.:21.00 male :657 1st Qu.:0.0000
## S:781 Median :28.00 Median :0.0000
## Mean :29.81 Mean :0.5043
## 3rd Qu.:39.00 3rd Qu.:1.0000
## Max. :80.00 Max. :8.0000
## parch fare Sex_num Embarked_num
## Min. :0.0000 Min. : 0.00 Min. :1.00 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 8.05 1st Qu.:1.00 1st Qu.:2.000
## Median :0.0000 Median : 15.75 Median :2.00 Median :3.000
## Mean :0.4219 Mean : 36.60 Mean :1.63 Mean :2.546
## 3rd Qu.:1.0000 3rd Qu.: 35.08 3rd Qu.:2.00 3rd Qu.:3.000
## Max. :6.0000 Max. :512.33 Max. :2.00 Max. :3.000
## Survived_num Age_num
## Min. :1.000 Min. : 0.17
## 1st Qu.:1.000 1st Qu.:21.00
## Median :1.000 Median :28.00
## Mean :1.407 Mean :29.81
## 3rd Qu.:2.000 3rd Qu.:39.00
## Max. :2.000 Max. :80.00
# Split the dataset at random into 80% training and 20% testing rows.
# seq_len() replaces 1:nrow(), which would yield c(1, 0) for an empty data
# frame, and floor() states the whole-number sample size explicitly instead of
# relying on sample() to truncate 0.8 * nrow. The rows drawn for a given seed
# are the same as before.
train_indices <- sample(
  seq_len(nrow(titanic)),
  size = floor(0.8 * nrow(titanic))
)
train <- titanic[train_indices, ]
test <- titanic[-train_indices, ]
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
# Train the learner on the training dataset and save the model in `fit`.
# method = "class" requests a classification tree for the factor response.
fit <- rpart(
  survived ~ sex + age + sibsp + parch + fare + embarked,
  data = train,
  method = "class"
)
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(RColorBrewer)
# Draw the fitted tree with rpart.plot
rpart.plot(
  fit,
  box.palette = "GnBu", # color scheme
  branch.lty = 3,       # dotted branch lines
  shadow.col = "gray",  # shadows under the node boxes
  nn = TRUE             # display the node numbers
)
# Draw the same tree again in rattle's fancier layout
fancyRpartPlot(fit)
# 11: Build the training dataset from a fixed range (rows 1 to 834) and the
# test dataset from the remaining rows (835 to 1043)
train <- titanic[1:834, ]
test <- titanic[835:1043, ]
# 12: Load the rpart package
library(rpart)
# 13: Train the learner on the training dataset and save the model in `fit`
fit <- rpart(
  survived ~ sex + age + sibsp + parch + fare + embarked,
  data = train,
  method = "class"
)
# 14,15: Plot the classification tree and save the plot into an image file
# One helper draws the tree, so the on-screen and saved plots always use the
# same styling.
plot_survival_tree <- function(model) {
  invisible(
    rpart.plot(
      model,
      box.palette = "GnBu", # color scheme
      branch.lty = 3,       # dotted branch lines
      shadow.col = "gray",  # shadows under the node boxes
      nn = TRUE             # display the node numbers
    )
  )
}
plot_survival_tree(fit)
# Save into the original project folder when it exists; otherwise fall back to
# the current working directory so the script also runs on other machines.
tree_png_path <- "D:/MS Sem 2/Data Minning/Projects/regression_tree.png"
if (!dir.exists(dirname(tree_png_path))) {
  tree_png_path <- basename(tree_png_path)
}
png(tree_png_path, width = 800, height = 600, res = 120)
plot_survival_tree(fit)
# Close the PNG device so the image is written to disk
dev.off()
## png
## 2
# 16: Use fancyRpartPlot(fit) to obtain a more readable plot of the tree
fancyRpartPlot(fit)
# The model summary lists the variable importance and the primary split at
# each node, which shows the feature the tree splits on first
summary(object = fit)
## Call:
## rpart(formula = survived ~ sex + age + sibsp + parch + fare +
## embarked, data = train, method = "class")
## n= 834
##
## CP nsplit rel error xerror xstd
## 1 0.54423592 0 1.0000000 1.0000000 0.03849577
## 2 0.03753351 1 0.4557641 0.4557641 0.03119009
## 3 0.01340483 2 0.4182306 0.4235925 0.03033961
## 4 0.01000000 5 0.3780161 0.4182306 0.03019154
##
## Variable importance
## sex fare age parch sibsp embarked
## 65 12 9 7 5 2
##
## Node number 1: 834 observations, complexity param=0.5442359
## predicted class=0 expected loss=0.4472422 P(node) =1
## class counts: 461 373
## probabilities: 0.553 0.447
## left son=2 (511 obs) right son=3 (323 obs)
## Primary splits:
## sex splits as RL, improve=142.00650, (0 missing)
## fare < 15.625 to the left, improve= 44.08751, (0 missing)
## parch < 0.5 to the left, improve= 21.33507, (0 missing)
## embarked splits as RLL, improve= 15.82247, (0 missing)
## age < 8.5 to the right, improve= 14.18318, (0 missing)
## Surrogate splits:
## parch < 0.5 to the left, agree=0.656, adj=0.111, (0 split)
## fare < 77.6229 to the left, agree=0.649, adj=0.093, (0 split)
## sibsp < 2.5 to the left, agree=0.616, adj=0.009, (0 split)
## age < 16.5 to the right, agree=0.615, adj=0.006, (0 split)
##
## Node number 2: 511 observations, complexity param=0.03753351
## predicted class=0 expected loss=0.2152642 P(node) =0.6127098
## class counts: 401 110
## probabilities: 0.785 0.215
## left son=4 (483 obs) right son=5 (28 obs)
## Primary splits:
## age < 10 to the right, improve=16.941050, (0 missing)
## fare < 26.26875 to the left, improve= 8.516230, (0 missing)
## parch < 0.5 to the left, improve= 5.474622, (0 missing)
## embarked splits as RLL, improve= 4.881588, (0 missing)
## sibsp < 0.5 to the left, improve= 1.056154, (0 missing)
## Surrogate splits:
## sibsp < 3.5 to the left, agree=0.951, adj=0.107, (0 split)
##
## Node number 3: 323 observations, complexity param=0.01340483
## predicted class=1 expected loss=0.1857585 P(node) =0.3872902
## class counts: 60 263
## probabilities: 0.186 0.814
## left son=6 (42 obs) right son=7 (281 obs)
## Primary splits:
## fare < 10.1625 to the left, improve=12.6432300, (0 missing)
## sibsp < 3.5 to the right, improve= 6.4895870, (0 missing)
## embarked splits as RLR, improve= 4.6826630, (0 missing)
## age < 28.5 to the left, improve= 1.8840550, (0 missing)
## parch < 1.5 to the right, improve= 0.5440428, (0 missing)
## Surrogate splits:
## embarked splits as RLR, agree=0.904, adj=0.262, (0 split)
##
## Node number 4: 483 observations
## predicted class=0 expected loss=0.184265 P(node) =0.5791367
## class counts: 394 89
## probabilities: 0.816 0.184
##
## Node number 5: 28 observations
## predicted class=1 expected loss=0.25 P(node) =0.03357314
## class counts: 7 21
## probabilities: 0.250 0.750
##
## Node number 6: 42 observations, complexity param=0.01340483
## predicted class=0 expected loss=0.452381 P(node) =0.05035971
## class counts: 23 19
## probabilities: 0.548 0.452
## left son=12 (28 obs) right son=13 (14 obs)
## Primary splits:
## age < 19.5 to the right, improve=2.8809520, (0 missing)
## fare < 8.35625 to the right, improve=1.6720240, (0 missing)
## sibsp < 0.5 to the right, improve=1.2135640, (0 missing)
## embarked splits as RLR, improve=0.1280423, (0 missing)
## Surrogate splits:
## fare < 7.74165 to the right, agree=0.714, adj=0.143, (0 split)
## parch < 0.5 to the left, agree=0.690, adj=0.071, (0 split)
## embarked splits as RLL, agree=0.690, adj=0.071, (0 split)
##
## Node number 7: 281 observations, complexity param=0.01340483
## predicted class=1 expected loss=0.1316726 P(node) =0.3369305
## class counts: 37 244
## probabilities: 0.132 0.868
## left son=14 (7 obs) right son=15 (274 obs)
## Primary splits:
## sibsp < 3.5 to the right, improve=7.556541, (0 missing)
## fare < 48.2 to the left, improve=4.423521, (0 missing)
## age < 11.5 to the left, improve=1.903240, (0 missing)
## parch < 1.5 to the right, improve=1.666061, (0 missing)
## embarked splits as RLL, improve=0.909409, (0 missing)
##
## Node number 12: 28 observations
## predicted class=0 expected loss=0.3214286 P(node) =0.03357314
## class counts: 19 9
## probabilities: 0.679 0.321
##
## Node number 13: 14 observations
## predicted class=1 expected loss=0.2857143 P(node) =0.01678657
## class counts: 4 10
## probabilities: 0.286 0.714
##
## Node number 14: 7 observations
## predicted class=0 expected loss=0.1428571 P(node) =0.008393285
## class counts: 6 1
## probabilities: 0.857 0.143
##
## Node number 15: 274 observations
## predicted class=1 expected loss=0.1131387 P(node) =0.3285372
## class counts: 31 243
## probabilities: 0.113 0.887
# The feature at the top of the tree (the root node) is typically considered
# the most important one for splitting the data.
# In this case it is 'sex', which aligns with the "Women and children first"
# principle observed in the Titanic disaster.
# The Variable importance section also shows that 'sex' has the highest
# importance value (65), which aligns with that motto.
# Predict the survival class for every row of the test dataset
Prediction <- predict(object = fit, newdata = test, type = "class")
print(Prediction)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 0 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 0 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 0 0 1 1 0 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 201 202 203 204 205 206 207 208 209
## 0 0 0 1 0 1 0 0 0
## Levels: 0 1
# Pair the sex of each test passenger with the predicted survival class
Results <- data.frame(
  PassengerSex = test[["sex"]],
  Survived = Prediction
)
print(Results)
## PassengerSex Survived
## 1 male 0
## 2 male 0
## 3 male 0
## 4 male 0
## 5 male 0
## 6 male 0
## 7 female 0
## 8 female 0
## 9 male 0
## 10 female 1
## 11 male 0
## 12 male 0
## 13 male 0
## 14 female 1
## 15 male 0
## 16 male 0
## 17 male 0
## 18 female 0
## 19 male 0
## 20 male 0
## 21 male 0
## 22 male 0
## 23 female 0
## 24 male 0
## 25 male 0
## 26 female 1
## 27 female 0
## 28 male 0
## 29 female 1
## 30 male 0
## 31 male 0
## 32 male 0
## 33 male 0
## 34 male 0
## 35 male 0
## 36 male 1
## 37 female 1
## 38 male 0
## 39 female 0
## 40 male 0
## 41 male 0
## 42 female 1
## 43 female 1
## 44 male 0
## 45 female 1
## 46 male 0
## 47 male 0
## 48 female 1
## 49 female 0
## 50 male 0
## 51 female 1
## 52 female 0
## 53 male 0
## 54 male 0
## 55 male 0
## 56 male 0
## 57 male 0
## 58 female 0
## 59 male 0
## 60 male 0
## 61 female 0
## 62 male 1
## 63 male 0
## 64 male 0
## 65 female 0
## 66 male 0
## 67 male 0
## 68 male 0
## 69 female 0
## 70 female 0
## 71 male 0
## 72 male 0
## 73 female 0
## 74 male 1
## 75 male 1
## 76 female 1
## 77 female 1
## 78 female 1
## 79 male 1
## 80 male 1
## 81 male 1
## 82 male 0
## 83 male 0
## 84 female 1
## 85 male 0
## 86 male 0
## 87 male 0
## 88 male 1
## 89 female 1
## 90 female 1
## 91 male 0
## 92 male 0
## 93 male 0
## 94 male 0
## 95 male 0
## 96 female 0
## 97 male 0
## 98 male 0
## 99 female 1
## 100 male 0
## 101 male 0
## 102 male 0
## 103 male 0
## 104 male 0
## 105 male 0
## 106 male 1
## 107 male 1
## 108 male 1
## 109 male 1
## 110 female 1
## 111 female 1
## 112 male 0
## 113 male 0
## 114 female 1
## 115 female 1
## 116 male 0
## 117 female 1
## 118 male 0
## 119 male 0
## 120 male 0
## 121 male 0
## 122 male 0
## 123 male 0
## 124 female 0
## 125 male 0
## 126 female 1
## 127 female 1
## 128 female 1
## 129 male 0
## 130 male 0
## 131 male 0
## 132 male 0
## 133 male 0
## 134 male 0
## 135 male 0
## 136 female 1
## 137 male 1
## 138 male 0
## 139 female 1
## 140 female 1
## 141 male 0
## 142 female 1
## 143 male 0
## 144 male 0
## 145 male 0
## 146 male 0
## 147 female 0
## 148 male 0
## 149 male 0
## 150 female 0
## 151 male 0
## 152 male 0
## 153 female 1
## 154 female 1
## 155 male 0
## 156 male 0
## 157 male 0
## 158 male 0
## 159 male 0
## 160 male 0
## 161 male 0
## 162 male 0
## 163 male 1
## 164 female 1
## 165 male 0
## 166 male 0
## 167 male 0
## 168 male 0
## 169 male 1
## 170 female 1
## 171 female 1
## 172 male 0
## 173 female 1
## 174 female 0
## 175 male 0
## 176 male 0
## 177 female 1
## 178 male 0
## 179 female 1
## 180 male 0
## 181 male 0
## 182 male 0
## 183 male 0
## 184 female 1
## 185 male 0
## 186 male 0
## 187 female 1
## 188 male 0
## 189 male 0
## 190 female 1
## 191 male 0
## 192 male 0
## 193 male 0
## 194 female 0
## 195 male 0
## 196 male 0
## 197 male 0
## 198 female 0
## 199 male 0
## 200 male 0
## 201 male 0
## 202 male 0
## 203 male 0
## 204 female 1
## 205 male 0
## 206 female 1
## 207 male 0
## 208 male 0
## 209 male 0
# Save the data frame to a CSV file
write.csv(Results, file = "Titanicdtree.csv", row.names = FALSE)
# Read the CSV file back in
TCT <- read.csv("Titanicdtree.csv")
# Open the data viewer only in interactive sessions. The original lower-case
# view() is not base R: it resolved to tibble::view() only because rattle
# attaches tibble. utils::View() removes that hidden dependency, and the
# guard keeps non-interactive runs (e.g. knitting) from trying to open a viewer.
if (interactive()) {
  utils::View(TCT)
}
#uploaded on Teams #https://tigernet365.sharepoint.com/:x:/s/CS583Spring24/EVGbP-FDTbVPvUwfrUTBykMB-Mxkl-qfDCMHh2oTBRVLZQ?e=xZ1iKW
# Cross-tabulate the actual outcomes (rows) against the predictions (columns)
conf_matrix <- table(Actual = test[["survived"]], Predicted = Prediction)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
## Predicted
## Actual 0 1
## 0 120 37
## 1 32 20
library(ggplot2)
# Reshape the confusion matrix into long form (one row per cell) for ggplot
conf_matrix_df <- as.data.frame(conf_matrix)
names(conf_matrix_df) <- c("Actual", "Predicted", "Count")
# Cell counts, treating class 1 (survived) as the positive class:
#   True positives (TP) = 20
#   True negatives (TN) = 120
#   False positives (FP) = 37
#   False negatives (FN) = 32
# Heat map of the confusion matrix with the count printed in each tile
ggplot(
  data = conf_matrix_df,
  mapping = aes(x = Predicted, y = Actual, fill = Count)
) +
  geom_tile(color = "white") +
  geom_text(mapping = aes(label = Count)) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Confusion Matrix", x = "Predicted", y = "Actual") +
  theme_minimal()
# Accuracy: the share of test rows whose prediction matches the actual outcome
accuracy <- mean(Prediction == test[["survived"]])
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.669856459330144"
# Calculate accuracy through an agreement vector; first cross-tabulate the
# predictions (rows) against the actual outcomes (columns)
table(Prediction, test[["survived"]])
##
## Prediction 0 1
## 0 120 32
## 1 37 20
print(Prediction)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 0 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 0 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 0 0 1 1 0 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 201 202 203 204 205 206 207 208 209
## 0 0 0 1 0 1 0 0 0
## Levels: 0 1
# TRUE where the predicted class equals the actual outcome, FALSE otherwise
agreement <- Prediction == test[["survived"]]
print(agreement)
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## [13] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE
## [49] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [61] FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [85] TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
## [97] TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [133] TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [145] TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [157] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [169] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [181] TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [193] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE FALSE TRUE TRUE TRUE
# Tally how many predictions disagree (FALSE) and agree (TRUE) with the actual outcome
table(agreement)
## agreement
## FALSE TRUE
## 69 140
print(agreement)
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## [13] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE
## [49] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [61] FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [85] TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
## [97] TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [133] TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [145] TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [157] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [169] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [181] TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [193] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE FALSE TRUE TRUE TRUE
# Share of incorrect (FALSE) and correct (TRUE) predictions: about 33% are incorrect
prop.table(x = table(agreement))
## agreement
## FALSE TRUE
## 0.3301435 0.6698565
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.