This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Download the Titanic passenger table as a tibble. The URL embeds a revision
# hash of the gist, so the same file contents are fetched on every run.
titanic3 <- read_csv(
  file = "https://gist.githubusercontent.com/robertness/0374e73fa396e2fc3c4f/raw/c23566973e82dba895d82d0693ced633862dcd22/titanic3.csv"
)
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens an interactive data viewer. Knitting runs the chunks in a
# non-interactive session where no viewer is available, so only call it when
# someone is actually driving the session.
if (interactive()) {
  View(titanic3)
}

# Build new dataset ----
# Keep only the outcome (`survived`) and the columns analysed in the rest of
# the document.
titanic <- titanic3[
  ,
  c("survived", "embarked", "sex", "sibsp", "parch", "fare")
]
if (interactive()) {
  View(titanic)
}
# Statistical analysis ----
# Show the column types first, then the per-column summary statistics.
utils::str(object = titanic)
## tibble [1,309 × 6] (S3: tbl_df/tbl/data.frame)
## $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: chr [1:1309] "S" "S" "S" "S" ...
## $ sex : chr [1:1309] "female" "male" "female" "male" ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
summary(object = titanic)
## survived embarked sex sibsp
## Min. :0.000 Length:1309 Length:1309 Min. :0.0000
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:0.0000
## Median :0.000 Mode :character Mode :character Median :0.0000
## Mean :0.382 Mean :0.4989
## 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.000 Max. :8.0000
##
## parch fare
## Min. :0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 7.896
## Median :0.000 Median : 14.454
## Mean :0.385 Mean : 33.295
## 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :9.000 Max. :512.329
## NA's :1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Survival rate and passenger count for each sex.
survival_sex <- summarise(
  group_by(titanic, sex),
  Survival_Rate = mean(survived),
  Total = n()
)

# The same two figures for each port of embarkation.
survival_embarked <- summarise(
  group_by(titanic, embarked),
  Survival_Rate = mean(survived),
  Total = n()
)
# Chi-squared test of independence between survival and sex, run on the
# contingency table of the two columns. The recorded output below shows that
# Yates' continuity correction was applied (df = 1).
chisq.test(table(titanic$survived, titanic$sex))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(titanic$survived, titanic$sex)
## X-squared = 363.62, df = 1, p-value < 2.2e-16
# Same test for survival against port of embarkation.
# NOTE(review): the printed "data:" line echoes the call expression, so keep
# the table() call inline if the recorded output should stay comparable.
chisq.test(table(titanic$survived, titanic$embarked))
##
## Pearson's Chi-squared test
##
## data: table(titanic$survived, titanic$embarked)
## X-squared = 44.242, df = 2, p-value = 2.472e-10
# Overall survival proportion ----
# `survived` is still the numeric 0/1 column at this point, so its mean is the
# share of passengers who survived. It is computed once with missing values
# dropped and once without that option.
proportion_surv1 <- mean(titanic[["survived"]], na.rm = TRUE)
print(proportion_surv1)
## [1] 0.381971
surv_proportion <- mean(titanic[["survived"]])
print(surv_proportion)
# Remove NAs ----
# Drop every row that has a missing value in any column, then inspect the
# reduced table.
# NOTE(review): the later steps visible in this document keep using `titanic`
# rather than `titanicNA` — confirm which table is intended downstream.
titanicNA <- stats::na.omit(titanic)
utils::str(titanicNA)
## tibble [1,306 × 6] (S3: tbl_df/tbl/data.frame)
## $ survived: num [1:1306] 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: chr [1:1306] "S" "S" "S" "S" ...
## $ sex : chr [1:1306] "female" "male" "female" "male" ...
## $ sibsp : num [1:1306] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1306] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1306] 211 152 152 152 152 ...
## - attr(*, "na.action")= 'omit' Named int [1:3] 169 285 1226
## ..- attr(*, "names")= chr [1:3] "169" "285" "1226"
# Convert survived, embarked and sex to factors ----
# The 0/1 outcome and the one-letter port codes get readable labels; sex keeps
# its existing values as levels.
titanic[["survived"]] <- factor(
  titanic[["survived"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
titanic[["embarked"]] <- factor(
  titanic[["embarked"]],
  levels = c("C", "Q", "S"),
  labels = c("Cherbourg", "Queenstown", "Southampton")
)
titanic[["sex"]] <- factor(titanic[["sex"]])
utils::str(titanic)
## tibble [1,309 × 6] (S3: tbl_df/tbl/data.frame)
## $ survived: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 1 2 1 ...
## $ embarked: Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
# Correlation matrix ----
# Factor codes are 1 ("No") and 2 ("Yes"); subtracting 1 gives a 0/1 numeric
# copy of the outcome that can sit next to the other numeric columns.
titanic[["survived_num"]] <- as.numeric(titanic[["survived"]]) - 1
num_data <- titanic[, c("survived_num", "fare", "sibsp", "parch")]

# Only rows with no missing value in any of the four columns are used.
correlation_matrix <- cor(x = num_data, use = "complete.obs")
print(correlation_matrix)
## survived_num fare sibsp parch
## survived_num 1.00000000 0.2442655 -0.02812218 0.08241782
## fare 0.24426547 1.0000000 0.16023826 0.22153866
## sibsp -0.02812218 0.1602383 1.00000000 0.37348524
## parch 0.08241782 0.2215387 0.37348524 1.00000000
#plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Share of survivors within each sex; position = "fill" scales every bar to
# the same height so the fill shows proportions.
ggplot(data = titanic, mapping = aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  ggtitle("Survival by Sex") +
  ylab("Proportion of Total")

# Share of survivors within each port of embarkation.
ggplot(data = titanic, mapping = aes(x = embarked, fill = survived)) +
  geom_bar(position = "fill") +
  ggtitle("Survival by Embarkation Point") +
  ylab("Proportion of Total")

# Fare distribution for each survival status.
ggplot(
  data = titanic,
  mapping = aes(x = survived, y = fare, fill = survived)
) +
  geom_boxplot() +
  ggtitle("Fare Distribution by Survival Status") +
  ylab("Fare")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
# Train / test split ----
# Seed the random number generator first so the partition drawn below is the
# same on every run.
set.seed(1000)
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
# `split` holds the row indices selected for training (80% of the rows,
# partitioned on the outcome); the remaining rows form the test set.
split <- createDataPartition(y = titanic$survived, list = FALSE, p = 0.8)
training_data <- titanic[split, ]
testing_data <- titanic[-split, ]
#Using Rpart
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.3.2
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
# Fit a classification tree for survival on the training rows, using the five
# selected predictors.
fit <- rpart(
  formula = survived ~ sex + sibsp + parch + fare + embarked,
  data = training_data,
  method = "class"
)
fancyRpartPlot(fit)

# Predicted class ("No"/"Yes") for every held-out row.
Prediction <- predict(object = fit, newdata = testing_data, type = "class")
print(Prediction)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## Yes Yes Yes No No Yes Yes No Yes No No No No Yes Yes No Yes No No Yes
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## Yes Yes No No Yes No Yes No No Yes No No No Yes No No Yes Yes No Yes
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## No No Yes No Yes No No No No Yes No No Yes No No No No Yes No Yes
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## Yes Yes No Yes No No No No No No No Yes No Yes No Yes No No Yes Yes
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## Yes No Yes Yes Yes No No No No Yes No Yes No No No Yes Yes Yes Yes No
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## Yes No No No No No No No Yes No Yes Yes No No No No Yes No No No
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## No No No Yes No No No Yes No Yes No No No Yes Yes No No No Yes No
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## No No No No No No Yes No No No No No No Yes No No No No Yes No
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## No No No No No Yes Yes No No No No No Yes No Yes No Yes No No No
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## No Yes No Yes No Yes No Yes No Yes Yes No No No No No Yes No Yes Yes
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
## No Yes Yes No No No Yes Yes Yes Yes Yes No No No Yes Yes No Yes No No
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
## No No No No No Yes No No No Yes Yes Yes No Yes No No No No No No
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
## No No No No No No No No No No No No Yes No No No No No No No
## 261
## No
## Levels: No Yes
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.