R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Load the passenger table from the pinned gist revision
titanic3 <- readr::read_csv(file = "https://gist.githubusercontent.com/robertness/0374e73fa396e2fc3c4f/raw/c23566973e82dba895d82d0693ced633862dcd22/titanic3.csv")
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer, which is not available when the document is
# knitted or the script is run non-interactively, so only call it from an
# interactive session.
if (interactive()) {
  View(titanic3)
}

# Build new dataset: keep the outcome plus the candidate predictors
titanic <- titanic3[
  ,
  c("survived", "embarked", "sex", "sibsp", "parch", "fare")
]
if (interactive()) {
  View(titanic)
}

# Statistical analysis: start with the structure of the reduced table
str(titanic)
## tibble [1,309 × 6] (S3: tbl_df/tbl/data.frame)
##  $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: chr [1:1309] "S" "S" "S" "S" ...
##  $ sex     : chr [1:1309] "female" "male" "female" "male" ...
##  $ sibsp   : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:1309] 211 152 152 152 152 ...
# Per-column summary: quartiles and mean for the numeric columns, and only
# length/class/mode for the character columns (embarked, sex).
summary(titanic)
##     survived       embarked             sex                sibsp       
##  Min.   :0.000   Length:1309        Length:1309        Min.   :0.0000  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:0.0000  
##  Median :0.000   Mode  :character   Mode  :character   Median :0.0000  
##  Mean   :0.382                                         Mean   :0.4989  
##  3rd Qu.:1.000                                         3rd Qu.:1.0000  
##  Max.   :1.000                                         Max.   :8.0000  
##                                                                        
##      parch            fare        
##  Min.   :0.000   Min.   :  0.000  
##  1st Qu.:0.000   1st Qu.:  7.896  
##  Median :0.000   Median : 14.454  
##  Mean   :0.385   Mean   : 33.295  
##  3rd Qu.:0.000   3rd Qu.: 31.275  
##  Max.   :9.000   Max.   :512.329  
##                  NA's   :1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Survival rate and passenger count for each sex
survival_sex <- summarise(
  group_by(titanic, sex),
  Survival_Rate = mean(survived),
  Total = n()
)
# Survival rate and passenger count for each port of embarkation
survival_embarked <- summarise(
  group_by(titanic, embarked),
  Survival_Rate = mean(survived),
  Total = n()
)

# Chi-squared test of independence between survival and sex
chisq.test(table(titanic$survived, titanic$sex))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(titanic$survived, titanic$sex)
## X-squared = 363.62, df = 1, p-value < 2.2e-16
# Chi-squared test of independence between survival and port of embarkation
chisq.test(table(titanic$survived, titanic$embarked))
## 
##  Pearson's Chi-squared test
## 
## data:  table(titanic$survived, titanic$embarked)
## X-squared = 44.242, df = 2, p-value = 2.472e-10
# Overall proportion of survivors (survived is coded 0/1, so the mean is the
# share of passengers who survived); NAs are dropped explicitly here
proportion_surv1 <- mean(titanic[["survived"]], na.rm = TRUE)
print(proportion_surv1)
## [1] 0.381971
# Same proportion computed without NA removal
surv_proportion <- mean(titanic[["survived"]])
print(surv_proportion)
## [1] 0.381971
# Remove NAs: drop every row that has a missing value in any column and
# inspect the result. The later steps shown in this file keep working on
# `titanic`; `titanicNA` is not referenced again.
titanicNA<- na.omit(titanic)
str(titanicNA)
## tibble [1,306 × 6] (S3: tbl_df/tbl/data.frame)
##  $ survived: num [1:1306] 1 1 0 0 0 1 1 0 1 0 ...
##  $ embarked: chr [1:1306] "S" "S" "S" "S" ...
##  $ sex     : chr [1:1306] "female" "male" "female" "male" ...
##  $ sibsp   : num [1:1306] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:1306] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:1306] 211 152 152 152 152 ...
##  - attr(*, "na.action")= 'omit' Named int [1:3] 169 285 1226
##   ..- attr(*, "names")= chr [1:3] "169" "285" "1226"
# Recode survived, embarked and sex as factors with readable labels
titanic <- titanic %>%
  mutate(
    survived = factor(survived, levels = c(0, 1), labels = c("No", "Yes")),
    embarked = factor(
      embarked,
      levels = c("C", "Q", "S"),
      labels = c("Cherbourg", "Queenstown", "Southampton")
    ),
    sex = factor(sex)
  )
str(titanic)
## tibble [1,309 × 6] (S3: tbl_df/tbl/data.frame)
##  $ survived: Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 1 2 1 ...
##  $ embarked: Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ sibsp   : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:1309] 211 152 152 152 152 ...
# Correlation matrix of the numeric variables.
# Recode the outcome as 0/1 by comparing against the "Yes" label rather than
# subtracting 1 from the factor's internal integer codes, so the result does
# not depend on the order in which the factor levels were declared.
titanic$survived_num <- as.numeric(titanic$survived == "Yes")
num_data <- titanic[c("survived_num", "fare", "sibsp", "parch")]
# "complete.obs" drops rows with a missing value before computing correlations
correlation_matrix <- cor(num_data, use = "complete.obs")
print(correlation_matrix)
##              survived_num      fare       sibsp      parch
## survived_num   1.00000000 0.2442655 -0.02812218 0.08241782
## fare           0.24426547 1.0000000  0.16023826 0.22153866
## sibsp         -0.02812218 0.1602383  1.00000000 0.37348524
## parch          0.08241782 0.2215387  0.37348524 1.00000000
#plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Stacked proportion bars: survival split within each sex
ggplot(data = titanic, mapping = aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  ggtitle("Survival by Sex") +
  ylab("Proportion of Total")

# Stacked proportion bars: survival split within each port of embarkation
ggplot(data = titanic, mapping = aes(x = embarked, fill = survived)) +
  geom_bar(position = "fill") +
  ggtitle("Survival by Embarkation Point") +
  ylab("Proportion of Total")

# Fare distribution compared between survivors and non-survivors
ggplot(data = titanic, mapping = aes(x = survived, y = fare, fill = survived)) +
  geom_boxplot() +
  ggtitle("Fare Distribution by Survival Status") +
  ylab("Fare")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Train/test split: fix the RNG seed first so the partition is reproducible,
# then put 80% of the rows in the training set and the rest in the test set
set.seed(seed = 1000)
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
split <- createDataPartition(y = titanic$survived, p = 0.8, list = FALSE)
training_data <- titanic[split, ]
testing_data <- titanic[-split, ]

#Using Rpart
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.3.2
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
# Classification tree for survival, fitted on the training rows
fit <- rpart(
  formula = survived ~ sex + sibsp + parch + fare + embarked,
  data = training_data,
  method = "class"
)
fancyRpartPlot(fit)

# Predicted class ("No"/"Yes") for every row of the held-out test set
Prediction <- predict(object = fit, newdata = testing_data, type = "class")
print(Prediction)
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## Yes Yes Yes  No  No Yes Yes  No Yes  No  No  No  No Yes Yes  No Yes  No  No Yes 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
## Yes Yes  No  No Yes  No Yes  No  No Yes  No  No  No Yes  No  No Yes Yes  No Yes 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##  No  No Yes  No Yes  No  No  No  No Yes  No  No Yes  No  No  No  No Yes  No Yes 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
## Yes Yes  No Yes  No  No  No  No  No  No  No Yes  No Yes  No Yes  No  No Yes Yes 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
## Yes  No Yes Yes Yes  No  No  No  No Yes  No Yes  No  No  No Yes Yes Yes Yes  No 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
## Yes  No  No  No  No  No  No  No Yes  No Yes Yes  No  No  No  No Yes  No  No  No 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##  No  No  No Yes  No  No  No Yes  No Yes  No  No  No Yes Yes  No  No  No Yes  No 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##  No  No  No  No  No  No Yes  No  No  No  No  No  No Yes  No  No  No  No Yes  No 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##  No  No  No  No  No Yes Yes  No  No  No  No  No Yes  No Yes  No Yes  No  No  No 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##  No Yes  No Yes  No Yes  No Yes  No Yes Yes  No  No  No  No  No Yes  No Yes Yes 
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 
##  No Yes Yes  No  No  No Yes Yes Yes Yes Yes  No  No  No Yes Yes  No Yes  No  No 
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
##  No  No  No  No  No Yes  No  No  No Yes Yes Yes  No Yes  No  No  No  No  No  No 
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 
##  No  No  No  No  No  No  No  No  No  No  No  No Yes  No  No  No  No  No  No  No 
## 261 
##  No 
## Levels: No Yes

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.