# Report the current working directory; relative paths resolve against it.
getwd()
## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/221024_Quiz2"
### Initialize
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
### Read the dataset
# Load the passenger data from train.csv in the working directory.
train <- read_csv(file = "train.csv")
## Rows: 1310 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (7): pclass, survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Question 1
# Narrow the data to the five columns of interest.
newdf <- train %>%
  select(survived, pclass, sex, age, fare)
### Question 2 Nine male passengers > 50 survived
# Survivors (survived == 1) older than 50. Rows where either condition is NA
# are dropped, because filter() keeps only rows where every condition is TRUE.
old_survive <- newdf %>%
  filter(survived == 1, age > 50)
# Tally those passengers by sex.
table(old_survive[["sex"]])
##
## female male
## 29 9
### Question 3 The mean age of those who lived was 28.9 years. The mean age of those who died was 30.5 years.
### The mean fare of those who lived was 49.36 (pounds?); the mean fare of those who died was 23.35 (pounds?).
### A class-based survival rate, it seems.
# Mean age and mean fare for each value of `survived`.
# summarise() needs an aggregating function: a bare `age` returns every row
# (1,310 of them) rather than one row per group. Some ages are missing, so
# na.rm = TRUE is required to get a numeric mean instead of NA.
# .groups = "drop" returns an ungrouped tibble and silences the regrouping
# message.
newdf %>%
  group_by(survived) %>%
  summarise(
    mean_age = mean(age, na.rm = TRUE),
    mean_fare = mean(fare, na.rm = TRUE),
    .groups = "drop"
  )
## (Printed output not shown: re-knit to regenerate. Expect one row per value
## of `survived`; the earlier run reported three groups, presumably 0, 1 and
## NA -- confirm.)
# Split the passengers by outcome.
lived <- newdf %>% filter(survived == 1)
died <- newdf %>% filter(survived == 0)
# Without na.rm the mean is NA because some ages are missing.
mean(lived[["age"]])
## [1] NA
mean(lived[["age"]], na.rm = TRUE)
## [1] 28.91823
mean(died[["age"]], na.rm = TRUE)
## [1] 30.54537
mean(lived[["fare"]], na.rm = TRUE)
## [1] 49.36118
mean(died[["fare"]], na.rm = TRUE)
## [1] 23.35383
### Question 4
# Show the data with an extra Fare_Today column (fare multiplied by 28.98).
# The result is only printed; newdf itself is not modified.
# NOTE(review): 28.98 is presumably a then-to-now price multiplier -- confirm.
mutate(newdf, Fare_Today = fare * 28.98)
## # A tibble: 1,310 × 6
## survived pclass sex age fare Fare_Today
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1 1 female 29 211. 6125.
## 2 1 1 male 0.917 152. 4392.
## 3 0 1 female 2 152. 4392.
## 4 0 1 male 30 152. 4392.
## 5 0 1 female 25 152. 4392.
## 6 1 1 male 48 26.6 769.
## 7 1 1 female 63 78.0 2259.
## 8 0 1 male 39 0 0
## 9 1 1 female 53 51.5 1492.
## 10 0 1 male 71 49.5 1435.
## # … with 1,300 more rows
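### Question 5 I'm lost on this one.
## NOTE(review): the attempt below errors as originally written, because
## `select -c(fare)` is passed as the row condition; the `select` argument
## needs `=`. If the goal is to drop the fare column, the corrected form is:
##newdf <- subset(newdf, select = -c(fare))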