# Attach dplyr. library() stops with an error if the package is not installed,
# whereas require() only returns FALSE and lets the script fail later on.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Import dataset: download the CSV and convert the resulting data frame to a
# tibble. as_tibble() is used instead of tbl_df(), which is deprecated in
# current dplyr releases.
titanic <- as_tibble(
  read.csv(file = "http://www.personal.psu.edu/dlp/w540/datasets/titanicsurvival.csv")
)
# Show the first rows of the imported tibble.
print(titanic)
## # A tibble: 2,201 x 4
## Class Age Sex Survive
## <int> <int> <int> <int>
## 1 1 1 1 1
## 2 1 1 1 1
## 3 1 1 1 1
## 4 1 1 1 1
## 5 1 1 1 1
## 6 1 1 1 1
## 7 1 1 1 1
## 8 1 1 1 1
## 9 1 1 1 1
## 10 1 1 1 1
## # ... with 2,191 more rows
# Compact overview: one line per column with its type and leading values.
titanic %>% glimpse()
## Observations: 2,201
## Variables: 4
## $ Class <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Age <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Sex <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Survive <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# Column names of the dataset.
colnames(titanic)
## [1] "Class" "Age" "Sex" "Survive"
# Number of rows and columns, in that order.
c(nrow(titanic), ncol(titanic))
## [1] 2201 4
# Manipulate data
# Total number of rows (one row per passenger record in this dataset).
summarise(titanic, num_passengers = n())
## # A tibble: 1 x 1
## num_passengers
## <int>
## 1 2201
# Mean of Survive within each Class; this is a survival proportion provided
# Survive is coded 0/1 (assumed from the column naming -- confirm).
titanic %>%
  group_by(Class) %>%
  summarise(prop_surv_class = mean(Survive))
## # A tibble: 4 x 2
## Class prop_surv_class
## <int> <dbl>
## 1 0 0.2395480
## 2 1 0.6246154
## 3 2 0.4140351
## 4 3 0.2521246
# Mean of Survive within each Sex level.
summarise(group_by(titanic, Sex), prop_surv_sex = mean(Survive))
## # A tibble: 2 x 2
## Sex prop_surv_sex
## <int> <dbl>
## 1 0 0.7319149
## 2 1 0.2120162
# Mean of Survive within each Age level.
titanic %>%
  group_by(Age) %>%
  summarise(prop_surv_age = mean(Survive))
## # A tibble: 2 x 2
## Age prop_surv_age
## <int> <dbl>
## 1 0 0.5229358
## 2 1 0.3126195
# Mean of Survive for every Age x Sex combination. summarise() peels off only
# the last grouping variable, so the printed result is still grouped by Age.
summarise(
  group_by(titanic, Age, Sex),
  prop_surv_Age_and_Sex = mean(Survive)
)
## # A tibble: 4 x 3
## # Groups: Age [?]
## Age Sex prop_surv_Age_and_Sex
## <int> <int> <dbl>
## 1 0 0 0.6222222
## 2 0 1 0.4531250
## 3 1 0 0.7435294
## 4 1 1 0.2027594
# Mean of Survive for every Age x Sex x Class combination that occurs in the
# data. The result stays grouped by Age and Sex after summarising.
titanic %>%
  group_by(Age, Sex, Class) %>%
  summarise(prop_surv_Age_and_Sex_and_Class = mean(Survive))
## # A tibble: 14 x 4
## # Groups: Age, Sex [?]
## Age Sex Class prop_surv_Age_and_Sex_and_Class
## <int> <int> <int> <dbl>
## 1 0 0 1 1.00000000
## 2 0 0 2 1.00000000
## 3 0 0 3 0.45161290
## 4 0 1 1 1.00000000
## 5 0 1 2 1.00000000
## 6 0 1 3 0.27083333
## 7 1 0 0 0.86956522
## 8 1 0 1 0.97222222
## 9 1 0 2 0.86021505
## 10 1 0 3 0.46060606
## 11 1 1 0 0.22273782
## 12 1 1 1 0.32571429
## 13 1 1 2 0.08333333
## 14 1 1 3 0.16233766
# Store the per-Class mean of Survive in a variable instead of only printing
# it, then display the stored table.
titanic_class_survive <- summarise(
  group_by(titanic, Class),
  prop_surv_class = mean(Survive)
)
print(titanic_class_survive)
## # A tibble: 4 x 2
## Class prop_surv_class
## <int> <dbl>
## 1 0 0.2395480
## 2 1 0.6246154
## 3 2 0.4140351
## 4 3 0.2521246