# Attach dplyr. library() stops with an error when the package is missing,
# whereas require() only warns and returns FALSE, deferring the failure to
# the first dplyr call further down.
library(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Import dataset
# Download the CSV and convert the resulting data frame to a tibble.
# as_tibble() is used instead of tbl_df(), which dplyr has deprecated.
titanic <- as_tibble(
  read.csv(
    file = "http://www.personal.psu.edu/dlp/w540/datasets/titanicsurvival.csv"
  )
)
# Auto-print the tibble to preview the first rows and the column types.
titanic
## # A tibble: 2,201 x 4
##    Class   Age   Sex Survive
##    <int> <int> <int>   <int>
##  1     1     1     1       1
##  2     1     1     1       1
##  3     1     1     1       1
##  4     1     1     1       1
##  5     1     1     1       1
##  6     1     1     1       1
##  7     1     1     1       1
##  8     1     1     1       1
##  9     1     1     1       1
## 10     1     1     1       1
## # ... with 2,191 more rows
# Column-wise overview: one line per variable with its type and first values.
titanic %>% glimpse()
## Observations: 2,201
## Variables: 4
## $ Class   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Age     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Sex     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Survive <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# List the column names of the dataset.
colnames(titanic)
## [1] "Class"   "Age"     "Sex"     "Survive"
# Report the number of rows and columns.
titanic %>% dim()
## [1] 2201    4
# Manipulate data
# Total number of rows in the dataset, reported as num_passengers.
summarise(titanic, num_passengers = n())
## # A tibble: 1 x 1
##   num_passengers
##            <int>
## 1           2201
# Mean of Survive within each Class (a survival proportion, assuming Survive
# is coded 0/1).
titanic %>%
  group_by(Class) %>%
  summarise(prop_surv_class = mean(Survive))
## # A tibble: 4 x 2
##   Class prop_surv_class
##   <int>           <dbl>
## 1     0       0.2395480
## 2     1       0.6246154
## 3     2       0.4140351
## 4     3       0.2521246
# Mean of Survive within each Sex group.
titanic %>%
  group_by(Sex) %>%
  summarise(prop_surv_sex = mean(Survive))
## # A tibble: 2 x 2
##     Sex prop_surv_sex
##   <int>         <dbl>
## 1     0     0.7319149
## 2     1     0.2120162
# Mean of Survive within each Age group.
titanic %>%
  group_by(Age) %>%
  summarise(prop_surv_age = mean(Survive))
## # A tibble: 2 x 2
##     Age prop_surv_age
##   <int>         <dbl>
## 1     0     0.5229358
## 2     1     0.3126195
# Mean of Survive for every observed Age x Sex combination.
titanic %>%
  group_by(Age, Sex) %>%
  summarise(prop_surv_Age_and_Sex = mean(Survive))
## # A tibble: 4 x 3
## # Groups:   Age [?]
##     Age   Sex prop_surv_Age_and_Sex
##   <int> <int>                 <dbl>
## 1     0     0             0.6222222
## 2     0     1             0.4531250
## 3     1     0             0.7435294
## 4     1     1             0.2027594
# Mean of Survive for every observed Age x Sex x Class combination.
summarise(
  group_by(titanic, Age, Sex, Class),
  prop_surv_Age_and_Sex_and_Class = mean(Survive)
)
## # A tibble: 14 x 4
## # Groups:   Age, Sex [?]
##      Age   Sex Class prop_surv_Age_and_Sex_and_Class
##    <int> <int> <int>                           <dbl>
##  1     0     0     1                      1.00000000
##  2     0     0     2                      1.00000000
##  3     0     0     3                      0.45161290
##  4     0     1     1                      1.00000000
##  5     0     1     2                      1.00000000
##  6     0     1     3                      0.27083333
##  7     1     0     0                      0.86956522
##  8     1     0     1                      0.97222222
##  9     1     0     2                      0.86021505
## 10     1     0     3                      0.46060606
## 11     1     1     0                      0.22273782
## 12     1     1     1                      0.32571429
## 13     1     1     2                      0.08333333
## 14     1     1     3                      0.16233766
# Store the per-Class mean of Survive in titanic_class_survive, then print it.
titanic_class_survive <- summarise(
  group_by(titanic, Class),
  prop_surv_class = mean(Survive)
)
titanic_class_survive
## # A tibble: 4 x 2
##   Class prop_surv_class
##   <int>           <dbl>
## 1     0       0.2395480
## 2     1       0.6246154
## 3     2       0.4140351
## 4     3       0.2521246