Chester Ismay
ETC 223
library(babynames)
suppressPackageStartupMessages(library(dplyr))
library(ggplot2)
library(devtools)
library(broom)
# Load the `babynames` data frame from the babynames package and inspect its
# structure. The data set is requested by its real name ("babynames"); the
# package does not ship a data set called "names".
data("babynames", package = "babynames")
str(babynames)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1792091 obs. of 5 variables:
## $ year: num 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
## $ sex : chr "F" "F" "F" "F" ...
## $ name: chr "Mary" "Anna" "Emma" "Elizabeth" ...
## $ n : int 7065 2604 2003 1939 1746 1578 1472 1414 1320 1288 ...
## $ prop: num 0.0724 0.0267 0.0205 0.0199 0.0179 ...
# Preview the first six rows of `babynames`; the recorded console output
# follows as `##` comments.
head(babynames)
## Source: local data frame [6 x 5]
##
## year sex name n prop
## (dbl) (chr) (chr) (int) (dbl)
## 1 1880 F Mary 7065 0.07238359
## 2 1880 F Anna 2604 0.02667896
## 3 1880 F Emma 2003 0.02052149
## 4 1880 F Elizabeth 1939 0.01986579
## 5 1880 F Minnie 1746 0.01788843
## 6 1880 F Margaret 1578 0.01616720
# Plot the yearly proportion of babies named "Chester".
# `data = .` places the piped data frame in qplot()'s `data` argument.
babynames %>%
  filter(name == "Chester") %>%
  qplot(year, prop, data = .)

# Same plot for "Chester" and "Karolyn" together, coloured by name.
babynames %>%
  filter(name %in% c("Chester", "Karolyn")) %>%
  qplot(year, prop, data = ., colour = name)

# Per-name mean and standard deviation of the proportion (`prop`) and of the
# count (`n`). The pipeline appeared twice in a row; it is kept once, followed
# by its recorded output.
babynames %>%
  filter(name %in% c("Chester", "Karolyn")) %>%
  group_by(name) %>%
  summarize(
    mean_prop = mean(prop),
    mean_n = mean(n),
    sd_prop = sd(prop),
    sd_n = sd(n)
  )
## Source: local data frame [2 x 5]
##
## name mean_prop mean_n sd_prop sd_n
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 Chester 8.555174e-04 608.66332 1.108811e-03 868.48528
## 2 Karolyn 3.439051e-05 56.38614 2.539501e-05 37.77591
# For each of the two names, keep the three rows ranked highest by top_n().
# No weighting column is given, so top_n() picks one itself; the recorded
# "Selecting by prop" message shows that it ranked by `prop`.
# The pipeline appeared twice in a row; it is kept once, followed by its
# recorded output.
babynames %>%
  filter(name %in% c("Chester", "Karolyn")) %>%
  group_by(name) %>%
  top_n(3)
## Selecting by prop
## Source: local data frame [6 x 5]
## Groups: name [2]
##
## year sex name n prop
## (dbl) (chr) (chr) (int) (dbl)
## 1 1916 M Chester 3212 0.0034789541
## 2 1917 M Chester 3338 0.0034794827
## 3 1918 M Chester 3691 0.0035194849
## 4 1941 F Karolyn 149 0.0001196058
## 5 1943 F Karolyn 154 0.0001073054
## 6 1946 F Karolyn 167 0.0001035495
# Keep "Chester" plus the spelling variants Karolyn/Carolyn/Caroline, and add
# `kname`, which labels every non-Chester row as "KarolynVar".
mod_names <- babynames %>%
  filter(name %in% c("Chester", "Karolyn", "Carolyn", "Caroline")) %>%
  mutate(kname = ifelse(name == "Chester", "Chester", "KarolynVar"))

# Proportion by year for each individual row, coloured by the pooled label.
mod_names %>% qplot(year, prop, data = ., colour = kname)

# Add up the proportions within each year and pooled label, so every
# (year, kname) pair contributes a single point to the next plot.
mod_names <- mod_names %>%
  group_by(year, kname) %>%
  summarize(sumprop = sum(prop))

mod_names %>% qplot(year, sumprop, data = ., colour = kname)

| R | Foreign Language | R examples |
|---|---|---|
| functions | verb | - sqrt()<br>- arrange()<br>- lm() |
| command | sentence | - exp(3)<br>- tail(babynames) |
KEY POINT - Exposure makes you fluent!
These two commands accomplish the same thing in R.
Data_Table %>% function_name (argument1, argument2)
function_name(Data_Table, argument1, argument2)

Chaining is much easier to read when you want to do a series of steps.
# Attach the package and load `flights` first: both pipelines below read
# `flights`, so the data must exist before they run.
library(pnwflights14)
data("flights", package = "pnwflights14")

# Nested form: the calls have to be read from the inside out
# (group_by -> select -> summarise -> filter).
filter(
  summarise(
    select(
      group_by(flights, year, month, day),
      arr_delay, dep_delay
    ),
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ),
  arr > 30 | dep > 30
)

# Chained form of the same computation: the steps read top to bottom.
# Keeps the days whose mean arrival or mean departure delay exceeds 30.
flights %>%
  group_by(year, month, day) %>%
  select(arr_delay, dep_delay) %>%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ) %>%
  filter(arr > 30 | dep > 30)

# Inspect the structure of the flights data (recorded output follows).
flights %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 162049 obs. of 16 variables:
## $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 1 4 8 28 34 37 346 526 527 536 ...
## $ dep_delay: num 96 -6 13 -2 44 82 227 -4 7 1 ...
## $ arr_time : int 235 738 548 800 325 747 936 1148 917 1334 ...
## $ arr_delay: num 70 -23 -4 -23 43 88 219 15 24 -6 ...
## $ carrier : chr "AS" "US" "UA" "US" ...
## $ tailnum : chr "N508AS" "N195UW" "N37422" "N547UW" ...
## $ flight : int 145 1830 1609 466 121 1823 1481 229 1576 478 ...
## $ origin : chr "PDX" "SEA" "PDX" "PDX" ...
## $ dest : chr "ANC" "CLT" "IAH" "CLT" ...
## $ air_time : num 194 252 201 251 201 224 202 217 136 268 ...
## $ distance : num 1542 2279 1825 2282 1448 ...
## $ hour : num 0 0 0 0 0 0 3 5 5 5 ...
## $ minute : num 1 4 8 28 34 37 46 26 27 36 ...
# Correlation between flight distance and arrival delay, computed separately
# for each origin airport; rows with missing values are left out.
flights %>%
  group_by(origin) %>%
  summarize(
    correl = cor(distance, arr_delay, use = "complete.obs")
  )
## Source: local data frame [2 x 2]
##
## origin correl
## (chr) (dbl)
## 1 PDX -0.07893240
## 2 SEA -0.04535174

# Correlation between departure delay and month over all flights.
flights %>%
  summarize(
    correl = cor(dep_delay, month, use = "complete.obs")
  )
## Source: local data frame [1 x 1]
##
## correl
## (dbl)
## 1 0.005312167

# Correlation between departure delay and arrival delay for flights that
# left from PDX.
flights %>%
  filter(origin == "PDX") %>%
  summarize(
    correl = cor(dep_delay, arr_delay, use = "complete.obs")
  )
## Source: local data frame [1 x 1]
##
## correl
## (dbl)
## 1 0.9388435
# Linear regression of arrival delay on departure delay, PDX departures only.
# broom::tidy() returns the coefficient table as a data frame.
lin_reg <- flights %>%
  filter(origin == "PDX") %>%
  lm(arr_delay ~ dep_delay, data = .)
broom::tidy(lin_reg)
## term estimate std.error statistic p.value
## 1 (Intercept) -3.9571898 0.049581177 -79.81234 0
## 2 dep_delay 0.9963924 0.001590334 626.53032 0

# Two-sample t-test of arrival delay by origin; the two group estimates are
# renamed after the airports they belong to.
t_test <- t.test(arr_delay ~ origin, data = flights)
broom::tidy(t_test) %>%
  rename(PDX = estimate1, SEA = estimate2)
## estimate PDX SEA statistic p.value parameter conf.low conf.high
## 1 -0.7970638 1.705651 2.502714 -4.707663 2.509114e-06 99052.6 -1.128913 -0.4652143

# Analysis of variance of arrival delay with carrier as the only factor.
anova_model <- flights %>%
  aov(arr_delay ~ carrier, data = .)
broom::tidy(anova_model)
## term df sumsq meansq statistic p.value
## 1 carrier 10 1448964 144896.3545 150.3188 1.635115e-315
## 2 Residuals 160737 154938701 963.9268 NA NA
Data Processing with dplyr & tidyr : https://rpubs.com/bradleyboehmke/data_wrangling
Introduction to dplyr : https://goo.gl/SY6qBy
Your assignment : Right click on me and Save
Solutions (Rmd) : Right click and Save
Solutions (HTML) : Click away…after you’ve tried!
All workshops in ETC 211 from 4 - 5 PM
September 16 - Data analysis with Stata
September 23 - Data analysis with R
Slides available at http://rpubs.com/cismay/dawr_workshop_2015