library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
x1 <- c("Dec", "Apr", "Jan", "Mar")
x2 <- c("Dec", "Apr", "Jam", "Mar")
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
#> [1] "Apr" "Dec" "Jan" "Mar"
month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
y1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> [1] Dec Apr Jan Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> [1] Jan Mar Apr Dec
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
y2 <- factor(x2, levels = month_levels)
y2
## [1] Dec Apr <NA> Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> [1] Dec Apr <NA> Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
y2 <- parse_factor(x2, levels = month_levels)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set Jam
#> Warning: 1 parsing failure.
#> row col expected actual
#> 3 -- value in level set Jam
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
#> [1] Dec Apr Jan Mar
#> Levels: Apr Dec Jan Mar
f1 <- factor(x1, levels = unique(x1))
f1
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
f2 <- x1 %>% factor() %>% fct_inorder()
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
levels(f2)
## [1] "Dec" "Apr" "Jan" "Mar"
#> [1] "Dec" "Apr" "Jan" "Mar"
##General Social Survey
gss_cat
## # A tibble: 21,483 × 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near … Prot… Sout… 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str r… Prot… Bapt… NA
## 3 2000 Widowed 67 White Not applicable Independe… Prot… No d… 2
## 4 2000 Never married 39 White Not applicable Ind,near … Orth… Not … 4
## 5 2000 Divorced 25 White Not applicable Not str d… None Not … 1
## 6 2000 Married 25 White $20000 - 24999 Strong de… Prot… Sout… NA
## 7 2000 Never married 36 White $25000 or more Not str r… Chri… Not … 3
## 8 2000 Divorced 44 White $7000 to 7999 Ind,near … Prot… Luth… NA
## 9 2000 Married 44 White $25000 or more Not str d… Prot… Other 0
## 10 2000 Married 47 White $25000 or more Strong re… Prot… Sout… 3
## # ℹ 21,473 more rows
#> # A tibble: 21,483 × 9
#> year marital age race rincome partyid relig denom tvhours
#> <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
#> 1 2000 Never married 26 White $8000 to 9999 Ind,near r… Prot… Sout… 12
#> 2 2000 Divorced 48 White $8000 to 9999 Not str re… Prot… Bapt… NA
#> 3 2000 Widowed 67 White Not applicable Independent Prot… No d… 2
#> 4 2000 Never married 39 White Not applicable Ind,near r… Orth… Not … 4
#> 5 2000 Divorced 25 White Not applicable Not str de… None Not … 1
#> 6 2000 Married 25 White $20000 - 24999 Strong dem… Prot… Sout… NA
#> # ℹ 21,477 more rows
gss_cat %>%
count(race)
## # A tibble: 3 × 2
## race n
## <fct> <int>
## 1 Other 1959
## 2 Black 3129
## 3 White 16395
#> # A tibble: 3 × 2
#> race n
#> <fct> <int>
#> 1 Other 1959
#> 2 Black 3129
#> 3 White 16395
ggplot(gss_cat, aes(race)) +
geom_bar()
ggplot(gss_cat, aes(race)) +
geom_bar() +
scale_x_discrete(drop = FALSE)
rincome_plot <-
gss_cat %>%
ggplot(aes(x = rincome)) +
geom_bar()
rincome_plot
rincome_plot +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
rincome_plot +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
rincome_plot +
coord_flip()
Though more than asked for in this question, I could further improve this plot by
removing the “Not applicable” responses, renaming “Lt $1000” to “Less than $1000”, using color to distinguish non-response categories (“Refused”, “Don’t know”, and “No answer”) from income levels (“Lt $1000”, …), adding meaningful y- and x-axis titles, and formatting the counts axis labels to use commas.
gss_cat %>%
filter(!rincome %in% c("Not applicable")) %>%
mutate(rincome = fct_recode(rincome,
"Less than $1000" = "Lt $1000"
)) %>%
mutate(rincome_na = rincome %in% c("Refused", "Don't know", "No answer")) %>%
ggplot(aes(x = rincome, fill = rincome_na)) +
geom_bar() +
coord_flip() +
scale_y_continuous("Number of Respondents", labels = scales::comma) +
scale_x_discrete("Respondent's Income") +
scale_fill_manual(values = c("FALSE" = "black", "TRUE" = "gray")) +
theme(legend.position = "None")
gss_cat %>%
filter(!rincome %in% c("Not applicable", "Don't know", "No answer", "Refused")) %>%
mutate(rincome = fct_recode(rincome,
"Less than $1000" = "Lt $1000"
)) %>%
ggplot(aes(x = rincome)) +
geom_bar() +
coord_flip() +
scale_y_continuous("Number of Respondents", labels = scales::comma) +
scale_x_discrete("Respondent's Income")
What is the most common relig in this survey? What’s the most common partyid? The most common relig is “Protestant” The most common partyid is “Independent”
Which relig does denom (denomination) apply to? How can you find out with a table? How can you find out with a visualization?
levels(gss_cat$denom)
## [1] "No answer" "Don't know" "No denomination"
## [4] "Other" "Episcopal" "Presbyterian-dk wh"
## [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
## [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
## [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
## [19] "Other methodist" "United methodist" "Afr meth ep zion"
## [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
## [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
## [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
#> [1] "No answer" "Don't know" "No denomination"
#> [4] "Other" "Episcopal" "Presbyterian-dk wh"
#> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
#> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
#> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
#> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
#> [19] "Other methodist" "United methodist" "Afr meth ep zion"
#> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
#> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
#> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
gss_cat %>%
filter(!denom %in% c(
"No answer", "Other", "Don't know", "Not applicable",
"No denomination"
)) %>%
count(relig)
## # A tibble: 1 × 2
## relig n
## <fct> <int>
## 1 Protestant 7025
#> # A tibble: 1 x 2
#> relig n
#> <fct> <int>
#> 1 Protestant 7025
gss_cat %>%
count(relig, denom) %>%
ggplot(aes(x = relig, y = denom, size = n)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))
##Modifying factor order
relig_summary <- gss_cat %>%
group_by(relig) %>%
summarise(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(relig_summary, aes(tvhours, relig)) + geom_point()
ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
geom_point()
relig_summary %>%
mutate(relig = fct_reorder(relig, tvhours)) %>%
ggplot(aes(tvhours, relig)) +
geom_point()
rincome_summary <- gss_cat %>%
group_by(rincome) %>%
summarise(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(rincome_summary, aes(age, fct_reorder(rincome, age))) + geom_point()
ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) +
geom_point()
by_age <- gss_cat %>%
filter(!is.na(age)) %>%
count(age, marital) %>%
group_by(age) %>%
mutate(prop = n / sum(n))
ggplot(by_age, aes(age, prop, colour = marital)) +
geom_line(na.rm = TRUE)
ggplot(by_age, aes(age, prop, colour = fct_reorder2(marital, age, prop))) +
geom_line() +
labs(colour = "marital")
gss_cat %>%
mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
ggplot(aes(marital)) +
geom_bar()
summary(gss_cat[["tvhours"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 2.000 2.981 4.000 24.000 10146
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0 1 2 3 4 24 10146
gss_cat %>%
filter(!is.na(tvhours)) %>%
ggplot(aes(x = tvhours)) +
geom_histogram(binwidth = 1)
2.For each factor in gss_cat identify whether the order of the levels is arbitrary or principled.
keep(gss_cat, is.factor) %>% names()
## [1] "marital" "race" "rincome" "partyid" "relig" "denom"
#> [1] "marital" "race" "rincome" "partyid" "relig" "denom"
levels(gss_cat[["marital"]])
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
#> [1] "No answer" "Never married" "Separated" "Divorced"
#> [5] "Widowed" "Married"
gss_cat %>%
ggplot(aes(x = marital)) +
geom_bar()
levels(gss_cat$race)
## [1] "Other" "Black" "White" "Not applicable"
#> [1] "Other" "Black" "White" "Not applicable"
gss_cat %>%
ggplot(aes(race)) +
geom_bar() +
scale_x_discrete(drop = FALSE)
levels(gss_cat$rincome)
## [1] "No answer" "Don't know" "Refused" "$25000 or more"
## [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
## [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
## [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
#> [1] "No answer" "Don't know" "Refused" "$25000 or more"
#> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
#> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
#> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
levels(gss_cat$relig)
## [1] "No answer" "Don't know"
## [3] "Inter-nondenominational" "Native american"
## [5] "Christian" "Orthodox-christian"
## [7] "Moslem/islam" "Other eastern"
## [9] "Hinduism" "Buddhism"
## [11] "Other" "None"
## [13] "Jewish" "Catholic"
## [15] "Protestant" "Not applicable"
#> [1] "No answer" "Don't know"
#> [3] "Inter-nondenominational" "Native american"
#> [5] "Christian" "Orthodox-christian"
#> [7] "Moslem/islam" "Other eastern"
#> [9] "Hinduism" "Buddhism"
#> [11] "Other" "None"
#> [13] "Jewish" "Catholic"
#> [15] "Protestant" "Not applicable"
gss_cat %>%
ggplot(aes(relig)) +
geom_bar() +
coord_flip()
levels(gss_cat$denom)
## [1] "No answer" "Don't know" "No denomination"
## [4] "Other" "Episcopal" "Presbyterian-dk wh"
## [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
## [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
## [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
## [19] "Other methodist" "United methodist" "Afr meth ep zion"
## [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
## [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
## [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
#> [1] "No answer" "Don't know" "No denomination"
#> [4] "Other" "Episcopal" "Presbyterian-dk wh"
#> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
#> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
#> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
#> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
#> [19] "Other methodist" "United methodist" "Afr meth ep zion"
#> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
#> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
#> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
levels(gss_cat$partyid)
## [1] "No answer" "Don't know" "Other party"
## [4] "Strong republican" "Not str republican" "Ind,near rep"
## [7] "Independent" "Ind,near dem" "Not str democrat"
## [10] "Strong democrat"
#> [1] "No answer" "Don't know" "Other party"
#> [4] "Strong republican" "Not str republican" "Ind,near rep"
#> [7] "Independent" "Ind,near dem" "Not str democrat"
#> [10] "Strong democrat"
3.Why did moving “Not applicable” to the front of the levels move it to the bottom of the plot? Because that gives the level “Not applicable” an integer value of 1.
gss_cat %>% count(partyid)
## # A tibble: 10 × 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
#> # A tibble: 10 × 2
#> partyid n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Strong republican 2314
#> 5 Not str republican 3032
#> 6 Ind,near rep 1791
#> # ℹ 4 more rows
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)
## # A tibble: 10 × 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Republican, strong 2314
## 5 Republican, weak 3032
## 6 Independent, near rep 1791
## 7 Independent 4119
## 8 Independent, near dem 2499
## 9 Democrat, weak 3690
## 10 Democrat, strong 3490
#> # A tibble: 10 × 2
#> partyid n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Republican, strong 2314
#> 5 Republican, weak 3032
#> 6 Independent, near rep 1791
#> # ℹ 4 more rows
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat",
"Other" = "No answer",
"Other" = "Don't know",
"Other" = "Other party"
)) %>%
count(partyid)
## # A tibble: 8 × 2
## partyid n
## <fct> <int>
## 1 Other 548
## 2 Republican, strong 2314
## 3 Republican, weak 3032
## 4 Independent, near rep 1791
## 5 Independent 4119
## 6 Independent, near dem 2499
## 7 Democrat, weak 3690
## 8 Democrat, strong 3490
#> # A tibble: 8 × 2
#> partyid n
#> <fct> <int>
#> 1 Other 548
#> 2 Republican, strong 2314
#> 3 Republican, weak 3032
#> 4 Independent, near rep 1791
#> 5 Independent 4119
#> 6 Independent, near dem 2499
#> # ℹ 2 more rows
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)) %>%
count(partyid)
## # A tibble: 4 × 2
## partyid n
## <fct> <int>
## 1 other 548
## 2 rep 5346
## 3 ind 8409
## 4 dem 7180
#> # A tibble: 4 × 2
#> partyid n
#> <fct> <int>
#> 1 other 548
#> 2 rep 5346
#> 3 ind 8409
#> 4 dem 7180
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)
## # A tibble: 2 × 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Other 10637
#> # A tibble: 2 × 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Other 10637
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = TRUE) %>%
print(n = Inf)
## # A tibble: 10 × 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Christian 689
## 5 Other 458
## 6 Jewish 388
## 7 Buddhism 147
## 8 Inter-nondenominational 109
## 9 Moslem/islam 104
## 10 Orthodox-christian 95
#> # A tibble: 10 × 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Catholic 5124
#> 3 None 3523
#> 4 Christian 689
#> 5 Other 458
#> 6 Jewish 388
#> 7 Buddhism 147
#> 8 Inter-nondenominational 109
#> 9 Moslem/islam 104
#> 10 Orthodox-christian 95
1.How have the proportions of people identifying as Democrat, Republican, and Independent changed over time?
levels(gss_cat$partyid)
## [1] "No answer" "Don't know" "Other party"
## [4] "Strong republican" "Not str republican" "Ind,near rep"
## [7] "Independent" "Ind,near dem" "Not str democrat"
## [10] "Strong democrat"
#> [1] "No answer" "Don't know" "Other party"
#> [4] "Strong republican" "Not str republican" "Ind,near rep"
#> [7] "Independent" "Ind,near dem" "Not str democrat"
#> [10] "Strong democrat"
gss_cat %>%
mutate(
partyid =
fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)
) %>%
count(year, partyid) %>%
group_by(year) %>%
mutate(p = n / sum(n)) %>%
ggplot(aes(
x = year, y = p,
colour = fct_reorder2(partyid, year, p)
)) +
geom_point() +
geom_line() +
labs(colour = "Party ID.")
2.How could you collapse rincome into a small set of categories?
levels(gss_cat$rincome)
## [1] "No answer" "Don't know" "Refused" "$25000 or more"
## [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
## [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
## [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
#> [1] "No answer" "Don't know" "Refused" "$25000 or more"
#> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
#> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
#> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
library("stringr")
gss_cat %>%
mutate(
rincome =
fct_collapse(
rincome,
`Unknown` = c("No answer", "Don't know", "Refused", "Not applicable"),
`Lt $5000` = c("Lt $1000", str_c(
"$", c("1000", "3000", "4000"),
" to ", c("2999", "3999", "4999")
)),
`$5000 to 10000` = str_c(
"$", c("5000", "6000", "7000", "8000"),
" to ", c("5999", "6999", "7999", "9999")
)
)
) %>%
ggplot(aes(x = rincome)) +
geom_bar() +
coord_flip()
library(tidyverse)
library(lubridate)
library(nycflights13)
today()
## [1] "2024-11-25"
#> [1] "2023-07-24"
now()
## [1] "2024-11-25 20:28:19 EST"
#> [1] "2023-07-24 15:08:49 UTC"
ymd("2017-01-31")
## [1] "2017-01-31"
#> [1] "2017-01-31"
mdy("January 31st, 2017")
## [1] "2017-01-31"
#> [1] "2017-01-31"
dmy("31-Jan-2017")
## [1] "2017-01-31"
#> [1] "2017-01-31"
ymd(20170131)
## [1] "2017-01-31"
#> [1] "2017-01-31"
ymd_hms("2017-01-31 20:11:59")
## [1] "2017-01-31 20:11:59 UTC"
#> [1] "2017-01-31 20:11:59 UTC"
mdy_hm("01/31/2017 08:01")
## [1] "2017-01-31 08:01:00 UTC"
#> [1] "2017-01-31 08:01:00 UTC"
ymd(20170131, tz = "UTC")
## [1] "2017-01-31 UTC"
#> [1] "2017-01-31 UTC"
flights %>%
select(year, month, day, hour, minute)
## # A tibble: 336,776 × 5
## year month day hour minute
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 5 15
## 2 2013 1 1 5 29
## 3 2013 1 1 5 40
## 4 2013 1 1 5 45
## 5 2013 1 1 6 0
## 6 2013 1 1 5 58
## 7 2013 1 1 6 0
## 8 2013 1 1 6 0
## 9 2013 1 1 6 0
## 10 2013 1 1 6 0
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 5
#> year month day hour minute
#> <int> <int> <int> <dbl> <dbl>
#> 1 2013 1 1 5 15
#> 2 2013 1 1 5 29
#> 3 2013 1 1 5 40
#> 4 2013 1 1 5 45
#> 5 2013 1 1 6 0
#> 6 2013 1 1 5 58
#> # ℹ 336,770 more rows
flights %>%
select(year, month, day, hour, minute) %>%
mutate(departure = make_datetime(year, month, day, hour, minute))
## # A tibble: 336,776 × 6
## year month day hour minute departure
## <int> <int> <int> <dbl> <dbl> <dttm>
## 1 2013 1 1 5 15 2013-01-01 05:15:00
## 2 2013 1 1 5 29 2013-01-01 05:29:00
## 3 2013 1 1 5 40 2013-01-01 05:40:00
## 4 2013 1 1 5 45 2013-01-01 05:45:00
## 5 2013 1 1 6 0 2013-01-01 06:00:00
## 6 2013 1 1 5 58 2013-01-01 05:58:00
## 7 2013 1 1 6 0 2013-01-01 06:00:00
## 8 2013 1 1 6 0 2013-01-01 06:00:00
## 9 2013 1 1 6 0 2013-01-01 06:00:00
## 10 2013 1 1 6 0 2013-01-01 06:00:00
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 6
#> year month day hour minute departure
#> <int> <int> <int> <dbl> <dbl> <dttm>
#> 1 2013 1 1 5 15 2013-01-01 05:15:00
#> 2 2013 1 1 5 29 2013-01-01 05:29:00
#> 3 2013 1 1 5 40 2013-01-01 05:40:00
#> 4 2013 1 1 5 45 2013-01-01 05:45:00
#> 5 2013 1 1 6 0 2013-01-01 06:00:00
#> 6 2013 1 1 5 58 2013-01-01 05:58:00
#> # ℹ 336,770 more rows
make_datetime_100 <- function(year, month, day, time) {
make_datetime(year, month, day, time %/% 100, time %% 100)
}
flights_dt <- flights %>%
filter(!is.na(dep_time), !is.na(arr_time)) %>%
mutate(
dep_time = make_datetime_100(year, month, day, dep_time),
arr_time = make_datetime_100(year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
) %>%
select(origin, dest, ends_with("delay"), ends_with("time"))
flights_dt
## # A tibble: 328,063 × 9
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
## 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
## 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
## 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
## 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
## 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
## 7 EWR FLL -5 19 2013-01-01 05:55:00 2013-01-01 06:00:00
## 8 LGA IAD -3 -14 2013-01-01 05:57:00 2013-01-01 06:00:00
## 9 JFK MCO -3 -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA ORD -2 8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # ℹ 328,053 more rows
## # ℹ 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>
#> # A tibble: 328,063 × 9
#> origin dest dep_delay arr_delay dep_time sched_dep_time
#> <chr> <chr> <dbl> <dbl> <dttm> <dttm>
#> 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
#> 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
#> 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
#> 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
#> 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
#> 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
#> # ℹ 328,057 more rows
#> # ℹ 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>
flights_dt %>%
ggplot(aes(dep_time)) +
geom_freqpoly(binwidth = 86400) # 86400 seconds = 1 day
flights_dt %>%
filter(dep_time < ymd(20130102)) %>%
ggplot(aes(dep_time)) +
geom_freqpoly(binwidth = 600) # 600 s = 10 minutes
as_datetime(today())
## [1] "2024-11-25 UTC"
#> [1] "2023-07-24 UTC"
as_date(now())
## [1] "2024-11-25"
#> [1] "2023-07-24"
as_datetime(60 * 60 * 10)
## [1] "1970-01-01 10:00:00 UTC"
#> [1] "1970-01-01 10:00:00 UTC"
as_date(365 * 10 + 2)
## [1] "1980-01-01"
#> [1] "1980-01-01"
make_datetime_100 <- function(year, month, day, time) {
make_datetime(year, month, day, time %/% 100, time %% 100)
}
flights_dt <- flights %>%
filter(!is.na(dep_time), !is.na(arr_time)) %>%
mutate(
dep_time = make_datetime_100(year, month, day, dep_time),
arr_time = make_datetime_100(year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
) %>%
select(origin, dest, ends_with("delay"), ends_with("time"))
1.What happens if you parse a string that contains invalid dates?
ymd(c("2010-10-10", "bananas"))
## Warning: 1 failed to parse.
## [1] "2010-10-10" NA
it produces an NA and a warning message. 2.What does the tzone argument to today() do? Why is it important? It determines the time-zone of the date. Since different time-zones can have different dates, the value of today() can vary depending on the time-zone specified. 3.Use the appropriate lubridate function to parse each of the following dates:
d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
d4 <- c("August 19 (2015)", "July 1 (2015)")
d5 <- "12/30/14" # Dec 30, 2014
mdy(d1)
## [1] "2010-01-01"
#> [1] "2010-01-01"
ymd(d2)
## [1] "2015-03-07"
#> [1] "2015-03-07"
dmy(d3)
## [1] "2017-06-06"
#> [1] "2017-06-06"
mdy(d4)
## [1] "2015-08-19" "2015-07-01"
#> [1] "2015-08-19" "2015-07-01"
mdy(d5)
## [1] "2014-12-30"
#> [1] "2014-12-30"
datetime <- ymd_hms("2016-07-08 12:34:56")
year(datetime)
## [1] 2016
#> [1] 2016
month(datetime)
## [1] 7
#> [1] 7
mday(datetime)
## [1] 8
#> [1] 8
yday(datetime)
## [1] 190
#> [1] 190
wday(datetime)
## [1] 6
#> [1] 6
month(datetime, label = TRUE)
## [1] Jul
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
#> [1] Jul
#> 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(datetime, label = TRUE, abbr = FALSE)
## [1] Friday
## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
#> [1] Friday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
ggplot(aes(x = wday)) +
geom_bar()
flights_dt %>%
mutate(minute = minute(dep_time)) %>%
group_by(minute) %>%
summarise(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n()) %>%
ggplot(aes(minute, avg_delay)) +
geom_line()
sched_dep <- flights_dt %>%
mutate(minute = minute(sched_dep_time)) %>%
group_by(minute) %>%
summarise(
avg_delay = mean(arr_delay, na.rm = TRUE),
n = n())
ggplot(sched_dep, aes(minute, avg_delay)) +
geom_line()
ggplot(sched_dep, aes(minute, n)) +
geom_line()
flights_dt %>%
count(week = floor_date(dep_time, "week")) %>%
ggplot(aes(week, n)) +
geom_line()
(datetime <- ymd_hms("2016-07-08 12:34:56"))
## [1] "2016-07-08 12:34:56 UTC"
#> [1] "2016-07-08 12:34:56 UTC"
year(datetime) <- 2020
datetime
## [1] "2020-07-08 12:34:56 UTC"
#> [1] "2020-07-08 12:34:56 UTC"
month(datetime) <- 01
datetime
## [1] "2020-01-08 12:34:56 UTC"
#> [1] "2020-01-08 12:34:56 UTC"
hour(datetime) <- hour(datetime) + 1
datetime
## [1] "2020-01-08 13:34:56 UTC"
#> [1] "2020-01-08 13:34:56 UTC"
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
## [1] "2020-02-02 02:34:56 UTC"
#> [1] "2020-02-02 02:34:56 UTC"
ymd("2015-02-01") %>%
update(mday = 30)
## [1] "2015-03-02"
#> [1] "2015-03-02"
ymd("2015-02-01") %>%
update(hour = 400)
## [1] "2015-02-17 16:00:00 UTC"
#> [1] "2015-02-17 16:00:00 UTC"
flights_dt %>%
mutate(dep_hour = update(dep_time, yday = 1)) %>%
ggplot(aes(dep_hour)) +
geom_freqpoly(binwidth = 300)
## Exercises How does the distribution of flight times within a day
change over the course of the year?
flights_dt %>%
filter(!is.na(dep_time)) %>%
mutate(dep_hour = update(dep_time, yday = 1)) %>%
mutate(month = factor(month(dep_time))) %>%
ggplot(aes(dep_hour, color = month)) +
geom_freqpoly(binwidth = 60 * 60)
flights_dt %>%
filter(!is.na(dep_time)) %>%
mutate(dep_hour = update(dep_time, yday = 1)) %>%
mutate(month = factor(month(dep_time))) %>%
ggplot(aes(dep_hour, color = month)) +
geom_freqpoly(aes(y = ..density..), binwidth = 60 * 60)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Compare dep_time, sched_dep_time and dep_delay. Are they consistent? Explain your findings.
flights_dt %>%
mutate(dep_time_ = sched_dep_time + dep_delay * 60) %>%
filter(dep_time_ != dep_time) %>%
select(dep_time_, dep_time, sched_dep_time, dep_delay)
## # A tibble: 1,205 × 4
## dep_time_ dep_time sched_dep_time dep_delay
## <dttm> <dttm> <dttm> <dbl>
## 1 2013-01-02 08:48:00 2013-01-01 08:48:00 2013-01-01 18:35:00 853
## 2 2013-01-03 00:42:00 2013-01-02 00:42:00 2013-01-02 23:59:00 43
## 3 2013-01-03 01:26:00 2013-01-02 01:26:00 2013-01-02 22:50:00 156
## 4 2013-01-04 00:32:00 2013-01-03 00:32:00 2013-01-03 23:59:00 33
## 5 2013-01-04 00:50:00 2013-01-03 00:50:00 2013-01-03 21:45:00 185
## 6 2013-01-04 02:35:00 2013-01-03 02:35:00 2013-01-03 23:59:00 156
## 7 2013-01-05 00:25:00 2013-01-04 00:25:00 2013-01-04 23:59:00 26
## 8 2013-01-05 01:06:00 2013-01-04 01:06:00 2013-01-04 22:45:00 141
## 9 2013-01-06 00:14:00 2013-01-05 00:14:00 2013-01-05 23:59:00 15
## 10 2013-01-06 00:37:00 2013-01-05 00:37:00 2013-01-05 22:30:00 127
## # ℹ 1,195 more rows
#> # A tibble: 1,205 x 4
#> dep_time_ dep_time sched_dep_time dep_delay
#> <dttm> <dttm> <dttm> <dbl>
#> 1 2013-01-02 08:48:00 2013-01-01 08:48:00 2013-01-01 18:35:00 853
#> 2 2013-01-03 00:42:00 2013-01-02 00:42:00 2013-01-02 23:59:00 43
#> 3 2013-01-03 01:26:00 2013-01-02 01:26:00 2013-01-02 22:50:00 156
#> 4 2013-01-04 00:32:00 2013-01-03 00:32:00 2013-01-03 23:59:00 33
#> 5 2013-01-04 00:50:00 2013-01-03 00:50:00 2013-01-03 21:45:00 185
#> 6 2013-01-04 02:35:00 2013-01-03 02:35:00 2013-01-03 23:59:00 156
#> # … with 1,199 more rows
Compare air_time with the duration between the departure and arrival. Explain your findings. (Hint: consider the location of the airport.)
flights_dt %>%
mutate(
flight_duration = as.numeric(arr_time - dep_time),
air_time_mins = air_time,
diff = flight_duration - air_time_mins
) %>%
select(origin, dest, flight_duration, air_time_mins, diff)
## # A tibble: 328,063 × 5
## origin dest flight_duration air_time_mins diff
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 EWR IAH 193 227 -34
## 2 LGA IAH 197 227 -30
## 3 JFK MIA 221 160 61
## 4 JFK BQN 260 183 77
## 5 LGA ATL 138 116 22
## 6 EWR ORD 106 150 -44
## 7 EWR FLL 198 158 40
## 8 LGA IAD 72 53 19
## 9 JFK MCO 161 140 21
## 10 LGA ORD 115 138 -23
## # ℹ 328,053 more rows
#> # A tibble: 328,063 x 5
#> origin dest flight_duration air_time_mins diff
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 EWR IAH 193 227 -34
#> 2 LGA IAH 197 227 -30
#> 3 JFK MIA 221 160 61
#> 4 JFK BQN 260 183 77
#> 5 LGA ATL 138 116 22
#> 6 EWR ORD 106 150 -44
#> # … with 328,057 more rows
How does the average delay time change over the course of a day? Should you use dep_time or sched_dep_time? Why?
flights_dt %>%
mutate(sched_dep_hour = hour(sched_dep_time)) %>%
group_by(sched_dep_hour) %>%
summarise(dep_delay = mean(dep_delay)) %>%
ggplot(aes(y = dep_delay, x = sched_dep_hour)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#> `summarise()` ungrouping output (override with `.groups` argument)
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
On what day of the week should you leave if you want to minimise the chance of a delay?
flights_dt %>%
mutate(dow = wday(sched_dep_time)) %>%
group_by(dow) %>%
summarise(
dep_delay = mean(dep_delay),
arr_delay = mean(arr_delay, na.rm = TRUE)
) %>%
print(n = Inf)
## # A tibble: 7 × 3
## dow dep_delay arr_delay
## <dbl> <dbl> <dbl>
## 1 1 11.5 4.82
## 2 2 14.7 9.65
## 3 3 10.6 5.39
## 4 4 11.7 7.05
## 5 5 16.1 11.7
## 6 6 14.7 9.07
## 7 7 7.62 -1.45
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 7 x 3
#> dow dep_delay arr_delay
#> <dbl> <dbl> <dbl>
#> 1 1 11.5 4.82
#> 2 2 14.7 9.65
#> 3 3 10.6 5.39
#> 4 4 11.7 7.05
#> 5 5 16.1 11.7
#> 6 6 14.7 9.07
#> 7 7 7.62 -1.45
flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
group_by(wday) %>%
summarize(ave_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x = wday, y = ave_dep_delay)) +
geom_bar(stat = "identity")
#> `summarise()` ungrouping output (override with `.groups` argument)
flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
group_by(wday) %>%
summarize(ave_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
ggplot(aes(x = wday, y = ave_arr_delay)) +
geom_bar(stat = "identity")
#> `summarise()` ungrouping output (override with `.groups` argument)
What makes the distribution of diamonds\(carat and flights\)sched_dep_time similar?
ggplot(diamonds, aes(x = carat)) +
geom_density()
ggplot(diamonds, aes(x = carat %% 1 * 100)) +
geom_histogram(binwidth = 1)
ggplot(flights_dt, aes(x = minute(sched_dep_time))) +
geom_histogram(binwidth = 1)
flights_dt %>%
mutate(minute = minute(dep_time),
early = dep_delay < 0) %>%
group_by(minute) %>%
summarise(
early = mean(early, na.rm = TRUE),
n = n()) %>%
ggplot(aes(minute, early)) +
geom_line()
#> `summarise()` ungrouping output (override with `.groups` argument)
Confirm my hypothesis that the early departures of flights in minutes 20-30 and 50-60 are caused by scheduled flights that leave early. Hint: create a binary variable that tells you whether or not a flight was delayed.
flights_dt %>%
mutate(minute = minute(dep_time),
early = dep_delay < 0) %>%
group_by(minute) %>%
summarise(
early = mean(early, na.rm = TRUE),
n = n()) %>%
ggplot(aes(minute, early)) +
geom_line()
#> `summarise()` ungrouping output (override with `.groups` argument)