library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
1. who data set in the tidyr package
# Tidy the WHO tuberculosis table: one row per country / year / type / sex / age.
who_tidy <- who %>%
  # Stack every case-count column into key/value pairs, dropping empty cells.
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    names_to = "key",
    values_to = "cases",
    values_drop_na = TRUE
  ) %>%
  # "newrel" lacks the underscore the other prefixes have; add it so the
  # split on "_" below yields three pieces for every key.
  mutate(key = stringr::str_replace(key, "newrel", "new_rel")) %>%
  separate(key, into = c("new", "type", "sexage"), sep = "_") %>%
  # Drop the "new" piece and the two ISO country-code columns.
  select(-c(new, iso2, iso3)) %>%
  # The first character of sexage is the sex code; the remainder is the age band.
  separate(sexage, into = c("sex", "age"), sep = 1)
2. Using the tidied data, explore one question of your interest
and answer it with a visualization or a summary table.
# Total reported cases per sex code.
cases_by_gender <- who_tidy %>%
  group_by(sex) %>%
  summarise(total_cases = sum(cases))

# One bar per sex, filled by sex.
ggplot(
  data = cases_by_gender,
  mapping = aes(x = sex, y = total_cases, fill = sex)
) +
  geom_col()
1.Tidy the data following what we learned in class.
# Read the tuition table and reshape it to one row per State / academic year.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this file exists at exactly this location.
tuition_data <- read_csv("/Users/HoangDucVinh/Downloads/us_avg_tuition.csv") %>%
  pivot_longer(
    cols = !State,
    names_to = "year",
    values_to = "tuition"
  )
## Rows: 50 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): State, 2004-05, 2005-06, 2006-07, 2007-08, 2008-09, 2009-10, 2010-...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
2.Make an informative visualization of the data to show the
average tuition across all years in the data set for each state. Which
state has the highest tuition? Which state has the lowest?
# Mean tuition per state across all years.
tuition_summary <- tuition_data %>%
  # tuition is read as text; strip "$" and "," before converting to numeric.
  mutate(tuition = as.numeric(gsub("[\\$,]", "", tuition))) %>%
  group_by(State) %>%
  summarise(avg_tuition = mean(tuition, na.rm = TRUE))

# Horizontal bars, states ordered by their average tuition.
ggplot(
  data = tuition_summary,
  mapping = aes(x = reorder(State, avg_tuition), y = avg_tuition)
) +
  geom_col() +
  coord_flip()
The highest is Vermont and the lowest is Wyoming.
3.Make an informative visualization of the data to show the
increasing rate of average tuition from 2004-2005 to 2015-2016 in each
state. Which state’s tuition increased at the fastest rate? Which one
the slowest?
# Relative tuition change per state between the first and last academic year.
tuition_rate <- tuition_data %>%
  # tuition is read as text; strip "$" and "," before converting to numeric.
  mutate(tuition = as.numeric(gsub("[\\$,]", "", tuition))) %>%
  filter(year %in% c("2004-05", "2015-16")) %>%
  # One column per year so the two values sit side by side on each state row.
  pivot_wider(names_from = year, values_from = tuition) %>%
  mutate(increase_rate = (`2015-16` - `2004-05`) / `2004-05`) %>%
  arrange(desc(increase_rate))

# Horizontal bars, states ordered by their increase rate.
ggplot(
  data = tuition_rate,
  mapping = aes(x = reorder(State, increase_rate), y = increase_rate)
) +
  geom_col() +
  coord_flip()
Hawaii is the fastest and Ohio is the slowest.
1. Finish the lab exercise: create an airport map with each
airport location marked on the map and colored by the number of
flights per day from NYC to each airport.
# Flights per destination, divided by 365 to give a per-day figure.
flights_per_day <- flights %>%
  group_by(dest) %>%
  summarise(flights_daily_avg = n() / 365)

# Attach airport coordinates; destinations without a lat/lon are dropped.
airport_map_data <- flights_per_day %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  filter(!is.na(lat), !is.na(lon))

# State outlines with one point per destination, colored by daily flights.
ggplot(data = airport_map_data, mapping = aes(x = lon, y = lat)) +
  annotation_borders("state") +
  geom_point(
    mapping = aes(color = flights_daily_avg),
    size = 3,
    alpha = 0.8
  ) +
  scale_color_viridis_c() +
  coord_quickmap()
2.What weather conditions make it more likely to see a departure
delay? hot or cold weather? windy weather? rainy or snowy? foggy? Create
a proper data frame and use proper visualization or summary table to
answer the question.
# Pair each flight with the weather row for the same origin, date and hour,
# keeping only flights that have a recorded departure delay.
flight_weather <- flights %>%
  inner_join(
    weather,
    by = c("year", "month", "day", "hour", "origin")
  ) %>%
  filter(!is.na(dep_delay))

# Mean departure delay (and number of flights) at each visibility value.
visibility_delay <- flight_weather %>%
  group_by(visib) %>%
  summarise(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    count = n()
  )

ggplot(data = visibility_delay, mapping = aes(x = visib, y = avg_delay)) +
  geom_line()

# Mean departure delay with versus without any recorded precipitation.
precip_delay <- flight_weather %>%
  mutate(
    precip_status = ifelse(precip > 0, "Precipitation", "No Precipitation")
  ) %>%
  group_by(precip_status) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE))

ggplot(
  data = precip_delay,
  mapping = aes(x = precip_status, y = avg_delay, fill = precip_status)
) +
  geom_col()
3.Display the spatial pattern of arrival delays on June 13, 2013
using a map, and then use Google to cross-reference with the weather.
Explain how the weather condition might have affected the spatial
pattern of arrival delays.
# Mean arrival delay per destination for flights on June 13, 2013, joined to
# airport coordinates for mapping.
jun_delays <- flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  # Keep only destinations that can be drawn: known coordinates and a
  # computable mean delay. The mean is NaN when every arr_delay for a
  # destination is NA; dropping those rows here avoids relying on
  # geom_point() to discard them with a warning.
  filter(!is.na(lat), !is.na(lon), !is.na(avg_arr_delay))

# State outlines with one point per destination; color and size both encode
# the mean arrival delay.
ggplot(jun_delays, aes(x = lon, y = lat)) +
  # annotation_borders() replaces borders(), which is deprecated as of
  # ggplot2 4.0.0.
  annotation_borders("state") +
  geom_point(aes(color = avg_arr_delay, size = avg_arr_delay), alpha = 0.7) +
  scale_color_gradient2() +
  coord_quickmap()
## Warning: `borders()` was deprecated in ggplot2 4.0.0.
## ℹ Please use `annotation_borders()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).