library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)


1. who data set in the tidyr package

# Reshape the wide `who` table into one row per country / year / type / sex /
# age combination, keeping only rows with a recorded case count.
#
# Each pivoted column name packs three variables (e.g. "new_sp_m014",
# "newrel_f65"), so the names are split during the pivot with one regex rather
# than patching "newrel" and chaining the superseded `separate()` calls:
#   new_?  constant prefix; the underscore is absent in the "newrel_*" columns
#   (.*)   type: everything up to the last underscore
#   (.)    sex: the single character right after that underscore
#   (.*)   age: whatever follows the sex character
who_tidy <- who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    names_to = c("type", "sex", "age"),
    names_pattern = "new_?(.*)_(.)(.*)",
    values_to = "cases",
    values_drop_na = TRUE
  ) %>%
  # The ISO codes are dropped, leaving `country` as the only country identifier.
  select(-iso2, -iso3)


2.Using the tidied data, explore one question of your interest and answer it with visualization or summary table.

# Total number of cases for each value of `sex` in the tidied WHO data.
cases_by_gender <- summarise(
  group_by(who_tidy, sex),
  total_cases = sum(cases)
)

# One bar per sex; bar height is the summed case count.
cases_by_gender %>%
  ggplot(aes(x = sex, y = total_cases, fill = sex)) +
  geom_col()


1.Tidy the data following what we learned in class.

# Read the tuition table and stack every column except `State` into
# (year, tuition) pairs, giving one row per state and school year.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
tuition_data <- pivot_longer(
  read_csv("/Users/HoangDucVinh/Downloads/us_avg_tuition.csv"),
  cols = !State,
  names_to = "year",
  values_to = "tuition"
)
## Rows: 50 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): State, 2004-05, 2005-06, 2006-07, 2007-08, 2008-09, 2009-10, 2010-...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


2.Make an informative visualization of the data to show the average tuition across all years in the data set for each state. Which state has the highest tuition? Which state has the lowest?

# Mean tuition per state across all school years.
# `tuition` is read as text, so "$" and "," are stripped before conversion.
tuition_summary <- tuition_data %>%
  mutate(tuition = as.numeric(gsub("[\\$,]", "", tuition))) %>%
  group_by(State) %>%
  summarise(avg_tuition = mean(tuition, na.rm = TRUE))

# Horizontal bar chart with states ordered by their average tuition.
tuition_summary %>%
  ggplot(aes(x = reorder(State, avg_tuition), y = avg_tuition)) +
  geom_col() +
  coord_flip()


Vermont has the highest average tuition and Wyoming has the lowest.


3.Make an informative visualization of the data to show the increasing rate of average tuition from 2004-2005 to 2015-2016 in each state. Which state’s tuition increased at the fastest rate? Which one the slowest?

# Relative change in tuition per state between the 2004-05 and 2015-16
# columns: (2015-16 - 2004-05) / 2004-05, sorted from largest to smallest.
tuition_rate <- tuition_data %>%
  filter(year == "2004-05" | year == "2015-16") %>%
  # Strip "$" and "," so the text values can be converted to numbers.
  mutate(tuition = as.numeric(gsub("[\\$,]", "", tuition))) %>%
  # Put the two school years side by side so they can be compared row-wise.
  pivot_wider(names_from = year, values_from = tuition) %>%
  mutate(increase_rate = (`2015-16` - `2004-05`) / `2004-05`) %>%
  arrange(desc(increase_rate))

# Horizontal bar chart with states ordered by their rate of increase.
tuition_rate %>%
  ggplot(aes(x = reorder(State, increase_rate), y = increase_rate)) +
  geom_col() +
  coord_flip()


Hawaii is the fastest and Ohio is the slowest.


1. Finish the lab exercise - Create an airport map with each airport location marked on the map and colored by the number of flights per day from NYC to each airport.

# Average flights per day to each destination: the number of rows in
# `flights` for that destination divided by 365.
flights_per_day <- flights %>%
  count(dest) %>%
  transmute(dest, flights_daily_avg = n / 365)

# Attach airport details by FAA code and keep only destinations that have
# coordinates to plot.
airport_map_data <- left_join(
  flights_per_day,
  airports,
  by = c("dest" = "faa")
) %>%
  filter(!(is.na(lat) | is.na(lon)))

# State outlines with one point per destination airport, colored by the
# average number of daily flights.
airport_map_data %>%
  ggplot(aes(x = lon, y = lat)) +
  annotation_borders("state") +
  geom_point(aes(color = flights_daily_avg), size = 3, alpha = 0.8) +
  scale_color_viridis_c() +
  coord_quickmap()


2.What weather conditions make it more likely to see a departure delay? hot or cold weather? windy weather? rainy or snowy? foggy? Create a proper data frame and use proper visualization or summary table to answer the question.

# Flights with a recorded departure delay, matched to the weather observation
# for the same origin airport and hour.
flight_weather <- flights %>%
  drop_na(dep_delay) %>%
  inner_join(weather, by = join_by(year, month, day, hour, origin))

# Mean departure delay (and number of flights) at each visibility value.
visibility_delay <- summarise(
  group_by(flight_weather, visib),
  avg_delay = mean(dep_delay, na.rm = TRUE),
  count = n()
)

visibility_delay %>%
  ggplot(aes(x = visib, y = avg_delay)) +
  geom_line()

# Mean departure delay for hours with any precipitation versus none.
precip_delay <- flight_weather %>%
  mutate(
    precip_status = if_else(precip > 0, "Precipitation", "No Precipitation")
  ) %>%
  group_by(precip_status) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE))

precip_delay %>%
  ggplot(aes(x = precip_status, y = avg_delay, fill = precip_status)) +
  geom_col()


3.Display the spatial pattern of arrival delays on June 13, 2013 using a map, and then use Google to cross-reference with the weather. Explain how the weather condition might have affected the spatial pattern of arrival delays.

# Mean arrival delay per destination for flights on June 13, 2013, joined to
# the airport coordinates so the delays can be drawn on a map.
jun_delays <- flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarize(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  # A destination whose flights all lack `arr_delay` gets a NaN mean; drop it
  # here so the plot does not have to discard it with a "removed rows" warning.
  filter(!is.na(avg_arr_delay)) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  # Keep only destinations that have coordinates to plot.
  filter(!is.na(lat), !is.na(lon))

# State outlines with one point per destination; color and size both encode
# the mean arrival delay.
ggplot(jun_delays, aes(x = lon, y = lat)) +
  # `borders()` is deprecated as of ggplot2 4.0.0; `annotation_borders()` is
  # its replacement and matches the airport map drawn earlier in this file.
  annotation_borders("state") +
  geom_point(aes(color = avg_arr_delay, size = avg_arr_delay), alpha = 0.7) +
  scale_color_gradient2() +
  coord_quickmap()
## Warning: `borders()` was deprecated in ggplot2 4.0.0.
## ℹ Please use `annotation_borders()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).