who_tidy <- who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
) %>%
mutate(key = stringr::str_replace(key, "newrel", "new_rel")) %>%
separate(key, c("new", "type", "sexage"), sep = "_") %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
head(who_tidy)
## # A tibble: 6 × 6
## country year type sex age cases
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
Question: Which country had the highest total number of cases in 2013?
who_2013 <- who_tidy %>%
filter(year == 2013) %>%
group_by(country) %>%
summarise(total_cases = sum(cases)) %>%
arrange(desc(total_cases))
head(who_2013, 5)
## # A tibble: 5 × 2
## country total_cases
## <chr> <dbl>
## 1 China 847176
## 2 Indonesia 325582
## 3 South Africa 312380
## 4 Bangladesh 181637
## 5 Nigeria 100401
We need to clean the currency symbols and pivot the year columns.
tuition_raw <- read_csv("us_avg_tuition (1).csv")
tuition_tidy <- tuition_raw %>%
pivot_longer(cols = -State, names_to = "Year", values_to = "Tuition") %>%
mutate(Tuition = as.numeric(gsub("[\\$,]", "", Tuition)))
head(tuition_tidy)
## # A tibble: 6 × 3
## State Year Tuition
## <chr> <chr> <dbl>
## 1 Alabama 2004-05 5683
## 2 Alabama 2005-06 5841
## 3 Alabama 2006-07 5753
## 4 Alabama 2007-08 6008
## 5 Alabama 2008-09 6475
## 6 Alabama 2009-10 7189
Question: Which state has the highest and lowest average tuition (across all years)?
tuition_avg <- tuition_tidy %>%
group_by(State) %>%
summarise(mean_tuition = mean(Tuition)) %>%
arrange(desc(mean_tuition))
ggplot(tuition_avg, aes(x = reorder(State, mean_tuition), y = mean_tuition)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Average Tuition by State (2004-2016)", x = "State", y = "Average Cost ($)")
highest_state <- tuition_avg$State[1]
lowest_state <- tail(tuition_avg$State, 1)
We calculate the percentage increase between the first and last year available.
tuition_growth <- tuition_tidy %>%
filter(Year %in% c("2004-05", "2015-16")) %>%
pivot_wider(names_from = Year, values_from = Tuition) %>%
mutate(Growth_Rate = (`2015-16` - `2004-05`) / `2004-05`) %>%
arrange(desc(Growth_Rate))
ggplot(tuition_growth, aes(x = reorder(State, Growth_Rate), y = Growth_Rate)) +
geom_point(color = "red") +
coord_flip() +
scale_y_continuous(labels = scales::percent) +
labs(title = "Tuition Growth Rate (2004 to 2016)", x = "State", y = "% Increase")
flights_map_data <- flights %>%
group_by(dest) %>%
summarise(n_flights = n() / 365) %>% # Daily average
inner_join(airports, by = c("dest" = "faa"))
usa <- map_data("state")
ggplot() +
geom_polygon(data = usa, aes(x = long, y = lat, group = group), fill = "grey90", color = "white") +
geom_point(data = flights_map_data, aes(x = lon, y = lat, size = n_flights, color = n_flights)) +
scale_color_viridis_c() +
coord_quickmap() +
labs(title = "Average Daily Flights from NYC to US Airports", color = "Flights/Day", size = "Flights/Day")
weather_delays <- flights %>%
inner_join(weather, by = c("origin", "year", "month", "day", "hour")) %>%
group_by(precip > 0, temp > 80, wind_speed > 20) %>%
summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE))
# Specifically looking at Precipitation
ggplot(flights %>% inner_join(weather), aes(x = factor(precip > 0), y = dep_delay)) +
geom_boxplot(outlier.shape = NA) +
ylim(0, 100) +
labs(title = "Precipitation vs. Departure Delay", x = "Precipitation Present", y = "Delay (min)")
Conclusion: Higher wind speeds and precipitation (rain/snow) are generally associated with higher average departure delays.
june13_delays <- flights %>%
filter(year == 2013, month == 6, day == 13) %>%
group_by(dest) %>%
summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c("dest" = "faa"))
ggplot() +
geom_polygon(data = usa, aes(x = long, y = lat, group = group), fill = "grey95") +
geom_point(data = june13_delays, aes(x = lon, y = lat, color = avg_arr_delay, size = abs(avg_arr_delay))) +
scale_color_gradient2(low = "blue", mid = "white", high = "red") +
coord_quickmap() +
labs(title = "Arrival Delays on June 13, 2013")
Weather Context: June 13, 2013, was notable for a “Derecho” (a massive wind storm) that swept across the Midwest and Northeast. The map shows significant red clusters in these regions, indicating that the weather system caused a spatial ripple effect of delays across the flight network.