Data Science Assignment: Tidy and Relational Data

1. WHO Dataset in the tidyr Package

Tidying the WHO Data

who_tidy <- who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65, 
    names_to = "key", 
    values_to = "cases", 
    values_drop_na = TRUE
  ) %>%
  mutate(key = stringr::str_replace(key, "newrel", "new_rel")) %>%
  separate(key, c("new", "type", "sexage"), sep = "_") %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1)

head(who_tidy)

## # A tibble: 6 × 6
##   country      year type  sex   age   cases
##   <chr>       <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan  1997 sp    m     014       0
## 2 Afghanistan  1997 sp    m     1524     10
## 3 Afghanistan  1997 sp    m     2534      6
## 4 Afghanistan  1997 sp    m     3544      3
## 5 Afghanistan  1997 sp    m     4554      5
## 6 Afghanistan  1997 sp    m     5564      2

Exploratory Analysis

Question: Which country had the highest total number of cases in 2013?

who_2013 <- who_tidy %>%
  filter(year == 2013) %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  arrange(desc(total_cases))

head(who_2013, 5)

## # A tibble: 5 × 2
##   country      total_cases
##   <chr>              <dbl>
## 1 China             847176
## 2 Indonesia         325582
## 3 South Africa      312380
## 4 Bangladesh        181637
## 5 Nigeria           100401

2. US Average Tuition Dataset

Tidy the Data

We need to clean the currency symbols and pivot the year columns.

tuition_raw <- read_csv("us_avg_tuition (1).csv")

tuition_tidy <- tuition_raw %>%
  pivot_longer(cols = -State, names_to = "Year", values_to = "Tuition") %>%
  mutate(Tuition = as.numeric(gsub("[\\$,]", "", Tuition)))

head(tuition_tidy)

## # A tibble: 6 × 3
##   State   Year    Tuition
##   <chr>   <chr>     <dbl>
## 1 Alabama 2004-05    5683
## 2 Alabama 2005-06    5841
## 3 Alabama 2006-07    5753
## 4 Alabama 2007-08    6008
## 5 Alabama 2008-09    6475
## 6 Alabama 2009-10    7189

Visualization: Average Tuition by State

Question: Which state has the highest and lowest average tuition (across all years)?

tuition_avg <- tuition_tidy %>%
  group_by(State) %>%
  summarise(mean_tuition = mean(Tuition)) %>%
  arrange(desc(mean_tuition))

ggplot(tuition_avg, aes(x = reorder(State, mean_tuition), y = mean_tuition)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Average Tuition by State (2004-2016)", x = "State", y = "Average Cost ($)")

highest_state <- tuition_avg$State[1]
lowest_state <- tail(tuition_avg$State, 1)

Highest Tuition: Vermont
Lowest Tuition: Wyoming

Visualization: Rate of Increase (2004-2005 to 2015-2016)

We calculate the percentage increase between the first and last year available.

tuition_growth <- tuition_tidy %>%
  filter(Year %in% c("2004-05", "2015-16")) %>%
  pivot_wider(names_from = Year, values_from = Tuition) %>%
  mutate(Growth_Rate = (`2015-16` - `2004-05`) / `2004-05`) %>%
  arrange(desc(Growth_Rate))

ggplot(tuition_growth, aes(x = reorder(State, Growth_Rate), y = Growth_Rate)) +
  geom_point(color = "red") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Tuition Growth Rate (2004 to 2016)", x = "State", y = "% Increase")

Fastest Growth: Hawaii
Slowest Growth: Ohio

3. Analyze flights and weather (nycflights13)

3.1 Airport Map: Number of Flights from NYC

flights_map_data <- flights %>%
  group_by(dest) %>%
  summarise(n_flights = n() / 365) %>% # Daily average
  inner_join(airports, by = c("dest" = "faa"))

usa <- map_data("state")

ggplot() +
  geom_polygon(data = usa, aes(x = long, y = lat, group = group), fill = "grey90", color = "white") +
  geom_point(data = flights_map_data, aes(x = lon, y = lat, size = n_flights, color = n_flights)) +
  scale_color_viridis_c() +
  coord_quickmap() +
  labs(title = "Average Daily Flights from NYC to US Airports", color = "Flights/Day", size = "Flights/Day")

3.2 Weather Conditions and Departure Delays

weather_delays <- flights %>%
  inner_join(weather, by = c("origin", "year", "month", "day", "hour")) %>%
  group_by(precip > 0, temp > 80, wind_speed > 20) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE))

# Specifically looking at Precipitation
ggplot(flights %>% inner_join(weather), aes(x = factor(precip > 0), y = dep_delay)) +
  geom_boxplot(outlier.shape = NA) +
  ylim(0, 100) +
  labs(title = "Precipitation vs. Departure Delay", x = "Precipitation Present", y = "Delay (min)")

Conclusion: Higher wind speeds and precipitation (rain/snow) are generally associated with higher average departure delays.

3.3 Spatial Pattern of Arrival Delays (June 13, 2013)

june13_delays <- flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa"))

ggplot() +
  geom_polygon(data = usa, aes(x = long, y = lat, group = group), fill = "grey95") +
  geom_point(data = june13_delays, aes(x = lon, y = lat, color = avg_arr_delay, size = abs(avg_arr_delay))) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  coord_quickmap() +
  labs(title = "Arrival Delays on June 13, 2013")

Weather Context: June 13, 2013, was notable for a “Derecho” (a massive wind storm) that swept across the Midwest and Northeast. The map shows significant red clusters in these regions, indicating that the weather system caused a spatial ripple effect of delays across the flight network.