library(tidyverse)
library(patchwork)
library(ggthemes)
library(ggplot2)
library(scales)
library(dplyr)
library(readr)
# Load the two input tables: the forecast/observation records and the
# per-city attributes that are joined onto the city-level errors further down.
weather_forecasts <- read_csv(file = "data/weather_forecasts.csv")
forecast_cities <- read_csv(file = "data/forecast_cities.csv")

Report

This data set has information about sixteen months of forecasts and observations from 167 cities, including predicted and observed high and low temperatures, precipitation amounts, and general outlooks. By examining the discrepancy between forecasted and actual temperatures, I aim to identify geographic areas where weather prediction struggles and look into potential contributing factors such as elevation and climate.

Data Wrangling

For the data wrangling, I first created a temp_error column, calculated as the absolute difference between the forecasted and observed temperatures. I then calculated mean_temp_error for each city and state combination so that I could compare forecast accuracy across different locations, and I also calculated mean_temp_error at the state level. To explore the impact of the prediction timeframe on forecast accuracy, I grouped the data by forecast_hours_before and calculated mean_temp_error for each forecast horizon (12, 24, 36, and 48 hours).

# Absolute gap between forecast and observed temperature for every row.
weather_forecasts <- mutate(
  weather_forecasts,
  temp_error = abs(forecast_temp - observed_temp)
)

# Average `temp_error` within each combination of the grouping columns passed
# in `...`, ignoring missing errors. The result is returned ungrouped.
mean_error_by <- function(data, ...) {
  data %>%
    group_by(...) %>%
    summarise(
      mean_temp_error = mean(temp_error, na.rm = TRUE),
      .groups = "drop"
    )
}

city_error <- mean_error_by(weather_forecasts, city, state)
state_error <- mean_error_by(weather_forecasts, state)
horizon_error <- mean_error_by(weather_forecasts, forecast_hours_before)

# Attach the attributes from `forecast_cities` to the per-city errors; cities
# with no match in `forecast_cities` are kept with missing attributes.
city_error_with_info <- left_join(
  city_error,
  forecast_cities,
  by = c("city", "state")
)

Plots

For my first figure, I wanted to observe the relationship between temperature prediction error and forecast horizon. The figure shows that the temperature prediction becomes less accurate the farther out the forecast horizon, which is typical for weather forecasting. It also gives a sense of the overall accuracy of the forecasts and confirms that errors are indeed present.

# Bar chart of the mean absolute temperature error per forecast horizon.
# The horizon is wrapped in factor() so it is drawn on a discrete axis, one
# bar per distinct value.
ggplot(
  data = horizon_error,
  mapping = aes(x = factor(forecast_hours_before), y = mean_temp_error)
) +
  geom_col(fill = "blue") +
  ggtitle("Mean Temperature Error by Forecast Horizon") +
  xlab("Forecast Horizon (hours)") +
  ylab("Mean Temperature Error (Fahrenheit)")

I also decided to use a lollipop graph to visualize and rank the mean temperature error by state, which showed that Montana had the highest mean temperature error, whereas Puerto Rico had the lowest. This suggests that tropical climates could be more predictable temperature-wise because there is less fluctuation, especially over the longer term, whereas places like Montana, which have high elevation and a dry, continental climate, could be more difficult to predict.

# new graph for new element excellent project requirement
# sources I used:
# https://r-graph-gallery.com/300-basic-lollipop-plot.html
# https://stackoverflow.com/questions/63165943/how-to-reorder-x-axis-based-on-y-axis-values-in-r-ggplot2?rq=3

# State-level errors ordered from the smallest to the largest mean error.
state_error_sorted <- arrange(state_error, mean_temp_error)

# Lollipop chart: a stem from zero to each state's mean error with a dot at
# the tip. reorder() ranks the states along the axis by their mean error, and
# coord_flip() turns the chart sideways so the state names sit on the
# vertical axis.
ggplot(
  data = state_error_sorted,
  mapping = aes(x = reorder(state, mean_temp_error), y = mean_temp_error)
) +
  geom_segment(mapping = aes(xend = state, yend = 0), color = "blue") +
  geom_point(mapping = aes(color = mean_temp_error), size = 4) +
  coord_flip() +
  labs(
    title = "Mean Temperature Error by State (Ranked)",
    x = "State",
    y = "Mean Temperature Error (Fahrenheit)"
  ) +
  theme_minimal()

I also wanted to look into the distribution of the temperature error, so I created a histogram. I found that the most frequent temperature error is around 2-3 degrees and that the overall distribution is right-skewed. This makes sense: given modern forecasting technology, small temperature errors should occur more frequently than large ones.

# Histogram of the absolute temperature errors, zoomed to errors of 0-20.
# NOTE(review): the 60 bins are laid out over the full range of `temp_error`;
# coord_cartesian() only zooms the view afterwards, so how many bars fall
# inside the 0-20 window depends on the largest error in the data.
ggplot(data = weather_forecasts, mapping = aes(x = temp_error)) +
  geom_histogram(bins = 60, fill = "skyblue", color = "black") +
  coord_cartesian(xlim = c(0, 20)) +
  ggtitle("Distribution of Temperature Error") +
  xlab("Temperature Error (Fahrenheit)") +
  ylab("Frequency") +
  theme_bw() +
  theme(panel.grid = element_blank())

I then calculated mean temperature errors and joined them with city-specific information (e.g., elevation, distance to coast) from the forecast_cities dataset, because I wanted to analyze whether any city characteristics are correlated with a decline in forecast accuracy. I plotted four scatter plots of elevation, elevation change, wind, and distance to coast, respectively, each versus mean temperature error. As the results show, there seems to be no clear relationship between elevation or wind and the mean temperature error, but elevation change and especially distance to coast show a suggestive pattern: mean temperature error tends to increase with distance to coast, though there is still abundant scatter present. This seems to suggest that distance to coast has the most influence on mean temperature error, which could be because locations farther inland experience greater temperature variability, likely due to the absence of the moderating influence of large bodies of water. This increased variability makes accurate temperature forecasting more challenging, leading to larger mean temperature errors.

# Scatter plot of one city attribute against the city's mean temperature
# error.
#   x_var:   name (as a string) of a column in `city_error_with_info`
#   title:   plot title
#   x_label: x-axis label
# Returns a ggplot object. The y-axis label includes the unit so the panels
# agree with the other figures in this report.
plot_error_against <- function(x_var, title, x_label) {
  ggplot(
    city_error_with_info,
    aes(x = .data[[x_var]], y = mean_temp_error)
  ) +
    geom_point() +
    labs(
      title = title,
      x = x_label,
      y = "Mean Temperature Error (Fahrenheit)"
    )
}

plot1 <- plot_error_against(
  "elevation",
  "Elevation vs. Mean Temperature Error",
  "Elevation (meters)"
)
plot2 <- plot_error_against(
  "elevation_change_four",
  "Elevation Change vs. Mean Temperature Error",
  "Elevation Change (meters)"
)
plot3 <- plot_error_against(
  "wind",
  "Wind vs. Mean Temperature Error",
  "Wind"
)
# NOTE(review): the TidyTuesday data dictionary for this dataset lists
# distance_to_coast in miles (the label previously said meters) -- confirm
# against the copy of the data used here.
plot4 <- plot_error_against(
  "distance_to_coast",
  "Distance to Coast vs. Mean Temperature Error",
  "Distance to Coast (miles)"
)

# patchwork layout: a 2 x 2 grid of the four scatter plots.
(plot1 + plot2) / (plot3 + plot4)

Lastly, I wanted to see the prediction accuracy in a geographic context, so I created a map of the mean temperature error by state. It showed that the states with the highest mean temperature error are generally located in the West and Midwest parts of the United States (with some exceptions like Arizona), while the states with the lowest mean temperature error are located in the South and Northeast parts of the United States.

# used source to find abbreviation of states and states name: state.abb and state.name: https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/state.html#:~:text=R%20currently%20contains%20the%20following,abbreviations%20for%20the%20state%20names.
library(maps)

# Polygon outlines for the map; `region` holds lower-case full state names.
us_states <- map_data("state")

# Translate the abbreviations in `state` into the lower-case full names used
# by `us_states$region`. state.abb / state.name only cover the 50 states, so
# the District of Columbia (which is a region in the "state" map) is added by
# hand. Abbreviations with no entry here get NA and do not appear on the map.
abbreviations <- c(state.abb, "DC")
region_names <- c(tolower(state.name), "district of columbia")
state_error <- state_error %>%
  mutate(state_name = region_names[match(state, abbreviations)])

# Named `state_map` rather than `map_data` so the ggplot2 function
# map_data() used above is not shadowed by a data frame of the same name.
state_map <- left_join(us_states, state_error, by = c("region" = "state_name"))

# Choropleth: each state polygon is filled by its mean temperature error;
# regions without a matching error value are drawn in gray.
ggplot(state_map, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = mean_temp_error), color = "black") +
  scale_fill_gradient(low = "blue", high = "red", na.value = "gray") +
  coord_quickmap() +
  labs(
    title = "Mean Temperature Error by State",
    fill = "Mean Temp Error",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text = element_blank(),
    panel.grid = element_blank()
  )