Introduction

Accurate weather forecasts play a vital role in decision-making for various sectors. However, forecast accuracy can vary significantly depending on geographic and climate factors. In this report, we explore forecast accuracy for high and low temperature predictions across 167 U.S. cities. We aim to determine which areas struggle with forecast errors and investigate possible reasons behind these inaccuracies.

Load Libraries and Data

library(tidyverse)
library(ggthemes)
library(lubridate)
library(viridis)
library(ggridges)

# Load datasets
weather_forecasts <- read_csv("data/weather_forecasts.csv")
forecast_cities <- read_csv("data/forecast_cities.csv")
outlook_meanings <- read_csv("data/outlook_meanings.csv")

Data Preparation

# Ensure date columns are properly parsed
weather_forecasts <- weather_forecasts %>%
  mutate(date = ymd(date),
         high_or_low = as.factor(high_or_low),
         forecast_outlook = as.factor(forecast_outlook))

forecast_cities <- forecast_cities %>%
  mutate(city = as.factor(city), state = as.factor(state))

Data Integration

# Join weather data with city information
full_data <- weather_forecasts %>%
  left_join(forecast_cities, by = "city") %>%
  left_join(outlook_meanings, by = "forecast_outlook")

# Create a new column for forecast error (absolute difference between forecast and observed temp)
full_data <- full_data %>%
  mutate(temp_error = abs(forecast_temp - observed_temp))

Summary of Forecast Errors

# Summary statistics for overall error
overall_accuracy <- full_data %>%
  summarize(
    mean_error = mean(temp_error, na.rm = TRUE),
    median_error = median(temp_error, na.rm = TRUE),
    error_sd = sd(temp_error, na.rm = TRUE)
  )

overall_accuracy
## # A tibble: 1 × 3
##   mean_error median_error error_sd
##        <dbl>        <dbl>    <dbl>
## 1       2.32            2     2.12

Analyzing Error by State

# Average error by state

# Calculate the average error by state and select the top cities with the highest errors

error_by_state_top <- full_data %>%
  group_by(state.x) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE)) %>%
  arrange(desc(avg_error)) %>%
  slice_head(n = 10)

error_by_state_top
## # A tibble: 10 × 2
##    state.x avg_error
##    <chr>       <dbl>
##  1 MT           3.23
##  2 AK           3.18
##  3 ND           2.89
##  4 SD           2.87
##  5 CO           2.84
##  6 NE           2.82
##  7 ID           2.67
##  8 NH           2.67
##  9 WV           2.62
## 10 KS           2.58

Analyzing Error by City

# Average error by city per state

error_by_city_top <- full_data %>%
  group_by(city, state.x) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE), .groups = "drop") %>%
  
  # For each state, keep only the single city with the highest error
  group_by(state.x) %>%
  slice_max(order_by = avg_error, n = 1) %>%
  ungroup() %>%
  slice_max(order_by = avg_error, n = 10)

error_by_city_top
## # A tibble: 10 × 3
##    city         state.x avg_error
##    <chr>        <chr>       <dbl>
##  1 FAIRBANKS    AK           4.14
##  2 HELENA       MT           3.74
##  3 CASPER       WY           3.31
##  4 YAKIMA       WA           3.25
##  5 NORTH_PLATTE NE           3.08
##  6 POCATELLO    ID           3.05
##  7 BISMARCK     ND           3.03
##  8 PUEBLO       CO           3.02
##  9 SAN_ANGELO   TX           3.01
## 10 ELKINS       WV           2.94
ggplot(error_by_city_top, aes(x = reorder(paste(city, state.x, sep = ", "), avg_error), y = avg_error, fill = avg_error)) +
  geom_col() +
  scale_fill_viridis(option = "C") +
  labs(title = "Top 10 Cities (One per State) with the Highest Forecast Errors",
       x = "State", y = "Average Error (°F)", fill = "Error") +
  theme_minimal() +
  coord_flip()


# Error Metrics Calculation


``` r
#Calculate additional forecast accuracy metrics
accuracy_metrics <- full_data %>%
  summarize(
    mean_error = mean(forecast_temp - observed_temp, na.rm = TRUE),
    mean_absolute_error = mean(abs(forecast_temp - observed_temp), na.rm = TRUE),
    root_mean_squared_error = sqrt(mean((forecast_temp - observed_temp)^2, na.rm = TRUE)),
    mean_absolute_percentage_error = mean(abs((forecast_temp - observed_temp) / observed_temp) * 100, na.rm = TRUE)
  )

accuracy_metrics
## # A tibble: 1 × 4
##   mean_error mean_absolute_error root_mean_squared_error mean_absolute_percent…¹
##        <dbl>               <dbl>                   <dbl>                   <dbl>
## 1     -0.429                2.32                    3.14                     Inf
## # ℹ abbreviated name: ¹​mean_absolute_percentage_error

Relationship Between Elevation and Error

# Analyze relationship between elevation and forecast error
elevation_analysis <- full_data %>%
  group_by(city, state.x) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE),
            elevation = mean(elevation, na.rm = TRUE))

# plot
plot_elevation <- ggplot(elevation_analysis, aes(x = elevation, y = avg_error)) +
  geom_point(color = "darkblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Relationship Between Elevation and Forecast Error",
       x = "Elevation (meters)", y = "Average Temperature Error (°F)") +
  theme_minimal()

plot_elevation

Insight:

Higher elevation is associated with larger forecast errors, possibly due to complex atmospheric conditions at higher altitudes.

Error Over Time

# Analyze average error over time
error_over_time <- full_data %>%
  group_by(date) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE))

# Plot
ggplot(error_over_time, aes(x = date, y = avg_error)) +
  geom_line(color = "steelblue", size = 1) +
  labs(title = "Forecast Accuracy Over Time",
       x = "Date", y = "Average Temperature Error (°F)") +
  theme_minimal()

Insight:

The plot shows fluctuations in error over time, with certain periods having higher spikes. These may correspond to extreme weather events or seasonal variability.

Error by Forecast Horizon

# Analyze error by forecast horizon
error_by_horizon <- full_data %>%
  group_by(forecast_hours_before) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE))


#Visualize using boxplot (simpler and faster)
plot_box <- ggplot(full_data, aes(x = factor(forecast_hours_before), y = temp_error)) +
  geom_boxplot(fill = "skyblue", color = "darkblue", outlier.shape = NA) +
  labs(title = "Boxplot of Forecast Errors by Forecast Horizon",
       x = "Forecast Horizon (Hours Before Observation)", 
       y = "Temperature Error (°F)") +
  coord_cartesian(ylim = c(0, 8)) +  
  theme_minimal()

plot_box

Insight:

Forecast errors increase with longer forecast horizons, indicating reduced accuracy for longer-term predictions.

Error by Day of Week (New Element)

# Add a day-of-week column based on the date
full_data <- full_data %>%
  mutate(weekday = wday(date, label = TRUE))

# Compute the average error by day of the week
error_by_weekday <- full_data %>%
  group_by(weekday) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE))

# Plot the average error by weekday
ggplot(error_by_weekday, aes(x = weekday, y = avg_error, fill = weekday)) +
  geom_col() +
  labs(title = "Average Forecast Error by Day of Week",
       x = "Day of Week", y = "Average Temperature Error (°F)") +
  theme_minimal() +
  scale_fill_viridis_d()

Insight:

The forecast error analysis by day of week reveals distinct error patterns across weekdays. These variations may be linked to operational forecasting cycles or model update timings that differ between weekdays and weekends.

Error by Distance to Coast (New Element)

library(ggrepel)

# Compute the average error by distance to coast
coast_analysis <- full_data %>%
  group_by(city, state.x) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE),
            dist_coast = mean(distance_to_coast, na.rm = TRUE))

# Plot the relationship
ggplot(coast_analysis, aes(x = dist_coast, y = avg_error, 
                           label = paste(city, state.x, sep = ", "))) +
  geom_point(aes(color = avg_error), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue", linetype = "dashed") +
  geom_text_repel(size = 3, max.overlaps = 10) +
  scale_color_gradient(low = "lightblue", high = "darkred") +
  labs(title = "Relationship Between Distance to Coast and Forecast Error",
       x = "Distance to Coast (miles)",
       y = "Average Forecast Error (°F)",
       color = "Avg Error",
       size = "Avg Error") +
  theme_minimal() +
  theme(legend.position = "bottom")

Insight:

Cities farther inland may experience larger forecast errors (or vice versa), as seen in the positive slope. This shows the role that coastal influences can have on local weather patterns.

Analyzing Climate Factors

# Group data by climate classification and calculate error
error_by_climate <- full_data %>%
  group_by(koppen) %>%
  summarize(avg_error = mean(temp_error, na.rm = TRUE)) %>%
  arrange(desc(avg_error))

# Plot
ggplot(error_by_climate, aes(x = reorder(koppen, avg_error), y = avg_error, fill = avg_error)) +
  geom_col() +
  scale_fill_viridis(option = "C") +
  labs(title = "Average Forecast Error by Climate Classification",
       x = "Climate Type", y = "Average Error (°F)", fill = "Error") +
  theme_minimal() +
  coord_flip()

Insight:

Different climate classifications exhibit varying forecast errors. Areas with rapidly changing weather conditions tend to have larger errors, highlighting the challenge in forecasting for dynamic climates.

Conclusion

This analysis demonstrates that forecast errors are influenced by several factors:

These insights can help inform improvements in forecasting models, particularly in regions with complex geographic or climatic challenges. By addressing grouping inconsistencies and exploring additional factors like distance to the coast, we gain a more comprehensive understanding of where and why forecast errors occur.