library(tidyverse)
library(patchwork)
library(readr)
# Scalar initialised to zero; nothing in the visible part of this script reads it.
q <- 0
# Load the forecast-vs-observation records from the project's data folder.
weather <- "data/weather_forecasts.csv" |>
  read_csv()
## Rows: 651968 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): city, state, high_or_low, forecast_outlook, possible_error
## dbl  (4): forecast_hours_before, observed_temp, forecast_temp, observed_precip
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Lookup table pairing each forecast_outlook code with its meaning.
outlook <- "data/outlook_meanings.csv" |>
  read_csv()
## Rows: 23 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): forecast_outlook, meaning
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-city site characteristics (coordinates, elevation, wind, precipitation, ...).
forecast <- "data/forecast_cities.csv" |>
  read_csv()
## Rows: 236 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): city, state, koppen
## dbl (8): lon, lat, elevation, distance_to_coast, wind, elevation_change_four...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the city table.
forecast |> head()
## # A tibble: 6 × 11
##   city         state    lon   lat koppen elevation distance_to_coast  wind
##   <chr>        <chr>  <dbl> <dbl> <chr>      <dbl>             <dbl> <dbl>
## 1 ABILENE      TX     -99.7  32.4 Cfa        545.              329.   4.43
## 2 AKRON_CANTON OH     -81.4  40.9 Dfa        370.              330.   4.28
## 3 ALBANY       NY     -73.8  42.8 Dfb         85.0              99.8  3.28
## 4 ALBUQUERQUE  NM    -107.   35.0 BSk       1620.              519.   3.04
## 5 ALLENTOWN    PA     -75.5  40.6 Dfa        118.               48.4  3.34
## 6 AMARILLO     TX    -102.   35.2 BSk       1098.              568.   5.14
## # ℹ 3 more variables: elevation_change_four <dbl>,
## #   elevation_change_eight <dbl>, avg_annual_precip <dbl>
# Preview the first rows of the forecast/observation table.
weather |> head()
## # A tibble: 6 × 10
##   date       city    state high_or_low forecast_hours_before observed_temp
##   <date>     <chr>   <chr> <chr>                       <dbl>         <dbl>
## 1 2021-01-30 ABILENE TX    high                           48            70
## 2 2021-01-30 ABILENE TX    high                           36            70
## 3 2021-01-30 ABILENE TX    high                           24            70
## 4 2021-01-30 ABILENE TX    high                           12            70
## 5 2021-01-30 ABILENE TX    low                            48            42
## 6 2021-01-30 ABILENE TX    low                            36            42
## # ℹ 4 more variables: forecast_temp <dbl>, observed_precip <dbl>,
## #   forecast_outlook <chr>, possible_error <chr>
# Build the per-city table used by the plots.
#
# Every grouping and join below is keyed on city AND state. City names are not
# guaranteed to be unique across states; keying on `city` alone would pool
# same-named cities into one average and then repeat that pooled row once per
# matching state when the site characteristics are joined.

# Absolute forecast error per record, spread into one column per forecast type
# (error_high / error_low). A row holds a value in the column matching its
# high_or_low entry and NA in the other.
weather_error <- weather |>
  mutate(temp_error = abs(observed_temp - forecast_temp)) |>
  pivot_wider(
    names_from = high_or_low,
    values_from = temp_error,
    names_prefix = "error_"
  )

# Mean absolute error of high and low forecasts for each city/state pair.
error_city <- weather_error |>
  group_by(city, state) |>
  summarise(
    avg_error_high = mean(error_high, na.rm = TRUE),
    avg_error_low = mean(error_low, na.rm = TRUE),
    .groups = "drop"
  )

# Attach the per-city averages, then collapse every numeric column to its
# per-city mean. avg_error_* are constant within a city, so averaging leaves
# them unchanged; the join is kept so forecast_data retains those columns.
forecast_data <- weather_error |>
  left_join(error_city, by = c("city", "state")) |>
  group_by(city, state) |>
  summarise(
    across(where(is.numeric), \(x) mean(x, na.rm = TRUE)),
    .groups = "drop"
  )

# Preview. The table now carries `state` next to `city`, so the printout
# captured before this change no longer applies; re-knit to regenerate it.
head(forecast_data)

# Narrow the city table to the join keys and the site characteristics that
# are plotted.
forecast <- forecast |>
  select(
    city, lon, lat, state, elevation, distance_to_coast, wind,
    avg_annual_precip
  )

# One row per city/state: averaged errors plus site characteristics.
forecast_total <- forecast_data |>
  left_join(forecast, by = c("city", "state"))

# Keep only rows that can be plotted. is.finite() is FALSE for NA, NaN and
# +/-Inf, so no separate is.na() check is needed.
forecast_total <- forecast_total |>
  filter(is.finite(wind), is.finite(avg_error_high))
# Cities placed by longitude/latitude; marker size and colour both encode the
# average high-temperature forecast error.
forecast_total |>
  ggplot(aes(x = lon, y = lat, size = avg_error_high, color = avg_error_high)) +
  geom_point(alpha = 0.6) +
  ggtitle("Geographic High Viz of Forecast Errors") +
  labs(
    x = "Longitude",
    y = "Latitude",
    size = "Avg High Temp Error",
    color = "Avg High Temp Error"
  ) +
  theme_minimal()

# Cities placed by longitude/latitude; marker size and colour both encode the
# average low-temperature forecast error.
# The title previously read "High" (copied from the high-temperature map),
# which mislabelled this figure.
ggplot(
  forecast_total,
  aes(x = lon, y = lat, size = avg_error_low, color = avg_error_low)
) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Geographic Low Viz of Forecast Errors",
    x = "Longitude",
    y = "Latitude",
    size = "Avg Low Temp Error",
    color = "Avg Low Temp Error"
  ) +
  theme_minimal()

# Scatter plot of an average forecast error against one site characteristic,
# with a straight-line least-squares trend (no confidence band) drawn on top.
#
# data         data frame containing both columns
# x_var, y_var column names given as strings
# title        plot title
# x_label      x-axis label
# y_label      y-axis label
# point_color  colour of the scatter points
# trend_color  colour of the fitted line (defaults to "red")
#
# Returns the ggplot object; calling it at top level prints the figure.
plot_error_vs <- function(data, x_var, y_var, title, x_label, y_label,
                          point_color, trend_color = "red") {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point(alpha = 0.5, color = point_color) +
    geom_smooth(
      method = "lm", formula = y ~ x, se = FALSE, color = trend_color
    ) +
    labs(title = title, x = x_label, y = y_label) +
    theme_minimal()
}

# NOTE(review): the units shown in the axis labels below (m, miles, mph) are
# not established anywhere in this script -- confirm against the data
# dictionary.

# Elevation
plot_error_vs(
  forecast_total, "elevation", "avg_error_high",
  title = "Effect of Elevation on High Temp Forecast Error",
  x_label = "Elevation (m)",
  y_label = "Average High Temp Error",
  point_color = "blue"
)

plot_error_vs(
  forecast_total, "elevation", "avg_error_low",
  title = "Effect of Elevation on Low Temp Forecast Error",
  x_label = "Elevation (m)",
  y_label = "Average Low Temp Error",
  point_color = "blue"
)

# Distance to coast
plot_error_vs(
  forecast_total, "distance_to_coast", "avg_error_high",
  title = "Effect of Distance to Coast on High Temp Forecast Error",
  x_label = "Distance to Coast (miles)",
  y_label = "Average High Temp Error",
  point_color = "darkgreen"
)

plot_error_vs(
  forecast_total, "distance_to_coast", "avg_error_low",
  title = "Effect of Distance to Coast on Low Temp Forecast Error",
  x_label = "Distance to Coast (miles)",
  y_label = "Average Low Temp Error",
  point_color = "darkgreen"
)

# Wind speed
plot_error_vs(
  forecast_total, "wind", "avg_error_high",
  title = "Effect of Wind Speed on High Temp Forecast Error",
  x_label = "Wind Speed (mph)",
  y_label = "Average High Temp Error",
  point_color = "purple"
)

plot_error_vs(
  forecast_total, "wind", "avg_error_low",
  title = "Effect of Wind Speed on Low Temp Forecast Error",
  x_label = "Wind Speed (mph)",
  y_label = "Average Low Temp Error",
  point_color = "purple"
)

# Average annual precipitation
plot_error_vs(
  forecast_total, "avg_annual_precip", "avg_error_high",
  title = "Effect of Average Annual Precipitation on High Temp Forecast Error",
  x_label = "Average Annual Precipitation ",
  y_label = "Average High Temp Error",
  point_color = "black",
  trend_color = "orange"
)

plot_error_vs(
  forecast_total, "avg_annual_precip", "avg_error_low",
  title = "Effect of Average Annual Precipitation on Low Temp Forecast Error",
  x_label = "Average Annual Precipitation ",
  y_label = "Average Low Temp Error",
  point_color = "black",
  trend_color = "orange"
)

Write-up

In this analysis, we examined weather forecast errors across multiple cities, focusing on discrepancies between predicted and observed temperatures. By calculating the average high- and low-temperature errors for each city, we aimed to assess forecast reliability and identify potential patterns.

Our findings indicate that forecast accuracy varies significantly by location. Some cities consistently exhibit lower error margins, suggesting more stable weather conditions or better-calibrated forecasting models. In contrast, other cities show greater discrepancies, potentially due to unpredictable local weather patterns or limitations in predictive models.

To better understand the relationship between weather conditions and forecast errors, we visualized the data using various plots. One key visualization analyzed the impact of wind speed on high-temperature forecast errors. A scatter plot with a linear regression trend line indicated a slight positive correlation, suggesting that as wind speed increases, forecast errors for high temperatures tend to rise. This could be attributed to the complex influence of wind on local temperatures, as stronger winds often cause rapid fluctuations that may not be fully captured by forecasting models.

Another visualization examined the distribution of forecast errors across cities. The data revealed notable variations, with some cities consistently experiencing lower forecast errors, while others showed significantly higher discrepancies. This suggests that regional factors—such as geographical location, altitude, and proximity to the coast—play a role in forecast accuracy. Additionally, the distribution of high- and low-temperature errors appeared asymmetric, indicating that forecasting models may perform better under certain conditions than others.