library(tidyverse)
library(patchwork)
library(readr)
# NOTE(review): `q` is never read in the visible part of this script; confirm it is still needed.
q <- 0
# Forecast vs. observed temperature records, keyed by date, city and state.
weather <- read_csv(file = "data/weather_forecasts.csv")
## Rows: 651968 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): city, state, high_or_low, forecast_outlook, possible_error
## dbl (4): forecast_hours_before, observed_temp, forecast_temp, observed_precip
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Lookup table mapping each forecast_outlook code to its meaning.
outlook <- read_csv(file = "data/outlook_meanings.csv")
## Rows: 23 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): forecast_outlook, meaning
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-city metadata: coordinates, climate class, elevation, coast distance, wind, precipitation.
forecast <- read_csv(file = "data/forecast_cities.csv")
## Rows: 236 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): city, state, koppen
## dbl (8): lon, lat, elevation, distance_to_coast, wind, elevation_change_four...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the city metadata.
forecast |> head()
## # A tibble: 6 × 11
## city state lon lat koppen elevation distance_to_coast wind
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 ABILENE TX -99.7 32.4 Cfa 545. 329. 4.43
## 2 AKRON_CANTON OH -81.4 40.9 Dfa 370. 330. 4.28
## 3 ALBANY NY -73.8 42.8 Dfb 85.0 99.8 3.28
## 4 ALBUQUERQUE NM -107. 35.0 BSk 1620. 519. 3.04
## 5 ALLENTOWN PA -75.5 40.6 Dfa 118. 48.4 3.34
## 6 AMARILLO TX -102. 35.2 BSk 1098. 568. 5.14
## # ℹ 3 more variables: elevation_change_four <dbl>,
## # elevation_change_eight <dbl>, avg_annual_precip <dbl>
# Preview the first rows of the forecast/observation records.
weather |> head()
## # A tibble: 6 × 10
## date city state high_or_low forecast_hours_before observed_temp
## <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 2021-01-30 ABILENE TX high 48 70
## 2 2021-01-30 ABILENE TX high 36 70
## 3 2021-01-30 ABILENE TX high 24 70
## 4 2021-01-30 ABILENE TX high 12 70
## 5 2021-01-30 ABILENE TX low 48 42
## 6 2021-01-30 ABILENE TX low 36 42
## # ℹ 4 more variables: forecast_temp <dbl>, observed_precip <dbl>,
## # forecast_outlook <chr>, possible_error <chr>
# Absolute forecast miss for every record, spread into one column per
# temperature type: the values of `high_or_low` become the columns
# `error_high` and `error_low`.
weather_error <- weather |>
  mutate(temp_error = abs(forecast_temp - observed_temp)) |>
  pivot_wider(
    names_from = high_or_low,
    values_from = temp_error,
    names_prefix = "error_"
  )
# Average absolute high/low temperature error per station.
# A city name is not guaranteed to be unique on its own -- both input tables
# carry a `state` column -- so every grouping and join below is keyed on
# city + state. Keying on city alone would pool same-named cities from
# different states into one average and duplicate rows in the metadata join.
error_city <- weather_error |>
  group_by(city, state) |>
  summarise(
    avg_error_high = mean(error_high, na.rm = TRUE),
    avg_error_low = mean(error_low, na.rm = TRUE),
    .groups = "drop"
  )

# Station-level means of every numeric column, with the average errors
# attached. Summarising first keeps the join at one row per station instead
# of joining onto every raw forecast record and averaging afterwards.
forecast_data <- weather_error |>
  group_by(city, state) |>
  summarise(
    across(where(is.numeric), \(x) mean(x, na.rm = TRUE)),
    .groups = "drop"
  ) |>
  left_join(error_city, by = c("city", "state"))
head(forecast_data)

# Narrow the city metadata down to the variables used in the plots.
forecast <- forecast |>
  select(
    city, lon, lat, state, elevation,
    distance_to_coast, wind, avg_annual_precip
  )

# Attach the station metadata to the station-level error summary.
forecast_total <- forecast_data |>
  left_join(forecast, by = c("city", "state"))

# Keep only stations with usable wind and high-temperature error values so the
# plots are clean. is.finite() is FALSE for NA, NaN and +/-Inf, so it covers
# the missing-value check as well.
forecast_total <- forecast_total |>
  filter(is.finite(wind), is.finite(avg_error_high))
# Map of average high-temperature forecast error: one point per row of
# forecast_total at its longitude/latitude, sized and coloured by the error.
ggplot(
  data = forecast_total,
  mapping = aes(x = lon, y = lat, size = avg_error_high, color = avg_error_high)
) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Geographic High Viz of Forecast Errors",
    x = "Longitude",
    y = "Latitude",
    size = "Avg High Temp Error",
    color = "Avg High Temp Error"
  )
# Map of average low-temperature forecast error: one point per row of
# forecast_total at its longitude/latitude, sized and coloured by the error.
# The title names the low-temperature series so this map can be told apart
# from the high-temperature map.
ggplot(
  forecast_total,
  aes(x = lon, y = lat, size = avg_error_low, color = avg_error_low)
) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Geographic Low Viz of Forecast Errors",
    x = "Longitude",
    y = "Latitude",
    size = "Avg Low Temp Error",
    color = "Avg Low Temp Error"
  ) +
  theme_minimal()
# Scatter plot of one predictor column against one average-error column, with
# a straight-line (lm) fit drawn on top and no confidence band.
#
# data        data frame holding both columns
# x_var       name of the predictor column, as a string
# y_var       name of the error column, as a string
# title       plot title
# x_lab       x-axis label
# y_lab       y-axis label
# point_color colour of the scatter points
# line_color  colour of the fitted line
#
# Returns the ggplot object; calling it at top level prints the plot.
plot_error_vs <- function(data, x_var, y_var, title, x_lab, y_lab,
                          point_color, line_color = "red") {
  # .data[[name]] lets the column be chosen by a string instead of a bare name.
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point(alpha = 0.5, color = point_color) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = line_color) +
    labs(title = title, x = x_lab, y = y_lab) +
    theme_minimal()
}

# Elevation vs. forecast error
plot_error_vs(
  forecast_total, "elevation", "avg_error_high",
  title = "Effect of Elevation on High Temp Forecast Error",
  x_lab = "Elevation (m)",
  y_lab = "Average High Temp Error",
  point_color = "blue"
)
plot_error_vs(
  forecast_total, "elevation", "avg_error_low",
  title = "Effect of Elevation on Low Temp Forecast Error",
  x_lab = "Elevation (m)",
  y_lab = "Average Low Temp Error",
  point_color = "blue"
)

# Distance to coast vs. forecast error
plot_error_vs(
  forecast_total, "distance_to_coast", "avg_error_high",
  title = "Effect of Distance to Coast on High Temp Forecast Error",
  x_lab = "Distance to Coast (miles)",
  y_lab = "Average High Temp Error",
  point_color = "darkgreen"
)
plot_error_vs(
  forecast_total, "distance_to_coast", "avg_error_low",
  title = "Effect of Distance to Coast on Low Temp Forecast Error",
  x_lab = "Distance to Coast (miles)",
  y_lab = "Average Low Temp Error",
  point_color = "darkgreen"
)

# Wind speed vs. forecast error
plot_error_vs(
  forecast_total, "wind", "avg_error_high",
  title = "Effect of Wind Speed on High Temp Forecast Error",
  x_lab = "Wind Speed (mph)",
  y_lab = "Average High Temp Error",
  point_color = "purple"
)
plot_error_vs(
  forecast_total, "wind", "avg_error_low",
  title = "Effect of Wind Speed on Low Temp Forecast Error",
  x_lab = "Wind Speed (mph)",
  y_lab = "Average Low Temp Error",
  point_color = "purple"
)

# Average annual precipitation vs. forecast error
plot_error_vs(
  forecast_total, "avg_annual_precip", "avg_error_high",
  title = "Effect of Average Annual Precipitation on High Temp Forecast Error",
  x_lab = "Average Annual Precipitation ",
  y_lab = "Average High Temp Error",
  point_color = "black",
  line_color = "orange"
)
plot_error_vs(
  forecast_total, "avg_annual_precip", "avg_error_low",
  title = "Effect of Average Annual Precipitation on Low Temp Forecast Error",
  x_lab = "Average Annual Precipitation ",
  y_lab = "Average Low Temp Error",
  point_color = "black",
  line_color = "orange"
)
In this analysis, we examined weather forecast errors across multiple cities, focusing on discrepancies between predicted and observed temperatures. By calculating the average high- and low-temperature errors for each city, we aimed to assess forecast reliability and identify potential patterns.
Our findings indicate that forecast accuracy varies significantly by location. Some cities consistently exhibit lower error margins, suggesting more stable weather conditions or better-calibrated forecasting models. In contrast, other cities show greater discrepancies, potentially due to unpredictable local weather patterns or limitations in predictive models.
To better understand the relationship between weather conditions and forecast errors, we visualized the data using various plots. One key visualization analyzed the impact of wind speed on high-temperature forecast errors. A scatter plot with a linear regression trend line indicated a slight positive correlation, suggesting that as wind speed increases, forecast errors for high temperatures tend to rise. This could be attributed to the complex influence of wind on local temperatures, as stronger winds often cause rapid fluctuations that may not be fully captured by forecasting models.
Another visualization examined the distribution of forecast errors across cities. The data revealed notable variations, with some cities consistently experiencing lower forecast errors, while others showed significantly higher discrepancies. This suggests that regional factors—such as geographical location, altitude, and proximity to the coast—play a role in forecast accuracy. Additionally, the distribution of high- and low-temperature errors appeared asymmetric, indicating that forecasting models may perform better under certain conditions than others.