This project involves data preprocessing, visualization, and
regression modeling using weather data from JFK Airport.
We work step by step through the following tasks:
1. Load and clean the data
2. Handle missing values
3. Explore distributions and outliers
4. Build simple and multiple linear regression models
5. Compare models using RMSE and R²
6. Visualize predicted vs. actual values
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.0 ──
## ✔ broom 1.0.9 ✔ recipes 1.3.1
## ✔ dials 1.4.2 ✔ rsample 1.3.1
## ✔ dplyr 1.1.4 ✔ tibble 3.3.0
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ purrr 1.1.0 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rlang)
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
jfk_weather <- read.csv("C:/Users/MODEL23/Documents/jfk_weather_sample.csv")
head(jfk_weather)
## DATE HOURLYDewPointTempF HOURLYRelativeHumidity
## 1 2015-07-25T13:51:00Z 60 46
## 2 2016-11-18T23:51:00Z 34 48
## 3 2013-01-06T08:51:00Z 33 89
## 4 2011-01-27T16:51:00Z 18 48
## 5 2015-01-03T12:16:00Z 27 61
## 6 2013-02-15T20:51:00Z 35 79
## HOURLYDRYBULBTEMPF HOURLYWETBULBTEMPF HOURLYPrecip HOURLYWindSpeed
## 1 83 68 0.00 13
## 2 53 44 0.00 6
## 3 36 35 0.00 13
## 4 36 30 0.00 14
## 5 39 34 T 11
## 6 41 38 0.00 6
## HOURLYSeaLevelPressure HOURLYStationPressure
## 1 30.01 29.99
## 2 30.05 30.03
## 3 30.14 30.12
## 4 29.82 29.80
## 5 NA 30.50
## 6 29.94 29.92
lax_to_jfk <- read.csv("C:/Users/MODEL23/Documents/lax_to_jfk.csv")
head(lax_to_jfk)
## Month DayOfWeek FlightDate Reporting_Airline Origin Dest CRSDepTime
## 1 3 5 2003-03-28 UA LAX JFK 2210
## 2 11 4 2018-11-29 AS LAX JFK 1045
## 3 8 5 2015-08-28 UA LAX JFK 805
## 4 4 7 2003-04-20 DL LAX JFK 2205
## 5 11 3 2005-11-30 UA LAX JFK 840
## 6 4 1 1992-04-06 UA LAX JFK 1450
## CRSArrTime DepTime ArrTime ArrDelay ArrDelayMinutes CarrierDelay WeatherDelay
## 1 615 2209 617 2 2 NA NA
## 2 1912 1049 1851 -21 0 NA NA
## 3 1634 757 1620 -14 0 NA NA
## 4 619 2212 616 -3 0 NA NA
## 5 1653 836 1640 -13 0 NA NA
## 6 2308 1452 2248 -20 0 NA NA
## NASDelay SecurityDelay LateAircraftDelay DepDelay DepDelayMinutes DivDistance
## 1 NA NA NA -1 0 NA
## 2 NA NA NA 4 4 NA
## 3 NA NA NA -8 0 NA
## 4 NA NA NA 7 7 NA
## 5 NA NA NA -4 0 NA
## 6 NA NA NA 2 2 NA
## DivArrDelay
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
glimpse(jfk_weather)
## Rows: 5,727
## Columns: 9
## $ DATE <chr> "2015-07-25T13:51:00Z", "2016-11-18T23:51:00Z",…
## $ HOURLYDewPointTempF <chr> "60", "34", "33", "18", "27", "35", "4", "14", …
## $ HOURLYRelativeHumidity <int> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <int> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYWETBULBTEMPF <int> 68, 44, 35, 30, 34, 38, 15, 21, 52, 72, 78, 35,…
## $ HOURLYPrecip <chr> "0.00", "0.00", "0.00", "0.00", "T", "0.00", "0…
## $ HOURLYWindSpeed <int> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYSeaLevelPressure <dbl> 30.01, 30.05, 30.14, 29.82, NA, 29.94, 30.42, 3…
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
Note: I loaded the JFK weather dataset (along with a LAX-to-JFK flight dataset) and inspected the first few rows. This confirms the weather data contains variables such as humidity, temperature, wind speed, precipitation, and station pressure.
new_df <- jfk_weather %>%
select(HOURLYRelativeHumidity, HOURLYDRYBULBTEMPF, HOURLYPrecip, HOURLYWindSpeed, HOURLYStationPressure)
head(new_df, 10)
## HOURLYRelativeHumidity HOURLYDRYBULBTEMPF HOURLYPrecip HOURLYWindSpeed
## 1 46 83 0.00 13
## 2 48 53 0.00 6
## 3 89 36 0.00 13
## 4 48 36 0.00 14
## 5 61 39 T 11
## 6 79 41 0.00 6
## 7 51 19 0.00 0
## 8 65 24 0.00 11
## 9 90 54 0.06 11
## 10 94 73 <NA> 5
## HOURLYStationPressure
## 1 29.99
## 2 30.03
## 3 30.12
## 4 29.80
## 5 30.50
## 6 29.92
## 7 30.40
## 8 30.35
## 9 30.03
## 10 29.91
unique(new_df$HOURLYPrecip)
## [1] "0.00" "T" "0.06" NA "0.03" "0.02" "0.08" "0.01" "0.07"
## [10] "0.16" "0.09" "0.22" "0.02s" "0.24" "0.18" "0.05" "0.04" "0.09s"
## [19] "0.11" "0.14" "0.25" "0.10" "0.01s" "0.58" "0.12" "0.13" "0.46"
## [28] "1.07" "1.19" "0.34" "0.20" "0.36s" "0.42" "0.17" "0.27" "0.35"
## [37] "0.31" "0.33" "0.23" "0.26" "0.28" "0.75" "0.19" "0.36" "0.03s"
## [46] "0.07s" "0.54" "0.59" "0.21"
Note: I selected only the variables needed for modeling. I also checked the precipitation values and noticed special entries, such as "T" for trace amounts and readings flagged with a trailing "s", that need cleaning.
modified_df <- new_df %>%
mutate(
HOURLYPrecip = str_replace(HOURLYPrecip, "T", "0.0"),
HOURLYPrecip = str_remove(HOURLYPrecip, "s$"),
HOURLYPrecip = as.numeric(HOURLYPrecip)
)
unique(modified_df$HOURLYPrecip)
## [1] 0.00 0.06 NA 0.03 0.02 0.08 0.01 0.07 0.16 0.09 0.22 0.24 0.18 0.05 0.04
## [16] 0.11 0.14 0.25 0.10 0.58 0.12 0.13 0.46 1.07 1.19 0.34 0.20 0.36 0.42 0.17
## [31] 0.27 0.35 0.31 0.33 0.23 0.26 0.28 0.75 0.19 0.54 0.59 0.21
glimpse(modified_df)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <int> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <int> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ HOURLYWindSpeed <int> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
Note: I cleaned the precipitation column by converting "T" (trace) to 0.0, stripping the trailing "s" flag, and converting the result to numeric. This makes it usable for analysis and modeling.
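As an optional sanity check (output not shown here), as.numeric() turns any string it cannot parse into NA, so the cleaned column should have exactly the same NA count as the raw one:
# Optional check: the numeric conversion should not introduce new NAs
sum(is.na(modified_df$HOURLYPrecip)) == sum(is.na(new_df$HOURLYPrecip))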
# The cleaning above already produced the final precipitation values;
# keep them under the name used in the remaining steps
cleaned_df <- modified_df
glimpse(cleaned_df)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <int> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <int> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ HOURLYWindSpeed <int> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
Note: I confirmed that the cleaned dataset is now consistent and precipitation is properly numeric. This prepares the data for further steps.
final_df <- cleaned_df %>%
rename(
relative_humidity = HOURLYRelativeHumidity,
dry_bulb_temp_f = HOURLYDRYBULBTEMPF,
precip = HOURLYPrecip,
wind_speed = HOURLYWindSpeed,
station_pressure = HOURLYStationPressure
)
head(final_df,10)
## relative_humidity dry_bulb_temp_f precip wind_speed station_pressure
## 1 46 83 0.00 13 29.99
## 2 48 53 0.00 6 30.03
## 3 89 36 0.00 13 30.12
## 4 48 36 0.00 14 29.80
## 5 61 39 0.00 11 30.50
## 6 79 41 0.00 6 29.92
## 7 51 19 0.00 0 30.40
## 8 65 24 0.00 11 30.35
## 9 90 54 0.06 11 30.03
## 10 94 73 NA 5 29.91
Note: Column names were simplified into shorter, more readable names. This improves clarity when working with the dataset.
sum(is.na(final_df))
## [1] 2276
replace_na_df <- final_df %>%
mutate(across(c(relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure),
~as.numeric(.) %>% replace_na(mean(., na.rm = TRUE))))
Note: Missing values were handled by replacing them with the column mean. This ensures no gaps in the data that could affect model training.
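As an optional follow-up (output not shown), the overall NA count should now be zero:
sum(is.na(replace_na_df))  # expected: 0 after mean imputation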
set.seed(1234)
train_indices <- sample(1:nrow(replace_na_df), size = 0.8 * nrow(replace_na_df))
training_data <- replace_na_df[train_indices, ]
testing_data <- replace_na_df[-train_indices, ]
Note: The dataset was split into training (80%) and testing (20%) sets. This allows us to train models on one portion and evaluate them on unseen data for reliability.
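Since tidymodels is loaded, the same 80/20 split could also be expressed with rsample; this is only an illustrative sketch, and the base-R split above is the one the models actually use:
# Illustrative alternative using rsample (not used downstream)
set.seed(1234)
weather_split <- initial_split(replace_na_df, prop = 0.8)
training_alt <- training(weather_split)
testing_alt <- testing(weather_split)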
# Histograms
training_data %>%
pivot_longer(cols = c(relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure),
names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value, fill = variable)) +
geom_histogram(bins = 30, alpha = 0.6) +
facet_wrap(~variable, scales = "free") +
theme_minimal()
# Boxplots
training_data %>%
pivot_longer(cols = c(relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure),
names_to = "variable", values_to = "value") %>%
ggplot(aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
theme_minimal()
Note: Histograms show the distribution of each variable, while boxplots highlight potential outliers. For example, precipitation values are skewed with many zeros, which is common in weather data.
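To put numbers on that skew, an optional check (output not shown) looks at how many training hours have exactly zero precipitation and how thin the upper tail is:
# Share of hours with no recorded precipitation, plus the upper quantiles
mean(training_data$precip == 0)
quantile(training_data$precip, c(0.50, 0.90, 0.99))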
# Single predictor models
model1 <- lm(precip ~ relative_humidity, data = training_data)
model2 <- lm(precip ~ dry_bulb_temp_f, data = training_data)
model3 <- lm(precip ~ wind_speed, data = training_data)
model4 <- lm(precip ~ station_pressure, data = training_data)
# Summaries
summary(model1)
##
## Call:
## lm(formula = precip ~ relative_humidity, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01616 -0.00802 -0.00366 0.00069 1.17500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.290e-02 1.877e-03 -6.87 7.26e-12 ***
## relative_humidity 2.906e-04 2.691e-05 10.80 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.036 on 4579 degrees of freedom
## Multiple R-squared: 0.02483, Adjusted R-squared: 0.02462
## F-statistic: 116.6 on 1 and 4579 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = precip ~ dry_bulb_temp_f, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.00758 -0.00672 -0.00617 -0.00012 1.18316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.216e-03 1.860e-03 2.804 0.00506 **
## dry_bulb_temp_f 2.388e-05 3.205e-05 0.745 0.45626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03646 on 4579 degrees of freedom
## Multiple R-squared: 0.0001212, Adjusted R-squared: -9.714e-05
## F-statistic: 0.5551 on 1 and 4579 DF, p-value: 0.4563
summary(model3)
##
## Call:
## lm(formula = precip ~ wind_speed, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01244 -0.00649 -0.00541 0.00000 1.18053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.521e-03 1.112e-03 3.168 0.00155 **
## wind_speed 2.703e-04 8.701e-05 3.106 0.00191 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03642 on 4579 degrees of freedom
## Multiple R-squared: 0.002103, Adjusted R-squared: 0.001885
## F-statistic: 9.649 on 1 and 4579 DF, p-value: 0.001907
summary(model4)
##
## Call:
## lm(formula = precip ~ station_pressure, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.02058 -0.00736 -0.00406 0.00004 1.17851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.501903 0.068231 7.356 2.23e-13 ***
## station_pressure -0.016518 0.002275 -7.260 4.51e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03625 on 4579 degrees of freedom
## Multiple R-squared: 0.01138, Adjusted R-squared: 0.01116
## F-statistic: 52.71 on 1 and 4579 DF, p-value: 4.515e-13
# Multiple regression
model_multi <- lm(precip ~ relative_humidity + dry_bulb_temp_f + wind_speed + station_pressure,
data = training_data)
summary(model_multi)
##
## Call:
## lm(formula = precip ~ relative_humidity + dry_bulb_temp_f + wind_speed +
## station_pressure, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03033 -0.00816 -0.00341 0.00121 1.16781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.017e-01 7.756e-02 2.601 0.00933 **
## relative_humidity 3.033e-04 2.930e-05 10.352 < 2e-16 ***
## dry_bulb_temp_f -4.532e-06 3.269e-05 -0.139 0.88973
## wind_speed 4.321e-04 9.641e-05 4.482 7.58e-06 ***
## station_pressure -7.338e-03 2.536e-03 -2.893 0.00384 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03584 on 4576 degrees of freedom
## Multiple R-squared: 0.03431, Adjusted R-squared: 0.03347
## F-statistic: 40.65 on 4 and 4576 DF, p-value: < 2.2e-16
# Polynomial regression
model_poly <- lm(precip ~ poly(dry_bulb_temp_f, 2) + relative_humidity + wind_speed + station_pressure,
data = training_data)
summary(model_poly)
##
## Call:
## lm(formula = precip ~ poly(dry_bulb_temp_f, 2) + relative_humidity +
## wind_speed + station_pressure, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03022 -0.00817 -0.00343 0.00119 1.16782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.014e-01 7.716e-02 2.611 0.00906 **
## poly(dry_bulb_temp_f, 2)1 -5.454e-03 3.721e-02 -0.147 0.88348
## poly(dry_bulb_temp_f, 2)2 9.101e-03 3.718e-02 0.245 0.80663
## relative_humidity 3.050e-04 3.012e-05 10.128 < 2e-16 ***
## wind_speed 4.310e-04 9.652e-05 4.465 8.18e-06 ***
## station_pressure -7.340e-03 2.537e-03 -2.894 0.00383 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03584 on 4575 degrees of freedom
## Multiple R-squared: 0.03433, Adjusted R-squared: 0.03327
## F-statistic: 32.53 on 5 and 4575 DF, p-value: < 2.2e-16
Note: I built simple linear regressions with one predictor at a time, plus a multiple regression and a polynomial regression. The summaries show that relative humidity is the strongest single predictor (R² ≈ 0.025), dry-bulb temperature alone is not significant (p ≈ 0.46), and even the multiple regression explains only about 3.4% of the variance in precipitation.
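For a side-by-side view of in-sample fit, broom::glance() (loaded with tidymodels) can collect the headline statistics from every model; this is an optional sketch rather than part of the modeling itself:
# Collect R², adjusted R², and residual standard error for all fitted models
list(humidity = model1, temp = model2, wind = model3,
     pressure = model4, multi = model_multi, poly = model_poly) %>%
  map_dfr(glance, .id = "model") %>%
  select(model, r.squared, adj.r.squared, sigma)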
model_metrics <- function(model, train, test){
train_pred <- predict(model, train)
test_pred <- predict(model, test)
tibble(
Train_RMSE = sqrt(mean((train$precip - train_pred)^2)),
Test_RMSE = sqrt(mean((test$precip - test_pred)^2)),
Train_R2 = cor(train$precip, train_pred)^2,
Test_R2 = cor(test$precip, test_pred)^2
)
}
results <- bind_rows(
model_metrics(model1, training_data, testing_data) %>% mutate(Model = "Model 1 (Humidity)"),
model_metrics(model2, training_data, testing_data) %>% mutate(Model = "Model 2 (Temp)"),
model_metrics(model3, training_data, testing_data) %>% mutate(Model = "Model 3 (Wind)"),
model_metrics(model4, training_data, testing_data) %>% mutate(Model = "Model 4 (Pressure)"),
model_metrics(model_multi, training_data, testing_data) %>% mutate(Model = "Multiple Linear Regression"),
model_metrics(model_poly, training_data, testing_data) %>% mutate(Model = "Polynomial Model")
)
results
## # A tibble: 6 × 5
## Train_RMSE Test_RMSE Train_R2 Test_R2 Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0.0360 0.0286 0.0248 0.0435 Model 1 (Humidity)
## 2 0.0364 0.0292 0.000121 0.0000200 Model 2 (Temp)
## 3 0.0364 0.0290 0.00210 0.0283 Model 3 (Wind)
## 4 0.0362 0.0289 0.0114 0.0246 Model 4 (Pressure)
## 5 0.0358 0.0280 0.0343 0.0842 Multiple Linear Regression
## 6 0.0358 0.0280 0.0343 0.0841 Polynomial Model
Note: The comparison table reports RMSE (prediction error) and R² (variance explained) on both the training and test sets. The best model is the one with the lowest Test RMSE and highest Test R²; here the multiple linear regression narrowly edges out the polynomial model, and both clearly outperform the single-predictor models.
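The same test-set metrics can also be reproduced with yardstick (part of tidymodels); a sketch for the multiple regression, which should agree with the table above:
# Test-set metrics via yardstick for the multiple regression
test_eval <- testing_data %>%
  mutate(pred = predict(model_multi, testing_data))
rmse(test_eval, truth = precip, estimate = pred)  # test RMSE
rsq(test_eval, truth = precip, estimate = pred)   # test R² (squared correlation)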
# Choose the best model (from results table)
best_model <- model_multi
# Predictions
test_pred <- predict(best_model, testing_data)
# Combine with actual
comparison <- tibble(
Actual = testing_data$precip,
Predicted = test_pred
)
# Scatter plot
ggplot(comparison, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.6, color = "blue") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Predicted vs. Actual Precipitation",
x = "Actual Precipitation",
y = "Predicted Precipitation")
Note: The scatter plot compares predicted precipitation with actual values. If the model were perfect, all points would fall on the red dashed line. Here, the points are widely spread, and with a test R² of roughly 0.08 the model captures the broad trend but misses most of the variation.
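One additional diagnostic worth checking (an optional sketch): ordinary least squares can predict negative precipitation, which is physically impossible for a non-negative, zero-heavy target.
# How often, and how far, does the model predict negative precipitation?
comparison %>%
  summarise(prop_negative = mean(Predicted < 0),
            min_predicted = min(Predicted))
A noticeable share of negative predictions would be a further sign that a plain linear model is only a rough approximation for this variable.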
Based on the comparison, the multiple linear regression model performed best, with the lowest test RMSE and the highest test R². Even so, hourly precipitation remains hard to predict from these predictors alone: most observations are zero and the rest are highly variable, so none of the models explains more than a small fraction of the variance.