This project involves data preprocessing, visualization, and
regression modeling using weather data from JFK Airport.
We work step by step through the following tasks:
1. Load and clean the data
2. Handle missing values
3. Explore distributions and outliers
4. Build simple and multiple linear regression models
5. Compare models using RMSE and R²
6. Visualize predicted vs. actual values
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.0 ──
## ✔ broom 1.0.9 ✔ recipes 1.3.1
## ✔ dials 1.4.2 ✔ rsample 1.3.1
## ✔ dplyr 1.1.4 ✔ tibble 3.3.0
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ purrr 1.1.0 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rlang)
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
jfk_weather <- read.csv("C:/Users/MODEL23/Documents/jfk_weather_sample.csv")
head(jfk_weather)
## DATE HOURLYDewPointTempF HOURLYRelativeHumidity
## 1 2015-07-25T13:51:00Z 60 46
## 2 2016-11-18T23:51:00Z 34 48
## 3 2013-01-06T08:51:00Z 33 89
## 4 2011-01-27T16:51:00Z 18 48
## 5 2015-01-03T12:16:00Z 27 61
## 6 2013-02-15T20:51:00Z 35 79
## HOURLYDRYBULBTEMPF HOURLYWETBULBTEMPF HOURLYPrecip HOURLYWindSpeed
## 1 83 68 0.00 13
## 2 53 44 0.00 6
## 3 36 35 0.00 13
## 4 36 30 0.00 14
## 5 39 34 T 11
## 6 41 38 0.00 6
## HOURLYSeaLevelPressure HOURLYStationPressure
## 1 30.01 29.99
## 2 30.05 30.03
## 3 30.14 30.12
## 4 29.82 29.80
## 5 NA 30.50
## 6 29.94 29.92
lax_to_jfk <- read.csv("C:/Users/MODEL23/Documents/lax_to_jfk.csv")
head(lax_to_jfk)
## Month DayOfWeek FlightDate Reporting_Airline Origin Dest CRSDepTime
## 1 3 5 2003-03-28 UA LAX JFK 2210
## 2 11 4 2018-11-29 AS LAX JFK 1045
## 3 8 5 2015-08-28 UA LAX JFK 805
## 4 4 7 2003-04-20 DL LAX JFK 2205
## 5 11 3 2005-11-30 UA LAX JFK 840
## 6 4 1 1992-04-06 UA LAX JFK 1450
## CRSArrTime DepTime ArrTime ArrDelay ArrDelayMinutes CarrierDelay WeatherDelay
## 1 615 2209 617 2 2 NA NA
## 2 1912 1049 1851 -21 0 NA NA
## 3 1634 757 1620 -14 0 NA NA
## 4 619 2212 616 -3 0 NA NA
## 5 1653 836 1640 -13 0 NA NA
## 6 2308 1452 2248 -20 0 NA NA
## NASDelay SecurityDelay LateAircraftDelay DepDelay DepDelayMinutes DivDistance
## 1 NA NA NA -1 0 NA
## 2 NA NA NA 4 4 NA
## 3 NA NA NA -8 0 NA
## 4 NA NA NA 7 7 NA
## 5 NA NA NA -4 0 NA
## 6 NA NA NA 2 2 NA
## DivArrDelay
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
glimpse(jfk_weather)
## Rows: 5,727
## Columns: 9
## $ DATE <chr> "2015-07-25T13:51:00Z", "2016-11-18T23:51:00Z",…
## $ HOURLYDewPointTempF <chr> "60", "34", "33", "18", "27", "35", "4", "14", …
## $ HOURLYRelativeHumidity <int> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <int> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYWETBULBTEMPF <int> 68, 44, 35, 30, 34, 38, 15, 21, 52, 72, 78, 35,…
## $ HOURLYPrecip <chr> "0.00", "0.00", "0.00", "0.00", "T", "0.00", "0…
## $ HOURLYWindSpeed <int> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYSeaLevelPressure <dbl> 30.01, 30.05, 30.14, 29.82, NA, 29.94, 30.42, 3…
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
Note: I loaded the JFK weather dataset (along with a LAX-to-JFK flight dataset) and inspected the first few rows. This confirms the weather data contains variables such as humidity, temperature, wind speed, precipitation, and station pressure.
new_df <- jfk_weather %>%
select(HOURLYRelativeHumidity, HOURLYDRYBULBTEMPF, HOURLYPrecip, HOURLYWindSpeed, HOURLYStationPressure)
head(new_df, 10)
## HOURLYRelativeHumidity HOURLYDRYBULBTEMPF HOURLYPrecip HOURLYWindSpeed
## 1 46 83 0.00 13
## 2 48 53 0.00 6
## 3 89 36 0.00 13
## 4 48 36 0.00 14
## 5 61 39 T 11
## 6 79 41 0.00 6
## 7 51 19 0.00 0
## 8 65 24 0.00 11
## 9 90 54 0.06 11
## 10 94 73 <NA> 5
## HOURLYStationPressure
## 1 29.99
## 2 30.03
## 3 30.12
## 4 29.80
## 5 30.50
## 6 29.92
## 7 30.40
## 8 30.35
## 9 30.03
## 10 29.91
unique(new_df$HOURLYPrecip)
## [1] "0.00" "T" "0.06" NA "0.03" "0.02" "0.08" "0.01" "0.07"
## [10] "0.16" "0.09" "0.22" "0.02s" "0.24" "0.18" "0.05" "0.04" "0.09s"
## [19] "0.11" "0.14" "0.25" "0.10" "0.01s" "0.58" "0.12" "0.13" "0.46"
## [28] "1.07" "1.19" "0.34" "0.20" "0.36s" "0.42" "0.17" "0.27" "0.35"
## [37] "0.31" "0.33" "0.23" "0.26" "0.28" "0.75" "0.19" "0.36" "0.03s"
## [46] "0.07s" "0.54" "0.59" "0.21"
Note: I selected only the variables needed for modeling. I also checked the precipitation values and noticed special entries, such as "T" for trace amounts and readings flagged with a trailing "s", that need cleaning.
modified_df <- new_df %>%
mutate(
HOURLYPrecip = str_replace(HOURLYPrecip, "T", "0.0"),
HOURLYPrecip = str_remove(HOURLYPrecip, "s$"),
HOURLYPrecip = as.numeric(HOURLYPrecip)
)
unique(modified_df$HOURLYPrecip)
## [1] 0.00 0.06 NA 0.03 0.02 0.08 0.01 0.07 0.16 0.09 0.22 0.24 0.18 0.05 0.04
## [16] 0.11 0.14 0.25 0.10 0.58 0.12 0.13 0.46 1.07 1.19 0.34 0.20 0.36 0.42 0.17
## [31] 0.27 0.35 0.31 0.33 0.23 0.26 0.28 0.75 0.19 0.54 0.59 0.21
glimpse(modified_df)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <int> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <int> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ HOURLYWindSpeed <int> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
Note: I cleaned the precipitation column by converting "T" (trace) to 0.0, stripping the trailing "s" flag, and converting the result to numeric. This makes it usable for analysis and modeling.
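As an optional sanity check (output not shown here), as.numeric() turns any string it cannot parse into NA, so the cleaned column should have exactly the same NA count as the raw one:
# Optional check: the numeric conversion should not introduce new NAs
sum(is.na(modified_df$HOURLYPrecip)) == sum(is.na(new_df$HOURLYPrecip))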
# The cleaning above already produced the final precipitation values;
# keep them under the name used in the remaining steps
cleaned_df <- modified_df
glimpse(cleaned_df)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <int> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <int> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ HOURLYWindSpeed <int> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
Note: I confirmed that the cleaned dataset is now consistent and precipitation is properly numeric. This prepares the data for further steps.
final_df <- cleaned_df %>%
rename(
relative_humidity = HOURLYRelativeHumidity,
dry_bulb_temp_f = HOURLYDRYBULBTEMPF,
precip = HOURLYPrecip,
wind_speed = HOURLYWindSpeed,
station_pressure = HOURLYStationPressure
)
head(final_df,10)
## relative_humidity dry_bulb_temp_f precip wind_speed station_pressure
## 1 46 83 0.00 13 29.99
## 2 48 53 0.00 6 30.03
## 3 89 36 0.00 13 30.12
## 4 48 36 0.00 14 29.80
## 5 61 39 0.00 11 30.50
## 6 79 41 0.00 6 29.92
## 7 51 19 0.00 0 30.40
## 8 65 24 0.00 11 30.35
## 9 90 54 0.06 11 30.03
## 10 94 73 NA 5 29.91
Note: Column names were simplified into shorter, more readable names. This improves clarity when working with the dataset.
sum(is.na(final_df))
## [1] 2276
replace_na_df <- final_df %>%
mutate(across(c(relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure),
~as.numeric(.) %>% replace_na(mean(., na.rm = TRUE))))
Note: Missing values were handled by replacing them with the column mean. This ensures no gaps in the data that could affect model training.
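As an optional follow-up (output not shown), the overall NA count should now be zero:
sum(is.na(replace_na_df))  # expected: 0 after mean imputation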
set.seed(1234)
train_indices <- sample(1:nrow(replace_na_df), size = 0.8 * nrow(replace_na_df))
training_data <- replace_na_df[train_indices, ]
testing_data <- replace_na_df[-train_indices, ]
Note: The dataset was split into training (80%) and testing (20%) sets. This allows us to train models on one portion and evaluate them on unseen data for reliability.
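Since tidymodels is loaded, the same 80/20 split could also be expressed with rsample; this is only an illustrative sketch, and the base-R split above is the one the models actually use:
# Illustrative alternative using rsample (not used downstream)
set.seed(1234)
weather_split <- initial_split(replace_na_df, prop = 0.8)
training_alt <- training(weather_split)
testing_alt <- testing(weather_split)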
# Histograms
training_data %>%
pivot_longer(cols = c(relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure),
names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value, fill = variable)) +
geom_histogram(bins = 30, alpha = 0.6) +
facet_wrap(~variable, scales = "free") +
theme_minimal()
# Boxplots
training_data %>%
pivot_longer(cols = c(relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure),
names_to = "variable", values_to = "value") %>%
ggplot(aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
theme_minimal()
Note: Histograms show the distribution of each variable, while boxplots highlight potential outliers. For example, precipitation values are skewed with many zeros, which is common in weather data.
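To put numbers on that skew, an optional check (output not shown) looks at how many training hours have exactly zero precipitation and how thin the upper tail is:
# Share of hours with no recorded precipitation, plus the upper quantiles
mean(training_data$precip == 0)
quantile(training_data$precip, c(0.50, 0.90, 0.99))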
# Single predictor models
model1 <- lm(precip ~ relative_humidity, data = training_data)
model2 <- lm(precip ~ dry_bulb_temp_f, data = training_data)
model3 <- lm(precip ~ wind_speed, data = training_data)
model4 <- lm(precip ~ station_pressure, data = training_data)
# Summaries
summary(model1)
##
## Call:
## lm(formula = precip ~ relative_humidity, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01616 -0.00802 -0.00366 0.00069 1.17500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.290e-02 1.877e-03 -6.87 7.26e-12 ***
## relative_humidity 2.906e-04 2.691e-05 10.80 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.036 on 4579 degrees of freedom
## Multiple R-squared: 0.02483, Adjusted R-squared: 0.02462
## F-statistic: 116.6 on 1 and 4579 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = precip ~ dry_bulb_temp_f, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.00758 -0.00672 -0.00617 -0.00012 1.18316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.216e-03 1.860e-03 2.804 0.00506 **
## dry_bulb_temp_f 2.388e-05 3.205e-05 0.745 0.45626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03646 on 4579 degrees of freedom
## Multiple R-squared: 0.0001212, Adjusted R-squared: -9.714e-05
## F-statistic: 0.5551 on 1 and 4579 DF, p-value: 0.4563
summary(model3)
##
## Call:
## lm(formula = precip ~ wind_speed, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01244 -0.00649 -0.00541 0.00000 1.18053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.521e-03 1.112e-03 3.168 0.00155 **
## wind_speed 2.703e-04 8.701e-05 3.106 0.00191 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03642 on 4579 degrees of freedom
## Multiple R-squared: 0.002103, Adjusted R-squared: 0.001885
## F-statistic: 9.649 on 1 and 4579 DF, p-value: 0.001907
summary(model4)
##
## Call:
## lm(formula = precip ~ station_pressure, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.02058 -0.00736 -0.00406 0.00004 1.17851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.501903 0.068231 7.356 2.23e-13 ***
## station_pressure -0.016518 0.002275 -7.260 4.51e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03625 on 4579 degrees of freedom
## Multiple R-squared: 0.01138, Adjusted R-squared: 0.01116
## F-statistic: 52.71 on 1 and 4579 DF, p-value: 4.515e-13
# Multiple regression
model_multi <- lm(precip ~ relative_humidity + dry_bulb_temp_f + wind_speed + station_pressure,
data = training_data)
summary(model_multi)
##
## Call:
## lm(formula = precip ~ relative_humidity + dry_bulb_temp_f + wind_speed +
## station_pressure, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03033 -0.00816 -0.00341 0.00121 1.16781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.017e-01 7.756e-02 2.601 0.00933 **
## relative_humidity 3.033e-04 2.930e-05 10.352 < 2e-16 ***
## dry_bulb_temp_f -4.532e-06 3.269e-05 -0.139 0.88973
## wind_speed 4.321e-04 9.641e-05 4.482 7.58e-06 ***
## station_pressure -7.338e-03 2.536e-03 -2.893 0.00384 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03584 on 4576 degrees of freedom
## Multiple R-squared: 0.03431, Adjusted R-squared: 0.03347
## F-statistic: 40.65 on 4 and 4576 DF, p-value: < 2.2e-16
# Polynomial regression
model_poly <- lm(precip ~ poly(dry_bulb_temp_f, 2) + relative_humidity + wind_speed + station_pressure,
data = training_data)
summary(model_poly)
##
## Call:
## lm(formula = precip ~ poly(dry_bulb_temp_f, 2) + relative_humidity +
## wind_speed + station_pressure, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03022 -0.00817 -0.00343 0.00119 1.16782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.014e-01 7.716e-02 2.611 0.00906 **
## poly(dry_bulb_temp_f, 2)1 -5.454e-03 3.721e-02 -0.147 0.88348
## poly(dry_bulb_temp_f, 2)2 9.101e-03 3.718e-02 0.245 0.80663
## relative_humidity 3.050e-04 3.012e-05 10.128 < 2e-16 ***
## wind_speed 4.310e-04 9.652e-05 4.465 8.18e-06 ***
## station_pressure -7.340e-03 2.537e-03 -2.894 0.00383 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03584 on 4575 degrees of freedom
## Multiple R-squared: 0.03433, Adjusted R-squared: 0.03327
## F-statistic: 32.53 on 5 and 4575 DF, p-value: < 2.2e-16
Note: I built simple linear regressions with one predictor at a time, plus a multiple regression and a polynomial regression. The summaries show that relative humidity is the strongest single predictor (R² ≈ 0.025), dry-bulb temperature alone is not significant (p ≈ 0.46), and even the multiple regression explains only about 3.4% of the variance in precipitation.
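For a side-by-side view of in-sample fit, broom::glance() (loaded with tidymodels) can collect the headline statistics from every model; this is an optional sketch rather than part of the modeling itself:
# Collect R², adjusted R², and residual standard error for all fitted models
list(humidity = model1, temp = model2, wind = model3,
     pressure = model4, multi = model_multi, poly = model_poly) %>%
  map_dfr(glance, .id = "model") %>%
  select(model, r.squared, adj.r.squared, sigma)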
model_metrics <- function(model, train, test){
train_pred <- predict(model, train)
test_pred <- predict(model, test)
tibble(
Train_RMSE = sqrt(mean((train$precip - train_pred)^2)),
Test_RMSE = sqrt(mean((test$precip - test_pred)^2)),
Train_R2 = cor(train$precip, train_pred)^2,
Test_R2 = cor(test$precip, test_pred)^2
)
}
results <- bind_rows(
model_metrics(model1, training_data, testing_data) %>% mutate(Model = "Model 1 (Humidity)"),
model_metrics(model2, training_data, testing_data) %>% mutate(Model = "Model 2 (Temp)"),
model_metrics(model3, training_data, testing_data) %>% mutate(Model = "Model 3 (Wind)"),
model_metrics(model4, training_data, testing_data) %>% mutate(Model = "Model 4 (Pressure)"),
model_metrics(model_multi, training_data, testing_data) %>% mutate(Model = "Multiple Linear Regression"),
model_metrics(model_poly, training_data, testing_data) %>% mutate(Model = "Polynomial Model")
)
results
## # A tibble: 6 × 5
## Train_RMSE Test_RMSE Train_R2 Test_R2 Model
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0.0360 0.0286 0.0248 0.0435 Model 1 (Humidity)
## 2 0.0364 0.0292 0.000121 0.0000200 Model 2 (Temp)
## 3 0.0364 0.0290 0.00210 0.0283 Model 3 (Wind)
## 4 0.0362 0.0289 0.0114 0.0246 Model 4 (Pressure)
## 5 0.0358 0.0280 0.0343 0.0842 Multiple Linear Regression
## 6 0.0358 0.0280 0.0343 0.0841 Polynomial Model
Note: The comparison table reports RMSE (prediction error) and R² (variance explained) on both the training and test sets. The best model is the one with the lowest Test RMSE and highest Test R²; here the multiple linear regression narrowly edges out the polynomial model, and both clearly outperform the single-predictor models.
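The same test-set metrics can also be reproduced with yardstick (part of tidymodels); a sketch for the multiple regression, which should agree with the table above:
# Test-set metrics via yardstick for the multiple regression
test_eval <- testing_data %>%
  mutate(pred = predict(model_multi, testing_data))
rmse(test_eval, truth = precip, estimate = pred)  # test RMSE
rsq(test_eval, truth = precip, estimate = pred)   # test R² (squared correlation)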
# Choose the best model (from results table)
best_model <- model_multi
# Predictions
test_pred <- predict(best_model, testing_data)
# Combine with actual
comparison <- tibble(
Actual = testing_data$precip,
Predicted = test_pred
)
# Scatter plot
ggplot(comparison, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.6, color = "blue") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
theme_minimal() +
labs(title = "Predicted vs. Actual Precipitation",
x = "Actual Precipitation",
y = "Predicted Precipitation")
Note: The scatter plot compares predicted precipitation with actual values. If the model were perfect, all points would fall on the red dashed line. Here, the points are widely spread, and with a test R² of roughly 0.08 the model captures the broad trend but misses most of the variation.
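One additional diagnostic worth checking (an optional sketch): ordinary least squares can predict negative precipitation, which is physically impossible for a non-negative, zero-heavy target.
# How often, and how far, does the model predict negative precipitation?
comparison %>%
  summarise(prop_negative = mean(Predicted < 0),
            min_predicted = min(Predicted))
A noticeable share of negative predictions would be a further sign that a plain linear model is only a rough approximation for this variable.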
Based on the comparison, the multiple linear regression model performed best, with the lowest test RMSE and the highest test R². Even so, hourly precipitation remains hard to predict from these predictors alone: most observations are zero and the rest are highly variable, so none of the models explains more than a small fraction of the variance.