library(tidyverse)
library(tidymodels)
url <- "https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz"
download.file(url, destfile="noaa-weather-sample-data.tar.gz")
untar("noaa-weather-sample-data.tar.gz")
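If the notebook is re-run, the download and extraction can be skipped when the archive is already on disk; a minimal sketch of such a guard, as an optional alternative to the two lines above:
# Only download and extract if the archive is not already present
archive <- "noaa-weather-sample-data.tar.gz"
if (!file.exists(archive)) {
  download.file(url, destfile = archive)
  untar(archive)
}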
jfk_weather <- read_csv("noaa-weather-sample-data/jfk_weather_sample.csv")
## Rows: 5727 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): HOURLYPrecip
## dbl (7): HOURLYDewPointTempF, HOURLYRelativeHumidity, HOURLYDRYBULBTEMPF, H...
## dttm (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(jfk_weather)
## # A tibble: 6 × 9
## DATE HOURLYDewPointTempF HOURLYRelativeHumidity
## <dttm> <dbl> <dbl>
## 1 2015-07-25 13:51:00 60 46
## 2 2016-11-18 23:51:00 34 48
## 3 2013-01-06 08:51:00 33 89
## 4 2011-01-27 16:51:00 18 48
## 5 2015-01-03 12:16:00 27 61
## 6 2013-02-15 20:51:00 35 79
## # ℹ 6 more variables: HOURLYDRYBULBTEMPF <dbl>, HOURLYWETBULBTEMPF <dbl>,
## # HOURLYPrecip <chr>, HOURLYWindSpeed <dbl>, HOURLYSeaLevelPressure <dbl>,
## # HOURLYStationPressure <dbl>
glimpse(jfk_weather)
## Rows: 5,727
## Columns: 9
## $ DATE <dttm> 2015-07-25 13:51:00, 2016-11-18 23:51:00, 2013…
## $ HOURLYDewPointTempF <dbl> 60, 34, 33, 18, 27, 35, 4, 14, 51, 71, 76, 19, …
## $ HOURLYRelativeHumidity <dbl> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <dbl> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYWETBULBTEMPF <dbl> 68, 44, 35, 30, 34, 38, 15, 21, 52, 72, 78, 35,…
## $ HOURLYPrecip <chr> "0.00", "0.00", "0.00", "0.00", "T", "0.00", "0…
## $ HOURLYWindSpeed <dbl> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYSeaLevelPressure <dbl> 30.01, 30.05, 30.14, 29.82, NA, 29.94, 30.42, 3…
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
The key columns that we will explore in this project are:
- HOURLYRelativeHumidity
- HOURLYDRYBULBTEMPF
- HOURLYPrecip
- HOURLYWindSpeed
- HOURLYStationPressure
sub_jfk_weather <- select(jfk_weather, c(HOURLYRelativeHumidity, HOURLYDRYBULBTEMPF, HOURLYPrecip, HOURLYWindSpeed, HOURLYStationPressure))
head(sub_jfk_weather, 10)
## # A tibble: 10 × 5
## HOURLYRelativeHumidity HOURLYDRYBULBTEMPF HOURLYPrecip HOURLYWindSpeed
## <dbl> <dbl> <chr> <dbl>
## 1 46 83 0.00 13
## 2 48 53 0.00 6
## 3 89 36 0.00 13
## 4 48 36 0.00 14
## 5 61 39 T 11
## 6 79 41 0.00 6
## 7 51 19 0.00 0
## 8 65 24 0.00 11
## 9 90 54 0.06 11
## 10 94 73 <NA> 5
## # ℹ 1 more variable: HOURLYStationPressure <dbl>
unique(sub_jfk_weather$HOURLYPrecip)
## [1] "0.00" "T" "0.06" NA "0.03" "0.02" "0.08" "0.01" "0.07"
## [10] "0.16" "0.09" "0.22" "0.02s" "0.24" "0.18" "0.05" "0.04" "0.09s"
## [19] "0.11" "0.14" "0.25" "0.10" "0.01s" "0.58" "0.12" "0.13" "0.46"
## [28] "1.07" "1.19" "0.34" "0.20" "0.36s" "0.42" "0.17" "0.27" "0.35"
## [37] "0.31" "0.33" "0.23" "0.26" "0.28" "0.75" "0.19" "0.36" "0.03s"
## [46] "0.07s" "0.54" "0.59" "0.21"
# "T" denotes trace precipitation, which we treat as zero
sub_jfk_weather2 <- sub_jfk_weather %>% mutate(HOURLYPrecip = ifelse(HOURLYPrecip == "T", "0.0", HOURLYPrecip))
# Some readings carry a trailing "s" flag; strip it so the values can be parsed as numbers
sub_jfk_weather3 <- sub_jfk_weather2 %>% mutate(HOURLYPrecip = str_remove(HOURLYPrecip, pattern = "s$"))
unique(sub_jfk_weather3$HOURLYPrecip)
## [1] "0.00" "0.0" "0.06" NA "0.03" "0.02" "0.08" "0.01" "0.07" "0.16"
## [11] "0.09" "0.22" "0.24" "0.18" "0.05" "0.04" "0.11" "0.14" "0.25" "0.10"
## [21] "0.58" "0.12" "0.13" "0.46" "1.07" "1.19" "0.34" "0.20" "0.36" "0.42"
## [31] "0.17" "0.27" "0.35" "0.31" "0.33" "0.23" "0.26" "0.28" "0.75" "0.19"
## [41] "0.54" "0.59" "0.21"
Convert HOURLYPrecip to the numeric type and store the cleaned dataframe as a new variable.
glimpse(sub_jfk_weather3)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <dbl> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <dbl> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip <chr> "0.00", "0.00", "0.00", "0.00", "0.0", "0.00", …
## $ HOURLYWindSpeed <dbl> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
sub_jfk_weather3$HOURLYPrecip <- as.numeric(sub_jfk_weather3$HOURLYPrecip)
glimpse(sub_jfk_weather3)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <dbl> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF <dbl> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ HOURLYWindSpeed <dbl> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
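The three cleaning steps above (recode the trace value "T", strip the trailing "s" flag, convert to numeric) can also be written as a single pipeline. A sketch that is equivalent to the code above; the name sub_jfk_weather_clean is only illustrative and is not used later:
# One-pass cleanup of HOURLYPrecip: trace values become 0, the "s" flag is dropped,
# and the column is parsed as numeric (equivalent to the separate steps above)
sub_jfk_weather_clean <- sub_jfk_weather %>%
  mutate(HOURLYPrecip = as.numeric(str_remove(str_replace(HOURLYPrecip, "^T$", "0"), "s$")))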
Next, rename the columns as follows:
- 'HOURLYRelativeHumidity' to 'relative_humidity'
- 'HOURLYDRYBULBTEMPF' to 'dry_bulb_temp_f'
- 'HOURLYPrecip' to 'precip'
- 'HOURLYWindSpeed' to 'wind_speed'
- 'HOURLYStationPressure' to 'station_pressure'
We also replace all remaining NA values with 0.
sub_jfk_weather4 <- sub_jfk_weather3 %>%
  replace_na(list(HOURLYRelativeHumidity = 0, HOURLYDRYBULBTEMPF = 0, HOURLYPrecip = 0,
                  HOURLYWindSpeed = 0, HOURLYStationPressure = 0)) %>%
  rename("relative_humidity" = "HOURLYRelativeHumidity",
         "dry_bulb_temp_f" = "HOURLYDRYBULBTEMPF",
         "precip" = "HOURLYPrecip",
         "wind_speed" = "HOURLYWindSpeed",
         "station_pressure" = "HOURLYStationPressure")
head(sub_jfk_weather4)
## # A tibble: 6 × 5
## relative_humidity dry_bulb_temp_f precip wind_speed station_pressure
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 46 83 0 13 30.0
## 2 48 53 0 6 30.0
## 3 89 36 0 13 30.1
## 4 48 36 0 14 29.8
## 5 61 39 0 11 30.5
## 6 79 41 0 6 29.9
First, split the data into a training set and a testing set. Splitting is done randomly, so set the seed to 1234 for reproducible results, and use 80% of the data for training.
Next, using only the training set, plot histograms or box plots of the variables (relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure) for an initial look at their distributions with ggplot. Leave the testing set untouched; it is good practice not to look at it until the final model is evaluated.
set.seed(1234)
weather_split <- initial_split(sub_jfk_weather4, prop = 0.8)
train_data <- training(weather_split)
test_data <- testing(weather_split)
ggplot(train_data, aes(y = relative_humidity)) + geom_boxplot()
ggplot(train_data, aes(y = dry_bulb_temp_f)) + geom_boxplot()
ggplot(train_data, aes(y = precip)) + geom_boxplot()
ggplot(train_data, aes(y = wind_speed)) + geom_boxplot()
ggplot(train_data, aes(y = station_pressure)) + geom_boxplot()
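The box plots above can be complemented with histograms, as the instructions allow. A sketch that pivots the training set to long format and facets by variable (the bin count of 30 is an arbitrary choice):
# Faceted histograms of all five training-set variables
train_data %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scales = "free")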
Create simple linear regression models where precip is the response variable and each of relative_humidity, dry_bulb_temp_f, wind_speed, and station_pressure is the predictor variable (e.g. precip ~ relative_humidity, precip ~ dry_bulb_temp_f, etc.), for a total of four simple models. Additionally, visualize each simple model with a scatter plot.
lm_spec <- linear_reg() %>% set_engine(engine = "lm")
train_fit1 <- lm_spec %>% fit(precip ~ relative_humidity, data = train_data)
train_fit1
## parsnip model object
##
##
## Call:
## stats::lm(formula = precip ~ relative_humidity, data = data)
##
## Coefficients:
## (Intercept) relative_humidity
## -0.0106076 0.0002333
train_fit2 <- lm_spec %>% fit(precip ~ dry_bulb_temp_f, data = train_data)
train_fit2
## parsnip model object
##
##
## Call:
## stats::lm(formula = precip ~ dry_bulb_temp_f, data = data)
##
## Coefficients:
## (Intercept) dry_bulb_temp_f
## 2.695e-03 3.544e-05
train_fit3 <- lm_spec %>% fit(precip ~ wind_speed, data = train_data)
train_fit3
## parsnip model object
##
##
## Call:
## stats::lm(formula = precip ~ wind_speed, data = data)
##
## Coefficients:
## (Intercept) wind_speed
## 0.0014408 0.0002915
train_fit4 <- lm_spec %>% fit(precip ~ station_pressure, data = train_data)
train_fit4
## parsnip model object
##
##
## Call:
## stats::lm(formula = precip ~ station_pressure, data = data)
##
## Coefficients:
## (Intercept) station_pressure
## 1.792e-03 9.669e-05
train_plot1 <- ggplot(train_data, aes(x = relative_humidity, y = precip)) + geom_point()
train_plot1
train_plot2 <- ggplot(train_data, aes(x = dry_bulb_temp_f, y = precip)) + geom_point()
train_plot2
train_plot3 <- ggplot(train_data, aes(x = wind_speed, y = precip)) + geom_point()
train_plot3
train_plot4 <- ggplot(train_data, aes(x = station_pressure, y = precip)) + geom_point()
train_plot4
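To see each fitted simple model on its scatter plot, a least-squares line can be layered on with geom_smooth(). A sketch for the humidity model; the same pattern applies to the other three plots:
# Scatter plot with the fitted regression line overlaid
ggplot(train_data, aes(x = relative_humidity, y = precip)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE)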
Now, try improving the simple models created in the previous section. Create at least two more models; each model should use at least one of the following techniques:
- add more features/predictors
- add regularization (L1, L2, or a mix)
- add a polynomial component (a brief, unfitted sketch of this option follows)
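As an illustration of the polynomial option, a polynomial term could be introduced through a recipes step. This is a sketch only, assuming a second-degree polynomial in relative_humidity; it is not fitted or compared in the sections below:
# A sketch only (not fitted or evaluated below): quadratic term in relative_humidity
poly_spec <- linear_reg() %>% set_engine("lm")
poly_recipe <- recipe(precip ~ ., data = train_data) %>%
  step_poly(relative_humidity, degree = 2)
poly_wf <- workflow() %>% add_recipe(poly_recipe) %>% add_model(poly_spec)
# poly_fit <- fit(poly_wf, data = train_data)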
9-1. Multiple Linear Regression
train_fit5 <- lm_spec %>% fit(precip ~ ., data = train_data)
train_fit5
## parsnip model object
##
##
## Call:
## stats::lm(formula = precip ~ ., data = data)
##
## Coefficients:
## (Intercept) relative_humidity dry_bulb_temp_f wind_speed
## -1.284e-03 3.068e-04 7.994e-06 4.998e-04
## station_pressure
## -6.848e-04
train_results5 <- train_fit5 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
train_results5
## # A tibble: 4,581 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.0112 0
## 2 0.00595 0
## 3 -0.0000804 0
## 4 0.0121 0.58
## 5 0.00318 0
## 6 0.0235 0
## 7 0.00364 0
## 8 -0.00322 0
## 9 0.00295 0
## 10 0.00110 0
## # ℹ 4,571 more rows
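The coefficient estimates above say nothing about significance or overall fit. If that is of interest, tidy() on the parsnip fit returns standard errors and p-values, and glance() on the underlying lm object returns R-squared and related statistics (a sketch):
# Coefficient table with standard errors and p-values for the multiple regression
tidy(train_fit5)
# Overall fit statistics from the underlying lm object
glance(train_fit5$fit)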
9-2. Add Ridge (L2) regularization
weather_recipe <- recipe(precip ~ ., data = train_data)
ridge_spec <- linear_reg(penalty = 0.1, mixture = 0) %>% set_engine("glmnet")
ridge_wf <- workflow() %>% add_recipe(weather_recipe)
ridge_fit <- ridge_wf %>% add_model(ridge_spec) %>% fit(data = train_data)
ridge_fit %>% extract_fit_parsnip() %>% tidy()
## # A tibble: 5 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.000160 0.1
## 2 relative_humidity 0.0000632 0.1
## 3 dry_bulb_temp_f 0.00000424 0.1
## 4 wind_speed 0.0000839 0.1
## 5 station_pressure -0.0000169 0.1
train_result6 <- ridge_fit %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
head(train_result6)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.00618 0
## 2 0.00511 0
## 3 0.00378 0
## 4 0.00624 0.58
## 5 0.00446 0
## 6 0.00825 0
9-2-1. Using grid search to tune the model above
tune_spec <- linear_reg(penalty = tune(), mixture = 0) %>% set_engine("glmnet")
weather_cvfolds <- vfold_cv(train_data)
lambda_grid <- grid_regular(penalty(range = c(-3, 0.3)), levels = 50)
ridge_grid <- tune_grid(ridge_wf %>% add_model(tune_spec), resamples=weather_cvfolds, grid=lambda_grid)
show_best(ridge_grid, metric="rmse")
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00471 rmse standard 0.0330 10 0.00484 Preprocessor1_Model11
## 2 0.00404 rmse standard 0.0330 10 0.00484 Preprocessor1_Model10
## 3 0.00551 rmse standard 0.0330 10 0.00485 Preprocessor1_Model12
## 4 0.00346 rmse standard 0.0330 10 0.00484 Preprocessor1_Model09
## 5 0.00643 rmse standard 0.0330 10 0.00485 Preprocessor1_Model13
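The penalty used in the next section was read off this table by hand. It could also be selected and applied programmatically; a sketch using select_best() and finalize_workflow():
# Pick the penalty with the lowest cross-validated RMSE and refit on the training set
best_penalty <- select_best(ridge_grid, metric = "rmse")
final_ridge_wf <- finalize_workflow(ridge_wf %>% add_model(tune_spec), best_penalty)
final_ridge_fit <- fit(final_ridge_wf, data = train_data)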
9-2-2. Apply the optimal penalty found above to Elastic Net
elastic_spec <- linear_reg(penalty = 0.00346, mixture = 0.2) %>% set_engine("glmnet")
elastic_wf <- workflow() %>% add_recipe(weather_recipe)
elastic_fit <- elastic_wf %>% add_model(elastic_spec) %>% fit(data = train_data)
elastic_fit %>% extract_fit_parsnip() %>% tidy()
## # A tibble: 5 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.00497 0.00346
## 2 relative_humidity 0.000216 0.00346
## 3 dry_bulb_temp_f 0 0.00346
## 4 wind_speed 0.000271 0.00346
## 5 station_pressure -0.000256 0.00346
train_result7 <- elastic_fit %>% predict(new_data=train_data) %>% mutate(truth=train_data$precip)
head(train_result7)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.00962 0
## 2 0.00591 0
## 3 0.00195 0
## 4 0.00974 0.58
## 5 0.00401 0
## 6 0.0166 0
Compare the regression metrics of each model from section 9 to find the best model overall. To do this, evaluate the models on the testing set using at least one metric (such as MSE, RMSE, or R-squared). After calculating the metrics on the testing set for each model, print them out as a table to compare them easily.
Finally, from the comparison table you create, conclude which model performed the best.
train_results1 <- train_fit1 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results1 <- train_fit1 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results1, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0203
rsq(test_results1, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0359
train_results2 <- train_fit2 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results2 <- train_fit2 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results2, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.000336
rsq(test_results2, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.000237
train_results3 <- train_fit3 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results3 <- train_fit3 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results3, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.00263
rsq(test_results3, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0287
train_results4 <- train_fit4 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results4 <- train_fit4 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results4, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.000162
rsq(test_results4, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.000523
train_results5 <- train_fit5 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results5 <- train_fit5 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results5, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0302
rsq(test_results5, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0814
train_results6 <- ridge_fit %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results6 <- ridge_fit %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results6, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0249
rsq(test_results6, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0585
train_results7 <- elastic_fit %>% predict(new_data=train_data) %>% mutate(truth=train_data$precip)
test_results7 <- elastic_fit %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)
rsq(train_results7, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0285
rsq(test_results7, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0585
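R-squared is the only metric reported above. If several metrics are wanted at once, yardstick's metric_set() can compute RMSE, R-squared, and MAE in a single call; a sketch applied to the multiple-regression test predictions:
# Compute several regression metrics at once for one set of predictions
multi_metrics <- metric_set(rmse, rsq, mae)
multi_metrics(test_results5, truth = truth, estimate = .pred)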
model_names <- c("precip~humidity","precip~drybulbtemp","precip~windspeed","precip~stationpressure","Multiple Linear Regression", "Ridge (L2) regularization", "Elastic Net Regularization")
train_rsq <- c(0.0203, 0.000336, 0.00263, 0.000162, 0.0302, 0.0249, 0.0285)
test_rsq <- c(0.0359, 0.000237, 0.0287, 0.000523, 0.0814, 0.0585, 0.0585)
comparison_df <- data.frame(model_names, train_rsq, test_rsq)
comparison_df
## model_names train_rsq test_rsq
## 1 precip~humidity 0.020300 0.035900
## 2 precip~drybulbtemp 0.000336 0.000237
## 3 precip~windspeed 0.002630 0.028700
## 4 precip~stationpressure 0.000162 0.000523
## 5 Multiple Linear Regression 0.030200 0.081400
## 6 Ridge (L2) regularization 0.024900 0.058500
## 7 Elastic Net Regularization 0.028500 0.058500
Based on test-set R-squared, the multiple linear regression model performs best overall (test R-squared of 0.0814), although every model explains only a small fraction of the variation in precipitation.
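The R-squared values in the table are typed in by hand; they could also be collected programmatically from the stored prediction tibbles. A sketch for the test-set values; the training values could be gathered the same way:
# Build the test-set comparison directly from the prediction tibbles
test_result_list <- list(test_results1, test_results2, test_results3, test_results4,
                         test_results5, test_results6, test_results7)
test_rsq_values <- map_dbl(test_result_list, ~ rsq(.x, truth = truth, estimate = .pred)$.estimate)
tibble(model = model_names, test_rsq = test_rsq_values)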