0. Load Required Packages

library(tidyverse)
library(tidymodels)

1. Download NOAA Weather Dataset

url <- "https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz"

download.file(url, destfile="noaa-weather-sample-data.tar.gz")

untar("noaa-weather-sample-data.tar.gz")
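If the download and extraction succeed, the sample CSV ends up in a noaa-weather-sample-data/ folder, which is the path read_csv() uses in the next step; a quick sanity check:

# Confirm the extracted file is in place before reading it
list.files("noaa-weather-sample-data")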

2. Extract and Read into Project

jfk_weather <- read_csv("noaa-weather-sample-data/jfk_weather_sample.csv")
## Rows: 5727 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): HOURLYPrecip
## dbl  (7): HOURLYDewPointTempF, HOURLYRelativeHumidity, HOURLYDRYBULBTEMPF, H...
## dttm (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(jfk_weather)
## # A tibble: 6 × 9
##   DATE                HOURLYDewPointTempF HOURLYRelativeHumidity
##   <dttm>                            <dbl>                  <dbl>
## 1 2015-07-25 13:51:00                  60                     46
## 2 2016-11-18 23:51:00                  34                     48
## 3 2013-01-06 08:51:00                  33                     89
## 4 2011-01-27 16:51:00                  18                     48
## 5 2015-01-03 12:16:00                  27                     61
## 6 2013-02-15 20:51:00                  35                     79
## # ℹ 6 more variables: HOURLYDRYBULBTEMPF <dbl>, HOURLYWETBULBTEMPF <dbl>,
## #   HOURLYPrecip <chr>, HOURLYWindSpeed <dbl>, HOURLYSeaLevelPressure <dbl>,
## #   HOURLYStationPressure <dbl>
glimpse(jfk_weather)
## Rows: 5,727
## Columns: 9
## $ DATE                   <dttm> 2015-07-25 13:51:00, 2016-11-18 23:51:00, 2013…
## $ HOURLYDewPointTempF    <dbl> 60, 34, 33, 18, 27, 35, 4, 14, 51, 71, 76, 19, …
## $ HOURLYRelativeHumidity <dbl> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF     <dbl> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYWETBULBTEMPF     <dbl> 68, 44, 35, 30, 34, 38, 15, 21, 52, 72, 78, 35,…
## $ HOURLYPrecip           <chr> "0.00", "0.00", "0.00", "0.00", "T", "0.00", "0…
## $ HOURLYWindSpeed        <dbl> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYSeaLevelPressure <dbl> 30.01, 30.05, 30.14, 29.82, NA, 29.94, 30.42, 3…
## $ HOURLYStationPressure  <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…

3. Select Subset of Columns

The key columns that we will explore in this project are:

  - HOURLYRelativeHumidity
  - HOURLYDRYBULBTEMPF
  - HOURLYPrecip
  - HOURLYWindSpeed
  - HOURLYStationPressure

sub_jfk_weather <- select(jfk_weather, c(HOURLYRelativeHumidity, HOURLYDRYBULBTEMPF, HOURLYPrecip, HOURLYWindSpeed, HOURLYStationPressure))

head(sub_jfk_weather, 10)
## # A tibble: 10 × 5
##    HOURLYRelativeHumidity HOURLYDRYBULBTEMPF HOURLYPrecip HOURLYWindSpeed
##                     <dbl>              <dbl> <chr>                  <dbl>
##  1                     46                 83 0.00                      13
##  2                     48                 53 0.00                       6
##  3                     89                 36 0.00                      13
##  4                     48                 36 0.00                      14
##  5                     61                 39 T                         11
##  6                     79                 41 0.00                       6
##  7                     51                 19 0.00                       0
##  8                     65                 24 0.00                      11
##  9                     90                 54 0.06                      11
## 10                     94                 73 <NA>                       5
## # ℹ 1 more variable: HOURLYStationPressure <dbl>

4. Clean Up Columns

Clean the HOURLYPrecip column in two steps:

  1. Replace all the “T” (trace precipitation) values with “0.0”, and
  2. Remove the trailing “s” from values like “0.02s”.
unique(sub_jfk_weather$HOURLYPrecip)
##  [1] "0.00"  "T"     "0.06"  NA      "0.03"  "0.02"  "0.08"  "0.01"  "0.07" 
## [10] "0.16"  "0.09"  "0.22"  "0.02s" "0.24"  "0.18"  "0.05"  "0.04"  "0.09s"
## [19] "0.11"  "0.14"  "0.25"  "0.10"  "0.01s" "0.58"  "0.12"  "0.13"  "0.46" 
## [28] "1.07"  "1.19"  "0.34"  "0.20"  "0.36s" "0.42"  "0.17"  "0.27"  "0.35" 
## [37] "0.31"  "0.33"  "0.23"  "0.26"  "0.28"  "0.75"  "0.19"  "0.36"  "0.03s"
## [46] "0.07s" "0.54"  "0.59"  "0.21"
sub_jfk_weather2 <- sub_jfk_weather %>% mutate(HOURLYPrecip=ifelse(HOURLYPrecip=="T", "0.0", HOURLYPrecip))
sub_jfk_weather3 <- sub_jfk_weather2 %>% mutate(HOURLYPrecip=str_remove(HOURLYPrecip, pattern="s$"))

unique(sub_jfk_weather3$HOURLYPrecip)
##  [1] "0.00" "0.0"  "0.06" NA     "0.03" "0.02" "0.08" "0.01" "0.07" "0.16"
## [11] "0.09" "0.22" "0.24" "0.18" "0.05" "0.04" "0.11" "0.14" "0.25" "0.10"
## [21] "0.58" "0.12" "0.13" "0.46" "1.07" "1.19" "0.34" "0.20" "0.36" "0.42"
## [31] "0.17" "0.27" "0.35" "0.31" "0.33" "0.23" "0.26" "0.28" "0.75" "0.19"
## [41] "0.54" "0.59" "0.21"

5. Convert Columns to Numerical Types

Convert HOURLYPrecip to the numeric type and store the cleaned dataframe as a new variable.

glimpse(sub_jfk_weather3)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <dbl> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF     <dbl> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip           <chr> "0.00", "0.00", "0.00", "0.00", "0.0", "0.00", …
## $ HOURLYWindSpeed        <dbl> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure  <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…
sub_jfk_weather3$HOURLYPrecip <- as.numeric(sub_jfk_weather3$HOURLYPrecip)

glimpse(sub_jfk_weather3)
## Rows: 5,727
## Columns: 5
## $ HOURLYRelativeHumidity <dbl> 46, 48, 89, 48, 61, 79, 51, 65, 90, 94, 79, 37,…
## $ HOURLYDRYBULBTEMPF     <dbl> 83, 53, 36, 36, 39, 41, 19, 24, 54, 73, 83, 44,…
## $ HOURLYPrecip           <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ HOURLYWindSpeed        <dbl> 13, 6, 13, 14, 11, 6, 0, 11, 11, 5, 21, 7, 17, …
## $ HOURLYStationPressure  <dbl> 29.99, 30.03, 30.12, 29.80, 30.50, 29.92, 30.40…

6. Rename Columns

Rename the following columns:

  - ‘HOURLYRelativeHumidity’ to ‘relative_humidity’
  - ‘HOURLYDRYBULBTEMPF’ to ‘dry_bulb_temp_f’
  - ‘HOURLYPrecip’ to ‘precip’
  - ‘HOURLYWindSpeed’ to ‘wind_speed’
  - ‘HOURLYStationPressure’ to ‘station_pressure’

We also replace all remaining NAs with 0 here.

sub_jfk_weather4 <- sub_jfk_weather3 %>%
  replace_na(list(HOURLYRelativeHumidity=0, HOURLYDRYBULBTEMPF=0, HOURLYPrecip=0, HOURLYWindSpeed=0, HOURLYStationPressure=0)) %>%
  rename("relative_humidity"="HOURLYRelativeHumidity", "dry_bulb_temp_f"="HOURLYDRYBULBTEMPF", "precip"="HOURLYPrecip", "wind_speed"="HOURLYWindSpeed", "station_pressure"="HOURLYStationPressure")

head(sub_jfk_weather4)
## # A tibble: 6 × 5
##   relative_humidity dry_bulb_temp_f precip wind_speed station_pressure
##               <dbl>           <dbl>  <dbl>      <dbl>            <dbl>
## 1                46              83      0         13             30.0
## 2                48              53      0          6             30.0
## 3                89              36      0         13             30.1
## 4                48              36      0         14             29.8
## 5                61              39      0         11             30.5
## 6                79              41      0          6             29.9

7. Exploratory Data Analysis

First, split the data into a training and a testing set. Splitting a dataset is done randomly, so to get reproducible results set the seed to 1234. Also, use 80% of the data for training.

Next, looking at just the training set, plot histograms or box plots of the variables (relative_humidity, dry_bulb_temp_f, precip, wind_speed, station_pressure) for an initial look at their distributions using tidyverse’s ggplot. Leave the testing set as is, because it is good practice not to look at the testing set until evaluating the final model.

set.seed(1234)

weather_split <- initial_split(sub_jfk_weather4, prop = 0.8)
train_data <- training(weather_split)
test_data <- testing(weather_split)

ggplot(train_data, aes(y=relative_humidity))+geom_boxplot()

ggplot(train_data, aes(y=dry_bulb_temp_f))+geom_boxplot()

ggplot(train_data, aes(y=precip))+geom_boxplot()

ggplot(train_data, aes(y=wind_speed))+geom_boxplot()

ggplot(train_data, aes(y=station_pressure))+geom_boxplot()
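Histograms are the other option mentioned above; a minimal sketch for one variable (the bin width is an arbitrary choice):

# Histogram of relative humidity in the training set; binwidth chosen for readability
ggplot(train_data, aes(x=relative_humidity)) +
  geom_histogram(binwidth = 5)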

8. Linear Regression

Create simple linear regression models where precip is the response variable and each of relative_humidity, dry_bulb_temp_f, wind_speed, or station_pressure is a predictor variable, e.g. precip ~ relative_humidity, precip ~ dry_bulb_temp_f, etc., for a total of four simple models. Additionally, visualize each simple model with a scatter plot.

lm_spec <- linear_reg() %>% set_engine(engine = "lm")

train_fit1 <- lm_spec %>% fit(precip ~ relative_humidity, data = train_data)
train_fit1
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = precip ~ relative_humidity, data = data)
## 
## Coefficients:
##       (Intercept)  relative_humidity  
##        -0.0106076          0.0002333
train_fit2 <- lm_spec %>% fit(precip ~ dry_bulb_temp_f, data = train_data)
train_fit2
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = precip ~ dry_bulb_temp_f, data = data)
## 
## Coefficients:
##     (Intercept)  dry_bulb_temp_f  
##       2.695e-03        3.544e-05
train_fit3 <- lm_spec %>% fit(precip ~ wind_speed, data = train_data)
train_fit3
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = precip ~ wind_speed, data = data)
## 
## Coefficients:
## (Intercept)   wind_speed  
##   0.0014408    0.0002915
train_fit4 <- lm_spec %>% fit(precip ~ station_pressure, data = train_data)
train_fit4
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = precip ~ station_pressure, data = data)
## 
## Coefficients:
##      (Intercept)  station_pressure  
##        1.792e-03         9.669e-05
train_plot1 <-ggplot(train_data, aes(x=relative_humidity, y=precip))+geom_point()
train_plot1

train_plot2 <-ggplot(train_data, aes(x=dry_bulb_temp_f, y=precip))+geom_point()
train_plot2

train_plot3 <-ggplot(train_data, aes(x=wind_speed, y=precip))+geom_point()
train_plot3

train_plot4 <-ggplot(train_data, aes(x=station_pressure, y=precip))+geom_point()
train_plot4
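To visualize the fitted model rather than just the raw points, a least-squares line can be overlaid on each scatter plot with geom_smooth(); a minimal sketch for the first model:

# Scatter plot with the simple linear regression line overlaid (se = FALSE hides the confidence band)
ggplot(train_data, aes(x=relative_humidity, y=precip)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)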

9. Improve the Model

Now, try improving the simple models you created in the previous section.

Create at least two more models; each model should use at least one of the following techniques:

  - Add more features/predictors
  - Add regularization (L1, L2 or a mix)
  - Add a polynomial component (a sketch of this option is given just below)
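The polynomial option is not pursued in the models below, but as a sketch of how it could look with a recipe step (the column and degree are illustrative choices):

# Sketch: add a quadratic term for relative_humidity via step_poly(); degree = 2 is arbitrary
poly_recipe <- recipe(precip ~ ., data = train_data) %>%
  step_poly(relative_humidity, degree = 2)

poly_fit <- workflow() %>%
  add_recipe(poly_recipe) %>%
  add_model(lm_spec) %>%
  fit(data = train_data)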

9-1. Multiple Linear Regression

train_fit5 <- lm_spec %>% fit(precip ~ ., data = train_data)
train_fit5
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = precip ~ ., data = data)
## 
## Coefficients:
##       (Intercept)  relative_humidity    dry_bulb_temp_f         wind_speed  
##        -1.284e-03          3.068e-04          7.994e-06          4.998e-04  
##  station_pressure  
##        -6.848e-04
train_results5 <- train_fit5 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)

train_results5
## # A tibble: 4,581 × 2
##         .pred truth
##         <dbl> <dbl>
##  1  0.0112     0   
##  2  0.00595    0   
##  3 -0.0000804  0   
##  4  0.0121     0.58
##  5  0.00318    0   
##  6  0.0235     0   
##  7  0.00364    0   
##  8 -0.00322    0   
##  9  0.00295    0   
## 10  0.00110    0   
## # ℹ 4,571 more rows
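As a quick check on the multiple regression, the underlying lm object (stored in the parsnip fit’s $fit slot) can be summarized with broom’s glance(), which loads with tidymodels; a minimal sketch:

# Overall fit statistics (R-squared, adjusted R-squared, sigma, ...) from the underlying lm fit
glance(train_fit5$fit)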

9-2. Add Ridge (L2) regularization

weather_recipe <- recipe(precip ~., data = train_data)
ridge_spec <- linear_reg(penalty = 0.1, mixture = 0) %>% set_engine("glmnet")
ridge_wf <- workflow() %>% add_recipe(weather_recipe)
ridge_fit <- ridge_wf %>% add_model(ridge_spec) %>% fit(data=train_data)
ridge_fit %>% pull_workflow_fit() %>% tidy()
## # A tibble: 5 × 3
##   term                 estimate penalty
##   <chr>                   <dbl>   <dbl>
## 1 (Intercept)       -0.000160       0.1
## 2 relative_humidity  0.0000632      0.1
## 3 dry_bulb_temp_f    0.00000424     0.1
## 4 wind_speed         0.0000839      0.1
## 5 station_pressure  -0.0000169      0.1
train_result6 <- ridge_fit %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
head(train_result6)
## # A tibble: 6 × 2
##     .pred truth
##     <dbl> <dbl>
## 1 0.00618  0   
## 2 0.00511  0   
## 3 0.00378  0   
## 4 0.00624  0.58
## 5 0.00446  0   
## 6 0.00825  0

9-2-1. Using grid search to tune the model above

tune_spec <- linear_reg(penalty = tune(), mixture = 0) %>% set_engine("glmnet")
weather_cvfolds <- vfold_cv(train_data)  # 10-fold cross-validation by default
lambda_grid <- grid_regular(levels = 50, penalty(range = c(-3, 0.3)))  # penalty range is on the log10 scale
ridge_grid <- tune_grid(ridge_wf %>% add_model(tune_spec), resamples=weather_cvfolds, grid=lambda_grid)

show_best(ridge_grid, metric="rmse")
## # A tibble: 5 × 7
##   penalty .metric .estimator   mean     n std_err .config              
##     <dbl> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
## 1 0.00471 rmse    standard   0.0330    10 0.00484 Preprocessor1_Model11
## 2 0.00404 rmse    standard   0.0330    10 0.00484 Preprocessor1_Model10
## 3 0.00551 rmse    standard   0.0330    10 0.00485 Preprocessor1_Model12
## 4 0.00346 rmse    standard   0.0330    10 0.00484 Preprocessor1_Model09
## 5 0.00643 rmse    standard   0.0330    10 0.00485 Preprocessor1_Model13
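Rather than copying the best penalty by hand, it can be pulled out programmatically and plugged back into the workflow; a minimal sketch using select_best() and finalize_workflow():

# Select the penalty with the lowest cross-validated RMSE and finalize the tuned workflow
best_penalty <- select_best(ridge_grid, metric = "rmse")
final_ridge_wf <- finalize_workflow(ridge_wf %>% add_model(tune_spec), best_penalty)
final_ridge_fit <- fit(final_ridge_wf, data = train_data)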

9-2-2. Apply the optimal penalty found above to Elastic Net

elastic_spec <- linear_reg(penalty = 0.00346, mixture = 0.2) %>% set_engine("glmnet")
elastic_wf <- workflow() %>% add_recipe(weather_recipe)
elastic_fit <- elastic_wf %>% add_model(elastic_spec) %>% fit(data=train_data)
elastic_fit %>% pull_workflow_fit() %>% tidy()
## # A tibble: 5 × 3
##   term               estimate penalty
##   <chr>                 <dbl>   <dbl>
## 1 (Intercept)       -0.00497  0.00346
## 2 relative_humidity  0.000216 0.00346
## 3 dry_bulb_temp_f    0        0.00346
## 4 wind_speed         0.000271 0.00346
## 5 station_pressure  -0.000256 0.00346
train_result7 <- elastic_fit %>% predict(new_data=train_data) %>% mutate(truth=train_data$precip)
head(train_result7)
## # A tibble: 6 × 2
##     .pred truth
##     <dbl> <dbl>
## 1 0.00962  0   
## 2 0.00591  0   
## 3 0.00195  0   
## 4 0.00974  0.58
## 5 0.00401  0   
## 6 0.0166   0
10. Find the Best Model

Compare the regression metrics of each model from sections 8 and 9 to find the best model overall. To do this, evaluate the models on the testing set using at least one metric (like MSE, RMSE or R-squared). After calculating the metrics on the testing set for each model, print them out as a table to compare them easily.

Finally, from the comparison table you create, conclude which model performed the best.
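The code below reports R-squared via rsq(); if RMSE is wanted alongside it, a yardstick metric set computes several metrics in one call. A minimal sketch:

# Bundle RMSE and R-squared so both are computed from the same predictions
eval_metrics <- metric_set(rmse, rsq)

Each rsq(...) call below could then be replaced by eval_metrics(..., truth = truth, estimate = .pred).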

train_results1 <- train_fit1 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)

test_results1 <- train_fit1 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results1, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0203
rsq(test_results1, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0359
train_results2 <- train_fit2 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)

test_results2 <- train_fit2 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results2, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard    0.000336
rsq(test_results2, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard    0.000237
train_results3 <- train_fit3 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)

test_results3 <- train_fit3 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results3, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard     0.00263
rsq(test_results3, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0287
train_results4 <- train_fit4 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)

test_results4 <- train_fit4 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results4, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard    0.000162
rsq(test_results4, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard    0.000523
train_results5 <- train_fit5 %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results5 <- train_fit5 %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results5, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0302
rsq(test_results5, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0814
train_results6 <- ridge_fit %>% predict(new_data = train_data) %>% mutate(truth=train_data$precip)
test_results6 <- ridge_fit %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results6, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0249
rsq(test_results6, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0585
train_results7 <- elastic_fit %>% predict(new_data=train_data) %>% mutate(truth=train_data$precip)
test_results7 <- elastic_fit %>% predict(new_data = test_data) %>% mutate(truth=test_data$precip)

rsq(train_results7, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0285
rsq(test_results7, truth=truth, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0585
model_names <- c("precip~humidity","precip~drybulbtemp","precip~windspeed","precip~stationpressure","Multiple Linear Regression", "Ridge (L2) regularization", "Elastic Net Regularization")
train_rsq <- c(0.0203, 0.000336, 0.00263, 0.000162, 0.0302, 0.0249, 0.0285)
test_rsq <- c(0.0359, 0.000237, 0.0287, 0.000523, 0.0814, 0.0585, 0.0585)

comparison_df <- data.frame(model_names, train_rsq, test_rsq)
comparison_df
##                  model_names train_rsq test_rsq
## 1            precip~humidity  0.020300 0.035900
## 2         precip~drybulbtemp  0.000336 0.000237
## 3           precip~windspeed  0.002630 0.028700
## 4     precip~stationpressure  0.000162 0.000523
## 5 Multiple Linear Regression  0.030200 0.081400
## 6  Ridge (L2) regularization  0.024900 0.058500
## 7 Elastic Net Regularization  0.028500 0.058500

Based on the test R-squared values, the multiple linear regression model performs best overall (test R-squared of about 0.081), although every model explains only a small fraction of the variance in precip.
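For reference, the same comparison could be assembled programmatically from the test results tibbles created above rather than retyping the numbers; a minimal sketch:

# Compute test R-squared for every model directly from its predictions
test_results_list <- list(
  "precip~humidity"            = test_results1,
  "precip~drybulbtemp"         = test_results2,
  "precip~windspeed"           = test_results3,
  "precip~stationpressure"     = test_results4,
  "Multiple Linear Regression" = test_results5,
  "Ridge (L2) regularization"  = test_results6,
  "Elastic Net Regularization" = test_results7
)

map_dfr(test_results_list, ~ rsq(.x, truth = truth, estimate = .pred), .id = "model_names") %>%
  select(model_names, test_rsq = .estimate)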