📂 Loading Libraries and Dataset

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.0 ──
## ✔ broom        1.0.9     ✔ rsample      1.3.1
## ✔ dials        1.4.2     ✔ tune         2.0.0
## ✔ infer        1.0.9     ✔ workflows    1.3.0
## ✔ modeldata    1.5.1     ✔ workflowsets 1.1.1
## ✔ parsnip      1.3.3     ✔ yardstick    1.3.2
## ✔ recipes      1.3.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(rvest) # for web scraping
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(httr) # for HTTP requests
library(RSQLite)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10
# Getting Bike Data
bike_sharing_df <- read.csv("C:/Users/MODEL23/Documents/bike_sharing_systems.csv")
head(bike_sharing_df,6)
##    CITY WEATHER VISIBILITY  TEMP TEMP_MIN TEMP_MAX PRESSURE HUMIDITY WIND_SPEED
## 1 Seoul   Clear      10000 19.86    19.86    20.07     1013       92       1.07
## 2 Seoul   Clear      10000 19.50    19.37    19.50     1014       80       1.53
## 3 Seoul   Clear      10000 23.08    23.08    23.08     1014       51       1.78
## 4 Seoul   Clear      10000 28.52    28.52    28.52     1013       26       2.15
## 5 Seoul   Clear      10000 31.99    31.99    31.99     1012       19       2.45
## 6 Seoul   Clear      10000 29.50    29.50    29.50     1012       29       1.94
##   WIND_DEG   FORECAST_DATETIME SEASON
## 1       45 2025-09-09 18:00:00 Autumn
## 2       28 2025-09-09 21:00:00 Autumn
## 3       44          2025-09-10 Autumn
## 4       71 2025-09-10 03:00:00 Autumn
## 5       61 2025-09-10 06:00:00 Autumn
## 6      356 2025-09-10 09:00:00 Autumn
colnames(bike_sharing_df)
##  [1] "CITY"              "WEATHER"           "VISIBILITY"       
##  [4] "TEMP"              "TEMP_MIN"          "TEMP_MAX"         
##  [7] "PRESSURE"          "HUMIDITY"          "WIND_SPEED"       
## [10] "WIND_DEG"          "FORECAST_DATETIME" "SEASON"

Notes: Here I load the required libraries and import the bike-sharing dataset. I preview the first six rows and check the column names to understand the dataset structure.


🌦️ Weather API - Current Weather Data

current_weather_url <- 'https://api.openweathermap.org/data/2.5/weather'
your_api_key <- "da579f0f1c601672c4f6fa0d8ba9397c"
current_query <- list(q = "Seoul", appid = your_api_key, units="metric")
response <- GET(current_weather_url, query=current_query)
json_result <- content(response, as="parsed")

weather <- json_result$weather[[1]]$main
visibility <- json_result$visibility
temp <- json_result$main$temp
temp_min <- json_result$main$temp_min
temp_max <- json_result$main$temp_max
pressure <- json_result$main$pressure
humidity <- json_result$main$humidity
wind_speed <- json_result$wind$speed
wind_deg <- json_result$wind$deg

weather_data_frame <- data.frame(weather, visibility, temp, temp_min, temp_max, pressure, humidity, wind_speed, wind_deg)
print(weather_data_frame)
##   weather visibility  temp temp_min temp_max pressure humidity wind_speed
## 1  Clouds      10000 22.76    22.76    22.78     1011      100       1.54
##   wind_deg
## 1      140

Notes: This section calls the OpenWeather API to retrieve the current weather for Seoul. The response is parsed into variables such as temperature, humidity, and wind, which are then stored in a dataframe.
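The GET() call above assumes the request succeeded. A minimal defensive variant, using the same httr helpers the forecast function below relies on, stops early on HTTP errors instead of parsing an error payload:

response <- GET(current_weather_url, query = current_query)
# Abort with a readable message if the API returned a non-2xx status
if (http_error(response)) {
  stop(paste("OpenWeather request failed:", http_status(response)$message))
}
json_result <- content(response, as = "parsed")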


🍂 Defining Seasons and Forecast Function

# Helper function to determine the season based on the month (Northern Hemisphere)
get_season <- function(month) {
  if (month %in% c(12, 1, 2)) {
    return("Winter")
  } else if (month %in% c(3, 4, 5)) {
    return("Spring")
  } else if (month %in% c(6, 7, 8)) {
    return("Summer")
  } else if (month %in% c(9, 10, 11)) {
    return("Autumn")
  }
}

get_weather_forecaset_by_cities <- function(city_names, api_key) {
  all_city_data <- list()
  forecast_url <- 'https://api.openweathermap.org/data/2.5/forecast'
  
  for (city_name in city_names) {
    forecast_query <- list(q = city_name, appid = api_key, units = "metric")
    response <- GET(forecast_url, query = forecast_query)
    
    if (http_error(response)) {
      warning(paste("Error fetching data for", city_name, ":", http_status(response)$message))
      next
    }
    
    json_result <- content(response, as = "parsed")
    results <- json_result$list
    
    city_forecast_df <- data.frame(
      city = city_name,
      weather = sapply(results, function(x) x$weather[[1]]$main),
      visibility = sapply(results, function(x) x$visibility),
      temp = sapply(results, function(x) x$main$temp),
      temp_min = sapply(results, function(x) x$main$temp_min),
      temp_max = sapply(results, function(x) x$main$temp_max),
      pressure = sapply(results, function(x) x$main$pressure),
      humidity = sapply(results, function(x) x$main$humidity),
      wind_speed = sapply(results, function(x) x$wind$speed),
      wind_deg = sapply(results, function(x) x$wind$deg),
      forecast_datetime = as.POSIXct(sapply(results, function(x) x$dt), origin = "1970-01-01", tz = "UTC")
    )
    
    # Add the season column
    city_forecast_df$season <- sapply(format(city_forecast_df$forecast_datetime, "%m"), as.numeric)
    city_forecast_df$season <- sapply(city_forecast_df$season, get_season)
    
    all_city_data[[city_name]] <- city_forecast_df
  }
  
  final_df <- do.call(rbind, all_city_data)
  return(final_df)
}

Notes: Here, a helper function get_season categorizes months into seasons (Northern Hemisphere). A second function, get_weather_forecaset_by_cities, fetches the 5-day forecast for multiple cities and assigns a season to each forecast record.
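As a quick sanity check of the season helper, one month from each quarter can be passed through it (the month numbers below are just illustrative inputs):

# Expect one label per quarter: Winter, Spring, Summer, Autumn
sapply(c(1, 4, 7, 10), get_season)
## [1] "Winter" "Spring" "Summer" "Autumn"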


🌍 Getting Weather Forecast for Multiple Cities

cities <- c("Seoul", "Washington, D.C.", "Paris", "Suzhou")
cities_weather_df <- get_weather_forecaset_by_cities(cities, your_api_key)
print(head(cities_weather_df,6))
##          city weather visibility  temp temp_min temp_max pressure humidity
## Seoul.1 Seoul    Rain      10000 22.76    21.84    22.76     1011      100
## Seoul.2 Seoul    Rain      10000 22.22    21.15    22.22     1011       97
## Seoul.3 Seoul    Rain       6430 22.02    21.65    22.02     1011       96
## Seoul.4 Seoul    Rain       7759 22.96    22.96    22.96     1012       94
## Seoul.5 Seoul    Rain      10000 22.64    22.64    22.64     1011       97
## Seoul.6 Seoul    Rain      10000 20.51    20.51    20.51     1010       92
##         wind_speed wind_deg   forecast_datetime season
## Seoul.1       1.39      206 2025-09-16 15:00:00 Autumn
## Seoul.2       1.23      215 2025-09-16 18:00:00 Autumn
## Seoul.3       1.68      227 2025-09-16 21:00:00 Autumn
## Seoul.4       1.04      239 2025-09-17 00:00:00 Autumn
## Seoul.5       2.10      188 2025-09-17 03:00:00 Autumn
## Seoul.6       2.60      305 2025-09-17 06:00:00 Autumn
write.csv(cities_weather_df, "cities_weather_df.CSV", row.names = FALSE)

Notes: We now call the forecast function for 4 cities. The results are saved to a CSV file for further use.


🧹 Standardizing Datasets

datasets_list <- list(
  bike_sharing_df = bike_sharing_df,
  cities_weather_df = cities_weather_df
)

for (df_name in names(datasets_list)) {
  dataset <- datasets_list[[df_name]]
  
  new_names <- colnames(dataset)
  new_names <- toupper(new_names)
  new_names <- gsub(" ", "_", new_names)
  new_names <- gsub("[^A-Z0-9_]", "", new_names)
  
  colnames(dataset) <- new_names
  assign(df_name, dataset, envir = .GlobalEnv)
  
  print(paste("Column names for", df_name, "have been standardized."))
  print(colnames(dataset))
  
  file_name <- paste0(df_name, "_standardized.csv")
  write.csv(dataset, file_name, row.names = FALSE)
}
## [1] "Column names for bike_sharing_df have been standardized."
##  [1] "CITY"              "WEATHER"           "VISIBILITY"       
##  [4] "TEMP"              "TEMP_MIN"          "TEMP_MAX"         
##  [7] "PRESSURE"          "HUMIDITY"          "WIND_SPEED"       
## [10] "WIND_DEG"          "FORECAST_DATETIME" "SEASON"           
## [1] "Column names for cities_weather_df have been standardized."
##  [1] "CITY"              "WEATHER"           "VISIBILITY"       
##  [4] "TEMP"              "TEMP_MIN"          "TEMP_MAX"         
##  [7] "PRESSURE"          "HUMIDITY"          "WIND_SPEED"       
## [10] "WIND_DEG"          "FORECAST_DATETIME" "SEASON"

Notes: Column names across datasets are standardized to uppercase with underscores. This ensures consistency when merging or querying.
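To see what the three cleaning rules do, here is a hypothetical messy column name traced through the same steps:

name <- "Wind Speed (m/s)"        # hypothetical messy name
name <- toupper(name)             # "WIND SPEED (M/S)"
name <- gsub(" ", "_", name)      # "WIND_SPEED_(M/S)"
gsub("[^A-Z0-9_]", "", name)      # "WIND_SPEED_MS"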


Data Wrangling with dplyr

In this lab, I focused on wrangling the historical Seoul bike-sharing demand dataset. This is the core dataset for building a predictive model later.

It contains the following columns:

  • DATE - Year-month-day
  • RENTED_BIKE_COUNT - Count of bikes rented at each hour
  • HOUR - Hour of the day
  • TEMPERATURE - Temperature in Celsius
  • HUMIDITY - Unit is %
  • WIND_SPEED - Unit is m/s
  • VISIBILITY - Multiplied by 10m
  • DEW_POINT_TEMPERATURE - The temperature to which the air would have to cool down in order to reach saturation, unit is Celsius
  • SOLAR_RADIATION - MJ/m2
  • RAINFALL - mm
  • SNOWFALL - cm
  • SEASONS - Winter, Spring, Summer, Autumn
  • HOLIDAY - Holiday/No Holiday
  • FUNCTIONING_DAY - NoFunc (non-functional hours), Fun (functional hours)

For this dataset, I was asked to use tidyverse to perform the following data wrangling tasks:

  • TASK: Detect and handle missing values
  • TASK: Create indicator (dummy) variables for categorical variables
  • TASK: Normalize data
# load data set
seoul_bike_sharing <- read_csv("raw_seoul_bike_sharing.csv")
## Rows: 8760 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): DATE, SEASONS, HOLIDAY, FUNCTIONING_DAY
## dbl (10): RENTED_BIKE_COUNT, HOUR, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBI...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# look at dataset summary
summary(seoul_bike_sharing)
##      DATE           RENTED_BIKE_COUNT      HOUR        TEMPERATURE    
##  Length:8760        Min.   :   2.0    Min.   : 0.00   Min.   :-17.80  
##  Class :character   1st Qu.: 214.0    1st Qu.: 5.75   1st Qu.:  3.40  
##  Mode  :character   Median : 542.0    Median :11.50   Median : 13.70  
##                     Mean   : 729.2    Mean   :11.50   Mean   : 12.87  
##                     3rd Qu.:1084.0    3rd Qu.:17.25   3rd Qu.: 22.50  
##                     Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
##                     NA's   :295                       NA's   :11      
##     HUMIDITY       WIND_SPEED      VISIBILITY   DEW_POINT_TEMPERATURE
##  Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
##  1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 940   1st Qu.: -4.700      
##  Median :57.00   Median :1.500   Median :1698   Median :  5.100      
##  Mean   :58.23   Mean   :1.725   Mean   :1437   Mean   :  4.074      
##  3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 14.800      
##  Max.   :98.00   Max.   :7.400   Max.   :2000   Max.   : 27.200      
##                                                                      
##  SOLAR_RADIATION     RAINFALL          SNOWFALL         SEASONS         
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000   Length:8760       
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   Class :character  
##  Median :0.0100   Median : 0.0000   Median :0.00000   Mode  :character  
##  Mean   :0.5691   Mean   : 0.1487   Mean   :0.07507                     
##  3rd Qu.:0.9300   3rd Qu.: 0.0000   3rd Qu.:0.00000                     
##  Max.   :3.5200   Max.   :35.0000   Max.   :8.80000                     
##                                                                         
##    HOLIDAY          FUNCTIONING_DAY   
##  Length:8760        Length:8760       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
dim(seoul_bike_sharing)
## [1] 8760   14
colSums(is.na(seoul_bike_sharing))
##                  DATE     RENTED_BIKE_COUNT                  HOUR 
##                     0                   295                     0 
##           TEMPERATURE              HUMIDITY            WIND_SPEED 
##                    11                     0                     0 
##            VISIBILITY DEW_POINT_TEMPERATURE       SOLAR_RADIATION 
##                     0                     0                     0 
##              RAINFALL              SNOWFALL               SEASONS 
##                     0                     0                     0 
##               HOLIDAY       FUNCTIONING_DAY 
##                     0                     0

From the summary, I observed that:

Columns RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, SNOWFALL are numerical variables/columns and require normalization. Moreover, RENTED_BIKE_COUNT and TEMPERATURE have some missing values (NA’s) that need to be handled properly.

SEASONS, HOLIDAY, FUNCTIONING_DAY are categorical variables which need to be converted into indicator columns or dummy variables. Also, HOUR is read as a numerical variable but it is in fact a categorical variable with levels ranging from 0 to 23.

Now that we have some basic ideas about how to process this bike-sharing demand dataset, let's start working on it!

TASK: Detect and handle missing values

I saw that RENTED_BIKE_COUNT has only about 3% missing values (295 / 8760), so I can safely drop any rows where RENTED_BIKE_COUNT is missing.

TODO: Drop rows with missing values in the RENTED_BIKE_COUNT column

# Drop rows with `RENTED_BIKE_COUNT` column == NA
seoul_bike_sharing <- seoul_bike_sharing %>% filter(!is.na(RENTED_BIKE_COUNT))
dim(seoul_bike_sharing)
## [1] 8465   14

Unlike the RENTED_BIKE_COUNT variable, TEMPERATURE is not a response variable. However, it is still an important predictor variable.

How do I handle the missing values for TEMPERATURE? I could simply remove the rows, but it is better to impute them, since TEMPERATURE should be relatively easy and reliable to estimate statistically.

I took a look at the missing values in the TEMPERATURE column.

seoul_bike_sharing %>%
  filter(is.na(TEMPERATURE))
## # A tibble: 11 × 14
##    DATE       RENTED_BIKE_COUNT  HOUR TEMPERATURE HUMIDITY WIND_SPEED VISIBILITY
##    <chr>                  <dbl> <dbl>       <dbl>    <dbl>      <dbl>      <dbl>
##  1 07/06/2018              3221    18          NA       57        2.7       1217
##  2 12/06/2018              1246    14          NA       45        2.2       1961
##  3 13/06/2018              2664    17          NA       57        3.3        919
##  4 17/06/2018              2330    17          NA       58        3.3        865
##  5 20/06/2018              2741    19          NA       61        2.7       1236
##  6 30/06/2018              1144    13          NA       87        1.7        390
##  7 05/07/2018               827    10          NA       75        1.1       1028
##  8 11/07/2018               634     9          NA       96        0.6        450
##  9 12/07/2018               593     6          NA       93        1.1        852
## 10 21/07/2018               347     4          NA       77        1.2       1203
## 11 21/08/2018              1277    23          NA       75        0.1       1892
## # ℹ 7 more variables: DEW_POINT_TEMPERATURE <dbl>, SOLAR_RADIATION <dbl>,
## #   RAINFALL <dbl>, SNOWFALL <dbl>, SEASONS <chr>, HOLIDAY <chr>,
## #   FUNCTIONING_DAY <chr>

It seems that all of the missing values for TEMPERATURE are found in SEASONS == Summer, so it is reasonable to impute those missing values with the summer average temperature.

TODO: Impute the missing values in the TEMPERATURE column using the summer average temperature.

# Calculate the summer average temperature
summer_avg_temp <- seoul_bike_sharing %>%
  filter(SEASONS == "Summer") %>%
  select(TEMPERATURE) %>%
  summarise(mean(TEMPERATURE, na.rm = TRUE)) %>%
  unlist() %>%
  unname()

# Impute missing values for TEMPERATURE column with summer average temperature

seoul_bike_sharing$TEMPERATURE <- replace_na(seoul_bike_sharing$TEMPERATURE, summer_avg_temp)

# Check that the NA values have been replaced and none remain
seoul_bike_sharing %>%
  filter(is.na(TEMPERATURE))
## # A tibble: 0 × 14
## # ℹ 14 variables: DATE <chr>, RENTED_BIKE_COUNT <dbl>, HOUR <dbl>,
## #   TEMPERATURE <dbl>, HUMIDITY <dbl>, WIND_SPEED <dbl>, VISIBILITY <dbl>,
## #   DEW_POINT_TEMPERATURE <dbl>, SOLAR_RADIATION <dbl>, RAINFALL <dbl>,
## #   SNOWFALL <dbl>, SEASONS <chr>, HOLIDAY <chr>, FUNCTIONING_DAY <chr>
# Save the dataset as `seoul_bike_sharing.csv`
# (without row.names = FALSE, write.csv also stores the row numbers as an
# unnamed first column, which resurfaces as `...1` when the file is re-read)
write.csv(seoul_bike_sharing, "seoul_bike_sharing.csv")

TASK: Create indicator (dummy) variables for categorical variables

Regression models cannot process categorical variables directly, so I need to convert them into indicator variables.

In the bike-sharing demand dataset, SEASONS and HOLIDAY are categorical variables. Also, HOUR is read as a numerical variable, but it is in fact a categorical variable with levels ranging from 0 to 23.

TODO: Convert HOUR column from numeric into character first:

# Using mutate() function to convert HOUR column into character type
seoul_bike_sharing <- seoul_bike_sharing %>% mutate(HOUR = as.character(HOUR))

TODO: Convert SEASONS, HOLIDAY, and HOUR columns into indicator columns.

# Convert SEASONS, HOLIDAY and HOUR columns into indicator columns.

cols <- c("SEASONS", "HOLIDAY", "HOUR")

# For each categorical column, add a constant dummy = 1 flag and spread()
# it into one indicator column per level (absent levels are filled with 0)
for (col in cols) {
  seoul_bike_sharing <- seoul_bike_sharing %>%
    mutate(dummy = 1) %>%
    spread(key = col, value = dummy, fill = 0)
}
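Note that spread() is superseded in current tidyr. A sketch of the same one-hot encoding with pivot_wider(), shown for SEASONS only, would be:

# Equivalent indicator encoding with the newer tidyr verb
seoul_bike_sharing %>%
  mutate(dummy = 1) %>%
  pivot_wider(names_from = SEASONS, values_from = dummy, values_fill = 0)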
# Print the dataset summary again to make sure the indicator columns are created properly
summary(seoul_bike_sharing)
##      DATE           RENTED_BIKE_COUNT  TEMPERATURE        HUMIDITY    
##  Length:8465        Min.   :   2.0    Min.   :-17.80   Min.   : 0.00  
##  Class :character   1st Qu.: 214.0    1st Qu.:  3.00   1st Qu.:42.00  
##  Mode  :character   Median : 542.0    Median : 13.50   Median :57.00  
##                     Mean   : 729.2    Mean   : 12.77   Mean   :58.15  
##                     3rd Qu.:1084.0    3rd Qu.: 22.70   3rd Qu.:74.00  
##                     Max.   :3556.0    Max.   : 39.40   Max.   :98.00  
##    WIND_SPEED      VISIBILITY   DEW_POINT_TEMPERATURE SOLAR_RADIATION 
##  Min.   :0.000   Min.   :  27   Min.   :-30.600       Min.   :0.0000  
##  1st Qu.:0.900   1st Qu.: 935   1st Qu.: -5.100       1st Qu.:0.0000  
##  Median :1.500   Median :1690   Median :  4.700       Median :0.0100  
##  Mean   :1.726   Mean   :1434   Mean   :  3.945       Mean   :0.5679  
##  3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 15.200       3rd Qu.:0.9300  
##  Max.   :7.400   Max.   :2000   Max.   : 27.200       Max.   :3.5200  
##     RAINFALL          SNOWFALL       FUNCTIONING_DAY        Autumn      
##  Min.   : 0.0000   Min.   :0.00000   Length:8465        Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.:0.00000   Class :character   1st Qu.:0.0000  
##  Median : 0.0000   Median :0.00000   Mode  :character   Median :0.0000  
##  Mean   : 0.1491   Mean   :0.07768                      Mean   :0.2288  
##  3rd Qu.: 0.0000   3rd Qu.:0.00000                      3rd Qu.:0.0000  
##  Max.   :35.0000   Max.   :8.80000                      Max.   :1.0000  
##      Spring           Summer           Winter          Holiday      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2552   Mean   :0.2608   Mean   :0.2552   Mean   :0.0482  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    No Holiday           0                 1                 10        
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.9518   Mean   :0.04158   Mean   :0.04158   Mean   :0.0417  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##        11               12               13               14        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        15               16               17               18        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        19               2                 20               21        
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.04158   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##        22               23               3                 4          
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.04158   Mean   :0.04158  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##        5                 6                 7                8         
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.04158   Mean   :0.04158   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##        9         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0417  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
# drop the DATE column, which is not needed in the converted modeling dataset
seoul_bike_sharing <- seoul_bike_sharing[, -1]
# Save the dataset as `seoul_bike_sharing_converted.csv`
write_csv(seoul_bike_sharing, "seoul_bike_sharing_converted.csv")

TASK: Normalize data

In this project, I was asked to use Min-max normalization.

TODO: Apply min-max normalization on RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, SNOWFALL
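Min-max normalization maps each value x to (x - min) / (max - min), so every column ends up in the [0, 1] range. A minimal hand-rolled helper, equivalent to scales::rescale(x, to = c(0, 1)):

# Hand-rolled min-max scaler; na.rm guards against any remaining NAs
min_max <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}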

# Use the `mutate()` function to apply min-max normalization on columns
# `RENTED_BIKE_COUNT`, `TEMPERATURE`, `HUMIDITY`, `WIND_SPEED`, `VISIBILITY`, `DEW_POINT_TEMPERATURE`, `SOLAR_RADIATION`, `RAINFALL`, `SNOWFALL`

# rescale() from the scales package performs min-max normalization when to = c(0, 1)
seoul_bike_sharing <- seoul_bike_sharing %>%
  mutate(
    RENTED_BIKE_COUNT = rescale(RENTED_BIKE_COUNT, to = c(0, 1)),
    TEMPERATURE = rescale(TEMPERATURE, to = c(0, 1)),
    HUMIDITY = rescale(HUMIDITY, to = c(0, 1)),
    WIND_SPEED = rescale(WIND_SPEED, to = c(0, 1)),
    VISIBILITY = rescale(VISIBILITY, to = c(0, 1)),
    DEW_POINT_TEMPERATURE = rescale(DEW_POINT_TEMPERATURE, to = c(0, 1)),
    SOLAR_RADIATION = rescale(SOLAR_RADIATION, to = c(0, 1)),
    RAINFALL = rescale(RAINFALL, to = c(0, 1)),
    SNOWFALL = rescale(SNOWFALL, to = c(0, 1))
  )
# Print the summary of the dataset again to make sure the numeric columns range between 0 and 1
summary(seoul_bike_sharing)
##  RENTED_BIKE_COUNT  TEMPERATURE        HUMIDITY        WIND_SPEED    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.05965   1st Qu.:0.3636   1st Qu.:0.4286   1st Qu.:0.1216  
##  Median :0.15194   Median :0.5472   Median :0.5816   Median :0.2027  
##  Mean   :0.20460   Mean   :0.5345   Mean   :0.5933   Mean   :0.2332  
##  3rd Qu.:0.30445   3rd Qu.:0.7080   3rd Qu.:0.7551   3rd Qu.:0.3108  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    VISIBILITY     DEW_POINT_TEMPERATURE SOLAR_RADIATION       RAINFALL       
##  Min.   :0.0000   Min.   :0.0000        Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.4602   1st Qu.:0.4412        1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.8429   Median :0.6107        Median :0.002841   Median :0.000000  
##  Mean   :0.7131   Mean   :0.5977        Mean   :0.161326   Mean   :0.004261  
##  3rd Qu.:1.0000   3rd Qu.:0.7924        3rd Qu.:0.264205   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.0000        Max.   :1.000000   Max.   :1.000000  
##     SNOWFALL        FUNCTIONING_DAY        Autumn           Spring      
##  Min.   :0.000000   Length:8465        Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000000   Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000000   Mode  :character   Median :0.0000   Median :0.0000  
##  Mean   :0.008828                      Mean   :0.2288   Mean   :0.2552  
##  3rd Qu.:0.000000                      3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.000000                      Max.   :1.0000   Max.   :1.0000  
##      Summer           Winter          Holiday         No Holiday    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.2608   Mean   :0.2552   Mean   :0.0482   Mean   :0.9518  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        0                 1                 10               11        
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.04158   Mean   :0.04158   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##        12               13               14               15        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        16               17               18               19        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        2                 20               21               22        
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.04158   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        23               3                 4                 5          
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.0417   Mean   :0.04158   Mean   :0.04158   Mean   :0.04158  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##        6                 7                8                9         
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.04158   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
# Save the dataset as `seoul_bike_sharing_converted_normalized.csv`
write_csv(seoul_bike_sharing, "seoul_bike_sharing_converted_normalized.csv")

Standardize the column names again for the new datasets

# Dataset list
dataset_list <- c("seoul_bike_sharing.csv", "seoul_bike_sharing_converted.csv", "seoul_bike_sharing_converted_normalized.csv")

for (dataset_name in dataset_list) {
  # Read dataset
  dataset <- read_csv(dataset_name)
  # Standardized its columns:
  # Convert all columns names to uppercase
  names(dataset) <- toupper(names(dataset))
  # Replace any white space separators by underscore, using str_replace_all function
  names(dataset) <- str_replace_all(names(dataset), " ", "_")
  # Save the dataset back
  write.csv(dataset, dataset_name, row.names = FALSE)
}
## New names:
## Rows: 8465 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): DATE, SEASONS, HOLIDAY, FUNCTIONING_DAY dbl (11): ...1, RENTED_BIKE_COUNT,
## HOUR, TEMPERATURE, HUMIDITY, WIND_SPEED, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8465 Columns: 40
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): FUNCTIONING_DAY dbl (39): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY,
## WIND_SPEED, VISIBILITY, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8465 Columns: 40
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): FUNCTIONING_DAY dbl (39): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY,
## WIND_SPEED, VISIBILITY, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Note: the `...1` column flagged here is the row-number column written by the earlier write.csv() call on seoul_bike_sharing.csv that omitted row.names = FALSE.

Performing Exploratory Data Analysis with SQL, Tidyverse & ggplot2

Create tables and load data in DB2

Create tables

I created four tables for the four datasets on the IBM Cloud Db2 service using the SQL script editor:

  • WORLD_CITIES
  • BIKE_SHARING_SYSTEMS
  • CITIES_WEATHER_FORECAST
  • SEOUL_BIKE_SHARING

-- Create query for world_cities
CREATE TABLE WORLD_CITIES (
    CITY    VARCHAR(50),
    CITY_ASCII VARCHAR(50),
    LAT DECIMAL(20,2),
    LNG DECIMAL(20,2),
    COUNTRY VARCHAR(50),
    ISO2 VARCHAR(5),
    ISO3 VARCHAR(5),
    ADMIN_NAME VARCHAR(100),    
    CAPITAL VARCHAR(50),
    POPULATION BIGINT,
    ID BIGINT NOT NULL
);
-- Create query for bike_sharing
CREATE TABLE BIKE_SHARING_SYSTEMS (
    COUNTRY VARCHAR(20),
    CITY VARCHAR(87),
    SYSTEM VARCHAR(40),
    BICYCLES NUMERIC
);
-- Create query for cities_weather_forecast
CREATE TABLE CITIES_WEATHER_FORECAST (
    CITY VARCHAR(16),
    WEATHER VARCHAR(6),
    VISIBILITY SMALLINT,
    TEMP DECIMAL(6,2),
    TEMP_MIN DECIMAL(6,2),
    TEMP_MAX DECIMAL(6,2),
    PRESSURE SMALLINT,
    HUMIDITY SMALLINT,
    WIND_SPEED DECIMAL(6,2),
    WIND_DEG SMALLINT,
    SEASON VARCHAR(6),
    FORECAST_DATETIME TIMESTAMP
);
-- Create query for seoul_bike_sharing
CREATE TABLE SEOUL_BIKE_SHARING (
    DATE VARCHAR(30),
    RENTED_BIKE_COUNT SMALLINT,
    HOUR SMALLINT,
    TEMPERATURE DECIMAL(4,1),
    HUMIDITY SMALLINT,
    WIND_SPEED DECIMAL(3,1),
    VISIBILITY SMALLINT,
    DEW_POINT_TEMPERATURE DECIMAL(4,1),
    SOLAR_RADIATION DECIMAL(5,2),
    RAINFALL DECIMAL(3,1),
    SNOWFALL DECIMAL(3,1),
    SEASONS VARCHAR(10),
    HOLIDAY VARCHAR(20),
    FUNCTIONING_DAY VARCHAR(5)
);

Load the data into Db2

I loaded the following CSV files into the four tables using the Db2 GUI:

  • raw_worldcities.csv
  • bike_sharing_systems.csv
  • cities_weather_forecast.csv
  • seoul_bike_sharing.csv

These are the CSV files from before normalization and before converting the categorical columns into indicator features.

Exploratory Data Analysis with SQL

Establish your Db2 connection

The project requirement was to use RODBC, but I wasn't able to set it up. I also tried RJDBC on my local machine without success, so I will be using SQLite instead.

# Define database connection
con <- dbConnect(RSQLite::SQLite(),"RDB.sqlite")
con
## <SQLiteConnection>
##   Path: C:\Users\MODEL23\Documents\RDB.sqlite
##   Extensions: TRUE
# Read CSV into R Dataframe
worldcities <- read.csv("raw_worldcities.csv")
bike_sharing_systems <- read.csv("bike_sharing_systems.csv")
cities_weather_forecast <- read.csv("raw_cities_weather_forecast.csv")
seoul_bike_sharing <- read.csv("seoul_bike_sharing.csv")
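The data frames still need to be written into the SQLite database before they can be queried. A minimal sketch using DBI's dbWriteTable(), with table names matching the Db2 schema above:

# Write each dataframe into SQLite, replacing any existing copy
dbWriteTable(con, "WORLD_CITIES", worldcities, overwrite = TRUE)
dbWriteTable(con, "BIKE_SHARING_SYSTEMS", bike_sharing_systems, overwrite = TRUE)
dbWriteTable(con, "CITIES_WEATHER_FORECAST", cities_weather_forecast, overwrite = TRUE)
dbWriteTable(con, "SEOUL_BIKE_SHARING", seoul_bike_sharing, overwrite = TRUE)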
dbListTables(con)
## [1] "BIKE_SHARING_SYSTEMS"    "CITIES_WEATHER_FORECAST"
## [3] "SEOUL_BIKE_SHARING"      "WORLD_CITIES"
Task 1 - Record Count

Determine how many records are in the seoul_bike_sharing dataset.

dbGetQuery(con, "SELECT count(*) as Count_of_Records FROM seoul_bike_sharing")
##   Count_of_Records
## 1             8465
Task 2 - Operational Hours

Determine how many hours had non-zero rented bike count

dbGetQuery(con, "SELECT count(HOUR) as Numer_of_hours FROM seoul_bike_sharing
WHERE RENTED_BIKE_COUNT > 0")
##   Numer_of_hours
## 1           8465
Task 3 - Weather Outlook

Query the weather forecast for Seoul over the next 3 hours. Recall that the records in the CITIES_WEATHER_FORECAST dataset are 3 hours apart, so I just need the first record from the query.

dbGetQuery(con, "SELECT * FROM CITIES_WEATHER_FORECAST
WHERE CITY = 'Seoul'
Limit 1")
##    CITY WEATHER VISIBILITY  TEMP TEMP_MIN TEMP_MAX PRESSURE HUMIDITY WIND_SPEED
## 1 Seoul    Rain      10000 23.49    22.96    23.49     1011       97       2.26
##   WIND_DEG   FORECAST_DATETIME
## 1      223 2025-09-16 12:00:00
Task 4 - Seasons

Find which seasons are included in the Seoul bike-sharing dataset.

dbGetQuery(con, "SELECT DISTINCT(SEASONS) FROM seoul_bike_sharing")
##   SEASONS
## 1  Winter
## 2  Spring
## 3  Summer
## 4  Autumn
Task 5 - Date Range

Find the first and last dates in the Seoul Bike Sharing dataset.

dbGetQuery(con, "SELECT MIN(DATE) as Start_Date, MAX(DATE) as End_Date FROM seoul_bike_sharing")
##   Start_Date   End_Date
## 1 01/01/2018 31/12/2017

Note: DATE is stored as text in dd/mm/yyyy format, so MIN() and MAX() compare the strings lexicographically rather than chronologically; the true range (confirmed by the dataset summary later) is 01/12/2017 to 30/11/2018.
Task 6 - Subquery - ‘all-time high’

Determine which date and hour had the most bike rentals.

dbGetQuery(con, "SELECT DATE, HOUR, RENTED_BIKE_COUNT as Maximum_COUNT FROM seoul_bike_sharing
WHERE RENTED_BIKE_COUNT = (SELECT MAX(RENTED_BIKE_COUNT) FROM seoul_bike_sharing)")
##         DATE HOUR Maximum_COUNT
## 1 19/06/2018   18          3556
Task 7 - Hourly popularity and temperature by season

Determine the average hourly temperature and the average number of bike rentals per hour over each season. List the top ten results by average bike count.

dbGetQuery(con, "SELECT SEASONS, HOUR, AVG(RENTED_BIKE_COUNT), AVG(TEMPERATURE) FROM seoul_bike_sharing GROUP BY SEASONS, HOUR ORDER BY AVG(RENTED_BIKE_COUNT) desc limit 10")
##    SEASONS HOUR AVG(RENTED_BIKE_COUNT) AVG(TEMPERATURE)
## 1   Summer   18               2135.141         29.38791
## 2   Autumn   18               1983.333         16.03185
## 3   Summer   19               1889.250         28.27378
## 4   Summer   20               1801.924         27.06630
## 5   Summer   21               1754.065         26.27826
## 6   Spring   18               1689.311         15.97222
## 7   Summer   22               1567.870         25.69891
## 8   Autumn   17               1562.877         17.27778
## 9   Summer   17               1526.293         30.07691
## 10  Autumn   19               1515.568         15.06346
Task 8 - Rental Seasonality

Find the average hourly bike count during each season. Also include the minimum, maximum, and standard deviation of the hourly bike count for each season.

SQLite has no built-in standard deviation function, so I will not include it here.
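That said, a population standard deviation can still be derived from AVG() alone via the identity Var(X) = E[X^2] - E[X]^2, taking the square root back in R. A sketch (the VAR_S_COUNT and SD_S_COUNT aliases are mine):

sd_by_season <- dbGetQuery(con, "SELECT SEASONS,
  AVG(RENTED_BIKE_COUNT * RENTED_BIKE_COUNT) - AVG(RENTED_BIKE_COUNT) * AVG(RENTED_BIKE_COUNT) AS VAR_S_COUNT
  FROM seoul_bike_sharing GROUP BY SEASONS")
# Square root in R, since SQRT() may not be available in the SQLite build
sd_by_season$SD_S_COUNT <- sqrt(sd_by_season$VAR_S_COUNT)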

dbGetQuery(con, " SELECT SEASONS, AVG(RENTED_BIKE_COUNT) as AVG_S_COUNT, MIN(RENTED_BIKE_COUNT) as MIN_S_COUNT, MAX(RENTED_BIKE_COUNT) as MAX_S_COUNT FROM seoul_bike_sharing
group by (SEASONS)
Order by AVG_S_COUNT desc")
##   SEASONS AVG_S_COUNT MIN_S_COUNT MAX_S_COUNT
## 1  Summer   1034.0734           9        3556
## 2  Autumn    924.1105           2        3298
## 3  Spring    746.2542           2        3251
## 4  Winter    225.5412           3         937
Task 9 - Weather Seasonality

Consider the weather over each season. On average, what were the TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, and SNOWFALL per season? Include the average bike count as well, and rank the results by average bike count so I can see whether it is correlated with the weather at all.

dbGetQuery(con," SELECT SEASONS, AVG(RENTED_BIKE_COUNT) as AVG_S_COUNT, AVG(TEMPERATURE) as AVG_S_TEMP, AVG(HUMIDITY) as AVG_S_HUMIDITY, AVG(WIND_SPEED) as AVG_WIND_SPEED, AVG(VISIBILITY) as AVG_VISIBILITY, AVG(DEW_POINT_TEMPERATURE) as AVG_DEW_POINT, AVG(SOLAR_RADIATION) as AVG_SOLAR_RADIATION, AVG(RAINFALL) as AVG_RAINFALL, AVG(SNOWFALL) as AVG_SNOWFALL   FROM seoul_bike_sharing
group by (SEASONS)
Order by AVG_S_COUNT desc")
##   SEASONS AVG_S_COUNT AVG_S_TEMP AVG_S_HUMIDITY AVG_WIND_SPEED AVG_VISIBILITY
## 1  Summer   1034.0734  26.587711       64.98143       1.609420       1501.745
## 2  Autumn    924.1105  13.821580       59.04491       1.492101       1558.174
## 3  Spring    746.2542  13.021685       58.75833       1.857778       1240.912
## 4  Winter    225.5412  -2.540463       49.74491       1.922685       1445.987
##   AVG_DEW_POINT AVG_SOLAR_RADIATION AVG_RAINFALL AVG_SNOWFALL
## 1     18.750136           0.7612545   0.25348732   0.00000000
## 2      5.150594           0.5227827   0.11765617   0.06350026
## 3      4.091389           0.6803009   0.18694444   0.00000000
## 4    -12.416667           0.2981806   0.03282407   0.24750000

The warmer the season, the more bikes are rented; from these results, season, average temperature, average humidity, and average rainfall all appear correlated with the average bike count.

Task 10 - Total Bike Count and City Info for Seoul

Use an implicit join across the WORLD_CITIES and BIKE_SHARING_SYSTEMS tables to determine the total number of bikes available in Seoul, plus the following city information about Seoul: CITY, COUNTRY, LAT, LON, POPULATION, in a single view. In this case the CITY column works for the WORLD_CITIES table, but in general you would have to use the CITY_ASCII column.

dbGetQuery(con, "SELECT  B.CITY,  W.LAT, W.LNG, W.POPULATION  FROM BIKE_SHARING_SYSTEMS AS B
LEFT JOIN WORLD_CITIES AS W ON B.CITY = W.CITY_ASCII
WHERE B.CITY = 'Seoul'")
##     CITY     LAT LNG POPULATION
## 1  Seoul 37.5833 127   21794000
## 2  Seoul 37.5833 127   21794000
## 3  Seoul 37.5833 127   21794000
## 4  Seoul 37.5833 127   21794000
## 5  Seoul 37.5833 127   21794000
## 6  Seoul 37.5833 127   21794000
## 7  Seoul 37.5833 127   21794000
## 8  Seoul 37.5833 127   21794000
## 9  Seoul 37.5833 127   21794000
## 10 Seoul 37.5833 127   21794000
## 11 Seoul 37.5833 127   21794000
## 12 Seoul 37.5833 127   21794000
## 13 Seoul 37.5833 127   21794000
## 14 Seoul 37.5833 127   21794000
## 15 Seoul 37.5833 127   21794000
## 16 Seoul 37.5833 127   21794000
## 17 Seoul 37.5833 127   21794000
## 18 Seoul 37.5833 127   21794000
## 19 Seoul 37.5833 127   21794000
## 20 Seoul 37.5833 127   21794000
## 21 Seoul 37.5833 127   21794000
## 22 Seoul 37.5833 127   21794000
## 23 Seoul 37.5833 127   21794000
## 24 Seoul 37.5833 127   21794000
## 25 Seoul 37.5833 127   21794000
## 26 Seoul 37.5833 127   21794000
## 27 Seoul 37.5833 127   21794000
## 28 Seoul 37.5833 127   21794000
## 29 Seoul 37.5833 127   21794000
## 30 Seoul 37.5833 127   21794000
## 31 Seoul 37.5833 127   21794000
## 32 Seoul 37.5833 127   21794000
## 33 Seoul 37.5833 127   21794000
## 34 Seoul 37.5833 127   21794000
## 35 Seoul 37.5833 127   21794000
## 36 Seoul 37.5833 127   21794000
## 37 Seoul 37.5833 127   21794000
## 38 Seoul 37.5833 127   21794000
## 39 Seoul 37.5833 127   21794000
## 40 Seoul 37.5833 127   21794000
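As written, the query omits the bike total and returns one row per matching record pair, hence the duplicates above. A sketch of an aggregated version with an implicit join (assuming BICYCLES holds the bike count per system record; if either table contains duplicated Seoul rows, as the repeated rows above suggest, they would need de-duplicating first):

dbGetQuery(con, "SELECT B.CITY, W.COUNTRY, W.LAT, W.LNG, W.POPULATION, SUM(B.BICYCLES) AS TOTAL_BIKES
FROM BIKE_SHARING_SYSTEMS AS B, WORLD_CITIES AS W
WHERE B.CITY = W.CITY_ASCII AND B.CITY = 'Seoul'
GROUP BY B.CITY, W.COUNTRY, W.LAT, W.LNG, W.POPULATION")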

Complete the EDA with Data Visualization

Task 1 - Load the dataset

Load the seoul_bike_sharing data into a dataframe; this is the cleaned CSV from before normalization and before converting the categorical columns into indicator columns.

seoul_bike_sharing <- read.csv("seoul_bike_sharing.csv")
Task 2 - Recast DATE as a date

Use the format of the data, namely “%d/%m/%Y”

seoul_bike_sharing$DATE <- as.Date(seoul_bike_sharing$DATE, format = "%d/%m/%Y")
Task 3 - Cast HOUR as a categorical variable

Also, coerce its levels to be an ordered sequence. This will ensure the visualizations correctly utilize HOUR as a discrete variable with the expected ordering.

seoul_bike_sharing$HOUR <- as.factor(seoul_bike_sharing$HOUR)
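Because HOUR is numeric here, as.factor() already yields levels in ascending numeric order (0 through 23, as the str() output below confirms). For an explicitly ordered factor, a sketch of the alternative:

# Explicitly ordered factor with levels 0 through 23
seoul_bike_sharing$HOUR <- factor(seoul_bike_sharing$HOUR, levels = 0:23, ordered = TRUE)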

Check the structure of the dataframe

str(seoul_bike_sharing)
## 'data.frame':    8465 obs. of  15 variables:
##  $ ...1                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DATE                 : Date, format: "2017-12-01" "2017-12-01" ...
##  $ RENTED_BIKE_COUNT    : int  254 204 173 107 78 100 181 460 930 490 ...
##  $ HOUR                 : Factor w/ 24 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ TEMPERATURE          : num  -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
##  $ HUMIDITY             : int  37 38 39 40 36 37 35 38 37 27 ...
##  $ WIND_SPEED           : num  2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
##  $ VISIBILITY           : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 1928 ...
##  $ DEW_POINT_TEMPERATURE: num  -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
##  $ SOLAR_RADIATION      : num  0 0 0 0 0 0 0 0 0.01 0.23 ...
##  $ RAINFALL             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOWFALL             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEASONS              : chr  "Winter" "Winter" "Winter" "Winter" ...
##  $ HOLIDAY              : chr  "No Holiday" "No Holiday" "No Holiday" "No Holiday" ...
##  $ FUNCTIONING_DAY      : chr  "Yes" "Yes" "Yes" "Yes" ...

Finally, ensure there are no missing values

sum(is.na(seoul_bike_sharing))
## [1] 0

Descriptive Statistics

Task 4 - Dataset Summary
summary(seoul_bike_sharing)
##       ...1           DATE            RENTED_BIKE_COUNT      HOUR     
##  Min.   :   1   Min.   :2017-12-01   Min.   :   2.0    7      : 353  
##  1st Qu.:2117   1st Qu.:2018-02-27   1st Qu.: 214.0    8      : 353  
##  Median :4233   Median :2018-05-28   Median : 542.0    9      : 353  
##  Mean   :4233   Mean   :2018-05-28   Mean   : 729.2    10     : 353  
##  3rd Qu.:6349   3rd Qu.:2018-08-24   3rd Qu.:1084.0    11     : 353  
##  Max.   :8465   Max.   :2018-11-30   Max.   :3556.0    12     : 353  
##                                                        (Other):6347  
##   TEMPERATURE        HUMIDITY       WIND_SPEED      VISIBILITY  
##  Min.   :-17.80   Min.   : 0.00   Min.   :0.000   Min.   :  27  
##  1st Qu.:  3.00   1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 935  
##  Median : 13.50   Median :57.00   Median :1.500   Median :1690  
##  Mean   : 12.77   Mean   :58.15   Mean   :1.726   Mean   :1434  
##  3rd Qu.: 22.70   3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000  
##  Max.   : 39.40   Max.   :98.00   Max.   :7.400   Max.   :2000  
##                                                                 
##  DEW_POINT_TEMPERATURE SOLAR_RADIATION     RAINFALL          SNOWFALL      
##  Min.   :-30.600       Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000  
##  1st Qu.: -5.100       1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000  
##  Median :  4.700       Median :0.0100   Median : 0.0000   Median :0.00000  
##  Mean   :  3.945       Mean   :0.5679   Mean   : 0.1491   Mean   :0.07768  
##  3rd Qu.: 15.200       3rd Qu.:0.9300   3rd Qu.: 0.0000   3rd Qu.:0.00000  
##  Max.   : 27.200       Max.   :3.5200   Max.   :35.0000   Max.   :8.80000  
##                                                                            
##    SEASONS            HOLIDAY          FUNCTIONING_DAY   
##  Length:8465        Length:8465        Length:8465       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
Task 5 - Based on the above stats, calculate how many Holidays there are

The HOLIDAY column has two values, "No Holiday" and "Holiday", so I just need to filter for "Holiday" and count. Since there is a row for each hour, I must divide the total by 24 to get the number of holiday days.

holidays_count <- seoul_bike_sharing %>%
  filter(HOLIDAY == "Holiday") %>%
  count(HOLIDAY) %>%
  summarise(count = n / 24)

holidays_count
##   count
## 1    17
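An equivalent check that does not rely on there being exactly 24 rows per day is to count distinct dates instead (n_distinct() is from dplyr):

seoul_bike_sharing %>%
  filter(HOLIDAY == "Holiday") %>%
  summarise(count = n_distinct(DATE))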
Task 6 - Calculate the percentage of records that fall on a holiday.
total_records <- seoul_bike_sharing %>%
  count() %>%
  summarise(count = n / 24)
holidays_perc <- (holidays_count / total_records) * 100
holidays_perc
##      count
## 1 4.819846

Task 7 - Given there is exactly a full year of data, determine how many records we expect to have

total_records
##      count
## 1 352.7083

A full year of hourly data would be 365 x 24 = 8760 records (365 days); after dropping the rows with missing RENTED_BIKE_COUNT, 8465 records remain, i.e. about 352.7 days.
Task 8 - Given the observations for FUNCTIONING_DAY, how many records must there be?
seoul_bike_sharing %>%
  count(FUNCTIONING_DAY)
##   FUNCTIONING_DAY    n
## 1             Yes 8465

Every record has FUNCTIONING_DAY == "Yes", so there must be 8465 records.
Task 9 - Load the dplyr package, group the data by SEASONS, and use the summarize() function to calculate the seasonal total rainfall and snowfall
seoul_bike_sharing %>%
  group_by(SEASONS) %>%
  summarize(total_rainfall = sum(RAINFALL), total_snowfall = sum(SNOWFALL))
## # A tibble: 4 × 3
##   SEASONS total_rainfall total_snowfall
##   <chr>            <dbl>          <dbl>
## 1 Autumn           228.            123 
## 2 Spring           404.              0 
## 3 Summer           560.              0 
## 4 Winter            70.9           535.

Data Visualization

Task 10 - Create a scatter plot of RENTED_BIKE_COUNT vs DATE
ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT)) +
  geom_point(color = "blue", alpha = 0.25)

The rented bike count starts to increase around February/March, reaches its maximum in June, dips slightly toward August, rises again around September, and then decreases toward the end of the year.

Task 11 - Create the same plot of the RENTED_BIKE_COUNT time series, but now add HOUR as the colour.
ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT, color = HOUR)) +
  geom_point(alpha = 0.25)

Rented bike counts are lowest around dawn, increase slowly through the early morning hours, peak in the evening around 18:00-19:00, and then decrease again.

Task 12 - Create a histogram overlaid with a kernel density curve

Normalize the histogram so the y axis represents 'density'. This can be done by setting y = after_stat(density) in the aesthetics of the histogram (the older ..density.. spelling was deprecated in ggplot2 3.4.0).

ggplot(seoul_bike_sharing, aes(x = RENTED_BIKE_COUNT)) +
  geom_histogram(aes(y = after_stat(density)),
    colour = 1, fill = "white", alpha = 0.5
  ) +
  geom_density(color = "blue") +
  theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

I can clearly see that on most days the number of rented bikes is around 125-375; only on rare occasions does it go above 2500.

Task 13 - Use a scatter plot to visualize the correlation between RENTED_BIKE_COUNT and TEMPERATURE by SEASONS, using HOUR as the color.
ggplot(seoul_bike_sharing, aes(x = TEMPERATURE, y = RENTED_BIKE_COUNT, color = HOUR)) +
  geom_point(alpha = 0.25) +
  facet_wrap(~SEASONS)

This is a very interesting plot. In Autumn and Spring, where temperatures are similar (roughly 0 to 20 degrees), people tend to ride at the same hours and the bike counts are very similar; in general, more bikes are rented in warmer weather.

In Summer, people still ride at the same hours, but the count drops as the weather gets hotter.

In Winter there is a huge drop in the number of bikes rented, topping out around 1000, and people tend to ride mostly in the early morning and in the evening around 18:00-19:00.

Task 14 - Create a display of four boxplots of RENTED_BIKE_COUNT vs. HOUR grouped by SEASONS.

Use facet_wrap to generate four plots corresponding to the seasons

ggplot(seoul_bike_sharing, aes(x = HOUR, y = RENTED_BIKE_COUNT)) +
  geom_boxplot(fill = "bisque") +
  facet_wrap(~SEASONS)

There are many outliers in the hourly bike counts across the different seasons.

In general, people tend to use the bikes at the same times across the seasons, with slightly different counts, except in Winter, where there is a huge drop in the number of bikes being rented.

Task 15 - Group the data by DATE, and use the summarize() function to calculate the daily total rainfall and snowfall
seoul_bike_sharing %>%
  group_by(DATE) %>%
  summarise(daily_rainfall = sum(RAINFALL), daily_snowfall = sum(SNOWFALL))
## # A tibble: 353 × 3
##    DATE       daily_rainfall daily_snowfall
##    <date>              <dbl>          <dbl>
##  1 2017-12-01            0              0  
##  2 2017-12-02            0              0  
##  3 2017-12-03            4              0  
##  4 2017-12-04            0.1            0  
##  5 2017-12-05            0              0  
##  6 2017-12-06            1.3            8.6
##  7 2017-12-07            0             10.4
##  8 2017-12-08            0              0  
##  9 2017-12-09            0              0  
## 10 2017-12-10            4.1           32.5
## # ℹ 343 more rows

I plotted the results as shown below.

seoul_bike_sharing %>%
  group_by(DATE) %>%
  summarise(daily_rainfall = sum(RAINFALL), daily_snowfall = sum(SNOWFALL)) %>%
  pivot_longer(!DATE) %>%
  ggplot() +
  geom_point(aes(x = DATE, y = value, color = name))

Task 16 - Determine how many days had snowfall
seoul_bike_sharing %>%
  group_by(DATE) %>%
  summarise(daily_rainfall = sum(RAINFALL), daily_snowfall = sum(SNOWFALL)) %>%
  filter(daily_snowfall > 0) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    27

In addition to the assigned tasks, I created a visualization showing the bike count by day of the week.

seoul_bike_sharing %>%
  mutate(DAY_OF_WEEK = wday(DATE, abbr = TRUE, label = TRUE)) %>%
  ggplot(aes(x = DAY_OF_WEEK, y = RENTED_BIKE_COUNT, color = HOUR)) +
  geom_point(alpha = 0.25) +
  facet_wrap(~SEASONS)

The bike count is higher on weekdays than on weekends, which suggests that some users commute to work by bike on weekdays. Also, on weekends in Winter and Autumn the bike count peaks around 12:00-16:00, which is unusual on other days.

Predict Bike-Sharing Demand Using Regression Models

LAB: Predict Hourly Rented Bike Count using Basic Linear Regression Models:

  • TASK: Split data into training and testing datasets
  • TASK: Build a linear regression model using only the weather variables
  • TASK: Build a linear regression model using both weather and date/time variables
  • TASK: Evaluate the models and identify important variables

LAB: Refine the Baseline Regression Models:

  • TASK: Add higher order terms
  • TASK: Add interaction terms
  • TASK: Add regularization
  • TASK: Experiment to find the best performing model

Building a Baseline Regression Model

The seoul_bike_sharing_converted_normalized.csv file will be our main dataset, which has the following variables:

The response variable:

RENTED_BIKE_COUNT - Count of bikes rented at each hour

Weather predictor variables:

  • TEMPERATURE - Temperature in Celsius
  • HUMIDITY - Unit is %
  • WIND_SPEED - Unit is m/s
  • VISIBILITY - Multiplied by 10m
  • DEW_POINT_TEMPERATURE - The temperature to which the air would have to cool down in order to reach saturation, unit is Celsius
  • SOLAR_RADIATION - MJ/m2
  • RAINFALL - mm
  • SNOWFALL - cm

Date/time predictor variables:

  • HOUR - Hour of the day
  • HOLIDAY - Holiday/No Holiday
  • SEASONS - Winter, Spring, Summer, Autumn

I will load the dataset and remove the FUNCTIONING_DAY column, as it has only one value.

bike_sharing_df <- read_csv("seoul_bike_sharing_converted_normalized.csv")
## Rows: 8465 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): FUNCTIONING_DAY
## dbl (39): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop FUNCTIONING_DAY (column 10), which has only one value
bike_sharing_df <- bike_sharing_df[, -10]

TASK: Split training and testing data

TODO: Use the initial_split(), training(), and testing() functions to generate a training dataset consisting of 75% of the original dataset, and a testing dataset using the remaining 25%.

# Use the `initial_split()`, `training()`, and `testing()` functions to split the dataset
# With seed 1234
set.seed(1234)
# prop = 3/4
bike_sharing_split <- initial_split(bike_sharing_df, prop = 0.75)
# train_data
bike_sharing_training <- training(bike_sharing_split)
# test_data
bike_sharing_testing <- testing(bike_sharing_split)
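A quick sanity check that the split produced roughly 75% / 25% of the 8465 rows:

dim(bike_sharing_training)
dim(bike_sharing_testing)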

TASK: Build a linear regression model using weather variables only

TODO: Build a linear regression model called lm_model_weather

# Use `linear_reg()` with engine `lm` and mode `regression`
lm_model_weather <- linear_reg(mode = "regression", engine = "lm")
# Fit the model called `lm_model_weather`
# RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL,  with the training data

training_fit <- lm_model_weather %>%
  fit(RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL, data = bike_sharing_training)
# print(lm_model_weather$fit)
print(training_fit)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + 
##     WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + 
##     RAINFALL + SNOWFALL, data = data)
## 
## Coefficients:
##           (Intercept)            TEMPERATURE               HUMIDITY  
##              0.043532               0.675221              -0.258408  
##            WIND_SPEED             VISIBILITY  DEW_POINT_TEMPERATURE  
##              0.113806               0.003533              -0.089172  
##       SOLAR_RADIATION               RAINFALL               SNOWFALL  
##             -0.125170              -0.496346               0.089414

TASK: Build a linear regression model using all variables

TODO: Build a linear regression model called lm_model_all using all variables RENTED_BIKE_COUNT ~ .

# Use `linear_reg()` with engine `lm` and mode `regression`
lm_model_all <- linear_reg(mode = "regression", engine = "lm")
# Fit the model called `lm_model_all`
# `RENTED_BIKE_COUNT ~ .` means use all other variables except for the response variable

all_fit <- lm_model_all %>%
  fit(RENTED_BIKE_COUNT ~ ., data = bike_sharing_training)
print(all_fit)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ ., data = data)
## 
## Coefficients:
##           (Intercept)            TEMPERATURE               HUMIDITY  
##              0.059144               0.220219              -0.249502  
##            WIND_SPEED             VISIBILITY  DEW_POINT_TEMPERATURE  
##              0.008979               0.006154               0.168370  
##       SOLAR_RADIATION               RAINFALL               SNOWFALL  
##              0.077907              -0.580933               0.073431  
##                AUTUMN                 SPRING                 SUMMER  
##              0.101013               0.053845               0.055752  
##                WINTER                HOLIDAY             NO_HOLIDAY  
##                    NA              -0.035009                     NA  
##                   `0`                    `1`                   `10`  
##             -0.008244              -0.032878              -0.066831  
##                  `11`                   `12`                   `13`  
##             -0.069607              -0.058622              -0.053842  
##                  `14`                   `15`                   `16`  
##             -0.054148              -0.030876               0.006508  
##                  `17`                   `18`                   `19`  
##              0.085973               0.223636               0.147155  
##                   `2`                   `20`                   `21`  
##             -0.066745               0.121552               0.125656  
##                  `22`                   `23`                    `3`  
##              0.096410               0.029209              -0.090003  
##                   `4`                    `5`                    `6`  
##             -0.108692              -0.102060              -0.057434  
##                   `7`                    `8`                    `9`  
##              0.030039               0.126893                     NA
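
Note the NA coefficients for WINTER, NO_HOLIDAY, and hour `9`. Each one-hot group (seasons, the holiday flags, the hours) sums to 1 across its columns, so one column per group is perfectly collinear with the intercept and lm() drops it; this is also what triggers the rank-deficient warnings below. A minimal sketch of a full-rank alternative, assuming WINTER, NO_HOLIDAY, and `9` as reference levels (object names here are illustrative):

# Drop one redundant dummy per group so the design matrix is full rank
bike_sharing_full_rank <- bike_sharing_training %>%
  select(-WINTER, -NO_HOLIDAY, -`9`)

lm_full_rank <- linear_reg(mode = "regression", engine = "lm") %>%
  fit(RENTED_BIKE_COUNT ~ ., data = bike_sharing_full_rank)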

TASK: Model evaluation and identification of important variables

TODO: Make predictions on the testing dataset using both lm_model_weather and lm_model_all models

# Use predict() function to generate test results for `lm_model_weather` and `lm_model_all`
weather_train_results <- training_fit %>%
  predict(new_data = bike_sharing_training)

all_train_results <- all_fit %>%
  predict(new_data = bike_sharing_training)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
# and generate two test results dataframe with a truth column:
weather_train_results <- weather_train_results %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)

all_train_results <- all_train_results %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
# test_results_weather for lm_model_weather model
head(weather_train_results)
## # A tibble: 6 × 2
##    .pred  truth
##    <dbl>  <dbl>
## 1 0.294  0.764 
## 2 0.0749 0.117 
## 3 0.261  0.311 
## 4 0.0960 0.0428
## 5 0.185  0.0535
## 6 0.102  0.0594
# test_results_all for lm_model_all
head(all_train_results)
## # A tibble: 6 × 2
##    .pred  truth
##    <dbl>  <dbl>
## 1 0.512  0.764 
## 2 0.102  0.117 
## 3 0.295  0.311 
## 4 0.0572 0.0428
## 5 0.135  0.0535
## 6 0.143  0.0594

TODO: Use rsq() and rmse() functions to calculate R-squared and RMSE metrics for the two test results

rsq_weather <- rsq(weather_train_results,
  truth = truth,
  estimate = .pred
)
rsq_all <- rsq(all_train_results,
  truth = truth,
  estimate = .pred
)

rmse_weather <- rmse(weather_train_results,
  truth = truth,
  estimate = .pred
)
rmse_all <- rmse(all_train_results,
  truth = truth,
  estimate = .pred
)

rsq_weather
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.430
rsq_all
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.659
rmse_weather
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.137
rmse_all
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.106

From these results we find that lm_model_all performs much better: its R-squared is higher (0.659 vs. 0.430) and its RMSE lower (0.106 vs. 0.137).
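
As an aside, yardstick can compute both metrics in one call; a small sketch (bike_metrics is an illustrative name, and the results should match the tables above):

# Bundle R-squared and RMSE into a single metric set
bike_metrics <- metric_set(rsq, rmse)
bike_metrics(all_train_results, truth = truth, estimate = .pred)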

I will now look for the variables with the largest absolute coefficients, i.e., those that contribute the most to the prediction. Because the predictors were normalized, their coefficient magnitudes are directly comparable.

all_fit %>%
  tidy() %>%
  arrange(desc(abs(estimate)))
## # A tibble: 39 × 5
##    term                  estimate std.error statistic   p.value
##    <chr>                    <dbl>     <dbl>     <dbl>     <dbl>
##  1 RAINFALL                -0.581   0.0403     -14.4  2.41e- 46
##  2 HUMIDITY                -0.250   0.0280      -8.91 6.43e- 19
##  3 `18`                     0.224   0.00957     23.4  7.32e-116
##  4 TEMPERATURE              0.220   0.0597       3.69 2.27e-  4
##  5 DEW_POINT_TEMPERATURE    0.168   0.0623       2.70 6.89e-  3
##  6 `19`                     0.147   0.00964     15.3  1.05e- 51
##  7 `8`                      0.127   0.00918     13.8  7.72e- 43
##  8 `21`                     0.126   0.00959     13.1  1.05e- 38
##  9 `20`                     0.122   0.00960     12.7  2.82e- 36
## 10 `4`                     -0.109   0.00957    -11.4  1.27e- 29
## # ℹ 29 more rows

TODO: Sort the coefficient list in descending order and visualize the result using ggplot and geom_bar

# Visualize the list using ggplot and geom_bar
all_fit %>%
  tidy() %>%
  filter(!is.na(estimate)) %>%
  ggplot(aes(x = fct_reorder(term, abs(estimate)), y = abs(estimate))) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  theme(axis.text.y = element_text(angle = 10, colour = "red", size = 7)) +
  xlab("variable")

Improving the Linear Model

TASK: Add polynomial terms

Linear regression models capture linear relationships among variables well, but in real-world data many relationships are non-linear.

For example, the relationship between RENTED_BIKE_COUNT and TEMPERATURE does not look linear:

ggplot(bike_sharing_training, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula = 'y ~ x'

One way to handle such nonlinearity is polynomial regression: adding polynomial terms lets the model follow curvature that a straight-line fit misses, although very high degrees risk overfitting.

# Plot the higher order polynomial fits
ggplot(bike_sharing_training, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, color = "red") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "yellow") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 4), color = "green") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 6), color = "blue")

TODO: Fit a linear regression model lm_poly with higher order polynomial terms on the important variables (larger coefficients) found in the baseline model

# Fit a linear model with higher order polynomial on some important variables

lm_poly <- linear_reg(mode = "regression", engine = "lm") %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4), data = bike_sharing_training)
# Print model summary

summary(lm_poly$fit)
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 
##     6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4), 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44191 -0.05668  0.00153  0.05221  0.40859 
## 
## Coefficients: (6 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.0467508  0.0233009   2.006  0.04486 *  
## TEMPERATURE                      0.5590528  0.1348620   4.145 3.44e-05 ***
## HUMIDITY                        -0.1066547  0.0642850  -1.659  0.09715 .  
## WIND_SPEED                       0.0031707  0.0101624   0.312  0.75505    
## VISIBILITY                      -0.0081387  0.0053278  -1.528  0.12667    
## DEW_POINT_TEMPERATURE           -0.2358434  0.1479894  -1.594  0.11106    
## SOLAR_RADIATION                  0.0003396  0.0112372   0.030  0.97589    
## RAINFALL                        -0.3428396  0.0387976  -8.837  < 2e-16 ***
## SNOWFALL                        -0.0099515  0.0269518  -0.369  0.71197    
## AUTUMN                           0.0981633  0.0059819  16.410  < 2e-16 ***
## SPRING                           0.0569147  0.0058294   9.763  < 2e-16 ***
## SUMMER                           0.0881572  0.0075643  11.654  < 2e-16 ***
## WINTER                                  NA         NA      NA       NA    
## HOLIDAY                         -0.0323523  0.0057678  -5.609 2.12e-08 ***
## NO_HOLIDAY                              NA         NA      NA       NA    
## `0`                             -0.0217507  0.0085521  -2.543  0.01100 *  
## `1`                             -0.0480638  0.0084233  -5.706 1.21e-08 ***
## `10`                            -0.0522618  0.0081726  -6.395 1.72e-10 ***
## `11`                            -0.0434932  0.0084892  -5.123 3.09e-07 ***
## `12`                            -0.0214080  0.0086691  -2.469  0.01356 *  
## `13`                            -0.0156167  0.0088278  -1.769  0.07694 .  
## `14`                            -0.0126345  0.0086796  -1.456  0.14554    
## `15`                             0.0028753  0.0086200   0.334  0.73872    
## `16`                             0.0334231  0.0085130   3.926 8.72e-05 ***
## `17`                             0.0997805  0.0085385  11.686  < 2e-16 ***
## `18`                             0.2299977  0.0085091  27.029  < 2e-16 ***
## `19`                             0.1467266  0.0085572  17.147  < 2e-16 ***
## `2`                             -0.0809514  0.0084329  -9.600  < 2e-16 ***
## `20`                             0.1135171  0.0085240  13.317  < 2e-16 ***
## `21`                             0.1135640  0.0085108  13.343  < 2e-16 ***
## `22`                             0.0835330  0.0084484   9.887  < 2e-16 ***
## `23`                             0.0157767  0.0084505   1.867  0.06195 .  
## `3`                             -0.1012712  0.0085612 -11.829  < 2e-16 ***
## `4`                             -0.1209531  0.0085127 -14.209  < 2e-16 ***
## `5`                             -0.1123035  0.0083940 -13.379  < 2e-16 ***
## `6`                             -0.0700939  0.0084214  -8.323  < 2e-16 ***
## `7`                              0.0204802  0.0084165   2.433  0.01499 *  
## `8`                              0.1157473  0.0081324  14.233  < 2e-16 ***
## `9`                                     NA         NA      NA       NA    
## poly(TEMPERATURE, 6)1                   NA         NA      NA       NA    
## poly(TEMPERATURE, 6)2            2.0863456  0.2466725   8.458  < 2e-16 ***
## poly(TEMPERATURE, 6)3           -2.8232422  0.1363257 -20.710  < 2e-16 ***
## poly(TEMPERATURE, 6)4           -1.9242987  0.1275559 -15.086  < 2e-16 ***
## poly(TEMPERATURE, 6)5            0.0630522  0.1064890   0.592  0.55380    
## poly(TEMPERATURE, 6)6            0.4568546  0.0991038   4.610 4.11e-06 ***
## poly(HUMIDITY, 6)1                      NA         NA      NA       NA    
## poly(HUMIDITY, 6)2              -1.4529202  0.2054493  -7.072 1.69e-12 ***
## poly(HUMIDITY, 6)3              -0.9842698  0.1239663  -7.940 2.38e-15 ***
## poly(HUMIDITY, 6)4              -0.5623707  0.1800087  -3.124  0.00179 ** 
## poly(HUMIDITY, 6)5               0.2184708  0.1602069   1.364  0.17272    
## poly(HUMIDITY, 6)6               0.0696659  0.1240320   0.562  0.57436    
## poly(DEW_POINT_TEMPERATURE, 4)1         NA         NA      NA       NA    
## poly(DEW_POINT_TEMPERATURE, 4)2 -3.6349692  0.2686365 -13.531  < 2e-16 ***
## poly(DEW_POINT_TEMPERATURE, 4)3 -0.6604506  0.1367294  -4.830 1.40e-06 ***
## poly(DEW_POINT_TEMPERATURE, 4)4  0.2796526  0.1219957   2.292  0.02192 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0939 on 6299 degrees of freedom
## Multiple R-squared:  0.7345, Adjusted R-squared:  0.7324 
## F-statistic:   363 on 48 and 6299 DF,  p-value: < 2.2e-16
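
A note on the summary above: the degree-1 poly() columns come back NA because `.` already includes the raw TEMPERATURE, HUMIDITY, and DEW_POINT_TEMPERATURE columns, so the linear component of each polynomial basis is redundant. A sketch of one way to avoid the duplication (not the formula the lab specifies; lm_poly_alt is an illustrative name):

# Exclude the raw columns whose effects the poly() bases already include
lm_poly_alt <- linear_reg(mode = "regression", engine = "lm") %>%
  fit(
    RENTED_BIKE_COUNT ~ . - TEMPERATURE - HUMIDITY - DEW_POINT_TEMPERATURE +
      poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4),
    data = bike_sharing_training
  )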

TODO: Make predictions on the test dataset using the lm_poly model

# Use predict() function to generate test results for `lm_poly`
lm_ploy_train_results <- lm_poly %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
head(lm_ploy_train_results)
## # A tibble: 6 × 2
##    .pred  truth
##    <dbl>  <dbl>
## 1 0.546  0.764 
## 2 0.105  0.117 
## 3 0.282  0.311 
## 4 0.0601 0.0428
## 5 0.122  0.0535
## 6 0.127  0.0594

A common practice is to filter the results and check whether any predictions are negative:

lm_ploy_train_results %>%
  filter(.pred < 0)
## # A tibble: 570 × 2
##        .pred   truth
##        <dbl>   <dbl>
##  1 -0.00388  0.0506 
##  2 -0.0651   0.0124 
##  3 -0.00266  0.0405 
##  4 -0.00845  0.0366 
##  5 -0.0715   0.00929
##  6 -0.000178 0.0557 
##  7 -0.0691   0.00760
##  8 -0.0703   0.0225 
##  9 -0.0519   0.00619
## 10 -0.0242   0.0211 
## # ℹ 560 more rows

Another minor improvement is to convert all negative predictions to zero, because a rented bike count cannot be negative.
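
An equivalent one-liner is pmax(), which takes the elementwise maximum of each prediction and zero (a sketch; the replace() call in the next chunk does the same thing):

# pmax() clamps every negative prediction to zero in one call
lm_ploy_train_results$.pred <- pmax(lm_ploy_train_results$.pred, 0)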

lm_ploy_train_results$.pred <- replace(lm_ploy_train_results$.pred, lm_ploy_train_results$.pred < 0, 0)
# Calculate R-squared and RMSE from the test results
rsq_lm_ploy <- rsq(lm_ploy_train_results,
  truth = truth,
  estimate = .pred
)

rmse_lm_ploy <- rmse(lm_ploy_train_results,
  truth = truth,
  estimate = .pred
)

rsq_lm_ploy
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.750
rmse_lm_ploy
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0912

Both R-squared and RMSE improve relative to the all-variables linear model.

TASK: Add interaction terms

TODO: Try adding some interaction terms to the previous polynomial model.

# Add interaction terms to the poly regression built in previous step

# HINT: You could use `*` operator to create interaction terms such as HUMIDITY*TEMPERATURE and make the formula look like:
# RENTED_BIKE_COUNT ~ RAINFALL*HUMIDITY ...
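# (In an R formula, A * B expands to A + B + A:B, so both main effects and the interaction enter the model.)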
lm_poly <- linear_reg(mode = "regression", engine = "lm") %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`, data = bike_sharing_training)

# create test results

lm_ploy_train_results <- lm_poly %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
head(lm_ploy_train_results)
## # A tibble: 6 × 2
##    .pred  truth
##    <dbl>  <dbl>
## 1 0.604  0.764 
## 2 0.0968 0.117 
## 3 0.286  0.311 
## 4 0.0534 0.0428
## 5 0.113  0.0535
## 6 0.125  0.0594
summary(lm_poly$fit)
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 
##     6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + 
##     RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + 
##     AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41489 -0.05576  0.00173  0.05107  0.41942 
## 
## Coefficients: (6 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -0.549848   0.066463  -8.273  < 2e-16 ***
## TEMPERATURE                       1.819684   0.189917   9.581  < 2e-16 ***
## HUMIDITY                          1.063231   0.138860   7.657 2.19e-14 ***
## WIND_SPEED                        0.005198   0.010027   0.518  0.60424    
## VISIBILITY                       -0.005023   0.005258  -0.955  0.33941    
## DEW_POINT_TEMPERATURE            -0.288932   0.146231  -1.976  0.04821 *  
## SOLAR_RADIATION                  -0.005356   0.011112  -0.482  0.62981    
## RAINFALL                        -10.714523   1.548085  -6.921 4.92e-12 ***
## SNOWFALL                         -0.070822   0.027240  -2.600  0.00935 ** 
## AUTUMN                            0.092807   0.005998  15.473  < 2e-16 ***
## SPRING                            0.055491   0.005743   9.662  < 2e-16 ***
## SUMMER                            0.087942   0.007449  11.806  < 2e-16 ***
## WINTER                                  NA         NA      NA       NA    
## HOLIDAY                          -0.019352   0.006650  -2.910  0.00363 ** 
## NO_HOLIDAY                              NA         NA      NA       NA    
## `0`                              -0.020703   0.008422  -2.458  0.01399 *  
## `1`                              -0.049000   0.008302  -5.902 3.78e-09 ***
## `10`                             -0.052370   0.008043  -6.511 8.05e-11 ***
## `11`                             -0.043545   0.008355  -5.212 1.93e-07 ***
## `12`                             -0.020328   0.008538  -2.381  0.01730 *  
## `13`                             -0.015407   0.008691  -1.773  0.07631 .  
## `14`                             -0.013134   0.008544  -1.537  0.12432    
## `15`                              0.005290   0.008489   0.623  0.53321    
## `16`                              0.033488   0.008380   3.996 6.51e-05 ***
## `17`                              0.100389   0.008405  11.944  < 2e-16 ***
## `18`                              0.212488   0.008908  23.853  < 2e-16 ***
## `19`                              0.138798   0.009045  15.345  < 2e-16 ***
## `2`                              -0.081110   0.008311  -9.759  < 2e-16 ***
## `20`                              0.111049   0.008401  13.219  < 2e-16 ***
## `21`                              0.111544   0.008383  13.306  < 2e-16 ***
## `22`                              0.082552   0.008320   9.922  < 2e-16 ***
## `23`                              0.015079   0.008325   1.811  0.07014 .  
## `3`                              -0.101026   0.008432 -11.982  < 2e-16 ***
## `4`                              -0.121861   0.008390 -14.525  < 2e-16 ***
## `5`                              -0.114008   0.008273 -13.782  < 2e-16 ***
## `6`                              -0.069962   0.008293  -8.437  < 2e-16 ***
## `7`                               0.018547   0.008292   2.237  0.02533 *  
## `8`                               0.099478   0.008536  11.654  < 2e-16 ***
## `9`                                     NA         NA      NA       NA    
## poly(TEMPERATURE, 6)1                   NA         NA      NA       NA    
## poly(TEMPERATURE, 6)2            -5.879974   0.872562  -6.739 1.74e-11 ***
## poly(TEMPERATURE, 6)3            -3.281459   0.141016 -23.270  < 2e-16 ***
## poly(TEMPERATURE, 6)4            -1.725049   0.127019 -13.581  < 2e-16 ***
## poly(TEMPERATURE, 6)5             0.263064   0.106424   2.472  0.01347 *  
## poly(TEMPERATURE, 6)6             0.472738   0.097863   4.831 1.39e-06 ***
## poly(HUMIDITY, 6)1                      NA         NA      NA       NA    
## poly(HUMIDITY, 6)2               -3.256619   0.283200 -11.499  < 2e-16 ***
## poly(HUMIDITY, 6)3               -0.378018   0.136635  -2.767  0.00568 ** 
## poly(HUMIDITY, 6)4               -0.225621   0.184915  -1.220  0.22246    
## poly(HUMIDITY, 6)5               -0.522998   0.174343  -3.000  0.00271 ** 
## poly(HUMIDITY, 6)6                0.551869   0.139233   3.964 7.46e-05 ***
## poly(DEW_POINT_TEMPERATURE, 4)1         NA         NA      NA       NA    
## poly(DEW_POINT_TEMPERATURE, 4)2   7.657073   1.221024   6.271 3.82e-10 ***
## poly(DEW_POINT_TEMPERATURE, 4)3  -0.471446   0.135978  -3.467  0.00053 ***
## poly(DEW_POINT_TEMPERATURE, 4)4   0.289182   0.120485   2.400  0.01642 *  
## HUMIDITY:RAINFALL                10.547804   1.571087   6.714 2.06e-11 ***
## TEMPERATURE:HUMIDITY             -2.272540   0.236476  -9.610  < 2e-16 ***
## AUTUMN:HOLIDAY                   -0.059911   0.013010  -4.605 4.21e-06 ***
## AUTUMN:`18`                       0.064934   0.014095   4.607 4.17e-06 ***
## AUTUMN:`19`                       0.025843   0.013451   1.921  0.05473 .  
## AUTUMN:`8`                        0.066182   0.013930   4.751 2.07e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09241 on 6293 degrees of freedom
## Multiple R-squared:  0.7431, Adjusted R-squared:  0.7409 
## F-statistic: 337.1 on 54 and 6293 DF,  p-value: < 2.2e-16
# check for negative predictions
lm_ploy_train_results %>%
  filter(.pred < 0)
## # A tibble: 564 × 2
##       .pred   truth
##       <dbl>   <dbl>
##  1 -0.0741  0.0124 
##  2 -0.00374 0.0405 
##  3 -0.00746 0.0366 
##  4 -0.0630  0.00929
##  5 -0.0715  0.00760
##  6 -0.0739  0.0225 
##  7 -0.0302  0.00619
##  8 -0.0273  0.0211 
##  9 -0.0248  0.0214 
## 10 -0.0507  0.0160 
## # ℹ 554 more rows
# clamp negative predictions to zero
lm_ploy_train_results$.pred <- replace(lm_ploy_train_results$.pred, lm_ploy_train_results$.pred < 0, 0)
# Calculate R-squared and RMSE from the test results
rsq_lm_ploy <- rsq(lm_ploy_train_results,
  truth = truth,
  estimate = .pred
)

rmse_lm_ploy <- rmse(lm_ploy_train_results,
  truth = truth,
  estimate = .pred
)

rsq_lm_ploy
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.757
rmse_lm_ploy
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0899

I can see improvement in both metrics.

TASK: Add regularization

TODO: Define a linear regression model specification glmnet_spec using glmnet engine

# HINT: Use linear_reg() function with two parameters: penalty and mixture
# - penalty controls the intensity of model regularization
# - mixture controls the tradeoff between L1 and L2 regularizations

# You could manually try different parameter combinations or use grid search to find optimal combinations

# first we create the model formula
lm_glmnet_formula <-
  formula(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`)

# k-fold cross-validation
folds <- vfold_cv(bike_sharing_training, v = 10)

# define model

tune_spec <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

# Grid workflow
wf <- workflow() %>%
  add_formula(lm_glmnet_formula)

# Define tuning values

lambda_tune <- grid_regular(
  levels = 50,
  penalty(range = c(-3, 0.3))
)

mixture_tune <- grid_regular(
  levels = 50,
  mixture(range = c(0, 1))
)

# Note: c() pairs the two 50-value sequences elementwise, producing 50
# penalty/mixture combinations rather than a full 50 x 50 grid
mix_grid <- as.data.frame(c(lambda_tune, mixture_tune))

# define grid search
grid <- tune_grid(
  wf %>% add_model(tune_spec),
  resamples = folds,
  grid = mix_grid
)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x17There were issues with some computations   A: x34There were issues with some computations   A: x51There were issues with some computations   A: x68There were issues with some computations   A: x85There were issues with some computations   A: x102There were issues with some computations   A: x119There were issues with some computations   A: x136There were issues with some computations   A: x153There were issues with some computations   A: x170There were issues with some computations   A: x170
# print grid search results

show_best(grid, metric = "rmse")
## # A tibble: 5 × 8
##   penalty mixture .metric .estimator   mean     n  std_err .config         
##     <dbl>   <dbl> <chr>   <chr>       <dbl> <int>    <dbl> <chr>           
## 1 0.00117  0.0204 rmse    standard   0.0939    10 0.000651 pre0_mod02_post0
## 2 0.00136  0.0408 rmse    standard   0.0939    10 0.000651 pre0_mod03_post0
## 3 0.00159  0.0612 rmse    standard   0.0939    10 0.000651 pre0_mod04_post0
## 4 0.00186  0.0816 rmse    standard   0.0939    10 0.000651 pre0_mod05_post0
## 5 0.00217  0.102  rmse    standard   0.0939    10 0.000652 pre0_mod06_post0

From the grid search above I will use a penalty of 0.001167742 and a mixture of 0.02040816, the combination with the lowest cross-validated RMSE.
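
Two refinements worth noting, sketched under the assumption that the objects above are in scope (full_grid and best_params are illustrative names): grid_regular() crosses both parameters into a full grid when given both, and select_best() extracts the winning row programmatically.

# A full penalty x mixture grid (10 x 10 = 100 combinations)
full_grid <- grid_regular(
  penalty(range = c(-3, 0.3)),
  mixture(range = c(0, 1)),
  levels = 10
)

# Pull the best penalty/mixture pair straight from the tuning results
best_params <- select_best(grid, metric = "rmse")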

lm_glmnet <- linear_reg(mode = "regression", engine = "glmnet", penalty = 0.001167742, mixture = 0.02040816) %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`, data = bike_sharing_training)
# create test results

lm_glmnet_train_results <- lm_glmnet %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)

head(lm_glmnet_train_results)
## # A tibble: 6 × 2
##    .pred  truth
##    <dbl>  <dbl>
## 1 0.599  0.764 
## 2 0.101  0.117 
## 3 0.281  0.311 
## 4 0.0554 0.0428
## 5 0.121  0.0535
## 6 0.126  0.0594
# check for negative predictions
lm_glmnet_train_results %>%
  filter(.pred < 0)
## # A tibble: 551 × 2
##         .pred   truth
##         <dbl>   <dbl>
##  1 -0.00519   0.0506 
##  2 -0.0654    0.0124 
##  3 -0.00345   0.0405 
##  4 -0.0700    0.00929
##  5 -0.0000200 0.0557 
##  6 -0.0683    0.00760
##  7 -0.0686    0.0225 
##  8 -0.0518    0.00619
##  9 -0.0234    0.0211 
## 10 -0.0194    0.0214 
## # ℹ 541 more rows
# clamp negative predictions to zero
lm_glmnet_train_results$.pred <- replace(lm_glmnet_train_results$.pred, lm_glmnet_train_results$.pred < 0, 0)
# Calculate R-squared and RMSE from the test results
rsq_lm_glmnet <- rsq(lm_glmnet_train_results,
  truth = truth,
  estimate = .pred
)

rmse_lm_glmnet <- rmse(lm_glmnet_train_results,
  truth = truth,
  estimate = .pred
)

rsq_lm_glmnet
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.752
rmse_lm_glmnet
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0909

I don't see any improvement here: the regularized model is slightly worse than the interaction model from the previous step.

TASK: Experiment to search for improved models

TODO: Experiment by building and testing at least five different models. For each of your experiments, include polynomial terms, interaction terms, and one of the three regularization we introduced.

Before building more models, I like to start with a correlation matrix to understand the relationship between each variable and the target variable.

bike_sharing_df_cor <- cor(bike_sharing_df, method = "pearson", use = "pairwise.complete.obs")

bike_sharing_df_cor[, 1]
##     RENTED_BIKE_COUNT           TEMPERATURE              HUMIDITY 
##           1.000000000           0.562809687          -0.201972670 
##            WIND_SPEED            VISIBILITY DEW_POINT_TEMPERATURE 
##           0.125021946           0.212322776           0.400262829 
##       SOLAR_RADIATION              RAINFALL              SNOWFALL 
##           0.273861551          -0.128626093          -0.151610753 
##                AUTUMN                SPRING                SUMMER 
##           0.165332742           0.015579791           0.282000797 
##                WINTER               HOLIDAY            NO_HOLIDAY 
##          -0.458919815          -0.070070005           0.070070005 
##                     0                     1                    10 
##          -0.054383329          -0.093147055          -0.059560345 
##                    11                    12                    13 
##          -0.035036035          -0.001928769           0.009423476 
##                    14                    15                    16 
##           0.018012873           0.041641074           0.075704251 
##                    17                    18                    19 
##           0.145515179           0.267890160           0.164534951 
##                     2                    20                    21 
##          -0.135030231           0.122161147           0.109563181 
##                    22                    23                     3 
##           0.073076646          -0.011437275          -0.168084792 
##                     4                     5                     6 
##          -0.191872210          -0.189689701          -0.139760076 
##                     7                     8                     9 
##          -0.033305460           0.104274851          -0.019880387
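
To rank predictors by the strength of their linear association with the target, a quick sketch using the matrix above:

# Sort absolute correlations with RENTED_BIKE_COUNT, strongest first
# (dropping the target's self-correlation)
head(sort(abs(bike_sharing_df_cor[-1, 1]), decreasing = TRUE), 10)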
# Build at least five different models using polynomial terms, interaction terms, and regularizations.

# Model 1 - I will keep the lm_poly model created before.

# Model 2 - Top correlated variables, a polynomial on the top variable, an interaction term & L1 (lasso) regularization

model2 <- linear_reg(penalty = 0.02, mixture = 1) %>%
  set_engine("glmnet")
model2_fit <- model2 %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + SUMMER * `18` + HUMIDITY, data = bike_sharing_training)

model2_train_results <- model2_fit %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)

model2_train_results$.pred <- replace(model2_train_results$.pred, model2_train_results$.pred < 0, 0)

rsq_model2 <- rsq(model2_train_results,
  truth = truth,
  estimate = .pred
)

rmse_model2 <- rmse(model2_train_results,
  truth = truth,
  estimate = .pred
)

# Model 3 - Top correlated variables, polynomials on 2 top variables, an interaction term & elastic-net regularization

model3 <- linear_reg(penalty = 0.02, mixture = 0.2) %>%
  set_engine("glmnet")
model3_fit <- model3 %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + poly(DEW_POINT_TEMPERATURE, 6) + SOLAR_RADIATION + SUMMER * `18` + TEMPERATURE * HUMIDITY, data = bike_sharing_training)

model3_train_results <- model3_fit %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)

model3_train_results$.pred <- replace(model3_train_results$.pred, model3_train_results$.pred < 0, 0)

rsq_model3 <- rsq(model3_train_results,
  truth = truth,
  estimate = .pred
)

rmse_model3 <- rmse(model3_train_results,
  truth = truth,
  estimate = .pred
)

# Model 4 - Top correlated variables, polynomials on 4 top variables, interaction terms & elastic-net regularization

model4 <- linear_reg(penalty = 0.0015, mixture = 0.2) %>%
  set_engine("glmnet")
model4_fit <- model4 %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + poly(DEW_POINT_TEMPERATURE, 6) + poly(SOLAR_RADIATION, 6) + SUMMER * `18` + TEMPERATURE * HUMIDITY + poly(HUMIDITY, 6), data = bike_sharing_training)

model4_train_results <- model4_fit %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)

model4_train_results$.pred <- replace(model4_train_results$.pred, model4_train_results$.pred < 0, 0)

rsq_model4 <- rsq(model4_train_results,
  truth = truth,
  estimate = .pred
)

rmse_model4 <- rmse(model4_train_results,
  truth = truth,
  estimate = .pred
)

# Model 5 - Top correlated variables, polynomials on 5 top variables, interaction terms & elastic-net regularization

model5 <- linear_reg(penalty = 0.0015, mixture = 0.2) %>%
  set_engine("glmnet")
model5_fit <- model5 %>%
  fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + poly(DEW_POINT_TEMPERATURE, 6) + poly(SOLAR_RADIATION, 6) + poly(VISIBILITY, 6) + SUMMER * `18` + TEMPERATURE * HUMIDITY + poly(HUMIDITY, 6) + RAINFALL * TEMPERATURE + SNOWFALL * TEMPERATURE + RAINFALL * HUMIDITY + SNOWFALL * HUMIDITY, data = bike_sharing_training)

model5_train_results <- model5_fit %>%
  predict(new_data = bike_sharing_training) %>%
  mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)

model5_train_results$.pred <- replace(model5_train_results$.pred, model5_train_results$.pred < 0, 0)

rsq_model5 <- rsq(model5_train_results,
  truth = truth,
  estimate = .pred
)

rmse_model5 <- rmse(model5_train_results,
  truth = truth,
  estimate = .pred
)

# Save their rmse and rsq values

rsq_lm_ploy
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.757
rmse_lm_ploy
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0899
rsq_model2
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.534
rmse_model2
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.130
rsq_model3
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.713
rmse_model3
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0998
rsq_model4
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.760
rmse_model4
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0889
rsq_model5
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.774
rmse_model5
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0868

I find that the last model, model 5, is the best one so far. That makes sense: it is essentially the earlier polynomial model with additional polynomial and interaction terms, fitted with the same penalty and mixture.

TODO: Visualize the saved RMSE and R-squared values using a grouped barchart

# Collect the saved metric estimates programmatically (avoids the
# transcription errors of hard-coding the printed values)
model_names <- c("lm_ploy", "model2", "model3", "model4", "model5")
rsq_vals <- c(rsq_lm_ploy$.estimate, rsq_model2$.estimate, rsq_model3$.estimate,
              rsq_model4$.estimate, rsq_model5$.estimate)
rmse_vals <- c(rmse_lm_ploy$.estimate, rmse_model2$.estimate, rmse_model3$.estimate,
               rmse_model4$.estimate, rmse_model5$.estimate)
comparison_df <- data.frame(model_names, rsq = rsq_vals, rmse = rmse_vals)

comparison_df %>%
  pivot_longer(!model_names) %>%
  ggplot(aes(x = model_names, y = value, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparing RMSE and R-squared Across Models", fill = "Metric")

TODO: Create a Q-Q plot by plotting the distribution difference between the predictions generated by your best model and the true values on the test dataset.

ggplot(model5_train_results) +
  stat_qq(aes(sample = truth), color = "green") +
  stat_qq(aes(sample = .pred), color = "red")
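
One caveat before closing: every metric above was computed on the training split. Scoring the held-out 25% gives a more honest estimate of generalization; a sketch, assuming the objects above are in scope (model5_test_results is an illustrative name):

# Evaluate model 5 on the untouched test split
model5_test_results <- model5_fit %>%
  predict(new_data = bike_sharing_testing) %>%
  mutate(truth = bike_sharing_testing$RENTED_BIKE_COUNT)

# Clamp negative predictions, as before
model5_test_results$.pred <- pmax(model5_test_results$.pred, 0)

rsq(model5_test_results, truth = truth, estimate = .pred)
rmse(model5_test_results, truth = truth, estimate = .pred)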

# Summary statistics of the true rented bike counts (Spring subset),
# for comparison with the predictions above
bike_sharing_df %>%
  filter(SPRING == 1) %>%
  select(RENTED_BIKE_COUNT) %>%
  summary()
##  RENTED_BIKE_COUNT
##  Min.   :0.00000  
##  1st Qu.:0.06275  
##  Median :0.16798  
##  Mean   :0.20941  
##  3rd Qu.:0.31401  
##  Max.   :0.91418