Importing required libraries

library(rvest)
library(tidyverse)
library(httr)
library(jsonlite)

Web scrape a Global Bike-Sharing Systems Wiki Page

Scraping the required table

url <- "https://en.wikipedia.org/wiki/List_of_bicycle-sharing_systems"
html <- read_html(url)
bike_table <- html |> 
  html_element("table") |> 
  html_table()

Previewing the scraped data

head(bike_table, 10)

Getting a summary of information contained in the table

summary(bike_table)
##    Country              City               Name              System         
##  Length:558         Length:558         Length:558         Length:558        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Operator           Launched         Discontinued         Stations        
##  Length:558         Length:558         Length:558         Length:558        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Bicycles         Daily ridership   
##  Length:558         Length:558        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Saving the scraped table

write_csv(bike_table, "raw_bike_sharing_systems.csv")

Get 5-day weather forecasts for a list of cities using the OpenWeather API

get_weather_forecast_by_cities <- function(cities){
  df <- data.frame()
  for (city in cities) {
    url <- 'https://api.openweathermap.org/data/2.5/forecast'
    forecast_query <- list(q = city, appid = "cd1c417f157352ca399e1dbb0e1ccc9f", units = "metric")
    response <- GET(url = url, query = forecast_query)
    json_result <- content(response, as = "text")
    city_df <- json_result |> 
      fromJSON(flatten = TRUE) |> 
      as.data.frame() |> 
      unnest_wider(list.weather) |> 
      mutate(season = "autumn") |> 
      select(city = city.name, weather = main, visibility = list.visibility, temp = list.main.temp,
             temp_min = list.main.temp_min, temp_max = list.main.temp_max, pressure = list.main.pressure,
             humidity = list.main.humidity, wind_speed = list.wind.speed, wind_deg = list.wind.deg,
             forecast_datetime = list.dt_txt, season)
    df <- rbind(df, city_df)
  }
  return(df)
}

Try the function created

city_names <- c("Seoul", paste("Washington", "D.C."), "Paris", "Suzhou")
cities_weather_df <- get_weather_forecast_by_cities(cities = city_names)
head(cities_weather_df, 10)

Saving weather data

write_csv(cities_weather_df, "raw_cities_weather_forecast.csv")

Download several datasets

Download some general city information such as name and locations

url <- "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_worldcities.csv"
# download the file
download.file(url, destfile = "raw_worldcities.csv")

Download a specific hourly Seoul bike sharing demand dataset

url <- "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_seoul_bike_sharing.csv"
# download the file
download.file(url, destfile = "raw_seoul_bike_sharing.csv")

Loading and Cleaning Column Names

library(janitor)
dataset_list <- c('raw_bike_sharing_systems.csv', 'raw_seoul_bike_sharing.csv', 'raw_cities_weather_forecast.csv', 'raw_worldcities.csv')

Cleaning column names

for (dataset_name in dataset_list) {
  dataset <- read_csv(dataset_name)
  dataset <- dataset |> 
    clean_names()
  column_names <- names(dataset)
  names(dataset) <- str_to_upper(column_names)
  write_csv(dataset, dataset_name)
}

printing summary of datasets

for (dataset_name in dataset_list) {
  dataset <- read_csv(dataset_name)
  print(summary(dataset))
}
##    COUNTRY              CITY               NAME              SYSTEM         
##  Length:558         Length:558         Length:558         Length:558        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    OPERATOR           LAUNCHED         DISCONTINUED         STATIONS        
##  Length:558         Length:558         Length:558         Length:558        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    BICYCLES         DAILY_RIDERSHIP   
##  Length:558         Length:558        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##      DATE           RENTED_BIKE_COUNT      HOUR        TEMPERATURE    
##  Length:8760        Min.   :   2.0    Min.   : 0.00   Min.   :-17.80  
##  Class :character   1st Qu.: 214.0    1st Qu.: 5.75   1st Qu.:  3.40  
##  Mode  :character   Median : 542.0    Median :11.50   Median : 13.70  
##                     Mean   : 729.2    Mean   :11.50   Mean   : 12.87  
##                     3rd Qu.:1084.0    3rd Qu.:17.25   3rd Qu.: 22.50  
##                     Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
##                     NA's   :295                       NA's   :11      
##     HUMIDITY       WIND_SPEED      VISIBILITY   DEW_POINT_TEMPERATURE
##  Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
##  1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 940   1st Qu.: -4.700      
##  Median :57.00   Median :1.500   Median :1698   Median :  5.100      
##  Mean   :58.23   Mean   :1.725   Mean   :1437   Mean   :  4.074      
##  3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 14.800      
##  Max.   :98.00   Max.   :7.400   Max.   :2000   Max.   : 27.200      
##                                                                      
##  SOLAR_RADIATION     RAINFALL          SNOWFALL         SEASONS         
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000   Length:8760       
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   Class :character  
##  Median :0.0100   Median : 0.0000   Median :0.00000   Mode  :character  
##  Mean   :0.5691   Mean   : 0.1487   Mean   :0.07507                     
##  3rd Qu.:0.9300   3rd Qu.: 0.0000   3rd Qu.:0.00000                     
##  Max.   :3.5200   Max.   :35.0000   Max.   :8.80000                     
##                                                                         
##    HOLIDAY          FUNCTIONING_DAY   
##  Length:8760        Length:8760       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##                                       
##      CITY             WEATHER            VISIBILITY         TEMP      
##  Length:160         Length:160         Min.   : 4862   Min.   : 2.67  
##  Class :character   Class :character   1st Qu.:10000   1st Qu.:12.55  
##  Mode  :character   Mode  :character   Median :10000   Median :16.55  
##                                        Mean   : 9968   Mean   :15.91  
##                                        3rd Qu.:10000   3rd Qu.:19.38  
##                                        Max.   :10000   Max.   :27.12  
##     TEMP_MIN        TEMP_MAX        PRESSURE       HUMIDITY      WIND_SPEED    
##  Min.   : 2.67   Min.   : 2.67   Min.   : 985   Min.   :30.0   Min.   : 0.420  
##  1st Qu.:12.55   1st Qu.:12.55   1st Qu.:1009   1st Qu.:56.0   1st Qu.: 2.200  
##  Median :16.55   Median :16.55   Median :1017   Median :70.0   Median : 3.295  
##  Mean   :15.89   Mean   :15.94   Mean   :1014   Mean   :67.2   Mean   : 3.721  
##  3rd Qu.:19.32   3rd Qu.:19.40   3rd Qu.:1021   3rd Qu.:80.0   3rd Qu.: 4.850  
##  Max.   :27.12   Max.   :27.12   Max.   :1033   Max.   :96.0   Max.   :11.070  
##     WIND_DEG     FORECAST_DATETIME                SEASON         
##  Min.   :  4.0   Min.   :2023-10-29 03:00:00   Length:160        
##  1st Qu.:148.2   1st Qu.:2023-10-30 08:15:00   Class :character  
##  Median :192.0   Median :2023-10-31 13:30:00   Mode  :character  
##  Mean   :192.4   Mean   :2023-10-31 13:30:00                     
##  3rd Qu.:228.5   3rd Qu.:2023-11-01 18:45:00                     
##  Max.   :352.0   Max.   :2023-11-03 00:00:00                     
##      CITY            CITY_ASCII             LAT              LNG           
##  Length:26569       Length:26569       Min.   :-54.93   Min.   :-179.5900  
##  Class :character   Class :character   1st Qu.: 27.92   1st Qu.: -78.7794  
##  Mode  :character   Mode  :character   Median : 40.22   Median :  -0.7689  
##                                        Mean   : 33.10   Mean   : -11.3639  
##                                        3rd Qu.: 47.99   3rd Qu.:  29.6833  
##                                        Max.   : 81.72   Max.   : 179.3667  
##                                                                            
##    COUNTRY              ISO2               ISO3            ADMIN_NAME       
##  Length:26569       Length:26569       Length:26569       Length:26569      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    CAPITAL            POPULATION             ID           
##  Length:26569       Min.   :       0   Min.   :1.004e+09  
##  Class :character   1st Qu.:    9246   1st Qu.:1.277e+09  
##  Mode  :character   Median :   20080   Median :1.643e+09  
##                     Mean   :  162346   Mean   :1.556e+09  
##                     3rd Qu.:   59369   3rd Qu.:1.840e+09  
##                     Max.   :37977000   Max.   :1.934e+09  
##                     NA's   :973

Process the web-scraped bike sharing system dataset

# First load the dataset
bike_sharing_df <- read_csv("raw_bike_sharing_systems.csv")
# Print its head
head(bike_sharing_df)

Selecting required columns

# Select the four columns
sub_bike_sharing_df <- bike_sharing_df %>% select(COUNTRY, CITY, SYSTEM, BICYCLES)

Checking the classes of the columns selected

sub_bike_sharing_df %>% 
    summarize_all(class) %>%
    gather(variable, class)

Checking for strings in the bicycle column which is expected to be numeric

# grepl searches a string for non-digital characters, and returns TRUE or FALSE
# if it finds any non-digital characters, then the bicyle column is not purely numeric
find_character <- function(strings){
  grepl("[^0-9]", strings)
} 
sub_bike_sharing_df %>% 
    select(BICYCLES) %>% 
    filter(find_character(BICYCLES)) %>%
    slice(1:10)

Checking the other columns for unwanted strings

# Define a 'reference link' character class, 
# `[A-z0-9]` means at least one character 
# `\\[` and `\\]` means the character is wrapped by [], such as for [12] or [abc]
ref_pattern <- "\\[.+\\]"
find_reference_pattern <- function(string){
  grepl(ref_pattern, string)
}

Checking for strings in COUNTRY column

# Check whether the COUNTRY column has any reference links
sub_bike_sharing_df %>% 
    select(COUNTRY) %>% 
    filter(find_reference_pattern(COUNTRY)) %>%
    slice(1:10)

Checking for strings in CITY column

# Check whether the CITY column has any reference links
sub_bike_sharing_df %>% 
    select(CITY) %>% 
    filter(find_reference_pattern(CITY)) %>%
    slice(1:10)

Checking for regular expression in SYSTEM

# Check whether the System column has any reference links
sub_bike_sharing_df %>% 
    select(SYSTEM) %>% 
    filter(find_reference_pattern(SYSTEM)) %>%
    slice(1:10)

TASK: Extract the numeric value using regular expressions

# Extract the first number
extract_num <- function(column){
    # Define a digital pattern
    digitals_pattern <- "[0-9]*"
    column <- str_extract(column, pattern = digitals_pattern)
    return(column)
}
sub_bike_sharing_df <- sub_bike_sharing_df |> 
  mutate(BICYCLES = extract_num(BICYCLES))
sub_bike_sharing_df$BICYCLES <- as.numeric(sub_bike_sharing_df$BICYCLES)

Checking results for BICYCLES

summary(sub_bike_sharing_df$BICYCLES)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       4      70     300    1832    1000   78000     100

Saving our processed data

write_csv(sub_bike_sharing_df, "bike_sharing_systems.csv")

Data Wrangling with dplyr

TASK: Detect and handle missing values

Loading the required data

bike_sharing_df <- read_csv("raw_seoul_bike_sharing.csv")

First take a look at the dataset

summary(bike_sharing_df)
##      DATE           RENTED_BIKE_COUNT      HOUR        TEMPERATURE    
##  Length:8760        Min.   :   2.0    Min.   : 0.00   Min.   :-17.80  
##  Class :character   1st Qu.: 214.0    1st Qu.: 5.75   1st Qu.:  3.40  
##  Mode  :character   Median : 542.0    Median :11.50   Median : 13.70  
##                     Mean   : 729.2    Mean   :11.50   Mean   : 12.87  
##                     3rd Qu.:1084.0    3rd Qu.:17.25   3rd Qu.: 22.50  
##                     Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
##                     NA's   :295                       NA's   :11      
##     HUMIDITY       WIND_SPEED      VISIBILITY   DEW_POINT_TEMPERATURE
##  Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
##  1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 940   1st Qu.: -4.700      
##  Median :57.00   Median :1.500   Median :1698   Median :  5.100      
##  Mean   :58.23   Mean   :1.725   Mean   :1437   Mean   :  4.074      
##  3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 14.800      
##  Max.   :98.00   Max.   :7.400   Max.   :2000   Max.   : 27.200      
##                                                                      
##  SOLAR_RADIATION     RAINFALL          SNOWFALL         SEASONS         
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000   Length:8760       
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   Class :character  
##  Median :0.0100   Median : 0.0000   Median :0.00000   Mode  :character  
##  Mean   :0.5691   Mean   : 0.1487   Mean   :0.07507                     
##  3rd Qu.:0.9300   3rd Qu.: 0.0000   3rd Qu.:0.00000                     
##  Max.   :3.5200   Max.   :35.0000   Max.   :8.80000                     
##                                                                         
##    HOLIDAY          FUNCTIONING_DAY   
##  Length:8760        Length:8760       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
dim(bike_sharing_df)
## [1] 8760   14

Dropping NAs in RENTED_BIKE_COUNT

bike_sharing_df <- bike_sharing_df |> 
  drop_na(RENTED_BIKE_COUNT)
dim(bike_sharing_df)
## [1] 8465   14

Checking for NAs in TEMPERATURE column

missing_temp <- bike_sharing_df |> 
  filter(is.na(TEMPERATURE))
missing_temp

Calculating the mean summer temperature in Seoul

summer_temp <- bike_sharing_df |> 
  filter(SEASONS == "Summer") |> 
  select(TEMPERATURE)
avg_summer_temp <- mean(summer_temp$TEMPERATURE, na.rm = TRUE)
avg_summer_temp
## [1] 26.58771

Replacing missing values in TEMPERATURE with the mean calculated

bike_sharing_df <- bike_sharing_df |> 
  replace_na(list(TEMPERATURE = avg_summer_temp))
summary(bike_sharing_df)
##      DATE           RENTED_BIKE_COUNT      HOUR        TEMPERATURE    
##  Length:8465        Min.   :   2.0    Min.   : 0.00   Min.   :-17.80  
##  Class :character   1st Qu.: 214.0    1st Qu.: 6.00   1st Qu.:  3.00  
##  Mode  :character   Median : 542.0    Median :12.00   Median : 13.50  
##                     Mean   : 729.2    Mean   :11.51   Mean   : 12.77  
##                     3rd Qu.:1084.0    3rd Qu.:18.00   3rd Qu.: 22.70  
##                     Max.   :3556.0    Max.   :23.00   Max.   : 39.40  
##     HUMIDITY       WIND_SPEED      VISIBILITY   DEW_POINT_TEMPERATURE
##  Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
##  1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 935   1st Qu.: -5.100      
##  Median :57.00   Median :1.500   Median :1690   Median :  4.700      
##  Mean   :58.15   Mean   :1.726   Mean   :1434   Mean   :  3.945      
##  3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 15.200      
##  Max.   :98.00   Max.   :7.400   Max.   :2000   Max.   : 27.200      
##  SOLAR_RADIATION     RAINFALL          SNOWFALL         SEASONS         
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000   Length:8465       
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   Class :character  
##  Median :0.0100   Median : 0.0000   Median :0.00000   Mode  :character  
##  Mean   :0.5679   Mean   : 0.1491   Mean   :0.07769                     
##  3rd Qu.:0.9300   3rd Qu.: 0.0000   3rd Qu.:0.00000                     
##  Max.   :3.5200   Max.   :35.0000   Max.   :8.80000                     
##    HOLIDAY          FUNCTIONING_DAY   
##  Length:8465        Length:8465       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Saving the processed data

write_csv(bike_sharing_df, "seoul_bike_sharing.csv")

TASK: Create indicator (dummy) variables for categorical variables

Changing HOUR class into character

bike_sharing_df <- read_csv("seoul_bike_sharing.csv")
bike_sharing_df <- bike_sharing_df |> 
  mutate(HOUR = as.character(HOUR))

Changing character variables into indicator variables

bike_sharing_df <- bike_sharing_df |> 
  mutate(dummy = 1) |> 
  pivot_wider(
    names_from = HOUR,
    values_from = dummy,
    values_fill = 0,
    names_prefix = "HOUR_"
  )
bike_sharing_df <- bike_sharing_df |> 
  mutate(dummy = 1) |> 
  pivot_wider(
    names_from = SEASONS,
    values_from = dummy,
    values_fill = 0,
  )
bike_sharing_df <- bike_sharing_df |> 
  mutate(dummy = 1) |> 
  pivot_wider(
    names_from = HOLIDAY,
    values_from = dummy,
    values_fill = 0
  )
bike_sharing_df <- bike_sharing_df |> 
  mutate(dummy = 1) |> 
  pivot_wider(
    names_from = FUNCTIONING_DAY,
    values_from = dummy,
    values_fill = 0,
    names_prefix = "FUNCTIONING_DAY_"
  )

Saving dataset

write_csv(bike_sharing_df, "seoul_bike_sharing_converted.csv")

TASK: Normalize data

Creating a function to normalize data

normalize_func <- function(column){
  column <- (column - min(column))/(max(column) - min(column))
  return(column)
}

Normalizing columns

bike_sharing_df <- bike_sharing_df |> 
  mutate(
    RENTED_BIKE_COUNT = normalize_func(RENTED_BIKE_COUNT),
    TEMPERATURE = normalize_func(TEMPERATURE),
    HUMIDITY = normalize_func(HUMIDITY),
    WIND_SPEED = normalize_func(WIND_SPEED),
    VISIBILITY = normalize_func(VISIBILITY),
    DEW_POINT_TEMPERATURE = normalize_func(DEW_POINT_TEMPERATURE),
    SOLAR_RADIATION = normalize_func(SOLAR_RADIATION),
    RAINFALL = normalize_func(RAINFALL),
    SNOWFALL = normalize_func(SNOWFALL)
  )

Summarizing data

summary(bike_sharing_df)
##      DATE           RENTED_BIKE_COUNT  TEMPERATURE        HUMIDITY     
##  Length:8465        Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:0.05965   1st Qu.:0.3636   1st Qu.:0.4286  
##  Mode  :character   Median :0.15194   Median :0.5472   Median :0.5816  
##                     Mean   :0.20460   Mean   :0.5345   Mean   :0.5933  
##                     3rd Qu.:0.30445   3rd Qu.:0.7080   3rd Qu.:0.7551  
##                     Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##    WIND_SPEED       VISIBILITY     DEW_POINT_TEMPERATURE SOLAR_RADIATION   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000        Min.   :0.000000  
##  1st Qu.:0.1216   1st Qu.:0.4602   1st Qu.:0.4412        1st Qu.:0.000000  
##  Median :0.2027   Median :0.8429   Median :0.6107        Median :0.002841  
##  Mean   :0.2332   Mean   :0.7131   Mean   :0.5977        Mean   :0.161326  
##  3rd Qu.:0.3108   3rd Qu.:1.0000   3rd Qu.:0.7924        3rd Qu.:0.264205  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000        Max.   :1.000000  
##     RAINFALL           SNOWFALL            HOUR_0            HOUR_1       
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.000000   Median :0.00000   Median :0.00000  
##  Mean   :0.004261   Mean   :0.008828   Mean   :0.04158   Mean   :0.04158  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.00000   Max.   :1.00000  
##      HOUR_2            HOUR_3            HOUR_4            HOUR_5       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.04158   Mean   :0.04158   Mean   :0.04158   Mean   :0.04158  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##      HOUR_6            HOUR_7           HOUR_8           HOUR_9      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.04158   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     HOUR_10          HOUR_11          HOUR_12          HOUR_13      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     HOUR_14          HOUR_15          HOUR_16          HOUR_17      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     HOUR_18          HOUR_19          HOUR_20          HOUR_21      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.0417   Mean   :0.0417  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     HOUR_22          HOUR_23           Winter           Spring      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.0417   Mean   :0.0417   Mean   :0.2552   Mean   :0.2552  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      Summer           Autumn         No Holiday        Holiday      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.2608   Mean   :0.2288   Mean   :0.9518   Mean   :0.0482  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  FUNCTIONING_DAY_Yes
##  Min.   :1          
##  1st Qu.:1          
##  Median :1          
##  Mean   :1          
##  3rd Qu.:1          
##  Max.   :1

Saving dataset

write_csv(bike_sharing_df, "seoul_bike_sharing_converted_normalized.csv")

Standardize the column names again for the new datasets

# Dataset list
dataset_list <- c('seoul_bike_sharing.csv', 'seoul_bike_sharing_converted.csv', 'seoul_bike_sharing_converted_normalized.csv')

for (dataset_name in dataset_list){
    # Read dataset
    dataset <- read_csv(dataset_name)
    # Standardized its columns:
    # Convert all columns names to uppercase
    names(dataset) <- toupper(names(dataset))
    # Replace any white space separators by underscore, using str_replace_all function
    names(dataset) <- str_replace_all(names(dataset), " ", "_")
    # Save the dataset back
    write.csv(dataset, dataset_name, row.names=FALSE)
}

Exploratory Data Analysis with SQL

Loading the required library

library(RMySQL)

Creating connection

con <- dbConnect(MySQL(),
                 user = "root",
                 password = "Ososo@2020",
                 dbname = "ibm_final_project_dataset",
                 host = "localhost")
dbSendQuery(con, "SET GLOBAL local_infile = TRUE")
## <MySQLResult:1973187184,0,0>

Loading datasets

world_cities <- read_csv("raw_worldcities.csv")
bike_sharing_systems <- read_csv("bike_sharing_systems.csv")
cities_weather_forecast <- read_csv("raw_cities_weather_forecast.csv")
seoul_bike_sharing <- read_csv("seoul_bike_sharing.csv")

Saving data to MySQL

dbWriteTable(con, "WORLD_CITIES", world_cities)
## [1] TRUE
dbWriteTable(con, "BIKE_SHARING_SYSTEMS", bike_sharing_systems)
## [1] TRUE
dbWriteTable(con, "CITIES_WEATHER_FORECAST", cities_weather_forecast)
## [1] TRUE
dbWriteTable(con, "SEOUL_BIKE_SHARING", seoul_bike_sharing)
## [1] TRUE

Task 1 - Record Count

Determine how many records are in the seoul_bike_sharing dataset. Solution 1

query <- "SELECT count(*) AS NUMBER_OF_RECORDS
          FROM seoul_bike_sharing;"
dbGetQuery(con, query)

Task 2 - Operational Hours

Determine how many hours had non-zero rented bike count. Solution 2

query <- "SELECT count(HOUR) NONE_ZERO_HOURS
          FROM ibm_final_project_dataset.seoul_bike_sharing
          WHERE RENTED_BIKE_COUNT != 0;"
dbGetQuery(con, query)

Task 3 - Weather Outlook

Query the the weather forecast for Seoul over the next 3 hours. Recall that the records in the CITIES_WEATHER_FORECAST dataset are 3 hours apart, so we just need the first record from the query.

Solution 3

query <- "SELECT *
          FROM ibm_final_project_dataset.cities_weather_forecast
          LIMIT 1;"
dbGetQuery(con, query)

Task 4 - Seasons

Find which seasons are included in the Seoul bike sharing dataset. Solution 4

query <- "SELECT DISTINCT SEASONS
          FROM ibm_final_project_dataset.seoul_bike_sharing;"
dbGetQuery(con, query)

Task 5 - Date Range

Find the first and last dates in the Seoul Bike Sharing dataset. Solution 5

query <- "SELECT MIN(DATE) AS FIRST_DATE, MAX(DATE) AS LAST_DATE
          FROM ibm_final_project_dataset.seoul_bike_sharing;"
dbGetQuery(con, query)

Task 6 - Subquery - ‘all-time high’

determine which date and hour had the most bike rentals. Solution 6

query <- "SELECT DATE, HOUR
          FROM seoul_bike_sharing 
          WHERE RENTED_BIKE_COUNT = (SELECT MAX(RENTED_BIKE_COUNT)
                                                  FROM seoul_bike_sharing)"
dbGetQuery(con, query)

Task 7 - Hourly popularity and temperature by season

Determine the average hourly temperature and the average number of bike rentals per hour over each season. List the top ten results by average bike count. Solution 7

query <- "SELECT SEASONS, HOUR, ROUND(AVG(TEMPERATURE), 2) AS HOURLY_TEMPERATURE,           ROUND(AVG(RENTED_BIKE_COUNT), 2) AS HOURLY_RENTED_BIKE_COUNT
          FROM seoul_bike_sharing
          GROUP BY SEASONS, HOUR
          ORDER BY AVG(RENTED_BIKE_COUNT) DESC
          LIMIT 10;"
dbGetQuery(con, query)

Task 8 - Rental Seasonality

Find the average hourly bike count during each season. Also include the minimum, maximum, and standard deviation of the hourly bike count for each season.

Solution 8

query <- "SELECT SEASONS, HOUR, ROUND(AVG(TEMPERATURE), 2) AS HOURLY_TEMPERATURE,
ROUND(AVG(RENTED_BIKE_COUNT), 2) AS HOURLY_RENTED_BIKE_COUNT, MAX(RENTED_BIKE_COUNT) AS HIGHEST_BIKE_COUNT,
MIN(RENTED_BIKE_COUNT) AS LOWEST_BIKE_COUNT,
ROUND(STD(RENTED_BIKE_COUNT), 2) AS STD_RENTED_BIKE_COUNT
        FROM seoul_bike_sharing
        GROUP BY SEASONS, HOUR"
dbGetQuery(con, query)

Task 9 - Weather Seasonality

Consider the weather over each season. On average, what were the TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, and SNOWFALL per season? Include the average bike count as well , and rank the results by average bike count so you can see if it is correlated with the weather at all.

Solution 9

query <- "SELECT SEASONS, ROUND(AVG(TEMPERATURE), 2)  AS AVG_TEMPERATURE,
ROUND(AVG(HUMIDITY), 2) AS AVG_HUMIDITY, ROUND(AVG(WIND_SPEED), 2) AS AVG_WIND_SPEED,
ROUND(AVG(VISIBILITY), 2) AS AVG_VISIBILITY, ROUND(AVG(DEW_POINT_TEMPERATURE), 2) AS AVG_DEW_POINT_TEMPERATURE,
ROUND(AVG(SOLAR_RADIATION), 2) AS AVG_SOLAR_RADIATION, ROUND(AVG(RAINFALL), 2) AS AVG_RAINFALL,
ROUND(AVG(SNOWFALL), 2) AS AVG_SNOWFALL, ROUND(AVG(RENTED_BIKE_COUNT), 2) AS AVG_BIKE_COUNT
          FROM seoul_bike_sharing
          GROUP BY SEASONS
          ORDER BY AVG(RENTED_BIKE_COUNT) DESC;"
dbGetQuery(con, query)

Task 10 - Total Bike Count and City Info for Seoul

Use an implicit join across the WORLD_CITIES and the BIKE_SHARING_SYSTEMS tables to determine the total number of bikes available in Seoul, plus the following city information about Seoul: CITY, COUNTRY, LAT, LON, POPULATION, in a single view. Notice that in this case, the CITY column will work for the WORLD_CITIES table, but in general you would have to use the CITY_ASCII column.

Solution 10

query <- 'SELECT W.CITY, B.COUNTRY, W.LAT, W.LNG, W.POPULATION, B.BICYCLES AS TOTAL_BIKES
          FROM world_cities W, bike_sharing_systems B
          WHERE W.CITY_ASCII = B.CITY AND W.CITY = "Seoul";'
dbGetQuery(con, query)

Task 11 - Find all city names and coordinates with comparable bike scale to Seoul’s bike sharing system

Find all cities with total bike counts between 15000 and 20000. Return the city and country names, plus the coordinates (LAT, LNG), population, and number of bicycles for each city. Later we will ask you to visualize these similar cities on leaflet, with some weather data.

Solution 11

query <- 'SELECT W.CITY, B.COUNTRY, W.LAT, W.LNG, W.POPULATION, B.BICYCLES AS TOTAL_BIKES
FROM world_cities W, bike_sharing_systems B
WHERE W.CITY = B.CITY AND B.BICYCLES BETWEEN 15000 AND 20000'
dbGetQuery(con, query)
dbDisconnect(con)
## [1] TRUE

Assignment: Exploratory Data Analysis with tidyverse and ggplot2

Task 1 - Load the dataset

Ensure you read DATE as type character.

Solution 1

seoul_bike_sharing <- read_csv("seoul_bike_sharing.csv", col_types = cols("DATE" = col_character()))

Task 2 - Recast DATE as a date

Use the format of the data, namely “%d/%m/%Y”.

Solution 2

seoul_bike_sharing$DATE <- dmy(seoul_bike_sharing$DATE)
class(seoul_bike_sharing$DATE)
## [1] "Date"

Task 3 - Cast HOURS as a categorical variable

Also, coerce its levels to be an ordered sequence. This will ensure your visualizations correctly utilize HOURS as a discrete variable with the expected ordering.

Solution 3

seoul_bike_sharing$HOUR <- as_factor(seoul_bike_sharing$HOUR)
levels(seoul_bike_sharing$HOUR)
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [16] "15" "16" "17" "18" "19" "20" "21" "22" "23"

Check the structure of the dataframe

str(seoul_bike_sharing)
## spc_tbl_ [8,465 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ DATE                 : Date[1:8465], format: "2017-12-01" "2017-12-01" ...
##  $ RENTED_BIKE_COUNT    : num [1:8465] 254 204 173 107 78 100 181 460 930 490 ...
##  $ HOUR                 : Factor w/ 24 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ TEMPERATURE          : num [1:8465] -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
##  $ HUMIDITY             : num [1:8465] 37 38 39 40 36 37 35 38 37 27 ...
##  $ WIND_SPEED           : num [1:8465] 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
##  $ VISIBILITY           : num [1:8465] 2000 2000 2000 2000 2000 ...
##  $ DEW_POINT_TEMPERATURE: num [1:8465] -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
##  $ SOLAR_RADIATION      : num [1:8465] 0 0 0 0 0 0 0 0 0.01 0.23 ...
##  $ RAINFALL             : num [1:8465] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOWFALL             : num [1:8465] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SEASONS              : chr [1:8465] "Winter" "Winter" "Winter" "Winter" ...
##  $ HOLIDAY              : chr [1:8465] "No Holiday" "No Holiday" "No Holiday" "No Holiday" ...
##  $ FUNCTIONING_DAY      : chr [1:8465] "Yes" "Yes" "Yes" "Yes" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   DATE = col_character(),
##   ..   RENTED_BIKE_COUNT = col_double(),
##   ..   HOUR = col_double(),
##   ..   TEMPERATURE = col_double(),
##   ..   HUMIDITY = col_double(),
##   ..   WIND_SPEED = col_double(),
##   ..   VISIBILITY = col_double(),
##   ..   DEW_POINT_TEMPERATURE = col_double(),
##   ..   SOLAR_RADIATION = col_double(),
##   ..   RAINFALL = col_double(),
##   ..   SNOWFALL = col_double(),
##   ..   SEASONS = col_character(),
##   ..   HOLIDAY = col_character(),
##   ..   FUNCTIONING_DAY = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Checking for missing values

sum(is.na(seoul_bike_sharing))
## [1] 0

Descriptive Statistics

Now you are all set to take a look at some high level statistics of the seoul_bike_sharing dataset.

Task 4 - Dataset Summary

Use the base R summary() function to describe the seoul_bike_sharing dataset.

Solution 4

summary(seoul_bike_sharing)
##       DATE            RENTED_BIKE_COUNT      HOUR       TEMPERATURE    
##  Min.   :2017-12-01   Min.   :   2.0    7      : 353   Min.   :-17.80  
##  1st Qu.:2018-02-27   1st Qu.: 214.0    8      : 353   1st Qu.:  3.00  
##  Median :2018-05-28   Median : 542.0    9      : 353   Median : 13.50  
##  Mean   :2018-05-28   Mean   : 729.2    10     : 353   Mean   : 12.77  
##  3rd Qu.:2018-08-24   3rd Qu.:1084.0    11     : 353   3rd Qu.: 22.70  
##  Max.   :2018-11-30   Max.   :3556.0    12     : 353   Max.   : 39.40  
##                                         (Other):6347                   
##     HUMIDITY       WIND_SPEED      VISIBILITY   DEW_POINT_TEMPERATURE
##  Min.   : 0.00   Min.   :0.000   Min.   :  27   Min.   :-30.600      
##  1st Qu.:42.00   1st Qu.:0.900   1st Qu.: 935   1st Qu.: -5.100      
##  Median :57.00   Median :1.500   Median :1690   Median :  4.700      
##  Mean   :58.15   Mean   :1.726   Mean   :1434   Mean   :  3.945      
##  3rd Qu.:74.00   3rd Qu.:2.300   3rd Qu.:2000   3rd Qu.: 15.200      
##  Max.   :98.00   Max.   :7.400   Max.   :2000   Max.   : 27.200      
##                                                                      
##  SOLAR_RADIATION     RAINFALL          SNOWFALL         SEASONS         
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000   Length:8465       
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   Class :character  
##  Median :0.0100   Median : 0.0000   Median :0.00000   Mode  :character  
##  Mean   :0.5679   Mean   : 0.1491   Mean   :0.07769                     
##  3rd Qu.:0.9300   3rd Qu.: 0.0000   3rd Qu.:0.00000                     
##  Max.   :3.5200   Max.   :35.0000   Max.   :8.80000                     
##                                                                         
##    HOLIDAY          FUNCTIONING_DAY   
##  Length:8465        Length:8465       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

Task 5 - Based on the above stats, calculate how many Holidays there are

Solution 5:

Number_of_holidays <- seoul_bike_sharing |> 
  filter(HOLIDAY == "Holiday") |> 
  count(HOLIDAY)
Number_of_holidays

Task 6 - Calculate the percentage of records that fall on a holiday

Solution 6

holiday_percentage <- nrow(filter(seoul_bike_sharing, HOLIDAY == "Holiday"))/nrow(seoul_bike_sharing)*100
holiday_percentage
## [1] 4.819846

Task 7 - Given there is exactly a full year of data, determine how many records we expect to have

Solution 7

## A full year has 365 days
## Each day has 24 hours
Expected_number_of_records <- 365*24
Expected_number_of_records
## [1] 8760

Task 8 - Given the observations for the ‘FUNCTIONING_DAY’ how many records must there be?

Solution 8

## Number of possible records should be 2: "Yes" and "No"

Drilling Down

Let’s calculate some seasonally aggregated measures to help build some more context.

Task 9 - Load the dplyr package, group the data by SEASONS, and use the summarize() function to calculate the seasonal total rainfall and snowfall.

Solution 9

seoul_bike_sharing |>
  select(SEASONS, RAINFALL, SNOWFALL) |> 
  group_by(SEASONS) |> 
  summarise(total_rainfall = sum(RAINFALL), total_snowfall = sum(SNOWFALL))

Data Visualization

Task 10 - Create a scatter plot of RENTED_BIKE_COUNT vs DATE

Tune the opacity using the alpha parameter such that the points don’t obscure each other too much.

Solution 10

ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT))+
  geom_point(alpha = 0.2)+
  labs(title = "RENTED_BIKE_COUNT vs DATE")

Task 11 - Create the same plot of the RENTED_BIKE_COUNT time series, but now add HOURS as the colour

Solution 11

ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT, color = HOUR))+
  geom_point(alpha = 0.5)+
  labs(title = "RENTED_BIKE_COUNT vs DATE")

Distributions

Task 12 - Create a histogram overlaid with a kernel density curve

Normalize the histogram so the y axis represents ‘density’. This can be done by setting y=..density.. in the aesthetics of the histogram.

Solution 12

ggplot(seoul_bike_sharing, aes(x = RENTED_BIKE_COUNT))+
  geom_histogram(aes(y = after_stat(density)), color = "black", fill = "white", bins = 25)+
  geom_density(color = "red", alpha = 0.2)

Correlation between two variables (scatter plot)

Task 13 - Use a scatter plot to visualize the correlation between RENTED_BIKE_COUNT and TEMPERATURE by SEASONS.

Start with RENTED_BIKE_COUNT vs. TEMPERATURE, then generate four plots corresponding to the SEASONS by adding a facet_wrap() layer. Also, make use of colour and opacity to emphasize any patterns that emerge. Use HOUR as the color.

Solution 13

ggplot(seoul_bike_sharing, aes(x = TEMPERATURE, y = RENTED_BIKE_COUNT))+
  geom_point(aes(color = HOUR), alpha = 0.2)+
  facet_wrap(~ SEASONS)

Outliers (boxplot)

Task 14 - Create a display of four boxplots of RENTED_BIKE_COUNT vs. HOUR grouped by SEASONS

Use facet_wrap to generate four plots corresponding to the seasons.

Solution 14

ggplot(seoul_bike_sharing, aes(x = HOUR, y = RENTED_BIKE_COUNT))+
  geom_boxplot()+
  facet_wrap(~ SEASONS)

Task 15 - Group the data by DATE, and use the summarize() function to calculate the daily total rainfall and snowfall

Also, go ahead and plot the results if you wish.

Solution 15

seoul_bike_sharing |> 
  select(DATE, RAINFALL, SNOWFALL) |> 
  group_by(DATE) |> 
  summarise(DAILY_RAINFALL = sum(RAINFALL), DAILY_SNOWFALL = sum(SNOWFALL))
x <- seoul_bike_sharing |> 
  select(DATE, RAINFALL, SNOWFALL) |> 
  group_by(DATE) |> 
  summarise(DAILY_RAINFALL = sum(RAINFALL), DAILY_SNOWFALL = sum(SNOWFALL)) |> 
  pivot_longer(
    cols = 2:3,
    names_to = "fall",
    values_to = "values"
  ) |> 
  ggplot(aes(x = values, color = fall))+
  geom_bar(position = "dodge")

Task 16 - Determine how many days had snowfall

Solution 16

seoul_bike_sharing |> 
  select(DATE, RAINFALL, SNOWFALL) |> 
  group_by(DATE) |> 
  summarise(DAILY_RAINFALL = sum(RAINFALL), DAILY_SNOWFALL = sum(SNOWFALL)) |> 
  filter(DAILY_SNOWFALL != 0) |> 
  nrow()
## [1] 27

Predict Hourly Rented Bike Count using Basic Linear Regression Models

Loading library

library("tidymodels")

Loading dataset

bike_sharing_df <- read_csv("seoul_bike_sharing_converted_normalized.csv")

Removing DATE and FUNCTIONING_DAY

bike_sharing_df <- bike_sharing_df |> 
  select(-DATE, -FUNCTIONING_DAY_YES)

TASK: Split training and testing data

set.seed(1234)
data_split <- initial_split(bike_sharing_df, prop = 3/4)
train_data <- training(data_split)
test_data <- testing(data_split)

TASK: Build a linear regression model using weather variables only

Specifying and fitting model

lm_spec <- linear_reg() |> 
  set_engine(engine = "lm")
lm_model_weather <- lm_spec |> 
  fit(RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL,  data = train_data)

Model summary

print(lm_model_weather$fit)
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + 
##     WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + 
##     RAINFALL + SNOWFALL, data = data)
## 
## Coefficients:
##           (Intercept)            TEMPERATURE               HUMIDITY  
##              0.043532               0.675221              -0.258408  
##            WIND_SPEED             VISIBILITY  DEW_POINT_TEMPERATURE  
##              0.113806               0.003533              -0.089172  
##       SOLAR_RADIATION               RAINFALL               SNOWFALL  
##             -0.125170              -0.496346               0.089414

TASK: Build a linear regression model using all variables

lm_model_all <- lm_spec |> 
  fit(RENTED_BIKE_COUNT ~ ., data = train_data)

Printing summary of model

print(lm_model_all$fit)
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ ., data = data)
## 
## Coefficients:
##           (Intercept)            TEMPERATURE               HUMIDITY  
##             0.1543567              0.2202189             -0.2495020  
##            WIND_SPEED             VISIBILITY  DEW_POINT_TEMPERATURE  
##             0.0089795              0.0061541              0.1683701  
##       SOLAR_RADIATION               RAINFALL               SNOWFALL  
##             0.0779070             -0.5809335              0.0734309  
##                HOUR_0                 HOUR_1                 HOUR_2  
##            -0.0374527             -0.0620864             -0.0959539  
##                HOUR_3                 HOUR_4                 HOUR_5  
##            -0.1192121             -0.1379013             -0.1312685  
##                HOUR_6                 HOUR_7                 HOUR_8  
##            -0.0866424              0.0008297              0.0976841  
##                HOUR_9                HOUR_10                HOUR_11  
##            -0.0292089             -0.0960401             -0.0988159  
##               HOUR_12                HOUR_13                HOUR_14  
##            -0.0878305             -0.0830508             -0.0833568  
##               HOUR_15                HOUR_16                HOUR_17  
##            -0.0600850             -0.0227013              0.0567640  
##               HOUR_18                HOUR_19                HOUR_20  
##             0.1944273              0.1179459              0.0923429  
##               HOUR_21                HOUR_22                HOUR_23  
##             0.0964469              0.0672010                     NA  
##                WINTER                 SPRING                 SUMMER  
##            -0.1010128             -0.0471677             -0.0452610  
##                AUTUMN             NO_HOLIDAY                HOLIDAY  
##                    NA              0.0350095                     NA

TASK: Model evaluation and identification of important variables

# test_results_weather for lm_model_weather model
test_results_weather <- lm_model_weather |> 
  predict(new_data = test_data) |> 
  mutate(truth = test_data$RENTED_BIKE_COUNT)

# test_results_all for lm_model_all
test_results_all <- lm_model_all |> 
  predict(new_data = test_data) |> 
  mutate(truth = test_data$RENTED_BIKE_COUNT)

Calculating rsq and rmse

rsq_weather <- rsq(test_results_weather, truth = truth, estimate = .pred)
rsq_all <- rsq(test_results_all, truth = truth, estimate = .pred)

rmse_weather <- rmse(test_results_weather, truth = truth, estimate = .pred)
rmse_all <- rmse(test_results_all, truth = truth, estimate = .pred)

rsq_weather
rsq_all
rmse_weather
rmse_all

Checking the coefficients

coefficients <- lm_model_all$fit$coefficients

Sorting coefficients

coefficients <- sort(abs(coefficients))

Plotting coefficients

plot_df <- data.frame(coefficients)
ggplot(plot_df, aes(x = coefficients, y = reorder(rownames(plot_df), coefficients)))+
  geom_col()+
  labs(title = "Coefficients Plot", y = "Coefficients", x = "Coefficient Value")

Refine the Baseline Regression Models

TASK: Add polynomial terms

ggplot(data = train_data, aes(RENTED_BIKE_COUNT, TEMPERATURE)) + 
    geom_point() 

One solution to handle such nonlinearity is using polynomial regression by adding polynomial terms. As shown before, higher order polynomials are better than the first order polynomial.

# Plot the higher order polynomial fits
ggplot(data=train_data, aes(RENTED_BIKE_COUNT, TEMPERATURE)) + 
    geom_point() + 
    geom_smooth(method = "lm", formula = y ~ x, color="red") + 
    geom_smooth(method = "lm", formula = y ~ poly(x, 2), color="yellow") + 
    geom_smooth(method = "lm", formula = y ~ poly(x, 4), color="green") + 
    geom_smooth(method = "lm", formula = y ~ poly(x, 6), color="blue")

Fitting a polynomial regression

lm_poly <- lm_spec |> 
  fit(RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 + HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 + HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 + HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 + HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER + AUTUMN + NO_HOLIDAY +HOLIDAY, data = train_data)

Printing model summary

summary(lm_poly$fit)
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + 
##     poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + 
##     SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 + 
##     HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 + 
##     HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 + 
##     HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 + 
##     HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER + 
##     AUTUMN + NO_HOLIDAY + HOLIDAY, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42651 -0.05859 -0.00099  0.05277  0.41498 
## 
## Coefficients: (3 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.635695   0.049825  12.759  < 2e-16 ***
## poly(TEMPERATURE, 6)1 15.894648   1.255325  12.662  < 2e-16 ***
## poly(TEMPERATURE, 6)2 -0.444996   0.144287  -3.084 0.002050 ** 
## poly(TEMPERATURE, 6)3 -2.676345   0.101437 -26.384  < 2e-16 ***
## poly(TEMPERATURE, 6)4 -1.435420   0.109327 -13.130  < 2e-16 ***
## poly(TEMPERATURE, 6)5  0.072596   0.099044   0.733 0.463605    
## poly(TEMPERATURE, 6)6  0.524271   0.101200   5.181 2.28e-07 ***
## poly(HUMIDITY, 4)1     1.234317   0.596876   2.068 0.038684 *  
## poly(HUMIDITY, 4)2    -2.813352   0.125085 -22.491  < 2e-16 ***
## poly(HUMIDITY, 4)3    -1.153864   0.111476 -10.351  < 2e-16 ***
## poly(HUMIDITY, 4)4     0.134791   0.124200   1.085 0.277840    
## WIND_SPEED            -0.005880   0.010343  -0.568 0.569753    
## VISIBILITY            -0.020170   0.005405  -3.732 0.000192 ***
## DEW_POINT_TEMPERATURE -0.664475   0.080973  -8.206 2.75e-16 ***
## SOLAR_RADIATION        0.067152   0.010848   6.190 6.38e-10 ***
## RAINFALL              -0.369546   0.039318  -9.399  < 2e-16 ***
## SNOWFALL               0.094753   0.026761   3.541 0.000402 ***
## HOUR_0                -0.037590   0.008492  -4.426 9.75e-06 ***
## HOUR_1                -0.063245   0.008370  -7.556 4.74e-14 ***
## HOUR_2                -0.096212   0.008392 -11.465  < 2e-16 ***
## HOUR_3                -0.115251   0.008546 -13.487  < 2e-16 ***
## HOUR_4                -0.134147   0.008498 -15.785  < 2e-16 ***
## HOUR_5                -0.124819   0.008385 -14.885  < 2e-16 ***
## HOUR_6                -0.082217   0.008424  -9.760  < 2e-16 ***
## HOUR_7                 0.006742   0.008485   0.795 0.426912    
## HOUR_8                 0.095963   0.008405  11.417  < 2e-16 ***
## HOUR_9                -0.028405   0.008630  -3.291 0.001002 ** 
## HOUR_10               -0.088974   0.008972  -9.917  < 2e-16 ***
## HOUR_11               -0.084411   0.009450  -8.933  < 2e-16 ***
## HOUR_12               -0.064868   0.009724  -6.671 2.76e-11 ***
## HOUR_13               -0.056983   0.009881  -5.767 8.46e-09 ***
## HOUR_14               -0.052095   0.009600  -5.427 5.95e-08 ***
## HOUR_15               -0.029536   0.009439  -3.129 0.001761 ** 
## HOUR_16                0.007403   0.009076   0.816 0.414688    
## HOUR_17                0.081000   0.008845   9.157  < 2e-16 ***
## HOUR_18                0.214698   0.008561  25.077  < 2e-16 ***
## HOUR_19                0.130315   0.008456  15.411  < 2e-16 ***
## HOUR_20                0.098517   0.008373  11.766  < 2e-16 ***
## HOUR_21                0.097839   0.008391  11.660  < 2e-16 ***
## HOUR_22                0.067345   0.008339   8.076 7.96e-16 ***
## HOUR_23                      NA         NA      NA       NA    
## WINTER                -0.105679   0.006104 -17.315  < 2e-16 ***
## SPRING                -0.040911   0.003679 -11.122  < 2e-16 ***
## SUMMER                -0.031872   0.005126  -6.217 5.39e-10 ***
## AUTUMN                       NA         NA      NA       NA    
## NO_HOLIDAY             0.030800   0.005902   5.219 1.86e-07 ***
## HOLIDAY                      NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09628 on 6304 degrees of freedom
## Multiple R-squared:  0.7206, Adjusted R-squared:  0.7187 
## F-statistic: 378.1 on 43 and 6304 DF,  p-value: < 2.2e-16

Making predictions

test_result_poly <- lm_poly |> 
  predict(new_data = test_data) |> 
  mutate(truth = test_data$RENTED_BIKE_COUNT)

Calculating RSquared and RMSE

rsq_poly <- rsq(test_result_poly, truth = truth, estimate = .pred)
rmse_poly <- rmse(test_result_poly, truth = truth, estimate = .pred)

rsq_poly
rmse_poly

TASK: Add interaction terms

lm_poly_interaction <- lm_spec |> 
  fit(RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 + HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 + HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 + HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 + HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER + AUTUMN + NO_HOLIDAY + HOLIDAY + HUMIDITY*TEMPERATURE + RAINFALL*HUMIDITY + RAINFALL*WIND_SPEED, data = train_data)

Summary of model with interaction

summary(lm_poly_interaction$fit)
## 
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + 
##     poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + 
##     SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 + 
##     HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 + 
##     HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 + 
##     HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 + 
##     HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER + 
##     AUTUMN + NO_HOLIDAY + HOLIDAY + HUMIDITY * TEMPERATURE + 
##     RAINFALL * HUMIDITY + RAINFALL * WIND_SPEED, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43318 -0.05763  0.00123  0.05153  0.40824 
## 
## Coefficients: (5 not defined because of singularities)
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.539982   0.048504  11.133  < 2e-16 ***
## poly(TEMPERATURE, 6)1  14.441040   1.218215  11.854  < 2e-16 ***
## poly(TEMPERATURE, 6)2  -0.387659   0.139825  -2.772  0.00558 ** 
## poly(TEMPERATURE, 6)3  -3.406592   0.105351 -32.336  < 2e-16 ***
## poly(TEMPERATURE, 6)4  -1.588740   0.106280 -14.949  < 2e-16 ***
## poly(TEMPERATURE, 6)5   0.065726   0.095953   0.685  0.49338    
## poly(TEMPERATURE, 6)6   0.455156   0.098114   4.639 3.57e-06 ***
## poly(HUMIDITY, 4)1      4.239221   0.598874   7.079 1.61e-12 ***
## poly(HUMIDITY, 4)2     -1.796411   0.131296 -13.682  < 2e-16 ***
## poly(HUMIDITY, 4)3     -0.743184   0.110198  -6.744 1.68e-11 ***
## poly(HUMIDITY, 4)4     -0.734476   0.127842  -5.745 9.61e-09 ***
## WIND_SPEED              0.009287   0.010160   0.914  0.36071    
## VISIBILITY             -0.007575   0.005273  -1.437  0.15089    
## DEW_POINT_TEMPERATURE  -0.069582   0.083982  -0.829  0.40740    
## SOLAR_RADIATION        -0.009459   0.011181  -0.846  0.39755    
## RAINFALL              -11.962287   1.545631  -7.739 1.16e-14 ***
## SNOWFALL               -0.035156   0.026795  -1.312  0.18956    
## HOUR_0                 -0.036313   0.008227  -4.414 1.03e-05 ***
## HOUR_1                 -0.064481   0.008108  -7.953 2.15e-15 ***
## HOUR_2                 -0.096918   0.008129 -11.923  < 2e-16 ***
## HOUR_3                 -0.116559   0.008278 -14.080  < 2e-16 ***
## HOUR_4                 -0.137395   0.008234 -16.686  < 2e-16 ***
## HOUR_5                 -0.129357   0.008126 -15.920  < 2e-16 ***
## HOUR_6                 -0.085421   0.008163 -10.465  < 2e-16 ***
## HOUR_7                  0.003801   0.008220   0.462  0.64383    
## HOUR_8                  0.100389   0.008145  12.325  < 2e-16 ***
## HOUR_9                 -0.012594   0.008396  -1.500  0.13364    
## HOUR_10                -0.063556   0.008781  -7.238 5.11e-13 ***
## HOUR_11                -0.054178   0.009274  -5.842 5.43e-09 ***
## HOUR_12                -0.030306   0.009570  -3.167  0.00155 ** 
## HOUR_13                -0.025829   0.009693  -2.665  0.00772 ** 
## HOUR_14                -0.024236   0.009399  -2.579  0.00994 ** 
## HOUR_15                -0.007309   0.009209  -0.794  0.42744    
## HOUR_16                 0.020337   0.008816   2.307  0.02109 *  
## HOUR_17                 0.086685   0.008574  10.111  < 2e-16 ***
## HOUR_18                 0.215056   0.008300  25.911  < 2e-16 ***
## HOUR_19                 0.131780   0.008194  16.083  < 2e-16 ***
## HOUR_20                 0.097566   0.008113  12.026  < 2e-16 ***
## HOUR_21                 0.097388   0.008129  11.980  < 2e-16 ***
## HOUR_22                 0.067874   0.008078   8.402  < 2e-16 ***
## HOUR_23                       NA         NA      NA       NA    
## WINTER                 -0.099454   0.005921 -16.797  < 2e-16 ***
## SPRING                 -0.041233   0.003565 -11.568  < 2e-16 ***
## SUMMER                 -0.009796   0.005106  -1.918  0.05510 .  
## AUTUMN                        NA         NA      NA       NA    
## NO_HOLIDAY              0.031298   0.005720   5.472 4.63e-08 ***
## HOLIDAY                       NA         NA      NA       NA    
## HUMIDITY                      NA         NA      NA       NA    
## TEMPERATURE                   NA         NA      NA       NA    
## HUMIDITY:TEMPERATURE   -0.845092   0.043552 -19.404  < 2e-16 ***
## RAINFALL:HUMIDITY      11.717573   1.560004   7.511 6.67e-14 ***
## WIND_SPEED:RAINFALL     0.380232   0.234819   1.619  0.10544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09326 on 6301 degrees of freedom
## Multiple R-squared:  0.738,  Adjusted R-squared:  0.7361 
## F-statistic: 385.8 on 46 and 6301 DF,  p-value: < 2.2e-16

Evaluating model

test_result_interaction <- lm_poly_interaction |> 
  predict(new_data = test_data) |> 
  mutate(truth = test_data$RENTED_BIKE_COUNT)
rsq_interaction <- rsq(test_result_interaction, truth = truth, estimate = .pred)
rmse_interaction <- rmse(test_result_interaction, truth = truth, estimate = .pred)
rsq_interaction
rmse_interaction

TASK: Add regularization

library(glmnet)

Specifying model and fitting regression

bike_recipe <- recipe(RENTED_BIKE_COUNT ~ ., data = train_data)

glmnet_spec <- linear_reg(penalty = 0.1, mixture = 0) |> 
  set_engine("glmnet")

glmnet_wf <- workflow() |> 
  add_recipe(bike_recipe)

glmnet_fit <- glmnet_wf |> 
  add_model(glmnet_spec) |> 
  fit(data = train_data)

Getting model coefficients

glmnet_fit |> 
  pull_workflow_fit() |> 
  tidy()

Evaluating model

test_result_glmnet <- glmnet_fit |> 
  predict(new_data = test_data) |> 
  mutate(truth = test_data$RENTED_BIKE_COUNT)
rsq_glmnet <- rsq(test_result_glmnet, truth = truth, estimate = .pred)
rmse_glmnet <- rmse(test_result_glmnet, truth = truth, estimate = .pred)
rsq_glmnet
rmse_glmnet

Comparing the performance of the various models

models_data <- tribble(
  ~model, ~rsq, ~rmse,
  "glmnet", 0.65, 0.11,
  "weather", 0.44, 0.13,
  "all", 0.67, 0.10,
  "interaction", 0.74, 0.09,
  "poly", 0.73, 0.092
)
models_data
bar_chart <- models_data |> 
  pivot_longer(
    cols = 2:3,
    names_to = "measures",
    values_to = "values"
  ) |> 
  ggplot(aes(x = model, y = values, fill = measures))+
  geom_col(position = "dodge")+
  labs(title = "Model Evaluation Chart")
bar_chart

QQ Plot Using Interaction Model

ggplot(test_result_interaction)+
  stat_qq(aes(sample = truth), color = "green")+
  stat_qq(aes(sample = .pred), color = "red")