if (knitr::is_html_output()) "# [TOC]\n" else ""
## [1] "# [TOC]\n"
# Load neccessary packages
pacman::p_load(rvest, tidyverse, httr, stringr, magrittr)
The dataset we will be using for analysis is the bike sharing system worldwide, which can be accessed from https://en.wikipedia.org/wiki/List_of_bicycle-sharing_systems
Since this HTML page at least contains three child table nodes under the root HTML node. So, we will need to use html_nodes(root_node, “table”) function to get all its child table nodes, then create a data frame for later analysis
url <- "https://en.wikipedia.org/wiki/List_of_bicycle-sharing_systems"
data <- read_html(url)
#Get all the the child table nodes under the root HTML node
table_nodes <- html_nodes(data, "table")
#Convert data to data frame
bike_df <- data %>%
html_element("table") %>%
html_table()
print(df)
## function (x, df1, df2, ncp, log = FALSE)
## {
## if (missing(ncp))
## .Call(C_df, x, df1, df2, log)
## else .Call(C_dnf, x, df1, df2, ncp, log)
## }
## <bytecode: 0x1103634c8>
## <environment: namespace:stats>
#Export to a CSV file
write.csv(bike_df,"/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/raw_bike_sharing_system.csv")
Collecting real-time current and forecast weather data for cities using the OpenWeather API
First, using R code to get the current weather data of Seoul and save it into a dataframe
Setting api key
#URL for current weather API
current_weather_url <- 'https://api.openweathermap.org/data/2.5/weather'
#List to hold URL parameter
current_query <- list(q="Seoul",appid=api_key,units="metric")
#Make a HTTP request to the current weather API
response <- GET(current_weather_url, query=current_query)
http_type(response)
## [1] "application/json"
#Read json http data
json_result <- content(response, as="parsed")
json_result
## $coord
## $coord$lon
## [1] 126.9778
##
## $coord$lat
## [1] 37.5683
##
##
## $weather
## $weather[[1]]
## $weather[[1]]$id
## [1] 804
##
## $weather[[1]]$main
## [1] "Clouds"
##
## $weather[[1]]$description
## [1] "overcast clouds"
##
## $weather[[1]]$icon
## [1] "04d"
##
##
##
## $base
## [1] "stations"
##
## $main
## $main$temp
## [1] 26.66
##
## $main$feels_like
## [1] 26.66
##
## $main$temp_min
## [1] 24.69
##
## $main$temp_max
## [1] 26.66
##
## $main$pressure
## [1] 1012
##
## $main$humidity
## [1] 93
##
## $main$sea_level
## [1] 1012
##
## $main$grnd_level
## [1] 1006
##
##
## $visibility
## [1] 10000
##
## $wind
## $wind$speed
## [1] 1.41
##
## $wind$deg
## [1] 214
##
## $wind$gust
## [1] 2.94
##
##
## $clouds
## $clouds$all
## [1] 100
##
##
## $dt
## [1] 1689556085
##
## $sys
## $sys$type
## [1] 1
##
## $sys$id
## [1] 5509
##
## $sys$country
## [1] "KR"
##
## $sys$sunrise
## [1] 1689539004
##
## $sys$sunset
## [1] 1689591152
##
##
## $timezone
## [1] 32400
##
## $id
## [1] 1835848
##
## $name
## [1] "Seoul"
##
## $cod
## [1] 200
# Create some empty vectors to hold data temporarily
city <- c()
weather <- c()
visibility <- c()
temp <- c()
temp_min <- c()
temp_max <- c()
pressure <- c()
humidity <- c()
wind_speed <- c()
wind_deg <- c()
#Assign values in json_result into different vector
city <- c(city, json_result$name)
weather <- c(weather, json_result$weather[[1]]$main)
visibility <- c(visibility, json_result$visibility)
temp <- c(temp, json_result$main$temp)
temp_min <-c(temp_min, json_result$main$temp_min)
temp_max <- c(temp_max, json_result$main$temp_max)
pressure <- c(pressure, json_result$main$pressure)
humidity <- c(humidity, json_result$main$humidity)
wind_speed <- c(wind_speed, json_result$wind$speed)
wind_deg <-c(wind_deg, json_result$wind$deg)
#Combine all vector into data frame
weather_df <- data.frame(city = city,
weather=weather,
visibility=visibility,
temp=temp,
temp_min=temp_min,
temp_max=temp_max,
pressure=pressure,
humidity=humidity,
wind_speed=wind_speed,
wind_deg=wind_deg)
print(weather_df)
## city weather visibility temp temp_min temp_max pressure humidity wind_speed
## 1 Seoul Clouds 10000 26.66 24.69 26.66 1012 93 1.41
## wind_deg
## 1 214
# Get 5 -day weather forecast for a list of cities
weather_forecast_by_cities <- function(city_names) {
df <- data.frame()
for (city_name in city_names) {
#forecast API URL
forecast_url <-'https://api.openweathermap.org/data/2.5/weather'
#create query parameter
forecast_query <- list(q=city_name,appid=api_key, units="metric")
#make HTTP GET call for the given city
response <- GET(forecast_url, query=forecast_query)
json_result <- content(response, as="parsed")
results <- json_result$list
#Loop the json result
for(result in results) {
city <- c(city, city_name)
}
# Add R lists into a data frame
city <- c(city, json_result$name)
weather <- c(weather, json_result$weather[[1]]$main)
visibility <- c(visibility, json_result$visibility)
temp <- c(temp, json_result$main$temp)
temp_min <-c(temp_min, json_result$main$temp_min)
temp_max <- c(temp_max, json_result$main$temp_max)
pressure <- c(pressure, json_result$main$pressure)
humidity <- c(humidity, json_result$main$humidity)
wind_speed <- c(wind_speed, json_result$wind$speed)
wind_deg <-c(wind_deg, json_result$wind$deg)
#Combine all vector into data frame
df <- data.frame(city = city,
weather=weather,
visibility=visibility,
temp=temp,
temp_min=temp_min,
temp_max=temp_max,
pressure=pressure,
humidity=humidity,
wind_speed=wind_speed,
wind_deg=wind_deg)
}
return(df)
}
cities <- c("Seoul", "Washington, D.C.", "Paris", "Suzhou")
cities_weather_df <- weather_forecast_by_cities(cities)
print(cities_weather_df)
## city weather visibility temp temp_min temp_max pressure humidity
## 1 Seoul Clouds 10000 26.66 24.69 26.66 1012 93
## 2 Seoul Clouds 10000 26.66 24.69 26.66 1012 93
## 3 Washington Clouds 10000 33.46 27.76 35.32 1007 21
## 4 Paris Clear 10000 15.69 11.34 16.75 1018 76
## 5 Suzhou Clouds 10000 29.70 29.70 29.70 1008 75
## wind_speed wind_deg
## 1 1.41 214
## 2 1.41 214
## 3 2.60 316
## 4 1.54 240
## 5 4.97 118
The data will be saved to cities_weather_forecast.csv
for later use.
write.csv(cities_weather_df, "/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/cities_weather_forecast.csv", row.names = FALSE)
In this data collection process, I will collect some raw datasets
from several different sources online. Then I will use regular
expression along with the stringr package to clean-up the
bike-sharing systems data.
List of datasets that will be used:
raw_bike_sharing_systems.csv: A list of active
bike-sharing systems across the world
raw_cities_weather_forecast.csv: 5-day weather
forecasts for a list of cities, from OpenWeather API
raw_worldcities.csv: A list of major cities’ info
(such as name, latitude and longitude) across the world
raw_seoul_bike_sharing.csv: Weather information
(Temperature, Humidity, Windspeed, Visibility, Dewpoint, Solar
radiation, Snowfall, Rainfall), the number of bikes rented per hour, and
date information, from Seoul bike-sharing systems
Download datasets
url1 <- "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_worldcities.csv"
download.file(url1, destfile="/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/raw_worldcities.csv")
download.file("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_seoul_bike_sharing.csv",destfile = "raw_seoul_bike_sharing.csv")
Put the dataset downloaded into the datasets_list
dataset_list <- c('raw_bike_sharing_system.csv', 'raw_seoul_bike_sharing.csv', 'cities_weather_forecast.csv', 'raw_worldcities.csv')
#Convert iterate over the above datasets and convert their column names
for (dataset_name in dataset_list) {
dataset <- read.csv(dataset_name)
names(dataset) <- toupper(names(dataset))
names(dataset) <- str_replace_all(names(dataset), " ", "_")
write.csv(dataset, dataset_name, row.names = FALSE)
}
Since the datasets are downloaded from the web, there are some values needed to cleaning up. For this project, we will focus on processing some relevant columns: COUNTRY, CITY, SYSTEM, BICYCLES
First load the datasets and take a look at it
bike_sharing_df <- read.csv("/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/raw_bike_sharing_system.csv")
head(bike_sharing_df)
## X COUNTRY CITY NAME SYSTEM
## 1 1 Albania Tirana[5] Ecovolis
## 2 2 Argentina Buenos Aires[6][7] Ecobici Serttel Brasil[8]
## 3 3 Argentina Mendoza[10] Metrobici
## 4 4 Argentina Rosario Mi Bici Tu Bici[11]
## 5 5 Argentina San Lorenzo, Santa Fe Biciudad Biciudad
## 6 6 Australia Melbourne[12] Melbourne Bike Share PBSC & 8D
## OPERATOR LAUNCHED DISCONTINUED STATIONS
## 1 March 2011 8
## 2 Bike In Baires Consortium[9] 2010 400
## 3 2014 2
## 4 2 December 2015 47
## 5 27 November 2016 8
## 6 Motivate June 2010 30 November 2019[13] 53
## BICYCLES DAILY.RIDERSHIP
## 1 200
## 2 4000 21917
## 3 40
## 4 480
## 5 80
## 6 676
Create a sub data frame of these four columns to process
sub_bike_sharing_df <- bike_sharing_df %>%
select(COUNTRY, CITY, SYSTEM, BICYCLES)
#Check the type of data in those columns
sub_bike_sharing_df %>%
summarize_all(class) %>%
gather(variable, class)
## variable class
## 1 COUNTRY character
## 2 CITY character
## 3 SYSTEM character
## 4 BICYCLES character
Check why column BYCICLES is in character class
find_character <- function(strings) grepl("[^0-9]", strings) #Create Function
sub_bike_sharing_df %>%
select(BICYCLES) %>% #Use the function to check BICYCLES column
filter(find_character(BICYCLES)) %>%
slice(0:10)
## BICYCLES
## 1 1790 (2019)[21]
## 2 4200 (2021)
## 3 4115[25]
## 4 7270 (regular) 2395 (electric)[38]
## 5 310[65]
## 6 500[75]
## 7 [78]
## 8 180[79]
## 9 600[82]
## 10 initially 800 (later 2500)
Because there are some values associated with numeric and non-numeric value, BYCICLES was classified as character.
Check if COUNTRY, CITY, SYSTEM have any reference link, such as Melbourne[12]
#Create a function to check if there is any reference link in the values
ref_pattern <- "\\[[A-z0-9]+\\]"
find_reference_pattern <- function(strings) grepl(ref_pattern, strings)
# Use the function to check if COUNTRY, CITY, SYSTEM have any reference link
sub_bike_sharing_df %>%
select(COUNTRY) %>%
filter(find_reference_pattern(COUNTRY)) %>%
slice(1:10) #subset the df with first 11 rows (code will quickly find the match the filter criteria without overwhelming)
## [1] COUNTRY
## <0 rows> (or 0-length row.names)
sub_bike_sharing_df %>%
select(CITY) %>%
filter(find_reference_pattern(CITY)) %>%
slice(1:10)
## CITY
## 1 Tirana[5]
## 2 Buenos Aires[6][7]
## 3 Mendoza[10]
## 4 Melbourne[12]
## 5 Melbourne[12]
## 6 Brisbane[14][15]
## 7 Lower Austria[16]
## 8 Different locations[19]
## 9 Brussels[24]
## 10 Namur[26]
sub_bike_sharing_df %>%
select(SYSTEM) %>%
filter(find_reference_pattern(SYSTEM)) %>%
slice(1:10)
## SYSTEM
## 1 Serttel Brasil[8]
## 2 EasyBike[64]
## 3 4 Gen.[72]
## 4 3 Gen. SmooveKey[135]
## 5 3 Gen. Smoove[162][163][164][160]
## 6 3 Gen. Smoove[200]
## 7 3 Gen. Smoove[202]
## 8 3 Gen. Smoove[204]
COUNTRY column is clean, CITY and SYSTEM have some reference links need to be cleaned
#Create a function to remove reference links
remove_ref <- function(strings) {
ref_pattern <- "\\[[A-z0-9]+\\]" # Define a pattern matching a reference link such as [1]
result <- stringr::str_replace_all(strings,ref_pattern,"") # Replace all matched substrings with a white space
result <- trimws(result)
return(result)
}
# Use the function to remove the reference links
sub_bike_sharing_df %<>% #use mutate and remove_ref fcn to remove ref in CITY and SYSTEM
mutate(SYSTEM=remove_ref(SYSTEM),
CITY=remove_ref(CITY))
# Check whether all reference links are removed
sub_bike_sharing_df %>%
select(COUNTRY, CITY, SYSTEM, BICYCLES) %>%
filter(find_reference_pattern(COUNTRY) | find_reference_pattern(CITY) | find_reference_pattern(CITY) |find_reference_pattern(BICYCLES) )
## COUNTRY CITY
## 1 Belgium Different locations
## 2 Belgium Brussels
## 3 Canada Montreal
## 4 Cyprus Limassol, Nicosia District
## 5 Czechia Prague
## 6 Czechia Prague 7
## 7 Czechia Prostějov
## 8 Czechia Ostrava
## 9 Denmark Farsø
## 10 Finland Kouvola
## 11 Finland Kuopio
## 12 Finland Lahti
## 13 Finland Lappeenranta
## 14 Finland Pori
## 15 Finland Raseborg
## 16 Finland Riihimäki
## 17 Finland Tampere
## 18 Finland Turku
## 19 Finland Varkaus
## 20 Georgia Batumi
## 21 Germany Darmstadt
## 22 Greece Corfu
## 23 Hungary Budapest
## 24 Hungary Győr
## 25 Hungary Kaposvár
## 26 Italy Milan
## 27 Lithuania Kaunas
## 28 Netherlands Various Locations (especially railway stations)
## 29 Russia Kazan
## 30 Slovakia Bratislava
## 31 Slovakia Bratislava
## 32 Slovakia Košice
## 33 Slovakia Moldava nad Bodvou
## 34 Slovakia Žilina
## 35 South Korea Changwon
## 36 United Kingdom Glasgow, Scotland
## 37 United Kingdom Greater Manchester, England
## 38 United Kingdom Edinburgh, Scotland
## 39 United Kingdom Liverpool, England
## SYSTEM BICYCLES
## 1 Blue-bike 1790 (2019)[21]
## 2 3 Gen. Cyclocity 4115[25]
## 3 PBSC & 8D 7270 (regular) 2395 (electric)[38]
## 4 3 Gen. Smoove 310[65]
## 5 500[75]
## 6 4 Gen. Ofo [78]
## 7 3 Gen. nextbike 180[79]
## 8 3 Gen. nextbike 600[82]
## 9 2 Gen [89]
## 10 Donkey Republic 60(regular) 10(electric)(2022)[96]
## 11 Freebike 350(2022) [97]
## 12 Freebike 250 (2022) [98]
## 13 Donkey Republic 120(2022)[99]
## 14 Rolanbike 50 (2022) [103]
## 15 Donkey Republic 30 (2022) [104]
## 16 Donkey Republic 60 (2022) [105]
## 17 CityBike Global 700 (2022)[106]
## 18 Nextbike 700 (2022) [107]
## 19 Juro 20 (2022) [110]
## 20 3 Gen. SmooveKey 370[136]
## 21 3 & 4 Gen. Call a Bike flex 350 [147]
## 22 3 Gen. Smoove 100[64]
## 23 3 Gen. 1526[180]
## 24 180[182]
## 25 32 (including 6 rollers) [183]
## 26 3 Gen. Clear CC 5430 (1000 E)[195]
## 27 150 E[207]
## 28 OV-Fiets/Nederlandse Spoorwegen 21700 [210]
## 29 3 Gen. Cyclocity 120[238]
## 30 400[240]
## 31 80[242]
## 32 500[246]
## 33 25[249]
## 34 nextbike 123[263]
## 35 2348[272]
## 36 3 Gen. nextbike 400[294]
## 37 1500[296]
## 38 Urban Sharing 500[297]
## 39 1000[298]
Extract the numeric value to clean the BICYCLES column
extract_num <- function(columns) {
digitals_pattern <- "\\d+" #define a pattern matching digital substring
str_extract(columns,digitals_pattern) %>%
as.numeric()
}
sub_bike_sharing_df %<>% #use mutate and to apply function to BICYLCLES
mutate(BICYCLES=extract_num(BICYCLES))
Write the clean dataset to CSV file
write.csv(sub_bike_sharing_df,"/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/bike_sharing_system.csv")
This part will focus on wrangling the Seoul bike-sharing demand historical dataset. This is the core dataset to build a predictive model later.
Standardize the column name for later use
dataset_list <- c('bike_sharing_system.csv','raw_seoul_bike_sharing.csv')
for (dataset_name in dataset_list) {
dataset <- read.csv(dataset_name)
names(dataset) <- toupper(names(dataset))
names(dataset) <- str_replace_all(names(dataset), " ", "_")
write.csv(dataset, dataset_name, row.names = FALSE)
}
# Load the dataset
bike_sharing_df <- read.csv("raw_seoul_bike_sharing.csv")
summary(bike_sharing_df)
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE
## Length:8760 Min. : 2.0 Min. : 0.00 Min. :-17.80
## Class :character 1st Qu.: 214.0 1st Qu.: 5.75 1st Qu.: 3.40
## Mode :character Median : 542.0 Median :11.50 Median : 13.70
## Mean : 729.2 Mean :11.50 Mean : 12.87
## 3rd Qu.:1084.0 3rd Qu.:17.25 3rd Qu.: 22.50
## Max. :3556.0 Max. :23.00 Max. : 39.40
## NA's :295 NA's :11
## HUMIDITY WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 940 1st Qu.: -4.700
## Median :57.00 Median :1.500 Median :1698 Median : 5.100
## Mean :58.23 Mean :1.725 Mean :1437 Mean : 4.074
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 14.800
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
##
## SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8760
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5691 Mean : 0.1487 Mean :0.07507
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## HOLIDAY FUNCTIONING_DAY
## Length:8760 Length:8760
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
dim(bike_sharing_df) #show the dimension: number of rows, number of columns
## [1] 8760 14
Handle missing value in RENTED_BIKE_COUNT and TEMPERATURE column
bike_sharing_df <- drop_na(bike_sharing_df, RENTED_BIKE_COUNT) #drop the NA value because this is a dependent variable, only 3% of the dataset
na_rows <- bike_sharing_df[is.na(bike_sharing_df$TEMPERATURE), ]
print(na_rows) #-> all the NA value is in the summer
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE HUMIDITY WIND_SPEED
## 4483 07/06/2018 3221 18 NA 57 2.7
## 4599 12/06/2018 1246 14 NA 45 2.2
## 4626 13/06/2018 2664 17 NA 57 3.3
## 4722 17/06/2018 2330 17 NA 58 3.3
## 4796 20/06/2018 2741 19 NA 61 2.7
## 5030 30/06/2018 1144 13 NA 87 1.7
## 5147 05/07/2018 827 10 NA 75 1.1
## 5290 11/07/2018 634 9 NA 96 0.6
## 5311 12/07/2018 593 6 NA 93 1.1
## 5525 21/07/2018 347 4 NA 77 1.2
## 6288 21/08/2018 1277 23 NA 75 0.1
## VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## 4483 1217 16.4 0.96 0.0 0 Summer
## 4599 1961 12.7 1.39 0.0 0 Summer
## 4626 919 16.4 0.87 0.0 0 Summer
## 4722 865 16.7 0.66 0.0 0 Summer
## 4796 1236 17.5 0.60 0.0 0 Summer
## 5030 390 23.2 0.71 3.5 0 Summer
## 5147 1028 20.8 1.22 0.0 0 Summer
## 5290 450 24.9 0.41 0.0 0 Summer
## 5311 852 24.3 0.01 0.0 0 Summer
## 5525 1203 21.2 0.00 0.0 0 Summer
## 6288 1892 20.8 0.00 0.0 0 Summer
## HOLIDAY FUNCTIONING_DAY
## 4483 No Holiday Yes
## 4599 No Holiday Yes
## 4626 No Holiday Yes
## 4722 No Holiday Yes
## 4796 No Holiday Yes
## 5030 No Holiday Yes
## 5147 No Holiday Yes
## 5290 No Holiday Yes
## 5311 No Holiday Yes
## 5525 No Holiday Yes
## 6288 No Holiday Yes
Since all of the NA values in TEMPERATURE is in the summer and TEMPERATURE is an independent variables, they can’t be dropped but should be replaced with the average temperature in summer.
#calculate the average temperature in summer
summer_temp <- bike_sharing_df[bike_sharing_df$SEASONS == "Summer", ]
summer_avg_temp <- mean(summer_temp$TEMPERATURE, na.rm=TRUE)
print(summer_avg_temp)
## [1] 26.58771
# replace NA with average temperature in summer
bike_sharing_df["TEMPERATURE"][is.na(bike_sharing_df["TEMPERATURE"])] <- summer_avg_temp
Save the clean dataset
write.csv(bike_sharing_df,"/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing.csv")
library(fastDummies) #package to create dummy variables
bike_sharing_df <- read.csv("/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing.csv")
In the bike-sharing demand dataset, SEASONS, HOLIDAY, FUNCTIONING_DAY are categorical variable. HOUR is considered categorical variable because it levels range from 0 to 23
bike_sharing_df %>%
mutate(HOUR=as.character(HOUR)) %>% #convert HOUR to character because it's from 0 to 23
head(10)
## X DATE RENTED_BIKE_COUNT HOUR TEMPERATURE HUMIDITY WIND_SPEED
## 1 1 01/12/2017 254 0 -5.2 37 2.2
## 2 2 01/12/2017 204 1 -5.5 38 0.8
## 3 3 01/12/2017 173 2 -6.0 39 1.0
## 4 4 01/12/2017 107 3 -6.2 40 0.9
## 5 5 01/12/2017 78 4 -6.0 36 2.3
## 6 6 01/12/2017 100 5 -6.4 37 1.5
## 7 7 01/12/2017 181 6 -6.6 35 1.3
## 8 8 01/12/2017 460 7 -7.4 38 0.9
## 9 9 01/12/2017 930 8 -7.6 37 1.1
## 10 10 01/12/2017 490 9 -6.5 27 0.5
## VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## 1 2000 -17.6 0.00 0 0 Winter
## 2 2000 -17.6 0.00 0 0 Winter
## 3 2000 -17.7 0.00 0 0 Winter
## 4 2000 -17.6 0.00 0 0 Winter
## 5 2000 -18.6 0.00 0 0 Winter
## 6 2000 -18.7 0.00 0 0 Winter
## 7 2000 -19.5 0.00 0 0 Winter
## 8 2000 -19.3 0.00 0 0 Winter
## 9 2000 -19.8 0.01 0 0 Winter
## 10 1928 -22.4 0.23 0 0 Winter
## HOLIDAY FUNCTIONING_DAY
## 1 No Holiday Yes
## 2 No Holiday Yes
## 3 No Holiday Yes
## 4 No Holiday Yes
## 5 No Holiday Yes
## 6 No Holiday Yes
## 7 No Holiday Yes
## 8 No Holiday Yes
## 9 No Holiday Yes
## 10 No Holiday Yes
For later usage to build the model, SEASONS, HOLIDAY and HOUE should be converted into indicator columns.
# load the package
pacman::p_load(fastDummies)
bike_sharing_df <- dummy_cols(bike_sharing_df, select_columns = "HOUR")
bike_sharing_df <- dummy_cols(bike_sharing_df, select_columns = "HOLIDAY")
bike_sharing_df <- dummy_cols(bike_sharing_df, select_columns = "SEASONS")
#Change the colnames for shorterning
colnames(bike_sharing_df)[c(40,41,42,43,44,45)] <- c("HOLIDAY", "NO HOLIDAY", "AUTUMN", "SPRING", "SUMMER","WINTER")
Save the dataset
write.csv(bike_sharing_df,"/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing_converted.csv")
#Create a function for min-max normalization
minmax_norm <- function(x){
(x-min(x))/(max(x)-min(x))}
bike_sharing_df <- read.csv("/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing_converted.csv")
#Apply min-max normalization function to numerical columns in df
bike_sharing_df %<>%
mutate(TEMPERATURE = minmax_norm(TEMPERATURE),
HUMIDITY = minmax_norm(HUMIDITY),
WIND_SPEED = minmax_norm(WIND_SPEED),
VISIBILITY = minmax_norm(VISIBILITY),
DEW_POINT_TEMPERATURE = minmax_norm(DEW_POINT_TEMPERATURE),
SOLAR_RADIATION = minmax_norm(SOLAR_RADIATION),
RAINFALL = minmax_norm(RAINFALL),
SNOWFALL = minmax_norm(SNOWFALL))
head(bike_sharing_df)
## X.1 X DATE RENTED_BIKE_COUNT HOUR TEMPERATURE HUMIDITY WIND_SPEED
## 1 1 1 01/12/2017 254 0 0.2202797 0.3775510 0.2972973
## 2 2 2 01/12/2017 204 1 0.2150350 0.3877551 0.1081081
## 3 3 3 01/12/2017 173 2 0.2062937 0.3979592 0.1351351
## 4 4 4 01/12/2017 107 3 0.2027972 0.4081633 0.1216216
## 5 5 5 01/12/2017 78 4 0.2062937 0.3673469 0.3108108
## 6 6 6 01/12/2017 100 5 0.1993007 0.3775510 0.2027027
## VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## 1 1 0.2249135 0 0 0 Winter
## 2 1 0.2249135 0 0 0 Winter
## 3 1 0.2231834 0 0 0 Winter
## 4 1 0.2249135 0 0 0 Winter
## 5 1 0.2076125 0 0 0 Winter
## 6 1 0.2058824 0 0 0 Winter
## HOLIDAY FUNCTIONING_DAY HOUR_0 HOUR_1 HOUR_2 HOUR_3 HOUR_4 HOUR_5 HOUR_6
## 1 No Holiday Yes 1 0 0 0 0 0 0
## 2 No Holiday Yes 0 1 0 0 0 0 0
## 3 No Holiday Yes 0 0 1 0 0 0 0
## 4 No Holiday Yes 0 0 0 1 0 0 0
## 5 No Holiday Yes 0 0 0 0 1 0 0
## 6 No Holiday Yes 0 0 0 0 0 1 0
## HOUR_7 HOUR_8 HOUR_9 HOUR_10 HOUR_11 HOUR_12 HOUR_13 HOUR_14 HOUR_15 HOUR_16
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## HOUR_17 HOUR_18 HOUR_19 HOUR_20 HOUR_21 HOUR_22 HOUR_23 HOLIDAY.1 NO.HOLIDAY
## 1 0 0 0 0 0 0 0 0 1
## 2 0 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 1
## 5 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 1
## AUTUMN SPRING SUMMER WINTER
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 1
## 5 0 0 0 1
## 6 0 0 0 1
Save the dataset
#Save as seoul_bike_sharing_converted_normalized.csv
write.csv(bike_sharing_df,"/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing_converted_normalized.csv")
Standardize the column names again for the new datasets
dataset_list <- c('seoul_bike_sharing.csv', 'seoul_bike_sharing_converted.csv', 'seoul_bike_sharing_converted_normalized.csv')
for (dataset_name in dataset_list) {
dataset <- read.csv(dataset_name)
names(dataset) <- toupper(names(dataset))
names(dataset) <- str_replace_all(names(dataset), " ", "_")
write.csv(dataset, dataset_name, row.names = FALSE)
}
This part is to perform Exploratory Data Analysis using tidyverse and ggplot2 R packages, with the objective is to explore and generate some insights from the analysis.
seoul_bike_sharing <- read.csv("/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing.csv")
str(seoul_bike_sharing)
## 'data.frame': 8465 obs. of 15 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DATE : chr "01/12/2017" "01/12/2017" "01/12/2017" "01/12/2017" ...
## $ RENTED_BIKE_COUNT : int 254 204 173 107 78 100 181 460 930 490 ...
## $ HOUR : int 0 1 2 3 4 5 6 7 8 9 ...
## $ TEMPERATURE : num -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
## $ HUMIDITY : int 37 38 39 40 36 37 35 38 37 27 ...
## $ WIND_SPEED : num 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
## $ VISIBILITY : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 1928 ...
## $ DEW_POINT_TEMPERATURE: num -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
## $ SOLAR_RADIATION : num 0 0 0 0 0 0 0 0 0.01 0.23 ...
## $ RAINFALL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOWFALL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEASONS : chr "Winter" "Winter" "Winter" "Winter" ...
## $ HOLIDAY : chr "No Holiday" "No Holiday" "No Holiday" "No Holiday" ...
## $ FUNCTIONING_DAY : chr "Yes" "Yes" "Yes" "Yes" ...
#make sure there's no missing values
seoul_bike_sharing$DATE = as.Date(seoul_bike_sharing$DATE,format="%d/%m/%Y") #recast date as a date format
seoul_bike_sharing$HOUR <- factor(seoul_bike_sharing$HOUR, levels = 0:23, ordered = TRUE) #cast the HOUR as categorical variables
seoul_bike_sharing$SEASONS <- factor(seoul_bike_sharing$SEASONS, levels=c("Winter", "Spring", "Summer","Autumn"))
class(seoul_bike_sharing$HOUR)
## [1] "ordered" "factor"
class(seoul_bike_sharing$DATE)
## [1] "Date"
class(seoul_bike_sharing$SEASONS)
## [1] "factor"
sum(is.na(seoul_bike_sharing))
## [1] 0
summary(seoul_bike_sharing)
## X DATE RENTED_BIKE_COUNT HOUR
## Min. : 1 Min. :2017-12-01 Min. : 2.0 7 : 353
## 1st Qu.:2117 1st Qu.:2018-02-27 1st Qu.: 214.0 8 : 353
## Median :4233 Median :2018-05-28 Median : 542.0 9 : 353
## Mean :4233 Mean :2018-05-28 Mean : 729.2 10 : 353
## 3rd Qu.:6349 3rd Qu.:2018-08-24 3rd Qu.:1084.0 11 : 353
## Max. :8465 Max. :2018-11-30 Max. :3556.0 12 : 353
## (Other):6347
## TEMPERATURE HUMIDITY WIND_SPEED VISIBILITY
## Min. :-17.80 Min. : 0.00 Min. :0.000 Min. : 27
## 1st Qu.: 3.00 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 935
## Median : 13.50 Median :57.00 Median :1.500 Median :1690
## Mean : 12.77 Mean :58.15 Mean :1.726 Mean :1434
## 3rd Qu.: 22.70 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000
## Max. : 39.40 Max. :98.00 Max. :7.400 Max. :2000
##
## DEW_POINT_TEMPERATURE SOLAR_RADIATION RAINFALL SNOWFALL
## Min. :-30.600 Min. :0.0000 Min. : 0.0000 Min. :0.00000
## 1st Qu.: -5.100 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000
## Median : 4.700 Median :0.0100 Median : 0.0000 Median :0.00000
## Mean : 3.945 Mean :0.5679 Mean : 0.1491 Mean :0.07769
## 3rd Qu.: 15.200 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. : 27.200 Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## SEASONS HOLIDAY FUNCTIONING_DAY
## Winter:2160 Length:8465 Length:8465
## Spring:2160 Class :character Class :character
## Summer:2208 Mode :character Mode :character
## Autumn:1937
##
##
##
#calculate how many holiday there are
holiday_count <- table(seoul_bike_sharing$HOLIDAY)
num_holiday <- holiday_count['Holiday']
num_holiday
## Holiday
## 408
#% if records fall on a holiday
num_holiday/(num_holiday +holiday_count['No Holiday'])
## Holiday
## 0.04819846
#calculate the number of rainfall and snowfall by seasons
seasonal_total <- seoul_bike_sharing %>%
group_by(SEASONS) %>%
summarize(total_rainfall=sum(RAINFALL), total_snowfall=sum(SNOWFALL))
seasonal_total
## # A tibble: 4 × 3
## SEASONS total_rainfall total_snowfall
## <fct> <dbl> <dbl>
## 1 Winter 70.9 535.
## 2 Spring 404. 0
## 3 Summer 560. 0
## 4 Autumn 228. 123
Scatter plot of RENTED_BIKE_COUNT and DATE (plot1.png)
seoul_bike_sharing %>%
mutate(DATE = as.Date(DATE,format="%d/%m/%Y"))%>%
ggplot(aes(x = as.Date(DATE), y = RENTED_BIKE_COUNT, color = HOUR, alpha=0.5)) +
geom_point() +
scale_x_date(date_labels = "%d/%m/%Y") +
labs(x= "Date")
Histogram overlaid with a kernel density curve (histogram.png)
ggplot(seoul_bike_sharing, aes(RENTED_BIKE_COUNT)) +
geom_histogram(aes(y=..density..))+
geom_density(col="red")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can see from the histogram that most of the time there are relatively few bikes rented, mode is about 250.
#Load the packages:
pacman::p_load(rlang, tidymodels, stringr, broom, ggplot2)
#Load the dataset
bike_sharing_df <- read.csv("/Users/ngocquyenquyennguyen/Library/CloudStorage/OneDrive-UniversityofNebraska-Lincoln/R/IBM Data Science with R/seoul_bike_sharing_converted_normalized.csv")
Since DATE and FUNCTIONING_DAY is unnecessary, they are dropped
bike_sharing_df <- bike_sharing_df %>%
select(-DATE, -FUNCTIONING_DAY, -X.2, -X.1, -X, -HOUR, -SEASONS, -HOLIDAY)
colnames(bike_sharing_df)[c(34,35)] <- c("HOLIDAY", "NO_HOLIDAY")
Split the training and testing data with 75% of the original dataset
set.seed(1234)
data_split <- initial_split(bike_sharing_df, prop = 3/4) #set the training dataset with 75% of the original dataset
bike_train <- training(data_split)
bike_test <- testing(data_split)
Build some simple linear models
# Task: Build a linear regression model using weather variables only
lm_model_weather <- lm(RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL,
data=bike_train)
summary(lm_model_weather)
##
## Call:
## lm(formula = RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED +
## VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL +
## SNOWFALL, data = bike_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1348.46 -294.03 -57.28 208.59 2329.78
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156.71 58.07 2.699 0.00698 **
## TEMPERATURE 2399.74 261.66 9.171 < 2e-16 ***
## HUMIDITY -918.38 126.79 -7.243 4.9e-13 ***
## WIND_SPEED 404.47 48.16 8.399 < 2e-16 ***
## VISIBILITY 12.56 24.86 0.505 0.61351
## DEW_POINT_TEMPERATURE -316.92 278.83 -1.137 0.25575
## SOLAR_RADIATION -444.85 34.69 -12.824 < 2e-16 ***
## RAINFALL -1764.01 182.65 -9.658 < 2e-16 ***
## SNOWFALL 317.78 131.58 2.415 0.01576 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 487.3 on 6339 degrees of freedom
## Multiple R-squared: 0.4303, Adjusted R-squared: 0.4296
## F-statistic: 598.5 on 8 and 6339 DF, p-value: < 2.2e-16
#Task: Build a linear regression model using all variables
lm_model_all <- lm(RENTED_BIKE_COUNT ~ .,
data=bike_train)
summary(lm_model_all)
##
## Call:
## lm(formula = RENTED_BIKE_COUNT ~ ., data = bike_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1401.45 -218.96 -7.31 199.53 1780.67
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 316.008 52.341 6.037 1.65e-09 ***
## TEMPERATURE 782.658 212.129 3.690 0.000227 ***
## HUMIDITY -886.730 99.492 -8.913 < 2e-16 ***
## WIND_SPEED 31.913 40.275 0.792 0.428169
## VISIBILITY 21.872 20.262 1.079 0.280439
## DEW_POINT_TEMPERATURE 598.387 221.369 2.703 0.006888 **
## SOLAR_RADIATION 276.882 41.466 6.677 2.64e-11 ***
## RAINFALL -2064.638 143.276 -14.410 < 2e-16 ***
## SNOWFALL 260.973 103.498 2.522 0.011709 *
## HOUR_0 -133.107 33.323 -3.994 6.56e-05 ***
## HOUR_1 -220.655 32.838 -6.719 1.98e-11 ***
## HOUR_2 -341.020 32.910 -10.362 < 2e-16 ***
## HOUR_3 -423.680 33.498 -12.648 < 2e-16 ***
## HOUR_4 -490.101 33.297 -14.719 < 2e-16 ***
## HOUR_5 -466.528 32.826 -14.212 < 2e-16 ***
## HOUR_6 -307.927 32.990 -9.334 < 2e-16 ***
## HOUR_7 2.949 33.207 0.089 0.929246
## HOUR_8 347.169 32.967 10.531 < 2e-16 ***
## HOUR_9 -103.808 33.853 -3.066 0.002175 **
## HOUR_10 -341.327 35.106 -9.723 < 2e-16 ***
## HOUR_11 -351.192 36.879 -9.523 < 2e-16 ***
## HOUR_12 -312.150 37.820 -8.253 < 2e-16 ***
## HOUR_13 -295.163 38.411 -7.684 1.77e-14 ***
## HOUR_14 -296.250 37.268 -7.949 2.21e-15 ***
## HOUR_15 -213.542 36.764 -5.808 6.61e-09 ***
## HOUR_16 -80.680 35.369 -2.281 0.022575 *
## HOUR_17 201.739 34.547 5.839 5.50e-09 ***
## HOUR_18 690.995 33.487 20.634 < 2e-16 ***
## HOUR_19 419.180 33.099 12.664 < 2e-16 ***
## HOUR_20 328.187 32.827 9.997 < 2e-16 ***
## HOUR_21 342.772 32.918 10.413 < 2e-16 ***
## HOUR_22 238.833 32.723 7.299 3.26e-13 ***
## HOUR_23 NA NA NA NA
## HOLIDAY -124.424 22.948 -5.422 6.11e-08 ***
## NO_HOLIDAY NA NA NA NA
## AUTUMN 358.999 20.290 17.694 < 2e-16 ***
## SPRING 191.365 19.362 9.884 < 2e-16 ***
## SUMMER 198.142 29.187 6.789 1.24e-11 ***
## WINTER NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 377.9 on 6312 degrees of freedom
## Multiple R-squared: 0.6589, Adjusted R-squared: 0.657
## F-statistic: 348.4 on 35 and 6312 DF, p-value: < 2.2e-16
We use R-square (rsq) and Root Mean Square to evaluate and identify the most important varibales
# Use model to make prediction
lm_model_weather_pred <- predict(lm_model_weather, newdata = bike_test)
test_results_weather <- data.frame(PREDICTION=lm_model_weather_pred, TRUTH = bike_test$RENTED_BIKE_COUNT)
lm_model_all_pred <- predict(lm_model_all, newdata = bike_test)
## Warning in predict.lm(lm_model_all, newdata = bike_test): prediction from a
## rank-deficient fit may be misleading
test_results_all <- data.frame(PREDICTION = lm_model_all_pred, TRUTH = bike_test$RENTED_BIKE_COUNT)
summary(lm_model_weather)$r.squared #0.4303
## [1] 0.4302915
summary(lm_model_all)$r.squared #0.6589
## [1] 0.6589113
rmse_weather <- sqrt(mean((test_results_weather$TRUTH-test_results_weather$PREDICTION)^2))
rmse_all <- sqrt(mean((test_results_all$TRUTH-test_results_all$PREDICTION)^2))
print(rmse_weather) #474.6247
## [1] 474.6247
print(rmse_all) #361.9543
## [1] 364.4235
Plotting the coeficients in a bar chart
# create a data frame of coefficients
coef_df <- tidy(lm_model_all)
# plot the coefficients in a bar chart (coef plot.png)
ggplot(coef_df, aes(x = reorder(term, desc(abs(estimate))), y = abs(estimate))) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
xlab("Predictor") +
ylab("Coefficient") +
ggtitle("Coefficients of Linear Model") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 3 rows containing missing values (`position_stack()`).
The prediction from model using all variables may be misleading because there is colinearity in the predictor variables. This issue can be solved using glmnet model, polynomials and interaction terms
#load the packages:
pacman::p_load(tidymodels, tidyverse, stringr)
#define a linear regression model specification.
lm_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
lm_spec
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
#split training and test data
set.seed(1234)
data.split <- initial_split(bike_sharing_df, prop=4/5)
bike_train <- training(data.split)
bike_test <- testing(data.split)
First, adding polynomial terms
#(poly1.png)
ggplot(data=bike_train, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
geom_point() #nonlinearity -> polynomial regression
# (poly2.png)
ggplot(data=bike_train, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
geom_point() +
geom_smooth(method = "lm", formula = y~x, color="red") +
geom_smooth(method="lm", formula = y~poly(x,2), color="yellow") +
geom_smooth(method="lm", formula = y~poly(x,2), color="green") +
geom_smooth(method="lm", formula = y~poly(x,2), color="blue")
# Fit a linear model with higher order polynomial on some important variables
lm_poly <- lm(RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) +
poly(HUMIDITY, 4)+
poly(RAINFALL,2), data = bike_train)
summary(lm_poly$fit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -713.1 350.2 758.6 734.9 1136.4 1457.4
lm_poly_pred <- predict(lm_poly, newdata = bike_test) #predict
test_results_poly = data.frame(PREDICTION = lm_poly_pred, TRUTH = bike_test$RENTED_BIKE_COUNT) #create df for test results
#convert all negative prediction to 0 (RENTED_BIKE_COUNT can't be negative)
test_results_poly <- test_results_poly %>%
mutate(PREDICTION = ifelse(PREDICTION <0, 0, PREDICTION))
#calculate R_squared and RMSE (better than lm_weather but worse than lm_all)
summary(lm_poly)$r.squared #0.4861
## [1] 0.4861033
rmse_poly <- sqrt(mean ( (test_results_poly$TRUTH - test_results_poly$PREDICTION)^2) )
rmse_poly #451.7091
## [1] 451.7091
The effect of predictor variable TEMPERATURE on RENTED_BIKE_COUNT may also depend on other variables such as HUMIDITY, RAINFALL, or both (they interact) and the effect of SEASON on RENTED_BIKE_COUNT may also depend on HOLIDAY, HOUR, or both.
#Task: Add Interaction Terms
lm_poly_interaction <- lm(RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + poly(HUMIDITY, 4)+poly(RAINFALL,2)+
RAINFALL*HUMIDITY + TEMPERATURE*HUMIDITY,
data = bike_train)
summary(lm_poly_interaction)
##
## Call:
## lm(formula = RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + poly(HUMIDITY,
## 4) + poly(RAINFALL, 2) + RAINFALL * HUMIDITY + TEMPERATURE *
## HUMIDITY, data = bike_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1315.33 -254.96 -65.41 171.27 2220.84
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1503.3 61.3 24.522 < 2e-16 ***
## poly(TEMPERATURE, 6)1 58400.4 1574.8 37.085 < 2e-16 ***
## poly(TEMPERATURE, 6)2 -5101.4 480.5 -10.618 < 2e-16 ***
## poly(TEMPERATURE, 6)3 -12619.0 486.6 -25.930 < 2e-16 ***
## poly(TEMPERATURE, 6)4 -4205.2 460.9 -9.124 < 2e-16 ***
## poly(TEMPERATURE, 6)5 -710.4 456.6 -1.556 0.119806
## poly(TEMPERATURE, 6)6 388.2 458.2 0.847 0.396934
## poly(HUMIDITY, 4)1 8108.6 1538.5 5.270 1.40e-07 ***
## poly(HUMIDITY, 4)2 -7946.3 497.4 -15.977 < 2e-16 ***
## poly(HUMIDITY, 4)3 367.7 483.1 0.761 0.446703
## poly(HUMIDITY, 4)4 -2632.2 477.2 -5.516 3.60e-08 ***
## poly(RAINFALL, 2)1 -87183.4 22252.0 -3.918 9.02e-05 ***
## poly(RAINFALL, 2)2 1059.9 532.5 1.990 0.046592 *
## RAINFALL NA NA NA NA
## HUMIDITY NA NA NA NA
## TEMPERATURE NA NA NA NA
## RAINFALL:HUMIDITY 30974.2 8112.5 3.818 0.000136 ***
## HUMIDITY:TEMPERATURE -2773.9 160.2 -17.314 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 452.3 on 6757 degrees of freedom
## Multiple R-squared: 0.5086, Adjusted R-squared: 0.5076
## F-statistic: 499.5 on 14 and 6757 DF, p-value: < 2.2e-16
lm_poly_interaction_pred <- predict(lm_poly_interaction, newdata = bike_test)
## Warning in predict.lm(lm_poly_interaction, newdata = bike_test): prediction
## from a rank-deficient fit may be misleading
test_results_poly_interaction <- data.frame(PREDICTION = lm_poly_interaction_pred, TRUTH=bike_test$RENTED_BIKE_COUNT)
#model performance (improved model)
summary(lm_poly_interaction)$r.squared #0.5086
## [1] 0.5085865
rmse_poly_interaction <- rmse(test_results_poly_interaction, TRUTH, PREDICTION )
rmse_poly_interaction #442
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 442.
Adding regularization to overcome the issue of complicated, difficult and overfitting. We can use a more advanced and generalized glmnet engine. It provides a generalized linear model with Lasso, Ridge, and Elastic Net regularizations.
pacman::p_load(glmnet, yardstick)
Creating the model prediction function and model evaluation function
#prediction function
model_prediction <- function(lm_model, test_data) {
results <- lm_model %>%
predict(new_data=test_data) %>%
mutate(TRUTH=test_data$RENTED_BIKE_COUNT)
results[results<0] <-0
return(results)
}
#model evaluation function
model_evaluation <- function(results) {
rmse = rmse(results, truth=TRUTH, estimate=.pred)
rsq = rsq(results, truth=TRUTH, estimate=.pred)
print(rmse)
print(rsq)
}
#Use grid to define the best penalty (lambda)
penalty_value <- 10^seq(-4,4, by = 0.5) #penalty values ranging from 10^-4 to 10^4
x = as.matrix(bike_train[,-1]) #define a matrix for CV
y= bike_train$RENTED_BIKE_COUNT
We can use cross-validation to define the lambda with 10-fold validation
cv_ridge <- cv.glmnet(x,y, alpha = 0, lambda = penalty_value, nfolds = 10)
cv_lasso <- cv.glmnet(x,y, alpha = 1, lambda = penalty_value, nfolds = 10)
cv_elasticnet <- cv.glmnet(x,y, alpha = 0.5, lambda = penalty_value, nfolds = 10)
#glmnet spec (using CV above, best optimal is 0.3 and 0.5)
glmnet_spec <- linear_reg(penalty = 0.3, mixture=0.5) %>%
set_engine("glmnet") %>%
set_mode("regression")
The suggested performance requirements for the best model includes: The RMSE should be less than 330 (rougly 10% of the max value in test dataset) R-squared should be greater than 0.72
#Fit the model (best model)
glmnet_best <- glmnet_spec %>%
fit(RENTED_BIKE_COUNT ~ RAINFALL*HUMIDITY*TEMPERATURE + SPRING*SUMMER*HOLIDAY*HOUR_18* HOUR_19* HOUR_8* HOUR_21* HOUR_20* HOUR_4 +
poly(RAINFALL, 8) + poly(HUMIDITY, 5) + poly(TEMPERATURE, 5) + poly(DEW_POINT_TEMPERATURE, 5) + poly(SOLAR_RADIATION, 5) + poly(SNOWFALL,5) +
SPRING + SUMMER + HOLIDAY + WIND_SPEED + VISIBILITY +
HOUR_18+ HOUR_4 + HOUR_5 + HOUR_3 + HOUR_19 + HOUR_11 + HOUR_8 + HOUR_21 + HOUR_10 + HOUR_2 + HOUR_20,
data = bike_train)
glmnet_best_pred <- model_prediction(glmnet_best, bike_test)
model_evaluation(glmnet_best_pred) #rsq = 0.783, rmse = 296
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 315.
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.753
glmnet_best_rsq = rsq(glmnet_best_pred, truth = TRUTH, estimate = .pred)
glmnet_best_rmse = rmse(glmnet_best_pred, truth = TRUTH, estimate = .pred)
# Fit the model (with top 10 coeficients)
glmnet_top10 <- glmnet_spec %>%
fit(RENTED_BIKE_COUNT ~ RAINFALL*HUMIDITY*TEMPERATURE + SPRING*SUMMER + SUMMER +
poly(RAINFALL, 6) + poly(HUMIDITY, 5) + poly(TEMPERATURE, 5) + poly(DEW_POINT_TEMPERATURE,5) +
HOUR_18 + HOUR_4 + HOUR_5 +HOUR_3,
data=bike_train
)
glmnet_top10_pred <- model_prediction(glmnet_top10, bike_test)
model_evaluation(glmnet_top10_pred) #rsq = 0.640, rmse = 381 (not good)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 385.
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.631
glmnet_top10_rsq = rsq(glmnet_top10_pred, truth = TRUTH, estimate = .pred)
glmnet_top10_rmse = rmse(glmnet_top10_pred, truth = TRUTH, estimate = .pred)
# Fit Ridge Regression
glmnet_ridge <- glmnet(x,y, alpha=0)
glmnet_ridge_pred <- predict(glmnet_ridge, s=cv_ridge$lambda.min,
newx = as.matrix(bike_test[,-1]))
ridge_rmse = sqrt(mean( (bike_test[,1] - glmnet_ridge_pred)^2))
ridge_rmse #365.06
## [1] 365.0562
ridge_mse = mean( (bike_test[,1] - glmnet_ridge_pred)^2)
ridge_rsq = 1 - ridge_mse / var(bike_test[,1])
ridge_rsq #0.667
## [1] 0.6674863
# Fit Lasso
glmnet_lasso <- glmnet(x,y,alpha=1)
glm_lasso_pred <- predict(glmnet_lasso, s=cv_lasso$lambda.min,
newx=as.matrix(bike_test[,-1]))
lasso_rmse = sqrt(mean( (bike_test[,1] - glm_lasso_pred)^2))
lasso_rmse #364.0492
## [1] 364.0492
lasso_mse = mean( (bike_test[,1] - glm_lasso_pred)^2)
lasso_rsq = 1 - lasso_mse/var(bike_test[,1])
lasso_rsq #0.6693
## [1] 0.6693181
# Fit Elastic Net
glmnet_elasticnet <- glmnet(x,y,alpha=0.7)
glm_elasticnet_pred <- predict(glmnet_elasticnet, s=cv_elasticnet$lambda.min,
newx=as.matrix(bike_test[,-1]))
elasticnet_rmse = sqrt(mean( (bike_test[,1] - glm_elasticnet_pred)^2))
elasticnet_rmse #364.0468
## [1] 364.2295
elasticnet_mse = mean( (bike_test[,1] - glm_elasticnet_pred)^2)
elasticnet_rsq = 1 - elasticnet_mse/var(bike_test[,1])
elasticnet_rsq #0.6693
## [1] 0.6689906
To compare the performance of models built previously, creating a group bar chart for rsq and rmse
#Create data frame for group bar chart
model <- c(rep("glmnet_best",2), rep("glmnet_top10",2),
rep("glmnet_ridge",2), rep("glmnet_lasso",2), rep("glmnet_elasticnet",2))
stat <- rep(c("RSQ", "RMSE"),5)
value <- c(glmnet_best_rsq$.estimate, glmnet_best_rmse$.estimate,
glmnet_top10_rsq$.estimate, glmnet_top10_rmse$.estimate,
ridge_rsq, ridge_rmse,
lasso_rsq,lasso_rmse,
elasticnet_rsq, elasticnet_rmse)
model_df <- data.frame(model, stat, value)
print(model_df)
## model stat value
## 1 glmnet_best RSQ 0.7534620
## 2 glmnet_best RMSE 315.1054426
## 3 glmnet_top10 RSQ 0.6313216
## 4 glmnet_top10 RMSE 385.3212657
## 5 glmnet_ridge RSQ 0.6674863
## 6 glmnet_ridge RMSE 365.0561548
## 7 glmnet_lasso RSQ 0.6693181
## 8 glmnet_lasso RMSE 364.0492378
## 9 glmnet_elasticnet RSQ 0.6689906
## 10 glmnet_elasticnet RMSE 364.2294665
# Create group bar chart for rsq and rmse (model evaluation.png)
model_df %>%
ggplot(aes(fill=stat, x=model, y=value)) +
geom_bar(position="dodge", stat="identity")
For the best model glmnet_best, creating a Q-Q chart by plotting the distribution difference between the predictions generated by your best model and the true values on the test dataset.
# Create a Q-Q chart for best model: glmnet_best (Q-Q chart.png)
glmnet_best_pred %>%
ggplot() +
stat_qq(aes(sample=TRUTH), color='green') +
stat_qq(aes(sample=.pred), color='red')
In conclusion, the model using top 10 coefficients does not have good performance. While Ridge Regression, Lasso and Elastic Net Regularization perform better than the models using polynomials and interaction terms, it is still not the best model to use.
The number of bike rented depends on multiple variables, including weather, seasons and hours. The best statistical learning model to use for prediction is linear regression with penalty = 0.3, mixture=0.5.