Importing required libraries
library(rvest)
library(tidyverse)
library(httr)
library(jsonlite)
Scraping the required table
url <- "https://en.wikipedia.org/wiki/List_of_bicycle-sharing_systems"
html <- read_html(url)
bike_table <- html |>
html_element("table") |>
html_table()
Previewing the scraped data
head(bike_table, 10)
Getting a summary of information contained in the table
summary(bike_table)
## Country City Name System
## Length:558 Length:558 Length:558 Length:558
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Operator Launched Discontinued Stations
## Length:558 Length:558 Length:558 Length:558
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Bicycles Daily ridership
## Length:558 Length:558
## Class :character Class :character
## Mode :character Mode :character
Saving the scraped table
write_csv(bike_table, "raw_bike_sharing_systems.csv")
get_weather_forecast_by_cities <- function(cities){
df <- data.frame()
for (city in cities) {
url <- 'https://api.openweathermap.org/data/2.5/forecast'
forecast_query <- list(q = city, appid = "cd1c417f157352ca399e1dbb0e1ccc9f", units = "metric")
response <- GET(url = url, query = forecast_query)
json_result <- content(response, as = "text")
city_df <- json_result |>
fromJSON(flatten = TRUE) |>
as.data.frame() |>
unnest_wider(list.weather) |>
mutate(season = "autumn") |>
select(city = city.name, weather = main, visibility = list.visibility, temp = list.main.temp,
temp_min = list.main.temp_min, temp_max = list.main.temp_max, pressure = list.main.pressure,
humidity = list.main.humidity, wind_speed = list.wind.speed, wind_deg = list.wind.deg,
forecast_datetime = list.dt_txt, season)
df <- rbind(df, city_df)
}
return(df)
}
Try the function created
city_names <- c("Seoul", paste("Washington", "D.C."), "Paris", "Suzhou")
cities_weather_df <- get_weather_forecast_by_cities(cities = city_names)
head(cities_weather_df, 10)
Saving weather data
write_csv(cities_weather_df, "raw_cities_weather_forecast.csv")
Download some general city information such as name and locations
url <- "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_worldcities.csv"
# download the file
download.file(url, destfile = "raw_worldcities.csv")
url <- "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0321EN-SkillsNetwork/labs/datasets/raw_seoul_bike_sharing.csv"
# download the file
download.file(url, destfile = "raw_seoul_bike_sharing.csv")
library(janitor)
dataset_list <- c('raw_bike_sharing_systems.csv', 'raw_seoul_bike_sharing.csv', 'raw_cities_weather_forecast.csv', 'raw_worldcities.csv')
Cleaning column names
for (dataset_name in dataset_list) {
dataset <- read_csv(dataset_name)
dataset <- dataset |>
clean_names()
column_names <- names(dataset)
names(dataset) <- str_to_upper(column_names)
write_csv(dataset, dataset_name)
}
printing summary of datasets
for (dataset_name in dataset_list) {
dataset <- read_csv(dataset_name)
print(summary(dataset))
}
## COUNTRY CITY NAME SYSTEM
## Length:558 Length:558 Length:558 Length:558
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## OPERATOR LAUNCHED DISCONTINUED STATIONS
## Length:558 Length:558 Length:558 Length:558
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## BICYCLES DAILY_RIDERSHIP
## Length:558 Length:558
## Class :character Class :character
## Mode :character Mode :character
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE
## Length:8760 Min. : 2.0 Min. : 0.00 Min. :-17.80
## Class :character 1st Qu.: 214.0 1st Qu.: 5.75 1st Qu.: 3.40
## Mode :character Median : 542.0 Median :11.50 Median : 13.70
## Mean : 729.2 Mean :11.50 Mean : 12.87
## 3rd Qu.:1084.0 3rd Qu.:17.25 3rd Qu.: 22.50
## Max. :3556.0 Max. :23.00 Max. : 39.40
## NA's :295 NA's :11
## HUMIDITY WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 940 1st Qu.: -4.700
## Median :57.00 Median :1.500 Median :1698 Median : 5.100
## Mean :58.23 Mean :1.725 Mean :1437 Mean : 4.074
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 14.800
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
##
## SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8760
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5691 Mean : 0.1487 Mean :0.07507
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## HOLIDAY FUNCTIONING_DAY
## Length:8760 Length:8760
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## CITY WEATHER VISIBILITY TEMP
## Length:160 Length:160 Min. : 4862 Min. : 2.67
## Class :character Class :character 1st Qu.:10000 1st Qu.:12.55
## Mode :character Mode :character Median :10000 Median :16.55
## Mean : 9968 Mean :15.91
## 3rd Qu.:10000 3rd Qu.:19.38
## Max. :10000 Max. :27.12
## TEMP_MIN TEMP_MAX PRESSURE HUMIDITY WIND_SPEED
## Min. : 2.67 Min. : 2.67 Min. : 985 Min. :30.0 Min. : 0.420
## 1st Qu.:12.55 1st Qu.:12.55 1st Qu.:1009 1st Qu.:56.0 1st Qu.: 2.200
## Median :16.55 Median :16.55 Median :1017 Median :70.0 Median : 3.295
## Mean :15.89 Mean :15.94 Mean :1014 Mean :67.2 Mean : 3.721
## 3rd Qu.:19.32 3rd Qu.:19.40 3rd Qu.:1021 3rd Qu.:80.0 3rd Qu.: 4.850
## Max. :27.12 Max. :27.12 Max. :1033 Max. :96.0 Max. :11.070
## WIND_DEG FORECAST_DATETIME SEASON
## Min. : 4.0 Min. :2023-10-29 03:00:00 Length:160
## 1st Qu.:148.2 1st Qu.:2023-10-30 08:15:00 Class :character
## Median :192.0 Median :2023-10-31 13:30:00 Mode :character
## Mean :192.4 Mean :2023-10-31 13:30:00
## 3rd Qu.:228.5 3rd Qu.:2023-11-01 18:45:00
## Max. :352.0 Max. :2023-11-03 00:00:00
## CITY CITY_ASCII LAT LNG
## Length:26569 Length:26569 Min. :-54.93 Min. :-179.5900
## Class :character Class :character 1st Qu.: 27.92 1st Qu.: -78.7794
## Mode :character Mode :character Median : 40.22 Median : -0.7689
## Mean : 33.10 Mean : -11.3639
## 3rd Qu.: 47.99 3rd Qu.: 29.6833
## Max. : 81.72 Max. : 179.3667
##
## COUNTRY ISO2 ISO3 ADMIN_NAME
## Length:26569 Length:26569 Length:26569 Length:26569
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## CAPITAL POPULATION ID
## Length:26569 Min. : 0 Min. :1.004e+09
## Class :character 1st Qu.: 9246 1st Qu.:1.277e+09
## Mode :character Median : 20080 Median :1.643e+09
## Mean : 162346 Mean :1.556e+09
## 3rd Qu.: 59369 3rd Qu.:1.840e+09
## Max. :37977000 Max. :1.934e+09
## NA's :973
# First load the dataset
bike_sharing_df <- read_csv("raw_bike_sharing_systems.csv")
# Print its head
head(bike_sharing_df)
Selecting required columns
# Select the four columns
sub_bike_sharing_df <- bike_sharing_df %>% select(COUNTRY, CITY, SYSTEM, BICYCLES)
Checking the classes of the columns selected
sub_bike_sharing_df %>%
summarize_all(class) %>%
gather(variable, class)
Checking for strings in the bicycle column which is expected to be numeric
# grepl searches a string for non-digital characters, and returns TRUE or FALSE
# if it finds any non-digital characters, then the bicyle column is not purely numeric
find_character <- function(strings){
grepl("[^0-9]", strings)
}
sub_bike_sharing_df %>%
select(BICYCLES) %>%
filter(find_character(BICYCLES)) %>%
slice(1:10)
Checking the other columns for unwanted strings
# Define a 'reference link' character class,
# `[A-z0-9]` means at least one character
# `\\[` and `\\]` means the character is wrapped by [], such as for [12] or [abc]
ref_pattern <- "\\[.+\\]"
find_reference_pattern <- function(string){
grepl(ref_pattern, string)
}
Checking for strings in COUNTRY column
# Check whether the COUNTRY column has any reference links
sub_bike_sharing_df %>%
select(COUNTRY) %>%
filter(find_reference_pattern(COUNTRY)) %>%
slice(1:10)
Checking for strings in CITY column
# Check whether the CITY column has any reference links
sub_bike_sharing_df %>%
select(CITY) %>%
filter(find_reference_pattern(CITY)) %>%
slice(1:10)
Checking for regular expression in SYSTEM
# Check whether the System column has any reference links
sub_bike_sharing_df %>%
select(SYSTEM) %>%
filter(find_reference_pattern(SYSTEM)) %>%
slice(1:10)
Function to remove undesirable strings
# remove reference link
remove_ref <- function(strings) {
ref_pattern <- "\\[.+\\]"
strings <- str_replace_all(strings, ref_pattern, "")
return(strings)
}
Removing unwanted strings
sub_bike_sharing_df <- sub_bike_sharing_df |>
mutate(COUNTRY = remove_ref(COUNTRY),
CITY = remove_ref(CITY),
SYSTEM = remove_ref(SYSTEM))
Checking to see if all references have been removed
result <- sub_bike_sharing_df |>
select(CITY, SYSTEM, COUNTRY) |>
filter(find_reference_pattern(CITY) | find_reference_pattern(SYSTEM) | find_reference_pattern(COUNTRY))
result
# Extract the first number
extract_num <- function(column){
# Define a digital pattern
digitals_pattern <- "[0-9]*"
column <- str_extract(column, pattern = digitals_pattern)
return(column)
}
sub_bike_sharing_df <- sub_bike_sharing_df |>
mutate(BICYCLES = extract_num(BICYCLES))
sub_bike_sharing_df$BICYCLES <- as.numeric(sub_bike_sharing_df$BICYCLES)
Checking results for BICYCLES
summary(sub_bike_sharing_df$BICYCLES)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4 70 300 1832 1000 78000 100
Saving our processed data
write_csv(sub_bike_sharing_df, "bike_sharing_systems.csv")
Loading the required data
bike_sharing_df <- read_csv("raw_seoul_bike_sharing.csv")
First take a look at the dataset
summary(bike_sharing_df)
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE
## Length:8760 Min. : 2.0 Min. : 0.00 Min. :-17.80
## Class :character 1st Qu.: 214.0 1st Qu.: 5.75 1st Qu.: 3.40
## Mode :character Median : 542.0 Median :11.50 Median : 13.70
## Mean : 729.2 Mean :11.50 Mean : 12.87
## 3rd Qu.:1084.0 3rd Qu.:17.25 3rd Qu.: 22.50
## Max. :3556.0 Max. :23.00 Max. : 39.40
## NA's :295 NA's :11
## HUMIDITY WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 940 1st Qu.: -4.700
## Median :57.00 Median :1.500 Median :1698 Median : 5.100
## Mean :58.23 Mean :1.725 Mean :1437 Mean : 4.074
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 14.800
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
##
## SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8760
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5691 Mean : 0.1487 Mean :0.07507
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## HOLIDAY FUNCTIONING_DAY
## Length:8760 Length:8760
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
dim(bike_sharing_df)
## [1] 8760 14
Dropping NAs in RENTED_BIKE_COUNT
bike_sharing_df <- bike_sharing_df |>
drop_na(RENTED_BIKE_COUNT)
dim(bike_sharing_df)
## [1] 8465 14
Checking for NAs in TEMPERATURE column
missing_temp <- bike_sharing_df |>
filter(is.na(TEMPERATURE))
missing_temp
Calculating the mean summer temperature in Seoul
summer_temp <- bike_sharing_df |>
filter(SEASONS == "Summer") |>
select(TEMPERATURE)
avg_summer_temp <- mean(summer_temp$TEMPERATURE, na.rm = TRUE)
avg_summer_temp
## [1] 26.58771
Replacing missing values in TEMPERATURE with the mean calculated
bike_sharing_df <- bike_sharing_df |>
replace_na(list(TEMPERATURE = avg_summer_temp))
summary(bike_sharing_df)
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE
## Length:8465 Min. : 2.0 Min. : 0.00 Min. :-17.80
## Class :character 1st Qu.: 214.0 1st Qu.: 6.00 1st Qu.: 3.00
## Mode :character Median : 542.0 Median :12.00 Median : 13.50
## Mean : 729.2 Mean :11.51 Mean : 12.77
## 3rd Qu.:1084.0 3rd Qu.:18.00 3rd Qu.: 22.70
## Max. :3556.0 Max. :23.00 Max. : 39.40
## HUMIDITY WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 935 1st Qu.: -5.100
## Median :57.00 Median :1.500 Median :1690 Median : 4.700
## Mean :58.15 Mean :1.726 Mean :1434 Mean : 3.945
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 15.200
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
## SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8465
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5679 Mean : 0.1491 Mean :0.07769
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
## HOLIDAY FUNCTIONING_DAY
## Length:8465 Length:8465
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Saving the processed data
write_csv(bike_sharing_df, "seoul_bike_sharing.csv")
Changing HOUR class into character
bike_sharing_df <- read_csv("seoul_bike_sharing.csv")
bike_sharing_df <- bike_sharing_df |>
mutate(HOUR = as.character(HOUR))
Changing character variables into indicator variables
bike_sharing_df <- bike_sharing_df |>
mutate(dummy = 1) |>
pivot_wider(
names_from = HOUR,
values_from = dummy,
values_fill = 0,
names_prefix = "HOUR_"
)
bike_sharing_df <- bike_sharing_df |>
mutate(dummy = 1) |>
pivot_wider(
names_from = SEASONS,
values_from = dummy,
values_fill = 0,
)
bike_sharing_df <- bike_sharing_df |>
mutate(dummy = 1) |>
pivot_wider(
names_from = HOLIDAY,
values_from = dummy,
values_fill = 0
)
bike_sharing_df <- bike_sharing_df |>
mutate(dummy = 1) |>
pivot_wider(
names_from = FUNCTIONING_DAY,
values_from = dummy,
values_fill = 0,
names_prefix = "FUNCTIONING_DAY_"
)
Saving dataset
write_csv(bike_sharing_df, "seoul_bike_sharing_converted.csv")
Creating a function to normalize data
normalize_func <- function(column){
column <- (column - min(column))/(max(column) - min(column))
return(column)
}
Normalizing columns
bike_sharing_df <- bike_sharing_df |>
mutate(
RENTED_BIKE_COUNT = normalize_func(RENTED_BIKE_COUNT),
TEMPERATURE = normalize_func(TEMPERATURE),
HUMIDITY = normalize_func(HUMIDITY),
WIND_SPEED = normalize_func(WIND_SPEED),
VISIBILITY = normalize_func(VISIBILITY),
DEW_POINT_TEMPERATURE = normalize_func(DEW_POINT_TEMPERATURE),
SOLAR_RADIATION = normalize_func(SOLAR_RADIATION),
RAINFALL = normalize_func(RAINFALL),
SNOWFALL = normalize_func(SNOWFALL)
)
Summarizing data
summary(bike_sharing_df)
## DATE RENTED_BIKE_COUNT TEMPERATURE HUMIDITY
## Length:8465 Min. :0.00000 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:0.05965 1st Qu.:0.3636 1st Qu.:0.4286
## Mode :character Median :0.15194 Median :0.5472 Median :0.5816
## Mean :0.20460 Mean :0.5345 Mean :0.5933
## 3rd Qu.:0.30445 3rd Qu.:0.7080 3rd Qu.:0.7551
## Max. :1.00000 Max. :1.0000 Max. :1.0000
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.1216 1st Qu.:0.4602 1st Qu.:0.4412 1st Qu.:0.000000
## Median :0.2027 Median :0.8429 Median :0.6107 Median :0.002841
## Mean :0.2332 Mean :0.7131 Mean :0.5977 Mean :0.161326
## 3rd Qu.:0.3108 3rd Qu.:1.0000 3rd Qu.:0.7924 3rd Qu.:0.264205
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000000
## RAINFALL SNOWFALL HOUR_0 HOUR_1
## Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000000 Median :0.000000 Median :0.00000 Median :0.00000
## Mean :0.004261 Mean :0.008828 Mean :0.04158 Mean :0.04158
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000000 Max. :1.000000 Max. :1.00000 Max. :1.00000
## HOUR_2 HOUR_3 HOUR_4 HOUR_5
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.04158 Mean :0.04158 Mean :0.04158 Mean :0.04158
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## HOUR_6 HOUR_7 HOUR_8 HOUR_9
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.04158 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## HOUR_10 HOUR_11 HOUR_12 HOUR_13
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## HOUR_14 HOUR_15 HOUR_16 HOUR_17
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## HOUR_18 HOUR_19 HOUR_20 HOUR_21
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## HOUR_22 HOUR_23 Winter Spring
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.2552 Mean :0.2552
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Summer Autumn No Holiday Holiday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.2608 Mean :0.2288 Mean :0.9518 Mean :0.0482
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## FUNCTIONING_DAY_Yes
## Min. :1
## 1st Qu.:1
## Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
Saving dataset
write_csv(bike_sharing_df, "seoul_bike_sharing_converted_normalized.csv")
# Dataset list
dataset_list <- c('seoul_bike_sharing.csv', 'seoul_bike_sharing_converted.csv', 'seoul_bike_sharing_converted_normalized.csv')
for (dataset_name in dataset_list){
# Read dataset
dataset <- read_csv(dataset_name)
# Standardized its columns:
# Convert all columns names to uppercase
names(dataset) <- toupper(names(dataset))
# Replace any white space separators by underscore, using str_replace_all function
names(dataset) <- str_replace_all(names(dataset), " ", "_")
# Save the dataset back
write.csv(dataset, dataset_name, row.names=FALSE)
}
Loading the required library
library(RMySQL)
Creating connection
con <- dbConnect(MySQL(),
user = "root",
password = "Ososo@2020",
dbname = "ibm_final_project_dataset",
host = "localhost")
dbSendQuery(con, "SET GLOBAL local_infile = TRUE")
## <MySQLResult:1973187184,0,0>
Loading datasets
world_cities <- read_csv("raw_worldcities.csv")
bike_sharing_systems <- read_csv("bike_sharing_systems.csv")
cities_weather_forecast <- read_csv("raw_cities_weather_forecast.csv")
seoul_bike_sharing <- read_csv("seoul_bike_sharing.csv")
Saving data to MySQL
dbWriteTable(con, "WORLD_CITIES", world_cities)
## [1] TRUE
dbWriteTable(con, "BIKE_SHARING_SYSTEMS", bike_sharing_systems)
## [1] TRUE
dbWriteTable(con, "CITIES_WEATHER_FORECAST", cities_weather_forecast)
## [1] TRUE
dbWriteTable(con, "SEOUL_BIKE_SHARING", seoul_bike_sharing)
## [1] TRUE
Determine how many records are in the seoul_bike_sharing dataset. Solution 1
query <- "SELECT count(*) AS NUMBER_OF_RECORDS
FROM seoul_bike_sharing;"
dbGetQuery(con, query)
Determine how many hours had non-zero rented bike count. Solution 2
query <- "SELECT count(HOUR) NONE_ZERO_HOURS
FROM ibm_final_project_dataset.seoul_bike_sharing
WHERE RENTED_BIKE_COUNT != 0;"
dbGetQuery(con, query)
Query the the weather forecast for Seoul over the next 3 hours. Recall that the records in the CITIES_WEATHER_FORECAST dataset are 3 hours apart, so we just need the first record from the query.
Solution 3
query <- "SELECT *
FROM ibm_final_project_dataset.cities_weather_forecast
LIMIT 1;"
dbGetQuery(con, query)
Find which seasons are included in the Seoul bike sharing dataset. Solution 4
query <- "SELECT DISTINCT SEASONS
FROM ibm_final_project_dataset.seoul_bike_sharing;"
dbGetQuery(con, query)
Find the first and last dates in the Seoul Bike Sharing dataset. Solution 5
query <- "SELECT MIN(DATE) AS FIRST_DATE, MAX(DATE) AS LAST_DATE
FROM ibm_final_project_dataset.seoul_bike_sharing;"
dbGetQuery(con, query)
determine which date and hour had the most bike rentals. Solution 6
query <- "SELECT DATE, HOUR
FROM seoul_bike_sharing
WHERE RENTED_BIKE_COUNT = (SELECT MAX(RENTED_BIKE_COUNT)
FROM seoul_bike_sharing)"
dbGetQuery(con, query)
Determine the average hourly temperature and the average number of bike rentals per hour over each season. List the top ten results by average bike count. Solution 7
query <- "SELECT SEASONS, HOUR, ROUND(AVG(TEMPERATURE), 2) AS HOURLY_TEMPERATURE, ROUND(AVG(RENTED_BIKE_COUNT), 2) AS HOURLY_RENTED_BIKE_COUNT
FROM seoul_bike_sharing
GROUP BY SEASONS, HOUR
ORDER BY AVG(RENTED_BIKE_COUNT) DESC
LIMIT 10;"
dbGetQuery(con, query)
Find the average hourly bike count during each season. Also include the minimum, maximum, and standard deviation of the hourly bike count for each season.
Solution 8
query <- "SELECT SEASONS, HOUR, ROUND(AVG(TEMPERATURE), 2) AS HOURLY_TEMPERATURE,
ROUND(AVG(RENTED_BIKE_COUNT), 2) AS HOURLY_RENTED_BIKE_COUNT, MAX(RENTED_BIKE_COUNT) AS HIGHEST_BIKE_COUNT,
MIN(RENTED_BIKE_COUNT) AS LOWEST_BIKE_COUNT,
ROUND(STD(RENTED_BIKE_COUNT), 2) AS STD_RENTED_BIKE_COUNT
FROM seoul_bike_sharing
GROUP BY SEASONS, HOUR"
dbGetQuery(con, query)
Consider the weather over each season. On average, what were the TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, and SNOWFALL per season? Include the average bike count as well , and rank the results by average bike count so you can see if it is correlated with the weather at all.
Solution 9
query <- "SELECT SEASONS, ROUND(AVG(TEMPERATURE), 2) AS AVG_TEMPERATURE,
ROUND(AVG(HUMIDITY), 2) AS AVG_HUMIDITY, ROUND(AVG(WIND_SPEED), 2) AS AVG_WIND_SPEED,
ROUND(AVG(VISIBILITY), 2) AS AVG_VISIBILITY, ROUND(AVG(DEW_POINT_TEMPERATURE), 2) AS AVG_DEW_POINT_TEMPERATURE,
ROUND(AVG(SOLAR_RADIATION), 2) AS AVG_SOLAR_RADIATION, ROUND(AVG(RAINFALL), 2) AS AVG_RAINFALL,
ROUND(AVG(SNOWFALL), 2) AS AVG_SNOWFALL, ROUND(AVG(RENTED_BIKE_COUNT), 2) AS AVG_BIKE_COUNT
FROM seoul_bike_sharing
GROUP BY SEASONS
ORDER BY AVG(RENTED_BIKE_COUNT) DESC;"
dbGetQuery(con, query)
Use an implicit join across the WORLD_CITIES and the BIKE_SHARING_SYSTEMS tables to determine the total number of bikes available in Seoul, plus the following city information about Seoul: CITY, COUNTRY, LAT, LON, POPULATION, in a single view. Notice that in this case, the CITY column will work for the WORLD_CITIES table, but in general you would have to use the CITY_ASCII column.
Solution 10
query <- 'SELECT W.CITY, B.COUNTRY, W.LAT, W.LNG, W.POPULATION, B.BICYCLES AS TOTAL_BIKES
FROM world_cities W, bike_sharing_systems B
WHERE W.CITY_ASCII = B.CITY AND W.CITY = "Seoul";'
dbGetQuery(con, query)
Find all cities with total bike counts between 15000 and 20000. Return the city and country names, plus the coordinates (LAT, LNG), population, and number of bicycles for each city. Later we will ask you to visualize these similar cities on leaflet, with some weather data.
Solution 11
query <- 'SELECT W.CITY, B.COUNTRY, W.LAT, W.LNG, W.POPULATION, B.BICYCLES AS TOTAL_BIKES
FROM world_cities W, bike_sharing_systems B
WHERE W.CITY = B.CITY AND B.BICYCLES BETWEEN 15000 AND 20000'
dbGetQuery(con, query)
dbDisconnect(con)
## [1] TRUE
Ensure you read DATE as type character.
Solution 1
seoul_bike_sharing <- read_csv("seoul_bike_sharing.csv", col_types = cols("DATE" = col_character()))
Use the format of the data, namely “%d/%m/%Y”.
Solution 2
seoul_bike_sharing$DATE <- dmy(seoul_bike_sharing$DATE)
class(seoul_bike_sharing$DATE)
## [1] "Date"
Also, coerce its levels to be an ordered sequence. This will ensure your visualizations correctly utilize HOURS as a discrete variable with the expected ordering.
Solution 3
seoul_bike_sharing$HOUR <- as_factor(seoul_bike_sharing$HOUR)
levels(seoul_bike_sharing$HOUR)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [16] "15" "16" "17" "18" "19" "20" "21" "22" "23"
Check the structure of the dataframe
str(seoul_bike_sharing)
## spc_tbl_ [8,465 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ DATE : Date[1:8465], format: "2017-12-01" "2017-12-01" ...
## $ RENTED_BIKE_COUNT : num [1:8465] 254 204 173 107 78 100 181 460 930 490 ...
## $ HOUR : Factor w/ 24 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ TEMPERATURE : num [1:8465] -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
## $ HUMIDITY : num [1:8465] 37 38 39 40 36 37 35 38 37 27 ...
## $ WIND_SPEED : num [1:8465] 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
## $ VISIBILITY : num [1:8465] 2000 2000 2000 2000 2000 ...
## $ DEW_POINT_TEMPERATURE: num [1:8465] -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
## $ SOLAR_RADIATION : num [1:8465] 0 0 0 0 0 0 0 0 0.01 0.23 ...
## $ RAINFALL : num [1:8465] 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOWFALL : num [1:8465] 0 0 0 0 0 0 0 0 0 0 ...
## $ SEASONS : chr [1:8465] "Winter" "Winter" "Winter" "Winter" ...
## $ HOLIDAY : chr [1:8465] "No Holiday" "No Holiday" "No Holiday" "No Holiday" ...
## $ FUNCTIONING_DAY : chr [1:8465] "Yes" "Yes" "Yes" "Yes" ...
## - attr(*, "spec")=
## .. cols(
## .. DATE = col_character(),
## .. RENTED_BIKE_COUNT = col_double(),
## .. HOUR = col_double(),
## .. TEMPERATURE = col_double(),
## .. HUMIDITY = col_double(),
## .. WIND_SPEED = col_double(),
## .. VISIBILITY = col_double(),
## .. DEW_POINT_TEMPERATURE = col_double(),
## .. SOLAR_RADIATION = col_double(),
## .. RAINFALL = col_double(),
## .. SNOWFALL = col_double(),
## .. SEASONS = col_character(),
## .. HOLIDAY = col_character(),
## .. FUNCTIONING_DAY = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Checking for missing values
sum(is.na(seoul_bike_sharing))
## [1] 0
Now you are all set to take a look at some high level statistics of the seoul_bike_sharing dataset.
Use the base R summary() function to describe the seoul_bike_sharing dataset.
Solution 4
summary(seoul_bike_sharing)
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE
## Min. :2017-12-01 Min. : 2.0 7 : 353 Min. :-17.80
## 1st Qu.:2018-02-27 1st Qu.: 214.0 8 : 353 1st Qu.: 3.00
## Median :2018-05-28 Median : 542.0 9 : 353 Median : 13.50
## Mean :2018-05-28 Mean : 729.2 10 : 353 Mean : 12.77
## 3rd Qu.:2018-08-24 3rd Qu.:1084.0 11 : 353 3rd Qu.: 22.70
## Max. :2018-11-30 Max. :3556.0 12 : 353 Max. : 39.40
## (Other):6347
## HUMIDITY WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 935 1st Qu.: -5.100
## Median :57.00 Median :1.500 Median :1690 Median : 4.700
## Mean :58.15 Mean :1.726 Mean :1434 Mean : 3.945
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 15.200
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
##
## SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8465
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5679 Mean : 0.1491 Mean :0.07769
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## HOLIDAY FUNCTIONING_DAY
## Length:8465 Length:8465
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
Solution 5:
Number_of_holidays <- seoul_bike_sharing |>
filter(HOLIDAY == "Holiday") |>
count(HOLIDAY)
Number_of_holidays
Solution 6
holiday_percentage <- nrow(filter(seoul_bike_sharing, HOLIDAY == "Holiday"))/nrow(seoul_bike_sharing)*100
holiday_percentage
## [1] 4.819846
Solution 7
## A full year has 365 days
## Each day has 24 hours
Expected_number_of_records <- 365*24
Expected_number_of_records
## [1] 8760
Solution 8
## Number of possible records should be 2: "Yes" and "No"
Let’s calculate some seasonally aggregated measures to help build some more context.
Solution 9
seoul_bike_sharing |>
select(SEASONS, RAINFALL, SNOWFALL) |>
group_by(SEASONS) |>
summarise(total_rainfall = sum(RAINFALL), total_snowfall = sum(SNOWFALL))
Tune the opacity using the alpha parameter such that the points don’t obscure each other too much.
Solution 10
ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT))+
geom_point(alpha = 0.2)+
labs(title = "RENTED_BIKE_COUNT vs DATE")
Solution 11
ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT, color = HOUR))+
geom_point(alpha = 0.5)+
labs(title = "RENTED_BIKE_COUNT vs DATE")
Normalize the histogram so the y axis represents ‘density’. This can be done by setting y=..density.. in the aesthetics of the histogram.
Solution 12
ggplot(seoul_bike_sharing, aes(x = RENTED_BIKE_COUNT))+
geom_histogram(aes(y = after_stat(density)), color = "black", fill = "white", bins = 25)+
geom_density(color = "red", alpha = 0.2)
Start with RENTED_BIKE_COUNT vs. TEMPERATURE, then generate four plots corresponding to the SEASONS by adding a facet_wrap() layer. Also, make use of colour and opacity to emphasize any patterns that emerge. Use HOUR as the color.
Solution 13
ggplot(seoul_bike_sharing, aes(x = TEMPERATURE, y = RENTED_BIKE_COUNT))+
geom_point(aes(color = HOUR), alpha = 0.2)+
facet_wrap(~ SEASONS)
Use facet_wrap to generate four plots corresponding to the seasons.
Solution 14
ggplot(seoul_bike_sharing, aes(x = HOUR, y = RENTED_BIKE_COUNT))+
geom_boxplot()+
facet_wrap(~ SEASONS)
Also, go ahead and plot the results if you wish.
Solution 15
seoul_bike_sharing |>
select(DATE, RAINFALL, SNOWFALL) |>
group_by(DATE) |>
summarise(DAILY_RAINFALL = sum(RAINFALL), DAILY_SNOWFALL = sum(SNOWFALL))
x <- seoul_bike_sharing |>
select(DATE, RAINFALL, SNOWFALL) |>
group_by(DATE) |>
summarise(DAILY_RAINFALL = sum(RAINFALL), DAILY_SNOWFALL = sum(SNOWFALL)) |>
pivot_longer(
cols = 2:3,
names_to = "fall",
values_to = "values"
) |>
ggplot(aes(x = values, color = fall))+
geom_bar(position = "dodge")
Solution 16
seoul_bike_sharing |>
select(DATE, RAINFALL, SNOWFALL) |>
group_by(DATE) |>
summarise(DAILY_RAINFALL = sum(RAINFALL), DAILY_SNOWFALL = sum(SNOWFALL)) |>
filter(DAILY_SNOWFALL != 0) |>
nrow()
## [1] 27
Loading library
library("tidymodels")
Loading dataset
bike_sharing_df <- read_csv("seoul_bike_sharing_converted_normalized.csv")
Removing DATE and FUNCTIONING_DAY
bike_sharing_df <- bike_sharing_df |>
select(-DATE, -FUNCTIONING_DAY_YES)
set.seed(1234)
data_split <- initial_split(bike_sharing_df, prop = 3/4)
train_data <- training(data_split)
test_data <- testing(data_split)
Specifying and fitting model
lm_spec <- linear_reg() |>
set_engine(engine = "lm")
lm_model_weather <- lm_spec |>
fit(RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL, data = train_data)
Model summary
print(lm_model_weather$fit)
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY +
## WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION +
## RAINFALL + SNOWFALL, data = data)
##
## Coefficients:
## (Intercept) TEMPERATURE HUMIDITY
## 0.043532 0.675221 -0.258408
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## 0.113806 0.003533 -0.089172
## SOLAR_RADIATION RAINFALL SNOWFALL
## -0.125170 -0.496346 0.089414
lm_model_all <- lm_spec |>
fit(RENTED_BIKE_COUNT ~ ., data = train_data)
Printing summary of model
print(lm_model_all$fit)
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ ., data = data)
##
## Coefficients:
## (Intercept) TEMPERATURE HUMIDITY
## 0.1543567 0.2202189 -0.2495020
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## 0.0089795 0.0061541 0.1683701
## SOLAR_RADIATION RAINFALL SNOWFALL
## 0.0779070 -0.5809335 0.0734309
## HOUR_0 HOUR_1 HOUR_2
## -0.0374527 -0.0620864 -0.0959539
## HOUR_3 HOUR_4 HOUR_5
## -0.1192121 -0.1379013 -0.1312685
## HOUR_6 HOUR_7 HOUR_8
## -0.0866424 0.0008297 0.0976841
## HOUR_9 HOUR_10 HOUR_11
## -0.0292089 -0.0960401 -0.0988159
## HOUR_12 HOUR_13 HOUR_14
## -0.0878305 -0.0830508 -0.0833568
## HOUR_15 HOUR_16 HOUR_17
## -0.0600850 -0.0227013 0.0567640
## HOUR_18 HOUR_19 HOUR_20
## 0.1944273 0.1179459 0.0923429
## HOUR_21 HOUR_22 HOUR_23
## 0.0964469 0.0672010 NA
## WINTER SPRING SUMMER
## -0.1010128 -0.0471677 -0.0452610
## AUTUMN NO_HOLIDAY HOLIDAY
## NA 0.0350095 NA
# test_results_weather for lm_model_weather model
test_results_weather <- lm_model_weather |>
predict(new_data = test_data) |>
mutate(truth = test_data$RENTED_BIKE_COUNT)
# test_results_all for lm_model_all
test_results_all <- lm_model_all |>
predict(new_data = test_data) |>
mutate(truth = test_data$RENTED_BIKE_COUNT)
Calculating rsq and rmse
rsq_weather <- rsq(test_results_weather, truth = truth, estimate = .pred)
rsq_all <- rsq(test_results_all, truth = truth, estimate = .pred)
rmse_weather <- rmse(test_results_weather, truth = truth, estimate = .pred)
rmse_all <- rmse(test_results_all, truth = truth, estimate = .pred)
rsq_weather
rsq_all
rmse_weather
rmse_all
Checking the coefficients
coefficients <- lm_model_all$fit$coefficients
Sorting coefficients
coefficients <- sort(abs(coefficients))
Plotting coefficients
plot_df <- data.frame(coefficients)
ggplot(plot_df, aes(x = coefficients, y = reorder(rownames(plot_df), coefficients)))+
geom_col()+
labs(title = "Coefficients Plot", y = "Coefficients", x = "Coefficient Value")
ggplot(data = train_data, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
geom_point()
One solution to handle such nonlinearity is using polynomial regression
by adding polynomial terms. As shown before, higher order polynomials
are better than the first order polynomial.
# Plot the higher order polynomial fits
ggplot(data=train_data, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, color="red") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), color="yellow") +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), color="green") +
geom_smooth(method = "lm", formula = y ~ poly(x, 6), color="blue")
Fitting a polynomial regression
lm_poly <- lm_spec |>
fit(RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 + HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 + HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 + HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 + HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER + AUTUMN + NO_HOLIDAY +HOLIDAY, data = train_data)
Printing model summary
summary(lm_poly$fit)
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) +
## poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE +
## SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 +
## HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 +
## HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 +
## HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 +
## HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER +
## AUTUMN + NO_HOLIDAY + HOLIDAY, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42651 -0.05859 -0.00099 0.05277 0.41498
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.635695 0.049825 12.759 < 2e-16 ***
## poly(TEMPERATURE, 6)1 15.894648 1.255325 12.662 < 2e-16 ***
## poly(TEMPERATURE, 6)2 -0.444996 0.144287 -3.084 0.002050 **
## poly(TEMPERATURE, 6)3 -2.676345 0.101437 -26.384 < 2e-16 ***
## poly(TEMPERATURE, 6)4 -1.435420 0.109327 -13.130 < 2e-16 ***
## poly(TEMPERATURE, 6)5 0.072596 0.099044 0.733 0.463605
## poly(TEMPERATURE, 6)6 0.524271 0.101200 5.181 2.28e-07 ***
## poly(HUMIDITY, 4)1 1.234317 0.596876 2.068 0.038684 *
## poly(HUMIDITY, 4)2 -2.813352 0.125085 -22.491 < 2e-16 ***
## poly(HUMIDITY, 4)3 -1.153864 0.111476 -10.351 < 2e-16 ***
## poly(HUMIDITY, 4)4 0.134791 0.124200 1.085 0.277840
## WIND_SPEED -0.005880 0.010343 -0.568 0.569753
## VISIBILITY -0.020170 0.005405 -3.732 0.000192 ***
## DEW_POINT_TEMPERATURE -0.664475 0.080973 -8.206 2.75e-16 ***
## SOLAR_RADIATION 0.067152 0.010848 6.190 6.38e-10 ***
## RAINFALL -0.369546 0.039318 -9.399 < 2e-16 ***
## SNOWFALL 0.094753 0.026761 3.541 0.000402 ***
## HOUR_0 -0.037590 0.008492 -4.426 9.75e-06 ***
## HOUR_1 -0.063245 0.008370 -7.556 4.74e-14 ***
## HOUR_2 -0.096212 0.008392 -11.465 < 2e-16 ***
## HOUR_3 -0.115251 0.008546 -13.487 < 2e-16 ***
## HOUR_4 -0.134147 0.008498 -15.785 < 2e-16 ***
## HOUR_5 -0.124819 0.008385 -14.885 < 2e-16 ***
## HOUR_6 -0.082217 0.008424 -9.760 < 2e-16 ***
## HOUR_7 0.006742 0.008485 0.795 0.426912
## HOUR_8 0.095963 0.008405 11.417 < 2e-16 ***
## HOUR_9 -0.028405 0.008630 -3.291 0.001002 **
## HOUR_10 -0.088974 0.008972 -9.917 < 2e-16 ***
## HOUR_11 -0.084411 0.009450 -8.933 < 2e-16 ***
## HOUR_12 -0.064868 0.009724 -6.671 2.76e-11 ***
## HOUR_13 -0.056983 0.009881 -5.767 8.46e-09 ***
## HOUR_14 -0.052095 0.009600 -5.427 5.95e-08 ***
## HOUR_15 -0.029536 0.009439 -3.129 0.001761 **
## HOUR_16 0.007403 0.009076 0.816 0.414688
## HOUR_17 0.081000 0.008845 9.157 < 2e-16 ***
## HOUR_18 0.214698 0.008561 25.077 < 2e-16 ***
## HOUR_19 0.130315 0.008456 15.411 < 2e-16 ***
## HOUR_20 0.098517 0.008373 11.766 < 2e-16 ***
## HOUR_21 0.097839 0.008391 11.660 < 2e-16 ***
## HOUR_22 0.067345 0.008339 8.076 7.96e-16 ***
## HOUR_23 NA NA NA NA
## WINTER -0.105679 0.006104 -17.315 < 2e-16 ***
## SPRING -0.040911 0.003679 -11.122 < 2e-16 ***
## SUMMER -0.031872 0.005126 -6.217 5.39e-10 ***
## AUTUMN NA NA NA NA
## NO_HOLIDAY 0.030800 0.005902 5.219 1.86e-07 ***
## HOLIDAY NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09628 on 6304 degrees of freedom
## Multiple R-squared: 0.7206, Adjusted R-squared: 0.7187
## F-statistic: 378.1 on 43 and 6304 DF, p-value: < 2.2e-16
Making predictions
test_result_poly <- lm_poly |>
predict(new_data = test_data) |>
mutate(truth = test_data$RENTED_BIKE_COUNT)
Calculating RSquared and RMSE
rsq_poly <- rsq(test_result_poly, truth = truth, estimate = .pred)
rmse_poly <- rmse(test_result_poly, truth = truth, estimate = .pred)
rsq_poly
rmse_poly
lm_poly_interaction <- lm_spec |>
fit(RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) + poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 + HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 + HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 + HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 + HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER + AUTUMN + NO_HOLIDAY + HOLIDAY + HUMIDITY*TEMPERATURE + RAINFALL*HUMIDITY + RAINFALL*WIND_SPEED, data = train_data)
Summary of model with interaction
summary(lm_poly_interaction$fit)
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ poly(TEMPERATURE, 6) +
## poly(HUMIDITY, 4) + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE +
## SOLAR_RADIATION + RAINFALL + SNOWFALL + HOUR_0 + HOUR_1 +
## HOUR_2 + HOUR_3 + HOUR_4 + HOUR_5 + HOUR_6 + HOUR_7 + HOUR_8 +
## HOUR_9 + HOUR_10 + HOUR_11 + HOUR_12 + HOUR_13 + HOUR_14 +
## HOUR_15 + HOUR_16 + HOUR_17 + HOUR_18 + HOUR_19 + HOUR_20 +
## HOUR_21 + HOUR_22 + HOUR_23 + WINTER + SPRING + SUMMER +
## AUTUMN + NO_HOLIDAY + HOLIDAY + HUMIDITY * TEMPERATURE +
## RAINFALL * HUMIDITY + RAINFALL * WIND_SPEED, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43318 -0.05763 0.00123 0.05153 0.40824
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.539982 0.048504 11.133 < 2e-16 ***
## poly(TEMPERATURE, 6)1 14.441040 1.218215 11.854 < 2e-16 ***
## poly(TEMPERATURE, 6)2 -0.387659 0.139825 -2.772 0.00558 **
## poly(TEMPERATURE, 6)3 -3.406592 0.105351 -32.336 < 2e-16 ***
## poly(TEMPERATURE, 6)4 -1.588740 0.106280 -14.949 < 2e-16 ***
## poly(TEMPERATURE, 6)5 0.065726 0.095953 0.685 0.49338
## poly(TEMPERATURE, 6)6 0.455156 0.098114 4.639 3.57e-06 ***
## poly(HUMIDITY, 4)1 4.239221 0.598874 7.079 1.61e-12 ***
## poly(HUMIDITY, 4)2 -1.796411 0.131296 -13.682 < 2e-16 ***
## poly(HUMIDITY, 4)3 -0.743184 0.110198 -6.744 1.68e-11 ***
## poly(HUMIDITY, 4)4 -0.734476 0.127842 -5.745 9.61e-09 ***
## WIND_SPEED 0.009287 0.010160 0.914 0.36071
## VISIBILITY -0.007575 0.005273 -1.437 0.15089
## DEW_POINT_TEMPERATURE -0.069582 0.083982 -0.829 0.40740
## SOLAR_RADIATION -0.009459 0.011181 -0.846 0.39755
## RAINFALL -11.962287 1.545631 -7.739 1.16e-14 ***
## SNOWFALL -0.035156 0.026795 -1.312 0.18956
## HOUR_0 -0.036313 0.008227 -4.414 1.03e-05 ***
## HOUR_1 -0.064481 0.008108 -7.953 2.15e-15 ***
## HOUR_2 -0.096918 0.008129 -11.923 < 2e-16 ***
## HOUR_3 -0.116559 0.008278 -14.080 < 2e-16 ***
## HOUR_4 -0.137395 0.008234 -16.686 < 2e-16 ***
## HOUR_5 -0.129357 0.008126 -15.920 < 2e-16 ***
## HOUR_6 -0.085421 0.008163 -10.465 < 2e-16 ***
## HOUR_7 0.003801 0.008220 0.462 0.64383
## HOUR_8 0.100389 0.008145 12.325 < 2e-16 ***
## HOUR_9 -0.012594 0.008396 -1.500 0.13364
## HOUR_10 -0.063556 0.008781 -7.238 5.11e-13 ***
## HOUR_11 -0.054178 0.009274 -5.842 5.43e-09 ***
## HOUR_12 -0.030306 0.009570 -3.167 0.00155 **
## HOUR_13 -0.025829 0.009693 -2.665 0.00772 **
## HOUR_14 -0.024236 0.009399 -2.579 0.00994 **
## HOUR_15 -0.007309 0.009209 -0.794 0.42744
## HOUR_16 0.020337 0.008816 2.307 0.02109 *
## HOUR_17 0.086685 0.008574 10.111 < 2e-16 ***
## HOUR_18 0.215056 0.008300 25.911 < 2e-16 ***
## HOUR_19 0.131780 0.008194 16.083 < 2e-16 ***
## HOUR_20 0.097566 0.008113 12.026 < 2e-16 ***
## HOUR_21 0.097388 0.008129 11.980 < 2e-16 ***
## HOUR_22 0.067874 0.008078 8.402 < 2e-16 ***
## HOUR_23 NA NA NA NA
## WINTER -0.099454 0.005921 -16.797 < 2e-16 ***
## SPRING -0.041233 0.003565 -11.568 < 2e-16 ***
## SUMMER -0.009796 0.005106 -1.918 0.05510 .
## AUTUMN NA NA NA NA
## NO_HOLIDAY 0.031298 0.005720 5.472 4.63e-08 ***
## HOLIDAY NA NA NA NA
## HUMIDITY NA NA NA NA
## TEMPERATURE NA NA NA NA
## HUMIDITY:TEMPERATURE -0.845092 0.043552 -19.404 < 2e-16 ***
## RAINFALL:HUMIDITY 11.717573 1.560004 7.511 6.67e-14 ***
## WIND_SPEED:RAINFALL 0.380232 0.234819 1.619 0.10544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09326 on 6301 degrees of freedom
## Multiple R-squared: 0.738, Adjusted R-squared: 0.7361
## F-statistic: 385.8 on 46 and 6301 DF, p-value: < 2.2e-16
Evaluating model
test_result_interaction <- lm_poly_interaction |>
predict(new_data = test_data) |>
mutate(truth = test_data$RENTED_BIKE_COUNT)
rsq_interaction <- rsq(test_result_interaction, truth = truth, estimate = .pred)
rmse_interaction <- rmse(test_result_interaction, truth = truth, estimate = .pred)
rsq_interaction
rmse_interaction
library(glmnet)
Specifying model and fitting regression
bike_recipe <- recipe(RENTED_BIKE_COUNT ~ ., data = train_data)
glmnet_spec <- linear_reg(penalty = 0.1, mixture = 0) |>
set_engine("glmnet")
glmnet_wf <- workflow() |>
add_recipe(bike_recipe)
glmnet_fit <- glmnet_wf |>
add_model(glmnet_spec) |>
fit(data = train_data)
Getting model coefficients
glmnet_fit |>
pull_workflow_fit() |>
tidy()
Evaluating model
test_result_glmnet <- glmnet_fit |>
predict(new_data = test_data) |>
mutate(truth = test_data$RENTED_BIKE_COUNT)
rsq_glmnet <- rsq(test_result_glmnet, truth = truth, estimate = .pred)
rmse_glmnet <- rmse(test_result_glmnet, truth = truth, estimate = .pred)
rsq_glmnet
rmse_glmnet
Comparing the performance of the various models
models_data <- tribble(
~model, ~rsq, ~rmse,
"glmnet", 0.65, 0.11,
"weather", 0.44, 0.13,
"all", 0.67, 0.10,
"interaction", 0.74, 0.09,
"poly", 0.73, 0.092
)
models_data
bar_chart <- models_data |>
pivot_longer(
cols = 2:3,
names_to = "measures",
values_to = "values"
) |>
ggplot(aes(x = model, y = values, fill = measures))+
geom_col(position = "dodge")+
labs(title = "Model Evaluation Chart")
bar_chart
QQ Plot Using Interaction Model
ggplot(test_result_interaction)+
stat_qq(aes(sample = truth), color = "green")+
stat_qq(aes(sample = .pred), color = "red")