library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.0 ──
## ✔ broom 1.0.9 ✔ rsample 1.3.1
## ✔ dials 1.4.2 ✔ tune 2.0.0
## ✔ infer 1.0.9 ✔ workflows 1.3.0
## ✔ modeldata 1.5.1 ✔ workflowsets 1.1.1
## ✔ parsnip 1.3.3 ✔ yardstick 1.3.2
## ✔ recipes 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(rvest) # for web scraping
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(httr) # for HTTP requests
library(RSQLite)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-10
# Getting Bike Data
bike_sharing_df <- read.csv("C:/Users/MODEL23/Documents/bike_sharing_systems.csv")
head(bike_sharing_df,6)
## CITY WEATHER VISIBILITY TEMP TEMP_MIN TEMP_MAX PRESSURE HUMIDITY WIND_SPEED
## 1 Seoul Clear 10000 19.86 19.86 20.07 1013 92 1.07
## 2 Seoul Clear 10000 19.50 19.37 19.50 1014 80 1.53
## 3 Seoul Clear 10000 23.08 23.08 23.08 1014 51 1.78
## 4 Seoul Clear 10000 28.52 28.52 28.52 1013 26 2.15
## 5 Seoul Clear 10000 31.99 31.99 31.99 1012 19 2.45
## 6 Seoul Clear 10000 29.50 29.50 29.50 1012 29 1.94
## WIND_DEG FORECAST_DATETIME SEASON
## 1 45 2025-09-09 18:00:00 Autumn
## 2 28 2025-09-09 21:00:00 Autumn
## 3 44 2025-09-10 Autumn
## 4 71 2025-09-10 03:00:00 Autumn
## 5 61 2025-09-10 06:00:00 Autumn
## 6 356 2025-09-10 09:00:00 Autumn
colnames(bike_sharing_df)
## [1] "CITY" "WEATHER" "VISIBILITY"
## [4] "TEMP" "TEMP_MIN" "TEMP_MAX"
## [7] "PRESSURE" "HUMIDITY" "WIND_SPEED"
## [10] "WIND_DEG" "FORECAST_DATETIME" "SEASON"
Notes: Here I load the required libraries and import the bike-sharing dataset. I preview the first six rows and check the column names to understand the dataset structure.
current_weather_url <- 'https://api.openweathermap.org/data/2.5/weather'
your_api_key <- "da579f0f1c601672c4f6fa0d8ba9397c"
current_query <- list(q = "Seoul", appid = your_api_key, units="metric")
response <- GET(current_weather_url, query=current_query)
json_result <- content(response, as="parsed")
weather <- c(json_result$weather[[1]]$main)
visibility <- c(json_result$visibility)
temp <- c(json_result$main$temp)
temp_min <- c(json_result$main$temp_min)
temp_max <- c(json_result$main$temp_max)
pressure <- c(json_result$main$pressure)
humidity <- c(json_result$main$humidity)
wind_speed <- c(json_result$wind$speed)
wind_deg <- c(json_result$wind$deg)
weather_data_frame <- data.frame(weather, visibility, temp, temp_min, temp_max, pressure, humidity, wind_speed, wind_deg)
print(weather_data_frame)
## weather visibility temp temp_min temp_max pressure humidity wind_speed
## 1 Clouds 10000 22.76 22.76 22.78 1011 100 1.54
## wind_deg
## 1 140
Notes: This section connects to the OpenWeather API to retrieve the current weather for Seoul. The response is parsed into variables such as temperature, humidity, and wind speed, and then stored in a dataframe.
# Helper function to determine the season based on the month (Northern Hemisphere)
get_season <- function(month) {
if (month %in% c(12, 1, 2)) {
return("Winter")
} else if (month %in% c(3, 4, 5)) {
return("Spring")
} else if (month %in% c(6, 7, 8)) {
return("Summer")
} else if (month %in% c(9, 10, 11)) {
return("Autumn")
}
}
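As a quick sanity check, the helper can be called directly (a usage sketch; expected output shown as comments):
get_season(10) # "Autumn"
sapply(c(1, 4, 7), get_season) # "Winter" "Spring" "Summer"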
get_weather_forecaset_by_cities <- function(city_names, api_key) {
all_city_data <- list()
forecast_url <- 'https://api.openweathermap.org/data/2.5/forecast'
for (city_name in city_names) {
forecast_query <- list(q = city_name, appid = api_key, units = "metric")
response <- GET(forecast_url, query = forecast_query)
if (http_error(response)) {
warning(paste("Error fetching data for", city_name, ":", http_status(response)$message))
next
}
json_result <- content(response, as = "parsed")
results <- json_result$list
city_forecast_df <- data.frame(
city = city_name,
weather = sapply(results, function(x) x$weather[[1]]$main),
visibility = sapply(results, function(x) x$visibility),
temp = sapply(results, function(x) x$main$temp),
temp_min = sapply(results, function(x) x$main$temp_min),
temp_max = sapply(results, function(x) x$main$temp_max),
pressure = sapply(results, function(x) x$main$pressure),
humidity = sapply(results, function(x) x$main$humidity),
wind_speed = sapply(results, function(x) x$wind$speed),
wind_deg = sapply(results, function(x) x$wind$deg),
forecast_datetime = as.POSIXct(sapply(results, function(x) x$dt), origin = "1970-01-01", tz = "UTC")
)
# Add the season column
city_forecast_df$season <- sapply(format(city_forecast_df$forecast_datetime, "%m"), as.numeric)
city_forecast_df$season <- sapply(city_forecast_df$season, get_season)
all_city_data[[city_name]] <- city_forecast_df
}
final_df <- do.call(rbind, all_city_data)
return(final_df)
}
Notes: Here, a helper function get_season categorizes months into seasons. Another function, get_weather_forecaset_by_cities, fetches 5-day forecasts for multiple cities and assigns a season to each forecast record.
cities <- c("Seoul", "Washington, D.C.", "Paris", "Suzhou")
cities_weather_df <- get_weather_forecaset_by_cities(cities, your_api_key)
print(head(cities_weather_df,6))
## city weather visibility temp temp_min temp_max pressure humidity
## Seoul.1 Seoul Rain 10000 22.76 21.84 22.76 1011 100
## Seoul.2 Seoul Rain 10000 22.22 21.15 22.22 1011 97
## Seoul.3 Seoul Rain 6430 22.02 21.65 22.02 1011 96
## Seoul.4 Seoul Rain 7759 22.96 22.96 22.96 1012 94
## Seoul.5 Seoul Rain 10000 22.64 22.64 22.64 1011 97
## Seoul.6 Seoul Rain 10000 20.51 20.51 20.51 1010 92
## wind_speed wind_deg forecast_datetime season
## Seoul.1 1.39 206 2025-09-16 15:00:00 Autumn
## Seoul.2 1.23 215 2025-09-16 18:00:00 Autumn
## Seoul.3 1.68 227 2025-09-16 21:00:00 Autumn
## Seoul.4 1.04 239 2025-09-17 00:00:00 Autumn
## Seoul.5 2.10 188 2025-09-17 03:00:00 Autumn
## Seoul.6 2.60 305 2025-09-17 06:00:00 Autumn
write.csv(cities_weather_df, "cities_weather_df.CSV", row.names = FALSE)
Notes: We now call the forecast function for 4 cities. The results are saved to a CSV file for further use.
datasets_list <- list(
bike_sharing_df = bike_sharing_df,
cities_weather_df = cities_weather_df
)
for (df_name in names(datasets_list)) {
dataset <- datasets_list[[df_name]]
new_names <- colnames(dataset)
new_names <- toupper(new_names)
new_names <- gsub(" ", "_", new_names)
new_names <- gsub("[^A-Z0-9_]", "", new_names)
colnames(dataset) <- new_names
assign(df_name, dataset, envir = .GlobalEnv)
print(paste("Column names for", df_name, "have been standardized."))
print(colnames(dataset))
file_name <- paste0(df_name, "_standardized.csv")
write.csv(dataset, file_name, row.names = FALSE)
}
## [1] "Column names for bike_sharing_df have been standardized."
## [1] "CITY" "WEATHER" "VISIBILITY"
## [4] "TEMP" "TEMP_MIN" "TEMP_MAX"
## [7] "PRESSURE" "HUMIDITY" "WIND_SPEED"
## [10] "WIND_DEG" "FORECAST_DATETIME" "SEASON"
## [1] "Column names for cities_weather_df have been standardized."
## [1] "CITY" "WEATHER" "VISIBILITY"
## [4] "TEMP" "TEMP_MIN" "TEMP_MAX"
## [7] "PRESSURE" "HUMIDITY" "WIND_SPEED"
## [10] "WIND_DEG" "FORECAST_DATETIME" "SEASON"
Notes: Column names across datasets are standardized to uppercase with underscores. This ensures consistency when merging or querying.
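The same cleanup can be captured in a small reusable helper (a sketch; standardize_name is a hypothetical name, not part of the lab code):
standardize_name <- function(x) {
  x <- toupper(x) # uppercase everything
  x <- gsub(" ", "_", x) # spaces to underscores
  gsub("[^A-Z0-9_]", "", x) # strip any remaining non-alphanumeric characters
}
standardize_name("Forecast Datetime") # "FORECAST_DATETIME"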
In this lab, I focused on wrangling the Seoul bike-sharing demand historical dataset. This is the core dataset for building a predictive model later. Its columns are shown in the summary below.
For this dataset, I was asked to use tidyverse to perform the data wrangling tasks that follow.
# load data set
seoul_bike_sharing <- read_csv("raw_seoul_bike_sharing.csv")
## Rows: 8760 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): DATE, SEASONS, HOLIDAY, FUNCTIONING_DAY
## dbl (10): RENTED_BIKE_COUNT, HOUR, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBI...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# look at dataset summary
summary(seoul_bike_sharing)
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE
## Length:8760 Min. : 2.0 Min. : 0.00 Min. :-17.80
## Class :character 1st Qu.: 214.0 1st Qu.: 5.75 1st Qu.: 3.40
## Mode :character Median : 542.0 Median :11.50 Median : 13.70
## Mean : 729.2 Mean :11.50 Mean : 12.87
## 3rd Qu.:1084.0 3rd Qu.:17.25 3rd Qu.: 22.50
## Max. :3556.0 Max. :23.00 Max. : 39.40
## NA's :295 NA's :11
## HUMIDITY WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 940 1st Qu.: -4.700
## Median :57.00 Median :1.500 Median :1698 Median : 5.100
## Mean :58.23 Mean :1.725 Mean :1437 Mean : 4.074
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 14.800
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
##
## SOLAR_RADIATION RAINFALL SNOWFALL SEASONS
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8760
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5691 Mean : 0.1487 Mean :0.07507
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## HOLIDAY FUNCTIONING_DAY
## Length:8760 Length:8760
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
dim(seoul_bike_sharing)
## [1] 8760 14
colSums(is.na(seoul_bike_sharing))
## DATE RENTED_BIKE_COUNT HOUR
## 0 295 0
## TEMPERATURE HUMIDITY WIND_SPEED
## 11 0 0
## VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION
## 0 0 0
## RAINFALL SNOWFALL SEASONS
## 0 0 0
## HOLIDAY FUNCTIONING_DAY
## 0 0
From the summary, I observed that:
Columns RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, SNOWFALL are numerical variables/columns and require normalization. Moreover, RENTED_BIKE_COUNT and TEMPERATURE have some missing values (NA’s) that need to be handled properly.
SEASONS, HOLIDAY, FUNCTIONING_DAY are categorical variables which need to be converted into indicator columns or dummy variables. Also, HOUR is read as a numerical variable but it is in fact a categorical variable with levels ranging from 0 to 23.
Now that you have some basic ideas about how to process this bike-sharing demand dataset, let’s start working on it!
I saw that RENTED_BIKE_COUNT only has about 3% missing values (295 / 8760), so I can safely drop any rows where RENTED_BIKE_COUNT is missing.
TODO: Drop rows with missing values in the RENTED_BIKE_COUNT column.
# Drop rows with `RENTED_BIKE_COUNT` column == NA
seoul_bike_sharing <- seoul_bike_sharing %>% filter(!is.na(RENTED_BIKE_COUNT))
dim(seoul_bike_sharing)
## [1] 8465 14
Unlike the RENTED_BIKE_COUNT variable, TEMPERATURE is not a response variable. However, it is still an important predictor variable.
How do I handle missing values for TEMPERATURE? I could simply remove the rows but it’s better to impute them because TEMPERATURE should be relatively easy and reliable to estimate statistically.
I took a look at the missing values in the TEMPERATURE column.
seoul_bike_sharing %>%
filter(is.na(TEMPERATURE))
## # A tibble: 11 × 14
## DATE RENTED_BIKE_COUNT HOUR TEMPERATURE HUMIDITY WIND_SPEED VISIBILITY
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 07/06/2018 3221 18 NA 57 2.7 1217
## 2 12/06/2018 1246 14 NA 45 2.2 1961
## 3 13/06/2018 2664 17 NA 57 3.3 919
## 4 17/06/2018 2330 17 NA 58 3.3 865
## 5 20/06/2018 2741 19 NA 61 2.7 1236
## 6 30/06/2018 1144 13 NA 87 1.7 390
## 7 05/07/2018 827 10 NA 75 1.1 1028
## 8 11/07/2018 634 9 NA 96 0.6 450
## 9 12/07/2018 593 6 NA 93 1.1 852
## 10 21/07/2018 347 4 NA 77 1.2 1203
## 11 21/08/2018 1277 23 NA 75 0.1 1892
## # ℹ 7 more variables: DEW_POINT_TEMPERATURE <dbl>, SOLAR_RADIATION <dbl>,
## # RAINFALL <dbl>, SNOWFALL <dbl>, SEASONS <chr>, HOLIDAY <chr>,
## # FUNCTIONING_DAY <chr>
It seems that all of the missing values for TEMPERATURE are found in SEASONS == Summer, so it is reasonable to impute those missing values with the summer average temperature.
TODO: Impute missing values in the TEMPERATURE column using its mean value (here, the summer average, since all the missing values fall in Summer).
# Calculate the summer average temperature
summer_avg_temp <- seoul_bike_sharing %>%
filter(SEASONS == "Summer") %>%
select(TEMPERATURE) %>%
summarise(mean(TEMPERATURE, na.rm = TRUE)) %>%
unlist() %>%
unname()
# Impute missing values for TEMPERATURE column with summer average temperature
seoul_bike_sharing$TEMPERATURE <- replace_na(seoul_bike_sharing$TEMPERATURE, summer_avg_temp)
# Check that the NA values have been replaced and none remain
seoul_bike_sharing %>%
filter(is.na(TEMPERATURE))
## # A tibble: 0 × 14
## # ℹ 14 variables: DATE <chr>, RENTED_BIKE_COUNT <dbl>, HOUR <dbl>,
## # TEMPERATURE <dbl>, HUMIDITY <dbl>, WIND_SPEED <dbl>, VISIBILITY <dbl>,
## # DEW_POINT_TEMPERATURE <dbl>, SOLAR_RADIATION <dbl>, RAINFALL <dbl>,
## # SNOWFALL <dbl>, SEASONS <chr>, HOLIDAY <chr>, FUNCTIONING_DAY <chr>
# Save the dataset as `seoul_bike_sharing.csv`
# (note: without row.names = FALSE, write.csv also writes an index column,
# which later shows up as `...1` when the file is re-read)
write.csv(seoul_bike_sharing, "seoul_bike_sharing.csv")
Regression models cannot process categorical variables directly, so I need to convert them into indicator (dummy) variables.
In the bike-sharing demand dataset, SEASONS and HOLIDAY are categorical variables. Also, HOUR is read as a numerical variable, but it is in fact a categorical variable with levels ranging from 0 to 23.
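As a toy illustration of dummy coding, consider a hypothetical three-row data frame (not part of the lab data; expected output shown as comments):
toy <- tibble(id = 1:3, SEASONS = c("Winter", "Summer", "Winter"))
toy %>%
  mutate(dummy = 1) %>%
  spread(key = SEASONS, value = dummy, fill = 0)
# id Summer Winter
# 1 0 1
# 2 1 0
# 3 0 1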
TODO: Convert the HOUR column from numeric into character first:
# Using mutate() function to convert HOUR column into character type
seoul_bike_sharing <- seoul_bike_sharing %>% mutate(HOUR = as.character(HOUR))
TODO: Convert the SEASONS, HOLIDAY, and HOUR columns into indicator columns.
# Convert SEASONS, HOLIDAY and HOUR columns into indicator columns.
col <- c("SEASONS", "HOLIDAY", "HOUR")
feature <- function(cols) {
  for (col_name in cols) {
    # spread() pivots each categorical column into one 0/1 indicator column per level
    seoul_bike_sharing <<- seoul_bike_sharing %>%
      mutate(dummy = 1) %>%
      spread(key = col_name, value = dummy, fill = 0)
  }
}
feature(col)
# Print the dataset summary again to make sure the indicator columns are created properly
summary(seoul_bike_sharing)
## DATE RENTED_BIKE_COUNT TEMPERATURE HUMIDITY
## Length:8465 Min. : 2.0 Min. :-17.80 Min. : 0.00
## Class :character 1st Qu.: 214.0 1st Qu.: 3.00 1st Qu.:42.00
## Mode :character Median : 542.0 Median : 13.50 Median :57.00
## Mean : 729.2 Mean : 12.77 Mean :58.15
## 3rd Qu.:1084.0 3rd Qu.: 22.70 3rd Qu.:74.00
## Max. :3556.0 Max. : 39.40 Max. :98.00
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION
## Min. :0.000 Min. : 27 Min. :-30.600 Min. :0.0000
## 1st Qu.:0.900 1st Qu.: 935 1st Qu.: -5.100 1st Qu.:0.0000
## Median :1.500 Median :1690 Median : 4.700 Median :0.0100
## Mean :1.726 Mean :1434 Mean : 3.945 Mean :0.5679
## 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 15.200 3rd Qu.:0.9300
## Max. :7.400 Max. :2000 Max. : 27.200 Max. :3.5200
## RAINFALL SNOWFALL FUNCTIONING_DAY Autumn
## Min. : 0.0000 Min. :0.00000 Length:8465 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character 1st Qu.:0.0000
## Median : 0.0000 Median :0.00000 Mode :character Median :0.0000
## Mean : 0.1491 Mean :0.07768 Mean :0.2288
## 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :35.0000 Max. :8.80000 Max. :1.0000
## Spring Summer Winter Holiday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2552 Mean :0.2608 Mean :0.2552 Mean :0.0482
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## No Holiday 0 1 10
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.9518 Mean :0.04158 Mean :0.04158 Mean :0.0417
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## 11 12 13 14
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 15 16 17 18
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 19 2 20 21
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.04158 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## 22 23 3 4
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.0417 Mean :0.0417 Mean :0.04158 Mean :0.04158
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## 5 6 7 8
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.04158 Mean :0.04158 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## 9
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.0417
## 3rd Qu.:0.0000
## Max. :1.0000
# drop the DATE column (the first column), which is not needed in the converted dataset
seoul_bike_sharing <- seoul_bike_sharing[, -1]
# Save the dataset as `seoul_bike_sharing_converted.csv`
write_csv(seoul_bike_sharing, "seoul_bike_sharing_converted.csv")
In this project, I was asked to use min-max normalization, which rescales each value as (x - min(x)) / (max(x) - min(x)) so that every column is mapped to the range [0, 1].
TODO: Apply min-max normalization on RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, and SNOWFALL.
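Equivalently, as a minimal hand-rolled helper (min_max is a hypothetical name; scales::rescale() with to = c(0, 1), used below, does the same thing):
min_max <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
min_max(c(2, 5, 8)) # 0.0 0.5 1.0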
# Use the `mutate()` function to apply min-max normalization on columns
# `RENTED_BIKE_COUNT`, `TEMPERATURE`, `HUMIDITY`, `WIND_SPEED`, `VISIBILITY`, `DEW_POINT_TEMPERATURE`, `SOLAR_RADIATION`, `RAINFALL`, `SNOWFALL`
# rescale() with to = c(0, 1) performs min-max normalization.
seoul_bike_sharing <- seoul_bike_sharing %>%
  mutate(across(
    c(RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY,
      DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, SNOWFALL),
    ~ rescale(.x, to = c(0, 1))
  ))
# Print the summary of the dataset again to make sure the numeric columns range between 0 and 1
summary(seoul_bike_sharing)
## RENTED_BIKE_COUNT TEMPERATURE HUMIDITY WIND_SPEED
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.05965 1st Qu.:0.3636 1st Qu.:0.4286 1st Qu.:0.1216
## Median :0.15194 Median :0.5472 Median :0.5816 Median :0.2027
## Mean :0.20460 Mean :0.5345 Mean :0.5933 Mean :0.2332
## 3rd Qu.:0.30445 3rd Qu.:0.7080 3rd Qu.:0.7551 3rd Qu.:0.3108
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## VISIBILITY DEW_POINT_TEMPERATURE SOLAR_RADIATION RAINFALL
## Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.4602 1st Qu.:0.4412 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.8429 Median :0.6107 Median :0.002841 Median :0.000000
## Mean :0.7131 Mean :0.5977 Mean :0.161326 Mean :0.004261
## 3rd Qu.:1.0000 3rd Qu.:0.7924 3rd Qu.:0.264205 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.000000 Max. :1.000000
## SNOWFALL FUNCTIONING_DAY Autumn Spring
## Min. :0.000000 Length:8465 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000000 Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000000 Mode :character Median :0.0000 Median :0.0000
## Mean :0.008828 Mean :0.2288 Mean :0.2552
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.000000 Max. :1.0000 Max. :1.0000
## Summer Winter Holiday No Holiday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.2608 Mean :0.2552 Mean :0.0482 Mean :0.9518
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 0 1 10 11
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.04158 Mean :0.04158 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## 12 13 14 15
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 16 17 18 19
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0417 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 2 20 21 22
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.04158 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 23 3 4 5
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.0417 Mean :0.04158 Mean :0.04158 Mean :0.04158
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## 6 7 8 9
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.04158 Mean :0.0417 Mean :0.0417 Mean :0.0417
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
# Save the dataset as `seoul_bike_sharing_converted_normalized.csv`
write_csv(seoul_bike_sharing, "seoul_bike_sharing_converted_normalized.csv")
Standardize the column names again for the new datasets
# Dataset list
dataset_list <- c("seoul_bike_sharing.csv", "seoul_bike_sharing_converted.csv", "seoul_bike_sharing_converted_normalized.csv")
for (dataset_name in dataset_list) {
# Read dataset
dataset <- read_csv(dataset_name)
# Standardized its columns:
# Convert all columns names to uppercase
names(dataset) <- toupper(names(dataset))
# Replace any white space separators by underscore, using str_replace_all function
names(dataset) <- str_replace_all(names(dataset), " ", "_")
# Save the dataset back
write.csv(dataset, dataset_name, row.names = FALSE)
}
## New names:
## Rows: 8465 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): DATE, SEASONS, HOLIDAY, FUNCTIONING_DAY dbl (11): ...1, RENTED_BIKE_COUNT,
## HOUR, TEMPERATURE, HUMIDITY, WIND_SPEED, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8465 Columns: 40
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): FUNCTIONING_DAY dbl (39): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY,
## WIND_SPEED, VISIBILITY, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8465 Columns: 40
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): FUNCTIONING_DAY dbl (39): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY,
## WIND_SPEED, VISIBILITY, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
I created 4 tables for the 4 datasets on the IBM Cloud Db2 service using the SQL script editor.
I created the following tables: WORLD_CITIES, BIKE_SHARING_SYSTEMS, CITIES_WEATHER_FORECAST, and SEOUL_BIKE_SHARING.
-- Create query for world_cities
CREATE TABLE WORLD_CITIES (
CITY VARCHAR(50),
CITY_ASCII VARCHAR(50),
LAT DECIMAL(20,2),
LNG DECIMAL(20,2),
COUNTRY VARCHAR(50),
ISO2 VARCHAR(5),
ISO3 VARCHAR(5),
ADMIN_NAME VARCHAR(100),
CAPITAL VARCHAR(50),
POPULATION BIGINT,
ID BIGINT NOT NULL
);
-- Create query for bike_sharing
CREATE TABLE BIKE_SHARING_SYSTEMS (
COUNTRY VARCHAR(20),
CITY VARCHAR(87),
SYSTEM VARCHAR(40),
BICYCLES NUMERIC
);
-- Create query for cities_weather_forecast
CREATE TABLE CITIES_WEATHER_FORECAST (
CITY VARCHAR(16),
WEATHER VARCHAR(6),
VISIBILITY SMALLINT,
TEMP DECIMAL(6,2),
TEMP_MIN DECIMAL(6,2),
TEMP_MAX DECIMAL(6,2),
PRESSURE SMALLINT,
HUMIDITY SMALLINT,
WIND_SPEED DECIMAL(6,2),
WIND_DEG SMALLINT,
SEASON VARCHAR(6),
FORECAST_DATETIME TIMESTAMP
);
-- Create query for seoul_bike_sharing
CREATE TABLE SEOUL_BIKE_SHARING (
DATE VARCHAR(30),
RENTED_BIKE_COUNT SMALLINT,
HOUR SMALLINT,
TEMPERATURE DECIMAL(4,1),
HUMIDITY SMALLINT,
WIND_SPEED DECIMAL(3,1),
VISIBILITY SMALLINT,
DEW_POINT_TEMPERATURE DECIMAL(4,1),
SOLAR_RADIATION DECIMAL(5,2),
RAINFALL DECIMAL(3,1),
SNOWFALL DECIMAL(3,1),
SEASONS VARCHAR(10),
HOLIDAY VARCHAR(20),
FUNCTIONING_DAY VARCHAR(5)
);
I loaded the following CSV files into the 4 tables using the Db2 GUI: raw_worldcities.csv, bike_sharing_systems.csv, cities_weather_forecast.csv, and seoul_bike_sharing.csv. I used these CSV files from before normalization and before converting categorical columns to features.
The project requirement was to use RODBC, but I wasn't able to set up RODBC. I then tried RJDBC on my local machine, but I wasn't able to set that up either, so I will be using SQLite.
# Define database connection
con <- dbConnect(RSQLite::SQLite(),"RDB.sqlite")
con
## <SQLiteConnection>
## Path: C:\Users\MODEL23\Documents\RDB.sqlite
## Extensions: TRUE
# Read CSV into R Dataframe
worldcities <- read.csv("raw_worldcities.csv")
bike_sharing_systems <- read.csv("bike_sharing_systems.csv")
cities_weather_forecast <- read.csv("raw_cities_weather_forecast.csv")
seoul_bike_sharing <- read.csv("seoul_bike_sharing.csv")
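The dataframes still need to be written into the SQLite tables before they can be queried; a minimal sketch of that step, assuming the table names from the CREATE TABLE statements above:
# Write each dataframe into its SQLite table, replacing any prior contents
dbWriteTable(con, "WORLD_CITIES", worldcities, overwrite = TRUE)
dbWriteTable(con, "BIKE_SHARING_SYSTEMS", bike_sharing_systems, overwrite = TRUE)
dbWriteTable(con, "CITIES_WEATHER_FORECAST", cities_weather_forecast, overwrite = TRUE)
dbWriteTable(con, "SEOUL_BIKE_SHARING", seoul_bike_sharing, overwrite = TRUE)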
dbListTables(con)
## [1] "BIKE_SHARING_SYSTEMS" "CITIES_WEATHER_FORECAST"
## [3] "SEOUL_BIKE_SHARING" "WORLD_CITIES"
Determine how many records are in the seoul_bike_sharing dataset.
dbGetQuery(con, "SELECT count(*) as Count_of_Records FROM seoul_bike_sharing")
## Count_of_Records
## 1 8465
Determine how many hours had a non-zero rented bike count.
dbGetQuery(con, "SELECT count(HOUR) as Number_of_hours FROM seoul_bike_sharing
WHERE RENTED_BIKE_COUNT > 0")
## Number_of_hours
## 1 8465
Query the weather forecast for Seoul over the next 3 hours. I recalled that the records in the CITIES_WEATHER_FORECAST dataset are 3 hours apart, so I just need the first record from the query.
dbGetQuery(con, "SELECT * FROM CITIES_WEATHER_FORECAST
WHERE CITY = 'Seoul'
Limit 1")
## CITY WEATHER VISIBILITY TEMP TEMP_MIN TEMP_MAX PRESSURE HUMIDITY WIND_SPEED
## 1 Seoul Rain 10000 23.49 22.96 23.49 1011 97 2.26
## WIND_DEG FORECAST_DATETIME
## 1 223 2025-09-16 12:00:00
Find which seasons are included in the Seoul bike sharing dataset.
dbGetQuery(con, "SELECT DISTINCT(SEASONS) FROM seoul_bike_sharing")
## SEASONS
## 1 Winter
## 2 Spring
## 3 Summer
## 4 Autumn
Find the first and last dates in the Seoul Bike Sharing dataset.
dbGetQuery(con, "SELECT MIN(DATE) as Start_Date, MAX(DATE) as End_Date FROM seoul_bike_sharing")
## Start_Date End_Date
## 1 01/01/2018 31/12/2017
Note that DATE is stored as text in dd/mm/yyyy format, so MIN() and MAX() compare strings rather than calendar dates; that is why the result shows 01/01/2018 and 31/12/2017 even though the data actually spans 01/12/2017 to 30/11/2018.
Next, determine which date and hour had the most bike rentals.
dbGetQuery(con, "SELECT DATE, HOUR, RENTED_BIKE_COUNT as Maximum_COUNT FROM seoul_bike_sharing
WHERE RENTED_BIKE_COUNT = (SELECT MAX(RENTED_BIKE_COUNT) FROM seoul_bike_sharing)")
## DATE HOUR Maximum_COUNT
## 1 19/06/2018 18 3556
Determine the average hourly temperature and the average number of bike rentals per hour over each season. List the top ten results by average bike count.
dbGetQuery(con, "SELECT SEASONS, HOUR, AVG(RENTED_BIKE_COUNT), AVG(TEMPERATURE) FROM seoul_bike_sharing GROUP BY SEASONS, HOUR ORDER BY AVG(RENTED_BIKE_COUNT) desc limit 10")
## SEASONS HOUR AVG(RENTED_BIKE_COUNT) AVG(TEMPERATURE)
## 1 Summer 18 2135.141 29.38791
## 2 Autumn 18 1983.333 16.03185
## 3 Summer 19 1889.250 28.27378
## 4 Summer 20 1801.924 27.06630
## 5 Summer 21 1754.065 26.27826
## 6 Spring 18 1689.311 15.97222
## 7 Summer 22 1567.870 25.69891
## 8 Autumn 17 1562.877 17.27778
## 9 Summer 17 1526.293 30.07691
## 10 Autumn 19 1515.568 15.06346
Find the average hourly bike count during each season, and include the minimum, maximum, and standard deviation of the hourly bike count for each season.
There is no built-in standard deviation function in SQLite, so I will not include it in the query (a possible workaround is sketched after the results).
dbGetQuery(con, " SELECT SEASONS, AVG(RENTED_BIKE_COUNT) as AVG_S_COUNT, MIN(RENTED_BIKE_COUNT) as MIN_S_COUNT, MAX(RENTED_BIKE_COUNT) as MAX_S_COUNT FROM seoul_bike_sharing
group by (SEASONS)
Order by AVG_S_COUNT desc")
## SEASONS AVG_S_COUNT MIN_S_COUNT MAX_S_COUNT
## 1 Summer 1034.0734 9 3556
## 2 Autumn 924.1105 2 3298
## 3 Spring 746.2542 2 3251
## 4 Winter 225.5412 3 937
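SQLite lacks STDEV(), but the standard deviation can still be derived from AVG() via the computational formula Var(X) = E[X^2] - E[X]^2; a sketch (population variance, with the square root taken on the R side):
sd_by_season <- dbGetQuery(con, "SELECT SEASONS,
  AVG(RENTED_BIKE_COUNT * RENTED_BIKE_COUNT) - AVG(RENTED_BIKE_COUNT) * AVG(RENTED_BIKE_COUNT) AS VAR_S_COUNT
  FROM seoul_bike_sharing GROUP BY SEASONS")
sd_by_season$SD_S_COUNT <- sqrt(sd_by_season$VAR_S_COUNT)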
Consider the weather over each season. On average, what were the TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, DEW_POINT_TEMPERATURE, SOLAR_RADIATION, RAINFALL, and SNOWFALL per season? Include the average bike count as well, and rank the results by average bike count so I can see if it is correlated with the weather at all.
dbGetQuery(con," SELECT SEASONS, AVG(RENTED_BIKE_COUNT) as AVG_S_COUNT, AVG(TEMPERATURE) as AVG_S_TEMP, AVG(HUMIDITY) as AVG_S_HUMIDITY, AVG(WIND_SPEED) as AVG_WIND_SPEED, AVG(VISIBILITY) as AVG_VISIBILITY, AVG(DEW_POINT_TEMPERATURE) as AVG_DEW_POINT, AVG(SOLAR_RADIATION) as AVG_SOLAR_RADIATION, AVG(RAINFALL) as AVG_RAINFALL, AVG(SNOWFALL) as AVG_SNOWFALL FROM seoul_bike_sharing
group by (SEASONS)
Order by AVG_S_COUNT desc")
## SEASONS AVG_S_COUNT AVG_S_TEMP AVG_S_HUMIDITY AVG_WIND_SPEED AVG_VISIBILITY
## 1 Summer 1034.0734 26.587711 64.98143 1.609420 1501.745
## 2 Autumn 924.1105 13.821580 59.04491 1.492101 1558.174
## 3 Spring 746.2542 13.021685 58.75833 1.857778 1240.912
## 4 Winter 225.5412 -2.540463 49.74491 1.922685 1445.987
## AVG_DEW_POINT AVG_SOLAR_RADIATION AVG_RAINFALL AVG_SNOWFALL
## 1 18.750136 0.7612545 0.25348732 0.00000000
## 2 5.150594 0.5227827 0.11765617 0.06350026
## 3 4.091389 0.6803009 0.18694444 0.00000000
## 4 -12.416667 0.2981806 0.03282407 0.24750000
The warmer the season, the more bikes are rented; from these results, season, average temperature, average humidity, and average rainfall all appear correlated with the average bike count.
Use an implicit join across the WORLD_CITIES and BIKE_SHARING_SYSTEMS tables to determine the total number of bikes available in Seoul, plus the following city information about Seoul: CITY, COUNTRY, LAT, LNG, POPULATION, in a single view. I noticed that in this case the CITY column works for the WORLD_CITIES table, but in general you would have to use the CITY_ASCII column.
dbGetQuery(con, "SELECT B.CITY, W.LAT, W.LNG, W.POPULATION FROM BIKE_SHARING_SYSTEMS AS B
LEFT JOIN WORLD_CITIES AS W ON B.CITY = W.CITY_ASCII
WHERE B.CITY = 'Seoul'")
## CITY LAT LNG POPULATION
## 1 Seoul 37.5833 127 21794000
## 2 Seoul 37.5833 127 21794000
## 3 Seoul 37.5833 127 21794000
## 4 Seoul 37.5833 127 21794000
## 5 Seoul 37.5833 127 21794000
## 6 Seoul 37.5833 127 21794000
## 7 Seoul 37.5833 127 21794000
## 8 Seoul 37.5833 127 21794000
## 9 Seoul 37.5833 127 21794000
## 10 Seoul 37.5833 127 21794000
## 11 Seoul 37.5833 127 21794000
## 12 Seoul 37.5833 127 21794000
## 13 Seoul 37.5833 127 21794000
## 14 Seoul 37.5833 127 21794000
## 15 Seoul 37.5833 127 21794000
## 16 Seoul 37.5833 127 21794000
## 17 Seoul 37.5833 127 21794000
## 18 Seoul 37.5833 127 21794000
## 19 Seoul 37.5833 127 21794000
## 20 Seoul 37.5833 127 21794000
## 21 Seoul 37.5833 127 21794000
## 22 Seoul 37.5833 127 21794000
## 23 Seoul 37.5833 127 21794000
## 24 Seoul 37.5833 127 21794000
## 25 Seoul 37.5833 127 21794000
## 26 Seoul 37.5833 127 21794000
## 27 Seoul 37.5833 127 21794000
## 28 Seoul 37.5833 127 21794000
## 29 Seoul 37.5833 127 21794000
## 30 Seoul 37.5833 127 21794000
## 31 Seoul 37.5833 127 21794000
## 32 Seoul 37.5833 127 21794000
## 33 Seoul 37.5833 127 21794000
## 34 Seoul 37.5833 127 21794000
## 35 Seoul 37.5833 127 21794000
## 36 Seoul 37.5833 127 21794000
## 37 Seoul 37.5833 127 21794000
## 38 Seoul 37.5833 127 21794000
## 39 Seoul 37.5833 127 21794000
## 40 Seoul 37.5833 127 21794000
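To collapse these duplicated rows into the single-row view the task asks for, the bicycle counts could be aggregated; a sketch using the same implicit join:
dbGetQuery(con, "SELECT B.CITY, W.COUNTRY, W.LAT, W.LNG, W.POPULATION, SUM(B.BICYCLES) AS TOTAL_BICYCLES
  FROM BIKE_SHARING_SYSTEMS AS B, WORLD_CITIES AS W
  WHERE B.CITY = W.CITY_ASCII AND B.CITY = 'Seoul'
  GROUP BY B.CITY, W.COUNTRY, W.LAT, W.LNG, W.POPULATION")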
dbListTables(con)
## [1] "BIKE_SHARING_SYSTEMS" "CITIES_WEATHER_FORECAST"
## [3] "SEOUL_BIKE_SHARING" "WORLD_CITIES"
Load the seoul_bike_sharing data into a dataframe; this is the cleaned CSV from before normalization and before converting categorical columns to indicator variables.
seoul_bike_sharing <- read.csv("seoul_bike_sharing.csv")
Convert DATE to a date type, using the format of the data, namely "%d/%m/%Y":
seoul_bike_sharing$DATE <- as.Date(seoul_bike_sharing$DATE, format = "%d/%m/%Y")
Also, coerce HOUR to a factor so its levels form an ordered sequence. This ensures the visualizations correctly treat HOUR as a discrete variable with the expected ordering.
seoul_bike_sharing$HOUR <- as.factor(seoul_bike_sharing$HOUR)
Check the structure of the dataframe
str(seoul_bike_sharing)
## 'data.frame': 8465 obs. of 15 variables:
## $ ...1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DATE : Date, format: "2017-12-01" "2017-12-01" ...
## $ RENTED_BIKE_COUNT : int 254 204 173 107 78 100 181 460 930 490 ...
## $ HOUR : Factor w/ 24 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ TEMPERATURE : num -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
## $ HUMIDITY : int 37 38 39 40 36 37 35 38 37 27 ...
## $ WIND_SPEED : num 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
## $ VISIBILITY : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 1928 ...
## $ DEW_POINT_TEMPERATURE: num -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
## $ SOLAR_RADIATION : num 0 0 0 0 0 0 0 0 0.01 0.23 ...
## $ RAINFALL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOWFALL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEASONS : chr "Winter" "Winter" "Winter" "Winter" ...
## $ HOLIDAY : chr "No Holiday" "No Holiday" "No Holiday" "No Holiday" ...
## $ FUNCTIONING_DAY : chr "Yes" "Yes" "Yes" "Yes" ...
Finally, ensure there are no missing values
sum(is.na(seoul_bike_sharing))
## [1] 0
summary(seoul_bike_sharing)
## ...1 DATE RENTED_BIKE_COUNT HOUR
## Min. : 1 Min. :2017-12-01 Min. : 2.0 7 : 353
## 1st Qu.:2117 1st Qu.:2018-02-27 1st Qu.: 214.0 8 : 353
## Median :4233 Median :2018-05-28 Median : 542.0 9 : 353
## Mean :4233 Mean :2018-05-28 Mean : 729.2 10 : 353
## 3rd Qu.:6349 3rd Qu.:2018-08-24 3rd Qu.:1084.0 11 : 353
## Max. :8465 Max. :2018-11-30 Max. :3556.0 12 : 353
## (Other):6347
## TEMPERATURE HUMIDITY WIND_SPEED VISIBILITY
## Min. :-17.80 Min. : 0.00 Min. :0.000 Min. : 27
## 1st Qu.: 3.00 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 935
## Median : 13.50 Median :57.00 Median :1.500 Median :1690
## Mean : 12.77 Mean :58.15 Mean :1.726 Mean :1434
## 3rd Qu.: 22.70 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000
## Max. : 39.40 Max. :98.00 Max. :7.400 Max. :2000
##
## DEW_POINT_TEMPERATURE SOLAR_RADIATION RAINFALL SNOWFALL
## Min. :-30.600 Min. :0.0000 Min. : 0.0000 Min. :0.00000
## 1st Qu.: -5.100 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000
## Median : 4.700 Median :0.0100 Median : 0.0000 Median :0.00000
## Mean : 3.945 Mean :0.5679 Mean : 0.1491 Mean :0.07768
## 3rd Qu.: 15.200 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. : 27.200 Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## SEASONS HOLIDAY FUNCTIONING_DAY
## Length:8465 Length:8465 Length:8465
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
The HOLIDAY column has two values, No Holiday and Holiday, so I just need to filter for Holiday and then count. Since there is a row for each hour, I must divide the total by 24 to get the number of holiday days.
holidays_count <- seoul_bike_sharing %>%
filter(HOLIDAY == "Holiday") %>%
count(HOLIDAY) %>%
summarise(count = n / 24)
holidays_count
## count
## 1 17
total_records <- seoul_bike_sharing %>%
count() %>%
summarise(count = n / 24)
holidays_perc <- (holidays_count / total_records) * 100
holidays_perc
## count
## 1 4.819846
total_records
## count
## 1 352.7083
seoul_bike_sharing %>%
count(FUNCTIONING_DAY)
## FUNCTIONING_DAY n
## 1 Yes 8465
seoul_bike_sharing %>%
group_by(SEASONS) %>%
summarize(total_rainfall = sum(RAINFALL), total_snowfall = sum(SNOWFALL))
## # A tibble: 4 × 3
## SEASONS total_rainfall total_snowfall
## <chr> <dbl> <dbl>
## 1 Autumn 228. 123
## 2 Spring 404. 0
## 3 Summer 560. 0
## 4 Winter 70.9 535.
ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT, color = "blue", alpha = 0.25)) +
geom_point() +
theme(legend.position = "none")
I saw the rented bike count start to increase around February/March, reach its maximum in June, dip slightly towards August, rise again around September, and then decrease towards the end of the year.
ggplot(seoul_bike_sharing, aes(x = DATE, y = RENTED_BIKE_COUNT, color = HOUR, alpha = 0.25)) +
geom_point()
The rented bike count is low around dawn, increases slowly through the morning, peaks in the evening around 18:00-19:00, and then decreases again.
Normalize the histogram so the y axis represents 'density'. This can be done by setting y = ..density.. in the aesthetics of the histogram (newer ggplot2 releases prefer after_stat(density), as the deprecation warning below notes).
ggplot(seoul_bike_sharing, aes(x = RENTED_BIKE_COUNT)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white", alpha = 0.5
) +
geom_density(aes(color = "blue")) +
theme(legend.position = "none")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
I can clearly see that on most days the number of rented bikes is around 125-375, and only on rare occasions does the number go above 2500.
ggplot(seoul_bike_sharing, aes(x = TEMPERATURE, y = RENTED_BIKE_COUNT, color = HOUR, alpha = 0.25)) +
geom_point() +
facet_wrap(~SEASONS)
This is a very interesting plot. In Autumn and Spring, where the temperature is broadly similar (roughly 0 to 20), people tend to ride at the same hours and the bike counts are very similar; in general, people rent more bikes in warmer weather.
In Summer, people still ride at the same hours, but the counts drop as the weather gets hotter.
In Winter there is a huge drop in the number of bikes rented, topping out around 1000, and people tend to ride more in the early morning and in the evening around 6-7.
Use facet_wrap to generate four plots corresponding to the seasons
ggplot(seoul_bike_sharing, aes(x = HOUR, y = RENTED_BIKE_COUNT)) +
geom_boxplot(fill = "bisque") +
facet_wrap(~SEASONS)
I saw here that there are many outliers in the hourly bike counts across the different seasons.
In general, people tend to use the bikes at the same times across the seasons, with slightly different counts, except in Winter, where there is a huge drop in the number of bikes being rented.
seoul_bike_sharing %>%
group_by(DATE) %>%
summarise(daily_rainfall = sum(RAINFALL), daily_snowfall = sum(SNOWFALL))
## # A tibble: 353 × 3
## DATE daily_rainfall daily_snowfall
## <date> <dbl> <dbl>
## 1 2017-12-01 0 0
## 2 2017-12-02 0 0
## 3 2017-12-03 4 0
## 4 2017-12-04 0.1 0
## 5 2017-12-05 0 0
## 6 2017-12-06 1.3 8.6
## 7 2017-12-07 0 10.4
## 8 2017-12-08 0 0
## 9 2017-12-09 0 0
## 10 2017-12-10 4.1 32.5
## # ℹ 343 more rows
I plotted the results as shown below.
seoul_bike_sharing %>%
group_by(DATE) %>%
summarise(daily_rainfall = sum(RAINFALL), daily_snowfall = sum(SNOWFALL)) %>%
pivot_longer(!DATE) %>%
ggplot() +
geom_point(aes(x = DATE, y = value, color = name))
seoul_bike_sharing %>%
group_by(DATE) %>%
summarise(daily_rainfall = sum(RAINFALL), daily_snowfall = sum(SNOWFALL)) %>%
filter(daily_snowfall > 0) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 27
In addition to the assigned tasks, I created a visualization showing the bike count by day of the week.
seoul_bike_sharing %>%
mutate(DAY_OF_WEEK = as.factor(wday(DATE, abbr = TRUE, label = TRUE))) %>%
ggplot(aes(x = DAY_OF_WEEK, y = RENTED_BIKE_COUNT, color = HOUR, alpha = 0.25)) +
geom_point() +
facet_wrap(~SEASONS)
I can see the count of bikes is higher on weekdays than on weekends, which suggests that some users ride to commute to work. Also, on weekends in Winter and Autumn, the bike count peaks around 12:00-16:00, which is unusual compared with other days.
LAB: Predict Hourly Rented Bike Count using Basic Linear Regression Models:
LAB: Refine the Baseline Regression Models:
The seoul_bike_sharing_converted_normalized.csv file will be our main dataset, with the following variables:
The response variable:
RENTED_BIKE_COUNT - Count of bikes rented at each hour
Weather predictor variables:
TEMPERATURE - Temperature in Celsius
HUMIDITY - Unit is %
WIND_SPEED - Unit is m/s
VISIBILITY - Multiplied by 10 m
DEW_POINT_TEMPERATURE - The temperature to which the air would have to cool down in order to reach saturation, unit is Celsius
SOLAR_RADIATION - MJ/m2
RAINFALL - mm
SNOWFALL - cm
Date/time predictor variables:
HOUR - Hour of the day
HOLIDAY - Holiday/No holiday
SEASONS - Winter, Spring, Summer, Autumn
I will now load the dataset and remove the FUNCTIONING_DAY column, as it has only one value.
bike_sharing_df <- read_csv("seoul_bike_sharing_converted_normalized.csv")
## Rows: 8465 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): FUNCTIONING_DAY
## dbl (39): RENTED_BIKE_COUNT, TEMPERATURE, HUMIDITY, WIND_SPEED, VISIBILITY, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop column 10 (FUNCTIONING_DAY), which has only a single value
bike_sharing_df <- bike_sharing_df[, -10]
TODO: Use the initial_split(), training(), and testing() functions to generate a training dataset consisting of 75% of the original dataset, and a testing dataset using the remaining 25%.
# Use the `initial_split()`, `training()`, and `testing()` functions to split the dataset
# With seed 1234
set.seed(1234)
# prop = 3/4
bike_sharing_split <- initial_split(bike_sharing_df, prop = 0.75)
# train_data
bike_sharing_training <- training(bike_sharing_split)
# test_data
bike_sharing_testing <- testing(bike_sharing_split)
TODO: Build a linear regression model called lm_model_weather using only the weather variables.
# Use `linear_reg()` with engine `lm` and mode `regression`
lm_model_weather <- linear_reg(mode = "regression", engine = "lm")
# Fit the model called `lm_model_weather`
# RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL, with the training data
training_fit <- lm_model_weather %>%
fit(RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY + WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + RAINFALL + SNOWFALL, data = bike_sharing_training)
# print(lm_model_weather$fit)
print(training_fit)
## parsnip model object
##
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ TEMPERATURE + HUMIDITY +
## WIND_SPEED + VISIBILITY + DEW_POINT_TEMPERATURE + SOLAR_RADIATION +
## RAINFALL + SNOWFALL, data = data)
##
## Coefficients:
## (Intercept) TEMPERATURE HUMIDITY
## 0.043532 0.675221 -0.258408
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## 0.113806 0.003533 -0.089172
## SOLAR_RADIATION RAINFALL SNOWFALL
## -0.125170 -0.496346 0.089414
TODO: Build a linear regression model called lm_model_all using all variables: RENTED_BIKE_COUNT ~ .
# Use `linear_reg()` with engine `lm` and mode `regression`
lm_model_all <- linear_reg(mode = "regression", engine = "lm")
# Fit the model called `lm_model_all`
# `RENTED_BIKE_COUNT ~ .` means use all other variables except for the response variable
all_fit <- lm_model_all %>%
fit(RENTED_BIKE_COUNT ~ ., data = bike_sharing_training)
# print(lm_model_weather$fit)
print(all_fit)
## parsnip model object
##
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ ., data = data)
##
## Coefficients:
## (Intercept) TEMPERATURE HUMIDITY
## 0.059144 0.220219 -0.249502
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## 0.008979 0.006154 0.168370
## SOLAR_RADIATION RAINFALL SNOWFALL
## 0.077907 -0.580933 0.073431
## AUTUMN SPRING SUMMER
## 0.101013 0.053845 0.055752
## WINTER HOLIDAY NO_HOLIDAY
## NA -0.035009 NA
## `0` `1` `10`
## -0.008244 -0.032878 -0.066831
## `11` `12` `13`
## -0.069607 -0.058622 -0.053842
## `14` `15` `16`
## -0.054148 -0.030876 0.006508
## `17` `18` `19`
## 0.085973 0.223636 0.147155
## `2` `20` `21`
## -0.066745 0.121552 0.125656
## `22` `23` `3`
## 0.096410 0.029209 -0.090003
## `4` `5` `6`
## -0.108692 -0.102060 -0.057434
## `7` `8` `9`
## 0.030039 0.126893 NA
Note the NA coefficients above for WINTER, NO_HOLIDAY, and hour `9`: all indicator levels were kept, so each dummy group sums to 1 and is perfectly collinear with the intercept, and lm() drops one level per group. This is also the source of the rank-deficient fit warnings below.
TODO: Make predictions using both the lm_model_weather and lm_model_all models. (The predictions here are generated on the training set, so the metrics that follow are in-sample.)
# Use predict() function to generate test results for `lm_model_weather` and `lm_model_all`
weather_train_results <- training_fit %>%
predict(new_data = bike_sharing_training)
all_train_results <- all_fit %>%
predict(new_data = bike_sharing_training)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
# and generate two test results dataframe with a truth column:
weather_train_results <- weather_train_results %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
all_train_results <- all_train_results %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
# test_results_weather for lm_model_weather model
head(weather_train_results)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.294 0.764
## 2 0.0749 0.117
## 3 0.261 0.311
## 4 0.0960 0.0428
## 5 0.185 0.0535
## 6 0.102 0.0594
# test_results_all for lm_model_all
head(all_train_results)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.512 0.764
## 2 0.102 0.117
## 3 0.295 0.311
## 4 0.0572 0.0428
## 5 0.135 0.0535
## 6 0.143 0.0594
TODO: Use the rsq() and rmse() functions to calculate R-squared and RMSE metrics for the two sets of prediction results.
rsq_weather <- rsq(weather_train_results,
truth = truth,
estimate = .pred
)
rsq_all <- rsq(all_train_results,
truth = truth,
estimate = .pred
)
rmse_weather <- rmse(weather_train_results,
truth = truth,
estimate = .pred
)
rmse_all <- rmse(all_train_results,
truth = truth,
estimate = .pred
)
rsq_weather
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.430
rsq_all
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.659
rmse_weather
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.137
rmse_all
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.106
From these results, lm_model_all performs much better (higher R-squared, lower RMSE).
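Since these metrics were computed on the training data, the honest comparison would repeat them on the held-out 25% split; a sketch (not run here):
# Evaluate lm_model_all on the testing split
all_test_results <- all_fit %>%
  predict(new_data = bike_sharing_testing) %>%
  mutate(truth = bike_sharing_testing$RENTED_BIKE_COUNT)
rsq(all_test_results, truth = truth, estimate = .pred)
rmse(all_test_results, truth = truth, estimate = .pred)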
I will now try to find the variables with the highest coefficients that contributed the most to the prediction.
all_fit %>%
tidy() %>%
arrange(desc(abs(estimate)))
## # A tibble: 39 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 RAINFALL -0.581 0.0403 -14.4 2.41e- 46
## 2 HUMIDITY -0.250 0.0280 -8.91 6.43e- 19
## 3 `18` 0.224 0.00957 23.4 7.32e-116
## 4 TEMPERATURE 0.220 0.0597 3.69 2.27e- 4
## 5 DEW_POINT_TEMPERATURE 0.168 0.0623 2.70 6.89e- 3
## 6 `19` 0.147 0.00964 15.3 1.05e- 51
## 7 `8` 0.127 0.00918 13.8 7.72e- 43
## 8 `21` 0.126 0.00959 13.1 1.05e- 38
## 9 `20` 0.122 0.00960 12.7 2.82e- 36
## 10 `4` -0.109 0.00957 -11.4 1.27e- 29
## # ℹ 29 more rows
TODO: Sort the coefficient list in descending order and visualize the result using ggplot and geom_bar.
# Visualize the list using ggplot and geom_bar
all_fit %>%
tidy() %>%
filter(!is.na(estimate)) %>%
ggplot(aes(x = fct_reorder(term, abs(estimate)), y = abs(estimate))) +
geom_bar(stat = "identity", fill = "RED") +
coord_flip() +
theme(axis.text.y = element_text(angle = 10, colour = "RED", size = 7)) +
xlab("variable")
Linear regression models are well suited to capturing linear correlations among variables. However, in real-world data, many relationships are non-linear.
For example, the correlation between RENTED_BIKE_COUNT and TEMPERATURE does not look linear:
ggplot(bike_sharing_training, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
geom_point() +
geom_smooth(method = "lm", color = "RED")
## `geom_smooth()` using formula = 'y ~ x'
One solution for handling such nonlinearity is polynomial regression, i.e. adding polynomial terms. Higher-order polynomials can follow the curvature better than a first-order (straight-line) fit, though at increasing risk of overfitting.
# Plot the higher order polynomial fits
ggplot(bike_sharing_training, aes(RENTED_BIKE_COUNT, TEMPERATURE)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, color = "red") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "yellow") +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), color = "green") +
geom_smooth(method = "lm", formula = y ~ poly(x, 6), color = "blue")
TODO: Fit a linear regression model lm_poly with higher-order polynomial terms on the important variables (those with larger coefficients) found in the baseline model.
# Fit a linear model with higher order polynomial on some important variables
lm_poly <- linear_reg(mode = "regression", engine = "lm") %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4), data = bike_sharing_training)
# Print model summary
summary(lm_poly$fit)
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE,
## 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4),
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44191 -0.05668 0.00153 0.05221 0.40859
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0467508 0.0233009 2.006 0.04486 *
## TEMPERATURE 0.5590528 0.1348620 4.145 3.44e-05 ***
## HUMIDITY -0.1066547 0.0642850 -1.659 0.09715 .
## WIND_SPEED 0.0031707 0.0101624 0.312 0.75505
## VISIBILITY -0.0081387 0.0053278 -1.528 0.12667
## DEW_POINT_TEMPERATURE -0.2358434 0.1479894 -1.594 0.11106
## SOLAR_RADIATION 0.0003396 0.0112372 0.030 0.97589
## RAINFALL -0.3428396 0.0387976 -8.837 < 2e-16 ***
## SNOWFALL -0.0099515 0.0269518 -0.369 0.71197
## AUTUMN 0.0981633 0.0059819 16.410 < 2e-16 ***
## SPRING 0.0569147 0.0058294 9.763 < 2e-16 ***
## SUMMER 0.0881572 0.0075643 11.654 < 2e-16 ***
## WINTER NA NA NA NA
## HOLIDAY -0.0323523 0.0057678 -5.609 2.12e-08 ***
## NO_HOLIDAY NA NA NA NA
## `0` -0.0217507 0.0085521 -2.543 0.01100 *
## `1` -0.0480638 0.0084233 -5.706 1.21e-08 ***
## `10` -0.0522618 0.0081726 -6.395 1.72e-10 ***
## `11` -0.0434932 0.0084892 -5.123 3.09e-07 ***
## `12` -0.0214080 0.0086691 -2.469 0.01356 *
## `13` -0.0156167 0.0088278 -1.769 0.07694 .
## `14` -0.0126345 0.0086796 -1.456 0.14554
## `15` 0.0028753 0.0086200 0.334 0.73872
## `16` 0.0334231 0.0085130 3.926 8.72e-05 ***
## `17` 0.0997805 0.0085385 11.686 < 2e-16 ***
## `18` 0.2299977 0.0085091 27.029 < 2e-16 ***
## `19` 0.1467266 0.0085572 17.147 < 2e-16 ***
## `2` -0.0809514 0.0084329 -9.600 < 2e-16 ***
## `20` 0.1135171 0.0085240 13.317 < 2e-16 ***
## `21` 0.1135640 0.0085108 13.343 < 2e-16 ***
## `22` 0.0835330 0.0084484 9.887 < 2e-16 ***
## `23` 0.0157767 0.0084505 1.867 0.06195 .
## `3` -0.1012712 0.0085612 -11.829 < 2e-16 ***
## `4` -0.1209531 0.0085127 -14.209 < 2e-16 ***
## `5` -0.1123035 0.0083940 -13.379 < 2e-16 ***
## `6` -0.0700939 0.0084214 -8.323 < 2e-16 ***
## `7` 0.0204802 0.0084165 2.433 0.01499 *
## `8` 0.1157473 0.0081324 14.233 < 2e-16 ***
## `9` NA NA NA NA
## poly(TEMPERATURE, 6)1 NA NA NA NA
## poly(TEMPERATURE, 6)2 2.0863456 0.2466725 8.458 < 2e-16 ***
## poly(TEMPERATURE, 6)3 -2.8232422 0.1363257 -20.710 < 2e-16 ***
## poly(TEMPERATURE, 6)4 -1.9242987 0.1275559 -15.086 < 2e-16 ***
## poly(TEMPERATURE, 6)5 0.0630522 0.1064890 0.592 0.55380
## poly(TEMPERATURE, 6)6 0.4568546 0.0991038 4.610 4.11e-06 ***
## poly(HUMIDITY, 6)1 NA NA NA NA
## poly(HUMIDITY, 6)2 -1.4529202 0.2054493 -7.072 1.69e-12 ***
## poly(HUMIDITY, 6)3 -0.9842698 0.1239663 -7.940 2.38e-15 ***
## poly(HUMIDITY, 6)4 -0.5623707 0.1800087 -3.124 0.00179 **
## poly(HUMIDITY, 6)5 0.2184708 0.1602069 1.364 0.17272
## poly(HUMIDITY, 6)6 0.0696659 0.1240320 0.562 0.57436
## poly(DEW_POINT_TEMPERATURE, 4)1 NA NA NA NA
## poly(DEW_POINT_TEMPERATURE, 4)2 -3.6349692 0.2686365 -13.531 < 2e-16 ***
## poly(DEW_POINT_TEMPERATURE, 4)3 -0.6604506 0.1367294 -4.830 1.40e-06 ***
## poly(DEW_POINT_TEMPERATURE, 4)4 0.2796526 0.1219957 2.292 0.02192 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0939 on 6299 degrees of freedom
## Multiple R-squared: 0.7345, Adjusted R-squared: 0.7324
## F-statistic: 363 on 48 and 6299 DF, p-value: < 2.2e-16
TODO: Make predictions using the lm_poly model (again on the training set).
# Use predict() function to generate test results for `lm_poly`
lm_ploy_train_results <- lm_poly %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
head(lm_ploy_train_results)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.546 0.764
## 2 0.105 0.117
## 3 0.282 0.311
## 4 0.0601 0.0428
## 5 0.122 0.0535
## 6 0.127 0.0594
A common practice is to filter the results to see whether there are any negative predictions.
lm_ploy_train_results %>%
filter(lm_ploy_train_results$.pred < 0)
## # A tibble: 570 × 2
## .pred truth
## <dbl> <dbl>
## 1 -0.00388 0.0506
## 2 -0.0651 0.0124
## 3 -0.00266 0.0405
## 4 -0.00845 0.0366
## 5 -0.0715 0.00929
## 6 -0.000178 0.0557
## 7 -0.0691 0.00760
## 8 -0.0703 0.0225
## 9 -0.0519 0.00619
## 10 -0.0242 0.0211
## # ℹ 560 more rows
Another minor improvement is to convert all negative predictions to zero, because we cannot have negative rented bike counts.
lm_ploy_train_results$.pred <- replace(lm_ploy_train_results$.pred, lm_ploy_train_results$.pred < 0, 0)
# Calculate R-squared and RMSE from the test results
rsq_lm_ploy <- rsq(lm_ploy_train_results,
truth = truth,
estimate = .pred
)
rmse_lm_ploy <- rmse(lm_ploy_train_results,
truth = truth,
estimate = .pred
)
rsq_lm_ploy
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.750
rmse_lm_ploy
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0912
Both R-squared and RMSE improve compared with the linear model built on all variables.
TODO: Try adding some interaction terms to the previous polynomial models.
# Add interaction terms to the poly regression built in previous step
# HINT: You could use `*` operator to create interaction terms such as HUMIDITY*TEMPERATURE and make the formula look like:
# RENTED_BIKE_COUNT ~ RAINFALL*HUMIDITY ...
lm_poly <- linear_reg(mode = "regression", engine = "lm") %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`, data = bike_sharing_training)
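As a reminder of how `*` behaves in an R formula: `A * B` expands to the main effects plus their interaction, `A + B + A:B`, while `:` adds only the product term. A tiny illustration on made-up data (the data frame `d` below is hypothetical):
# A * B expands to A + B + A:B; A:B alone adds just the interaction
d <- data.frame(y = rnorm(10), A = rnorm(10), B = rnorm(10))
coef(lm(y ~ A * B, data = d))        # (Intercept), A, B, A:B
coef(lm(y ~ A + B + A:B, data = d))  # identical expansion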
# Create prediction results (again on the training set)
lm_ploy_train_results <- lm_poly %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response", : prediction from rank-deficient fit; consider predict(.,
## rankdeficient="NA")
head(lm_ploy_train_results)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.604 0.764
## 2 0.0968 0.117
## 3 0.286 0.311
## 4 0.0534 0.0428
## 5 0.113 0.0535
## 6 0.125 0.0594
summary(lm_poly$fit)
##
## Call:
## stats::lm(formula = RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE,
## 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) +
## RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY +
## AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41489 -0.05576 0.00173 0.05107 0.41942
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.549848 0.066463 -8.273 < 2e-16 ***
## TEMPERATURE 1.819684 0.189917 9.581 < 2e-16 ***
## HUMIDITY 1.063231 0.138860 7.657 2.19e-14 ***
## WIND_SPEED 0.005198 0.010027 0.518 0.60424
## VISIBILITY -0.005023 0.005258 -0.955 0.33941
## DEW_POINT_TEMPERATURE -0.288932 0.146231 -1.976 0.04821 *
## SOLAR_RADIATION -0.005356 0.011112 -0.482 0.62981
## RAINFALL -10.714523 1.548085 -6.921 4.92e-12 ***
## SNOWFALL -0.070822 0.027240 -2.600 0.00935 **
## AUTUMN 0.092807 0.005998 15.473 < 2e-16 ***
## SPRING 0.055491 0.005743 9.662 < 2e-16 ***
## SUMMER 0.087942 0.007449 11.806 < 2e-16 ***
## WINTER NA NA NA NA
## HOLIDAY -0.019352 0.006650 -2.910 0.00363 **
## NO_HOLIDAY NA NA NA NA
## `0` -0.020703 0.008422 -2.458 0.01399 *
## `1` -0.049000 0.008302 -5.902 3.78e-09 ***
## `10` -0.052370 0.008043 -6.511 8.05e-11 ***
## `11` -0.043545 0.008355 -5.212 1.93e-07 ***
## `12` -0.020328 0.008538 -2.381 0.01730 *
## `13` -0.015407 0.008691 -1.773 0.07631 .
## `14` -0.013134 0.008544 -1.537 0.12432
## `15` 0.005290 0.008489 0.623 0.53321
## `16` 0.033488 0.008380 3.996 6.51e-05 ***
## `17` 0.100389 0.008405 11.944 < 2e-16 ***
## `18` 0.212488 0.008908 23.853 < 2e-16 ***
## `19` 0.138798 0.009045 15.345 < 2e-16 ***
## `2` -0.081110 0.008311 -9.759 < 2e-16 ***
## `20` 0.111049 0.008401 13.219 < 2e-16 ***
## `21` 0.111544 0.008383 13.306 < 2e-16 ***
## `22` 0.082552 0.008320 9.922 < 2e-16 ***
## `23` 0.015079 0.008325 1.811 0.07014 .
## `3` -0.101026 0.008432 -11.982 < 2e-16 ***
## `4` -0.121861 0.008390 -14.525 < 2e-16 ***
## `5` -0.114008 0.008273 -13.782 < 2e-16 ***
## `6` -0.069962 0.008293 -8.437 < 2e-16 ***
## `7` 0.018547 0.008292 2.237 0.02533 *
## `8` 0.099478 0.008536 11.654 < 2e-16 ***
## `9` NA NA NA NA
## poly(TEMPERATURE, 6)1 NA NA NA NA
## poly(TEMPERATURE, 6)2 -5.879974 0.872562 -6.739 1.74e-11 ***
## poly(TEMPERATURE, 6)3 -3.281459 0.141016 -23.270 < 2e-16 ***
## poly(TEMPERATURE, 6)4 -1.725049 0.127019 -13.581 < 2e-16 ***
## poly(TEMPERATURE, 6)5 0.263064 0.106424 2.472 0.01347 *
## poly(TEMPERATURE, 6)6 0.472738 0.097863 4.831 1.39e-06 ***
## poly(HUMIDITY, 6)1 NA NA NA NA
## poly(HUMIDITY, 6)2 -3.256619 0.283200 -11.499 < 2e-16 ***
## poly(HUMIDITY, 6)3 -0.378018 0.136635 -2.767 0.00568 **
## poly(HUMIDITY, 6)4 -0.225621 0.184915 -1.220 0.22246
## poly(HUMIDITY, 6)5 -0.522998 0.174343 -3.000 0.00271 **
## poly(HUMIDITY, 6)6 0.551869 0.139233 3.964 7.46e-05 ***
## poly(DEW_POINT_TEMPERATURE, 4)1 NA NA NA NA
## poly(DEW_POINT_TEMPERATURE, 4)2 7.657073 1.221024 6.271 3.82e-10 ***
## poly(DEW_POINT_TEMPERATURE, 4)3 -0.471446 0.135978 -3.467 0.00053 ***
## poly(DEW_POINT_TEMPERATURE, 4)4 0.289182 0.120485 2.400 0.01642 *
## HUMIDITY:RAINFALL 10.547804 1.571087 6.714 2.06e-11 ***
## TEMPERATURE:HUMIDITY -2.272540 0.236476 -9.610 < 2e-16 ***
## AUTUMN:HOLIDAY -0.059911 0.013010 -4.605 4.21e-06 ***
## AUTUMN:`18` 0.064934 0.014095 4.607 4.17e-06 ***
## AUTUMN:`19` 0.025843 0.013451 1.921 0.05473 .
## AUTUMN:`8` 0.066182 0.013930 4.751 2.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09241 on 6293 degrees of freedom
## Multiple R-squared: 0.7431, Adjusted R-squared: 0.7409
## F-statistic: 337.1 on 54 and 6293 DF, p-value: < 2.2e-16
# Check whether there are any negative predictions
lm_ploy_train_results %>%
  filter(.pred < 0)
## # A tibble: 564 × 2
## .pred truth
## <dbl> <dbl>
## 1 -0.0741 0.0124
## 2 -0.00374 0.0405
## 3 -0.00746 0.0366
## 4 -0.0630 0.00929
## 5 -0.0715 0.00760
## 6 -0.0739 0.0225
## 7 -0.0302 0.00619
## 8 -0.0273 0.0211
## 9 -0.0248 0.0214
## 10 -0.0507 0.0160
## # ℹ 554 more rows
# Assign zero to negative predictions
lm_ploy_train_results$.pred <- replace(lm_ploy_train_results$.pred, lm_ploy_train_results$.pred < 0, 0)
# Calculate R-squared and RMSE from the test results
rsq_lm_ploy <- rsq(lm_ploy_train_results,
truth = truth,
estimate = .pred
)
rmse_lm_ploy <- rmse(lm_ploy_train_results,
truth = truth,
estimate = .pred
)
rsq_lm_ploy
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.757
rmse_lm_ploy
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0899
I can see an improvement in both metrics.
TODO: Define a linear regression model specification `glmnet_spec` using the `glmnet` engine.
# HINT: Use linear_reg() function with two parameters: penalty and mixture
# - penalty controls the intensity of model regularization
# - mixture controls the tradeoff between L1 and L2 regularizations
# You could manually try different parameter combinations or use grid search to find optimal combinations
# First, create the model formula (a plain formula object, not a recipe)
lm_glmnet_formula <-
formula(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`)
# k-fold cross-validation
folds <- vfold_cv(bike_sharing_training, v = 10)
# define model
tune_spec <- linear_reg(penalty = tune(), mixture = tune()) %>%
set_engine("glmnet")
# Grid workflow
wf <- workflow() %>%
add_formula(lm_glmnet_formula)
# Define tuning values (note: the penalty range is on the log10 scale)
lambda_tune <- grid_regular(
  penalty(range = c(-3, 0.3)),
  levels = 50
)
mixture_tune <- grid_regular(
  mixture(range = c(0, 1)),
  levels = 50
)
# c() pairs the two 50-value sequences element by element, yielding 50 candidate pairs
mix_grid <- as.data.frame(c(lambda_tune, mixture_tune))
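Because `c()` pairs values rather than crossing them, only 50 of the possible 2,500 combinations are tried. If a full cross is wanted, `grid_regular()` can take both parameters at once; a minimal sketch, assuming the same ranges:
# Crossed alternative: every penalty paired with every mixture (50 x 50 = 2500 rows)
mix_grid_crossed <- grid_regular(
  penalty(range = c(-3, 0.3)),  # log10 scale
  mixture(range = c(0, 1)),
  levels = 50
)
The paired grid is much cheaper to evaluate, which is a reasonable trade-off here; the crossed grid explores the space more thoroughly at 50x the cost.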
# Run the grid search across the resamples
grid <- tune_grid(
wf %>% add_model(tune_spec),
resamples = folds,
grid = mix_grid
)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
## standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations A: x170
# print grid search results
show_best(grid, metric = "rmse")
## # A tibble: 5 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00117 0.0204 rmse standard 0.0939 10 0.000651 pre0_mod02_post0
## 2 0.00136 0.0408 rmse standard 0.0939 10 0.000651 pre0_mod03_post0
## 3 0.00159 0.0612 rmse standard 0.0939 10 0.000651 pre0_mod04_post0
## 4 0.00186 0.0816 rmse standard 0.0939 10 0.000651 pre0_mod05_post0
## 5 0.00217 0.102 rmse standard 0.0939 10 0.000652 pre0_mod06_post0
From the grid search above, the best combination by RMSE is a penalty of 0.001167742 and a mixture of 0.02040816, so I will use those values.
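Rather than hand-copying those numbers, tune can extract and apply them programmatically; a minimal sketch using `select_best()` and `finalize_workflow()`:
# Pull the best penalty/mixture pair by RMSE and finalize the workflow with it
best_params <- select_best(grid, metric = "rmse")
final_wf <- finalize_workflow(wf %>% add_model(tune_spec), best_params)
final_fit <- fit(final_wf, data = bike_sharing_training)
This keeps the chosen values tied to the tuning results instead of constants pasted into the code.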
lm_glmnet <- linear_reg(mode = "regression", engine = "glmnet", penalty = 0.001167742, mixture = 0.02040816) %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + poly(HUMIDITY, 6) + poly(DEW_POINT_TEMPERATURE, 4) + RAINFALL * HUMIDITY + HUMIDITY * TEMPERATURE + AUTUMN * HOLIDAY + AUTUMN * `18` + AUTUMN * `19` + AUTUMN * `8`, data = bike_sharing_training)
# Create prediction results (on the training set)
lm_glmnet_train_results <- lm_glmnet %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
head(lm_glmnet_train_results)
## # A tibble: 6 × 2
## .pred truth
## <dbl> <dbl>
## 1 0.599 0.764
## 2 0.101 0.117
## 3 0.281 0.311
## 4 0.0554 0.0428
## 5 0.121 0.0535
## 6 0.126 0.0594
# Check whether there are any negative predictions
lm_glmnet_train_results %>%
  filter(.pred < 0)
## # A tibble: 551 × 2
## .pred truth
## <dbl> <dbl>
## 1 -0.00519 0.0506
## 2 -0.0654 0.0124
## 3 -0.00345 0.0405
## 4 -0.0700 0.00929
## 5 -0.0000200 0.0557
## 6 -0.0683 0.00760
## 7 -0.0686 0.0225
## 8 -0.0518 0.00619
## 9 -0.0234 0.0211
## 10 -0.0194 0.0214
## # ℹ 541 more rows
# Assign zero to negative predictions
lm_glmnet_train_results$.pred <- replace(lm_glmnet_train_results$.pred, lm_glmnet_train_results$.pred < 0, 0)
# Calculate R-squared and RMSE from the test results
rsq_lm_glmnet <- rsq(lm_glmnet_train_results,
truth = truth,
estimate = .pred
)
rmse_lm_glmnet <- rmse(lm_glmnet_train_results,
truth = truth,
estimate = .pred
)
rsq_lm_glmnet
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.752
rmse_lm_glmnet
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0909
I don’t see any improvement here; this model is slightly worse than the one from the previous step.
TODO: Experiment by building and testing at least five different models. For each of your experiments, include polynomial terms, interaction terms, and one of the three regularizations we introduced.
Before building any models, I like to start with a correlation matrix to understand the relationship between each variable and the target variable.
bike_sharing_df_cor <- cor(bike_sharing_df, method = "pearson", use = "pairwise.complete.obs")
bike_sharing_df_cor[, 1]
## RENTED_BIKE_COUNT TEMPERATURE HUMIDITY
## 1.000000000 0.562809687 -0.201972670
## WIND_SPEED VISIBILITY DEW_POINT_TEMPERATURE
## 0.125021946 0.212322776 0.400262829
## SOLAR_RADIATION RAINFALL SNOWFALL
## 0.273861551 -0.128626093 -0.151610753
## AUTUMN SPRING SUMMER
## 0.165332742 0.015579791 0.282000797
## WINTER HOLIDAY NO_HOLIDAY
## -0.458919815 -0.070070005 0.070070005
## 0 1 10
## -0.054383329 -0.093147055 -0.059560345
## 11 12 13
## -0.035036035 -0.001928769 0.009423476
## 14 15 16
## 0.018012873 0.041641074 0.075704251
## 17 18 19
## 0.145515179 0.267890160 0.164534951
## 2 20 21
## -0.135030231 0.122161147 0.109563181
## 22 23 3
## 0.073076646 -0.011437275 -0.168084792
## 4 5 6
## -0.191872210 -0.189689701 -0.139760076
## 7 8 9
## -0.033305460 0.104274851 -0.019880387
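To make the strongest predictors easier to spot, the same column can be ranked by absolute correlation; a small sketch:
# Rank predictors by absolute Pearson correlation with RENTED_BIKE_COUNT
sort(abs(bike_sharing_df_cor[, 1]), decreasing = TRUE)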
# Build at least five different models using polynomial terms, interaction terms, and regularizations.
# Model 1 - I will keep the lm_poly model created before.
# Model 2 - top correlated variables, one polynomial term, interaction terms & L1 (lasso) regularization (mixture = 1)
model2 <- linear_reg(penalty = 0.02, mixture = 1) %>%
set_engine("glmnet")
model2_fit <- model2 %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + DEW_POINT_TEMPERATURE + SOLAR_RADIATION + SUMMER * `18` + HUMIDITY, data = bike_sharing_training)
model2_train_results <- model2_fit %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
model2_train_results$.pred <- replace(model2_train_results$.pred, model2_train_results$.pred < 0, 0) # clip negative predictions to zero
rsq_model2 <- rsq(model2_train_results,
truth = truth,
estimate = .pred
)
rmse_model2 <- rmse(model2_train_results,
truth = truth,
estimate = .pred
)
# Model 3 - top correlated variables, two polynomial terms, interaction terms & elastic-net regularization
model3 <- linear_reg(penalty = 0.02, mixture = 0.2) %>%
set_engine("glmnet")
model3_fit <- model3 %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + poly(DEW_POINT_TEMPERATURE, 6) + SOLAR_RADIATION + SUMMER * `18` + TEMPERATURE * HUMIDITY, data = bike_sharing_training)
model3_train_results <- model3_fit %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
model3_train_results$.pred <- replace(model3_train_results$.pred, model3_train_results$.pred < 0, 0) # clip negative predictions to zero
rsq_model3 <- rsq(model3_train_results,
truth = truth,
estimate = .pred
)
rmse_model3 <- rmse(model3_train_results,
truth = truth,
estimate = .pred
)
# Model 4 - top correlated variables, four polynomial terms, interaction terms & elastic-net regularization
model4 <- linear_reg(penalty = 0.0015, mixture = 0.2) %>%
set_engine("glmnet")
model4_fit <- model4 %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + poly(DEW_POINT_TEMPERATURE, 6) + poly(SOLAR_RADIATION, 6) + SUMMER * `18` + TEMPERATURE * HUMIDITY + poly(HUMIDITY, 6), data = bike_sharing_training)
model4_train_results <- model4_fit %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
model4_train_results$.pred <- replace(model4_train_results$.pred, model4_train_results$.pred < 0, 0) # clip negative predictions to zero
rsq_model4 <- rsq(model4_train_results,
truth = truth,
estimate = .pred
)
rmse_model4 <- rmse(model4_train_results,
truth = truth,
estimate = .pred
)
# Model 5 - top correlated variables, five polynomial terms, interaction terms & elastic-net regularization
model5 <- linear_reg(penalty = 0.0015, mixture = 0.2) %>%
set_engine("glmnet")
model5_fit <- model5 %>%
fit(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6) + WINTER * `18` + poly(DEW_POINT_TEMPERATURE, 6) + poly(SOLAR_RADIATION, 6) + poly(VISIBILITY, 6) + SUMMER * `18` + TEMPERATURE * HUMIDITY + poly(HUMIDITY, 6) + RAINFALL * TEMPERATURE + SNOWFALL * TEMPERATURE + RAINFALL * HUMIDITY + SNOWFALL * HUMIDITY, data = bike_sharing_training)
model5_train_results <- model5_fit %>%
predict(new_data = bike_sharing_training) %>%
mutate(truth = bike_sharing_training$RENTED_BIKE_COUNT)
model5_train_results$.pred <- replace(model5_train_results$.pred, model5_train_results$.pred < 0, 0)
rsq_model5 <- rsq(model5_train_results,
truth = truth,
estimate = .pred
)
rmse_model5 <- rmse(model5_train_results,
truth = truth,
estimate = .pred
)
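The five experiments all repeat the same fit/predict/clip/score pipeline, so a small helper could keep them consistent. This is a hedged sketch; `score_model` is a hypothetical name, not part of the code above:
# Hypothetical helper: fit a glmnet model, predict on the given data,
# clip negative predictions to zero, and return rsq and rmse together
score_model <- function(model_formula, penalty, mixture, data) {
  model_fit <- linear_reg(penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet") %>%
    fit(model_formula, data = data)
  results <- model_fit %>%
    predict(new_data = data) %>%
    mutate(truth = data$RENTED_BIKE_COUNT,
           .pred = pmax(.pred, 0))  # negative bike counts are impossible
  bind_rows(rsq(results, truth, .pred),
            rmse(results, truth, .pred))
}
# Example call with model 4's settings:
# score_model(formula(RENTED_BIKE_COUNT ~ . + poly(TEMPERATURE, 6)), 0.0015, 0.2, bike_sharing_training)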
# Save their rmse and rsq values
rsq_lm_ploy
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.757
rmse_lm_ploy
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0899
rsq_model2
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.534
rmse_model2
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.130
rsq_model3
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.713
rmse_model3
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0998
rsq_model4
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.760
rmse_model4
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0889
rsq_model5
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.774
rmse_model5
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0868
I find the last model, model 5, to be the best so far. It closely resembles the earlier polynomial model, with the same penalty and mixture, but adds more polynomial and interaction terms.
TODO: Visualize the saved RMSE and R-squared values using a grouped barchart
model_names <- c("lm_ploy", "model2", "model3", "model4", "model5")
rsq <- c(0.757, 0.534, 0.713, 0.760, 0.774)
rmse <- c(0.0899, 0.130, 0.0998, 0.0889, 0.0868)
comparison_df <- data.frame(model_names, rsq, rmse)
comparison_df %>%
  pivot_longer(!model_names) %>%
  ggplot(aes(x = model_names, y = value, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparing RMSE and RSQ Across Models", fill = "Metric")
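As an alternative to retyping the numbers, the table can be assembled from the metric tibbles saved above; a sketch using the objects already in memory:
# Build the comparison directly from the saved yardstick results
comparison_from_metrics <- bind_rows(
  list(lm_ploy = bind_rows(rsq_lm_ploy, rmse_lm_ploy),
       model2  = bind_rows(rsq_model2, rmse_model2),
       model3  = bind_rows(rsq_model3, rmse_model3),
       model4  = bind_rows(rsq_model4, rmse_model4),
       model5  = bind_rows(rsq_model5, rmse_model5)),
  .id = "model_names"
)
comparison_from_metrics %>%
  ggplot(aes(x = model_names, y = .estimate, fill = .metric)) +
  geom_col(position = "dodge") +
  labs(title = "Comparing RMSE and RSQ Across Models", fill = "Metric")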
TODO: Create a Q-Q plot by plotting the distribution difference between the predictions generated by your best model and the true values on the test dataset.
ggplot(model5_train_results) +
stat_qq(aes(sample = truth), color = "green") +
stat_qq(aes(sample = .pred), color = "red")
# Compare the summary statistics of the true values from the bike sharing dataset with the predicted values
bike_sharing_df %>%
  filter(SPRING == 1) %>%
  select(RENTED_BIKE_COUNT) %>%
  summary()
## RENTED_BIKE_COUNT
## Min. :0.00000
## 1st Qu.:0.06275
## Median :0.16798
## Mean :0.20941
## 3rd Qu.:0.31401
## Max. :0.91418
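The chunk above summarizes only the true values; for the predicted side of the comparison, the same summary can be run on the best model's results (a hedged sketch, using the training-set predictions from model 5):
# Summary statistics for model 5 predictions and their matching truth values
summary(model5_train_results$.pred)
summary(model5_train_results$truth)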