Bike Sharing

Load libraries & data

set.seed(48)
library(tidymodels)
library(tidyverse)
library(magrittr)
library(glmnet)
library(ranger)
dat <- read_csv("hour.csv")
glimpse(dat)

## Observations: 17,379
## Variables: 17
## $ instant    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ dteday     <date> 2011-01-01, 2011-01-01, 2011-01-01, 2011-01-01, 20...
## $ season     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ yr         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mnth       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ hr         <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ holiday    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ weekday    <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ weathersit <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...
## $ temp       <dbl> 0.24, 0.22, 0.22, 0.24, 0.24, 0.24, 0.22, 0.20, 0.2...
## $ atemp      <dbl> 0.2879, 0.2727, 0.2727, 0.2879, 0.2879, 0.2576, 0.2...
## $ hum        <dbl> 0.81, 0.80, 0.80, 0.75, 0.75, 0.75, 0.80, 0.86, 0.7...
## $ windspeed  <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0896, 0.0...
## $ casual     <dbl> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 4...
## $ registered <dbl> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 7...
## $ cnt        <dbl> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, ...

Remove unnecessary variables & Make nominal variables as factor

dat <- dat %>% select(-instant, -dteday)

dat$season %<>% as.factor
dat$holiday %<>% as.factor
dat$weekday %<>% as.factor
dat$workingday %<>% as.factor
dat$weathersit %<>% as.factor

Split data

Until, There are 2 dependent variables casual and registerd, there are two seperate train and test sets(dat_train_c, dat_test_c), (dat_train_r, dat_test_r)

dat_split <- initial_split(dat, 0.8)
dat_train <- training(dat_split)
dat_test <- testing(dat_split)


dat_train_c <- dat_train %>% select(-cnt, -registered)
dat_train_r <- dat_train %>% select(-cnt, -casual)

Data Pre-process

rec_casual <-
  recipe(casual ~ ., data = dat_train_c) %>%
  step_corr(all_numeric(), threshold = 0.85) %>% # remove high correlated data
  step_dummy(all_nominal()) %>% # make dummy variables
  step_center(all_predictors()) %>% # centerize variables
  step_scale(all_predictors()) %>% # scale from 0 to 1
  prep(dat_train_c)

rec_registered <- recipe(registered ~ ., data = dat_train_r) %>% 
  step_corr(all_numeric(), threshold = 0.85) %>% # remove high correlated data
  step_dummy(all_nominal()) %>% # make dummy variables
  step_center(all_predictors()) %>% # centerize variables
  step_scale(all_predictors()) %>% # scale from 0 to 1
  prep(dat_train_r)


dat_train_c <- bake(rec_casual, dat_train_c)
dat_train_r <- bake(rec_registered, dat_train_r)
dat_test_c <- bake(rec_casual, dat_test)
dat_test_r <- bake(rec_registered, dat_test)

Training Random Forest Models

rf_fit_c <- rand_forest(mode = "regression",
                          mtry = .preds(),
                          trees = 1000) %>%
  set_engine("ranger", importance = "permutation") %>% 
  fit(casual ~ ., data = dat_train_c)

rf_fit_r <- rand_forest(mode = "regression",
                          mtry = .preds(),
                          trees = 1000) %>%
  set_engine("ranger", importance = "permutation") %>% 
  fit(registered ~ ., data = dat_train_r)

Testing models

result_c <- data_frame(truth = dat_test_c$casual) %>%
  bind_cols(predict(rf_fit_c, new_data = dat_test_c))

metrics(result_c, truth, .pred)

.metric	.estimator	.estimate
rmse	standard	14.7234205
rsq	standard	0.9061674
mae	standard	8.3744170

result_r <- data_frame(truth = dat_test_r$registered) %>%
  bind_cols(predict(rf_fit_r, new_data = dat_test_r))

metrics(result_r, truth, .pred)

.metric	.estimator	.estimate
rmse	standard	34.0137372
rsq	standard	0.9495264
mae	standard	19.8867131

Both models have high r-squared values, and low root mean squared errors, thus, models are appropriate.

Important Variables

importance <- data_frame(variables = importance(rf_fit_c$fit) %>% names(),
                         casual = importance(rf_fit_c$fit) /6248.227 * 100,
                         registered = importance(rf_fit_r$fit) /56126.55 * 100)

importance %>% arrange(desc(casual+registered)) %>% head(10)

variables	casual	registered
hr	34.4585433	58.3953602
workingday_X1	24.6158073	15.9578492
temp	22.8690097	7.9771631
yr	4.0751190	8.8272759
hum	5.7813185	2.4731642
mnth	3.0234222	2.7846933
season_X4	0.1670428	1.2643933
weekday_X5	0.8749412	0.4848199
weathersit_X3	0.3079909	0.8608652
windspeed	0.6382454	0.2059898

It seems that hour, workingday, temperature, year variables are important variables.

dat %>%
  group_by(workingday) %>%
  summarise(total = mean(cnt))

workingday	total
0	181.4053
1	193.2078

dat %>%
  group_by(hr, workingday) %>%
  summarise(total = mean(cnt)) %>%
  ggplot(aes(x = hr, y = total, color = workingday)) +
  geom_line()

On working day, the total is slightly higher, and There is different pattern on working day or not.

dat %>%
  group_by(yr) %>%
  summarise(total = mean(cnt))

yr	total
0	143.7944
1	234.6664

p1 <- dat %>%
  ggplot(aes(x = temp, y = cnt)) +
  geom_smooth()

p2 <- dat %>%
  group_by(mnth) %>%
  summarise(total = mean(cnt)) %>%
  ggplot(aes(x = mnth, y = total)) +
  geom_line() +
  scale_x_continuous(breaks = 1:12)

multiplot(p1, p2, cols = 2)

There is growth from 2011 to 2012, and there is positive relation between temperature and total customers.

Due to this relation, we can see there is more consumer on summer.