set.seed(48)
library(tidymodels)
library(tidyverse)
library(magrittr)
library(glmnet)
library(ranger)
dat <- read_csv("hour.csv")
glimpse(dat)## Observations: 17,379
## Variables: 17
## $ instant <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ dteday <date> 2011-01-01, 2011-01-01, 2011-01-01, 2011-01-01, 20...
## $ season <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ yr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mnth <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ hr <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ holiday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ weekday <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ weathersit <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...
## $ temp <dbl> 0.24, 0.22, 0.22, 0.24, 0.24, 0.24, 0.22, 0.20, 0.2...
## $ atemp <dbl> 0.2879, 0.2727, 0.2727, 0.2879, 0.2879, 0.2576, 0.2...
## $ hum <dbl> 0.81, 0.80, 0.80, 0.75, 0.75, 0.75, 0.80, 0.86, 0.7...
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0896, 0.0...
## $ casual <dbl> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 4...
## $ registered <dbl> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 7...
## $ cnt <dbl> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, ...
Until, There are 2 dependent variables casual and registerd, there are two seperate train and test sets(dat_train_c, dat_test_c), (dat_train_r, dat_test_r)
rec_casual <-
recipe(casual ~ ., data = dat_train_c) %>%
step_corr(all_numeric(), threshold = 0.85) %>% # remove high correlated data
step_dummy(all_nominal()) %>% # make dummy variables
step_center(all_predictors()) %>% # centerize variables
step_scale(all_predictors()) %>% # scale from 0 to 1
prep(dat_train_c)
rec_registered <- recipe(registered ~ ., data = dat_train_r) %>%
step_corr(all_numeric(), threshold = 0.85) %>% # remove high correlated data
step_dummy(all_nominal()) %>% # make dummy variables
step_center(all_predictors()) %>% # centerize variables
step_scale(all_predictors()) %>% # scale from 0 to 1
prep(dat_train_r)
dat_train_c <- bake(rec_casual, dat_train_c)
dat_train_r <- bake(rec_registered, dat_train_r)
dat_test_c <- bake(rec_casual, dat_test)
dat_test_r <- bake(rec_registered, dat_test)rf_fit_c <- rand_forest(mode = "regression",
mtry = .preds(),
trees = 1000) %>%
set_engine("ranger", importance = "permutation") %>%
fit(casual ~ ., data = dat_train_c)
rf_fit_r <- rand_forest(mode = "regression",
mtry = .preds(),
trees = 1000) %>%
set_engine("ranger", importance = "permutation") %>%
fit(registered ~ ., data = dat_train_r)result_c <- data_frame(truth = dat_test_c$casual) %>%
bind_cols(predict(rf_fit_c, new_data = dat_test_c))
metrics(result_c, truth, .pred)| .metric | .estimator | .estimate |
|---|---|---|
| rmse | standard | 14.7234205 |
| rsq | standard | 0.9061674 |
| mae | standard | 8.3744170 |
result_r <- data_frame(truth = dat_test_r$registered) %>%
bind_cols(predict(rf_fit_r, new_data = dat_test_r))
metrics(result_r, truth, .pred)| .metric | .estimator | .estimate |
|---|---|---|
| rmse | standard | 34.0137372 |
| rsq | standard | 0.9495264 |
| mae | standard | 19.8867131 |
Both models have high r-squared values, and low root mean squared errors, thus, models are appropriate.
importance <- data_frame(variables = importance(rf_fit_c$fit) %>% names(),
casual = importance(rf_fit_c$fit) /6248.227 * 100,
registered = importance(rf_fit_r$fit) /56126.55 * 100)
importance %>% arrange(desc(casual+registered)) %>% head(10)| variables | casual | registered |
|---|---|---|
| hr | 34.4585433 | 58.3953602 |
| workingday_X1 | 24.6158073 | 15.9578492 |
| temp | 22.8690097 | 7.9771631 |
| yr | 4.0751190 | 8.8272759 |
| hum | 5.7813185 | 2.4731642 |
| mnth | 3.0234222 | 2.7846933 |
| season_X4 | 0.1670428 | 1.2643933 |
| weekday_X5 | 0.8749412 | 0.4848199 |
| weathersit_X3 | 0.3079909 | 0.8608652 |
| windspeed | 0.6382454 | 0.2059898 |
It seems that hour, workingday, temperature, year variables are important variables.
| workingday | total |
|---|---|
| 0 | 181.4053 |
| 1 | 193.2078 |
dat %>%
group_by(hr, workingday) %>%
summarise(total = mean(cnt)) %>%
ggplot(aes(x = hr, y = total, color = workingday)) +
geom_line()On working day, the total is slightly higher, and There is different pattern on working day or not.
| yr | total |
|---|---|
| 0 | 143.7944 |
| 1 | 234.6664 |
p1 <- dat %>%
ggplot(aes(x = temp, y = cnt)) +
geom_smooth()
p2 <- dat %>%
group_by(mnth) %>%
summarise(total = mean(cnt)) %>%
ggplot(aes(x = mnth, y = total)) +
geom_line() +
scale_x_continuous(breaks = 1:12)
multiplot(p1, p2, cols = 2)There is growth from 2011 to 2012, and there is positive relation between temperature and total customers.
Due to this relation, we can see there is more consumer on summer.