knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(dplyr)
library(plotly)
library(reshape2)
library(ggplot2)
library(htmlwidgets)
This report analyzes the “Bike Sharing Dataset” using a linear regression model. The objective is to understand the variables influencing daily bike rentals. We begin by loading the data and denormalizing the features to restore their original physical scales (Celsius, humidity percentage, etc.)
# Load data
day_data <- read.csv("Bike-Sharing-Dataset/day.csv")
# Denormalize features
day_data <- day_data %>%
mutate(
temp = temp * 41,
atemp = atemp * 50,
hum = hum * 100,
windspeed = windspeed * 67,
days_since_2011 = instant - 1,
MISTY = ifelse(weathersit == 2, 1, 0),
RAIN = ifelse(weathersit == 3 | weathersit == 4, 1, 0)
)
To interpret the seasonal impact without falling into the “dummy variable trap,” we perform explicit One-Hot Encoding for Summer, Fall, and Winter, leaving Spring as the baseline category.
# ONE-HOT ENCODING
day_data <- day_data %>%
mutate(
season_summer = ifelse(season == 2, 1, 0),
season_fall = ifelse(season == 3, 1, 0),
season_winter = ifelse(season == 4, 1, 0)
)
model <- lm(cnt ~ workingday + holiday + season_summer + season_fall + season_winter +
MISTY + RAIN + temp + hum + windspeed + days_since_2011, data = day_data)
# summary
model_summary <- summary(model)
model_summary
##
## Call:
## lm(formula = cnt ~ workingday + holiday + season_summer + season_fall +
## season_winter + MISTY + RAIN + temp + hum + windspeed + days_since_2011,
## data = day_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3509.6 -397.9 78.7 534.1 3482.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1513.7656 245.7812 6.159 1.22e-09 ***
## workingday 124.9209 73.2666 1.705 0.088623 .
## holiday -686.1154 203.3015 -3.375 0.000778 ***
## season_summer 899.3182 122.2833 7.354 5.24e-13 ***
## season_fall 138.2154 161.7037 0.855 0.392977
## season_winter 425.6029 110.8199 3.840 0.000134 ***
## MISTY -379.3985 87.5532 -4.333 1.68e-05 ***
## RAIN -1901.5399 223.6400 -8.503 < 2e-16 ***
## temp 126.9110 8.0740 15.718 < 2e-16 ***
## hum -17.3772 3.1694 -5.483 5.80e-08 ***
## windspeed -42.5135 6.8917 -6.169 1.15e-09 ***
## days_since_2011 4.9264 0.1728 28.507 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 886.9 on 719 degrees of freedom
## Multiple R-squared: 0.7936, Adjusted R-squared: 0.7904
## F-statistic: 251.2 on 11 and 719 DF, p-value: < 2.2e-16
The following visualizations provide insights into the model’s behavior, showing how each feature contributes to the predictions.
This chart displays the raw coefficients. It shows the expected change in bike rentals for a one-unit increase in each feature, holding other variables constant.
coef_data <- as.data.frame(model_summary$coefficients)
coef_data$feature <- rownames(coef_data)
coef_data_no_int <- coef_data[coef_data$feature != "(Intercept)", ]
p1 <- plot_ly(coef_data_no_int, x = ~Estimate, y = ~reorder(feature, Estimate), type = "bar", orientation = "h",
error_x = list(type = "data", array = ~`Std. Error`)) %>%
layout(title = "Weights of the linear model (unscaled)",
xaxis = list(title = "Coefficient Estimate (Rentals change per unit)"),
yaxis = list(title = "Feature"))
# Save as PNG
p1_gg <- ggplot(coef_data_no_int, aes(x = Estimate, y = reorder(feature, Estimate))) +
geom_col(fill = "steelblue") +
geom_errorbar(aes(xmin = Estimate - `Std. Error`, xmax = Estimate + `Std. Error`), width = 0.2) +
labs(title = "Weights of the linear model (unscaled)", x = "Coefficient Estimate", y = "Feature") +
theme_minimal()
ggsave("plots/weight_plot_unscaled.png", p1_gg, width = 10, height = 6)
# Save as HTML
saveWidget(p1, "plots/weight_plot_unscaled.html", selfcontained = TRUE)
p1
Since variables like temp and workingday have different units, we scale them to compare their relative importance. This reveals which feature has the strongest overall “drive” on the model.
features_to_scale <- c("workingday", "holiday", "season_summer", "season_fall", "season_winter",
"MISTY", "RAIN", "temp", "hum", "windspeed", "days_since_2011")
scaled_df <- day_data %>%
select(all_of(features_to_scale)) %>%
scale() %>%
as.data.frame()
scaled_df$cnt <- day_data$cnt
scaled_model <- lm(cnt ~ ., data = scaled_df)
scaled_summary <- summary(scaled_model)
coef_scaled <- as.data.frame(scaled_summary$coefficients)
coef_scaled$feature <- rownames(coef_scaled)
coef_scaled <- coef_scaled[coef_scaled$feature != "(Intercept)", ]
p2 <- plot_ly(coef_scaled, x = ~Estimate, y = ~reorder(feature, Estimate), type = "bar", orientation = "h",marker = list(color = "darkgreen"),
error_x = list(type = "data", array = ~`Std. Error`)) %>%
layout(title = "Weights of the linear model (scaled)",
xaxis = list(title = "Coefficient Estimate (Std. Dev. units)"),
yaxis = list(title = "Feature"))
# Save as PNG
p2_gg <- ggplot(coef_scaled, aes(x = Estimate, y = reorder(feature, Estimate))) +
geom_col(fill = "darkgreen") +
geom_errorbar(aes(xmin = Estimate - `Std. Error`, xmax = Estimate + `Std. Error`), width = 0.2) +
labs(title = "Weights of the linear model (scaled)", x = "Coefficient Estimate", y = "Feature") +
theme_minimal()
ggsave("plots/weight_plot_scaled.png", p2_gg, width = 10, height = 6)
# Save as HTML
saveWidget(p2, "plots/weight_plot_scaled.html", selfcontained = TRUE)
p2
The Effect Plot provides a deeper look into the model by calculating the actual contribution of each feature to the predicted value (calculated as Effect=Weight×Value). Unlike weights alone, this visualization accounts for the actual data distribution in our dataset.
weights <- model$coefficients[-1]
features_matrix <- model.matrix(model)[,-1]
effects <- as.data.frame(t(t(features_matrix) * weights))
effects_melted <- melt(effects)
feature_order <- effects_melted %>%
group_by(variable) %>%
summarise(mean_effect = mean(value)) %>%
arrange(mean_effect) %>%
pull(variable)
effects_melted$variable <- factor(effects_melted$variable, levels = feature_order)
p3 <- plot_ly(effects_melted, y = ~variable, x = ~value, type = "box", boxpoints = FALSE) %>%
layout(title = "Effect Plot (Global interpretation)",
xaxis = list(title = "Effect (weight * value)"),
yaxis = list(title = "Feature"))
# Save static PNG
p3_gg <- ggplot(effects_melted, aes(x = value, y = variable)) +
geom_boxplot(fill = "steelblue", color = "black") +
labs(title = "Effect Plot (Ordered by Mean Effect)", x = "Effect (weight * value)", y = "Feature") +
theme_minimal()
ggsave("plots/effect_plot.png", p3_gg, width = 10, height = 6)
# Save Interactive HTMLs[cite: 1]
saveWidget(p3, "plots/effect_plot.html", selfcontained = TRUE)
p3
This boxplot shows the range of impact for each variable across the entire dataset. Variables centered far from zero (like temp or days_since_2011) are the primary drivers of rental volume. Negative values for RAIN or hum confirm that bad weather consistently reduces bike usage.
While the previous plots show global trends, we can also perform a Local Explanation to understand why the model made a specific prediction for a single day. In this case, we examine the 6th instance (January 6th, 2011).
instance_index <- 6
instance_effects <- effects[instance_index, ]
instance_effects_df <- data.frame(variable = names(instance_effects), value = as.numeric(instance_effects))
p4 <- p3 %>%
add_markers(data = instance_effects_df, x = ~value, y = ~variable,
marker = list(color = "red", symbol = "x", size = 10),
name = "6th instance") %>%
layout(title = "Effect Plot with Local Explanation (6th instance)")
# Save as PNG
p4_gg <- p3_gg +
geom_point(data = instance_effects_df, aes(x = value, y = variable), color = "red", shape = 4, size = 5, stroke = 2) +
labs(title = "Effect Plot with 6th instance (Local Explanation)")
ggsave("plots/effect_plot_6th_instance.png", p4_gg, width = 10, height = 6)
# Save as html
saveWidget(p4, "plots/effect_plot_6th_instance.html", selfcontained = TRUE)
p4
cat("Adjusted R-squared:", model_summary$adj.r.squared, "\n")
## Adjusted R-squared: 0.7903921
cat("Residual Standard Error:", model_summary$sigma, "\n")
## Residual Standard Error: 886.9126
cat("Real count (cnt):", day_data$cnt[instance_index], "\n")
## Real count (cnt): 1606
cat("Predicted count:", predict(model, day_data[instance_index, ]), "\n")
## Predicted count: 1570.903
cat("Average prediction in dataset:", mean(predict(model)), "\n")
## Average prediction in dataset: 4504.349
cat("Intercept:", model$coefficients[1], "\n")
## Intercept: 1513.766
print(instance_effects_df)
## variable value
## 1 workingday 124.92094
## 2 holiday 0.00000
## 3 season_summer 0.00000
## 4 season_fall 0.00000
## 5 season_winter 0.00000
## 6 MISTY 0.00000
## 7 RAIN 0.00000
## 8 temp 1063.29423
## 9 hum -900.59247
## 10 windspeed -255.11775
## 11 days_since_2011 24.63216