knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(dplyr)
library(plotly)
library(reshape2)
library(ggplot2)
library(htmlwidgets)

1. Introduction

This report analyzes the “Bike Sharing Dataset” using a linear regression model. The objective is to understand the variables influencing daily bike rentals. We begin by loading the data and denormalizing the features to restore their original physical scales (Celsius, humidity percentage, etc.)

# Load data
day_data <- read.csv("Bike-Sharing-Dataset/day.csv")

# Denormalize features
day_data <- day_data %>%
  mutate(
    temp = temp * 41,
    atemp = atemp * 50,
    hum = hum * 100,
    windspeed = windspeed * 67,
    days_since_2011 = instant - 1,
    MISTY = ifelse(weathersit == 2, 1, 0),
    RAIN = ifelse(weathersit == 3 | weathersit == 4, 1, 0)
  )

2. Linear Model Construction

To interpret the seasonal impact without falling into the “dummy variable trap,” we perform explicit One-Hot Encoding for Summer, Fall, and Winter, leaving Spring as the baseline category.

# ONE-HOT ENCODING 
day_data <- day_data %>%
  mutate(
    season_summer = ifelse(season == 2, 1, 0),
    season_fall   = ifelse(season == 3, 1, 0),
    season_winter = ifelse(season == 4, 1, 0)
  )


model <- lm(cnt ~ workingday + holiday + season_summer + season_fall + season_winter + 
            MISTY + RAIN + temp + hum + windspeed + days_since_2011, data = day_data)

# summary
model_summary <- summary(model)
model_summary
## 
## Call:
## lm(formula = cnt ~ workingday + holiday + season_summer + season_fall + 
##     season_winter + MISTY + RAIN + temp + hum + windspeed + days_since_2011, 
##     data = day_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3509.6  -397.9    78.7   534.1  3482.4 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1513.7656   245.7812   6.159 1.22e-09 ***
## workingday        124.9209    73.2666   1.705 0.088623 .  
## holiday          -686.1154   203.3015  -3.375 0.000778 ***
## season_summer     899.3182   122.2833   7.354 5.24e-13 ***
## season_fall       138.2154   161.7037   0.855 0.392977    
## season_winter     425.6029   110.8199   3.840 0.000134 ***
## MISTY            -379.3985    87.5532  -4.333 1.68e-05 ***
## RAIN            -1901.5399   223.6400  -8.503  < 2e-16 ***
## temp              126.9110     8.0740  15.718  < 2e-16 ***
## hum               -17.3772     3.1694  -5.483 5.80e-08 ***
## windspeed         -42.5135     6.8917  -6.169 1.15e-09 ***
## days_since_2011     4.9264     0.1728  28.507  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 886.9 on 719 degrees of freedom
## Multiple R-squared:  0.7936, Adjusted R-squared:  0.7904 
## F-statistic: 251.2 on 11 and 719 DF,  p-value: < 2.2e-16

3. Visual Interpretation of Weights

The following visualizations provide insights into the model’s behavior, showing how each feature contributes to the predictions.

3.1. Weight Plot (Unscaled)

This chart displays the raw coefficients. It shows the expected change in bike rentals for a one-unit increase in each feature, holding other variables constant.

coef_data <- as.data.frame(model_summary$coefficients)
coef_data$feature <- rownames(coef_data)
coef_data_no_int <- coef_data[coef_data$feature != "(Intercept)", ]

p1 <- plot_ly(coef_data_no_int, x = ~Estimate, y = ~reorder(feature, Estimate), type = "bar", orientation = "h",
              error_x = list(type = "data", array = ~`Std. Error`)) %>%
  layout(title = "Weights of the linear model (unscaled)",
         xaxis = list(title = "Coefficient Estimate (Rentals change per unit)"),
         yaxis = list(title = "Feature"))

# Save as PNG
p1_gg <- ggplot(coef_data_no_int, aes(x = Estimate, y = reorder(feature, Estimate))) +
  geom_col(fill = "steelblue") +
  geom_errorbar(aes(xmin = Estimate - `Std. Error`, xmax = Estimate + `Std. Error`), width = 0.2) +
  labs(title = "Weights of the linear model (unscaled)", x = "Coefficient Estimate", y = "Feature") +
  theme_minimal()
ggsave("plots/weight_plot_unscaled.png", p1_gg, width = 10, height = 6)

# Save as HTML
saveWidget(p1, "plots/weight_plot_unscaled.html", selfcontained = TRUE)

p1

3.2. Weight Plot (Scaled)

Since variables like temp and workingday have different units, we scale them to compare their relative importance. This reveals which feature has the strongest overall “drive” on the model.

features_to_scale <- c("workingday", "holiday", "season_summer", "season_fall", "season_winter", 
                      "MISTY", "RAIN", "temp", "hum", "windspeed", "days_since_2011")

scaled_df <- day_data %>%
  select(all_of(features_to_scale)) %>%
  scale() %>%
  as.data.frame()

scaled_df$cnt <- day_data$cnt

scaled_model <- lm(cnt ~ ., data = scaled_df)
scaled_summary <- summary(scaled_model)

coef_scaled <- as.data.frame(scaled_summary$coefficients)
coef_scaled$feature <- rownames(coef_scaled)
coef_scaled <- coef_scaled[coef_scaled$feature != "(Intercept)", ]

p2 <- plot_ly(coef_scaled, x = ~Estimate, y = ~reorder(feature, Estimate), type = "bar", orientation = "h",marker = list(color = "darkgreen"),
              error_x = list(type = "data", array = ~`Std. Error`)) %>%
  layout(title = "Weights of the linear model (scaled)",
         xaxis = list(title = "Coefficient Estimate (Std. Dev. units)"),
         yaxis = list(title = "Feature"))

# Save as PNG
p2_gg <- ggplot(coef_scaled, aes(x = Estimate, y = reorder(feature, Estimate))) +
  geom_col(fill = "darkgreen") +
  geom_errorbar(aes(xmin = Estimate - `Std. Error`, xmax = Estimate + `Std. Error`), width = 0.2) +
  labs(title = "Weights of the linear model (scaled)", x = "Coefficient Estimate", y = "Feature") +
  theme_minimal()
ggsave("plots/weight_plot_scaled.png", p2_gg, width = 10, height = 6)

# Save as HTML
saveWidget(p2, "plots/weight_plot_scaled.html", selfcontained = TRUE)

p2

3.3. Effect Plot (Global Interpretation)

The Effect Plot provides a deeper look into the model by calculating the actual contribution of each feature to the predicted value (calculated as Effect=Weight×Value). Unlike weights alone, this visualization accounts for the actual data distribution in our dataset.

weights <- model$coefficients[-1] 
features_matrix <- model.matrix(model)[,-1]
effects <- as.data.frame(t(t(features_matrix) * weights))

effects_melted <- melt(effects)

feature_order <- effects_melted %>%
  group_by(variable) %>%
  summarise(mean_effect = mean(value)) %>%
  arrange(mean_effect) %>%
  pull(variable)

effects_melted$variable <- factor(effects_melted$variable, levels = feature_order)

p3 <- plot_ly(effects_melted, y = ~variable, x = ~value, type = "box", boxpoints = FALSE) %>%
  layout(title = "Effect Plot (Global interpretation)",
         xaxis = list(title = "Effect (weight * value)"),
         yaxis = list(title = "Feature"))

# Save static PNG
p3_gg <- ggplot(effects_melted, aes(x = value, y = variable)) +
  geom_boxplot(fill = "steelblue", color = "black") +
  labs(title = "Effect Plot (Ordered by Mean Effect)", x = "Effect (weight * value)", y = "Feature") +
  theme_minimal()
ggsave("plots/effect_plot.png", p3_gg, width = 10, height = 6)

# Save Interactive HTMLs[cite: 1]
saveWidget(p3, "plots/effect_plot.html", selfcontained = TRUE)

p3

This boxplot shows the range of impact for each variable across the entire dataset. Variables centered far from zero (like temp or days_since_2011) are the primary drivers of rental volume. Negative values for RAIN or hum confirm that bad weather consistently reduces bike usage.

3.4. Individual Contributions (6th Instance Analysis)

While the previous plots show global trends, we can also perform a Local Explanation to understand why the model made a specific prediction for a single day. In this case, we examine the 6th instance (January 6th, 2011).

instance_index <- 6
instance_effects <- effects[instance_index, ]
instance_effects_df <- data.frame(variable = names(instance_effects), value = as.numeric(instance_effects))

p4 <- p3 %>%
  add_markers(data = instance_effects_df, x = ~value, y = ~variable, 
              marker = list(color = "red", symbol = "x", size = 10),
              name = "6th instance") %>%
  layout(title = "Effect Plot with Local Explanation (6th instance)")

# Save as PNG
p4_gg <- p3_gg +
  geom_point(data = instance_effects_df, aes(x = value, y = variable), color = "red", shape = 4, size = 5, stroke = 2) +
  labs(title = "Effect Plot with 6th instance (Local Explanation)")
ggsave("plots/effect_plot_6th_instance.png", p4_gg, width = 10, height = 6)

# Save as html
saveWidget(p4, "plots/effect_plot_6th_instance.html", selfcontained = TRUE)
p4

Performance Metrics

cat("Adjusted R-squared:", model_summary$adj.r.squared, "\n")
## Adjusted R-squared: 0.7903921
cat("Residual Standard Error:", model_summary$sigma, "\n")
## Residual Standard Error: 886.9126

6th Instance Local Analysis

cat("Real count (cnt):", day_data$cnt[instance_index], "\n")
## Real count (cnt): 1606
cat("Predicted count:", predict(model, day_data[instance_index, ]), "\n")
## Predicted count: 1570.903
cat("Average prediction in dataset:", mean(predict(model)), "\n")
## Average prediction in dataset: 4504.349
cat("Intercept:", model$coefficients[1], "\n")
## Intercept: 1513.766
print(instance_effects_df)
##           variable      value
## 1       workingday  124.92094
## 2          holiday    0.00000
## 3    season_summer    0.00000
## 4      season_fall    0.00000
## 5    season_winter    0.00000
## 6            MISTY    0.00000
## 7             RAIN    0.00000
## 8             temp 1063.29423
## 9              hum -900.59247
## 10       windspeed -255.11775
## 11 days_since_2011   24.63216