The operations team needs to optimize bike fleet distribution and maintenance schedules based on weather patterns to maximize rental revenue while minimizing operational costs.
Relevant Variables: - Count of rentals (cnt) - Weather conditions (temp, hum, weathersit) - Temporal variables (hr, weekday, holiday) - Seasonal patterns (season)
Assumptions: 1. Weather patterns are relatively consistent year over year 2. Rental patterns reflect genuine demand rather than supply constraints 3. The relationship between weather and rentals is relatively stable
# Load the dataset
bike_sharing_data <- read.csv("C:/Statistics for Data Science/Week 2/bike+sharing+dataset/hour.csv")
# Convert categorical variables
bike_sharing_data$season <- factor(bike_sharing_data$season,
levels = 1:4,
labels = c("Spring", "Summer", "Fall", "Winter"))
bike_sharing_data$weathersit <- factor(bike_sharing_data$weathersit,
levels = 1:4,
labels = c("Clear", "Mist", "Light Snow/Rain", "Heavy Rain/Snow"))
bike_sharing_data$hr <- factor(bike_sharing_data$hr)
# Basic data summary
summary(bike_sharing_data)
## instant dteday season yr
## Min. : 1 Length:17379 Spring:4242 Min. :0.0000
## 1st Qu.: 4346 Class :character Summer:4409 1st Qu.:0.0000
## Median : 8690 Mode :character Fall :4496 Median :1.0000
## Mean : 8690 Winter:4232 Mean :0.5026
## 3rd Qu.:13034 3rd Qu.:1.0000
## Max. :17379 Max. :1.0000
##
## mnth hr holiday weekday
## Min. : 1.000 16 : 730 Min. :0.00000 Min. :0.000
## 1st Qu.: 4.000 17 : 730 1st Qu.:0.00000 1st Qu.:1.000
## Median : 7.000 13 : 729 Median :0.00000 Median :3.000
## Mean : 6.538 14 : 729 Mean :0.02877 Mean :3.004
## 3rd Qu.:10.000 15 : 729 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :12.000 12 : 728 Max. :1.00000 Max. :6.000
## (Other):13004
## workingday weathersit temp atemp
## Min. :0.0000 Clear :11413 Min. :0.020 Min. :0.0000
## 1st Qu.:0.0000 Mist : 4544 1st Qu.:0.340 1st Qu.:0.3333
## Median :1.0000 Light Snow/Rain: 1419 Median :0.500 Median :0.4848
## Mean :0.6827 Heavy Rain/Snow: 3 Mean :0.497 Mean :0.4758
## 3rd Qu.:1.0000 3rd Qu.:0.660 3rd Qu.:0.6212
## Max. :1.0000 Max. :1.000 Max. :1.0000
##
## hum windspeed casual registered
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.0
## 1st Qu.:0.4800 1st Qu.:0.1045 1st Qu.: 4.00 1st Qu.: 34.0
## Median :0.6300 Median :0.1940 Median : 17.00 Median :115.0
## Mean :0.6272 Mean :0.1901 Mean : 35.68 Mean :153.8
## 3rd Qu.:0.7800 3rd Qu.:0.2537 3rd Qu.: 48.00 3rd Qu.:220.0
## Max. :1.0000 Max. :0.8507 Max. :367.00 Max. :886.0
##
## cnt
## Min. : 1.0
## 1st Qu.: 40.0
## Median :142.0
## Mean :189.5
## 3rd Qu.:281.0
## Max. :977.0
##
# Check for missing values
colSums(is.na(bike_sharing_data))
## instant dteday season yr mnth hr holiday
## 0 0 0 0 0 0 0
## weekday workingday weathersit temp atemp hum windspeed
## 0 0 0 0 0 0 0
## casual registered cnt
## 0 0 0
# Original Poisson model
model1 <- glm(cnt ~ temp + hum + weathersit,
data = bike_sharing_data,
family = poisson(link = "log"))
# Model summary
summary(model1)
##
## Call:
## glm(formula = cnt ~ temp + hum + weathersit, family = poisson(link = "log"),
## data = bike_sharing_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.091046 0.002579 1974.417 <2e-16 ***
## temp 1.833660 0.002890 634.435 <2e-16 ***
## hum -1.413693 0.003137 -450.633 <2e-16 ***
## weathersitMist 0.107834 0.001364 79.075 <2e-16 ***
## weathersitLight Snow/Rain -0.126845 0.002715 -46.724 <2e-16 ***
## weathersitHeavy Rain/Snow 0.122976 0.066985 1.836 0.0664 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2891591 on 17378 degrees of freedom
## Residual deviance: 2150374 on 17373 degrees of freedom
## AIC: 2261287
##
## Number of Fisher Scoring iterations: 5
# Check VIF
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## temp 1.032696 1 1.016216
## hum 1.180838 1 1.086664
## weathersit 1.175698 3 1.027344
# Basic diagnostics
par(mfrow=c(2,2))
plot(model1)
# Calculate dispersion
dispersion <- sum(residuals(model1, type = "pearson")^2) / df.residual(model1)
print(paste("Dispersion parameter:", round(dispersion, 2)))
## [1] "Dispersion parameter: 134.32"
Issues with Original Model: 1. Overdispersion (dispersion > 1) 2. Missing temporal patterns 3. No interaction effects 4. Assumes linear relationships 5. Limited predictors
# Enhanced model with interactions
model2 <- glm(cnt ~ temp * season + hum * hr + weathersit,
data = bike_sharing_data,
family = poisson(link = "log"))
# Model comparison
aic_comparison <- data.frame(
Model = c("Original", "Enhanced"),
AIC = c(AIC(model1), AIC(model2))
)
print(aic_comparison)
## Model AIC
## 1 Original 2261287
## 2 Enhanced 843484
# Significant interactions
summary(model2)$coefficients[grep(":", rownames(summary(model2)$coefficients)), ]
## Estimate Std. Error z value Pr(>|z|)
## temp:seasonSummer -1.319298982 0.01313981 -100.40470297 0.000000e+00
## temp:seasonFall -2.776865089 0.01526444 -181.91728819 0.000000e+00
## temp:seasonWinter -0.984762469 0.01394114 -70.63715902 0.000000e+00
## hum:hr1 0.112484035 0.05173441 2.17425975 2.968563e-02
## hum:hr2 0.039010519 0.05917402 0.65925077 5.097347e-01
## hum:hr3 0.039816740 0.08061726 0.49389847 6.213779e-01
## hum:hr4 0.321397078 0.10897546 2.94926116 3.185347e-03
## hum:hr5 0.398814448 0.06650941 5.99636104 2.017881e-09
## hum:hr6 0.424757889 0.04286614 9.90893641 3.806769e-23
## hum:hr7 0.251719586 0.03572897 7.04525257 1.851254e-12
## hum:hr8 0.287182995 0.03396663 8.45485714 2.794293e-17
## hum:hr9 0.346687441 0.03515997 9.86028822 6.187153e-23
## hum:hr10 0.180468101 0.03594651 5.02046297 5.154709e-07
## hum:hr11 0.105572591 0.03511451 3.00652344 2.642537e-03
## hum:hr12 0.047608102 0.03452031 1.37913308 1.678537e-01
## hum:hr13 -0.002495715 0.03455589 -0.07222258 9.424248e-01
## hum:hr14 -0.029588149 0.03463164 -0.85436748 3.929014e-01
## hum:hr15 0.066466351 0.03437201 1.93373491 5.314573e-02
## hum:hr16 0.079232986 0.03368706 2.35203003 1.867127e-02
## hum:hr17 0.085188662 0.03285991 2.59248014 9.528668e-03
## hum:hr18 0.090728515 0.03296197 2.75252065 5.913842e-03
## hum:hr19 -0.010513064 0.03352380 -0.31359997 7.538249e-01
## hum:hr20 0.026896781 0.03441237 0.78160200 4.344485e-01
## hum:hr21 0.027718226 0.03552587 0.78022654 4.352575e-01
## hum:hr22 0.096091337 0.03687531 2.60584472 9.164800e-03
## hum:hr23 0.162461732 0.03975381 4.08669618 4.375595e-05
# Fit negative binomial model
nb_model <- glm.nb(cnt ~ temp + I(temp^2) + hum + weathersit +
season + hr,
data = bike_sharing_data)
# Model summary
summary(nb_model)
##
## Call:
## glm.nb(formula = cnt ~ temp + I(temp^2) + hum + weathersit +
## season + hr, data = bike_sharing_data, init.theta = 3.111234917,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.60843 0.03841 67.904 < 2e-16 ***
## temp 4.84586 0.13578 35.689 < 2e-16 ***
## I(temp^2) -3.36031 0.13372 -25.130 < 2e-16 ***
## hum -0.45117 0.03046 -14.814 < 2e-16 ***
## weathersitMist -0.03466 0.01097 -3.159 0.00158 **
## weathersitLight Snow/Rain -0.50157 0.01848 -27.147 < 2e-16 ***
## weathersitHeavy Rain/Snow 0.12032 0.34224 0.352 0.72516
## seasonSummer 0.20687 0.01637 12.634 < 2e-16 ***
## seasonFall 0.25950 0.02078 12.491 < 2e-16 ***
## seasonWinter 0.39129 0.01431 27.352 < 2e-16 ***
## hr1 -0.43973 0.03108 -14.148 < 2e-16 ***
## hr2 -0.79188 0.03152 -25.124 < 2e-16 ***
## hr3 -1.45452 0.03274 -44.421 < 2e-16 ***
## hr4 -2.06062 0.03443 -59.856 < 2e-16 ***
## hr5 -0.89826 0.03172 -28.319 < 2e-16 ***
## hr6 0.45466 0.03071 14.807 < 2e-16 ***
## hr7 1.47877 0.03044 48.580 < 2e-16 ***
## hr8 2.01104 0.03036 66.248 < 2e-16 ***
## hr9 1.47054 0.03042 48.335 < 2e-16 ***
## hr10 1.14153 0.03056 37.354 < 2e-16 ***
## hr11 1.25982 0.03072 41.014 < 2e-16 ***
## hr12 1.42849 0.03091 46.222 < 2e-16 ***
## hr13 1.40079 0.03107 45.089 < 2e-16 ***
## hr14 1.33181 0.03120 42.686 < 2e-16 ***
## hr15 1.37269 0.03124 43.943 < 2e-16 ***
## hr16 1.57977 0.03115 50.707 < 2e-16 ***
## hr17 1.99142 0.03099 64.270 < 2e-16 ***
## hr18 1.92958 0.03084 62.575 < 2e-16 ***
## hr19 1.63334 0.03063 53.330 < 2e-16 ***
## hr20 1.33992 0.03054 43.878 < 2e-16 ***
## hr21 1.09276 0.03048 35.855 < 2e-16 ***
## hr22 0.84397 0.03048 27.686 < 2e-16 ***
## hr23 0.47317 0.03057 15.479 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.1112) family taken to be 1)
##
## Null deviance: 71529 on 17378 degrees of freedom
## Residual deviance: 18653 on 17346 degrees of freedom
## AIC: 191336
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.1112
## Std. Err.: 0.0348
##
## 2 x log-likelihood: -191268.3840
# Predictions
predictions_nb <- predict(nb_model, type = "response")
actual_vs_predicted <- data.frame(
actual = bike_sharing_data$cnt,
predicted = predictions_nb
)
# Plot predictions
ggplot(actual_vs_predicted, aes(x = actual, y = predicted)) +
geom_point(alpha = 0.1) +
geom_abline(intercept = 0, slope = 1, color = "red") +
theme_minimal() +
labs(title = "Predicted vs Actual Rental Counts",
x = "Actual Counts",
y = "Predicted Counts")
# Calculate RMSE
rmse <- sqrt(mean((actual_vs_predicted$actual - actual_vs_predicted$predicted)^2))
print(paste("RMSE:", round(rmse, 2)))
## [1] "RMSE: 103.25"
# Aggregate by hour and weather
hourly_weather_summary <- bike_sharing_data %>%
group_by(hr, weathersit) %>%
summarize(
mean_rentals = mean(cnt),
sd_rentals = sd(cnt),
.groups = 'drop'
)
# Visualization of hourly patterns
ggplot(hourly_weather_summary,
aes(x = hr, y = mean_rentals, color = weathersit)) +
geom_line() +
geom_ribbon(aes(ymin = mean_rentals - sd_rentals,
ymax = mean_rentals + sd_rentals,
fill = weathersit),
alpha = 0.2) +
theme_minimal() +
labs(title = "Average Rentals by Hour and Weather",
x = "Hour of Day",
y = "Average Number of Rentals",
color = "Weather Condition",
fill = "Weather Condition")
# Seasonal patterns
seasonal_summary <- bike_sharing_data %>%
group_by(season, weathersit) %>%
summarize(
mean_rentals = mean(cnt),
.groups = 'drop'
)
ggplot(seasonal_summary,
aes(x = season, y = mean_rentals, fill = weathersit)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Average Rentals by Season and Weather",
x = "Season",
y = "Average Number of Rentals",
fill = "Weather Condition")
# Function to calculate pseudo R-squared
pseudo_r2 <- function(model) {
1 - model$deviance/model$null.deviance
}
# Compare models
model_comparison <- data.frame(
Model = c("Original Poisson", "Enhanced Poisson", "Negative Binomial"),
AIC = c(AIC(model1), AIC(model2), AIC(nb_model)),
PseudoR2 = c(pseudo_r2(model1), pseudo_r2(model2), pseudo_r2(nb_model))
)
print(model_comparison)
## Model AIC PseudoR2
## 1 Original Poisson 2261287.3 0.2563353
## 2 Enhanced Poisson 843484.0 0.7466906
## 3 Negative Binomial 191336.4 0.7392222