knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(effsize)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(broom)
library(pwr)

Load the Dataset

# Load the dataset
bike_sharing_data <- read.csv("C:/Statistics for Data Science/Week 2/bike+sharing+dataset/hour.csv")

# Display the first few rows of the data
head(bike_sharing_data)
##   instant     dteday season yr mnth hr holiday weekday workingday weathersit
## 1       1 2011-01-01      1  0    1  0       0       6          0          1
## 2       2 2011-01-01      1  0    1  1       0       6          0          1
## 3       3 2011-01-01      1  0    1  2       0       6          0          1
## 4       4 2011-01-01      1  0    1  3       0       6          0          1
## 5       5 2011-01-01      1  0    1  4       0       6          0          1
## 6       6 2011-01-01      1  0    1  5       0       6          0          2
##   temp  atemp  hum windspeed casual registered cnt
## 1 0.24 0.2879 0.81    0.0000      3         13  16
## 2 0.22 0.2727 0.80    0.0000      8         32  40
## 3 0.22 0.2727 0.80    0.0000      5         27  32
## 4 0.24 0.2879 0.75    0.0000      3         10  13
## 5 0.24 0.2879 0.75    0.0000      0          1   1
## 6 0.24 0.2576 0.75    0.0896      0          1   1

1. Response Variable Selection

Selected Variable: Total Bike Rentals (cnt) - Direct measure of system usage and revenue generation - Continuous variable suitable for statistical analysis - Key performance indicator for bike-sharing operations - Most relevant metric for operational planning and resource allocation

# Visualize the distribution of our response variable
p1 <- ggplot(bike_sharing_data, aes(x = cnt)) +
  geom_histogram(bins = 30, fill = "lightblue", color = "black") +
  labs(title = "Distribution of Daily Bike Rentals",
       x = "Number of Rentals",
       y = "Frequency") +
  theme_minimal()

p2 <- ggplot(bike_sharing_data, aes(sample = cnt)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Q-Q Plot of Rental Counts") +
  theme_minimal()

grid.arrange(p1, p2, ncol = 2)

2. Categorical Variable Analysis (ANOVA)

Selected Variable: Weather Situation (weathersit)

# Create boxplot of rentals by weather
ggplot(bike_sharing_data, aes(x = weathersit, y = cnt)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Bike Rentals by Weather Situation",
       x = "Weather Condition",
       y = "Number of Rentals") +
  theme_minimal()

Null Hypothesis

H₀: The mean number of bike rentals is equal across all weather situations

H₁: At least one weather situation has a different mean number of rentals

ANOVA Test Results

# Perform ANOVA
weather_anova <- aov(cnt ~ weathersit, data = bike_sharing_data)
summary(weather_anova)
##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## weathersit      1  11598301 11598301   359.8 <2e-16 ***
## Residuals   17377 560163290    32236                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate effect size manually
calculate_eta_squared <- function(aov_result) {
  anova_table <- summary(aov_result)[[1]]
  ss_effect <- anova_table[1, "Sum Sq"]
  ss_total <- sum(anova_table[, "Sum Sq"])
  return(ss_effect/ss_total)
}

eta_sq <- calculate_eta_squared(weather_anova)

Results Interpretation

  • F-value = 359.7945096 indicates strong differences between groups
  • p-value < 0.001 shows highly significant effect of weather
  • Effect size (η² = 0.02) indicates that approximately 2% of variance in rentals is explained by weather

Practical Implications

Based on these results:

  • Weather significantly impacts bike rental patterns

  • Operational planning should heavily consider weather forecasts

  • Resource allocation should be weather-dependent

3. Continuous Variable Analysis (Linear Regression)

Selected Variable: Temperature (temp)

# Create scatter plot with regression line
ggplot(bike_sharing_data, aes(x = temp, y = cnt)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Temperature vs Bike Rentals",
       x = "Normalized Temperature",
       y = "Number of Rentals") +
  theme_minimal()

Linear Regression Model

# Fit linear regression model
temp_model <- lm(cnt ~ temp, data = bike_sharing_data)
summary(temp_model)
## 
## Call:
## lm(formula = cnt ~ temp, data = bike_sharing_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -291.37 -110.23  -32.86   76.77  744.76 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.0356     3.4827   -0.01    0.992    
## temp        381.2949     6.5344   58.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 165.9 on 17377 degrees of freedom
## Multiple R-squared:  0.1638, Adjusted R-squared:  0.1638 
## F-statistic:  3405 on 1 and 17377 DF,  p-value: < 2.2e-16
# Create diagnostic plots
par(mfrow = c(2,2))
plot(temp_model)

Model Fit Evaluation

  • R² = 0.164, indicating temperature explains 16.4% of rental variance
  • Significant relationship (p < 0.001)
  • For each unit increase in normalized temperature, rentals increase by approximately 381 bikes

Practical Recommendations

  1. Capacity Planning:
    • Increase bike availability in warmer temperatures
    • Adjust fleet size seasonally
    • Plan maintenance during colder periods
  2. Operational Strategy:
    • Implement temperature-based pricing
    • Adjust staffing based on temperature forecasts
    • Focus marketing during optimal temperature periods
  3. Resource Allocation:
    • Optimize distribution based on temperature patterns
    • Plan maintenance during low-demand periods
    • Target marketing efforts during peak temperature seasons

4. Further Questions for Investigation

  1. Interaction Effects:
    • How do temperature and weather interact?
    • Are there temperature thresholds where behavior changes?
    • Does time of day moderate temperature effects?
  2. Additional Variables:
    • Impact of humidity
    • Role of wind speed
    • Effect of holidays/special events
  3. Model Improvements:
    • Multiple regression possibilities
    • Non-linear relationships
    • Seasonal adjustments

Conclusion

The analysis reveals that both weather conditions and temperature significantly impact bike rental patterns. Weather explains about 2% of rental variance, while temperature accounts for 16.4%. These findings suggest that weather-based operational planning and temperature-sensitive resource allocation are crucial for optimal system performance.