knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(effsize)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(broom)
library(pwr)
# Load the dataset
bike_sharing_data <- read.csv("C:/Statistics for Data Science/Week 2/bike+sharing+dataset/hour.csv")
# Display the first few rows of the data
head(bike_sharing_data)
## instant dteday season yr mnth hr holiday weekday workingday weathersit
## 1 1 2011-01-01 1 0 1 0 0 6 0 1
## 2 2 2011-01-01 1 0 1 1 0 6 0 1
## 3 3 2011-01-01 1 0 1 2 0 6 0 1
## 4 4 2011-01-01 1 0 1 3 0 6 0 1
## 5 5 2011-01-01 1 0 1 4 0 6 0 1
## 6 6 2011-01-01 1 0 1 5 0 6 0 2
## temp atemp hum windspeed casual registered cnt
## 1 0.24 0.2879 0.81 0.0000 3 13 16
## 2 0.22 0.2727 0.80 0.0000 8 32 40
## 3 0.22 0.2727 0.80 0.0000 5 27 32
## 4 0.24 0.2879 0.75 0.0000 3 10 13
## 5 0.24 0.2879 0.75 0.0000 0 1 1
## 6 0.24 0.2576 0.75 0.0896 0 1 1
Selected Variable: Total Bike Rentals (cnt) - Direct measure of system usage and revenue generation - Continuous variable suitable for statistical analysis - Key performance indicator for bike-sharing operations - Most relevant metric for operational planning and resource allocation
# Visualize the distribution of our response variable
p1 <- ggplot(bike_sharing_data, aes(x = cnt)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
labs(title = "Distribution of Daily Bike Rentals",
x = "Number of Rentals",
y = "Frequency") +
theme_minimal()
p2 <- ggplot(bike_sharing_data, aes(sample = cnt)) +
stat_qq() +
stat_qq_line() +
labs(title = "Q-Q Plot of Rental Counts") +
theme_minimal()
grid.arrange(p1, p2, ncol = 2)
# Create boxplot of rentals by weather
ggplot(bike_sharing_data, aes(x = weathersit, y = cnt)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Bike Rentals by Weather Situation",
x = "Weather Condition",
y = "Number of Rentals") +
theme_minimal()
H₀: The mean number of bike rentals is equal across all weather situations
H₁: At least one weather situation has a different mean number of rentals
# Perform ANOVA
weather_anova <- aov(cnt ~ weathersit, data = bike_sharing_data)
summary(weather_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## weathersit 1 11598301 11598301 359.8 <2e-16 ***
## Residuals 17377 560163290 32236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate effect size manually
calculate_eta_squared <- function(aov_result) {
anova_table <- summary(aov_result)[[1]]
ss_effect <- anova_table[1, "Sum Sq"]
ss_total <- sum(anova_table[, "Sum Sq"])
return(ss_effect/ss_total)
}
eta_sq <- calculate_eta_squared(weather_anova)
Based on these results:
Weather significantly impacts bike rental patterns
Operational planning should heavily consider weather forecasts
Resource allocation should be weather-dependent
# Create scatter plot with regression line
ggplot(bike_sharing_data, aes(x = temp, y = cnt)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Temperature vs Bike Rentals",
x = "Normalized Temperature",
y = "Number of Rentals") +
theme_minimal()
# Fit linear regression model
temp_model <- lm(cnt ~ temp, data = bike_sharing_data)
summary(temp_model)
##
## Call:
## lm(formula = cnt ~ temp, data = bike_sharing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -291.37 -110.23 -32.86 76.77 744.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0356 3.4827 -0.01 0.992
## temp 381.2949 6.5344 58.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 165.9 on 17377 degrees of freedom
## Multiple R-squared: 0.1638, Adjusted R-squared: 0.1638
## F-statistic: 3405 on 1 and 17377 DF, p-value: < 2.2e-16
# Create diagnostic plots
par(mfrow = c(2,2))
plot(temp_model)
The analysis reveals that both weather conditions and temperature significantly impact bike rental patterns. Weather explains about 2% of rental variance, while temperature accounts for 16.4%. These findings suggest that weather-based operational planning and temperature-sensitive resource allocation are crucial for optimal system performance.