# Read the hourly bike-sharing records into a data frame
bike_data <- read.csv(
  file = "/Users/roshannaidu/Desktop/IU Sem 2/Stats 1/bike+sharing+dataset/hour.csv"
)
# Inspect the column names, storage types and leading values
str(object = bike_data)
## 'data.frame': 17379 obs. of 17 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : chr "2011-01-01" "2011-01-01" "2011-01-01" "2011-01-01" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hr : int 0 1 2 3 4 5 6 7 8 9 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weathersit: int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
## $ atemp : num 0.288 0.273 0.273 0.288 0.288 ...
## $ hum : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
## $ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ cnt : int 16 40 32 13 1 1 2 3 8 14 ...
# Preview the leading rows (6 is the default for head())
head(x = bike_data, n = 6L)
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns
summary(object = bike_data)
## instant dteday season yr
## Min. : 1 Length:17379 Min. :1.000 Min. :0.0000
## 1st Qu.: 4346 Class :character 1st Qu.:2.000 1st Qu.:0.0000
## Median : 8690 Mode :character Median :3.000 Median :1.0000
## Mean : 8690 Mean :2.502 Mean :0.5026
## 3rd Qu.:13034 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :17379 Max. :4.000 Max. :1.0000
## mnth hr holiday weekday
## Min. : 1.000 Min. : 0.00 Min. :0.00000 Min. :0.000
## 1st Qu.: 4.000 1st Qu.: 6.00 1st Qu.:0.00000 1st Qu.:1.000
## Median : 7.000 Median :12.00 Median :0.00000 Median :3.000
## Mean : 6.538 Mean :11.55 Mean :0.02877 Mean :3.004
## 3rd Qu.:10.000 3rd Qu.:18.00 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :12.000 Max. :23.00 Max. :1.00000 Max. :6.000
## workingday weathersit temp atemp
## Min. :0.0000 Min. :1.000 Min. :0.020 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.340 1st Qu.:0.3333
## Median :1.0000 Median :1.000 Median :0.500 Median :0.4848
## Mean :0.6827 Mean :1.425 Mean :0.497 Mean :0.4758
## 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:0.660 3rd Qu.:0.6212
## Max. :1.0000 Max. :4.000 Max. :1.000 Max. :1.0000
## hum windspeed casual registered
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.0
## 1st Qu.:0.4800 1st Qu.:0.1045 1st Qu.: 4.00 1st Qu.: 34.0
## Median :0.6300 Median :0.1940 Median : 17.00 Median :115.0
## Mean :0.6272 Mean :0.1901 Mean : 35.68 Mean :153.8
## 3rd Qu.:0.7800 3rd Qu.:0.2537 3rd Qu.: 48.00 3rd Qu.:220.0
## Max. :1.0000 Max. :0.8507 Max. :367.00 Max. :886.0
## cnt
## Min. : 1.0
## 1st Qu.: 40.0
## Median :142.0
## Mean :189.5
## 3rd Qu.:281.0
## Max. :977.0
Selected Variable: Total Bike Rentals (cnt)
- Direct measure of system usage and revenue generation
- Count variable with a wide range (1 to 977 rentals per hour), treated here as approximately continuous for statistical analysis
- Key performance indicator for bike-sharing operations
- Most relevant metric for operational planning and resource allocation
# Visualize the distribution of the response variable (cnt).
# NOTE(review): ggplot2 and gridExtra are assumed to be attached earlier in the
# document (no library() call is visible here) -- confirm.
# The data come from hour.csv and carry an `hr` column, so each count is an
# hourly total; the histogram title says "Hourly" accordingly.
p1 <- ggplot(bike_data, aes(x = cnt)) +
  geom_histogram(bins = 30, fill = "lightblue", color = "black") +
  labs(
    title = "Distribution of Hourly Bike Rentals",
    x = "Number of Rentals",
    y = "Frequency"
  ) +
  theme_minimal()

# Normal Q-Q plot of the same variable: points are the sample quantiles,
# the line is the reference drawn by stat_qq_line().
p2 <- ggplot(bike_data, aes(sample = cnt)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Q-Q Plot of Rental Counts") +
  theme_minimal()

# Show histogram and Q-Q plot side by side
grid.arrange(p1, p2, ncol = 2)
Selected Variable: Weather Situation (weathersit)
# Create boxplot of rentals by weather.
# `weathersit` is read in as an integer code, so it is wrapped in factor():
# with a continuous x, geom_boxplot() does not split the data into one box
# per weather category.
ggplot(bike_data, aes(x = factor(weathersit), y = cnt)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Bike Rentals by Weather Situation",
    x = "Weather Condition",
    y = "Number of Rentals"
  ) +
  theme_minimal()
H₀: The mean number of bike rentals is equal across all weather situations.
H₁: At least one weather situation has a different mean number of rentals.
# Perform one-way ANOVA of rentals across weather situations.
# `weathersit` is stored as an integer code; passed as-is, aov() fits a single
# slope (1 degree of freedom) instead of comparing group means. Converting it
# to a factor gives one level per weather situation, which is what the stated
# hypotheses (equal means across all situations) require.
# NOTE(review): any printed ANOVA table that was produced with the integer
# coding must be regenerated after this change.
weather_anova <- aov(cnt ~ factor(weathersit), data = bike_data)
summary(weather_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## weathersit 1 11598301 11598301 359.8 <2e-16 ***
## Residuals 17377 560163290 32236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Eta squared computed by hand from an aov fit.
#
# aov_result: a fitted model as returned by aov().
# Returns the sum of squares on the first row of the ANOVA table divided by
# the sum of all rows' sums of squares (effects plus residuals).
calculate_eta_squared <- function(aov_result) {
  # summary() on an aov fit yields a list; its first element is the table
  sums_of_squares <- summary(aov_result)[[1]][["Sum Sq"]]
  sums_of_squares[1] / sum(sums_of_squares)
}
# Effect size (eta squared) for the weather model fitted above
eta_sq <- calculate_eta_squared(aov_result = weather_anova)
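- F(1, 17377) = 359.8 with p < 0.001: the association between weather situation and rentals is statistically significant. With more than 17,000 observations, a large F statistic reflects strong evidence against H₀, not necessarily a large difference between groups.
- Effect size (η² = 0.02): approximately 2% of the variance in rentals is explained by weather situation, which is a small effect.
- Note: the ANOVA table above shows 1 degree of freedom for weathersit because the variable is stored as an integer code. The test as run therefore assesses a linear trend across the weather codes rather than differences among the individual weather-situation means.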
Based on these results:
- Weather has a statistically significant, though small, association with bike rental counts
- Operational planning should take weather forecasts into account
- Resource allocation should be weather-dependent
Selected Variable: Temperature (temp)
# Scatter of rentals against temperature with a fitted straight line on top
ggplot(data = bike_data, mapping = aes(x = temp, y = cnt)) +
  # semi-transparent points so dense regions remain readable
  geom_point(alpha = 0.5) +
  # least-squares line drawn over the points
  geom_smooth(method = "lm", color = "blue") +
  theme_minimal() +
  ggtitle("Temperature vs Bike Rentals") +
  xlab("Normalized Temperature") +
  ylab("Number of Rentals")
# Simple linear regression: rentals (cnt) as a function of temperature (temp)
temp_model <- lm(
  formula = cnt ~ temp,
  data = bike_data
)
# Coefficients, residual spread, R-squared and overall F test
summary(object = temp_model)
##
## Call:
## lm(formula = cnt ~ temp, data = bike_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -291.37 -110.23 -32.86 76.77 744.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0356 3.4827 -0.01 0.992
## temp 381.2949 6.5344 58.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 165.9 on 17377 degrees of freedom
## Multiple R-squared: 0.1638, Adjusted R-squared: 0.1638
## F-statistic: 3405 on 1 and 17377 DF, p-value: < 2.2e-16
# Create diagnostic plots for the temperature model on a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so that
# later plots are not silently drawn into the same 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(temp_model)
par(old_par)
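- R² = 0.164, indicating that temperature explains 16.4% of the variance in rentals.
- The relationship is statistically significant (p < 0.001).
- The slope is approximately 381: a one-unit increase in normalized temperature is associated with about 381 more rentals per hour. Because normalized temperature only ranges from about 0 to 1, one unit spans essentially the whole observed range; a more practical reading is that a 0.1 increase in normalized temperature corresponds to roughly 38 more rentals per hour.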
The analysis reveals that both weather situation and temperature are statistically significant predictors of bike rental counts. Weather situation explains about 2% of rental variance, while temperature accounts for 16.4%. These findings suggest that weather and, in particular, temperature should inform operational planning and resource allocation, although most of the variance in rentals is not explained by either variable alone.