11/16/2025
We need several libraries for this bike share analysis project. Each library serves a specific purpose in our data manipulation and visualization pipeline.
library(tidyverse)   # data manipulation and plotting (dplyr, tidyr, ggplot2)
library(ggplot2)     # plotting (also attached by tidyverse)
library(corrplot)    # correlation matrix visualization
library(lubridate)   # date-time helpers
library(gridExtra)   # arranging multiple ggplot figures in a grid
For this analysis, I’m examining bike sharing patterns to understand demand drivers. The dataset includes hourly rental counts with weather conditions and temporal information. Understanding these patterns can help optimize bike distribution and predict future demand.
set.seed(2024)
dates <- seq(from = as.POSIXct("2023-01-01 00:00:00"),
to = as.POSIXct("2023-12-31 23:00:00"),
by = "hour")
n_hours <- length(dates)
bike_data <- data.frame(
datetime = dates,
hour = hour(dates),
weekday = wday(dates, label = TRUE),
month = month(dates),
season = factor(quarter(dates),
labels = c("Winter", "Spring", "Summer", "Fall"))
)
bike_data <- bike_data %>%
  mutate(
    # Annual temperature cycle (sine wave over day of year) plus noise, in deg C
    temperature = 15 + 10 * sin(2 * pi * yday(datetime) / 365) +
      rnorm(n_hours, 0, 3),
    # Humidity loosely anti-correlated with temperature
    humidity = 65 - 0.5 * temperature + rnorm(n_hours, 0, 8),
    # Wind speed: positive values centred around 12
    windspeed = abs(rnorm(n_hours, 12, 4)),
    # Weather states drawn with fixed probabilities (60% clear, 30% cloudy, 10% rain)
    weather_condition = sample(c("Clear", "Cloudy", "Rain"),
                               n_hours, replace = TRUE,
                               prob = c(0.6, 0.3, 0.1))
  )
Now we generate rental counts using patterns observed in real bike share systems: commute-hour peaks (with a larger evening peak), lower weekend demand, strong weather sensitivity, and a temperature optimum near 20 °C.
bike_data <- bike_data %>%
  mutate(
    # Commute peaks, with a deeper evening peak and near-zero overnight demand
    rush_hour_multiplier = case_when(
      hour %in% c(7, 8, 9) ~ 2.5,
      hour %in% c(17, 18, 19) ~ 3.0,
      hour %in% c(0, 1, 2, 3, 4) ~ 0.1,
      TRUE ~ 1.0
    ),
    # Weekends see less commuting, so lower overall demand
    weekend_effect = ifelse(weekday %in% c("Sat", "Sun"), 0.8, 1.3),
    # Rain sharply suppresses ridership
    weather_multiplier = case_when(
      weather_condition == "Clear" ~ 1.2,
      weather_condition == "Cloudy" ~ 0.9,
      weather_condition == "Rain" ~ 0.3
    ),
    # Gaussian-shaped temperature response, peaking at a comfortable 20 °C
    temp_effect = exp(-0.02 * (temperature - 20)^2),
    # Combine all effects with random noise to produce hourly rental counts
    rentals = round(
      100 * rush_hour_multiplier * weekend_effect *
        weather_multiplier * temp_effect * runif(n_hours, 0.8, 1.2)
    )
  )
head(bike_data[, c("datetime", "hour", "weekday", "temperature",
"weather_condition", "rentals")])
##              datetime hour weekday temperature weather_condition rentals
## 1 2023-01-01 00:00:00    0     Sun    18.11804             Clear       9
## 2 2023-01-01 01:00:00    1     Sun    16.57828             Clear       7
## 3 2023-01-01 02:00:00    2     Sun    14.84822             Clear       6
## 4 2023-01-01 03:00:00    3     Sun    14.53350             Clear       5
## 5 2023-01-01 04:00:00    4     Sun    18.64643              Rain       2
## 6 2023-01-01 05:00:00    5     Sun    19.04920             Clear     112
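As a quick check added here (not part of the original output), the temp_effect term above is a Gaussian-shaped curve centred at 20 °C, so simulated rentals taper off in both cold and hot conditions. Evaluating it at a few illustrative temperatures (the temps vector is just for this sketch) makes that visible:

# Sketch: temp_effect peaks at 20 °C and decays symmetrically on either side
temps <- c(0, 10, 20, 30)
round(exp(-0.02 * (temps - 20)^2), 3)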
Let’s examine the structure and summary statistics of our dataset to understand what we’re working with.
cat("Dataset size:", nrow(bike_data), "hourly records\n")
## Dataset size: 8760 hourly records
cat("Number of variables:", ncol(bike_data), "\n\n")
## Number of variables: 14
summary(bike_data[, c("rentals", "temperature", "humidity", "windspeed")])
##     rentals        temperature        humidity       windspeed
##  Min.   :  0.00   Min.   :-3.956   Min.   :24.36   Min.   : 0.02119
##  1st Qu.:  4.00   1st Qu.: 8.350   1st Qu.:51.42   1st Qu.: 9.35265
##  Median : 24.00   Median :15.091   Median :57.31   Median :11.97355
##  Mean   : 66.42   Mean   :14.982   Mean   :57.49   Mean   :11.97909
##  3rd Qu.: 97.00   3rd Qu.:21.535   3rd Qu.:63.57   3rd Qu.:14.63794
##  Max.   :550.00   Max.   :33.798   Max.   :87.53   Max.   :25.92364
str(bike_data)
## 'data.frame':    8760 obs. of  14 variables:
##  $ datetime            : POSIXct, format: "2023-01-01 00:00:00" "2023-01-01 01:00:00" ...
##  $ hour                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ weekday             : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ month               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ season              : Factor w/ 4 levels "Winter","Spring",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ temperature         : num  18.1 16.6 14.8 14.5 18.6 ...
##  $ humidity            : num  40 63.7 48.8 55.7 58.6 ...
##  $ windspeed            : num  10.05 14.9 8.74 10.38 12.13 ...
##  $ weather_condition   : chr  "Clear" "Clear" "Clear" "Clear" ...
##  $ rush_hour_multiplier: num  0.1 0.1 0.1 0.1 0.1 1 1 2.5 2.5 2.5 ...
##  $ weekend_effect      : num  0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
##  $ weather_multiplier  : num  1.2 1.2 1.2 1.2 0.3 1.2 1.2 1.2 1.2 1.2 ...
##  $ temp_effect         : num  0.932 0.791 0.588 0.55 0.964 ...
##  $ rentals             : num  9 7 6 5 2 112 93 133 50 73 ...
cat("Weather Condition Distribution:\n")
## Weather Condition Distribution:
table(bike_data$weather_condition)
## 
##  Clear Cloudy   Rain 
##   5226   2618    916
cat("\n\nSeasonal Distribution:\n")
## 
## 
## Seasonal Distribution:
table(bike_data$season)
## 
## Winter Spring Summer   Fall 
##   2160   2184   2208   2208
weekday_avg <- bike_data %>%
  group_by(weekday) %>%
  summarise(avg_rentals = round(mean(rentals), 1))

print(weekday_avg)
## # A tibble: 7 × 2
##   weekday avg_rentals
##   <ord>         <dbl>
## 1 Sun            45.5
## 2 Mon            73.7
## 3 Tue            74.7
## 4 Wed            73.7
## 5 Thu            76.3
## 6 Fri            74.7
## 7 Sat            46.8
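As a consistency check added here (not part of the original output), the weekend-to-weekday ratio implied by these averages, roughly 46 / 75 ≈ 0.62, matches the ratio of the simulation's weekend_effect values, 0.8 / 1.3 ≈ 0.62:

# Sketch: the observed weekend/weekday ratio should approximate 0.8 / 1.3
mean(weekday_avg$avg_rentals[weekday_avg$weekday %in% c("Sat", "Sun")]) /
  mean(weekday_avg$avg_rentals[!(weekday_avg$weekday %in% c("Sat", "Sun"))])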
Creating additional features that might better explain rental patterns. This step is crucial for improving our analysis and model performance.
bike_data <- bike_data %>%
mutate(
is_weekend = weekday %in% c("Sat", "Sun"),
is_rush_hour = hour %in% c(7, 8, 9, 17, 18, 19),
time_of_day = case_when(
hour >= 5 & hour < 12 ~ "Morning",
hour >= 12 & hour < 17 ~ "Afternoon",
hour >= 17 & hour < 21 ~ "Evening",
TRUE ~ "Night"
),
temp_category = cut(temperature,
breaks = c(-Inf, 5, 15, 25, Inf),
labels = c("Cold", "Cool", "Mild", "Warm")),
month_name = month.name[month]
)
head(bike_data %>%
select(datetime, rentals, is_weekend, is_rush_hour, time_of_day))
##              datetime rentals is_weekend is_rush_hour time_of_day
## 1 2023-01-01 00:00:00       9       TRUE        FALSE       Night
## 2 2023-01-01 01:00:00       7       TRUE        FALSE       Night
## 3 2023-01-01 02:00:00       6       TRUE        FALSE       Night
## 4 2023-01-01 03:00:00       5       TRUE        FALSE       Night
## 5 2023-01-01 04:00:00       2       TRUE        FALSE       Night
## 6 2023-01-01 05:00:00     112       TRUE        FALSE     Morning
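An optional sanity check (added here, not in the original output) is to cross-tabulate the derived flags: given the cut points above, every rush-hour record should fall in the Morning or Evening bucket.

# Sketch: rush-hour hours (7-9, 17-19) should map only to Morning and Evening
with(bike_data, table(time_of_day, is_rush_hour))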
Creating daily and hourly aggregations for different types of analysis.
daily_rentals <- bike_data %>%
mutate(date = as.Date(datetime)) %>%
group_by(date) %>%
summarise(
total_rentals = sum(rentals),
avg_temp = mean(temperature),
avg_humidity = mean(humidity),
.groups = 'drop'
)
cat("Created daily aggregation with", nrow(daily_rentals), "days\n")
## Created daily aggregation with 366 days
hourly_patterns <- bike_data %>%
group_by(hour) %>%
summarise(
avg_rentals = mean(rentals),
std_rentals = sd(rentals),
.groups = 'drop'
)
head(hourly_patterns)
## # A tibble: 6 × 3
##    hour avg_rentals std_rentals
##   <int>       <dbl>       <dbl>
## 1     0        5.15        4.86
## 2     1        5.04        4.98
## 3     2        5.11        4.79
## 4     3        5.45        5.15
## 5     4        5.49        5.21
## 6     5       53.3        50.2
Let’s visualize how bike rentals vary throughout the day, comparing weekdays and weekends.
# Calculate hourly averages
hourly_by_day <- bike_data %>%
group_by(hour, is_weekend) %>%
summarise(avg_rentals = mean(rentals), .groups = 'drop')
ggplot(hourly_by_day, aes(x = hour, y = avg_rentals,
color = is_weekend, group = is_weekend)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
scale_x_continuous(breaks = seq(0, 23, 2)) +
labs(title = "Hourly Bike Rental Patterns",
subtitle = "Clear rush hour peaks on weekdays vs. leisure pattern on weekends",
x = "Hour of Day", y = "Average Rentals",
color = "Weekend") +
theme_minimal() +
theme(legend.position = "top")
Analyzing how different weather conditions affect bike rental demand.
p1 <- bike_data %>%
group_by(weather_condition) %>%
summarise(avg_rentals = mean(rentals)) %>%
ggplot(aes(x = reorder(weather_condition, -avg_rentals),
y = avg_rentals, fill = weather_condition)) +
geom_col() +
labs(title = "Weather Impact on Rentals",
x = "Weather", y = "Average Rentals") +
theme_minimal() +
theme(legend.position = "none") +
scale_fill_manual(values = c("Clear" = "gold",
"Cloudy" = "gray70",
"Rain" = "steelblue"))
p2 <- ggplot(bike_data %>% sample_n(2000),
aes(x = temperature, y = rentals)) +
geom_point(alpha = 0.3, color = "darkblue") +
geom_smooth(method = "loess", color = "red", se = TRUE) +
labs(title = "Temperature vs Rentals",
x = "Temperature (°C)", y = "Rentals") +
theme_minimal()
p3 <- bike_data %>%
group_by(season) %>%
summarise(avg_rentals = mean(rentals)) %>%
ggplot(aes(x = season, y = avg_rentals, fill = season)) +
geom_col() +
labs(title = "Seasonal Patterns",
x = "Season", y = "Average Rentals") +
theme_minimal() +
scale_fill_manual(values = c("Winter" = "lightblue",
"Spring" = "lightgreen",
"Summer" = "gold",
"Fall" = "orange")) +
theme(legend.position = "none")
# Day-of-week patterns
p4 <- bike_data %>%
group_by(weekday) %>%
summarise(avg_rentals = mean(rentals)) %>%
ggplot(aes(x = weekday, y = avg_rentals, fill = weekday)) +
geom_col() +
labs(title = "Day of Week Patterns",
x = "Day", y = "Average Rentals") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
cor_vars <- bike_data %>%
select(rentals, temperature, humidity, windspeed, hour)
cor_matrix <- cor(cor_vars)
corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
title = "Variable Correlations")
Let's build a linear regression model that predicts bike rentals from temperature, weather, time of day, and calendar effects.
# Prepare data for modeling
model_data <- bike_data %>%
mutate(
# Convert hour to cyclic features
hour_sin = sin(2 * pi * hour / 24),
hour_cos = cos(2 * pi * hour / 24),
# Convert month to cyclic features
month_sin = sin(2 * pi * month / 12),
month_cos = cos(2 * pi * month / 12)
)
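The sine/cosine encoding places hour (and month) on a circle, so 23:00 and 00:00 end up adjacent rather than 23 units apart. A minimal illustration added here (the names h and enc exist only for this sketch):

# Illustration: distance between hours 23 and 0 on the sin/cos circle is small
h <- c(23, 0)
enc <- cbind(sin(2 * pi * h / 24), cos(2 * pi * h / 24))
dist(enc)  # small, unlike abs(23 - 0) = 23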
# Split data into training and testing sets
set.seed(42)
train_indices <- sample(1:nrow(model_data), 0.8 * nrow(model_data))
train_data <- model_data[train_indices, ]
test_data <- model_data[-train_indices, ]
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 7008
cat("Testing set size:", nrow(test_data), "\n")
## Testing set size: 1752
# Build multiple regression model
rental_model <- lm(rentals ~ temperature + humidity + windspeed +
hour_sin + hour_cos + month_sin + month_cos +
is_weekend + weather_condition,
data = train_data)
# Model summary
summary(rental_model)
## 
## Call:
## lm(formula = rentals ~ temperature + humidity + windspeed + hour_sin + 
##     hour_cos + month_sin + month_cos + is_weekend + weather_condition, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -176.16  -42.61  -12.94   18.68  429.36 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              61.14218    8.92921   6.847 8.16e-12 ***
## temperature               1.81214    0.29076   6.233 4.86e-10 ***
## humidity                 -0.02338    0.11155  -0.210    0.834    
## windspeed                -0.05328    0.22872  -0.233    0.816    
## hour_sin                -19.57423    1.27781 -15.319  < 2e-16 ***
## hour_cos                -24.74331    1.27505 -19.406  < 2e-16 ***
## month_sin                37.81485    3.02585  12.497  < 2e-16 ***
## month_cos                -8.81535    1.47076  -5.994 2.15e-09 ***
## is_weekendTRUE          -29.01938    2.00390 -14.481  < 2e-16 ***
## weather_conditionCloudy -19.19044    2.01492  -9.524  < 2e-16 ***
## weather_conditionRain   -56.62609    3.06666 -18.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.52 on 6997 degrees of freedom
## Multiple R-squared:  0.3174, Adjusted R-squared:  0.3164 
## F-statistic: 325.3 on 10 and 6997 DF,  p-value: < 2.2e-16
test_data$predicted <- predict(rental_model, test_data)
rmse <- sqrt(mean((test_data$rentals - test_data$predicted)^2))
mae <- mean(abs(test_data$rentals - test_data$predicted))
r2 <- cor(test_data$rentals, test_data$predicted)^2
cat("Model Performance:\n")
## Model Performance:
cat("RMSE:", round(rmse, 2), "\n")
## RMSE: 80.42
cat("MAE:", round(mae, 2), "\n")
## MAE: 54.3
cat("R-squared:", round(r2, 3), "\n")
## R-squared: 0.28
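For context on these numbers, one optional comparison (a sketch added here, not computed in the original analysis) is a naive baseline that always predicts the training-set mean; the regression is only worth its complexity if it clearly beats this baseline.

# Sketch: naive baseline that predicts the mean training rental count for every hour
baseline_pred <- mean(train_data$rentals)
sqrt(mean((test_data$rentals - baseline_pred)^2))  # baseline RMSE for comparison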
# Visualization
p1 <- ggplot(test_data %>% sample_n(min(1000, nrow(test_data))),
aes(x = rentals, y = predicted)) +
geom_point(alpha = 0.5, color = "blue") +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "Actual vs Predicted Rentals",
x = "Actual", y = "Predicted") +
theme_minimal()
test_data$residuals <- test_data$rentals - test_data$predicted
p2 <- ggplot(test_data, aes(x = predicted, y = residuals)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residual Analysis",
x = "Predicted", y = "Residuals") +
theme_minimal()
grid.arrange(p1, p2, ncol = 2)
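If the residual plot shows spread growing with the predicted value, which is common when the response is a count, a log transform or a count model would be a natural next step. A hedged sketch of a Poisson GLM with the same predictors (an alternative added here, not part of the original analysis):

# Sketch (assumes a count-like response): Poisson GLM with the same predictors
rental_glm <- glm(rentals ~ temperature + humidity + windspeed +
                    hour_sin + hour_cos + month_sin + month_cos +
                    is_weekend + weather_condition,
                  family = poisson(link = "log"), data = train_data)
summary(rental_glm)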
# Identify the top peak demand hours
peak_hours <- bike_data %>%
group_by(hour) %>%
summarise(avg_rentals = mean(rentals)) %>%
arrange(desc(avg_rentals)) %>%
head(5)
cat("Top 5 Peak Hours:\n")
## Top 5 Peak Hours:
print(peak_hours)
## # A tibble: 5 × 2
##    hour avg_rentals
##   <int>       <dbl>
## 1    19        164.
## 2    18        160.
## 3    17        154.
## 4     8        134.
## 5     7        132.
# Summarize weather impact on rentals
weather_impact <- bike_data %>%
group_by(weather_condition) %>%
summarise(
avg_rentals = round(mean(rentals), 1),
total_hours = n(),
pct_of_time = round(n() / nrow(bike_data) * 100, 1)
) %>%
arrange(desc(avg_rentals))
cat("\nWeather Impact:\n")
## 
## Weather Impact:
print(weather_impact)
## # A tibble: 3 × 4
##   weather_condition avg_rentals total_hours pct_of_time
##   <chr>                   <dbl>       <int>       <dbl>
## 1 Clear                    78.8        5226        59.7
## 2 Cloudy                   58.3        2618        29.9
## 3 Rain                     18.9         916        10.5
temp_analysis <- bike_data %>%
mutate(temp_range = cut(temperature, breaks = 5)) %>%
group_by(temp_range) %>%
summarise(
avg_rentals = round(mean(rentals), 1),
count = n()
) %>%
arrange(desc(avg_rentals))
cat("\nOptimal Temperature Ranges:\n")
## 
## Optimal Temperature Ranges:
print(temp_analysis)
## # A tibble: 5 × 3
##   temp_range   avg_rentals count
##   <fct>              <dbl> <int>
## 1 (18.7,26.2]        120.   2699
## 2 (11.1,18.7]         90.6  2294
## 3 (26.2,33.8]         42.2   559
## 4 (3.59,11.1]          9.6  2654
## 5 (-3.99,3.59]         0.2   554
sample_week <- bike_data %>%
filter(datetime >= as.POSIXct("2023-06-01") &
datetime < as.POSIXct("2023-06-08"))
ggplot(sample_week, aes(x = datetime, y = rentals)) +
geom_line(color = "darkblue", size = 1) +
geom_point(aes(color = weather_condition), size = 2) +
scale_color_manual(values = c("Clear" = "gold",
"Cloudy" = "gray",
"Rain" = "darkblue")) +
labs(title = "One Week of Bike Rentals - Showing Daily and Weather Patterns",
subtitle = "June 1-7, 2023",
x = "Date", y = "Rentals",
color = "Weather") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This analysis shows that bike sharing demand follows highly predictable cycles driven by time and weather: rentals peak during the evening (17:00-19:00) and morning (07:00-09:00) commutes, drop by roughly three quarters in rain relative to clear conditions, and are highest at moderate temperatures (roughly 19-26 °C). By leveraging these patterns, operators can optimize fleet distribution, reduce operational costs, and improve service availability when riders need it most.
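As an illustration of how the fitted model could feed into such operational decisions, the sketch below scores a single hypothetical hour (a clear weekday evening in June); the scenario values are invented for illustration and are not part of the analysis above.

# Hypothetical scenario: clear weekday, 18:00, June, mild weather (values invented)
scenario <- data.frame(
  temperature = 22, humidity = 55, windspeed = 10,
  hour_sin = sin(2 * pi * 18 / 24), hour_cos = cos(2 * pi * 18 / 24),
  month_sin = sin(2 * pi * 6 / 12), month_cos = cos(2 * pi * 6 / 12),
  is_weekend = FALSE, weather_condition = "Clear"
)
predict(rental_model, newdata = scenario)  # expected rentals for this hour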