# Load tidyverse, a collection of data science packages; once it is attached, most other packages used here (including dplyr and ggplot2) do not need to be loaded separately
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load dplyr for data manipulation
library(dplyr)
# Load ggplot2 for data visualisation
library(ggplot2)
# Load the dataset
# NOTE(review): this is an absolute path on one user's machine; the script only
# runs where the file exists at exactly this location.
bike_data <- read.csv("/Users/roshannaidu/Desktop/IU Sem 2/Stats 1/bike+sharing+dataset/hour.csv")
# View structure and data types of variables
# (dteday comes back as character, not Date; read.csv does not parse dates)
str(bike_data)
## 'data.frame': 17379 obs. of 17 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : chr "2011-01-01" "2011-01-01" "2011-01-01" "2011-01-01" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hr : int 0 1 2 3 4 5 6 7 8 9 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weathersit: int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
## $ atemp : num 0.288 0.273 0.273 0.288 0.288 ...
## $ hum : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
## $ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ cnt : int 16 40 32 13 1 1 2 3 8 14 ...
# View first few rows of the dataset
head(bike_data)
# View summary statistics for all variables
summary(bike_data)
## instant dteday season yr
## Min. : 1 Length:17379 Min. :1.000 Min. :0.0000
## 1st Qu.: 4346 Class :character 1st Qu.:2.000 1st Qu.:0.0000
## Median : 8690 Mode :character Median :3.000 Median :1.0000
## Mean : 8690 Mean :2.502 Mean :0.5026
## 3rd Qu.:13034 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :17379 Max. :4.000 Max. :1.0000
## mnth hr holiday weekday
## Min. : 1.000 Min. : 0.00 Min. :0.00000 Min. :0.000
## 1st Qu.: 4.000 1st Qu.: 6.00 1st Qu.:0.00000 1st Qu.:1.000
## Median : 7.000 Median :12.00 Median :0.00000 Median :3.000
## Mean : 6.538 Mean :11.55 Mean :0.02877 Mean :3.004
## 3rd Qu.:10.000 3rd Qu.:18.00 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :12.000 Max. :23.00 Max. :1.00000 Max. :6.000
## workingday weathersit temp atemp
## Min. :0.0000 Min. :1.000 Min. :0.020 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.340 1st Qu.:0.3333
## Median :1.0000 Median :1.000 Median :0.500 Median :0.4848
## Mean :0.6827 Mean :1.425 Mean :0.497 Mean :0.4758
## 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:0.660 3rd Qu.:0.6212
## Max. :1.0000 Max. :4.000 Max. :1.000 Max. :1.0000
## hum windspeed casual registered
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.0
## 1st Qu.:0.4800 1st Qu.:0.1045 1st Qu.: 4.00 1st Qu.: 34.0
## Median :0.6300 Median :0.1940 Median : 17.00 Median :115.0
## Mean :0.6272 Mean :0.1901 Mean : 35.68 Mean :153.8
## 3rd Qu.:0.7800 3rd Qu.:0.2537 3rd Qu.: 48.00 3rd Qu.:220.0
## Max. :1.0000 Max. :0.8507 Max. :367.00 Max. :886.0
## cnt
## Min. : 1.0
## 1st Qu.: 40.0
## Median :142.0
## Mean :189.5
## 3rd Qu.:281.0
## Max. :977.0
# Check number of rows and columns
dim(bike_data)
## [1] 17379 17
# Display all variable names
names(bike_data)
## [1] "instant" "dteday" "season" "yr" "mnth"
## [6] "hr" "holiday" "weekday" "workingday" "weathersit"
## [11] "temp" "atemp" "hum" "windspeed" "casual"
## [16] "registered" "cnt"
# Check for missing values in each column
colSums(is.na(bike_data))
## instant dteday season yr mnth hr holiday
## 0 0 0 0 0 0 0
## weekday workingday weathersit temp atemp hum windspeed
## 0 0 0 0 0 0 0
## casual registered cnt
## 0 0 0
# Summarise temperature (temp) and total rentals (cnt) with the same seven
# statistics each, then reshape so every statistic is a row and each variable
# is a column.
summary_fns <- list(
  min = function(x) min(x, na.rm = TRUE),
  max = function(x) max(x, na.rm = TRUE),
  mean = function(x) mean(x, na.rm = TRUE),
  median = function(x) median(x, na.rm = TRUE),
  sd = function(x) sd(x, na.rm = TRUE),
  q1 = function(x) quantile(x, 0.25, na.rm = TRUE),
  q3 = function(x) quantile(x, 0.75, na.rm = TRUE)
)

# One-row table whose columns are named <statistic>_<variable>, e.g. min_temp
wide_stats <- summarise(
  bike_data,
  across(c(temp, cnt), summary_fns, .names = "{.fn}_{.col}")
)

# Split every column name at "_" into a (statistic, variable) pair
long_stats <- pivot_longer(
  wide_stats,
  everything(),
  names_to = c("statistic", "variable"),
  names_sep = "_",
  values_to = "value"
)

# Spread the variables back out: one row per statistic, one column per variable
numeric_summary <- pivot_wider(
  long_stats,
  names_from = "variable",
  values_from = "value"
)

# Display the numeric summary table
numeric_summary
Temperature (temp) Insights:
Range and Spread: Temperature values range from 0.02 to 1.00, reflecting normalized temperatures. This wide range shows that the dataset covers almost the entire possible spectrum of temperatures throughout the year.
Central Tendency and Distribution: The mean is 0.50, and the median is also around 0.50, indicating a roughly symmetric distribution. The relatively low standard deviation suggests temperatures do not vary dramatically from the mean.
Quartiles: The 25th percentile (Q1) is 0.34 and the 75th percentile (Q3) is 0.66, so 50% of temperatures fall within this middle range. This moderate spread indicates that mild to warm conditions are common.
Implication: The symmetric distribution and even spread around the mean imply bike rentals occur across a broad temperature range, with potential peaks during optimal mild weather. Extreme temperatures are underrepresented, which is beneficial for modeling typical usage patterns.
Total Rentals (cnt) Insights:
Range and Spread: Total bike rentals per hour range from 1 to 977, demonstrating a broad spectrum of usage. This indicates hourly rentals fluctuate greatly due to factors like time of day, season, or weather.
Central Tendency and Distribution: The mean is 189.46, higher than the median of 142, indicating a positively skewed distribution. Most hours have lower rentals, but a few high-usage periods pull the mean upward.
Standard Deviation and Quartiles: The standard deviation is high, reflecting substantial variability. The 25th percentile (Q1) is 40, and the 75th percentile (Q3) is 281, showing that 50% of rental counts are in this mid-range. The upper quartile captures periods of higher activity.
Implication: Rental counts are influenced by multiple external factors, such as time of day (commuting vs. leisure), day of the week, weather, or special events. The skewed distribution indicates that while rentals are typically lower, there are occasional peaks. Understanding these conditions is critical for forecasting demand and resource planning.
# Frequency tables: number of hourly records per season and per weather situation
season_counts <- table(bike_data$season)
weather_counts <- table(bike_data$weathersit)

# Turn a one-way frequency table into a data frame with the columns
# Variable (the level), Count (its frequency) and Type (a label for the source).
tidy_counts <- function(counts, type_label) {
  counts_df <- as.data.frame(counts)
  counts_df <- rename(counts_df, Variable = Var1, Count = Freq)
  mutate(counts_df, Type = type_label)
}

season_df <- tidy_counts(season_counts, "Season")
weather_df <- tidy_counts(weather_counts, "Weather")

# Stack both tables and move the Type label to the front
categorical_summary <- select(
  bind_rows(season_df, weather_df),
  Type, Variable, Count
)

# Display combined summary table
categorical_summary
Insights for season and weathersit
Season (season) Insights
Distribution Across Seasons:
The dataset shows counts for each season:
If the counts are not evenly distributed, it may indicate seasonal biases. For instance, more records in summer could overrepresent this season, potentially affecting analyses like comparing bike rental trends across the year.
Rental Trends by Season:
Differences in rental counts across seasons can reveal seasonal
usage patterns. Typically, higher rentals are expected in
warmer seasons (summer and fall) than colder ones (winter). Recognizing
these patterns is important for optimizing bike availability and
scheduling maintenance.
Weather Situation (weathersit) Insights
Distribution of Weather Conditions:
The summary shows counts for each weather category:
If the dataset has many more records for favorable weather (categories 1 and 2), it suggests the bike-sharing system is most frequently used in good weather, which is expected, but also highlights limited data for poor weather conditions.
Impact of Adverse Weather on Rentals:
The distribution of rentals across weather types indicates user
sensitivity to weather changes. A sharp drop in rentals for categories 3
and 4 (light snow/rain or heavy rain/snow) would suggest users
avoid cycling in bad weather, while a moderate decline
indicates some users may still ride, perhaps for commuting
needs.
# Average number of rentals at each distinct (normalized) temperature value.
# na.rm = TRUE is passed here as well so all three summaries treat missing
# values the same way.
temp_vs_rentals <- bike_data %>%
  group_by(temp) %>%
  summarise(avg_rentals = mean(cnt, na.rm = TRUE))
temp_vs_rentals

# Average rentals by working day
# (1 = working day, 0 = non-working day such as a weekend or holiday)
rentals_by_day <- bike_data %>%
  group_by(workingday) %>%
  summarise(avg_rentals = mean(cnt, na.rm = TRUE))
rentals_by_day

# Average rentals by weather situation (weathersit)
rentals_by_weather <- bike_data %>%
  group_by(weathersit) %>%
  summarise(avg_rentals = mean(cnt, na.rm = TRUE))
rentals_by_weather
Insights by weathersit category:
Clear Weather (weathersit = 1):
Across all seasons, bike rentals are highest under clear weather
conditions. However, the average rentals vary by season. For instance,
clear weather in summer (season = 2) may have the highest average
rentals, while clear weather in winter (season = 4) still shows fewer
rentals compared to other seasons.
Mist or Cloudy Weather (weathersit = 2):
Mist or cloudy conditions moderately reduce rentals compared to clear
weather but are not as severe as more adverse conditions. While some
users may be discouraged, many continue to use the service. The impact
is likely stronger in winter than in other seasons.
Adverse Weather (weathersit = 3 and 4):
Rentals during light snow or rain (weathersit = 3) and severe weather
(weathersit = 4) are much lower across all seasons. The decrease is
especially pronounced in winter, where cold and adverse weather together
strongly reduce ridership. Summer may still see some rentals in light
rain or mild snow, but numbers remain lower than in clear
conditions.
Clear Weather Across Seasons:
Clear weather consistently results in high rentals across all seasons,
but the magnitude varies. Rentals may be highest in fall (season = 3)
and summer, slightly lower in spring (season = 1), and lowest in winter.
This indicates that even under ideal conditions, seasonality affects
bike rentals, potentially due to factors like daylight hours or
holidays.
Effect of Cold Weather in Winter:
Adverse weather (weathersit = 3 or 4) has the strongest impact in
winter. The combination of cold temperatures with snow, fog, or heavy
rain leads to a sharp decline in rentals, showing a compounding effect
of winter conditions on user behavior.
The plot below shows how bike rentals vary with temperature across different seasons. Each point represents an hourly rental count, color-coded by season. The black line shows the overall linear trend.
# Scatter plot of hourly rentals against normalized temperature.
# Points are coloured by season; one black line shows the overall linear fit.
ggplot(bike_data, aes(x = temp, y = cnt, color = factor(season))) +
  geom_point(alpha = 0.5) +
  # group = 1 makes it explicit that this layer fits a single line to all
  # points instead of one line per season
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Bike Rentals vs Temperature",
    x = "Normalized Temperature",
    y = "Count of Rentals",
    # Without this the legend is titled with the raw expression "factor(season)"
    color = "Season"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Box plots of hourly rentals for every hour of the day, split by day type.

# Fill colours keyed by the workingday value, with readable legend labels
day_type_fill <- scale_fill_manual(
  values = c("0" = "#1f77b4", "1" = "#ff7f0e"),
  labels = c("Non-Working Day", "Working Day"),
  name = "Day Type"
)

# Title and axis labels
hourly_labels <- labs(
  title = "Bike Rentals by Hour of the Day",
  x = "Hour of the Day",
  y = "Count of Rentals"
)

# Outliers are drawn as hollow red circles so they stand out from the boxes
hourly_boxplot <- ggplot(
  bike_data,
  aes(x = factor(hr), y = cnt, fill = factor(workingday))
) +
  geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.shape = 1) +
  day_type_fill +
  hourly_labels +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )

# Display the plot
hourly_boxplot
The box plots reveal clear peaks in bike rentals around 8 AM and 5 PM on working days. These correspond to typical commuting times.
Working Days:
On working days, rentals peak during morning and evening rush hours,
while midday (9 AM – 4 PM) and late-night rentals remain relatively low.
This reflects the typical commuting pattern, where users primarily rent
bikes to travel to and from work.
Non-Working Days:
On non-working days, rentals gradually increase from late morning (10
AM), peak in early afternoon (1–3 PM), and then decline toward evening.
This suggests that users rent bikes mainly for leisure or recreational
purposes rather than commuting.
Bike rentals remain consistently low between 10 PM and 6 AM for both working and non-working days, likely due to safety concerns, reduced visibility, or colder temperatures.
A small but noticeable peak around 12 - 1 PM on working days likely represents short trips during lunch breaks.
Afternoon rentals (12 – 4 PM) on non-working days are higher than the same hours on working days, indicating extended recreational or afternoon activity usage.
# Box plots of hourly rentals for each weather situation (weathersit)
ggplot(bike_data, aes(x = factor(weathersit), y = cnt, fill = factor(weathersit))) +
  geom_boxplot() +
  labs(
    title = "Bike Rentals by Weather Conditions",
    x = "Weather Situation",
    y = "Count of Rentals",
    # Without this the legend is titled with the raw expression
    # "factor(weathersit)"
    fill = "Weather Situation"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )
Weather Situation 1 (Clear, Few Clouds, Partly
Cloudy):
This category has the highest median and widest range of bike rentals,
showing that users strongly prefer renting bikes in clear or partly
cloudy conditions. The upper quartile is also significantly higher,
indicating that favorable weather encourages more usage.
Weather Situation 2 (Mist + Cloudy, Mist + Broken Clouds,
Mist + Few Clouds, Mist):
Rentals are lower than in clear weather, with a slightly narrower
interquartile range. Users still rent bikes, but the overall counts
decline moderately under misty or cloudy conditions.
Weather Situation 3 (Light Snow, Light Rain, Thunderstorm,
Scattered Clouds):
The median rentals drop noticeably, and the upper quartile is much
lower. This suggests that adverse weather such as light snow or rain
discourages bike usage.
Weather Situation 4 (Heavy Rain, Ice Pellets, Snow,
Fog):
Rentals are the lowest in this category, with a median near zero and a
narrow interquartile range. Very few people rent bikes during severe
conditions, likely due to safety and comfort concerns.
Most bike rentals occur during clear or partly cloudy days. Even on low-usage days, the counts remain relatively high.
From Weather Situation 1 to 4, there is a consistent drop in median rentals and overall spread, indicating a strong negative effect of adverse weather on bike usage.
The interquartile range is widest in clear weather, suggesting that factors like temperature, day of the week, or special events have more influence when conditions are favorable.
In severe conditions (weathersit = 3 or 4), rentals are consistently low and predictable, reflecting reduced demand due to safety and comfort concerns.
Weather conditions play a critical role in determining bike rental activity. Clear or partly cloudy weather (Weather Situation 1) consistently produces the highest rental volumes, whereas extreme weather conditions such as heavy rainfall, snow, and fog lead to a sharp reduction in rentals. This pattern reflects a strong negative relationship between worsening weather conditions and bike usage, underscoring the importance of implementing flexible, weather-informed operational strategies to improve efficiency and sustain user engagement.
Rental patterns show well-defined peaks during typical commuting periods, specifically around 8 AM and between 5 and 6 PM on working days. This suggests that the bike-sharing system is primarily used as a commuting option during weekdays. In contrast, on non-working days, rental activity tends to peak from late morning through early afternoon, indicating a shift toward leisure and recreational use. These trends highlight the need for adaptive bike redistribution and customized promotional strategies that reflect different user behaviors on working versus non-working days.
Temperature has a noticeable positive effect on bike rental demand, with higher rental activity observed during warmer conditions, particularly when the weather is clear. Scatter plot visualizations reveal that users are more inclined to rent bikes when temperatures are mild to warm. This finding presents an opportunity to increase overall usage by actively promoting bike rentals during warmer seasons or by encouraging usage during relatively warmer periods within colder days.
The contrast in rental behavior between working days and non-working days indicates that the bike-sharing service fulfills different user needs depending on the day. On weekdays, rentals are largely concentrated around peak commuting hours, reflecting commuter-driven usage. On weekends and holidays, however, rental demand is more evenly spread throughout the day and is largely motivated by recreational or leisure activities. These insights can support informed decisions regarding scheduling, bike allocation, service expansion, and targeted marketing efforts.
Working days show a noticeable increase in rental activity during midday hours, likely associated with lunchtime travel. On non-working days, rental levels remain consistently high into the evening, suggesting longer or more flexible usage periods. This variability in rental behavior highlights the need for the bike-sharing service to accommodate a wide range of user requirements, from short commuting trips to extended leisure rides, while managing operational resources effectively.