Roshan R Naidu (02/02/2026)

Importing Libraries

# Load the tidyverse, a collection of data science packages (it already attaches dplyr and ggplot2, so loading them separately afterwards is usually unnecessary)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load dplyr for data manipulation
library(dplyr)

# Load ggplot2 for data visualisation
library(ggplot2)

Loading and Exploring The Dataset

# Load the hourly bike sharing dataset.
# The location defaults to the original local copy of hour.csv; set the
# BIKE_DATA_PATH environment variable to point at another copy so the script
# can run on other machines without editing the code.
bike_data_path <- Sys.getenv(
  "BIKE_DATA_PATH",
  unset = "/Users/roshannaidu/Desktop/IU Sem 2/Stats 1/bike+sharing+dataset/hour.csv"
)

# Fail early with a clear message rather than read.csv()'s generic error.
if (!file.exists(bike_data_path)) {
  stop("Bike sharing data file not found: ", bike_data_path, call. = FALSE)
}

bike_data <- read.csv(bike_data_path)

# Inspect the data frame's structure: column names, types and sample values
bike_data %>% str()
## 'data.frame':    17379 obs. of  17 variables:
##  $ instant   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ dteday    : chr  "2011-01-01" "2011-01-01" "2011-01-01" "2011-01-01" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ yr        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mnth      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ hr        : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday   : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weathersit: int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
##  $ atemp     : num  0.288 0.273 0.273 0.288 0.288 ...
##  $ hum       : num  0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
##  $ windspeed : num  0 0 0 0 0 0.0896 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ cnt       : int  16 40 32 13 1 1 2 3 8 14 ...
# Preview the first six rows (the default for head())
head(bike_data, n = 6L)
# Per-column summary statistics
bike_data %>% summary()
##     instant         dteday              season            yr        
##  Min.   :    1   Length:17379       Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 4346   Class :character   1st Qu.:2.000   1st Qu.:0.0000  
##  Median : 8690   Mode  :character   Median :3.000   Median :1.0000  
##  Mean   : 8690                      Mean   :2.502   Mean   :0.5026  
##  3rd Qu.:13034                      3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :17379                      Max.   :4.000   Max.   :1.0000  
##       mnth              hr           holiday           weekday     
##  Min.   : 1.000   Min.   : 0.00   Min.   :0.00000   Min.   :0.000  
##  1st Qu.: 4.000   1st Qu.: 6.00   1st Qu.:0.00000   1st Qu.:1.000  
##  Median : 7.000   Median :12.00   Median :0.00000   Median :3.000  
##  Mean   : 6.538   Mean   :11.55   Mean   :0.02877   Mean   :3.004  
##  3rd Qu.:10.000   3rd Qu.:18.00   3rd Qu.:0.00000   3rd Qu.:5.000  
##  Max.   :12.000   Max.   :23.00   Max.   :1.00000   Max.   :6.000  
##    workingday       weathersit         temp           atemp       
##  Min.   :0.0000   Min.   :1.000   Min.   :0.020   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.340   1st Qu.:0.3333  
##  Median :1.0000   Median :1.000   Median :0.500   Median :0.4848  
##  Mean   :0.6827   Mean   :1.425   Mean   :0.497   Mean   :0.4758  
##  3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:0.660   3rd Qu.:0.6212  
##  Max.   :1.0000   Max.   :4.000   Max.   :1.000   Max.   :1.0000  
##       hum           windspeed          casual         registered   
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.:0.4800   1st Qu.:0.1045   1st Qu.:  4.00   1st Qu.: 34.0  
##  Median :0.6300   Median :0.1940   Median : 17.00   Median :115.0  
##  Mean   :0.6272   Mean   :0.1901   Mean   : 35.68   Mean   :153.8  
##  3rd Qu.:0.7800   3rd Qu.:0.2537   3rd Qu.: 48.00   3rd Qu.:220.0  
##  Max.   :1.0000   Max.   :0.8507   Max.   :367.00   Max.   :886.0  
##       cnt       
##  Min.   :  1.0  
##  1st Qu.: 40.0  
##  Median :142.0  
##  Mean   :189.5  
##  3rd Qu.:281.0  
##  Max.   :977.0
# Row count followed by column count
c(nrow(bike_data), ncol(bike_data))
## [1] 17379    17
# List the column names of the data frame
colnames(bike_data)
##  [1] "instant"    "dteday"     "season"     "yr"         "mnth"      
##  [6] "hr"         "holiday"    "weekday"    "workingday" "weathersit"
## [11] "temp"       "atemp"      "hum"        "windspeed"  "casual"    
## [16] "registered" "cnt"
# Count the NA entries in every column
bike_data %>%
  is.na() %>%
  colSums()
##    instant     dteday     season         yr       mnth         hr    holiday 
##          0          0          0          0          0          0          0 
##    weekday workingday weathersit       temp      atemp        hum  windspeed 
##          0          0          0          0          0          0          0 
##     casual registered        cnt 
##          0          0          0

Grouping 1: Rentals by Season

# Aggregate per season: total rental count and mean of temp
season_group <- summarise(
  group_by(bike_data, season),
  total_rentals = sum(cnt),
  avg_temp = mean(temp)
)

season_group
# Add each season's share of the overall rental total
season_group <- mutate(
  season_group,
  probability = prop.table(total_rentals)
)

season_group

Grouping 1 Visualisation

# Bar chart of total rentals per season.
# geom_col() draws bars whose heights are the y values, the idiomatic form of
# geom_bar(stat = "identity"). The fill legend gets an explicit title so it
# does not display the raw expression `as.factor(season)`.
ggplot(
  season_group,
  aes(x = as.factor(season), y = total_rentals, fill = as.factor(season))
) +
  geom_col() +
  labs(
    title = "Total Rentals by Season",
    x = "Season",
    y = "Total Rentals",
    fill = "Season"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )

Insight and Hypothesis

From the visual analysis of Grouping 1, we observe that summer has the highest probability of bike rentals, while winter has the lowest. This suggests that cold weather significantly affects bike rental behavior.

Hypothesis: Rentals are lower in winter due to unfavorable biking conditions (e.g., cold, snow). We can test this hypothesis by analyzing rental behavior during different weather conditions in winter.

Grouping 2: Rentals by Weather Condition and Holiday

# Total rentals and mean humidity for every weather condition / holiday pair.
# `.groups = "drop"` returns an ungrouped tibble. Without it the result stays
# grouped by weathersit, and the probabilities computed below would be
# normalised within each weather condition instead of across all groups.
weather_holiday_group <- bike_data %>%
  group_by(weathersit, holiday) %>%
  summarise(
    total_rentals = sum(cnt),
    avg_humidity = mean(hum),
    .groups = "drop"
  )

weather_holiday_group
# Calculate each weather-holiday group's share of all rentals
# (the shares sum to 1 over the whole table)
weather_holiday_group <- weather_holiday_group %>%
  mutate(probability = total_rentals / sum(total_rentals))

weather_holiday_group

Grouping 2 Visualisation

# Dodged bar chart of total rentals per weather condition, split by holiday.
# geom_col() is the idiomatic form of geom_bar(stat = "identity"). The fill
# legend gets an explicit title so it does not display the raw expression
# `as.factor(holiday)`.
ggplot(
  weather_holiday_group,
  aes(x = as.factor(weathersit), y = total_rentals, fill = as.factor(holiday))
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Rentals by Weather Condition and Holiday",
    x = "Weather Condition",
    y = "Total Rentals",
    fill = "Holiday"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )

Insight and Hypothesis

Bike rentals are lower on holidays during bad weather. This suggests that holidays may reduce the need for commuting, and bad weather further discourages biking.

Hypothesis: On holidays with bad weather (e.g., rainy or snowy days), rentals drop significantly because leisure biking is less attractive in poor weather conditions. This can be tested by analyzing rentals across more specific weather patterns during holidays.

Grouping 3: Rentals by Weekday and Working Day

# Total rentals and mean wind speed for every weekday / working-day pair.
# `.groups = "drop"` returns an ungrouped tibble. Without it the result stays
# grouped by weekday, and the probabilities computed below would be
# normalised within each weekday instead of across all groups.
weekday_workday_group <- bike_data %>%
  group_by(weekday, workingday) %>%
  summarise(
    total_rentals = sum(cnt),
    avg_windspeed = mean(windspeed),
    .groups = "drop"
  )

weekday_workday_group
# Calculate each weekday-working day group's share of all rentals
# (the shares sum to 1 over the whole table)
weekday_workday_group <- weekday_workday_group %>%
  mutate(probability = total_rentals / sum(total_rentals))

weekday_workday_group

Grouping 3 Visualisation

# Line plot of total rentals per weekday, one line per working-day flag.
# The colour legend gets an explicit title so it does not display the raw
# expression `as.factor(workingday)`, and the x axis shows every weekday code
# (0-6) instead of the default continuous breaks.
ggplot(
  weekday_workday_group,
  aes(
    x = weekday,
    y = total_rentals,
    color = as.factor(workingday),
    group = workingday
  )
) +
  geom_line() +
  scale_x_continuous(breaks = 0:6) +
  labs(
    title = "Total Rentals by Weekday and Working Day",
    x = "Weekday",
    y = "Total Rentals",
    color = "Working Day"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )

Insight and Hypothesis

The highest number of rentals occurs on working weekdays, likely due to commuters using bikes for transport. Conversely, weekends and non-working days show a lower number of rentals.

Hypothesis: Rentals are concentrated on weekdays due to commuting demand. On weekends, people use bikes more for leisure, leading to fewer rentals on average.

Investigating Combinations: Season and Holiday

# Total rentals for every observed season / holiday pair.
# `.groups = "drop"` returns an ungrouped tibble directly, so no follow-up
# ungroup() is needed and summarise() no longer emits its grouping message.
season_holiday_combinations <- bike_data %>%
  group_by(season, holiday) %>%
  summarise(total_rentals = sum(cnt), .groups = "drop")

season_holiday_combinations
# Build the full season x holiday grid from the values present in the data
all_combinations <- with(
  bike_data,
  expand.grid(season = unique(season), holiday = unique(holiday))
)

# Keep only the grid rows that have no matching aggregated row
missing_combinations <- all_combinations %>%
  anti_join(season_holiday_combinations, by = c("season", "holiday"))
missing_combinations

Visualization for Season and Holiday Combinations

# Dodged bar chart of total rentals per season, split by holiday.
# geom_col() is the idiomatic form of geom_bar(stat = "identity"). The fill
# legend gets an explicit title so it does not display the raw expression
# `as.factor(holiday)`.
ggplot(
  season_holiday_combinations,
  aes(x = as.factor(season), y = total_rentals, fill = as.factor(holiday))
) +
  geom_col(position = "dodge") +
  labs(
    title = "Bike Rentals by Season and Holiday",
    x = "Season",
    y = "Total Rentals",
    fill = "Holiday"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )

From the combinations of season and holiday, we find that rentals during holidays in winter are among the least common. This is consistent with the earlier hypothesis that colder weather discourages bike use, especially during holiday periods when fewer people are commuting.

Summary and Conclusion

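In this analysis of the UCI Bike Sharing dataset, I derived the following observations:

  • Rentals vary by season: summer has the highest share of rentals and winter the lowest.

  • Rentals are lower on holidays, particularly during bad weather.

  • Rentals are highest on working weekdays and lower on weekends and non-working days.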

Some further questions that I would like to investigate:

  • How does temperature variability within each season affect bike rentals?

  • Do registered users rent bikes more consistently across weather conditions compared to casual users?

  • Are there any unusual spikes in rentals during extreme weather conditions that could be tied to special events?