Load Data

# Read the hourly bike-sharing records and preview the first rows.
# The file location can be overridden with the BIKE_DATA_PATH environment
# variable so the report also runs on machines without this exact folder
# layout; when the variable is unset, the original path is used unchanged.
bike_data <- read.csv(
  Sys.getenv(
    "BIKE_DATA_PATH",
    unset = "C:/Statistics for Data Science/Week 2/bike+sharing+dataset/hour.csv"
  )
)
head(bike_data)
##   instant     dteday season yr mnth hr holiday weekday workingday weathersit
## 1       1 2011-01-01      1  0    1  0       0       6          0          1
## 2       2 2011-01-01      1  0    1  1       0       6          0          1
## 3       3 2011-01-01      1  0    1  2       0       6          0          1
## 4       4 2011-01-01      1  0    1  3       0       6          0          1
## 5       5 2011-01-01      1  0    1  4       0       6          0          1
## 6       6 2011-01-01      1  0    1  5       0       6          0          2
##   temp  atemp  hum windspeed casual registered cnt
## 1 0.24 0.2879 0.81    0.0000      3         13  16
## 2 0.22 0.2727 0.80    0.0000      8         32  40
## 3 0.22 0.2727 0.80    0.0000      5         27  32
## 4 0.24 0.2879 0.75    0.0000      3         10  13
## 5 0.24 0.2879 0.75    0.0000      0          1   1
## 6 0.24 0.2576 0.75    0.0896      0          1   1
# Per-column summary statistics; a quick check of value ranges and column types
summary(bike_data)
##     instant         dteday              season            yr        
##  Min.   :    1   Length:17379       Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 4346   Class :character   1st Qu.:2.000   1st Qu.:0.0000  
##  Median : 8690   Mode  :character   Median :3.000   Median :1.0000  
##  Mean   : 8690                      Mean   :2.502   Mean   :0.5026  
##  3rd Qu.:13034                      3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :17379                      Max.   :4.000   Max.   :1.0000  
##       mnth              hr           holiday           weekday     
##  Min.   : 1.000   Min.   : 0.00   Min.   :0.00000   Min.   :0.000  
##  1st Qu.: 4.000   1st Qu.: 6.00   1st Qu.:0.00000   1st Qu.:1.000  
##  Median : 7.000   Median :12.00   Median :0.00000   Median :3.000  
##  Mean   : 6.538   Mean   :11.55   Mean   :0.02877   Mean   :3.004  
##  3rd Qu.:10.000   3rd Qu.:18.00   3rd Qu.:0.00000   3rd Qu.:5.000  
##  Max.   :12.000   Max.   :23.00   Max.   :1.00000   Max.   :6.000  
##    workingday       weathersit         temp           atemp       
##  Min.   :0.0000   Min.   :1.000   Min.   :0.020   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.340   1st Qu.:0.3333  
##  Median :1.0000   Median :1.000   Median :0.500   Median :0.4848  
##  Mean   :0.6827   Mean   :1.425   Mean   :0.497   Mean   :0.4758  
##  3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:0.660   3rd Qu.:0.6212  
##  Max.   :1.0000   Max.   :4.000   Max.   :1.000   Max.   :1.0000  
##       hum           windspeed          casual         registered   
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.:0.4800   1st Qu.:0.1045   1st Qu.:  4.00   1st Qu.: 34.0  
##  Median :0.6300   Median :0.1940   Median : 17.00   Median :115.0  
##  Mean   :0.6272   Mean   :0.1901   Mean   : 35.68   Mean   :153.8  
##  3rd Qu.:0.7800   3rd Qu.:0.2537   3rd Qu.: 48.00   3rd Qu.:220.0  
##  Max.   :1.0000   Max.   :0.8507   Max.   :367.00   Max.   :886.0  
##       cnt       
##  Min.   :  1.0  
##  1st Qu.: 40.0  
##  Median :142.0  
##  Mean   :189.5  
##  3rd Qu.:281.0  
##  Max.   :977.0

Grouping 1: Rentals by Season

Grouping and Summary

# Per-season aggregates: summed hourly counts (cnt) and the mean of temp
season_group <- group_by(bike_data, season) %>%
  summarise(
    total_rentals = sum(cnt),
    avg_temp = mean(temp)
  )

season_group
## # A tibble: 4 × 3
##   season total_rentals avg_temp
##    <int>         <int>    <dbl>
## 1      1        471348    0.299
## 2      2        918589    0.545
## 3      3       1061129    0.706
## 4      4        841613    0.423

Probability Analysis

# Each season's share of the overall rental total; season_group is ungrouped,
# so the denominator is the grand total and the shares sum to 1
season_group$probability <-
  season_group$total_rentals / sum(season_group$total_rentals)

season_group
## # A tibble: 4 × 4
##   season total_rentals avg_temp probability
##    <int>         <int>    <dbl>       <dbl>
## 1      1        471348    0.299       0.143
## 2      2        918589    0.545       0.279
## 3      3       1061129    0.706       0.322
## 4      4        841613    0.423       0.256

Visualization: Rental by Season

# Bar chart of total rentals per season.
# geom_col() takes bar heights directly from the precomputed totals (the
# idiomatic form of geom_bar(stat = "identity")). The fill legend gets a
# readable title instead of the raw expression "as.factor(season)".
ggplot(
  season_group,
  aes(x = as.factor(season), y = total_rentals, fill = as.factor(season))
) +
  geom_col() +
  labs(
    title = "Total Rentals by Season",
    x = "Season",
    y = "Total Rentals",
    fill = "Season"
  ) +
  theme_minimal()

Insight and Hypothesis

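From the above analysis, we observe that season 3 (summer, the warmest season, with an average `temp` of 0.706) accounts for the largest share of all rentals (32.2%), while season 1 (winter, the coldest, with an average `temp` of 0.299) accounts for the smallest (14.3%). This suggests that cold weather significantly affects bike rental behavior.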

Hypothesis: Rentals are lower in winter due to unfavorable biking conditions (e.g., cold, snow). We can test this hypothesis by analyzing rental behavior during different weather conditions in winter.

Grouping 2: Rentals by Weather Condition and Holiday

Grouping and Summary

# Aggregate hourly counts for each (weather condition, holiday flag) pair.
# As the message below reports, the result stays grouped by weathersit.
weather_holiday_group <- group_by(bike_data, weathersit, holiday) %>%
  summarise(
    total_rentals = sum(cnt),
    avg_humidity = mean(hum)
  )
## `summarise()` has grouped output by 'weathersit'. You can override using the
## `.groups` argument.
weather_holiday_group
## # A tibble: 7 × 4
## # Groups:   weathersit [4]
##   weathersit holiday total_rentals avg_humidity
##        <int>   <int>         <int>        <dbl>
## 1          1       0       2282228        0.573
## 2          1       1         55945        0.587
## 3          2       0        775540        0.700
## 4          2       1         20412        0.660
## 5          3       0        156253        0.828
## 6          3       1          2078        0.777
## 7          4       0           223        0.883

Probability Analysis

# Share of ALL rentals that falls in each weather/holiday cell.
# The summary above is still grouped by weathersit, so a plain mutate() would
# divide by the per-weather subtotal (weathersit 4 would get probability 1
# from only 223 rentals). ungroup() first so the denominator is the grand
# total and the seven probabilities sum to 1.
weather_holiday_group <- weather_holiday_group %>%
  ungroup() %>%
  mutate(probability = total_rentals / sum(total_rentals))

weather_holiday_group
## # A tibble: 7 × 5
##   weathersit holiday total_rentals avg_humidity probability
##        <int>   <int>         <int>        <dbl>       <dbl>
## 1          1       0       2282228        0.573   0.693    
## 2          1       1         55945        0.587   0.0170   
## 3          2       0        775540        0.700   0.236    
## 4          2       1         20412        0.660   0.00620  
## 5          3       0        156253        0.828   0.0475   
## 6          3       1          2078        0.777   0.000631 
## 7          4       0           223        0.883   0.0000677

Visualization: Rentals by Weather and Holiday

# Dodged bar chart of total rentals per weather condition, split by holiday.
# geom_col() takes bar heights directly from the precomputed totals.
# preserve = "single" keeps every bar the same width: weather condition 4 has
# no holiday row, and the default dodge would stretch its lone bar to double
# width. The legend gets a readable title instead of "as.factor(holiday)".
ggplot(
  weather_holiday_group,
  aes(x = as.factor(weathersit), y = total_rentals, fill = as.factor(holiday))
) +
  geom_col(position = position_dodge(preserve = "single")) +
  labs(
    title = "Total Rentals by Weather Condition and Holiday",
    x = "Weather Condition",
    y = "Total Rentals",
    fill = "Holiday"
  ) +
  theme_minimal()

Insight and Hypothesis

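Total rentals on holidays are far lower than on non-holidays in every weather condition, and they fall further as the weather worsens. Note that holidays make up only about 3% of the hourly records (the mean of `holiday` is 0.029), so much of the gap in totals reflects how rare holidays are rather than lower demand per hour. With that caveat, the pattern suggests that holidays may reduce the need for commuting, and bad weather further discourages biking.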

Hypothesis: On holidays with bad weather (e.g., rainy or snowy days), rentals drop significantly because leisure biking is less attractive in poor weather conditions. This can be tested by analyzing rentals across more specific weather patterns during holidays.

Grouping 3: Rentals by Weekday and Working Day

Grouping and Summary

# Aggregate hourly counts for each (weekday, working-day flag) pair.
# As the message below reports, the result stays grouped by weekday.
weekday_workday_group <- group_by(bike_data, weekday, workingday) %>%
  summarise(
    total_rentals = sum(cnt),
    avg_windspeed = mean(windspeed)
  )
## `summarise()` has grouped output by 'weekday'. You can override using the
## `.groups` argument.
weekday_workday_group
## # A tibble: 12 × 4
## # Groups:   weekday [7]
##    weekday workingday total_rentals avg_windspeed
##      <int>      <int>         <int>         <dbl>
##  1       0          0        444027         0.188
##  2       1          0         59605         0.199
##  3       1          1        395898         0.187
##  4       2          0          1013         0.169
##  5       2          1        468096         0.192
##  6       3          0          7403         0.135
##  7       3          1        465645         0.188
##  8       4          0          3920         0.110
##  9       4          1        481475         0.194
## 10       5          0          6494         0.271
## 11       5          1        481296         0.185
## 12       6          0        477807         0.196

Probability Analysis

# Share of ALL rentals that falls in each weekday/working-day cell.
# The summary above is still grouped by weekday, so a plain mutate() would
# divide by the per-weekday subtotal (weekdays 0 and 6, which have a single
# row each, would get probability 1). ungroup() first so the denominator is
# the grand total and the twelve probabilities sum to 1.
weekday_workday_group <- weekday_workday_group %>%
  ungroup() %>%
  mutate(probability = total_rentals / sum(total_rentals))

weekday_workday_group
## # A tibble: 12 × 5
##    weekday workingday total_rentals avg_windspeed probability
##      <int>      <int>         <int>         <dbl>       <dbl>
##  1       0          0        444027         0.188    0.135   
##  2       1          0         59605         0.199    0.0181  
##  3       1          1        395898         0.187    0.120   
##  4       2          0          1013         0.169    0.000308
##  5       2          1        468096         0.192    0.142   
##  6       3          0          7403         0.135    0.00225 
##  7       3          1        465645         0.188    0.141   
##  8       4          0          3920         0.110    0.00119 
##  9       4          1        481475         0.194    0.146   
## 10       5          0          6494         0.271    0.00197 
## 11       5          1        481296         0.185    0.146   
## 12       6          0        477807         0.196    0.145

Visualization: Rentals by Weekday and Working Day

# Line plot of total rentals per weekday, one line per working-day flag.
# weekday is an integer code from 0 to 6, so the x axis is given one labelled
# tick per code; the default continuous breaks would skip some of them.
# The legend gets a readable title instead of "as.factor(workingday)".
ggplot(
  weekday_workday_group,
  aes(
    x = weekday,
    y = total_rentals,
    color = as.factor(workingday),
    group = workingday
  )
) +
  geom_line() +
  scale_x_continuous(breaks = 0:6) +
  labs(
    title = "Total Rentals by Weekday and Working Day",
    x = "Weekday",
    y = "Total Rentals",
    color = "Working Day"
  ) +
  theme_minimal()

Insight and Hypothesis

The highest number of rentals occurs on working weekdays, likely due to commuters using bikes for transport: weekdays 4 and 5 have the largest daily totals (about 485,000 and 488,000 rentals). Conversely, weekday 0 has the lowest total (444,027), and the non-working (holiday) hours that fall on weekdays 1-5 contribute only a small number of rentals. Weekday 6 (477,807) is comparable to the mid-week days.

Hypothesis: Rentals are concentrated on weekdays due to commuting demand. On weekends, people use bikes more for leisure, leading to fewer rentals on average.

Investigating Combinations: Season and Holiday

Unique Combinations of Season and Holiday

# Total rentals for every season/holiday pair that occurs in the data.
# `.groups = "drop"` returns an ungrouped tibble directly, which replaces the
# trailing ungroup() and avoids summarise()'s message about leftover grouping.
season_holiday_combinations <- bike_data %>%
  group_by(season, holiday) %>%
  summarise(total_rentals = sum(cnt), .groups = "drop")
season_holiday_combinations
## # A tibble: 8 × 3
##   season holiday total_rentals
##    <int>   <int>         <int>
## 1      1       0        459533
## 2      1       1         11815
## 3      2       0        898952
## 4      2       1         19637
## 5      3       0       1038298
## 6      3       1         22831
## 7      4       0        817461
## 8      4       1         24152

Missing Combinations

# Build the full season x holiday grid from the values present in the data,
# then keep only the grid rows that have no match in the observed summary
all_combinations <- expand.grid(
  season = unique(bike_data$season),
  holiday = unique(bike_data$holiday)
)

missing_combinations <- all_combinations %>%
  anti_join(season_holiday_combinations, by = c("season", "holiday"))
missing_combinations
## [1] season  holiday
## <0 rows> (or 0-length row.names)

Visualization: Season and Holiday Combinations

# Dodged bar chart of total rentals per season, split by holiday flag.
# geom_col() takes bar heights directly from the precomputed totals (the
# idiomatic form of geom_bar(stat = "identity")). The legend gets a readable
# title instead of the raw expression "as.factor(holiday)".
ggplot(
  season_holiday_combinations,
  aes(x = as.factor(season), y = total_rentals, fill = as.factor(holiday))
) +
  geom_col(position = "dodge") +
  labs(
    title = "Bike Rentals by Season and Holiday",
    x = "Season",
    y = "Total Rentals",
    fill = "Holiday"
  ) +
  theme_minimal()

Insight

From the combinations of season and holiday, we find that rentals during holidays in winter are among the least common. This is consistent with the earlier hypothesis that colder weather discourages bike use, especially during holiday periods when fewer people are commuting.

Summary and Conclusion

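In this analysis of the UCI Bike Sharing dataset, we find the following observations:

  1. Season 3 accounts for the largest share of total rentals and season 1 the smallest, in line with their average temperatures.

  2. Total rentals fall as weather conditions worsen and are much lower on holidays than on non-holidays.

  3. Rentals are concentrated on working weekdays, consistent with commuting demand.

  4. Every combination of season and holiday occurs in the data; holidays in season 1 have the fewest rentals.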

Some further questions that I would like to investigate:

  1. How does temperature variability within each season affect bike rentals?

  2. Do registered users rent bikes more consistently across weather conditions compared to casual users?

  3. Are there any unusual spikes in rentals during extreme weather conditions that could be tied to special events?