FINAL PROJECT: OPTIMIZING BIKE RENTALS:Unraveling Patterns and Trends ##LOADING DATA FILES
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(broom)
library(lindia)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
##bike
bike <- read.csv('D:/FALL 2023/STATISTICS/datasets/bike.csv')
summary(bike)
## Date Rented.Bike.Count Hour Temperature
## Length:8760 Min. : 0.0 Min. : 0.00 Min. :-17.80
## Class :character 1st Qu.: 191.0 1st Qu.: 5.75 1st Qu.: 3.50
## Mode :character Median : 504.5 Median :11.50 Median : 13.70
## Mean : 704.6 Mean :11.50 Mean : 12.88
## 3rd Qu.:1065.2 3rd Qu.:17.25 3rd Qu.: 22.50
## Max. :3556.0 Max. :23.00 Max. : 39.40
## Humidity Wind.speed Visibility Dew.point.temperature
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 940 1st Qu.: -4.700
## Median :57.00 Median :1.500 Median :1698 Median : 5.100
## Mean :58.23 Mean :1.725 Mean :1437 Mean : 4.074
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 14.800
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
## Solar.Radiation Rainfall Snowfall Seasons
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Length:8760
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Class :character
## Median :0.0100 Median : 0.0000 Median :0.00000 Mode :character
## Mean :0.5691 Mean : 0.1487 Mean :0.07507
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
## Holiday Functioning.Day
## Length:8760 Length:8760
## Class :character Class :character
## Mode :character Mode :character
##
##
##
head(bike )
## Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1 1/12/2017 254 0 -5.2 37 2.2 2000
## 2 1/12/2017 204 1 -5.5 38 0.8 2000
## 3 1/12/2017 173 2 -6.0 39 1.0 2000
## 4 1/12/2017 107 3 -6.2 40 0.9 2000
## 5 1/12/2017 78 4 -6.0 36 2.3 2000
## 6 1/12/2017 100 5 -6.4 37 1.5 2000
## Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons Holiday
## 1 -17.6 0 0 0 Winter No Holiday
## 2 -17.6 0 0 0 Winter No Holiday
## 3 -17.7 0 0 0 Winter No Holiday
## 4 -17.6 0 0 0 Winter No Holiday
## 5 -18.6 0 0 0 Winter No Holiday
## 6 -18.7 0 0 0 Winter No Holiday
## Functioning.Day
## 1 Yes
## 2 Yes
## 3 Yes
## 4 Yes
## 5 Yes
## 6 Yes
# Load the required libraries
library(lubridate)
library(dplyr)
# Assuming 'Date' is the name of the column in your 'bike' data frame
# Replace 'Date' with the actual name of your column
# Convert the 'Date' column to a Date object
bike$Date <- dmy(bike$Date) # Assuming the date is in dd/mm/yyyy format
# Add Day, Month, and Year columns
bike <- bike %>%
mutate(
Day = day(Date),
Month = month(Date),
Year = year(Date)
)
# Display the last few rows of the 'bike' dataset
tail(bike)
## Date Rented.Bike.Count Hour Temperature Humidity Wind.speed
## 8755 2018-11-30 1384 18 4.7 34 1.9
## 8756 2018-11-30 1003 19 4.2 34 2.6
## 8757 2018-11-30 764 20 3.4 37 2.3
## 8758 2018-11-30 694 21 2.6 39 0.3
## 8759 2018-11-30 712 22 2.1 41 1.0
## 8760 2018-11-30 584 23 1.9 43 1.3
## Visibility Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons
## 8755 1661 -9.8 0 0 0 Autumn
## 8756 1894 -10.3 0 0 0 Autumn
## 8757 2000 -9.9 0 0 0 Autumn
## 8758 1968 -9.9 0 0 0 Autumn
## 8759 1859 -9.8 0 0 0 Autumn
## 8760 1909 -9.3 0 0 0 Autumn
## Holiday Functioning.Day Day Month Year
## 8755 No Holiday Yes 30 11 2018
## 8756 No Holiday Yes 30 11 2018
## 8757 No Holiday Yes 30 11 2018
## 8758 No Holiday Yes 30 11 2018
## 8759 No Holiday Yes 30 11 2018
## 8760 No Holiday Yes 30 11 2018
# Load the required libraries
library(dplyr)
library(ggplot2)
# Assuming 'Month' and 'Rented Bike count' are the column names in your 'bike' dataset
# Replace them with the actual names of your columns
# Convert 'Month' to a factor with ordered levels for correct sorting
bike$Month <- factor(bike$Month, levels = unique(bike$Month))
# Group by Month and sum the rented bike count
monthly_summary <- bike %>%
group_by(Month) %>%
summarise(TotalRentedBikes = sum(`Rented.Bike.Count`))
# Plot the bar chart
ggplot(monthly_summary, aes(x = Month, y = TotalRentedBikes)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Monthly Sales of Rented Bikes",
x = "Month",
y = "Total Rented Bikes") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
##QUESTONS 1 1. SEASONAL IMPACT ON BIKE RENTALS AND ITS REASONS?
# Create a bar plot
bar_plot <- ggplot(bike, aes(x = Seasons, y = `Rented.Bike.Count`, fill = Seasons)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
labs(title = "Bar Plot: Bike Rentals by Season",
x = "Seasons",
y = "Mean Rented Bike Count") +
theme_minimal()
# Display the bar plot
print(bar_plot)
Interpretation:
Winter season has the lowest business sales, there could be numerous reasons behind it. First one is Low visibility.
# Convert 'Seasons' to a factor with a specified order
bike$Seasons <- factor(bike$Seasons, levels = c("Spring", "Winter", "Summer", "Autumn"))
# Calculate the mean Visibility for each combination of Seasons
visibility_season <- bike %>%
group_by(Seasons) %>%
summarise(mean_visibility = mean(Temperature, na.rm = TRUE))
# Create a heatmap
ggplot(visibility_season, aes(x = Seasons, y = 1, fill = mean_visibility)) +
geom_tile() +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Heatmap of Visibility by Seasons",
x = "Seasons",
y = "") +
theme_minimal() +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
Interpreation: We could observe that there is low visibility in
winterseason, which could be a hindrance for riders to travel.
Another reason is snowfall. ### Snowfall vs seasons
# Calculate mean snowfall for each season
mean_snowfall_by_season <- bike %>%
group_by(Seasons) %>%
summarise(mean_snowfall = mean(Snowfall, na.rm = TRUE))
# Create a line graph for mean snowfall by season
line_plot_mean_snowfall <- ggplot(mean_snowfall_by_season, aes(x = Seasons, y = mean_snowfall, group = 1)) +
geom_line(color = "blue") +
geom_point(color = "blue") +
labs(title = "Line Graph: Mean Snowfall by Season",
x = "Seasons",
y = "Mean Snowfall") +
theme_minimal()
# Display the line graph
print(line_plot_mean_snowfall)
As snowfall reaches peak during winter season, it becomes impossible to
riders to ride on slippery roads as it could be dangerous.
##QUESTION2: WHAT IS THE BUSINESS SCENARIO IN OTHER SEASONS? ### Other than winter
# Filter data for all seasons except Winter
filtered_data <- bike %>% filter(Seasons != "Winter")
# Create a line plot
line_plot_hourly <- ggplot(filtered_data, aes(x = Hour, y = `Rented.Bike.Count`, color = Seasons)) +
geom_line(stat = "summary", fun = "mean", size = 1) +
geom_point(stat = "summary", fun = "mean", size = 2) +
labs(title = "Line Plot: Mean Rented Bike Count by Hour for Seasons (Excluding Winter)",
x = "Hour of the Day",
y = "Mean Rented Bike Count",
color = "Seasons") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the line plot
print(line_plot_hourly)
Interpretation:
When compared the business among the seasons, excluding winters, we can observe that bike rentals are more in summer season. Temperature could be one of the reason why more people tend to rent bikes in summer ##Linear Modelling
# Fit a linear model
linear_model <- lm(`Rented.Bike.Count` ~ Temperature, data = bike)
# Display the summary of the linear model
summary(linear_model)
##
## Call:
## lm(formula = Rented.Bike.Count ~ Temperature, data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1100.60 -336.57 -49.69 233.81 2525.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 329.9525 8.5411 38.63 <2e-16 ***
## Temperature 29.0811 0.4862 59.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 543.5 on 8758 degrees of freedom
## Multiple R-squared: 0.29, Adjusted R-squared: 0.29
## F-statistic: 3578 on 1 and 8758 DF, p-value: < 2.2e-16
# Create a scatter plot with the regression line
scatter_plot <- ggplot(bike, aes(x = Temperature, y = `Rented.Bike.Count`)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatter Plot with Linear Model: Rented Bike Count vs Temperature",
x = "Temperature",
y = "Rented Bike Count") +
theme_minimal()
# Display the scatter plot
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
As discussed, more people prefer renting bikes during summer, because
they might feel hectic travelling in public transportation.
# Assuming 'Date' is the name of the column in your 'bike' dataset
# Replace 'Date' with the actual name of your column
# Display unique values in the 'Date' column to identify any anomalies
unique_dates <- unique(bike$Date)
# Try to automatically detect the date format
date_formats <- unique(sapply(unique_dates, function(date) {
tryCatch(format(as.Date(date, format = "%d/%m/%Y"), format = "%d/%m/%Y"), error = function(e) NA)
}))
date_formats <- date_formats[!is.na(date_formats)]
# Convert 'Date' to a Date object using the detected format
bike$Date <- as.Date(bike$Date, format = date_formats[1])
# Add a new column for the day of the week
bike$DayOfWeek <- weekdays(bike$Date)
# Load the required library
library(ggplot2)
# Assuming 'DayOfWeek' and 'Rented Bike count' are the column names in your 'bike' dataset
# Replace them with the actual names of your columns
# Plot the bar chart
ggplot(bike, aes(x = DayOfWeek, y = `Rented.Bike.Count`, fill = DayOfWeek)) +
geom_bar(stat = "identity") +
labs(title = "Rented Bike Count by Day of the Week",
x = "Day of the Week",
y = "Rented Bike Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Interpretation: when compared the business by days, We can observe that
the business is dull on sundays. to optimize business on that day we can
present offers on that day to attract more customers.
bike_aggregate <- bike%>%
group_by(Date) %>%
summarise(Sum_Rented_Bike_Count = sum(Rented.Bike.Count))
bike_time <- select(bike_aggregate, Date, Sum_Rented_Bike_Count)
bike_time$Date <- as.Date(bike_time$Date, format = "%m/%d/%Y")
# Remove rows with NA values in the Date column
bike_time <- bike_time[complete.cases(bike_time$Date), ]
bike_time_ts <- as_tsibble(bike_time, index = Date)
bike_time_ts %>%
ggplot(mapping = aes(x = Date, y = Sum_Rented_Bike_Count)) +
geom_line() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Number of bikes rented over years",
x = 'YEAR',
y = 'BIKE COUNT')
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: over the time period, the business has grown gradually.
This reflects that people are becoming more interested in renting out
bikes.
# Perform ANOVA test
anova_result <- aov(`Rented.Bike.Count` ~ DayOfWeek, data = bike)
# Display ANOVA summary
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## DayOfWeek 6 1.323e+07 2204753 5.315 1.75e-05 ***
## Residuals 8753 3.631e+09 414796
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation:
The p-value (Pr(>F)) is very small (1.51e-05), much smaller than the commonly used significance level of 0.05. Therefore, you would reject the null hypothesis.
The small p-value suggests that there is a statistically significant difference in mean rented bike counts among different days of the week.
The ‘DayOfWeek’ factor has a significant impact on the variation in rented bike counts.
In conclusion, we have evidence to suggest that there is a significant difference in rented bike counts across different days of the week.