# Load the tidyverse, a collection of data science packages. It attaches dplyr, ggplot2, and the other core packages, so they rarely need to be loaded separately.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load dplyr for data manipulation
library(dplyr)
# Load ggplot2 for data visualisation
library(ggplot2)
# Load the hourly bike-sharing dataset.
# The file location can be overridden through the BIKE_DATA_PATH environment
# variable so the script is not tied to a single machine; when the variable
# is unset or empty, the original absolute path is used.
bike_data_path <- Sys.getenv("BIKE_DATA_PATH", unset = "")
if (!nzchar(bike_data_path)) {
  bike_data_path <- "/Users/roshannaidu/Desktop/IU Sem 2/Stats 1/bike+sharing+dataset/hour.csv"
}
bike_data <- read.csv(bike_data_path)
# View structure and data types of variables
str(bike_data)
## 'data.frame': 17379 obs. of 17 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : chr "2011-01-01" "2011-01-01" "2011-01-01" "2011-01-01" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hr : int 0 1 2 3 4 5 6 7 8 9 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weathersit: int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
## $ atemp : num 0.288 0.273 0.273 0.288 0.288 ...
## $ hum : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
## $ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ cnt : int 16 40 32 13 1 1 2 3 8 14 ...
# Preview the first six rows of the dataset (six is head()'s default)
head(bike_data, n = 6L)
# Per-column summary statistics
summary(object = bike_data)
## instant dteday season yr
## Min. : 1 Length:17379 Min. :1.000 Min. :0.0000
## 1st Qu.: 4346 Class :character 1st Qu.:2.000 1st Qu.:0.0000
## Median : 8690 Mode :character Median :3.000 Median :1.0000
## Mean : 8690 Mean :2.502 Mean :0.5026
## 3rd Qu.:13034 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :17379 Max. :4.000 Max. :1.0000
## mnth hr holiday weekday
## Min. : 1.000 Min. : 0.00 Min. :0.00000 Min. :0.000
## 1st Qu.: 4.000 1st Qu.: 6.00 1st Qu.:0.00000 1st Qu.:1.000
## Median : 7.000 Median :12.00 Median :0.00000 Median :3.000
## Mean : 6.538 Mean :11.55 Mean :0.02877 Mean :3.004
## 3rd Qu.:10.000 3rd Qu.:18.00 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :12.000 Max. :23.00 Max. :1.00000 Max. :6.000
## workingday weathersit temp atemp
## Min. :0.0000 Min. :1.000 Min. :0.020 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.340 1st Qu.:0.3333
## Median :1.0000 Median :1.000 Median :0.500 Median :0.4848
## Mean :0.6827 Mean :1.425 Mean :0.497 Mean :0.4758
## 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:0.660 3rd Qu.:0.6212
## Max. :1.0000 Max. :4.000 Max. :1.000 Max. :1.0000
## hum windspeed casual registered
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 0.0
## 1st Qu.:0.4800 1st Qu.:0.1045 1st Qu.: 4.00 1st Qu.: 34.0
## Median :0.6300 Median :0.1940 Median : 17.00 Median :115.0
## Mean :0.6272 Mean :0.1901 Mean : 35.68 Mean :153.8
## 3rd Qu.:0.7800 3rd Qu.:0.2537 3rd Qu.: 48.00 3rd Qu.:220.0
## Max. :1.0000 Max. :0.8507 Max. :367.00 Max. :886.0
## cnt
## Min. : 1.0
## 1st Qu.: 40.0
## Median :142.0
## Mean :189.5
## 3rd Qu.:281.0
## Max. :977.0
# Number of rows, then number of columns
c(nrow(bike_data), ncol(bike_data))
## [1] 17379 17
# Column names of the dataset
colnames(bike_data)
## [1] "instant" "dteday" "season" "yr" "mnth"
## [6] "hr" "holiday" "weekday" "workingday" "weathersit"
## [11] "temp" "atemp" "hum" "windspeed" "casual"
## [16] "registered" "cnt"
# Count the missing values in each column
bike_data %>%
  is.na() %>%
  colSums()
## instant dteday season yr mnth hr holiday
## 0 0 0 0 0 0 0
## weekday workingday weathersit temp atemp hum windspeed
## 0 0 0 0 0 0 0
## casual registered cnt
## 0 0 0
# Keep only the columns used in the rest of the analysis: the categorical
# variable season and the continuous variables temp, hum, windspeed and cnt
bike_data <- dplyr::select(bike_data, season, temp, hum, windspeed, cnt)
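After this selection, the dataset contains one categorical variable (season, stored as integer codes 1–4) and four numeric variables that are treated as continuous (temp, hum, windspeed, cnt), so the numeric measurements can be compared across seasons.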
# Fix the RNG seed so the subsamples are reproducible
set.seed(123)
n <- nrow(bike_data)
# Each subsample has half as many rows as the full dataset. 0.5 * n can be
# fractional, so it is rounded down explicitly to a whole number of rows
# rather than leaving the rounding implicit inside sample().
subsample_size <- floor(0.5 * n)
# Draw five random subsamples. Row indices are drawn with replacement, so a
# row can appear more than once within a subsample. sample.int(n, ...) draws
# from 1..n without building the 1:n vector (and stays correct when n is 0).
subsample_1 <- bike_data[sample.int(n, subsample_size, replace = TRUE), ]
subsample_2 <- bike_data[sample.int(n, subsample_size, replace = TRUE), ]
subsample_3 <- bike_data[sample.int(n, subsample_size, replace = TRUE), ]
subsample_4 <- bike_data[sample.int(n, subsample_size, replace = TRUE), ]
subsample_5 <- bike_data[sample.int(n, subsample_size, replace = TRUE), ]
# Per-season averages of the four measurement columns.
#   df: data frame with columns season, temp, hum, windspeed and cnt
# Returns one row per season with the columns mean_temp, mean_hum,
# mean_windspeed and mean_cnt.
group_means <- function(df) {
  by_season <- group_by(df, season)
  summarise(
    by_season,
    across(c(temp, hum, windspeed, cnt), mean, .names = "mean_{.col}")
  )
}
# Compute the per-season means for each of the five subsamples
mean_subsample_1 <- subsample_1 %>% group_means()
mean_subsample_2 <- subsample_2 %>% group_means()
mean_subsample_3 <- subsample_3 %>% group_means()
mean_subsample_4 <- subsample_4 %>% group_means()
mean_subsample_5 <- subsample_5 %>% group_means()
# Show the five summary tables, one per subsample
mean_subsample_1
mean_subsample_2
mean_subsample_3
mean_subsample_4
mean_subsample_5
# Stack the five subsamples into one long data frame for plotting; the added
# "sample" column records which subsample each row came from
all_samples <- do.call(
  rbind,
  Map(
    function(rows, label) data.frame(rows, sample = label),
    list(subsample_1, subsample_2, subsample_3, subsample_4, subsample_5),
    paste("Sample", 1:5)
  )
)
# Draw one box plot per subsample for a single variable.
#   data:    data frame with a `sample` column and the column named by y_var
#   y_var:   name of the column to plot, given as a string
#   title:   plot title
#   y_label: y-axis label
# Returns the ggplot object. The three plots below share every setting except
# the variable, the title and the y-axis label, so those are the parameters.
plot_subsample_boxplot <- function(data, y_var, title, y_label) {
  # .data[[y_var]] looks the column up by its string name inside aes()
  ggplot(data, aes(x = sample, y = .data[[y_var]], fill = sample)) +
    geom_boxplot() +
    labs(title = title, x = "Subsample", y = y_label) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5),
      legend.position = "top"
    )
}
# Plot temperature distribution
plot_subsample_boxplot(
  all_samples,
  y_var = "temp",
  title = "Temperature Distribution Across Subsamples",
  y_label = "Temperature"
)
# Plot humidity distribution
plot_subsample_boxplot(
  all_samples,
  y_var = "hum",
  title = "Humidity Distribution Across Subsamples",
  y_label = "Humidity"
)
# Plot windspeed distribution
plot_subsample_boxplot(
  all_samples,
  y_var = "windspeed",
  title = "Windspeed Distribution Across Subsamples",
  y_label = "Windspeed"
)
These insights suggest that while there’s a consistent overall pattern in wind speed distributions, there’s more variability between samples compared to the temperature and humidity data. The prevalence of high outliers is a key feature, indicating that occasional high wind events are a significant characteristic of this dataset. The differences between samples, particularly Sample 3, might warrant further investigation into the conditions during different sampling periods.
# Box plots of the bike rental count (cnt), one box per subsample
ggplot(data = all_samples, mapping = aes(x = sample, y = cnt, fill = sample)) +
  geom_boxplot() +
  ggtitle("Bike Rental Counts Across Subsamples") +
  xlab("Subsample") +
  ylab("Bike Rentals") +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "top",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
I am performing a Monte Carlo simulation: 1,000 times, I draw a random sample (with replacement) containing half as many rows as the dataset and calculate the mean bike rental count (cnt) of each sample.
# Fix the RNG seed so the simulation is reproducible
set.seed(123)
# Monte Carlo simulation: 1,000 times, draw floor(0.5 * n) row indices with
# replacement and record the mean of cnt over the drawn rows.
# - The draw size is rounded down explicitly because 0.5 * n can be fractional.
# - Only the cnt column is indexed, so no full data-frame subset is built on
#   each iteration.
# - vapply() guarantees the result is a plain numeric vector of length 1000.
mc_draw_size <- floor(0.5 * n)
mc_results <- vapply(
  seq_len(1000),
  function(i) mean(bike_data$cnt[sample.int(n, mc_draw_size, replace = TRUE)]),
  numeric(1)
)
# Centre (mean) and spread (standard deviation) of the simulated means.
# Wrapping an assignment in parentheses also prints the assigned value.
(mc_mean <- mean(mc_results))
## [1] 189.4469
(mc_sd <- sd(mc_results))
## [1] 1.959181
The average mean bike rental count from 1,000 samples is around 190, with a small standard deviation of 1.96, showing low variability across random subsamples.
# Histogram of the simulated mean rental counts, with the mean of the
# simulated values marked by a dashed red line
mc_df <- data.frame(mean_cnt = mc_results)
ggplot(mc_df, aes(x = mean_cnt)) +
  geom_histogram(binwidth = 1, color = "black", fill = "blue") +
  # mc_mean is a single constant, so it is passed as a fixed xintercept;
  # mapping it inside aes() would evaluate it once per row of mc_df and
  # overplot the same line many times
  geom_vline(xintercept = mc_mean, color = "red", linetype = "dashed") +
  labs(
    title = "Distribution of Mean Bike Rental Counts from Monte Carlo Simulations",
    x = "Mean Bike Rentals",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "top"
  )
The histogram is approximately bell-shaped (roughly normal) and centered around 190, confirming that the overall mean is stable, even with random subsampling.
- Consistency across subsamples: Temperature and windspeed show stable trends across all subsamples, whereas humidity and bike rental counts show more variability.
- Monte Carlo simulation results: The Monte Carlo simulation confirms that the mean bike rental count is reliable, with only minor fluctuations across random subsamples.
- Anomalies: Individual subsamples can contain outliers or anomalies that don’t appear consistently in other samples.
- Future implications: Relying on one subsample can lead to misleading conclusions, but Monte Carlo simulations provide confidence that the dataset’s overall trend is stable.
Could additional variables like weather conditions further explain the variability in humidity and bike rentals? How would removing outliers or extreme anomalies impact the dataset?