# Attach the packages this script relies on: dplyr supplies `%>%`, select(),
# group_by() and summarise(); ggplot2 supplies the plotting functions.
library(dplyr)
library(ggplot2)

# Loading the dataset
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
bike_data <- read.csv("C:/Statistics for Data Science/Week 2/bike+sharing+dataset/hour.csv")
# Exploring the dataset: show the first six rows
head(bike_data)
## instant dteday season yr mnth hr holiday weekday workingday weathersit
## 1 1 2011-01-01 1 0 1 0 0 6 0 1
## 2 2 2011-01-01 1 0 1 1 0 6 0 1
## 3 3 2011-01-01 1 0 1 2 0 6 0 1
## 4 4 2011-01-01 1 0 1 3 0 6 0 1
## 5 5 2011-01-01 1 0 1 4 0 6 0 1
## 6 6 2011-01-01 1 0 1 5 0 6 0 2
## temp atemp hum windspeed casual registered cnt
## 1 0.24 0.2879 0.81 0.0000 3 13 16
## 2 0.22 0.2727 0.80 0.0000 8 32 40
## 3 0.22 0.2727 0.80 0.0000 5 27 32
## 4 0.24 0.2879 0.75 0.0000 3 10 13
## 5 0.24 0.2879 0.75 0.0000 0 1 1
## 6 0.24 0.2576 0.75 0.0896 0 1 1
# Inspect the column types and a preview of the values in each column
str(object = bike_data)
## 'data.frame': 17379 obs. of 17 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ dteday : chr "2011-01-01" "2011-01-01" "2011-01-01" "2011-01-01" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hr : int 0 1 2 3 4 5 6 7 8 9 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weathersit: int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
## $ atemp : num 0.288 0.273 0.273 0.288 0.288 ...
## $ hum : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
## $ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ cnt : int 16 40 32 13 1 1 2 3 8 14 ...
# Keep only the columns analysed below: season (categorical) plus
# temp, hum, windspeed and cnt (continuous)
bike_data <- select(bike_data, season, temp, hum, windspeed, cnt)
After this selection, the dataset contains both a categorical variable (season, stored as integer codes 1–4) and continuous variables (temp, hum, windspeed, cnt); these are the variables used in the subsampling analysis below.
# Fix the RNG seed so the subsamples are reproducible
set.seed(123)
n <- nrow(bike_data)
# Each subsample has half as many rows as the full data. floor() makes the
# handling of a fractional size explicit rather than relying on sample() to
# coerce it.
subsample_size <- floor(0.5 * n)
# Creating five random subsamples, each drawn with replacement.
# sample.int(n, ...) draws row indices from 1..n and, unlike sample(1:n, ...),
# stays well-defined when n is 0 (1:0 is c(1, 0), not an empty sequence).
subsample_1 <- bike_data[sample.int(n, size = subsample_size, replace = TRUE), ]
subsample_2 <- bike_data[sample.int(n, size = subsample_size, replace = TRUE), ]
subsample_3 <- bike_data[sample.int(n, size = subsample_size, replace = TRUE), ]
subsample_4 <- bike_data[sample.int(n, size = subsample_size, replace = TRUE), ]
subsample_5 <- bike_data[sample.int(n, size = subsample_size, replace = TRUE), ]
# Per-season averages of the four numeric columns.
#   df: data frame with columns season, temp, hum, windspeed and cnt
# Returns one row per season with the columns mean_temp, mean_hum,
# mean_windspeed and mean_cnt.
group_means <- function(df) {
  by_season <- group_by(df, season)
  summarise(
    by_season,
    mean_temp = mean(temp),
    mean_hum = mean(hum),
    mean_windspeed = mean(windspeed),
    mean_cnt = mean(cnt)
  )
}
# Summarise every subsample with group_means()
mean_subsample_1 <- subsample_1 %>% group_means()
mean_subsample_2 <- subsample_2 %>% group_means()
mean_subsample_3 <- subsample_3 %>% group_means()
mean_subsample_4 <- subsample_4 %>% group_means()
mean_subsample_5 <- subsample_5 %>% group_means()
# Show the per-season means, starting with the first subsample
mean_subsample_1
## # A tibble: 4 × 5
## season mean_temp mean_hum mean_windspeed mean_cnt
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.303 0.581 0.213 112.
## 2 2 0.547 0.626 0.202 210.
## 3 3 0.707 0.631 0.172 239.
## 4 4 0.426 0.664 0.172 203.
# Per-season means for the second subsample
mean_subsample_2
## # A tibble: 4 × 5
## season mean_temp mean_hum mean_windspeed mean_cnt
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.297 0.588 0.219 109.
## 2 2 0.540 0.627 0.203 213.
## 3 3 0.709 0.631 0.170 245.
## 4 4 0.423 0.665 0.169 195.
# Per-season means for the third subsample
mean_subsample_3
## # A tibble: 4 × 5
## season mean_temp mean_hum mean_windspeed mean_cnt
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.295 0.576 0.216 110.
## 2 2 0.550 0.617 0.202 209.
## 3 3 0.705 0.635 0.176 231.
## 4 4 0.423 0.663 0.168 200.
# Per-season means for the fourth subsample
mean_subsample_4
## # A tibble: 4 × 5
## season mean_temp mean_hum mean_windspeed mean_cnt
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.304 0.585 0.212 110.
## 2 2 0.544 0.630 0.202 210.
## 3 3 0.704 0.637 0.168 235.
## 4 4 0.425 0.666 0.172 205.
# Per-season means for the fifth subsample
mean_subsample_5
## # A tibble: 4 × 5
## season mean_temp mean_hum mean_windspeed mean_cnt
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.299 0.575 0.214 111.
## 2 2 0.543 0.623 0.207 203.
## 3 3 0.707 0.631 0.174 238.
## 4 4 0.422 0.664 0.172 200.
# Stack the five subsamples into one data frame for plotting; the added
# `sample` column records which subsample each row came from
all_samples <- do.call(
  rbind,
  list(
    data.frame(subsample_1, sample = "Sample 1"),
    data.frame(subsample_2, sample = "Sample 2"),
    data.frame(subsample_3, sample = "Sample 3"),
    data.frame(subsample_4, sample = "Sample 4"),
    data.frame(subsample_5, sample = "Sample 5")
  )
)
# Boxplot of temperature, one box per subsample
ggplot(
  data = all_samples,
  mapping = aes(x = sample, y = temp, fill = sample)
) +
  geom_boxplot() +
  ggtitle("Temperature Distribution Across Subsamples") +
  xlab("Subsample") +
  ylab("Temperature") +
  theme_minimal()
# Boxplot of humidity, one box per subsample
ggplot(
  data = all_samples,
  mapping = aes(x = sample, y = hum, fill = sample)
) +
  geom_boxplot() +
  ggtitle("Humidity Distribution Across Subsamples") +
  xlab("Subsample") +
  ylab("Humidity") +
  theme_minimal()
# Boxplot of windspeed, one box per subsample
ggplot(
  data = all_samples,
  mapping = aes(x = sample, y = windspeed, fill = sample)
) +
  geom_boxplot() +
  ggtitle("Windspeed Distribution Across Subsamples") +
  xlab("Subsample") +
  ylab("Windspeed") +
  theme_minimal()
These insights suggest that while there’s a consistent overall pattern in the wind speed distributions, there’s more variability between samples than in the temperature and humidity data. The prevalence of high outliers is a key feature, indicating that occasional high wind events are a significant characteristic of this dataset. The differences between samples, particularly Sample 3, might warrant further investigation; note, however, that the subsamples are random draws from the same dataset rather than different sampling periods, so these differences most likely reflect sampling variability.
##### Boxplot for Bike Rentals
# Boxplot of bike rental counts, one box per subsample
ggplot(
  data = all_samples,
  mapping = aes(x = sample, y = cnt, fill = sample)
) +
  geom_boxplot() +
  ggtitle("Bike Rental Counts Across Subsamples") +
  xlab("Subsample") +
  ylab("Bike Rentals") +
  theme_minimal()
These insights suggest a complex distribution of bike rental counts with high variability. The consistency across samples indicates stable underlying factors influencing rentals, but the wide range and numerous outliers point to significant hour-to-hour fluctuations (each observation is an hourly count). The positive skew and high outliers suggest frequent occurrences of high-demand hours, which could be linked to factors like weather, events, or seasonal trends.
I am performing a Monte Carlo simulation, randomly sampling 50% of the data (with replacement) 1,000 times and calculating the mean bike rental count (cnt) for each subsample.
# for reproducibility
set.seed(123)
# Perform Monte Carlo Simulation: 1,000 subsamples of half the data, drawn
# with replacement, recording the mean rental count of each.
# The subsample size and the cnt column do not change between replicates, so
# they are computed once up front. floor() makes the handling of a fractional
# size explicit.
mc_size <- floor(0.5 * n)
cnt_values <- bike_data$cnt
mc_results <- replicate(1000, {
  # sample.int() draws row indices from 1..n (safe even when n is 0, unlike
  # 1:n); indexing cnt directly avoids copying a whole data-frame subset just
  # to average one column.
  mean(cnt_values[sample.int(n, size = mc_size, replace = TRUE)])
})
# Mean and standard deviation of the results
mc_mean <- mean(mc_results)
mc_sd <- sd(mc_results)
mc_mean
## [1] 189.4469
mc_sd
## [1] 1.959181
The mean of the 1,000 subsample means is about 189.4 bike rentals, with a small standard deviation of about 1.96, showing low variability across random subsamples.
# Plot the distribution of the mean bike rental counts from the Monte Carlo Simulation
mc_df <- data.frame(mean_cnt = mc_results)
ggplot(mc_df, aes(x = mean_cnt)) +
  geom_histogram(binwidth = 1, color = "black", fill = "blue") +
  # mc_mean is a single constant, so it is passed as a fixed xintercept rather
  # than mapped through aes(); mapping it would draw one overlapping line per
  # row of mc_df.
  geom_vline(xintercept = mc_mean, color = "red", linetype = "dashed") +
  labs(
    title = "Distribution of Mean Bike Rental Counts from Monte Carlo Simulations",
    x = "Mean Bike Rentals",
    y = "Frequency"
  ) +
  theme_minimal()
The histogram shows an approximately normal distribution centered around 189.4 (the dashed red line), confirming that the overall mean is stable, even with random subsampling.