This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
bike <- read.csv('D:/FALL 2023/STATISTICS/datasets/bike.csv')
library(pwr)
library(ggplot2)
null hypothesis 1. There is no significant difference in the mean ‘Rented.Bike.Count’ between weekdays (Functioning.Day == “Yes”) and weekends/holidays (Functioning.Day == “No”).
Null hypothesis 2. The average ‘Rented.Bike.Count’ is the same across all four seasons (‘Seasons’).
##Calculation of sample size
effect_size <- 0.2 # Desired effect size (Cohen's d)
alpha <- 0.05 # Significance level (e.g., 0.05 for 5%)
power <- 0.80 # Desired power level (e.g., 0.80 for 80%)
required_sample_size <- pwr.t.test(
d = effect_size,
sig.level = alpha,
power = power,
type = "two.sample" # Specify a two-sample t-test
)
# Print the required sample size
print(required_sample_size)
##
## Two-sample t test power calculation
##
## n = 393.4057
## d = 0.2
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
Here the required sample size is less than my actual sample, so i can perform Neyman-Pearson hypothesis test on my data.
You can also embed plots, for example:
##
## Welch Two Sample t-test
##
## data: bike1$Rented.Bike.Count and bike2$Rented.Bike.Count
## t = 104.44, df = 8464, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 715.4712 742.8428
## sample estimates:
## mean of x mean of y
## 729.157 0.000
## [1] "Reject H0. There is a significant difference in the average 'rented bike count' between working days and non working days"
##performing Neyman-Pearson hypothesis test on null hypothesis 2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
alpha <- 0.05
power <- 0.80
if ("Seasons" %in% colnames(bike) && "Rented.Bike.Count" %in% colnames(bike)) {
seasons <- bike$Seasons
rented_bike_count <- bike$Rented.Bike.Count
if (length(seasons) > 0 && length(rented_bike_count) > 0) {
unique_seasons <- unique(seasons)
t_test_results <- list()
for (i in 1:(length(unique_seasons) - 1)) {
for (j in (i + 1):length(unique_seasons)) {
season1 <- unique_seasons[i]
season2 <- unique_seasons[j]
# Subset data for the two seasons
data1 <- rented_bike_count[seasons == season1]
data2 <- rented_bike_count[seasons == season2]
# Perform the t-test
t_test_result <- t.test(data1, data2)
# Store the t-test result
t_test_results[[paste(season1, "-", season2)]] <- t_test_result
}
}
# Interpret the results for each pair
for (pair in names(t_test_results)) {
t_test_result <- t_test_results[[pair]]
p_value <- t_test_result$p.value
if (p_value < alpha) {
cat(paste("Reject Null Hypothesis (H0) for", pair, ": There is a significant difference in means. (p-value:", formatC(p_value, digits = 4), ")"))
} else {
cat(paste("Fail to Reject Null Hypothesis (H0) for", pair, ": There is no significant difference in means. (p-value:", formatC(p_value, digits = 4), ")"))
}
}
} else {
cat("Data is missing in one or both variables. Check your data.")
}
} else {
cat("The required columns 'Seasons' and/or 'Rented.Bike.Count' are not present in the 'bike' dataframe.")
}
## Reject Null Hypothesis (H0) for Winter - Spring : There is a significant difference in means. (p-value: 2.436e-239 )Reject Null Hypothesis (H0) for Winter - Summer : There is a significant difference in means. (p-value: 0 )Reject Null Hypothesis (H0) for Winter - Autumn : There is a significant difference in means. (p-value: 4.546e-285 )Reject Null Hypothesis (H0) for Spring - Summer : There is a significant difference in means. (p-value: 4.929e-52 )Reject Null Hypothesis (H0) for Spring - Autumn : There is a significant difference in means. (p-value: 3.219e-06 )Reject Null Hypothesis (H0) for Summer - Autumn : There is a significant difference in means. (p-value: 6.539e-26 )
ggplot(bike, aes(x = Functioning.Day, y = Rented.Bike.Count)) +
geom_boxplot() +
labs(x = "Day Type", y = "Rented Bike Count") +
ggtitle("Distribution of Rented Bike Count by Day Type") +
theme_minimal()
ggplot(bike, aes(x = Seasons, y = Rented.Bike.Count, fill = Seasons)) +
geom_boxplot() +
labs(x = "Season", y = "Rented Bike Count") +
ggtitle("Rented Bike Count by Season") +
theme_minimal()
##perfomring Fisher’s style test for significance on null hypothesis 1
contingency_table <- table(bike$Functioning.Day, bike$Rented.Bike.Count)
fisher_test_result <- fisher.test(contingency_table, simulate.p.value = TRUE)
p_value <- fisher_test_result$p.value
alpha <- 0.05
# Check if the p-value is less than alpha
if (p_value < alpha) {
cat("Reject the null hypothesis: There is a significant difference in the average 'rented bike count' between working days and non working days")
} else {
cat("Fail to reject the null hypothesis: There is no significant difference in the average 'rented bike count' between working days and non working days")
}
## Reject the null hypothesis: There is a significant difference in the average 'rented bike count' between working days and non working days
# Print the p-value
cat("P-value:",p_value,"\n")
## P-value: 0.0004997501
##Testing Fisher’s style test for significance on null hypothesis 2
contingency_table <- table(bike$Seasons, bike$Rented.Bike.Count)
fisher_test_result <- fisher.test(contingency_table, simulate.p.value = TRUE)
p_value <- fisher_test_result$p.value
alpha <- 0.05
# Check if the p-value is less than alpha
if (p_value < alpha) {
cat("Reject the null hypothesis: There is a significant difference in the distribution of 'Rented.Bike.Count' across seasons")
} else {
cat("Fail to reject the null hypothesis: There is no significant difference in the distribution of 'Rented.Bike.Count' across seasons")
}
## Reject the null hypothesis: There is a significant difference in the distribution of 'Rented.Bike.Count' across seasons
# Print the p-value
cat("P-value:", p_value, "\n")
## P-value: 0.0004997501
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.