week7

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

bike <- read.csv('D:/FALL 2023/STATISTICS/datasets/bike.csv')
library(pwr)
library(ggplot2)

Null hypothesis

null hypothesis 1. There is no significant difference in the mean ‘Rented.Bike.Count’ between weekdays (Functioning.Day == “Yes”) and weekends/holidays (Functioning.Day == “No”).

Null hypothesis 2. The average ‘Rented.Bike.Count’ is the same across all four seasons (‘Seasons’).

##Calculation of sample size

# Set the parameters for the power analysis
effect_size <- 0.2  # Desired effect size (Cohen's d)
alpha <- 0.05       # Significance level (e.g., 0.05 for 5%)
power <- 0.80       # Desired power level (e.g., 0.80 for 80%)

required_sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  type = "two.sample"  # Specify a two-sample t-test
)

# Print the required sample size
print(required_sample_size)

## 
##      Two-sample t test power calculation 
## 
##               n = 393.4057
##               d = 0.2
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Here the required sample size is less than my actual sample, so i can perform Neyman-Pearson hypothesis test on my data.

Testing using Neyman-Pearson hypothesis test

You can also embed plots, for example:

## 
##  Welch Two Sample t-test
## 
## data:  bike1$Rented.Bike.Count and bike2$Rented.Bike.Count
## t = 104.44, df = 8464, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  715.4712 742.8428
## sample estimates:
## mean of x mean of y 
##   729.157     0.000

## [1] "Reject H0. There is a significant difference in the average 'rented bike count' between working days and non working days"

##performing Neyman-Pearson hypothesis test on null hypothesis 2

# Load necessary libraries if not already loaded
# install.packages("dplyr")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Assuming you have your data loaded into a DataFrame called 'bike'

# Hypothesis - Neyman-Pearson Hypothesis Test for Means (T-tests)
alpha <- 0.05
power <- 0.80

# Check if data exists for both 'Seasons' and 'Rented.Bike.Count'
if ("Seasons" %in% colnames(bike) && "Rented.Bike.Count" %in% colnames(bike)) {
  
  # Extract the relevant columns
  seasons <- bike$Seasons
  rented_bike_count <- bike$Rented.Bike.Count
  
  # Check if data exists in both variables
  if (length(seasons) > 0 && length(rented_bike_count) > 0) {
    
    # Get unique season values
    unique_seasons <- unique(seasons)
    
    # Initialize a list to store t-test results
    t_test_results <- list()
    
    # Perform t-tests for each pair of seasons
    for (i in 1:(length(unique_seasons) - 1)) {
      for (j in (i + 1):length(unique_seasons)) {
        season1 <- unique_seasons[i]
        season2 <- unique_seasons[j]
        
        # Subset data for the two seasons
        data1 <- rented_bike_count[seasons == season1]
        data2 <- rented_bike_count[seasons == season2]
        
        # Perform the t-test
        t_test_result <- t.test(data1, data2)
        
        # Store the t-test result
        t_test_results[[paste(season1, "-", season2)]] <- t_test_result
      }
    }
    
    # Interpret the results for each pair
    for (pair in names(t_test_results)) {
      t_test_result <- t_test_results[[pair]]
      p_value <- t_test_result$p.value
      
      if (p_value < alpha) {
        cat(paste("Reject Null Hypothesis (H0) for", pair, ": There is a significant difference in means. (p-value:", formatC(p_value, digits = 4), ")"))
      } else {
        cat(paste("Fail to Reject Null Hypothesis (H0) for", pair, ": There is no significant difference in means. (p-value:", formatC(p_value, digits = 4), ")"))
      }
    }
    
  } else {
    cat("Data is missing in one or both variables. Check your data.")
  }
  
} else {
  cat("The required columns 'Seasons' and/or 'Rented.Bike.Count' are not present in the 'bike' dataframe.")
}

## Reject Null Hypothesis (H0) for Winter - Spring : There is a significant difference in means. (p-value: 2.436e-239 )Reject Null Hypothesis (H0) for Winter - Summer : There is a significant difference in means. (p-value:     0 )Reject Null Hypothesis (H0) for Winter - Autumn : There is a significant difference in means. (p-value: 4.546e-285 )Reject Null Hypothesis (H0) for Spring - Summer : There is a significant difference in means. (p-value: 4.929e-52 )Reject Null Hypothesis (H0) for Spring - Autumn : There is a significant difference in means. (p-value: 3.219e-06 )Reject Null Hypothesis (H0) for Summer - Autumn : There is a significant difference in means. (p-value: 6.539e-26 )

ggplot(bike, aes(x = Functioning.Day, y = Rented.Bike.Count)) +
  geom_boxplot() +
  labs(x = "Day Type", y = "Rented Bike Count") +
  ggtitle("Distribution of Rented Bike Count by Day Type") +
  theme_minimal()

ggplot(bike, aes(x = Seasons, y = Rented.Bike.Count, fill = Seasons)) +
  geom_boxplot() +
  labs(x = "Season", y = "Rented Bike Count") +
  ggtitle("Rented Bike Count by Season") +
  theme_minimal()

##perfomring Fisher’s style test for significance on null hypothesis 1

contingency_table <- table(bike$Functioning.Day, bike$Rented.Bike.Count)

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, simulate.p.value = TRUE)

# Extract the p-value from the test result
p_value <- fisher_test_result$p.value

# Set your significance level (alpha)
alpha <- 0.05

# Check if the p-value is less than alpha
if (p_value < alpha) {
  cat("Reject the null hypothesis: There is a significant difference in the average 'rented bike count' between working days and non working days")
} else {
  cat("Fail to reject the null hypothesis: There is no significant difference in the average 'rented bike count' between working days and non working days")
}

## Reject the null hypothesis: There is a significant difference in the average 'rented bike count' between working days and non working days

# Print the p-value
cat("P-value:",p_value,"\n")

## P-value: 0.0004997501

##Testing Fisher’s style test for significance on null hypothesis 2

# Create a contingency table for Seasons and Rented.Bike.Count
contingency_table <- table(bike$Seasons, bike$Rented.Bike.Count)

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, simulate.p.value = TRUE)

# Extract the p-value from the test result
p_value <- fisher_test_result$p.value

# Set your significance level (alpha)
alpha <- 0.05

# Check if the p-value is less than alpha
if (p_value < alpha) {
  cat("Reject the null hypothesis: There is a significant difference in the distribution of 'Rented.Bike.Count' across seasons")
} else {
  cat("Fail to reject the null hypothesis: There is no significant difference in the distribution of 'Rented.Bike.Count' across seasons")
}

## Reject the null hypothesis: There is a significant difference in the distribution of 'Rented.Bike.Count' across seasons

# Print the p-value
cat("P-value:", p_value, "\n")

## P-value: 0.0004997501

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

week7

2023-10-05

R Markdown

Null hypothesis

Testing using Neyman-Pearson hypothesis test