df <- read.csv('/Users/fahadmehfooz/Desktop/IUPUI/First Semester/Intro to Statistics/Intro to Stats Dataset/Dataset 1/Superstore.csv')
colnames(df)
##  [1] "Row.ID"        "Order.ID"      "Order.Date"    "Ship.Date"    
##  [5] "Ship.Mode"     "Customer.ID"   "Customer.Name" "Segment"      
##  [9] "Country"       "City"          "State"         "Postal.Code"  
## [13] "Region"        "Product.ID"    "Category"      "Sub.Category" 
## [17] "Product.Name"  "Sales"         "Quantity"      "Discount"     
## [21] "Profit"

Hypothesis 1

H0: There is no significant difference in mean Quantity across states.

HA: There is a significant difference in mean Quantity across states.

Significance level: 0.05, Power level: 0.8, Minimum Effect Size: 0.3.

# Since we are testing the relationship between a continuous variable (Quantity) and a categorical variable (State), a one-way ANOVA is appropriate
result_statevsquantity <- aov(Quantity ~ State, data = df)
summary(result_statevsquantity)
##               Df Sum Sq Mean Sq F value Pr(>F)
## State         48    219   4.556    0.92  0.631
## Residuals   9945  49258   4.953

Since the p-value (0.631) is greater than 0.05, we fail to reject the null hypothesis.
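
To gauge how small the observed effect actually is, a quick sketch below computes eta-squared from the sums of squares reported in the ANOVA summary above (simple arithmetic on the printed values, not part of the original output).

# Eta-squared = SS_between / SS_total, using the sums of squares from summary() above
ss_state <- 219
ss_residual <- 49258
eta_squared <- ss_state / (ss_state + ss_residual)
eta_squared  # roughly 0.004, i.e. State explains well under 1% of the variance in Quantity

An observed effect this small is far below the minimum effect size of 0.3 we set out to detect.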

effect_size <- 0.3 
sample_size <- nrow(df) 

# Load the pwr package
library(pwr)

# Performing power analysis for ANOVA.
# Note: pwr.anova.test() treats n as the per-group sample size (see the NOTE in the
# output below), so passing the full dataset size gives an optimistic estimate of power.
power_analysis <- pwr.anova.test(k = 49,
                                 f = effect_size,
                                 sig.level = 0.05,
                                 n = sample_size)

# Printing the power analysis results
power_analysis
## 
##      Balanced one-way analysis of variance power calculation 
## 
##               k = 49
##               n = 9994
##               f = 0.3
##       sig.level = 0.05
##           power = 1
## 
## NOTE: n is number in each group

Since the power comes out as 1, the test has a very high probability of correctly rejecting a false null hypothesis, although this figure is optimistic because the total sample size, rather than the per-group size, was supplied as n.
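
As a complementary check (a minimal sketch, again using the pwr package), we can instead solve for the per-group sample size needed to reach the target power of 0.8 at the chosen effect size.

# Solve for the per-group n required for power = 0.8 with k = 49 groups and f = 0.3
pwr.anova.test(k = 49, f = 0.3, sig.level = 0.05, power = 0.8)

The required per-group n is well below the average number of rows per state (about 9994 / 49, roughly 204), which is consistent with the very high power reported above.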

Hypothesis 2:

Null Hypothesis (H0): There is no significant association between “State” and “Category”.

Alternative Hypothesis (HA): There is a significant association between “State” and “Category”.

Significance level: 0.05, Power level: 0.8, Minimum Effect Size: 0.3.

state_vs_category <- chisq.test(table(df$State, df$Category))
## Warning in chisq.test(table(df$State, df$Category)): Chi-squared approximation
## may be incorrect
state_vs_category
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$State, df$Category)
## X-squared = 102.86, df = 96, p-value = 0.2974

Since the p-value (0.2974) is greater than 0.05, we fail to reject the null hypothesis.
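
Two quick follow-ups (sketches, not part of the original output): the warning above suggests some sparse cells, so the p-value can be re-checked with a simulated null distribution, and the pwr package can confirm that the test is adequately powered for the stated minimum effect size (Cohen's w = 0.3).

# Re-check the p-value by simulation, since some State x Category cells are sparse
chisq.test(table(df$State, df$Category), simulate.p.value = TRUE, B = 2000)

# Power of the chi-square test at the stated minimum effect size, with df = (49-1)*(3-1) = 96
pwr.chisq.test(w = 0.3, N = nrow(df), df = 96, sig.level = 0.05)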

Reasons for choosing alpha level, power and effect size:

Alpha Level (Significance Level): 0.05. The standard alpha level is often set at 0.05, indicating a 5% chance of rejecting the null hypothesis when it is true. This is a common and widely accepted level in statistical testing.

Power Level: 0.80. A power level of 0.80 is commonly used, indicating an 80% chance of detecting a true effect if it exists. This balance between alpha and power is often considered a reasonable compromise in statistical testing.

Minimum Effect Size: 0.3. The effect size represents the practical significance of the result. Choosing a minimum effect size of 0.3 means we are interested in detecting a moderate effect. This value may be determined based on domain knowledge or previous research.

Following the Neyman-Pearson approach, we can also compute a likelihood-ratio statistic for State vs. Segment and compare it against the critical value of the chi-square distribution at the chosen alpha.

# Creating a contingency table
contingency_table <- table(df$State, df$Segment)

# Performing the chi-square test
chi_square_result <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Checking if the chi-square test was successful
if (chi_square_result$p.value > 0) {
  # Extract observed and expected frequencies
  observed_freq <- chi_square_result$observed
  expected_freq <- chi_square_result$expected

  # Calculating the likelihood ratio (G) test statistic
  # Note: any cell with an observed count of 0 makes the term 0 * log(0) evaluate to NaN
  likelihood_ratio_statistic <- 2 * sum(observed_freq * log(observed_freq / expected_freq))

  # Choosing a significance level (alpha)
  alpha <- 0.05

  # Determining the critical value from the chi-square distribution
  critical_value <- qchisq(1 - alpha, df = chi_square_result$parameter)
  

  # Comparing the test statistic to the critical value
  if (!is.na(likelihood_ratio_statistic) && !is.na(critical_value)) {
    if (likelihood_ratio_statistic > critical_value) {
      cat("Reject the null hypothesis at the", alpha, "significance level.\n")
    } else {
      cat("Fail to reject the null hypothesis at the", alpha, "significance level.\n")
    }
  } else {
    cat("Error: Unable to perform the likelihood ratio test.\n")
  }

  # Print the result of the chi-square test
  print(chi_square_result)
} else {
  cat("Error: Chi-square test unsuccessful. Check your data.\n")
}
## Error: Unable to perform the likelihood ratio test.
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 259.12, df = 96, p-value < 2.2e-16

The likelihood-ratio statistic comes out as NaN here because some observed cell counts are zero, which makes the corresponding 0 * log(0) terms undefined; hence the statistic cannot be computed directly from the formula above.
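
A minimal workaround (a sketch, using the convention that 0 * log(0) = 0, i.e. dropping zero-count cells from the sum):

# Recompute the G statistic over non-zero cells only (0 * log(0) is taken as 0)
obs_freq <- chi_square_result$observed
exp_freq <- chi_square_result$expected
nonzero <- obs_freq > 0
G <- 2 * sum(obs_freq[nonzero] * log(obs_freq[nonzero] / exp_freq[nonzero]))
G

# Compare against the same chi-square reference distribution used above
pchisq(G, df = chi_square_result$parameter, lower.tail = FALSE)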

Fisher's exact test for significance

Test 1

# Creating a contingency table
contingency_table <- table(df$Category, df$Segment)

# Performing the Fisher's exact test
fisher_test_results <- fisher.test(contingency_table, simulate.p.value = TRUE)

# Checking the p-value
p_value <- fisher_test_results$p.value

# Making a decision
cat("p: ", p_value, "\n")
## p:  0.8295852
if (p_value < 0.05) {
  # Rejecting the null hypothesis
  print( "There is a significant association between Category and Segment")
} else {
  # Failing to reject the null hypothesis
  print("There is not enough evidence to conclude that there is a significant association between Category and Segment")
}
## [1] "There is not enough evidence to conclude that there is a significant association between Category and Segment"

Test 2

contingency_table <- table(df$Category, df$State)

# Performing the Fisher's exact test
fisher_test_results <- fisher.test(contingency_table, simulate.p.value = TRUE)

# Checking the p-value
p_value <- fisher_test_results$p.value

# Making a decision
cat("p: ", p_value, "\n")
## p:  0.2953523
if (p_value < 0.05) {
  # Rejecting the null hypothesis
  print( "There is a significant association between Category and State.")
} else {
  # Failing to reject the null hypothesis
  print("There is not enough evidence to conclude that there is a significant association between Category and State.")
}
## [1] "There is not enough evidence to conclude that there is a significant association between Category and State."
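
Since simulate.p.value = TRUE draws Monte Carlo replicates, these p-values vary slightly from run to run. A small sketch for making them reproducible and a bit more precise (the seed value 42 is arbitrary):

# Fix the random seed and increase the number of Monte Carlo replicates
set.seed(42)
fisher.test(table(df$Category, df$State), simulate.p.value = TRUE, B = 10000)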

Build two visualizations that best illustrate the results from the two pairs of hypothesis tests, one for each null hypothesis.

library(ggplot2)

# Boxplot of Quantity by State; x-axis labels rotated so all 49 state names stay readable
ggplot(df, aes(x = State, y = Quantity)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Boxplot of Quantity by State",
       x = "State",
       y = "Quantity") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

The boxplots look very similar across states, which is consistent with the ANOVA result: there is no evidence of a difference in Quantity between states.
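
To back up the visual impression numerically (a quick sketch), the per-state means can be tabulated directly; they should all sit close to one another.

# Average Quantity per state, sorted from highest to lowest mean
state_means <- aggregate(Quantity ~ State, data = df, FUN = mean)
head(state_means[order(-state_means$Quantity), ])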

# Load necessary libraries
library(ggplot2)

# Assuming 'Category' and 'Segment' are categorical variables
df$Category <- as.factor(df$Category)
df$Segment <- as.factor(df$Segment)

# Create a grouped bar plot
ggplot(df, aes(x = Category, fill = Segment)) +
  geom_bar(position = "dodge", color = "black", stat = "count") +
  labs(title = "Grouped Bar Plot of Category and Segment",
       x = "Category",
       y = "Count") +
  scale_fill_manual(values = c("red", "blue", "green"))  

This is consistent with our result that there is no significant association between Category and Segment: within each category, the relative sizes of the three segments look similar.
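
Because the raw counts differ across categories, row proportions make this comparison clearer (a small sketch):

# Proportion of each Segment within each Category; similar rows indicate no association
round(prop.table(table(df$Category, df$Segment), margin = 1), 3)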