df <- read.csv('/Users/fahadmehfooz/Desktop/IUPUI/First Semester/Intro to Statistics/Intro to Stats Dataset/Dataset 1/Superstore.csv')
colnames(df)
## [1] "Row.ID" "Order.ID" "Order.Date" "Ship.Date"
## [5] "Ship.Mode" "Customer.ID" "Customer.Name" "Segment"
## [9] "Country" "City" "State" "Postal.Code"
## [13] "Region" "Product.ID" "Category" "Sub.Category"
## [17] "Product.Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
# Since we are testing the relationship between a continuous variable (Quantity) and a categorical variable (State), we use a one-way ANOVA
result_statevsquantity <- aov(Quantity ~ State, data = df)
summary(result_statevsquantity)
## Df Sum Sq Mean Sq F value Pr(>F)
## State 48 219 4.556 0.92 0.631
## Residuals 9945 49258 4.953
Since the p-value (0.631) is greater than 0.05, we fail to reject the null hypothesis: there is no evidence that mean Quantity differs across states.
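As a rough sanity check, not part of the original analysis, the observed effect size can be estimated directly from the ANOVA table above; this sketch assumes the standard layout returned by summary(aov), with State in the first row and Residuals in the second.
# Sketch (not from the original analysis): observed effect size from the ANOVA
# table above, assuming State is row 1 and Residuals is row 2 of summary(aov).
anova_table <- summary(result_statevsquantity)[[1]]
ss_between  <- anova_table[1, "Sum Sq"]                 # about 219
ss_within   <- anova_table[2, "Sum Sq"]                 # about 49258
eta_squared <- ss_between / (ss_between + ss_within)    # about 0.004
cohens_f    <- sqrt(eta_squared / (1 - eta_squared))    # about 0.07
c(eta_squared = eta_squared, cohens_f = cohens_f)
An observed Cohen's f of roughly 0.07 is far smaller than the f = 0.3 assumed in the power analysis below.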
effect_size <- 0.3
sample_size <- nrow(df)
# Load the pwr package
library(pwr)
# Performing power analysis for ANOVA
power_analysis <- pwr.anova.test(k = 49,
                                 f = effect_size,
                                 sig.level = 0.05,
                                 n = sample_size)
# Printing the power analysis results
power_analysis
##
## Balanced one-way analysis of variance power calculation
##
## k = 49
## n = 9994
## f = 0.3
## sig.level = 0.05
## power = 1
##
## NOTE: n is number in each group
Since the power is 1, the statistical test has a very high probability of correctly rejecting a false null hypothesis. Note, though, that the NOTE above says n is the number in each group, so passing the total row count (9,994) as n overstates the power; a more conservative check is sketched below.
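As the NOTE above indicates, n in pwr.anova.test is the per-group sample size, so a more conservative check, added here as my own sketch, uses the average number of orders per state rather than the total number of rows:
# Sketch: pwr.anova.test treats n as the per-group size, so use the average
# number of orders per state (an approximation, since the design is unbalanced).
n_groups    <- length(unique(df$State))    # 49 states
avg_group_n <- nrow(df) / n_groups         # roughly 204 orders per state
pwr.anova.test(k = n_groups, f = effect_size, sig.level = 0.05, n = avg_group_n)
With roughly 204 observations per state, the power at f = 0.3 is still essentially 1, so the conclusion above is unchanged.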
state_vs_category <- chisq.test(table(df$State, df$Category))
## Warning in chisq.test(table(df$State, df$Category)): Chi-squared approximation
## may be incorrect
state_vs_category
##
## Pearson's Chi-squared test
##
## data: table(df$State, df$Category)
## X-squared = 102.86, df = 96, p-value = 0.2974
Since the p-value (0.2974) is greater than 0.05, we fail to reject the null hypothesis of independence between State and Category.
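Because of the warning above that the chi-squared approximation may be incorrect (some expected cell counts are small), an optional check, sketched here and not part of the original analysis, is to recompute the p-value by Monte Carlo simulation:
# Sketch: recompute the State vs Category test with a simulated (Monte Carlo)
# p-value, which does not rely on the large-sample chi-squared approximation.
chisq.test(table(df$State, df$Category), simulate.p.value = TRUE, B = 2000)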
Alpha Level (Significance Level): 0.05. The standard alpha level is often set at 0.05, indicating a 5% chance of rejecting the null hypothesis when it is true. This is a common and widely accepted level in statistical testing.
Power Level: 0.80. A power level of 0.80 is commonly used, indicating an 80% chance of detecting a true effect if it exists. This balance between alpha and power is often considered a reasonable compromise in statistical testing.
Minimum Effect Size: 0.3. The effect size represents the practical significance of the result. Choosing a minimum effect size of 0.3 means you are interested in detecting a moderate effect. This value may be determined based on domain knowledge or previous research.
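Given these conventions, pwr.anova.test can instead be asked how many observations per group would be needed; a minimal sketch of mine, assuming k = 49 groups and the parameters above:
# Sketch: required per-group sample size for 80% power at alpha = 0.05,
# assuming a moderate effect size (f = 0.3) across the 49 states.
pwr.anova.test(k = 49, f = 0.3, sig.level = 0.05, power = 0.80)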
# Creating a contingency table
contingency_table <- table(df$State, df$Segment)
# Performing the chi-square test
chi_square_result <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Checking if the chi-square test was successful
if (chi_square_result$p.value > 0) {
  # Extract observed and expected frequencies
  observed_freq <- chi_square_result$observed
  expected_freq <- chi_square_result$expected
  # Calculating the likelihood ratio test statistic
  likelihood_ratio_statistic <- 2 * sum(observed_freq * log(observed_freq / expected_freq))
  # Choosing a significance level (alpha)
  alpha <- 0.05
  # Determining the critical value from the chi-square distribution
  critical_value <- qchisq(1 - alpha, df = chi_square_result$parameter)
  # Comparing the test statistic to the critical value
  if (!is.na(likelihood_ratio_statistic) && !is.na(critical_value)) {
    if (likelihood_ratio_statistic > critical_value) {
      cat("Reject the null hypothesis at the", alpha, "significance level.\n")
    } else {
      cat("Fail to reject the null hypothesis at the", alpha, "significance level.\n")
    }
  } else {
    cat("Error: Unable to perform the likelihood ratio test.\n")
  }
  # Print the result of the chi-square test
  print(chi_square_result)
} else {
  cat("Error: Chi-square test unsuccessful. Check your data.\n")
}
## Error: Unable to perform the likelihood ratio test.
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 259.12, df = 96, p-value < 2.2e-16
The likelihood-ratio statistic comes out as NaN here because some cells of the State-by-Segment table have an observed count of zero, and 0 * log(0) evaluates to NaN in R; hence the statistic cannot be computed directly with this formula.
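One possible workaround, sketched here as my own addition and reusing the observed_freq, expected_freq, and critical_value objects computed above, is to drop the empty cells from the sum: x * log(x) tends to 0 as x approaches 0, so zero cells contribute nothing to the statistic.
# Sketch: drop empty cells before summing; since x * log(x) -> 0 as x -> 0,
# zero observed counts contribute nothing to the likelihood ratio (G) statistic.
nonzero <- observed_freq > 0
likelihood_ratio_statistic <- 2 * sum(observed_freq[nonzero] *
                                        log(observed_freq[nonzero] / expected_freq[nonzero]))
likelihood_ratio_statistic > critical_value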
contingency_table <- table(df$Category, df$Segment)
# Performing the Fisher's exact test
fisher_test_results <- fisher.test(contingency_table, simulate.p.value = TRUE)
# Checking the p-value
p_value <- fisher_test_results$p.value
# Making a decision
cat("p: ", p_value, "\n")
## p: 0.8295852
if (p_value < 0.05) {
  # Rejecting the null hypothesis
  print("There is a significant association between Category and Segment")
} else {
  # Failing to reject the null hypothesis
  print("There is not enough evidence to conclude that there is a significant association between Category and Segment")
}
## [1] "There is not enough evidence to conclude that there is a significant association between Category and Segment"
contingency_table <- table(df$Category, df$State)
# Performing the Fisher's exact test
fisher_test_results <- fisher.test(contingency_table, simulate.p.value = TRUE)
# Checking the p-value
p_value <- fisher_test_results$p.value
# Making a decision
cat("p: ", p_value, "\n")
## p: 0.2953523
if (p_value < 0.05) {
  # Rejecting the null hypothesis
  print("There is a significant association between Category and State.")
} else {
  # Failing to reject the null hypothesis
  print("There is not enough evidence to conclude that there is a significant association between Category and State.")
}
## [1] "There is not enough evidence to conclude that there is a significant association between Category and State."
library(ggplot2)
ggplot(df, aes(x = State, y = Quantity)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Boxplot of Quantity by State",
       x = "State",
       y = "Quantity")
The boxplot suggests no meaningful difference in the distribution of Quantity across states, which is consistent with the non-significant ANOVA result above; see the numeric check below.
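A quick numeric check of that visual impression, added here as a sketch rather than part of the original analysis, is to compare per-state summaries of Quantity directly:
# Sketch: per-state mean and median Quantity; similar values across states
# support the non-significant ANOVA result.
state_means   <- tapply(df$Quantity, df$State, mean)
state_medians <- tapply(df$Quantity, df$State, median)
summary(state_means)
summary(state_medians)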
# Load necessary libraries
library(ggplot2)
# Assuming 'Category' and 'Segment' are categorical variables
df$Category <- as.factor(df$Category)
df$Segment <- as.factor(df$Segment)
# Create a grouped bar plot
ggplot(df, aes(x = Category, fill = Segment)) +
  geom_bar(position = "dodge", color = "black", stat = "count") +
  labs(title = "Grouped Bar Plot of Category and Segment",
       x = "Category",
       y = "Count") +
  scale_fill_manual(values = c("red", "blue", "green"))
Thus the plot is consistent with our test result that there is no significant association between Category and Segment; a quick proportion check is sketched below.
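As a final check, sketched here as my own addition, the composition of categories within each segment can be tabulated; near-identical column proportions are consistent with the independence suggested above:
# Sketch: share of each Category within each Segment (columns sum to 1);
# near-identical columns are consistent with independence.
round(prop.table(table(df$Category, df$Segment), margin = 2), 3)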