Untitled

I am importing the libraries needed to run these notes.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)

# Read the raw comma-delimited export. The file starts with title rows ahead
# of the real header, so readr auto-names the unnamed columns (`...2`,
# `...3`, ...) and reports parsing issues for this import.
my_data_3 <- read_delim(
  file = "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",
  delim = ",",
  show_col_types = FALSE
)

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Preview the first six rows of the raw import
head(my_data_3, n = 6L)

## # A tibble: 6 × 14
##   Inferential statistics…¹ ...2  ...3   ...4 ...5  ...6   ...7 ...8   ...9 ...10
##   <chr>                    <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 2 <NA>                     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 3 InvoiceNo                Date  Coun…    NA Shop  Gend…  NA   Size…  NA   "Uni…
## 4 52389                    1/1/… Unit…  2152 UK2   Male   11   44     10.5 "$15…
## 5 52390                    1/1/… Unit…  2230 US15  Male   11.5 44-45  11   "$19…
## 6 52391                    1/1/… Cana…  2160 CAN7  Male    9.5 42-43   9   "$14…
## # ℹ abbreviated name: ¹`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>

Cleaning my dataset

I am renaming the column titles from the auto-generated names (e.g. `...2`, `...3`) to descriptive ones (InvoiceNo, Date, etc.) to make the data simpler and clearer.

# Descriptive headers for the 14 imported columns, in column order
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)", "UnitPrice", "Discount",
  "Year", "Month", "SalePrice"
)

# Overwrite the existing headers with the descriptive ones
names(my_data_3) <- new_names

# Confirm the rename took effect
names(my_data_3)

##  [1] "InvoiceNo"     "Date"          "Country"       "ProductID"    
##  [5] "Shop"          "Gender"        "Size(US)"      "Size (Europe)"
##  [9] "Size (UK)"     "UnitPrice"     "Discount"      "Year"         
## [13] "Month"         "SalePrice"

I am removing the first 3 rows of my data set because they contain only null values and unnecessary title/header text.

# Drop the three leading non-data rows (title, empty row, embedded header)
my_data_3 <- my_data_3[-seq_len(3L), ]

# Show the trimmed data frame
print(my_data_3)

## # A tibble: 14,967 × 14
##    InvoiceNo Date     Country  ProductID Shop  Gender `Size(US)` `Size (Europe)`
##    <chr>     <chr>    <chr>        <dbl> <chr> <chr>       <dbl> <chr>          
##  1 52389     1/1/2014 United …      2152 UK2   Male         11   44             
##  2 52390     1/1/2014 United …      2230 US15  Male         11.5 44-45          
##  3 52391     1/1/2014 Canada        2160 CAN7  Male          9.5 42-43          
##  4 52392     1/1/2014 United …      2234 US6   Female        9.5 40             
##  5 52393     1/1/2014 United …      2222 UK4   Female        9   39-40          
##  6 52394     1/1/2014 United …      2173 US15  Male         10.5 43-44          
##  7 52395     1/2/2014 Germany       2200 GER2  Female        9   39-40          
##  8 52396     1/2/2014 Canada        2238 CAN5  Male         10   43             
##  9 52397     1/2/2014 United …      2191 US13  Male         10.5 43-44          
## 10 52398     1/2/2014 United …      2237 UK1   Female        9   39-40          
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## #   Year <dbl>, Month <dbl>, SalePrice <chr>

# Strip the dollar sign from SalePrice and convert the text to numeric
my_data_3[["SalePrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["SalePrice"]])
)

# Confirm the column is now numeric
class(my_data_3[["SalePrice"]])

## [1] "numeric"

# Strip the dollar sign from UnitPrice and convert the text to numeric
my_data_3[["UnitPrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["UnitPrice"]])
)

# Strip the percent sign from Discount and convert the text to numeric
my_data_3[["Discount"]] <- as.numeric(
  gsub("%", "", my_data_3[["Discount"]])
)

# Preview the first few cleaned Discount values
head(my_data_3[["Discount"]])

## [1]  0 20 20  0  0  0

# Confirm UnitPrice was converted to numeric
class(my_data_3[["UnitPrice"]])

## [1] "numeric"

# Confirm Discount was converted to numeric
class(my_data_3[["Discount"]])

## [1] "numeric"

First Hypothesis Test(t_test):

H0 (Null Hypothesis): The mean sale price of shoes is less than or equal to $140.

Ha (Alternative Hypothesis): The mean sale price of shoes is greater than $140.

# Hypothesised mean sale price and significance level
null_mean <- 140
alpha <- 0.05

# One-sample, one-sided t-test: is the mean sale price greater than null_mean?
t_test_result <- t.test(
  x = my_data_3[["SalePrice"]],
  alternative = "greater",
  mu = null_mean
)
p_value <- t_test_result[["p.value"]]

# Report the decision at the chosen significance level
cat(
  if (p_value < alpha) {
    "Rejecting the null hypothesis. The mean sale price of shoes is greater than $140.\n"
  } else {
    "Fail to reject the null hypothesis. There is no sufficient evidence that the mean sale price of shoes is greater than $140.\n"
  }
)

## Rejecting the null hypothesis. The mean sale price of shoes is greater than $140.

# Print the t-test result
#print(t_test_result)

Neyman-Pearson hypothesis test(F_test)

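H0 (Null Hypothesis): The mean sale price of shoes is less than or equal to $140.

Ha (Alternative Hypothesis): The mean sale price of shoes is greater than $140.

Note: the code below does not test the mean. It runs a one-sided chi-squared test on the variance of the sale price (H0: variance ≥ 140 vs. Ha: variance < 140), so its conclusion is about the variance, not about the hypotheses stated above.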

# Sale prices under test, the hypothesised variance, and the significance level
shoe_prices <- my_data_3[["SalePrice"]]
null_variance <- 140
alpha <- 0.05

# Sample variance of the observed prices
sample_variance <- var(shoe_prices)

# Chi-squared statistic for a variance test: (n - 1) * s^2 / sigma0^2
test_statistic <- sample_variance * (length(shoe_prices) - 1) / null_variance

# Lower-tail p-value: probability of a statistic this small or smaller
p_value <- pchisq(
  q = test_statistic,
  df = length(shoe_prices) - 1,
  lower.tail = TRUE
)

# Report the decision at the chosen significance level
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis. The population variance is less than the specified value.\n"
  } else {
    "Fail to reject the null hypothesis. There is no sufficient evidence that the population variance is less than the specified value.\n"
  }
)

## Fail to reject the null hypothesis. There is no sufficient evidence that the population variance is less than the specified value.

# Print the variance-test results: the chi-squared statistic, its degrees of
# freedom (sample size minus one), and the lower-tail p-value computed above.
cat("Test Statistic:", test_statistic, "\n")

## Test Statistic: 132308.9

# Degrees of freedom are recomputed here from the length of shoe_prices
cat("Degrees of Freedom:", length(shoe_prices) - 1, "\n")

## Degrees of Freedom: 14966

# p_value here is the one assigned by the variance test, not the earlier t-test
cat("P-Value:", p_value, "\n")

## P-Value: 1

Choosing alpha value, minimum power effect for the 1st hypothesis:

Alpha Level (α): 0.05 (5%) You’re okay with a 5% chance of being wrong.

Power Level (1 - β): 0.80 (80%) You want an 80% chance of finding a real effect when one exists.

Minimum Effect Size (Cohen’s d): 0.3

You care about noticing medium-sized differences. These values strike a balance between avoiding wrong conclusions and not needing too much data. They’re typical for many studies but may vary based on your specific situation.

2nd Hypothesis test(t_test)

Null Hypothesis (H0): Women spend the same or higher prices for shoes compared to men.

Alternative Hypothesis (H1): Women spend lower prices for shoes compared to men.

# Split the sale prices into the two gender groups
women_prices <- with(my_data_3, SalePrice[Gender == "Female"])
men_prices <- with(my_data_3, SalePrice[Gender == "Male"])

# Significance level for the decision
alpha <- 0.05

# Two-sample, one-sided t-test: is the mean price for women lower than for men?
t_test_result <- t.test(
  x = women_prices,
  y = men_prices,
  alternative = "less"
)
p_value <- t_test_result[["p.value"]]

# Report the decision at the chosen significance level
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis. Women pay significantly lower prices for shoes compared to men.\n"
  } else {
    "Fail to reject the null hypothesis. There is no sufficient evidence that women pay significantly lower prices for shoes compared to men.\n"
  }
)

## Fail to reject the null hypothesis. There is no sufficient evidence that women pay significantly lower prices for shoes compared to men.

2nd Neyman Pearson test(f_test)

Null Hypothesis (H0): Women spend the same or higher prices for shoes compared to men.

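Alternative Hypothesis (H1): Women spend lower prices for shoes compared to men. (Note: the ANOVA F-test used below is two-sided, so it only tests whether the mean prices of women and men differ, not the direction of the difference.)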

# Split the sale prices into the two gender groups
women_prices <- with(my_data_3, SalePrice[Gender == "Female"])
men_prices <- with(my_data_3, SalePrice[Gender == "Male"])

# Significance level for the decision
alpha <- 0.05

# One-way ANOVA of sale price on gender
anova_result <- aov(formula = SalePrice ~ Gender, data = my_data_3)

# p-value of the Gender term: first entry of the Pr(>F) column
p_value <- summary(anova_result)[[1L]][["Pr(>F)"]][1L]

# Report the decision at the chosen significance level
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis. There is a significant difference in prices between women and men.\n"
  } else {
    "Fail to reject the null hypothesis. There is no sufficient evidence of a significant difference in prices between women and men.\n"
  }
)

## Fail to reject the null hypothesis. There is no sufficient evidence of a significant difference in prices between women and men.

# Show the full ANOVA table
print(x = summary(object = anova_result))

##                Df   Sum Sq Mean Sq F value Pr(>F)
## Gender          1      496   496.4   0.401  0.527
## Residuals   14965 18522751  1237.7

Choosing alpha value, minimum power effect for the hypothesis:

The chosen values and their rationale for the second hypothesis test:

Alpha Value: 0.05 (5%) - You’re willing to accept a 5% chance of making a Type I error, which is a false positive.

Power Level (1 - β): 0.90 (90%) - You want a higher power to increase the chances of detecting real differences when they exist.

Minimum Effect Size (Cohen’s d): 0.2 - You’re interested in detecting smaller differences.

These values reflect a slightly higher power level to reduce the risk of false negatives and a smaller minimum effect size, as you’re looking for smaller differences between women and men’s shoe prices. However, the alpha level remains standard at 0.05 to control for Type I errors.

Visualization for 1st hypothesis Test

Hypothesis Test 1: Testing Whether the Mean Sale Price of Shoes is Greater Than $140

# Horizontal box plot of the sale-price distribution. With horizontal = TRUE
# the price values run along the x-axis.
boxplot(
  my_data_3$SalePrice,
  horizontal = TRUE,
  col = "lightblue",
  main = "Box Plot of Shoe Prices"
)
# Mark the null-hypothesis mean ($140). Because prices are on the x-axis the
# reference must be a vertical line (v =); h = 140 would be placed on the
# y-axis, far outside the range occupied by a single horizontal box.
abline(v = 140, col = "red", lwd = 2)

Visualization for 2nd Hypothesis Test:

Hypothesis Test 2: Testing Whether Women Pay Lower Prices for Shoes Compared to Men

# Average sale price within each gender group
mean_women <- mean(x = women_prices)
mean_men <- mean(x = men_prices)

# Bar chart comparing the two group means
barplot(
  height = c(mean_women, mean_men),
  names.arg = c("Women", "Men"),
  col = c("lightblue", "lightgreen"),
  main = "Average Shoe Prices by Gender",
  ylab = "Average Price"
)