I am importing the libraries needed to run these notes.

library(tidyverse)

## Warning: package 'dplyr' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.2

## corrplot 0.92 loaded

# Read the shoe-shop CSV as a comma-delimited file; show_col_types = FALSE
# suppresses readr's column-specification message.
# NOTE(review): the path is absolute and machine-specific — confirm or adjust
# it before running this on another computer.
my_data <- read_delim("C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",delim=",",show_col_types = FALSE)

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Preview the first rows of the imported data frame
head(my_data)

## # A tibble: 6 × 14
##   Inferential statistics…¹ ...2  ...3   ...4 ...5  ...6   ...7 ...8   ...9 ...10
##   <chr>                    <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 2 <NA>                     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 3 InvoiceNo                Date  Coun…    NA Shop  Gend…  NA   Size…  NA   "Uni…
## 4 52389                    1/1/… Unit…  2152 UK2   Male   11   44     10.5 "$15…
## 5 52390                    1/1/… Unit…  2230 US15  Male   11.5 44-45  11   "$19…
## 6 52391                    1/1/… Cana…  2160 CAN7  Male    9.5 42-43   9   "$14…
## # ℹ abbreviated name: ¹`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>

#1) Cleaning my dataset

I am renaming my columns from the auto-generated names (`...2`, `...3`, etc.) to descriptive names (InvoiceNo, Date, etc.) to make the data simpler and clearer.

# Descriptive headers for the 14 columns, listed in column order
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)", "UnitPrice", "Discount",
  "Year", "Month", "SalePrice"
)

# Replace the auto-generated headers with the descriptive ones
names(my_data) <- new_names

# Show the headers to confirm the rename took effect
names(my_data)

##  [1] "InvoiceNo"     "Date"          "Country"       "ProductID"    
##  [5] "Shop"          "Gender"        "Size(US)"      "Size (Europe)"
##  [9] "Size (UK)"     "UnitPrice"     "Discount"      "Year"         
## [13] "Month"         "SalePrice"

I am removing the first 3 rows to remove null values and unnecessary titles from my data set.

# Drop rows 1 to 3 and keep every column
my_data <- my_data[-seq_len(3), ]


# Display the trimmed data frame
print(my_data)

## # A tibble: 14,967 × 14
##    InvoiceNo Date     Country  ProductID Shop  Gender `Size(US)` `Size (Europe)`
##    <chr>     <chr>    <chr>        <dbl> <chr> <chr>       <dbl> <chr>          
##  1 52389     1/1/2014 United …      2152 UK2   Male         11   44             
##  2 52390     1/1/2014 United …      2230 US15  Male         11.5 44-45          
##  3 52391     1/1/2014 Canada        2160 CAN7  Male          9.5 42-43          
##  4 52392     1/1/2014 United …      2234 US6   Female        9.5 40             
##  5 52393     1/1/2014 United …      2222 UK4   Female        9   39-40          
##  6 52394     1/1/2014 United …      2173 US15  Male         10.5 43-44          
##  7 52395     1/2/2014 Germany       2200 GER2  Female        9   39-40          
##  8 52396     1/2/2014 Canada        2238 CAN5  Male         10   43             
##  9 52397     1/2/2014 United …      2191 US13  Male         10.5 43-44          
## 10 52398     1/2/2014 United …      2237 UK1   Female        9   39-40          
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## #   Year <dbl>, Month <dbl>, SalePrice <chr>

# Strip the dollar sign from SalePrice and convert the text to numbers
my_data$SalePrice <- as.numeric(gsub("\\$", "", my_data$SalePrice))

# Confirm the column type after conversion
class(my_data$SalePrice)

## [1] "numeric"

# Strip the dollar sign from UnitPrice and convert the text to numbers
my_data$UnitPrice <- as.numeric(gsub("\\$", "", my_data$UnitPrice))

# Strip the percent sign from Discount and convert the text to numbers
my_data$Discount <- as.numeric(gsub("%", "", my_data$Discount))

head(my_data$Discount)

## [1]  0 20 20  0  0  0

class(my_data$UnitPrice)

## [1] "numeric"

class(my_data$Discount)

## [1] "numeric"

2) Frequency of shoe sizes vs. Country for males

# Keep only the rows whose Gender is "Male"
male_data <- filter(my_data, Gender == "Male")

# Cross-tabulate US shoe size (rows) against country (columns)
frequency_table_male <- table(male_data[["Size(US)"]], male_data[["Country"]])

# Heading for the table printed below
cat("Frequency Table for Male Shoe Sizes in All Countries:\n")

## Frequency Table for Male Shoe Sizes in All Countries:

print(frequency_table_male)

##       
##        Canada Germany United Kingdom United States
##   6        15      30              6            54
##   6.5      15      18             12            45
##   7        24      30             21            39
##   7.5      45      48             12            66
##   8        51     117             45           141
##   8.5     192     174             87           225
##   9       324     348            183           492
##   9.5     375     549            225           741
##   10      237     411            156           543
##   10.5    243     453            150           462
##   11      114     156             69           213
##   11.5     75     129             39           156
##   12       51      78             24            87
##   13       12      33              3            39
##   14       21      30             15            60
##   15       27      48             12            24

3) Frequency of shoe sizes vs. Country for Females

# Keep only the rows whose Gender is "Female"
female_data <- filter(my_data, Gender == "Female")

# Cross-tabulate US shoe size (rows) against country (columns)
frequency_table_female <- table(
  female_data[["Size(US)"]],
  female_data[["Country"]]
)

# Heading for the table printed below
cat("Frequency Table for Female Shoe Sizes in All Countries:\n")

## Frequency Table for Female Shoe Sizes in All Countries:

print(frequency_table_female)

##       
##        Canada Germany United Kingdom United States
##   4.5       6       9             15            21
##   5         6      12              9             9
##   5.5       6       9              6            42
##   6        21      15             12            33
##   6.5      51      84             24            93
##   7        93     156             27           147
##   7.5     153     222             87           318
##   8       192     324            168           618
##   8.5     171     339            129           399
##   9       213     264             93           384
##   9.5      84     126             57           189
##   10       48      87             21            75
##   10.5     36      57             18            87
##   11       18       9              3            15
##   11.5     12      15              3            30
##   12       21      12              6            39

4) Calculating confidence interval for each shoe size for different countries for Men

# Compute a t-based confidence interval for the mean count of every shoe size.
#
# Arguments:
#   frequency_table  Two-way table or matrix with one row per shoe size (the
#                    row names must be numeric size labels) and one column per
#                    observation period.
#   sample_size      Sample size used for the standard error
#                    (sd / sqrt(sample_size)) and for the degrees of freedom
#                    (sample_size - 1). Defaults to 36.
#   alpha            Significance level; the interval covers 1 - alpha.
#                    Defaults to 0.05.
#
# Returns a data frame with one row per shoe size and the columns Size, Mean,
# Standard.error, Margin.of.Error, X95..CI.Lower, X95..CI.Upper and
# Rounded.CI.Upper. The "95" in the column names is kept for compatibility
# with existing code even when a different alpha is supplied.
#
# Compatibility: earlier versions ignored `sample_size` and `alpha` and always
# used 36 and 0.05. An argument that cannot be evaluated, or that is not a
# single finite number, therefore falls back to its default with a warning
# instead of failing.
#
# NOTE(review): sample_size is independent of the number of columns that were
# tabulated — confirm that 36 is the intended number of observations.
calculate_ci <- function(frequency_table, sample_size = 36, alpha = 0.05) {
  # Evaluate an argument inside tryCatch so an unresolvable name is caught,
  # and substitute `default` when the value is not a usable number.
  numeric_or_default <- function(value, default, label) {
    resolved <- tryCatch(value, error = function(e) NULL)
    usable <- is.numeric(resolved) && length(resolved) == 1L &&
      is.finite(resolved)
    if (usable) {
      return(resolved)
    }
    warning(
      sprintf(
        "`%s` is not a single finite number; using the default %s",
        label, format(default)
      ),
      call. = FALSE
    )
    default
  }

  sample_size <- numeric_or_default(sample_size, 36, "sample_size")
  alpha <- numeric_or_default(alpha, 0.05, "alpha")
  stopifnot(sample_size > 1, alpha > 0, alpha < 1)

  # Transpose so that every shoe size becomes a column
  freq_by_size <- t(frequency_table)
  size_labels <- colnames(freq_by_size)

  # Apply a summary statistic to every size column. All columns are used:
  # the first one holds the smallest shoe size, not a label column.
  column_stat <- function(stat) {
    vapply(
      seq_along(size_labels),
      function(j) stat(freq_by_size[, j]),
      numeric(1)
    )
  }

  mean_value <- column_stat(mean)
  se_value <- column_stat(sd) / sqrt(sample_size)

  # Two-sided critical value of the t distribution
  confidence_level <- 1 - alpha
  t_statistic <- qt((1 + confidence_level) / 2, sample_size - 1)

  margin_of_error <- t_statistic * se_value
  ci_upper <- mean_value + margin_of_error

  # Build the whole result at once instead of growing it row by row
  data.frame(
    Size = as.numeric(size_labels),
    Mean = mean_value,
    Standard.error = se_value,
    Margin.of.Error = margin_of_error,
    X95..CI.Lower = mean_value - margin_of_error,
    X95..CI.Upper = ci_upper,
    Rounded.CI.Upper = round(ci_upper)
  )
}

# Calculate for United States Males
us_male_data <- my_data %>%
  filter(Country == "United States", Gender == "Male")

# Rows are US shoe sizes, columns are months
frequency_table_us_male <- table(us_male_data$`Size(US)`, us_male_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_us_male <- calculate_ci(
  frequency_table_us_male,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for United States Males
cat("Confidence Intervals for United States Males:\n")

## Confidence Intervals for United States Males:

print(results_df_us_male)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   6.5  3.75      0.5130863       1.0416206      2.708379      4.791621
## 2   7.0  3.25      0.4330127       0.8790625      2.370937      4.129063
## 3   7.5  5.50      0.5435573       1.1034800      4.396520      6.603480
## 4   8.0 11.75      0.8168831       1.6583608     10.091639     13.408361
## 5   8.5 18.75      0.7423856       1.5071229     17.242877     20.257123
## 6   9.0 41.00      1.7100416       3.4715691     37.528431     44.471569
## 7   9.5 61.75      1.7366009       3.5254873     58.224513     65.275487
## 8  10.0 45.25      1.8924691       3.8419166     41.408083     49.091917
## 9  10.5 38.50      1.4503570       2.9443812     35.555619     41.444381
## 10 11.0 17.75      0.6931301       1.4071288     16.342871     19.157129
## 11 11.5 13.00      0.7719842       1.5672112     11.432789     14.567211
## 12 12.0  7.25      0.7355511       1.4932480      5.756752      8.743248
## 13 13.0  3.25      0.4151488       0.8427968      2.407203      4.092797
## 14 14.0  5.00      0.3256695       0.6611442      4.338856      5.661144
## 15 15.0  2.00      0.2562354       0.5201855      1.479815      2.520185
##    Rounded.CI.Upper
## 1                 5
## 2                 4
## 3                 7
## 4                13
## 5                20
## 6                44
## 7                65
## 8                49
## 9                41
## 10               19
## 11               15
## 12                9
## 13                4
## 14                6
## 15                3

# Calculate for Germany Males
germany_male_data <- my_data %>%
  filter(Country == "Germany", Gender == "Male")

# Rows are US shoe sizes, columns are months
frequency_table_germany_male <- table(germany_male_data$`Size(US)`, germany_male_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_germany_male <- calculate_ci(
  frequency_table_germany_male,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for Germany Males
cat("Confidence Intervals for Germany Males:\n")

## Confidence Intervals for Germany Males:

print(results_df_germany_male)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   6.5  1.50      0.2409996       0.4892552      1.010745      1.989255
## 2   7.0  2.50      0.3793935       0.7702097      1.729790      3.270210
## 3   7.5  4.00      0.4605662       0.9349991      3.065001      4.934999
## 4   8.0  9.75      0.4930066       1.0008567      8.749143     10.750857
## 5   8.5 14.50      0.8483496       1.7222412     12.777759     16.222241
## 6   9.0 29.00      2.2940415       4.6571518     24.342848     33.657152
## 7   9.5 45.75      2.1708875       4.4071359     41.342864     50.157136
## 8  10.0 34.25      1.1216983       2.2771686     31.972831     36.527169
## 9  10.5 37.75      1.7839421       3.6215950     34.128405     41.371595
## 10 11.0 13.00      0.9718253       1.9729103     11.027090     14.972910
## 11 11.5 10.75      0.8992842       1.8256440      8.924356     12.575644
## 12 12.0  6.50      0.6009252       1.2199430      5.280057      7.719943
## 13 13.0  2.75      0.2370377       0.4812121      2.268788      3.231212
## 14 14.0  2.50      0.4174236       0.8474149      1.652585      3.347415
## 15 15.0  4.00      0.5075192       1.0303188      2.969681      5.030319
##    Rounded.CI.Upper
## 1                 2
## 2                 3
## 3                 5
## 4                11
## 5                16
## 6                34
## 7                50
## 8                37
## 9                41
## 10               15
## 11               13
## 12                8
## 13                3
## 14                3
## 15                5

# Calculate for Canada Males
canada_male_data <- my_data %>%
  filter(Country == "Canada", Gender == "Male")

# Rows are US shoe sizes, columns are months
frequency_table_canada_male <- table(canada_male_data$`Size(US)`, canada_male_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_canada_male <- calculate_ci(
  frequency_table_canada_male,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for Canada Males
cat("Confidence Intervals for Canada Males:\n")

## Confidence Intervals for Canada Males:

print(results_df_canada_male)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   6.5  1.25      0.2853759       0.5793438     0.6706562      1.829344
## 2   7.0  2.00      0.3256695       0.6611442     1.3388558      2.661144
## 3   7.5  3.75      0.4445234       0.9024304     2.8475696      4.652430
## 4   8.0  4.25      0.6518234       1.3232718     2.9267282      5.573272
## 5   8.5 16.00      1.4124268       2.8673789    13.1326211     18.867379
## 6   9.0 27.00      1.5472360       3.1410562    23.8589438     30.141056
## 7   9.5 31.25      1.7668739       3.5869447    27.6630553     34.836945
## 8  10.0 19.75      0.5735104       1.1642879    18.5857121     20.914288
## 9  10.5 20.25      1.3373519       2.7149687    17.5350313     22.964969
## 10 11.0  9.50      0.6134025       1.2452733     8.2547267     10.745273
## 11 11.5  6.25      0.6709145       1.3620288     4.8879712      7.612029
## 12 12.0  4.25      0.5690902       1.1553145     3.0946855      5.405314
## 13 13.0  1.00      0.2752409       0.5587688     0.4412312      1.558769
## 14 14.0  1.75      0.2474619       0.5023743     1.2476257      2.252374
## 15 15.0  2.25      0.3490608       0.7086311     1.5413689      2.958631
##    Rounded.CI.Upper
## 1                 2
## 2                 3
## 3                 5
## 4                 6
## 5                19
## 6                30
## 7                35
## 8                21
## 9                23
## 10               11
## 11                8
## 12                5
## 13                2
## 14                2
## 15                3

# Calculate for United Kingdom Males
uk_male_data <- my_data %>%
  filter(Country == "United Kingdom", Gender == "Male")

# Rows are US shoe sizes, columns are months
frequency_table_uk_male <- table(uk_male_data$`Size(US)`, uk_male_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_uk_male <- calculate_ci(
  frequency_table_uk_male,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for United Kingdom Males
cat("Confidence Intervals for United Kingdom Males:\n")

## Confidence Intervals for United Kingdom Males:

print(results_df_uk_male)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   6.5  1.00      0.2659080       0.5398220    0.46017804     1.5398220
## 2   7.0  1.75      0.2763854       0.5610922    1.18890781     2.3110922
## 3   7.5  1.00      0.2357023       0.4785010    0.52149897     1.4785010
## 4   8.0  3.75      0.4878576       0.9904036    2.75959639     4.7404036
## 5   8.5  7.25      0.4930066       1.0008567    6.24914329     8.2508567
## 6   9.0 15.25      1.1636867       2.3624095   12.88759046    17.6124095
## 7   9.5 18.75      1.0374916       2.1062200   16.64378001    20.8562200
## 8  10.0 13.00      0.9534626       1.9356320   11.06436804    14.9356320
## 9  10.5 12.50      0.7571211       1.5370375   10.96296250    14.0370375
## 10 11.0  5.75      0.4878576       0.9904036    4.75959639     6.7404036
## 11 11.5  3.25      0.4090208       0.8303563    2.41964372     4.0803563
## 12 12.0  2.00      0.4438127       0.9009876    1.09901236     2.9009876
## 13 13.0  0.25      0.1443376       0.2930208   -0.04302084     0.5430208
## 14 14.0  1.25      0.2853759       0.5793438    0.67065616     1.8293438
## 15 15.0  1.00      0.2010076       0.4080670    0.59193295     1.4080670
##    Rounded.CI.Upper
## 1                 2
## 2                 2
## 3                 1
## 4                 5
## 5                 8
## 6                18
## 7                21
## 8                15
## 9                14
## 10                7
## 11                4
## 12                3
## 13                1
## 14                2
## 15                1

5) Calculating confidence interval for each shoe size for different countries for Females

# Calculate for United States Females
us_female_data <- my_data %>%
  filter(Country == "United States", Gender == "Female")

# Rows are US shoe sizes, columns are months
frequency_table_us_female <- table(us_female_data$`Size(US)`, us_female_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_us_female <- calculate_ci(
  frequency_table_us_female,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for United States Females
cat("Confidence Intervals for United States Females:\n")

## Confidence Intervals for United States Females:

print(results_df_us_female)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   5.0  0.75      0.2025718       0.4112427     0.3387573      1.161243
## 2   5.5  3.50      0.5663100       1.1496705     2.3503295      4.649671
## 3   6.0  2.75      0.6035459       1.2252634     1.5247366      3.975263
## 4   6.5  7.75      1.2311713       2.4994107     5.2505893     10.249411
## 5   7.0 12.25      0.7111131       1.4436363    10.8063637     13.693636
## 6   7.5 26.50      1.4538351       2.9514421    23.5485579     29.451442
## 7   8.0 51.50      1.7588162       3.5705867    47.9294133     55.070587
## 8   8.5 33.25      2.6667850       5.4138614    27.8361386     38.663861
## 9   9.0 32.00      1.7026420       3.4565471    28.5434529     35.456547
## 10  9.5 15.75      1.0078605       2.0460656    13.7039344     17.796066
## 11 10.0  6.25      0.6633440       1.3466599     4.9033401      7.596660
## 12 10.5  7.25      0.7457794       1.5140126     5.7359874      8.764013
## 13 11.0  1.25      0.2146762       0.4358158     0.8141842      1.685816
## 14 11.5  2.50      0.3445096       0.6993917     1.8006083      3.199392
## 15 12.0  3.25      0.6281486       1.2752095     1.9747905      4.525210
##    Rounded.CI.Upper
## 1                 1
## 2                 5
## 3                 4
## 4                10
## 5                14
## 6                29
## 7                55
## 8                39
## 9                35
## 10               18
## 11                8
## 12                9
## 13                2
## 14                3
## 15                5

# Calculate for Germany Females
germany_female_data <- my_data %>%
  filter(Country == "Germany", Gender == "Female")

# Rows are US shoe sizes, columns are months
frequency_table_germany_female <- table(germany_female_data$`Size(US)`, germany_female_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_germany_female <- calculate_ci(
  frequency_table_germany_female,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for Germany Females
cat("Confidence Intervals for Germany Females:\n")

## Confidence Intervals for Germany Females:

print(results_df_germany_female)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   5.0  1.00      0.2461830       0.4997780     0.5002220      1.499778
## 2   5.5  0.75      0.1896967       0.3851048     0.3648952      1.135105
## 3   6.0  1.25      0.3266373       0.6631090     0.5868910      1.913109
## 4   6.5  7.00      0.6890192       1.3987834     5.6012166      8.398783
## 5   7.0 13.00      0.7849596       1.5935528    11.4064472     14.593553
## 6   7.5 18.50      1.6567889       3.3634603    15.1365397     21.863460
## 7   8.0 27.00      1.6514456       3.3526129    23.6473871     30.352613
## 8   8.5 28.25      0.8261050       1.6770823    26.5729177     29.927082
## 9   9.0 22.00      1.1303883       2.2948103    19.7051897     24.294810
## 10  9.5 10.50      0.8453677       1.7161876     8.7838124     12.216188
## 11 10.0  7.25      0.7789043       1.5812598     5.6687402      8.831260
## 12 10.5  4.75      0.4878576       0.9904036     3.7595964      5.740404
## 13 11.0  0.75      0.1608845       0.3266130     0.4233870      1.076613
## 14 11.5  1.25      0.2763854       0.5610922     0.6889078      1.811092
## 15 12.0  1.00      0.2357023       0.4785010     0.5214990      1.478501
##    Rounded.CI.Upper
## 1                 1
## 2                 1
## 3                 2
## 4                 8
## 5                15
## 6                22
## 7                30
## 8                30
## 9                24
## 10               12
## 11                9
## 12                6
## 13                1
## 14                2
## 15                1

# Calculate for Canada Females
canada_female_data <- my_data %>%
  filter(Country == "Canada", Gender == "Female")

# Rows are US shoe sizes, columns are months
frequency_table_canada_female <- table(canada_female_data$`Size(US)`, canada_female_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_canada_female <- calculate_ci(
  frequency_table_canada_female,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for Canada Females
cat("Confidence Intervals for Canada Females:\n")

## Confidence Intervals for Canada Females:

print(results_df_canada_female)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   5.0  0.50      0.2071939       0.4206259    0.07937412     0.9206259
## 2   5.5  0.50      0.1329540       0.2699110    0.23008902     0.7699110
## 3   6.0  1.75      0.3188125       0.6472238    1.10277620     2.3972238
## 4   6.5  4.25      0.6440285       1.3074474    2.94255261     5.5574474
## 5   7.0  7.75      1.1549739       2.3447216    5.40527838    10.0947216
## 6   7.5 12.75      0.7949493       1.6138329   11.13616705    14.3638329
## 7   8.0 16.00      0.5458753       1.1081857   14.89181431    17.1081857
## 8   8.5 14.25      1.0495911       2.1307832   12.11921685    16.3807832
## 9   9.0 17.75      0.8964718       1.8199344   15.93006557    19.5699344
## 10  9.5  7.00      0.8498366       1.7252600    5.27474001     8.7252600
## 11 10.0  4.00      0.5504819       1.1175376    2.88246237     5.1175376
## 12 10.5  3.00      0.5075192       1.0303188    1.96968121     4.0303188
## 13 11.0  1.50      0.3370999       0.6843492    0.81565076     2.1843492
## 14 11.5  1.00      0.3956838       0.8032808    0.19671921     1.8032808
## 15 12.0  1.75      0.4330127       0.8790625    0.87093748     2.6290625
##    Rounded.CI.Upper
## 1                 1
## 2                 1
## 3                 2
## 4                 6
## 5                10
## 6                14
## 7                17
## 8                16
## 9                20
## 10                9
## 11                5
## 12                4
## 13                2
## 14                2
## 15                3

# Calculate for United Kingdom Females
uk_female_data <- my_data %>%
  filter(Country == "United Kingdom", Gender == "Female")

# Rows are US shoe sizes, columns are months
frequency_table_uk_female <- table(uk_female_data$`Size(US)`, uk_female_data$Month)

# Pass the numbers explicitly: the bare names `n` and `alpha` are not
# variables in this script, and 36 / 0.05 are the values calculate_ci() uses.
results_df_uk_female <- calculate_ci(
  frequency_table_uk_female,
  sample_size = 36,
  alpha = 0.05
)

# Print the results for United Kingdom Females
cat("Confidence Intervals for United Kingdom Females:\n")

## Confidence Intervals for United Kingdom Females:

print(results_df_uk_female)

##    Size  Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1   5.0  0.75      0.2940917       0.5970378    0.15296221     1.3470378
## 2   5.5  0.50      0.1666667       0.3383513    0.16164868     0.8383513
## 3   6.0  1.00      0.2930164       0.5948548    0.40514517     1.5948548
## 4   6.5  2.00      0.5458753       1.1081857    0.89181431     3.1081857
## 5   7.0  2.25      0.5371314       1.0904346    1.15956536     3.3404346
## 6   7.5  7.25      0.6633440       1.3466599    5.90334010     8.5966599
## 7   8.0 14.00      0.6741999       1.3686985   12.63130151    15.3686985
## 8   8.5 10.75      0.7111131       1.4436363    9.30636369    12.1936363
## 9   9.0  7.75      1.1081083       2.2495795    5.50042052     9.9995795
## 10  9.5  4.75      0.6556861       1.3311135    3.41888648     6.0811135
## 11 10.0  1.75      0.3417498       0.6937890    1.05621101     2.4437890
## 12 10.5  1.50      0.3658393       0.7426932    0.75730676     2.2426932
## 13 11.0  0.25      0.1035969       0.2103129    0.03968706     0.4603129
## 14 11.5  0.25      0.1035969       0.2103129    0.03968706     0.4603129
## 15 12.0  0.50      0.1946247       0.3951092    0.10489078     0.8951092
##    Rounded.CI.Upper
## 1                 1
## 2                 1
## 3                 2
## 4                 3
## 5                 3
## 6                 9
## 7                15
## 8                12
## 9                10
## 10                6
## 11                2
## 12                2
## 13                0
## 14                0
## 15                1

6) Correlation matrix for SalePrice

# Assuming 'my_data' is your dataset

# Correlate every numeric column with SalePrice, provided the column exists
if (!("SalePrice" %in% colnames(my_data))) {
  cat("SalePrice is not present in my_data column names.\n")
} else {
  # Keep only the numeric columns. vapply() always returns a logical vector,
  # whereas the return type of sapply() depends on its input.
  numerical_data <- my_data[, vapply(my_data, is.numeric, logical(1))]

  # SalePrice is only among them if it was converted to numeric beforehand
  if ("SalePrice" %in% colnames(numerical_data)) {
    # Pairwise correlations between all numeric columns
    correlation_matrix <- cor(numerical_data)

    # The SalePrice row holds its correlation with every numeric column
    correlation_with_saleprice <- correlation_matrix["SalePrice", ]

    print(correlation_with_saleprice)
  } else {
    cat("SalePrice is not present in numerical_data column names.\n")
  }
}

##     ProductID      Size(US)     Size (UK)     UnitPrice      Discount 
##  0.0120457226 -0.0015988698 -0.0031612731  0.6056385510 -0.8144905797 
##          Year         Month     SalePrice 
## -0.0161758131 -0.0009890055  1.0000000000

corrplot(correlation_matrix, method = "color")

ProductID and SalePrice (0.0120):

There is a very weak positive correlation between ProductID and SalePrice. However, the correlation is close to zero, suggesting that there is almost no linear relationship between these two variables.

Size(US) and SalePrice (-0.0016):

There is a very weak negative correlation between the size in US measurements and SalePrice. This implies that as the size increases or decreases, there is almost no linear impact on SalePrice.

Size (UK) and SalePrice (-0.0032):

Similar to Size(US), there is a very weak negative correlation between the size in UK measurements and SalePrice. Changes in size have almost no linear impact on SalePrice.

UnitPrice and SalePrice (0.6056):

There is a moderate positive correlation between UnitPrice and SalePrice. This suggests that as the unit price increases, the SalePrice tends to increase as well. The correlation is not extremely strong but indicates a noticeable trend.

Discount and SalePrice (-0.8145):

There is a strong negative correlation between Discount and SalePrice. This indicates that as the discount increases, the SalePrice tends to decrease, and vice versa. The strong negative correlation suggests a clear inverse relationship.

Year and SalePrice (-0.0162):

There is a very weak negative correlation between the year and SalePrice. This implies that there is almost no linear relationship between the year and SalePrice.

Month and SalePrice (-0.0010):

There is a very weak negative correlation between the month and SalePrice. Changes in the month have almost no linear impact on SalePrice.

# Boxplot of SalePrice
    boxplot(my_data$SalePrice, main = "Boxplot of SalePrice")

# 7) Total Sales by Discount

# all() collapses the three membership checks into the single TRUE/FALSE that
# `if` expects, instead of chaining the element-wise `&` operator.
if (all(c("Discount", "UnitPrice", "SalePrice") %in% colnames(my_data))) {

  # For each discount level, compute the mean and the sum of both UnitPrice
  # and SalePrice; each result column is a matrix with the two statistics.
  discount_analysis <- aggregate(
    cbind(UnitPrice, SalePrice) ~ Discount,
    data = my_data,
    FUN = function(x) c(MeanUnitPrice = mean(x), TotalSales = sum(x))
  )

  # Bar chart of the summed SalePrice for every discount level
  barplot(
    discount_analysis$SalePrice[, "TotalSales"],
    names.arg = discount_analysis$Discount,
    main = "Total Sales by Discount",
    xlab = "Discount",
    ylab = "Total Sales",
    col = "lightgreen"
  )

} else {
  cat("Required columns are not present in my_data.\n")
}

Conclusion

8) Rounded Upper Confidence Intervals for Different Shoe Sizes (US Male)

# Scatter plot of the rounded upper confidence limit per shoe size (US males)
ggplot(results_df_us_male, aes(x = Size, y = Rounded.CI.Upper)) +
  # Larger points so each size is easy to spot
  geom_point(size = 3) +
  # Label every point with its size and rounded limit; overlapping labels are dropped
  geom_text(
    aes(label = paste("Size:", Size, "\nCI:", Rounded.CI.Upper)),
    vjust = -1,
    hjust = 1,
    check_overlap = TRUE
  ) +
  labs(
    title = "Rounded Upper Confidence Intervals for Different Shoe Sizes (US Males)",
    x = "Shoe Size (US)",
    y = "Rounded Upper Confidence Interval"
  ) +
  # Minimal theme keeps the background uncluttered
  theme_minimal()

# Scatter plot of the rounded upper confidence limit per shoe size (UK females)
ggplot(results_df_uk_female, aes(x = Size, y = Rounded.CI.Upper)) +
  # Larger points so each size is easy to spot
  geom_point(size = 3) +
  # Label every point with its size and rounded limit; overlapping labels are dropped
  geom_text(
    aes(label = paste("Size:", Size, "\nCI:", Rounded.CI.Upper)),
    vjust = -1,
    hjust = 1,
    check_overlap = TRUE
  ) +
  labs(
    title = "Rounded Upper Confidence Intervals for Different Shoe Sizes (UK Female)",
    x = "Shoe Size (US)",
    y = "Rounded Upper Confidence Interval"
  ) +
  # Minimal theme keeps the background uncluttered
  theme_minimal()

Final_Project

Surya

2023-11-28