I am importing the libraries needed to run these notes.
library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
# Absolute path of the raw CSV file
csv_path <- "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv"
# Read the file as comma-delimited text without printing the column types
my_data <- read_delim(file = csv_path, delim = ",", show_col_types = FALSE)
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Preview the first rows of the data exactly as imported (nothing has been
# modified yet)
head(my_data)
## # A tibble: 6 × 14
## Inferential statistics…¹ ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 2 <NA> <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 3 InvoiceNo Date Coun… NA Shop Gend… NA Size… NA "Uni…
## 4 52389 1/1/… Unit… 2152 UK2 Male 11 44 10.5 "$15…
## 5 52390 1/1/… Unit… 2230 US15 Male 11.5 44-45 11 "$19…
## 6 52391 1/1/… Cana… 2160 CAN7 Male 9.5 42-43 9 "$14…
## # ℹ abbreviated name: ¹​`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>
#1) Cleaning my dataset
I am renaming the columns from the auto-generated names (`...2`, `...3`, etc.) to descriptive names (InvoiceNo, Date, etc.) to make the data simpler and clearer.
# Descriptive names for the 14 imported columns, in file order
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)", "UnitPrice", "Discount",
  "Year", "Month", "SalePrice"
)
# Replace the existing column names with the descriptive ones
names(my_data) <- new_names
# Show the column names to confirm the change
names(my_data)
## [1] "InvoiceNo" "Date" "Country" "ProductID"
## [5] "Shop" "Gender" "Size(US)" "Size (Europe)"
## [9] "Size (UK)" "UnitPrice" "Discount" "Year"
## [13] "Month" "SalePrice"
I am removing the first 3 rows of my data set because they contain only null values and unnecessary titles.
# Drop the three leading rows (title, empty row and embedded header row)
my_data <- my_data[-seq_len(3), ]
# Show the trimmed data frame
print(my_data)
## # A tibble: 14,967 × 14
## InvoiceNo Date Country ProductID Shop Gender `Size(US)` `Size (Europe)`
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 52389 1/1/2014 United … 2152 UK2 Male 11 44
## 2 52390 1/1/2014 United … 2230 US15 Male 11.5 44-45
## 3 52391 1/1/2014 Canada 2160 CAN7 Male 9.5 42-43
## 4 52392 1/1/2014 United … 2234 US6 Female 9.5 40
## 5 52393 1/1/2014 United … 2222 UK4 Female 9 39-40
## 6 52394 1/1/2014 United … 2173 US15 Male 10.5 43-44
## 7 52395 1/2/2014 Germany 2200 GER2 Female 9 39-40
## 8 52396 1/2/2014 Canada 2238 CAN5 Male 10 43
## 9 52397 1/2/2014 United … 2191 US13 Male 10.5 43-44
## 10 52398 1/2/2014 United … 2237 UK1 Female 9 39-40
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## # Year <dbl>, Month <dbl>, SalePrice <chr>
# Strip the dollar sign from SalePrice and store the column as numbers
my_data[["SalePrice"]] <- as.numeric(
  gsub("\\$", "", my_data[["SalePrice"]])
)
# Confirm the column type
class(my_data[["SalePrice"]])
## [1] "numeric"
# Strip the dollar sign from UnitPrice and store the column as numbers
my_data[["UnitPrice"]] <- as.numeric(
  gsub("\\$", "", my_data[["UnitPrice"]])
)
# Strip the percent sign from Discount and store the column as numbers
my_data[["Discount"]] <- as.numeric(
  gsub("%", "", my_data[["Discount"]])
)
# Preview the converted discounts
head(my_data[["Discount"]])
## [1] 0 20 20 0 0 0
# Check the class of the converted UnitPrice and Discount columns
class(my_data$UnitPrice)
## [1] "numeric"
class(my_data$Discount)
## [1] "numeric"
# Keep only the rows recorded for male customers
male_data <- filter(my_data, Gender == "Male")
# Cross-tabulate US shoe size (rows) against country (columns)
frequency_table_male <- table(
  male_data[["Size(US)"]],
  male_data[["Country"]]
)
# Show the table for males under a heading
cat("Frequency Table for Male Shoe Sizes in All Countries:\n")
## Frequency Table for Male Shoe Sizes in All Countries:
print(frequency_table_male)
##
## Canada Germany United Kingdom United States
## 6 15 30 6 54
## 6.5 15 18 12 45
## 7 24 30 21 39
## 7.5 45 48 12 66
## 8 51 117 45 141
## 8.5 192 174 87 225
## 9 324 348 183 492
## 9.5 375 549 225 741
## 10 237 411 156 543
## 10.5 243 453 150 462
## 11 114 156 69 213
## 11.5 75 129 39 156
## 12 51 78 24 87
## 13 12 33 3 39
## 14 21 30 15 60
## 15 27 48 12 24
# Keep only the rows recorded for female customers
female_data <- filter(my_data, Gender == "Female")
# Cross-tabulate US shoe size (rows) against country (columns)
frequency_table_female <- table(
  female_data[["Size(US)"]],
  female_data[["Country"]]
)
# Show the table for females under a heading
cat("Frequency Table for Female Shoe Sizes in All Countries:\n")
## Frequency Table for Female Shoe Sizes in All Countries:
print(frequency_table_female)
##
## Canada Germany United Kingdom United States
## 4.5 6 9 15 21
## 5 6 12 9 9
## 5.5 6 9 6 42
## 6 21 15 12 33
## 6.5 51 84 24 93
## 7 93 156 27 147
## 7.5 153 222 87 318
## 8 192 324 168 618
## 8.5 171 339 129 399
## 9 213 264 93 384
## 9.5 84 126 57 189
## 10 48 87 21 75
## 10.5 36 57 18 87
## 11 18 9 3 15
## 11.5 12 15 3 30
## 12 21 12 6 39
#' Confidence intervals for the mean count of each shoe size
#'
#' Transposes a size-by-period frequency table so that every shoe size is a
#' column, then computes for each size the mean of its counts, a standard
#' error (`sd / sqrt(sample_size)`), the t-based margin of error and the
#' two-sided confidence interval around the mean.
#'
#' @param frequency_table Two-way `table` (or matrix) whose row names are the
#'   shoe sizes and whose columns are the periods being averaged over.
#' @param sample_size Sample size used for the standard error and for the
#'   degrees of freedom (`sample_size - 1`). Falls back to 36 when it is
#'   missing, cannot be evaluated, or is not a single number greater than 1.
#' @param alpha Significance level; the interval has confidence `1 - alpha`.
#'   Falls back to 0.05 when it is missing, cannot be evaluated, or is not a
#'   single number strictly between 0 and 1.
#' @return A data frame with one row per shoe size and the columns `Size`,
#'   `Mean`, `Standard.error`, `Margin.of.Error`, `X95..CI.Lower`,
#'   `X95..CI.Upper` and `Rounded.CI.Upper`. The column names are fixed and
#'   keep the "95" label even when a different `alpha` is supplied.
calculate_ci <- function(frequency_table, sample_size, alpha) {
  # Evaluate each argument defensively: callers may pass a name that is not
  # numeric or not defined, and those calls must keep working with the
  # historical defaults instead of failing.
  sample_size <- tryCatch(sample_size, error = function(e) NULL)
  if (!is.numeric(sample_size) || length(sample_size) != 1L ||
        is.na(sample_size) || sample_size <= 1) {
    sample_size <- 36
  }
  alpha <- tryCatch(alpha, error = function(e) NULL)
  if (!is.numeric(alpha) || length(alpha) != 1L ||
        is.na(alpha) || alpha <= 0 || alpha >= 1) {
    alpha <- 0.05
  }

  # Transpose so that each shoe size becomes a column
  size_by_period <- t(frequency_table)
  # Use every column; skipping the first one dropped the smallest shoe size
  size_labels <- colnames(size_by_period)
  if (is.null(size_labels)) {
    size_labels <- character(0)
  }

  # Mean and standard deviation of the counts of each shoe size
  mean_value <- vapply(
    size_labels,
    function(size_col) mean(size_by_period[, size_col]),
    numeric(1),
    USE.NAMES = FALSE
  )
  sd_value <- vapply(
    size_labels,
    function(size_col) sd(size_by_period[, size_col]),
    numeric(1),
    USE.NAMES = FALSE
  )

  # Standard error, t quantile and margin of error
  se_value <- sd_value / sqrt(sample_size)
  confidence_level <- 1 - alpha
  t_statistic <- qt((1 + confidence_level) / 2, df = sample_size - 1)
  margin_of_error <- t_statistic * se_value

  # Confidence limits around the mean
  ci_lower <- mean_value - margin_of_error
  ci_upper <- mean_value + margin_of_error

  # Build the result in one step instead of growing it with rbind() in a loop
  data.frame(
    Size = as.numeric(size_labels),
    Mean = mean_value,
    Standard.error = se_value,
    Margin.of.Error = margin_of_error,
    X95..CI.Lower = ci_lower,
    X95..CI.Upper = ci_upper,
    Rounded.CI.Upper = round(ci_upper)
  )
}
# Calculate for United States Males
us_male_data <- my_data %>%
  filter(Country == "United States", Gender == "Male")
# Count of each US shoe size (rows) per month (columns)
frequency_table_us_male <- table(us_male_data$`Size(US)`, us_male_data$Month)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_us_male <- calculate_ci(
  frequency_table_us_male,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for United States Males
cat("Confidence Intervals for United States Males:\n")
## Confidence Intervals for United States Males:
print(results_df_us_male)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 6.5 3.75 0.5130863 1.0416206 2.708379 4.791621
## 2 7.0 3.25 0.4330127 0.8790625 2.370937 4.129063
## 3 7.5 5.50 0.5435573 1.1034800 4.396520 6.603480
## 4 8.0 11.75 0.8168831 1.6583608 10.091639 13.408361
## 5 8.5 18.75 0.7423856 1.5071229 17.242877 20.257123
## 6 9.0 41.00 1.7100416 3.4715691 37.528431 44.471569
## 7 9.5 61.75 1.7366009 3.5254873 58.224513 65.275487
## 8 10.0 45.25 1.8924691 3.8419166 41.408083 49.091917
## 9 10.5 38.50 1.4503570 2.9443812 35.555619 41.444381
## 10 11.0 17.75 0.6931301 1.4071288 16.342871 19.157129
## 11 11.5 13.00 0.7719842 1.5672112 11.432789 14.567211
## 12 12.0 7.25 0.7355511 1.4932480 5.756752 8.743248
## 13 13.0 3.25 0.4151488 0.8427968 2.407203 4.092797
## 14 14.0 5.00 0.3256695 0.6611442 4.338856 5.661144
## 15 15.0 2.00 0.2562354 0.5201855 1.479815 2.520185
## Rounded.CI.Upper
## 1 5
## 2 4
## 3 7
## 4 13
## 5 20
## 6 44
## 7 65
## 8 49
## 9 41
## 10 19
## 11 15
## 12 9
## 13 4
## 14 6
## 15 3
# Calculate for Germany Males
germany_male_data <- my_data %>%
  filter(Country == "Germany", Gender == "Male")
# Count of each US shoe size (rows) per month (columns)
frequency_table_germany_male <- table(
  germany_male_data$`Size(US)`,
  germany_male_data$Month
)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_germany_male <- calculate_ci(
  frequency_table_germany_male,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for Germany Males
cat("Confidence Intervals for Germany Males:\n")
## Confidence Intervals for Germany Males:
print(results_df_germany_male)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 6.5 1.50 0.2409996 0.4892552 1.010745 1.989255
## 2 7.0 2.50 0.3793935 0.7702097 1.729790 3.270210
## 3 7.5 4.00 0.4605662 0.9349991 3.065001 4.934999
## 4 8.0 9.75 0.4930066 1.0008567 8.749143 10.750857
## 5 8.5 14.50 0.8483496 1.7222412 12.777759 16.222241
## 6 9.0 29.00 2.2940415 4.6571518 24.342848 33.657152
## 7 9.5 45.75 2.1708875 4.4071359 41.342864 50.157136
## 8 10.0 34.25 1.1216983 2.2771686 31.972831 36.527169
## 9 10.5 37.75 1.7839421 3.6215950 34.128405 41.371595
## 10 11.0 13.00 0.9718253 1.9729103 11.027090 14.972910
## 11 11.5 10.75 0.8992842 1.8256440 8.924356 12.575644
## 12 12.0 6.50 0.6009252 1.2199430 5.280057 7.719943
## 13 13.0 2.75 0.2370377 0.4812121 2.268788 3.231212
## 14 14.0 2.50 0.4174236 0.8474149 1.652585 3.347415
## 15 15.0 4.00 0.5075192 1.0303188 2.969681 5.030319
## Rounded.CI.Upper
## 1 2
## 2 3
## 3 5
## 4 11
## 5 16
## 6 34
## 7 50
## 8 37
## 9 41
## 10 15
## 11 13
## 12 8
## 13 3
## 14 3
## 15 5
# Calculate for Canada Males
canada_male_data <- my_data %>%
  filter(Country == "Canada", Gender == "Male")
# Count of each US shoe size (rows) per month (columns)
frequency_table_canada_male <- table(
  canada_male_data$`Size(US)`,
  canada_male_data$Month
)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_canada_male <- calculate_ci(
  frequency_table_canada_male,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for Canada Males
cat("Confidence Intervals for Canada Males:\n")
## Confidence Intervals for Canada Males:
print(results_df_canada_male)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 6.5 1.25 0.2853759 0.5793438 0.6706562 1.829344
## 2 7.0 2.00 0.3256695 0.6611442 1.3388558 2.661144
## 3 7.5 3.75 0.4445234 0.9024304 2.8475696 4.652430
## 4 8.0 4.25 0.6518234 1.3232718 2.9267282 5.573272
## 5 8.5 16.00 1.4124268 2.8673789 13.1326211 18.867379
## 6 9.0 27.00 1.5472360 3.1410562 23.8589438 30.141056
## 7 9.5 31.25 1.7668739 3.5869447 27.6630553 34.836945
## 8 10.0 19.75 0.5735104 1.1642879 18.5857121 20.914288
## 9 10.5 20.25 1.3373519 2.7149687 17.5350313 22.964969
## 10 11.0 9.50 0.6134025 1.2452733 8.2547267 10.745273
## 11 11.5 6.25 0.6709145 1.3620288 4.8879712 7.612029
## 12 12.0 4.25 0.5690902 1.1553145 3.0946855 5.405314
## 13 13.0 1.00 0.2752409 0.5587688 0.4412312 1.558769
## 14 14.0 1.75 0.2474619 0.5023743 1.2476257 2.252374
## 15 15.0 2.25 0.3490608 0.7086311 1.5413689 2.958631
## Rounded.CI.Upper
## 1 2
## 2 3
## 3 5
## 4 6
## 5 19
## 6 30
## 7 35
## 8 21
## 9 23
## 10 11
## 11 8
## 12 5
## 13 2
## 14 2
## 15 3
# Calculate for United Kingdom Males
uk_male_data <- my_data %>%
  filter(Country == "United Kingdom", Gender == "Male")
# Count of each US shoe size (rows) per month (columns)
frequency_table_uk_male <- table(uk_male_data$`Size(US)`, uk_male_data$Month)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_uk_male <- calculate_ci(
  frequency_table_uk_male,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for United Kingdom Males
cat("Confidence Intervals for United Kingdom Males:\n")
## Confidence Intervals for United Kingdom Males:
print(results_df_uk_male)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 6.5 1.00 0.2659080 0.5398220 0.46017804 1.5398220
## 2 7.0 1.75 0.2763854 0.5610922 1.18890781 2.3110922
## 3 7.5 1.00 0.2357023 0.4785010 0.52149897 1.4785010
## 4 8.0 3.75 0.4878576 0.9904036 2.75959639 4.7404036
## 5 8.5 7.25 0.4930066 1.0008567 6.24914329 8.2508567
## 6 9.0 15.25 1.1636867 2.3624095 12.88759046 17.6124095
## 7 9.5 18.75 1.0374916 2.1062200 16.64378001 20.8562200
## 8 10.0 13.00 0.9534626 1.9356320 11.06436804 14.9356320
## 9 10.5 12.50 0.7571211 1.5370375 10.96296250 14.0370375
## 10 11.0 5.75 0.4878576 0.9904036 4.75959639 6.7404036
## 11 11.5 3.25 0.4090208 0.8303563 2.41964372 4.0803563
## 12 12.0 2.00 0.4438127 0.9009876 1.09901236 2.9009876
## 13 13.0 0.25 0.1443376 0.2930208 -0.04302084 0.5430208
## 14 14.0 1.25 0.2853759 0.5793438 0.67065616 1.8293438
## 15 15.0 1.00 0.2010076 0.4080670 0.59193295 1.4080670
## Rounded.CI.Upper
## 1 2
## 2 2
## 3 1
## 4 5
## 5 8
## 6 18
## 7 21
## 8 15
## 9 14
## 10 7
## 11 4
## 12 3
## 13 1
## 14 2
## 15 1
# Calculate for United States Females
us_female_data <- my_data %>%
  filter(Country == "United States", Gender == "Female")
# Count of each US shoe size (rows) per month (columns)
frequency_table_us_female <- table(
  us_female_data$`Size(US)`,
  us_female_data$Month
)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_us_female <- calculate_ci(
  frequency_table_us_female,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for United States Females
cat("Confidence Intervals for United States Females:\n")
## Confidence Intervals for United States Females:
print(results_df_us_female)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 5.0 0.75 0.2025718 0.4112427 0.3387573 1.161243
## 2 5.5 3.50 0.5663100 1.1496705 2.3503295 4.649671
## 3 6.0 2.75 0.6035459 1.2252634 1.5247366 3.975263
## 4 6.5 7.75 1.2311713 2.4994107 5.2505893 10.249411
## 5 7.0 12.25 0.7111131 1.4436363 10.8063637 13.693636
## 6 7.5 26.50 1.4538351 2.9514421 23.5485579 29.451442
## 7 8.0 51.50 1.7588162 3.5705867 47.9294133 55.070587
## 8 8.5 33.25 2.6667850 5.4138614 27.8361386 38.663861
## 9 9.0 32.00 1.7026420 3.4565471 28.5434529 35.456547
## 10 9.5 15.75 1.0078605 2.0460656 13.7039344 17.796066
## 11 10.0 6.25 0.6633440 1.3466599 4.9033401 7.596660
## 12 10.5 7.25 0.7457794 1.5140126 5.7359874 8.764013
## 13 11.0 1.25 0.2146762 0.4358158 0.8141842 1.685816
## 14 11.5 2.50 0.3445096 0.6993917 1.8006083 3.199392
## 15 12.0 3.25 0.6281486 1.2752095 1.9747905 4.525210
## Rounded.CI.Upper
## 1 1
## 2 5
## 3 4
## 4 10
## 5 14
## 6 29
## 7 55
## 8 39
## 9 35
## 10 18
## 11 8
## 12 9
## 13 2
## 14 3
## 15 5
# Calculate for Germany Females
germany_female_data <- my_data %>%
  filter(Country == "Germany", Gender == "Female")
# Count of each US shoe size (rows) per month (columns)
frequency_table_germany_female <- table(
  germany_female_data$`Size(US)`,
  germany_female_data$Month
)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_germany_female <- calculate_ci(
  frequency_table_germany_female,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for Germany Females
cat("Confidence Intervals for Germany Females:\n")
## Confidence Intervals for Germany Females:
print(results_df_germany_female)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 5.0 1.00 0.2461830 0.4997780 0.5002220 1.499778
## 2 5.5 0.75 0.1896967 0.3851048 0.3648952 1.135105
## 3 6.0 1.25 0.3266373 0.6631090 0.5868910 1.913109
## 4 6.5 7.00 0.6890192 1.3987834 5.6012166 8.398783
## 5 7.0 13.00 0.7849596 1.5935528 11.4064472 14.593553
## 6 7.5 18.50 1.6567889 3.3634603 15.1365397 21.863460
## 7 8.0 27.00 1.6514456 3.3526129 23.6473871 30.352613
## 8 8.5 28.25 0.8261050 1.6770823 26.5729177 29.927082
## 9 9.0 22.00 1.1303883 2.2948103 19.7051897 24.294810
## 10 9.5 10.50 0.8453677 1.7161876 8.7838124 12.216188
## 11 10.0 7.25 0.7789043 1.5812598 5.6687402 8.831260
## 12 10.5 4.75 0.4878576 0.9904036 3.7595964 5.740404
## 13 11.0 0.75 0.1608845 0.3266130 0.4233870 1.076613
## 14 11.5 1.25 0.2763854 0.5610922 0.6889078 1.811092
## 15 12.0 1.00 0.2357023 0.4785010 0.5214990 1.478501
## Rounded.CI.Upper
## 1 1
## 2 1
## 3 2
## 4 8
## 5 15
## 6 22
## 7 30
## 8 30
## 9 24
## 10 12
## 11 9
## 12 6
## 13 1
## 14 2
## 15 1
# Calculate for Canada Females
canada_female_data <- my_data %>%
  filter(Country == "Canada", Gender == "Female")
# Count of each US shoe size (rows) per month (columns)
frequency_table_canada_female <- table(
  canada_female_data$`Size(US)`,
  canada_female_data$Month
)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_canada_female <- calculate_ci(
  frequency_table_canada_female,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for Canada Females
cat("Confidence Intervals for Canada Females:\n")
## Confidence Intervals for Canada Females:
print(results_df_canada_female)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 5.0 0.50 0.2071939 0.4206259 0.07937412 0.9206259
## 2 5.5 0.50 0.1329540 0.2699110 0.23008902 0.7699110
## 3 6.0 1.75 0.3188125 0.6472238 1.10277620 2.3972238
## 4 6.5 4.25 0.6440285 1.3074474 2.94255261 5.5574474
## 5 7.0 7.75 1.1549739 2.3447216 5.40527838 10.0947216
## 6 7.5 12.75 0.7949493 1.6138329 11.13616705 14.3638329
## 7 8.0 16.00 0.5458753 1.1081857 14.89181431 17.1081857
## 8 8.5 14.25 1.0495911 2.1307832 12.11921685 16.3807832
## 9 9.0 17.75 0.8964718 1.8199344 15.93006557 19.5699344
## 10 9.5 7.00 0.8498366 1.7252600 5.27474001 8.7252600
## 11 10.0 4.00 0.5504819 1.1175376 2.88246237 5.1175376
## 12 10.5 3.00 0.5075192 1.0303188 1.96968121 4.0303188
## 13 11.0 1.50 0.3370999 0.6843492 0.81565076 2.1843492
## 14 11.5 1.00 0.3956838 0.8032808 0.19671921 1.8032808
## 15 12.0 1.75 0.4330127 0.8790625 0.87093748 2.6290625
## Rounded.CI.Upper
## 1 1
## 2 1
## 3 2
## 4 6
## 5 10
## 6 14
## 7 17
## 8 16
## 9 20
## 10 9
## 11 5
## 12 4
## 13 2
## 14 2
## 15 3
# Calculate for United Kingdom Females
uk_female_data <- my_data %>%
  filter(Country == "United Kingdom", Gender == "Female")
# Count of each US shoe size (rows) per month (columns)
frequency_table_uk_female <- table(
  uk_female_data$`Size(US)`,
  uk_female_data$Month
)
# Pass explicit numbers: the bare names `n` and `alpha` are not assigned
# anywhere in the visible script. 36 and 0.05 are the values calculate_ci
# applies internally, so the results are unchanged.
results_df_uk_female <- calculate_ci(
  frequency_table_uk_female,
  sample_size = 36,
  alpha = 0.05
)
# Print the results for United Kingdom Females
cat("Confidence Intervals for United Kingdom Females:\n")
## Confidence Intervals for United Kingdom Females:
print(results_df_uk_female)
## Size Mean Standard.error Margin.of.Error X95..CI.Lower X95..CI.Upper
## 1 5.0 0.75 0.2940917 0.5970378 0.15296221 1.3470378
## 2 5.5 0.50 0.1666667 0.3383513 0.16164868 0.8383513
## 3 6.0 1.00 0.2930164 0.5948548 0.40514517 1.5948548
## 4 6.5 2.00 0.5458753 1.1081857 0.89181431 3.1081857
## 5 7.0 2.25 0.5371314 1.0904346 1.15956536 3.3404346
## 6 7.5 7.25 0.6633440 1.3466599 5.90334010 8.5966599
## 7 8.0 14.00 0.6741999 1.3686985 12.63130151 15.3686985
## 8 8.5 10.75 0.7111131 1.4436363 9.30636369 12.1936363
## 9 9.0 7.75 1.1081083 2.2495795 5.50042052 9.9995795
## 10 9.5 4.75 0.6556861 1.3311135 3.41888648 6.0811135
## 11 10.0 1.75 0.3417498 0.6937890 1.05621101 2.4437890
## 12 10.5 1.50 0.3658393 0.7426932 0.75730676 2.2426932
## 13 11.0 0.25 0.1035969 0.2103129 0.03968706 0.4603129
## 14 11.5 0.25 0.1035969 0.2103129 0.03968706 0.4603129
## 15 12.0 0.50 0.1946247 0.3951092 0.10489078 0.8951092
## Rounded.CI.Upper
## 1 1
## 2 1
## 3 2
## 4 3
## 5 3
## 6 9
## 7 15
## 8 12
## 9 10
## 10 6
## 11 2
## 12 2
## 13 0
## 14 0
## 15 1
# Correlation of every numeric column of my_data with SalePrice
if (!("SalePrice" %in% colnames(my_data))) {
  cat("SalePrice is not present in my_data column names.\n")
} else {
  # vapply() always yields one logical per column, whereas the result type of
  # sapply() depends on its input
  is_numeric_column <- vapply(my_data, is.numeric, logical(1))
  numerical_data <- my_data[, is_numeric_column]
  if (!("SalePrice" %in% colnames(numerical_data))) {
    # SalePrice exists but is not stored as a numeric column
    cat("SalePrice is not present in numerical_data column names.\n")
  } else {
    # Pairwise correlations between all numeric columns
    correlation_matrix <- cor(numerical_data)
    # Keep only the row of correlations with SalePrice
    correlation_with_saleprice <- correlation_matrix["SalePrice", ]
    print(correlation_with_saleprice)
  }
}
## ProductID Size(US) Size (UK) UnitPrice Discount
## 0.0120457226 -0.0015988698 -0.0031612731 0.6056385510 -0.8144905797
## Year Month SalePrice
## -0.0161758131 -0.0009890055 1.0000000000
# Draw the full correlation matrix as a colour-coded grid
corrplot(correlation_matrix, method = "color")
ProductID and SalePrice (0.0120):
There is a very weak positive correlation between ProductID and SalePrice. However, the correlation is close to zero, suggesting that there is almost no linear relationship between these two variables.
Size(US) and SalePrice (-0.0016):
There is a very weak negative correlation between the size in US measurements and SalePrice. This implies that as the size increases or decreases, there is almost no linear impact on SalePrice.
Size (UK) and SalePrice (-0.0032):
Similar to Size(US), there is a very weak negative correlation between the size in UK measurements and SalePrice. Changes in size have almost no linear impact on SalePrice.
UnitPrice and SalePrice (0.6056):
There is a moderate positive correlation between UnitPrice and SalePrice. This suggests that as the unit price increases, the SalePrice tends to increase as well. The correlation is not extremely strong but indicates a noticeable trend.
Discount and SalePrice (-0.8145):
There is a strong negative correlation between Discount and SalePrice. This indicates that as the discount increases, the SalePrice tends to decrease, and vice versa. The strong negative correlation suggests a clear inverse relationship.
Year and SalePrice (-0.0162):
There is a very weak negative correlation between the year and SalePrice. This implies that there is almost no linear relationship between the year and SalePrice.
Month and SalePrice (-0.0010):
There is a very weak negative correlation between the month and SalePrice. Changes in the month have almost no linear impact on SalePrice.
# Box-and-whisker plot showing the distribution of SalePrice
boxplot(
  my_data[["SalePrice"]],
  main = "Boxplot of SalePrice"
)
# 7) Total Sales by Discount
# Columns the analysis needs; checked with one scalar condition instead of
# chaining elementwise `&` inside if ()
required_columns <- c("Discount", "UnitPrice", "SalePrice")
if (all(required_columns %in% colnames(my_data))) {
  # For every discount level, compute the mean and the sum of both UnitPrice
  # and SalePrice (the sub-columns are named MeanUnitPrice and TotalSales)
  discount_analysis <- aggregate(
    cbind(UnitPrice, SalePrice) ~ Discount,
    data = my_data,
    FUN = function(x) c(MeanUnitPrice = mean(x), TotalSales = sum(x))
  )
  # Bar chart of the summed SalePrice for each discount level
  barplot(
    discount_analysis$SalePrice[, "TotalSales"],
    names.arg = discount_analysis$Discount,
    main = "Total Sales by Discount",
    xlab = "Discount",
    ylab = "Total Sales",
    col = "lightgreen"
  )
} else {
  cat("Required columns are not present in my_data.\n")
}
# Scatter plot of the rounded upper confidence limit per shoe size (US males)
ggplot(
  data = results_df_us_male,
  mapping = aes(x = Size, y = Rounded.CI.Upper)
) +
  # Larger points for visibility
  geom_point(size = 3) +
  # Label each point with its size and rounded limit; overlapping labels
  # are skipped
  geom_text(
    mapping = aes(label = paste("Size:", Size, "\nCI:", Rounded.CI.Upper)),
    vjust = -1,
    hjust = 1,
    check_overlap = TRUE
  ) +
  labs(
    title = "Rounded Upper Confidence Intervals for Different Shoe Sizes (US Males)",
    x = "Shoe Size (US)",
    y = "Rounded Upper Confidence Interval"
  ) +
  # Minimal theme for a cleaner look
  theme_minimal()
# Scatter plot of the rounded upper confidence limit per shoe size (UK females)
ggplot(
  data = results_df_uk_female,
  mapping = aes(x = Size, y = Rounded.CI.Upper)
) +
  # Larger points for visibility
  geom_point(size = 3) +
  # Label each point with its size and rounded limit; overlapping labels
  # are skipped
  geom_text(
    mapping = aes(label = paste("Size:", Size, "\nCI:", Rounded.CI.Upper)),
    vjust = -1,
    hjust = 1,
    check_overlap = TRUE
  ) +
  labs(
    title = "Rounded Upper Confidence Intervals for Different Shoe Sizes (UK Female)",
    x = "Shoe Size (US)",
    y = "Rounded Upper Confidence Interval"
  ) +
  # Minimal theme for a cleaner look
  theme_minimal()