R_stat_week

I am importing the libraries needed to run these notes.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)

# Read Bundy_Shoe_Shop.csv as comma-delimited text; show_col_types = FALSE
# silences the column-specification message.
# NOTE(review): the path is absolute and machine-specific, and the file has
# title rows above the real header (see the head() output) - the later
# rename and row-drop steps compensate for that.
my_data_3 <- read_delim(
  file = "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",
  delim = ",",
  show_col_types = FALSE
)

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Preview the first six rows of the data exactly as imported
head(my_data_3, n = 6L)

## # A tibble: 6 × 14
##   Inferential statistics…¹ ...2  ...3   ...4 ...5  ...6   ...7 ...8   ...9 ...10
##   <chr>                    <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 2 <NA>                     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 3 InvoiceNo                Date  Coun…    NA Shop  Gend…  NA   Size…  NA   "Uni…
## 4 52389                    1/1/… Unit…  2152 UK2   Male   11   44     10.5 "$15…
## 5 52390                    1/1/… Unit…  2230 US15  Male   11.5 44-45  11   "$19…
## 6 52391                    1/1/… Cana…  2160 CAN7  Male    9.5 42-43   9   "$14…
## # ℹ abbreviated name: ¹`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>

Cleaning my dataset

I am renaming my column titles from the auto-generated names {...2, ...3, etc.} to {InvoiceNo, Date, etc.} to make the data simpler and clearer.

# Descriptive names for the 14 imported columns, in file order
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)",
  "UnitPrice", "Discount", "Year", "Month", "SalePrice"
)

# Replace the auto-generated column names with the descriptive ones
names(my_data_3) <- new_names

# Show the column names to confirm the rename took effect
names(my_data_3)

##  [1] "InvoiceNo"     "Date"          "Country"       "ProductID"    
##  [5] "Shop"          "Gender"        "Size(US)"      "Size (Europe)"
##  [9] "Size (UK)"     "UnitPrice"     "Discount"      "Year"         
## [13] "Month"         "SalePrice"

I am removing the first 3 rows to remove the null values and unnecessary title rows from my data set.

# Drop rows 1 to 3 (title text, an empty row, and the embedded header row)
my_data_3 <- my_data_3[-seq_len(3), ]


# Display the data frame after the rows were removed
print(my_data_3)

## # A tibble: 14,967 × 14
##    InvoiceNo Date     Country  ProductID Shop  Gender `Size(US)` `Size (Europe)`
##    <chr>     <chr>    <chr>        <dbl> <chr> <chr>       <dbl> <chr>          
##  1 52389     1/1/2014 United …      2152 UK2   Male         11   44             
##  2 52390     1/1/2014 United …      2230 US15  Male         11.5 44-45          
##  3 52391     1/1/2014 Canada        2160 CAN7  Male          9.5 42-43          
##  4 52392     1/1/2014 United …      2234 US6   Female        9.5 40             
##  5 52393     1/1/2014 United …      2222 UK4   Female        9   39-40          
##  6 52394     1/1/2014 United …      2173 US15  Male         10.5 43-44          
##  7 52395     1/2/2014 Germany       2200 GER2  Female        9   39-40          
##  8 52396     1/2/2014 Canada        2238 CAN5  Male         10   43             
##  9 52397     1/2/2014 United …      2191 US13  Male         10.5 43-44          
## 10 52398     1/2/2014 United …      2237 UK1   Female        9   39-40          
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## #   Year <dbl>, Month <dbl>, SalePrice <chr>

# Strip the dollar sign from SalePrice and convert the text to numbers
my_data_3[["SalePrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["SalePrice"]])
)

# Check the class of the converted column
class(my_data_3[["SalePrice"]])

## [1] "numeric"

# Remove the "$" from UnitPrice and the "%" from Discount, converting both
# columns from text to numeric in one step
my_data_3 <- mutate(
  my_data_3,
  UnitPrice = as.numeric(gsub("\\$", "", UnitPrice)),
  Discount = as.numeric(gsub("%", "", Discount))
)

# Preview the first few converted Discount values
head(my_data_3[["Discount"]])

## [1]  0 20 20  0  0  0

# Check the class of the converted UnitPrice column
class(my_data_3[["UnitPrice"]])

## [1] "numeric"

# Check the class of the converted Discount column
class(my_data_3[["Discount"]])

## [1] "numeric"

##Set 1: UnitPrice, Discount, and SalePrice

# Response variable: SalePrice
# Explanatory variables: UnitPrice and Discount.

# SalePrice plotted against UnitPrice
ggplot(my_data_3, aes(x = UnitPrice, y = SalePrice)) +
  geom_point() +
  labs(title = "SalePrice vs UnitPrice", x = "UnitPrice", y = "SalePrice")

# SalePrice plotted against Discount
ggplot(my_data_3, aes(x = Discount, y = SalePrice)) +
  geom_point() +
  labs(title = "SalePrice vs Discount", x = "Discount", y = "SalePrice")

Set-1 Correlation:

# Correlation of SalePrice with each explanatory variable
# (cor() defaults to the Pearson method)
correlation_unitprice <- with(my_data_3, cor(SalePrice, UnitPrice))
correlation_discount <- with(my_data_3, cor(SalePrice, Discount))

# SalePrice vs UnitPrice
correlation_unitprice

## [1] 0.6056386

# SalePrice vs Discount
correlation_discount

## [1] -0.8144906

Set-1 Confidence Interval

# One-sample t-test on SalePrice; keep only the confidence interval for the
# mean (t.test() uses a 0.95 confidence level by default)
confidence_interval_saleprice1 <- with(my_data_3, t.test(SalePrice))$conf.int

confidence_interval_saleprice1

## [1] 143.4242 144.5516
## attr(,"conf.level")
## [1] 0.95

Set1 Conclusion:

In the scatterplot for SalePrice vs UnitPrice, there appears to be a positive linear relationship between UnitPrice and SalePrice. As UnitPrice increases, SalePrice tends to increase.
In the scatterplot for SalePrice vs Discount, there seems to be a negative linear relationship between Discount and SalePrice. As Discount increases, SalePrice tends to decrease.
We are 95% confident that the true mean SalePrice lies between 143.42 and 144.55 dollars.

Set-2 Size(US) and ProductID vs UnitPrice

# Response variable: UnitPrice
# Explanatory variables: Size(US) and ProductID.

# Base-graphics scatter plot of UnitPrice against Size(US)
with(
  my_data_3,
  plot(
    `Size(US)`, UnitPrice,
    main = "Size(US) vs UnitPrice",
    xlab = "Size(US)", ylab = "UnitPrice", col = "green"
  )
)

# Base-graphics scatter plot of UnitPrice against ProductID
with(
  my_data_3,
  plot(
    ProductID, UnitPrice,
    main = "ProductID vs UnitPrice",
    xlab = "ProductID", ylab = "UnitPrice", col = "blue"
  )
)

Set-2 Correlation

# Pearson correlation of each explanatory variable with UnitPrice.
# The first result is named after Size(US), the column it is actually
# computed from (it was previously labelled "uk", which was misleading).
correlation_size_us_unitprice <- cor(
  my_data_3$`Size(US)`, my_data_3$UnitPrice,
  method = "pearson"
)
correlation_product_id_unitprice <- cor(
  my_data_3$ProductID, my_data_3$UnitPrice,
  method = "pearson"
)

# Print correlation coefficients
cat("Correlation between Size(US) and UnitPrice:", correlation_size_us_unitprice, "\n")

## Correlation between Size(US) and UnitPrice: -0.02079281

cat("Correlation between ProductID and UnitPrice:", correlation_product_id_unitprice, "\n")

## Correlation between ProductID and UnitPrice: 0.02962319

Set-2 Confidence Interval

# One-sample t-test on UnitPrice at the 95% level; keep only the confidence
# interval for the mean
confidence_interval_unitprice <- t.test(
  my_data_3$UnitPrice,
  conf.level = 0.95
)[["conf.int"]]

# Report the lower and upper bounds of the interval
cat(
  "95% Confidence Interval for UnitPrice:",
  confidence_interval_unitprice[1], "to", confidence_interval_unitprice[2],
  "\n"
)

## 95% Confidence Interval for UnitPrice: 163.8038 to 164.5389

Set-2 Conclusion:

Size(US) and UnitPrice: Changes in the Size(US) variable do not have a significant linear impact on the UnitPrice.
ProductID and UnitPrice: Different ProductIDs do not have a significant linear influence on the UnitPrice.
Based on the 95% confidence interval for UnitPrice in Set 2 (with Size(US) and ProductID as explanatory variables), we are 95% confident that the true population mean unit price of products falls between 163.80 and 164.54 dollars.

Set-3 Month and Year vs SalePrice

# Response variable: SalePrice
# Explanatory variables: Month and Year.

# SalePrice plotted against Year
ggplot(my_data_3, aes(x = Year, y = SalePrice)) +
  geom_point() +
  labs(title = "SalePrice vs Year", x = "Year", y = "SalePrice")

# SalePrice plotted against Month
ggplot(my_data_3, aes(x = Month, y = SalePrice)) +
  geom_point() +
  labs(title = "SalePrice vs Month", x = "Month", y = "SalePrice")

Set-3 Correlation Coefficient

# Correlation of SalePrice with each time variable
# (cor() defaults to the Pearson method)
correlation_year <- cor(x = my_data_3[["SalePrice"]], y = my_data_3[["Year"]])
correlation_month <- cor(x = my_data_3[["SalePrice"]], y = my_data_3[["Month"]])

# SalePrice vs Year
correlation_year

## [1] -0.01617581

# SalePrice vs Month
correlation_month

## [1] -0.0009890055

Confidence Interval

# One-sample t-test on SalePrice; extract the confidence interval for the
# mean (t.test() uses a 0.95 confidence level by default)
confidence_interval_saleprice3 <- t.test(my_data_3[["SalePrice"]])[["conf.int"]]

confidence_interval_saleprice3

## [1] 143.4242 144.5516
## attr(,"conf.level")
## [1] 0.95

Set-3 Conclusion

1) SalePrice vs Year: There is a very weak negative relationship between SalePrice and Year (r ≈ -0.016), meaning that SalePrice may slightly decrease as the Year goes up, but the effect is minimal. 2) SalePrice vs Month: There is almost no relationship between SalePrice and Month (r ≈ -0.001); the month of sale doesn’t significantly affect the SalePrice.
We are 95% confident that the true mean SalePrice lies between 143.42 and 144.55 dollars.

R_stat_week_6

Surya

2023-09-03