I am importing the libraries needed to run these notes.
library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(slider)
## Warning: package 'slider' was built under R version 4.3.2
# Load the raw shoe-shop export as comma-delimited text. The file does not
# start with a proper header row, so readr auto-names the columns
# (`...2`, `...3`, ...) and reports parsing issues; both are cleaned up below.
my_data_3 <- read_delim(
  file = "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",
  delim = ",",
  show_col_types = FALSE
)
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Preview the first six rows of the raw import (nothing has been cleaned yet)
head(my_data_3, n = 6L)
## # A tibble: 6 × 14
## Inferential statistics…¹ ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 2 <NA> <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 3 InvoiceNo Date Coun… NA Shop Gend… NA Size… NA "Uni…
## 4 52389 1/1/… Unit… 2152 UK2 Male 11 44 10.5 "$15…
## 5 52390 1/1/… Unit… 2230 US15 Male 11.5 44-45 11 "$19…
## 6 52391 1/1/… Cana… 2160 CAN7 Male 9.5 42-43 9 "$14…
## # ℹ abbreviated name: ¹​`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>
I am renaming the columns from the auto-generated placeholders (`...2`, `...3`, etc.) to descriptive names (InvoiceNo, Date, etc.) to make the data simpler and clearer.
# Descriptive headers for the 14 imported columns, listed in file order
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)", "UnitPrice", "Discount",
  "Year", "Month", "SalePrice"
)
# Replace the auto-generated column names positionally
names(my_data_3) <- new_names
# Confirm the new column names are in place
names(my_data_3)
## [1] "InvoiceNo" "Date" "Country" "ProductID"
## [5] "Shop" "Gender" "Size(US)" "Size (Europe)"
## [9] "Size (UK)" "UnitPrice" "Discount" "Year"
## [13] "Month" "SalePrice"
I am removing the first 3 rows, which contain only null values and the original title/header text, so that the data set starts at the first real record.
# Drop rows 1-3 (title line, blank line and the embedded header row)
my_data_3 <- my_data_3[-seq_len(3), ]
# Show the data frame now that it starts at the first invoice
print(my_data_3)
## # A tibble: 14,967 × 14
## InvoiceNo Date Country ProductID Shop Gender `Size(US)` `Size (Europe)`
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 52389 1/1/2014 United … 2152 UK2 Male 11 44
## 2 52390 1/1/2014 United … 2230 US15 Male 11.5 44-45
## 3 52391 1/1/2014 Canada 2160 CAN7 Male 9.5 42-43
## 4 52392 1/1/2014 United … 2234 US6 Female 9.5 40
## 5 52393 1/1/2014 United … 2222 UK4 Female 9 39-40
## 6 52394 1/1/2014 United … 2173 US15 Male 10.5 43-44
## 7 52395 1/2/2014 Germany 2200 GER2 Female 9 39-40
## 8 52396 1/2/2014 Canada 2238 CAN5 Male 10 43
## 9 52397 1/2/2014 United … 2191 US13 Male 10.5 43-44
## 10 52398 1/2/2014 United … 2237 UK1 Female 9 39-40
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## # Year <dbl>, Month <dbl>, SalePrice <chr>
# Strip the "$" sign from SalePrice and convert the remaining text to numeric
my_data_3[["SalePrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["SalePrice"]])
)
class(my_data_3[["SalePrice"]])
## [1] "numeric"
# Strip the "$" sign from UnitPrice and convert the remaining text to numeric
my_data_3[["UnitPrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["UnitPrice"]])
)
# Strip the "%" sign from Discount and convert the remaining text to numeric
my_data_3[["Discount"]] <- as.numeric(
  gsub("%", "", my_data_3[["Discount"]])
)
head(my_data_3[["Discount"]])
## [1] 0 20 20 0 0 0
# Check that both converted columns are now numeric
class(my_data_3[["UnitPrice"]])
## [1] "numeric"
class(my_data_3[["Discount"]])
## [1] "numeric"
I have selected the ‘Date’ column in my dataset as the time variable. It records when the customer bought the shoe.
My response variable is SalePrice (the price at which the customer bought the shoe).
# Line plot of sale price against purchase date.
# `Date` is stored as text such as "1/1/2014"; it is parsed to a real Date
# here so the x axis is a continuous, chronologically ordered time scale
# instead of one discrete category per date string.
# NOTE(review): the month/day/year order is assumed from the sample rows.
# as.Date() returns an existing Date unchanged, so this is also safe if the
# column is converted upstream.
ggplot(
  my_data_3,
  aes(x = as.Date(Date, format = "%m/%d/%Y"), y = SalePrice)
) +
  geom_line() +
  labs(
    title = "Sale Price Over Time",
    x = "Date",
    y = "Sale Price ($)"
  )
.
# Keep only the time column and the response for the time-series steps.
# Despite its name this is a plain tibble, not a tsibble, and `Date` is still
# the character column taken from my_data_3.
my_tsibble <- tibble(
  Date = my_data_3[["Date"]],
  SalePrice = my_data_3[["SalePrice"]]
)
print(my_tsibble)
## # A tibble: 14,967 × 2
## Date SalePrice
## <chr> <dbl>
## 1 1/1/2014 159
## 2 1/1/2014 159.
## 3 1/1/2014 119.
## 4 1/1/2014 159
## 5 1/1/2014 159
## 6 1/1/2014 159
## 7 1/2/2014 179
## 8 1/2/2014 169
## 9 1/2/2014 139
## 10 1/2/2014 149
## # ℹ 14,957 more rows
From the above plot of SalePrice ($) against Date, we can conclude that the sale price of the shoes changes frequently from date to date, but the variation seems to be consistent over time.
# Fit a linear trend of sale price on time.
# `Date` is text in my_tsibble; left as text, lm() would treat it as a
# categorical predictor with one level per distinct day rather than as a
# single time trend. It is converted to Date for the fit so the model has one
# slope coefficient (change in price per day).
# NOTE(review): the month/day/year order is assumed from the sample rows.
linear_model <- lm(
  SalePrice ~ Date,
  data = my_tsibble %>%
    mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
    as.data.frame()
)
The linear regression analysis suggests that there is no significant linear trend in SalePrice ($) over time. The coefficient for the Date variable is not statistically significant, indicating that there is no strong evidence of a relationship between the date and the sale prices. The model explains only a very small proportion of the variance in SalePrice ($) (low R-squared), suggesting that other factors not included in the model may influence sale prices.
# Centred moving average of SalePrice over an 11-row window (5 rows before,
# the current row, 5 rows after). The window counts rows (invoices), not days.
# slider's argument is `.complete`, not `complete`: without the leading dot
# the value is forwarded to mean() and silently ignored. With `.complete =
# TRUE` the rows that lack a full window (first and last five) are NA.
my_tsibble <- my_tsibble %>%
  mutate(
    Smoothed_SalePrice = slide_dbl(
      SalePrice,
      mean,
      .before = 5,
      .after = 5,
      .complete = TRUE
    )
  )

# Overlay the raw and smoothed series. `Date` is parsed from text so the x
# axis is a continuous, chronologically ordered time scale.
# NOTE(review): the month/day/year order is assumed from the sample rows.
ggplot(my_tsibble, aes(x = as.Date(Date, format = "%m/%d/%Y"))) +
  geom_line(aes(y = SalePrice, color = "Original")) +
  geom_line(
    aes(y = Smoothed_SalePrice, color = "Smoothed"),
    linetype = "dashed"
  ) +
  labs(
    title = "Original and Smoothed Time Series",
    x = "Date",
    y = "SalePrice",
    color = "Legend"
  ) +
  theme_minimal()
# The rows of my_tsibble are individual invoices (several per day), so they
# cannot be passed to ts() with frequency = 12 directly. Aggregate to one mean
# sale price per calendar month first, so that "12 observations per cycle"
# really means 12 months per year.
# NOTE(review): the month/day/year order is assumed from the sample rows, and
# ts() assumes no calendar month is missing from the data.
monthly_sale_price <- my_tsibble %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
  filter(!is.na(Date)) %>%
  mutate(Month = as.Date(format(Date, "%Y-%m-01"))) %>%
  group_by(Month) %>%
  summarise(SalePrice = mean(SalePrice, na.rm = TRUE)) %>%
  arrange(Month)

first_month <- monthly_sale_price$Month[1]
my_ts <- ts(
  monthly_sale_price$SalePrice,
  start = c(
    as.integer(format(first_month, "%Y")),
    as.integer(format(first_month, "%m"))
  ),
  frequency = 12
)

# Plot ACF and PACF stacked in one figure; remember the previous graphics
# settings so they can be restored afterwards
old_par <- par(mfrow = c(2, 1))
# ACF plot
acf(my_ts, main = "Autocorrelation Function (ACF)")
# PACF plot
pacf(my_ts, main = "Partial Autocorrelation Function (PACF)")
par(old_par)