I am importing the libraries needed to run these notes.

library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(slider)
## Warning: package 'slider' was built under R version 4.3.2
# Read the raw shoe-shop CSV export. The file's title rows sit above the real
# header row, so readr assigns placeholder column names at this stage.
my_data_3 <- read_delim(
  file = "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",
  delim = ",",
  show_col_types = FALSE
)
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Preview the first six rows of the raw import (nothing has been cleaned yet)
head(my_data_3, n = 6)
## # A tibble: 6 × 14
##   Inferential statistics…¹ ...2  ...3   ...4 ...5  ...6   ...7 ...8   ...9 ...10
##   <chr>                    <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 2 <NA>                     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 3 InvoiceNo                Date  Coun…    NA Shop  Gend…  NA   Size…  NA   "Uni…
## 4 52389                    1/1/… Unit…  2152 UK2   Male   11   44     10.5 "$15…
## 5 52390                    1/1/… Unit…  2230 US15  Male   11.5 44-45  11   "$19…
## 6 52391                    1/1/… Cana…  2160 CAN7  Male    9.5 42-43   9   "$14…
## # ℹ abbreviated name: ¹​`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>

Cleaning my dataset

I am renaming the columns from the auto-generated placeholder names (`...2`, `...3`, etc.) to descriptive names (`InvoiceNo`, `Date`, etc.) to make the data simpler and clearer.

# Descriptive column names, listed in the same left-to-right order as the
# columns of the imported data frame.
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)",
  "UnitPrice", "Discount", "Year", "Month", "SalePrice"
)

# Overwrite the placeholder names with the descriptive ones
names(my_data_3) <- new_names

# Confirm that the new names are in place
names(my_data_3)
##  [1] "InvoiceNo"     "Date"          "Country"       "ProductID"    
##  [5] "Shop"          "Gender"        "Size(US)"      "Size (Europe)"
##  [9] "Size (UK)"     "UnitPrice"     "Discount"      "Year"         
## [13] "Month"         "SalePrice"

I am removing the first 3 rows of my data set because they contain only null values and unnecessary title text rather than data.

# Rows 1-3 hold the sheet title, an empty line and the embedded header row;
# discard them so that only invoice records remain.
my_data_3 <- my_data_3[-seq_len(3L), ]

# Display the trimmed data frame
print(my_data_3)
## # A tibble: 14,967 × 14
##    InvoiceNo Date     Country  ProductID Shop  Gender `Size(US)` `Size (Europe)`
##    <chr>     <chr>    <chr>        <dbl> <chr> <chr>       <dbl> <chr>          
##  1 52389     1/1/2014 United …      2152 UK2   Male         11   44             
##  2 52390     1/1/2014 United …      2230 US15  Male         11.5 44-45          
##  3 52391     1/1/2014 Canada        2160 CAN7  Male          9.5 42-43          
##  4 52392     1/1/2014 United …      2234 US6   Female        9.5 40             
##  5 52393     1/1/2014 United …      2222 UK4   Female        9   39-40          
##  6 52394     1/1/2014 United …      2173 US15  Male         10.5 43-44          
##  7 52395     1/2/2014 Germany       2200 GER2  Female        9   39-40          
##  8 52396     1/2/2014 Canada        2238 CAN5  Male         10   43             
##  9 52397     1/2/2014 United …      2191 US13  Male         10.5 43-44          
## 10 52398     1/2/2014 United …      2237 UK1   Female        9   39-40          
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## #   Year <dbl>, Month <dbl>, SalePrice <chr>
# Strip the "$" sign from SalePrice and store the column as numbers
my_data_3[["SalePrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["SalePrice"]])
)

# Check the resulting column type
class(my_data_3[["SalePrice"]])
## [1] "numeric"
# Strip the "$" sign from UnitPrice and store the column as numbers
my_data_3[["UnitPrice"]] <- as.numeric(
  gsub("\\$", "", my_data_3[["UnitPrice"]])
)

# Strip the "%" sign from Discount and store the column as numbers
# (the values stay on the 0-100 percentage scale)
my_data_3[["Discount"]] <- as.numeric(
  gsub("%", "", my_data_3[["Discount"]])
)

# Peek at the first few converted discounts
head(my_data_3[["Discount"]])
## [1]  0 20 20  0  0  0
# Check the column types after the conversions
class(my_data_3[["UnitPrice"]])
## [1] "numeric"
class(my_data_3[["Discount"]])
## [1] "numeric"

1) Selection of date column

I have selected the `Date` column of my dataset. It records the day on which the customer bought the shoes.

2) Plotting response variable over time.

My response variable is SalePrice, the price at which customers buy the shoes.

# `Date` is still text in month/day/year form at this point. Mapped as text it
# becomes a discrete axis whose points geom_line() cannot connect, so parse it
# to a Date inside the aesthetic to get a continuous time axis.
ggplot(
  my_data_3,
  aes(x = as.Date(Date, format = "%m/%d/%Y"), y = SalePrice)
) +
  geom_line() +
  labs(
    title = "Sale Price Over Time",
    x = "Date",
    y = "Sale Price ($)"
  )

.

3) Creating a tsibble object.

# Build a genuine tsibble (the original produced only a plain tibble with a
# text date column):
#   1. parse the month/day/year strings into Date values;
#   2. drop rows whose date could not be parsed, since the index cannot be NA;
#   3. average SalePrice within each day, because a tsibble needs every index
#      value to be unique and several sales share the same date;
#   4. declare Date as the time index.
my_tsibble <- tibble(
  Date = as.Date(my_data_3$Date, format = "%m/%d/%Y"),
  SalePrice = my_data_3$SalePrice
) %>%
  filter(!is.na(Date)) %>%
  group_by(Date) %>%
  summarise(SalePrice = mean(SalePrice, na.rm = TRUE)) %>%
  as_tsibble(index = Date)
print(my_tsibble)
## # A tibble: 14,967 × 2
##    Date     SalePrice
##    <chr>        <dbl>
##  1 1/1/2014      159 
##  2 1/1/2014      159.
##  3 1/1/2014      119.
##  4 1/1/2014      159 
##  5 1/1/2014      159 
##  6 1/1/2014      159 
##  7 1/2/2014      179 
##  8 1/2/2014      169 
##  9 1/2/2014      139 
## 10 1/2/2014      149 
## # ℹ 14,957 more rows

From the above plot of SalePrice ($) against Date, we can conclude that the sale price of the shoes changes frequently from one date to the next, but the variation appears to be consistent over time.

4) Linear regression model between Date and SalePrice

# Fit a linear time trend of SalePrice on Date. If Date were left as text,
# lm() would treat it as a categorical predictor and estimate one coefficient
# per distinct day instead of a single slope, so it is parsed to a Date first.
# as.Date() should return Date input unchanged, so this also works when Date
# has already been parsed.
linear_model <- lm(
  SalePrice ~ Date,
  data = transform(
    as.data.frame(my_tsibble),
    Date = as.Date(Date, format = "%m/%d/%Y")
  )
)

5) Smoothing the time series.

# Centred moving average over an 11-row window (5 rows before, the current
# row, 5 rows after). slider's argument is `.complete` with a leading dot; the
# un-dotted `complete` was forwarded to mean() and silently ignored, so the
# partial windows at both ends were being averaged. With `.complete = TRUE`
# those edge positions are NA.
my_tsibble <- my_tsibble %>%
  mutate(
    Smoothed_SalePrice = slide_dbl(
      SalePrice,
      mean,
      .before = 5,
      .after = 5,
      .complete = TRUE
    )
  )

# Parse Date inside the aesthetic so the x-axis is a continuous time scale;
# as.Date() should return Date input unchanged, so this also works when Date
# has already been parsed.
ggplot(my_tsibble, aes(x = as.Date(Date, format = "%m/%d/%Y"))) +
  geom_line(aes(y = SalePrice, color = "Original")) +
  # na.rm = TRUE: the incomplete edge windows are NA by design
  geom_line(
    aes(y = Smoothed_SalePrice, color = "Smoothed"),
    linetype = "dashed",
    na.rm = TRUE
  ) +
  labs(
    title = "Original and Smoothed Time Series",
    x = "Date",
    y = "SalePrice",
    color = "Legend"
  ) +
  theme_minimal()

# `frequency = 12` is only meaningful when there is exactly one observation
# per calendar month, but the rows of `my_tsibble` are dated by day. Average
# SalePrice within each month first so that the series really is monthly.
# NOTE(review): assumes every month between the first and last date has at
# least one sale; a missing month would shift the later observations.
monthly_sale_price <- my_tsibble %>%
  as_tibble() %>%
  mutate(
    Month = floor_date(as.Date(Date, format = "%m/%d/%Y"), unit = "month")
  ) %>%
  filter(!is.na(Month)) %>%
  group_by(Month) %>%
  summarise(SalePrice = mean(SalePrice, na.rm = TRUE))

first_month <- min(monthly_sale_price$Month)
my_ts <- ts(
  monthly_sale_price$SalePrice,
  start = c(year(first_month), month(first_month)),
  frequency = 12
)

# Plot ACF and PACF stacked in one figure; keep the previous graphics
# settings so they can be restored afterwards.
old_par <- par(mfrow = c(2, 1))

# ACF plot
acf(my_ts, main = "Autocorrelation Function (ACF)")

# PACF plot
pacf(my_ts, main = "Partial Autocorrelation Function (PACF)")

# Restore the original plotting layout
par(old_par)