library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
library(dplyr)
df <- read.csv('/Users/fahadmehfooz/Desktop/IUPUI/First Semester/Intro to Statistics/Intro to Stats Dataset/Dataset 1/Superstore.csv')
colnames(df)
## [1] "Row.ID" "Order.ID" "Order.Date" "Ship.Date"
## [5] "Ship.Mode" "Customer.ID" "Customer.Name" "Segment"
## [9] "Country" "City" "State" "Postal.Code"
## [13] "Region" "Product.ID" "Category" "Sub.Category"
## [17] "Product.Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
df$Ship_DT <- as.POSIXct(df$Ship.Date, format = "%d-%m-%Y")
head(format(df$Ship_DT, "%Y-%m-%d %H:%M:%S"))
## [1] "2013-11-12 00:00:00" "2013-11-12 00:00:00" "2013-06-17 00:00:00"
## [4] "2012-10-18 00:00:00" "2012-10-18 00:00:00" "2011-06-14 00:00:00"
df$Ship_DT <- as.Date(df$Ship_DT)
head(df["Ship_DT"])
## Ship_DT
## 1 2013-11-12
## 2 2013-11-12
## 3 2013-06-17
## 4 2012-10-18
## 5 2012-10-18
## 6 2011-06-14
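Since as.POSIXct() silently returns NA for any string that does not match the supplied format, a quick sanity check (not part of the original analysis, and assuming "%d-%m-%Y" covers every row) is to count failed parses:
# Hypothetical check: 0 means every Ship.Date string parsed successfully
sum(is.na(df$Ship_DT))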
# There are duplicate (date, sales) pairs; identify them before aggregating
duplicates <- df %>%
  count(Ship_DT, Sales) %>%
  filter(n > 1)
print(duplicates)
## Ship_DT Sales n
## 1 2011-04-09 22.960 2
## 2 2011-04-27 281.372 2
## 3 2011-11-25 9.960 2
## 4 2012-01-04 9.840 2
## 5 2012-09-12 6.096 2
## 6 2012-11-17 8.720 2
## 7 2012-12-10 6.480 2
## 8 2013-11-08 72.000 2
## 9 2014-04-24 20.736 2
## 10 2014-09-25 391.980 2
df_aggregated <- df %>%
  group_by(Ship_DT) %>%
  summarise(Sales = sum(Sales))
head(df_aggregated)
## # A tibble: 6 × 2
## Ship_DT Sales
## <date> <dbl>
## 1 2011-01-08 29.2
## 2 2011-01-09 308.
## 3 2011-01-11 4375.
## 4 2011-01-13 107.
## 5 2011-01-14 40.5
## 6 2011-01-15 9.94
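Summing by date resolves the duplicates above: every shipping date should now appear exactly once. A quick verification (not in the original analysis) that should return zero rows:
# Sketch: confirm no date occurs more than once after aggregation
df_aggregated %>%
  count(Ship_DT) %>%
  filter(n > 1)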
df_tsibble <- df_aggregated %>%
  as_tsibble(index = Ship_DT)
head(df_tsibble)
## # A tsibble: 6 x 2 [1D]
## Ship_DT Sales
## <date> <dbl>
## 1 2011-01-08 29.2
## 2 2011-01-09 308.
## 3 2011-01-11 4375.
## 4 2011-01-13 107.
## 5 2011-01-14 40.5
## 6 2011-01-15 9.94
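Note that the daily index has implicit gaps (2011-01-10 and 2011-01-12, for instance, are missing above). If a regular daily series were needed later, tsibble can make those gaps explicit; a sketch, under the assumption that days with no shipments mean zero sales:
# Insert the missing dates and fill their Sales with 0
df_tsibble %>%
  fill_gaps(Sales = 0) %>%
  head(10)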
# Plotting the entire time series
ggplot(df_tsibble, aes(x = Ship_DT, y = Sales)) +
  geom_line() +
  labs(title = "Total Sales over Time", x = "Shipping Date", y = "Sales") +
  theme_minimal()
years <- unique(format(df_tsibble$Ship_DT, "%Y"))
plots <- lapply(years, function(year) {
  df_filtered <- df_tsibble %>% filter(format(Ship_DT, "%Y") == year)
  ggplot(df_filtered, aes(x = Ship_DT, y = Sales)) +
    geom_line() +
    labs(title = paste("Total Sales in", year), x = "Date", y = "Sales") +
    theme_minimal()
})
plots
## (printing the list renders one "Total Sales in <year>" line plot per year, plots[[1]] through plots[[5]])
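As an aside, the same per-year view can be produced as a single faceted figure instead of a list of plots; a sketch using ggplot2's facet_wrap() (not part of the original code):
df_tsibble %>%
  mutate(Year = format(Ship_DT, "%Y")) %>%
  ggplot(aes(x = Ship_DT, y = Sales)) +
  geom_line() +
  facet_wrap(~ Year, scales = "free_x") +  # one panel per year, each with its own date axis
  labs(title = "Total Sales by Year", x = "Date", y = "Sales") +
  theme_minimal()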
Across all years, sales stay at roughly the same level throughout the year, apart from a few days where we see a sharp spike. These spikes could be due to discounts being offered, or to the festive season, when people tend to buy more often.
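The discount hypothesis can be probed informally by comparing the average discount on spike days against all other days. A sketch, where the top-1% cutoff for a "spike day" is an arbitrary choice, not from the original analysis:
# Dates whose daily total lands in the top 1% of all daily totals
spike_days <- df_aggregated %>%
  filter(Sales > quantile(Sales, 0.99)) %>%
  pull(Ship_DT)
# Compare mean discount on spike days vs. all other days
df %>%
  mutate(on_spike_day = Ship_DT %in% spike_days) %>%
  group_by(on_spike_day) %>%
  summarise(mean_discount = mean(Discount), transactions = n())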
# Ship_DT is already a Date from the earlier conversion, so no format is needed here
cdf <- df %>%
  mutate(Ship_DT = as.Date(Ship_DT))
# Creating a numeric time variable: days elapsed since the first shipping date
cdf <- cdf %>%
  mutate(Time = as.numeric(Ship_DT - min(Ship_DT)))
# Fitting a linear regression model to the data
model <- lm(Sales ~ Time, data = cdf)
# Getting a summary of the model to check the trend
model_summary <- summary(model)
# Printing the summary to see the trend
print(model_summary)
##
## Call:
## lm(formula = Sales ~ Time, data = cdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -240.2 -211.6 -175.7 -19.3 22397.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 241.87919 14.02430 17.247 <2e-16 ***
## Time -0.01418 0.01482 -0.957 0.339
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 623.2 on 9992 degrees of freedom
## Multiple R-squared: 9.163e-05, Adjusted R-squared: -8.437e-06
## F-statistic: 0.9157 on 1 and 9992 DF, p-value: 0.3386
ggplot(cdf, aes(x = Time, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Time (days since start)", y = "Sales", title = "Sales Trend Over Time") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
> Observation: The intercept is highly significant (p < 2e-16), indicating that the average level of sales at the start of the series (Time = 0) is significantly different from zero. The Time coefficient is very small (-0.01418) and not statistically significant (p = 0.339), so there is no evidence of a linear trend in sales over time. With an R-squared of essentially zero, the model explains almost none of the variation in sales: there is no strong trend, and this is a poorly fitting model.
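One caveat worth noting (an observation beyond the original text): this regression is fit on per-transaction sales, so the enormous residual spread largely reflects variation between individual transactions rather than movement over time. A sketch of the same model refit on the aggregated daily totals:
# Refit the trend model on daily total sales instead of individual transactions
daily <- df_aggregated %>%
  mutate(Time = as.numeric(Ship_DT - min(Ship_DT)))
summary(lm(Sales ~ Time, data = daily))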
# Build a ts object from the transaction-level sales; frequency = 12 treats
# every 12 consecutive observations as one seasonal cycle
sales_ts <- ts(df$Sales, frequency = 12,
               start = c(year(min(df$Ship_DT)), month(min(df$Ship_DT))))
# Apply STL decomposition with a periodic (fixed) seasonal pattern
decomp <- stl(sales_ts, s.window = "periodic")
seasonal_comp <- decomp$time.series[, "seasonal"]
plot(seasonal_comp, main = "Seasonal Component", ylab = "Seasonal Effect")
autoplot(decomp)
Both plots show the seasonal component extracted by the decomposition.
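To go beyond eyeballing the plot, the seasonal-strength measure from Hyndman and Athanasopoulos, F_s = max(0, 1 - Var(remainder) / Var(seasonal + remainder)), summarizes how much of the detrended variation the seasonal component accounts for. This sketch (not in the original analysis) computes it from the components already extracted:
# F_s near 1 indicates strong seasonality, near 0 weak seasonality
remainder_comp <- decomp$time.series[, "remainder"]
f_s <- max(0, 1 - var(remainder_comp) / var(seasonal_comp + remainder_comp))
f_s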
acf_result <- Acf(sales_ts, main="ACF for Sales Data")
pacf_result <- Pacf(sales_ts, main="PACF for Sales Data")
acf_result
##
## Autocorrelations of series 'sales_ts', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.007 -0.002 -0.004 0.000 -0.008 0.001 0.001 -0.012 0.002 -0.009
## 11 12 13 14 15 16 17 18 19 20 21
## -0.003 0.002 0.011 0.005 0.006 -0.003 0.006 0.009 -0.006 0.012 -0.011
## 22 23 24 25 26 27 28 29 30 31 32
## -0.008 -0.014 0.016 -0.003 -0.007 0.000 0.011 -0.013 -0.003 0.003 -0.005
## 33 34 35 36 37 38 39
## 0.005 -0.001 -0.002 -0.006 0.009 0.007 -0.006
pacf_result
##
## Partial autocorrelations of series 'sales_ts', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## 0.007 -0.002 -0.004 0.000 -0.008 0.001 0.001 -0.012 0.003 -0.009 -0.003
## 12 13 14 15 16 17 18 19 20 21 22
## 0.002 0.011 0.005 0.006 -0.003 0.006 0.009 -0.006 0.012 -0.011 -0.008
## 23 24 25 26 27 28 29 30 31 32 33
## -0.014 0.016 -0.003 -0.007 0.000 0.011 -0.013 -0.003 0.003 -0.005 0.005
## 34 35 36 37 38 39
## -0.001 -0.002 -0.006 0.009 0.007 -0.006
Interpretation for PACF:
The PACF at lag 1 is slightly positive (0.007), suggesting a very small linear relationship between each value and the one immediately prior, after accounting for the intervening lags. Most of the PACF values are very small and close to zero, which indicates little to no partial autocorrelation at those lags. There is also no pattern of significant spikes at fixed intervals, which points to the absence of strong autoregressive effects in the data.
Interpretation for ACF: At lag 0, the ACF is always 1 because the series is perfectly correlated with itself. The ACF values at lags 1 through 39 are all small, most of them close to zero and none indicating strong autocorrelation. This suggests little to no linear dependency between past and current values of the time series.
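A formal complement to this visual reading (not part of the original analysis) is the Ljung-Box test, whose null hypothesis is that the autocorrelations up to the chosen lag are jointly zero; a large p-value would be consistent with the near-zero values above:
# Test the first 12 autocorrelations jointly; lag = 12 is an arbitrary choice
Box.test(sales_ts, lag = 12, type = "Ljung-Box")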