library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
library(dplyr)
df <- read.csv('/Users/fahadmehfooz/Desktop/IUPUI/First Semester/Intro to Statistics/Intro to Stats Dataset/Dataset 1/Superstore.csv')
colnames(df)
## [1] "Row.ID" "Order.ID" "Order.Date" "Ship.Date"
## [5] "Ship.Mode" "Customer.ID" "Customer.Name" "Segment"
## [9] "Country" "City" "State" "Postal.Code"
## [13] "Region" "Product.ID" "Category" "Sub.Category"
## [17] "Product.Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
df$Ship_DT <- as.POSIXct(df$Ship.Date, format = "%d-%m-%Y")
head(format(df$Ship_DT, "%Y-%m-%d %H:%M:%S"))
## [1] "2013-11-12 00:00:00" "2013-11-12 00:00:00" "2013-06-17 00:00:00"
## [4] "2012-10-18 00:00:00" "2012-10-18 00:00:00" "2011-06-14 00:00:00"
df$Ship_DT <- as.Date(df$Ship_DT)
head(df["Ship_DT"])
## Ship_DT
## 1 2013-11-12
## 2 2013-11-12
## 3 2013-06-17
## 4 2012-10-18
## 5 2012-10-18
## 6 2011-06-14
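Since as.POSIXct() silently returns NA for any string that does not match the supplied format, a quick sanity check (not part of the original analysis, and assuming "%d-%m-%Y" covers every row) is to count failed parses:
# Hypothetical check: 0 means every Ship.Date string parsed successfully
sum(is.na(df$Ship_DT))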
# There are duplicate (date, sales) pairs; identify them before aggregating
duplicates <- df %>%
  count(Ship_DT, Sales) %>%
  filter(n > 1)
print(duplicates)
## Ship_DT Sales n
## 1 2011-04-09 22.960 2
## 2 2011-04-27 281.372 2
## 3 2011-11-25 9.960 2
## 4 2012-01-04 9.840 2
## 5 2012-09-12 6.096 2
## 6 2012-11-17 8.720 2
## 7 2012-12-10 6.480 2
## 8 2013-11-08 72.000 2
## 9 2014-04-24 20.736 2
## 10 2014-09-25 391.980 2
df_aggregated <- df %>%
  group_by(Ship_DT) %>%
  summarise(Sales = sum(Sales))
head(df_aggregated)
## # A tibble: 6 × 2
## Ship_DT Sales
## <date> <dbl>
## 1 2011-01-08 29.2
## 2 2011-01-09 308.
## 3 2011-01-11 4375.
## 4 2011-01-13 107.
## 5 2011-01-14 40.5
## 6 2011-01-15 9.94
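Summing by date resolves the duplicates above: every shipping date should now appear exactly once. A quick verification (not in the original analysis) that should return zero rows:
# Sketch: confirm no date occurs more than once after aggregation
df_aggregated %>%
  count(Ship_DT) %>%
  filter(n > 1)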
df_tsibble <- df_aggregated %>%
  as_tsibble(index = Ship_DT)
head(df_tsibble)
## # A tsibble: 6 x 2 [1D]
## Ship_DT Sales
## <date> <dbl>
## 1 2011-01-08 29.2
## 2 2011-01-09 308.
## 3 2011-01-11 4375.
## 4 2011-01-13 107.
## 5 2011-01-14 40.5
## 6 2011-01-15 9.94
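Note that the daily index has implicit gaps (2011-01-10 and 2011-01-12, for instance, are missing above). If a regular daily series were needed later, tsibble can make those gaps explicit; a sketch, under the assumption that days with no shipments mean zero sales:
# Insert the missing dates and fill their Sales with 0
df_tsibble %>%
  fill_gaps(Sales = 0) %>%
  head(10)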
# Plotting the entire time series
ggplot(df_tsibble, aes(x = Ship_DT, y = Sales)) +
  geom_line() +
  labs(title = "Total Sales over Time", x = "Shipping Date", y = "Sales") +
  theme_minimal()
years <- unique(format(df_tsibble$Ship_DT, "%Y"))
plots <- lapply(years, function(year) {
  df_filtered <- df_tsibble %>% filter(format(Ship_DT, "%Y") == year)
  ggplot(df_filtered, aes(x = Ship_DT, y = Sales)) +
    geom_line() +
    labs(title = paste("Total Sales in", year), x = "Date", y = "Sales") +
    theme_minimal()
})
plots
## (printing the list renders one "Total Sales in <year>" line plot per year, plots[[1]] through plots[[5]])
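As an aside, the same per-year view can be produced as a single faceted figure instead of a list of plots; a sketch using ggplot2's facet_wrap() (not part of the original code):
df_tsibble %>%
  mutate(Year = format(Ship_DT, "%Y")) %>%
  ggplot(aes(x = Ship_DT, y = Sales)) +
  geom_line() +
  facet_wrap(~ Year, scales = "free_x") +  # one panel per year, each with its own date axis
  labs(title = "Total Sales by Year", x = "Date", y = "Sales") +
  theme_minimal()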
Across all years, sales stay at roughly the same level throughout the year, apart from a few days where we see a sharp spike. These spikes could be due to discounts being offered, or to the festive season, when people tend to buy more often.
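The discount hypothesis can be probed informally by comparing the average discount on spike days against all other days. A sketch, where the top-1% cutoff for a "spike day" is an arbitrary choice, not from the original analysis:
# Dates whose daily total lands in the top 1% of all daily totals
spike_days <- df_aggregated %>%
  filter(Sales > quantile(Sales, 0.99)) %>%
  pull(Ship_DT)
# Compare mean discount on spike days vs. all other days
df %>%
  mutate(on_spike_day = Ship_DT %in% spike_days) %>%
  group_by(on_spike_day) %>%
  summarise(mean_discount = mean(Discount), transactions = n())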
# Ship_DT is already a Date from the earlier conversion, so no format is needed here
cdf <- df %>%
  mutate(Ship_DT = as.Date(Ship_DT))
# Creating a numeric time variable: days elapsed since the first shipping date
cdf <- cdf %>%
  mutate(Time = as.numeric(Ship_DT - min(Ship_DT)))
# Fitting a linear regression model to the data
model <- lm(Sales ~ Time, data = cdf)
# Getting a summary of the model to check the trend
model_summary <- summary(model)
# Printing the summary to see the trend
print(model_summary)
##
## Call:
## lm(formula = Sales ~ Time, data = cdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -240.2 -211.6 -175.7 -19.3 22397.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 241.87919 14.02430 17.247 <2e-16 ***
## Time -0.01418 0.01482 -0.957 0.339
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 623.2 on 9992 degrees of freedom
## Multiple R-squared: 9.163e-05, Adjusted R-squared: -8.437e-06
## F-statistic: 0.9157 on 1 and 9992 DF, p-value: 0.3386
ggplot(cdf, aes(x = Time, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Time (days since start)", y = "Sales", title = "Sales Trend Over Time") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
> Observation: The intercept is highly significant (p < 2e-16), indicating that the average level of sales at the start of the series (Time = 0) is significantly different from zero. The Time coefficient is very small (-0.01418) and not statistically significant (p = 0.339), so there is no evidence of a linear trend in sales over time. With an R-squared of essentially zero, the model explains almost none of the variation in sales: there is no strong trend, and this is a poorly fitting model.
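One caveat worth noting (an observation beyond the original text): this regression is fit on per-transaction sales, so the enormous residual spread largely reflects variation between individual transactions rather than movement over time. A sketch of the same model refit on the aggregated daily totals:
# Refit the trend model on daily total sales instead of individual transactions
daily <- df_aggregated %>%
  mutate(Time = as.numeric(Ship_DT - min(Ship_DT)))
summary(lm(Sales ~ Time, data = daily))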
# Build a ts object from the transaction-level sales; frequency = 12 treats
# every 12 consecutive observations as one seasonal cycle
sales_ts <- ts(df$Sales, frequency = 12,
               start = c(year(min(df$Ship_DT)), month(min(df$Ship_DT))))
# Apply STL decomposition with a periodic (fixed) seasonal pattern
decomp <- stl(sales_ts, s.window = "periodic")
seasonal_comp <- decomp$time.series[, "seasonal"]
plot(seasonal_comp, main = "Seasonal Component", ylab = "Seasonal Effect")
autoplot(decomp)
Both plots show the seasonal component extracted by the decomposition.
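To go beyond eyeballing the plot, the seasonal-strength measure from Hyndman and Athanasopoulos, F_s = max(0, 1 - Var(remainder) / Var(seasonal + remainder)), summarizes how much of the detrended variation the seasonal component accounts for. This sketch (not in the original analysis) computes it from the components already extracted:
# F_s near 1 indicates strong seasonality, near 0 weak seasonality
remainder_comp <- decomp$time.series[, "remainder"]
f_s <- max(0, 1 - var(remainder_comp) / var(seasonal_comp + remainder_comp))
f_s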
acf_result <- Acf(sales_ts, main="ACF for Sales Data")
pacf_result <- Pacf(sales_ts, main="PACF for Sales Data")
acf_result
##
## Autocorrelations of series 'sales_ts', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.007 -0.002 -0.004 0.000 -0.008 0.001 0.001 -0.012 0.002 -0.009
## 11 12 13 14 15 16 17 18 19 20 21
## -0.003 0.002 0.011 0.005 0.006 -0.003 0.006 0.009 -0.006 0.012 -0.011
## 22 23 24 25 26 27 28 29 30 31 32
## -0.008 -0.014 0.016 -0.003 -0.007 0.000 0.011 -0.013 -0.003 0.003 -0.005
## 33 34 35 36 37 38 39
## 0.005 -0.001 -0.002 -0.006 0.009 0.007 -0.006
pacf_result
##
## Partial autocorrelations of series 'sales_ts', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## 0.007 -0.002 -0.004 0.000 -0.008 0.001 0.001 -0.012 0.003 -0.009 -0.003
## 12 13 14 15 16 17 18 19 20 21 22
## 0.002 0.011 0.005 0.006 -0.003 0.006 0.009 -0.006 0.012 -0.011 -0.008
## 23 24 25 26 27 28 29 30 31 32 33
## -0.014 0.016 -0.003 -0.007 0.000 0.011 -0.013 -0.003 0.003 -0.005 0.005
## 34 35 36 37 38 39
## -0.001 -0.002 -0.006 0.009 0.007 -0.006
Interpretation for PACF:
The PACF at lag 1 is slightly positive (0.007), suggesting a very small linear relationship between each value and the one immediately prior, after accounting for the intervening lags. Most of the PACF values are very small and close to zero, which indicates little to no partial autocorrelation at those lags. There is also no pattern of significant spikes at fixed intervals, which points to the absence of strong autoregressive effects in the data.
Interpretation for ACF: At lag 0, the ACF is always 1 because the series is perfectly correlated with itself. The ACF values at lags 1 through 39 are all small, most of them close to zero and none indicating strong autocorrelation. This suggests little to no linear dependency between past and current values of the time series.
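A formal complement to this visual reading (not part of the original analysis) is the Ljung-Box test, whose null hypothesis is that the autocorrelations up to the chosen lag are jointly zero; a large p-value would be consistent with the near-zero values above:
# Test the first 12 autocorrelations jointly; lag = 12 is an arbitrary choice
Box.test(sales_ts, lag = 12, type = "Ljung-Box")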