# Load the raw Online Retail export into a data frame.
retail <- read.csv(
  file = "C:/Users/laasy/Documents/Fall 2023/Intro to Statistics in R/Datasets for Final Project/OnlineRetail.csv"
)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Total UnitPrice for every distinct InvoiceDate value. At this point the
# column still holds the raw timestamp text, so each group is one timestamp.
retail_aggregate <- summarise(
  group_by(retail, InvoiceDate),
  Sum_UP = sum(UnitPrice)
)
print(retail_aggregate)
## # A tibble: 23,260 × 2
## InvoiceDate Sum_UP
## <chr> <dbl>
## 1 1/10/2011 10:04 0
## 2 1/10/2011 10:07 3.75
## 3 1/10/2011 10:08 0
## 4 1/10/2011 10:32 44.2
## 5 1/10/2011 10:35 53.3
## 6 1/10/2011 10:36 0
## 7 1/10/2011 10:44 2.55
## 8 1/10/2011 10:58 125.
## 9 1/10/2011 11:09 40.3
## 10 1/10/2011 11:22 135.
## # ℹ 23,250 more rows
library(dplyr)
# Parse the "month/day/year hour:minute" invoice timestamps and reduce them to
# calendar dates.
#
# Both steps are pinned to UTC. Without an explicit tz, as.POSIXct() parses in
# the session's time zone while as.Date() on a POSIXct converts in UTC, so
# late-day invoices can land on the following date whenever the session is
# west of UTC. Parsing in UTC also avoids wall-clock times that fall inside a
# local daylight-saving gap, which may otherwise come back as NA.
retail$InvoiceDate <- as.POSIXct(
  retail$InvoiceDate,
  format = "%m/%d/%Y %H:%M",
  tz = "UTC"
)
retail$InvoiceDate <- as.Date(retail$InvoiceDate, tz = "UTC")
# Re-aggregate now that InvoiceDate is a Date: one row per calendar day with
# the total UnitPrice of that day's rows.
retail_aggregate <- retail |>
  group_by(InvoiceDate) |>
  summarise(Sum_UP = sum(UnitPrice))
print(retail_aggregate)
## # A tibble: 306 × 2
## InvoiceDate Sum_UP
## <date> <dbl>
## 1 2010-12-01 12904.
## 2 2010-12-02 6406.
## 3 2010-12-03 11524.
## 4 2010-12-05 7893.
## 5 2010-12-06 17679.
## 6 2010-12-07 85152.
## 7 2010-12-08 9978.
## 8 2010-12-09 14318.
## 9 2010-12-10 13169.
## 10 2010-12-12 3966.
## # ℹ 296 more rows
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
# Turn the daily aggregate into a tsibble indexed by InvoiceDate, then insert
# explicit rows (Sum_UP = NA) for dates missing from the index so the series
# is regular.
retail_ts <- retail_aggregate |>
  as_tsibble(index = InvoiceDate) |>
  fill_gaps()

# View() opens a data viewer and is only meaningful in an interactive session;
# skip it when the script is run or knitted non-interactively.
if (interactive()) {
  View(retail_ts)
}
library(ggplot2)
# Line chart of the Sum_UP series against InvoiceDate.
ggplot(retail_ts, aes(x = InvoiceDate, y = Sum_UP)) +
  geom_line() +
  labs(
    title = "Unit price sum Over Time",
    subtitle = "Overall trends of unit price sum"
  )
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Scatter of the daily sums for 2010-2011 with a loess smooth on top.
retail_ts |>
  # keep only index values from the start of 2010 through the end of 2011
  filter_index("2010" ~ "2011") |>
  # rows with missing values cannot be plotted or smoothed
  drop_na() |>
  ggplot(aes(x = InvoiceDate, y = Sum_UP)) +
  geom_point(shape = "O", size = 1) +
  geom_smooth(se = FALSE, color = "blue", span = 0.2) +
  labs(title = "Unit price sum variation in 2010-2011")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
The overall trend of the graph is upward, indicating that the daily sum of
unit prices is increasing over time. The largest increase in the daily
unit price sum occurred between April and July 2011. There is a seasonal
pattern in the data, with the daily unit price sum being highest in the
summer and lowest in the winter. The daily unit price sum in October 2011
is more than three times higher than in January 2011.
# Quarterly mean of the daily unit-price sums, drawn as a line with a linear
# trend overlay.
retail_ts |>
  # floor_date(..., "quarter") maps every date to the first day of its quarter
  index_by(quarter = floor_date(InvoiceDate, "quarter")) |>
  # na.rm = TRUE: gap-filled days carry NA and must not poison the mean
  summarise(avg_sum = mean(Sum_UP, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = quarter, y = avg_sum)) +
  # `linewidth` replaces the `size` aesthetic for lines (ggplot2 >= 3.4.0)
  geom_line(color = "blue", linewidth = 1.5) +
  # The default loess smoother with span = 0.3 has too few quarterly points to
  # fit (the recorded run failed inside stat_smooth()), so draw a linear trend
  # instead.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    color = "red",
    se = FALSE,
    linewidth = 1.5
  ) +
  labs(
    title = "Average Unit price Over Time",
    subtitle = "by quarter year",
    y = "Average Unit price",
    x = "Year"
  ) +
  # one tick per year, labelled with the year number only
  scale_x_date(date_breaks = "1 year", labels = \(x) year(x))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 14881
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 3.3306
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 14881
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1.825
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 15250
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 3.3306
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 3.3306
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`
## Caused by error in `predLoess()`:
## ! NA/NaN/Inf in foreign function call (arg 5)
The overall trend of the graph is upward, indicating that the quarterly
average of the daily unit price sum is increasing over time. The largest
increase in this average occurred between April and July 2011. The
average in October 2011 is more than three times higher than the average
in January 2011.
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Compute and plot the ACF and PACF of the daily sums to illustrate seasonality.

# acf()/pacf() cannot handle the NA rows, so drop them first.
# NOTE(review): this also removes the rows added by fill_gaps(), so a lag of k
# means k observed days rather than k calendar days -- confirm this is intended.
retail_ts <- na.omit(retail_ts)

# plot = FALSE: only the correlation objects are needed here; the figures are
# drawn once below with autoplot() instead of a second time in base graphics.
your_acf <- acf(retail_ts$Sum_UP, lag.max = 30, plot = FALSE)
your_pacf <- pacf(retail_ts$Sum_UP, lag.max = 30, plot = FALSE)

# Plot ACF
autoplot(your_acf) +
  labs(
    title = "Autocorrelation Function (ACF)",
    x = "Lag",
    y = "ACF"
  )

# Plot PACF
autoplot(your_pacf) +
  labs(
    title = "Partial Autocorrelation Function (PACF)",
    x = "Lag",
    y = "PACF"
  )
There is only one negative value, at lag 25, which is not that significant
for analyzing the trend of unit price over invoice date. All other
lag values are significantly positive, indicating a positive trend in the
unit price ACF. The PACF plot supports the inference drawn from the ACF, as
the values at most of the lag points are positive.