# LBYMET2 Project 1
# Project 1 :")

# For a clean slate, restart the R session instead of calling
# rm(list = ls()): that call only removes objects from the global
# environment and leaves attached packages and options() untouched.

# Point R at the project folder. The guard keeps the script runnable on a
# machine where this path does not exist; in that case start R from the
# folder that holds the data file read further down.
project_dir <- "C:/Users/Una/Documents/R/LBYMET2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# load the stuff I need
# Lines starting with "#>" are console output captured when the script was
# last run; they record the package versions that were in use.
library(tidyverse) # for data wrangling if needed
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.2.0     ✔ readr     2.1.6
#> ✔ forcats   1.0.1     ✔ stringr   1.6.0
#> ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
#> ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
#> ✔ purrr     1.2.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast) # for time series tools (ACF, PACF, etc.)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(tseries) # for adf.test
library(ggplot2) # for plots (tidyverse already attaches it; kept explicit)

# read the CSV
df <- read.csv("OnlineShoppers.csv")

# Fail fast if the file lacks the two columns the rest of the script uses.
stopifnot(all(c("year", "percentage") %in% names(df)))

# quick peek ("#>" lines are the console output from the last run)
head(df)
#>   year percentage
#> 1 2002         18
#> 2 2003         17
#> 3 2004         27
#> 4 2005         27
#> 5 2006         30
#> 6 2007         35
str(df)
#> 'data.frame': 17 obs. of 2 variables:
#>  $ year      : int 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ...
#>  $ percentage: int 18 17 27 27 30 35 36 40 42 50 ...
# should have 'year' and 'percentage' — looks okay

# turn the 'percentage' column into a time series
# Annual series: one observation per year, spanning 2002 through 2018.
shop_ts <- ts(
  data = df$percentage,
  frequency = 1,
  start = 2002,
  end = 2018
)

# first look — plot the time series
# Default line plot of the series with a title and axis labels.
plot(
  x = shop_ts,
  xlab = "Year",
  ylab = "% of Population",
  main = "Online Shoppers in Singapore (2002–2018)"
)

# maybe I can try a different way to plot that
# Same series, restyled: type "o" overplots point markers on the line,
# drawn in blue with solid markers (pch 16) and a thicker stroke (lwd 2).
plot(
  x = shop_ts,
  type = "o",
  pch = 16,
  lwd = 2,
  col = "blue",
  xlab = "Year",
  ylab = "% of Population",
  main = "Online Shoppers in Singapore (2002-2018)"
)
# Add reference grid lines on top of the finished plot.
grid()

# what if ggplot lol
# ggplot2 version of the same chart, built from the raw data frame.
# Line thickness is set with `linewidth`: using `size` for lines has been
# deprecated since ggplot2 3.4.0 and raised a warning on the last run.
# `size` is still the right argument for the point markers.
ggplot(df, aes(x = year, y = percentage)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "darkred", size = 2) +
  labs(
    title = "Online Shoppers in Singapore (2002–2018)",
    x = "Year",
    y = "% of Population"
  ) +
  theme_minimal()
# hmm... is this stationary? test just to be sure
# ADF null hypothesis = non-stationary, so a small p-value is evidence
# in favour of stationarity.
adf.test(shop_ts)
#>
#>  Augmented Dickey-Fuller Test
#>
#> data:  shop_ts
#> Dickey-Fuller = -1.9781, Lag order = 2, p-value = 0.5807
#> alternative hypothesis: stationary
# it fails to reject the null because p > 0.05
# try differencing, then plot
diff_ts <- diff(shop_ts)
plot(diff_ts, main = "Differenced Series (% of Online Shoppers)")

# rerun ADF to see if it's stationary now
adf.test(diff_ts)
#>
#>  Augmented Dickey-Fuller Test
#>
#> data:  diff_ts
#> Dickey-Fuller = -2.2175, Lag order = 2, p-value = 0.4895
#> alternative hypothesis: stationary
# aaaa still can't reject non-stationarity (p > 0.05).. :/
# can i try differencing again? .. :(
diff2_ts <- diff(diff_ts)
plot(diff2_ts, main = "2nd Differenced Series")
adf.test(diff2_ts)
#>
#>  Augmented Dickey-Fuller Test
#>
#> data:  diff2_ts
#> Dickey-Fuller = -2.409, Lag order = 2, p-value = 0.4166
#> alternative hypothesis: stationary
# Still p > 0.05 after two differences.
# NOTE(review): the series has only 17 yearly points, so the test may
# simply lack the power to reject here — confirm before over-differencing.
# i guess we go back to the first difference??? uhm hm
# ACF and PACF time!
# this helps figure out if it's AR or MA or both
acf(
  diff_ts,
  main = "ACF of First-Differenced Series",
  lag.max = 10 # just looking at first few lags
)
# The PACF call was previously fused into the comment above and never ran.
pacf(
  diff_ts,
  main = "PACF of First-Differenced Series",
  lag.max = 10
)

# need photo for slides (ill try this code again)
# Write a single plot to a PNG file.
#
# file:          output path handed to png().
# draw:          zero-argument function that draws the plot on the open device.
# width, height: image size, passed straight to png() (800 x 600 as before).
#
# The device is closed through on.exit(), so it is released even when draw()
# stops with an error. Returns `file` invisibly.
# (The "png 2" lines that used to follow each dev.off() were echoed console
# output, not code.)
save_png <- function(file, draw, width = 800, height = 600) {
  png(file, width = width, height = height)
  on.exit(dev.off(), add = TRUE)
  draw()
  invisible(file)
}

# Time series plot for the slides.
save_png("online_shoppers_timeseries.png", function() {
  plot(
    shop_ts,
    main = "Online Shoppers in Singapore (2002–2018)",
    ylab = "% of Population",
    xlab = "Year",
    type = "o", col = "blue", pch = 16, lwd = 2
  )
  grid()
})

# ACF of the first-differenced series.
save_png("acf_plot.png", function() {
  acf(
    diff_ts,
    main = "ACF of First-Differenced Series",
    lag.max = 10
  )
})

# PACF of the first-differenced series.
save_png("pacf_plot.png", function() {
  pacf(
    diff_ts,
    main = "PACF of First-Differenced Series",
    lag.max = 10
  )
})