LBYMET2 Project 1

Project 1 :")

# Start from an empty workspace so objects left over from an earlier session
# cannot leak into this analysis. This only removes objects; attached packages
# and options are left as they are.
rm(list = ls())

# Work from the project folder when it exists on this machine. The path is
# specific to one computer, so skip the change anywhere else (for example when
# the script is already run from the folder that holds the CSV) instead of
# stopping with a "cannot change working directory" error.
project_dir <- "C:/Users/Una/Documents/R/LBYMET2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# load the stuff I need
library(tidyverse)     # for data wrangling if needed
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)      # for time series tools (ACF, PACF, etc.)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
library(tseries)       # for adf.test
library(ggplot2)       # for plots
# import the yearly figures; the relative path is resolved against the
# current working directory
df <- read.csv(file = "OnlineShoppers.csv")

# print the first six rows as a sanity check
head(x = df, n = 6L)
  year percentage
1 2002         18
2 2003         17
3 2004         27
4 2005         27
5 2006         30
6 2007         35
# compact summary of the data frame: row count plus each column's type and
# first few values
str(df)
'data.frame':   17 obs. of  2 variables:
 $ year      : int  2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ...
 $ percentage: int  18 17 27 27 30 35 36 40 42 50 ...
# should have 'year' and 'percentage' — looks okay
# Turn the 'percentage' column into an annual time series (frequency = 1).
#
# The first year is taken from the data and the end is left for ts() to work
# out from the number of observations. With a hard-coded start AND end, ts()
# silently truncates or recycles the values whenever the CSV has a different
# number of rows than the span implies.
#
# ts() only stores a start and a step, so the rows must already be one per
# consecutive year, in ascending order; stop early if they are not.
stopifnot(
  is.numeric(df$year),
  length(df$year) > 0,
  !anyNA(df$year),
  all(diff(df$year) == 1)
)
shop_ts <- ts(df$percentage, start = df$year[1], frequency = 1)
# plain line chart of the raw series
plot(
  shop_ts,
  xlab = "Year",
  ylab = "% of Population",
  main = "Online Shoppers in Singapore (2002–2018)"
)

# same data again, drawn as filled blue points joined by a thicker line

plot(
  shop_ts,
  type = "o",
  pch = 16,
  col = "blue",
  lwd = 2,
  xlab = "Year",
  ylab = "% of Population",
  main = "Online Shoppers in Singapore (2002-2018)"
)

# add reference grid lines to the plot that was just drawn
grid()

# same chart once more, this time built with ggplot2 from the data frame

ggplot(df, aes(x = year, y = percentage)) +
  # line thickness is set with `linewidth`; using `size` for lines was
  # deprecated in ggplot2 3.4.0 and triggers a warning
  geom_line(color = "steelblue", linewidth = 1) +
  # `size` is still the right argument for point size
  geom_point(color = "darkred", size = 2) +
  labs(title = "Online Shoppers in Singapore (2002–2018)",
       x = "Year",
       y = "% of Population") +
  theme_minimal()

# Augmented Dickey-Fuller test on the raw series
# H0: the series is non-stationary, so a small p-value points to stationarity
adf.test(x = shop_ts, alternative = "stationary")

    Augmented Dickey-Fuller Test

data:  shop_ts
Dickey-Fuller = -1.9781, Lag order = 2, p-value = 0.5807
alternative hypothesis: stationary
# p > 0.05, so the test fails to reject the null of non-stationarity
# take the first difference of the series and plot it

diff_ts <- diff(shop_ts, lag = 1, differences = 1)
plot(diff_ts, main = "Differenced Series (% of Online Shoppers)")

# repeat the ADF test on the differenced series
adf.test(x = diff_ts, alternative = "stationary")

    Augmented Dickey-Fuller Test

data:  diff_ts
Dickey-Fuller = -2.2175, Lag order = 2, p-value = 0.4895
alternative hypothesis: stationary
# the first difference still does not pass the test,
# so difference the series a second time
diff2_ts <- diff(shop_ts, lag = 1, differences = 2)
plot(diff2_ts, main = "2nd Differenced Series")

adf.test(x = diff2_ts, alternative = "stationary")

    Augmented Dickey-Fuller Test

data:  diff2_ts
Dickey-Fuller = -2.409, Lag order = 2, p-value = 0.4166
alternative hypothesis: stationary
# none of the ADF tests rejected the null, so carry on with the first difference
# correlograms of the differenced series
# used to judge whether the structure looks AR, MA, or both

# only the first 10 lags are shown
acf(x = diff_ts, lag.max = 10, main = "ACF of First-Differenced Series")

pacf(x = diff_ts, lag.max = 10, main = "PACF of First-Differenced Series")

# save the time-series chart to an 800 x 600 PNG file for the slides
png(filename = "online_shoppers_timeseries.png", width = 800, height = 600)
plot(
  shop_ts,
  type = "o", pch = 16, col = "blue", lwd = 2,
  xlab = "Year",
  ylab = "% of Population",
  main = "Online Shoppers in Singapore (2002–2018)"
)
grid()
# close the PNG device so the file is written out
dev.off()
png 
  2 
# save the ACF correlogram of the first difference to an 800 x 600 PNG file
png(filename = "acf_plot.png", width = 800, height = 600)
acf(x = diff_ts, lag.max = 10, main = "ACF of First-Differenced Series")
# close the PNG device so the file is written out
dev.off()
png 
  2 
# save the PACF correlogram of the first difference to an 800 x 600 PNG file
png(filename = "pacf_plot.png", width = 800, height = 600)
pacf(x = diff_ts, lag.max = 10, main = "PACF of First-Differenced Series")
# close the PNG device so the file is written out
dev.off()
png 
  2