library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✓ tibble      3.1.5     ✓ tsibble     1.1.1
## ✓ dplyr       1.0.7     ✓ tsibbledata 0.4.0
## ✓ tidyr       1.1.4     ✓ feasts      0.2.2
## ✓ lubridate   1.8.0     ✓ fable       0.3.1
## ✓ ggplot2     3.3.5
## Warning: package 'tsibbledata' was built under R version 4.0.5
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()

1) Use the help function to explore what the series gafa_stock, PBS, vic_elec and pelt represent.

#help("gafa_stock")
#help("PBS")
#help("vic_elec")
#help("pelt")

a) Use autoplot() to plot some of the series in these data sets.

autoplot(gafa_stock, .vars = Close)

autoplot(vic_elec, .vars = Demand)

autoplot(pelt, .vars = Lynx)

How do the two animals’ pelt counts align over the years?

pelt %>%    # using the code provided for Q.3
  pivot_longer(-Year) %>% 
  ggplot(aes(x = Year, y = value, colour = name)) + 
  geom_line() + 
  labs(y='pelts')
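The plot shows the two series rising and falling together, with one species’ peaks appearing to trail the other’s slightly. To quantify that lead/lag relationship (a sketch going beyond what the question asks), the cross-correlation function from feasts can be used:

# Cross-correlation between the two pelt series; a spike at a
# non-zero lag would indicate that one series leads the other.
pelt %>%
  CCF(Lynx, Hare, lag_max = 10) %>%
  autoplot()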

b) What is the time interval of each series?

interval(pelt)
## <interval[1]>
## [1] 1Y

gafa_stock: daily, but only trading days are present, so the tsibble interval is irregular ([!])

PBS: 1 month

vic_elec: 30 minutes

pelt: 1 year
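interval() confirms these directly for the other series as well (a quick sketch; gafa_stock reports ! because the missing non-trading days make its index irregular rather than strictly daily):

interval(gafa_stock)  # ! (irregular: trading days only)
interval(PBS)         # 1M
interval(vic_elec)    # 30m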

2) Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock

highs = tibble()
for (stock in unique(gafa_stock$Symbol)) {
  # keep this stock's rows, then the row(s) matching its peak Close
  high = gafa_stock %>%
    filter(Symbol == stock) %>%
    filter(Close == max(Close))
  highs = bind_rows(highs, high)
}
data.frame(highs)  # displays better than a tibble
##   Symbol       Date    Open     High     Low   Close Adj_Close   Volume
## 1   AAPL 2018-10-03  230.05  233.470  229.78  232.07  230.2755 28654800
## 2   AMZN 2018-09-04 2026.50 2050.500 2013.00 2039.51 2039.5100  5721100
## 3     FB 2018-07-25  215.72  218.620  214.27  217.50  217.5000 58954200
## 4   GOOG 2018-07-26 1251.00 1269.771 1249.02 1268.33 1268.3300  2405600
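Since Symbol is the key of the gafa_stock tsibble, a grouped filter gives the same rows without an explicit loop (a more idiomatic dplyr sketch):

# Max Close per Symbol in one pipeline
gafa_stock %>%
  group_by(Symbol) %>%
  filter(Close == max(Close)) %>%
  ungroup()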

3) Download the file tute1.csv from the book website

sales = read.csv("https://raw.githubusercontent.com/ebhtra/msds-624/main/tute1.csv")
head(sales)
##      Quarter  Sales AdBudget   GDP
## 1 1981-03-01 1020.2    659.2 251.8
## 2 1981-06-01  889.2    589.0 290.9
## 3 1981-09-01  795.0    512.5 290.8
## 4 1981-12-01 1003.9    614.1 292.4
## 5 1982-03-01 1057.7    647.2 279.1
## 6 1982-06-01  944.4    602.0 254.0

Convert the data to time series

mytimeseries <- sales %>%
  mutate(Quarter = yearmonth(Quarter)) %>%
  as_tsibble(index = Quarter)
head(mytimeseries)
## # A tsibble: 6 x 4 [3M]
##    Quarter Sales AdBudget   GDP
##      <mth> <dbl>    <dbl> <dbl>
## 1 1981 Mar 1020.     659.  252.
## 2 1981 Jun  889.     589   291.
## 3 1981 Sep  795      512.  291.
## 4 1981 Dec 1004.     614.  292.
## 5 1982 Mar 1058.     647.  279.
## 6 1982 Jun  944.     602   254
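Note the [3M] interval above: yearmonth() stamps the quarterly observations as monthly dates three months apart. Since the data are quarterly, yearquarter() would give a true quarterly [1Q] index instead (a sketch):

# Alternative conversion with a quarterly index
sales %>%
  mutate(Quarter = yearquarter(Quarter)) %>%
  as_tsibble(index = Quarter)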

Construct time series plots of each of the three series

mytimeseries %>%
  pivot_longer(-Quarter) %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")
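For comparison (a sketch), dropping facet_grid() puts all three series on one shared y-axis, which makes their relative magnitudes easy to compare but hides the variation within each series:

# Same three series without facets, sharing one y-axis
mytimeseries %>%
  pivot_longer(-Quarter) %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line()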

4) The USgas package contains data on the demand for natural gas in the US.

a) Install the USgas package.

#install.packages('USgas')
library(USgas)

b) Create a tsibble from us_total with year as the index and state as the key.

us_total %>%
  as_tsibble(key = state, index = year) -> us_total

c) Plot the annual natural gas consumption by state for the New England area
(comprising the states of Maine, Vermont, New Hampshire, Massachusetts, Connecticut and Rhode Island).

NE = c('Maine', 'Vermont', 'New Hampshire', 'Massachusetts', 'Connecticut', 'Rhode Island')
autoplot(us_total %>% filter(state %in% NE), .vars = y) + 
  labs(y='natgas consumed')
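The larger states dominate a shared y-axis here, so the smaller states' trends are hard to read. A faceted version with free y-scales (a sketch) shows each state's own pattern:

# One panel per New England state, each with its own y-scale
us_total %>%
  filter(state %in% NE) %>%
  ggplot(aes(x = year, y = y)) +
  geom_line() +
  facet_wrap(~ state, scales = "free_y") +
  labs(y = 'natgas consumed')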

5) a) Download tourism.xlsx from the book website and read it into R using readxl::read_excel().

#tour = read_excel("~/CUNY_MSDS/data624/tourism.xlsx")
#write.csv(tour, "tour.csv")
tour = read.csv("https://raw.githubusercontent.com/ebhtra/msds-624/main/HW/tour.csv")
str(tour)
## 'data.frame':    24320 obs. of  6 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Quarter: chr  "1998-01-01" "1998-04-01" "1998-07-01" "1998-10-01" ...
##  $ Region : chr  "Adelaide" "Adelaide" "Adelaide" "Adelaide" ...
##  $ State  : chr  "South Australia" "South Australia" "South Australia" "South Australia" ...
##  $ Purpose: chr  "Business" "Business" "Business" "Business" ...
##  $ Trips  : num  135 110 166 127 137 ...

b) Create a tsibble which is identical to the tourism tsibble from the tsibble package.

What are the index and key for tourism?

index(tourism)
## Quarter
key(tourism)
## [[1]]
## Region
## 
## [[2]]
## State
## 
## [[3]]
## Purpose
tour %>% 
  select(-X) %>%   # drop the row-number column that write.csv added
  mutate(Quarter = yearquarter(Quarter)) %>%
  as_tsibble(index = Quarter, key = c(Region, State, Purpose)) -> tour
head(tour)
## # A tsibble: 6 x 5 [1Q]
## # Key:       Region, State, Purpose [1]
##   Quarter Region   State           Purpose  Trips
##     <qtr> <chr>    <chr>           <chr>    <dbl>
## 1 1998 Q1 Adelaide South Australia Business  135.
## 2 1998 Q2 Adelaide South Australia Business  110.
## 3 1998 Q3 Adelaide South Australia Business  166.
## 4 1998 Q4 Adelaide South Australia Business  127.
## 5 1999 Q1 Adelaide South Australia Business  137.
## 6 1999 Q2 Adelaide South Australia Business  200.
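As a sanity check that the rebuilt tsibble matches the built-in one (hedged: all.equal() can still flag attribute differences even when the values agree):

all.equal(tour, tourism)  # should report TRUE, or only trivial differences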

c) Find what combination of Region and Purpose had the maximum number of overnight trips on average.

avg_trips = aggregate(Trips ~ Region + Purpose, tour, mean)
avg_trips[order(avg_trips$Trips, decreasing = T), ][1, ]
##     Region  Purpose  Trips
## 296 Sydney Visiting 747.27
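The same answer in tidyverse style (a sketch), avoiding the mix of base aggregate() with the dplyr code used elsewhere:

# Mean Trips per Region/Purpose combination, keeping the largest
tour %>%
  as_tibble() %>%
  group_by(Region, Purpose) %>%
  summarise(Trips = mean(Trips), .groups = "drop") %>%
  slice_max(Trips, n = 1)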

d) Create a new tsibble which combines the Purposes and Regions, and just has total trips by State.

The question doesn’t say whether to sum over quarters as well. First, keeping the quarters separate (a tsibble needs its time index anyway):

total_trips = aggregate(Trips ~ Quarter + State, tour, sum)
head(total_trips)
##   Quarter State    Trips
## 1 1998 Q1   ACT 551.0019
## 2 1998 Q2   ACT 416.0256
## 3 1998 Q3   ACT 436.0290
## 4 1998 Q4   ACT 449.7984
## 5 1999 Q1   ACT 378.5728
## 6 1999 Q2   ACT 558.1781

…and with combining quarters:

total_trips = aggregate(Trips ~ State, tour, sum)
total_trips
##                State     Trips
## 1                ACT  41006.59
## 2    New South Wales 557367.43
## 3 Northern Territory  28613.68
## 4         Queensland 386642.91
## 5    South Australia 118151.35
## 6           Tasmania  54137.09
## 7           Victoria 390462.91
## 8  Western Australia 147819.65
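Since the question asks for a tsibble and aggregate() returns a plain data frame, note that grouping the tsibble itself does the aggregation while keeping the Quarter index (a sketch; on a tsibble, summarise() collapses the keys within each index value):

# Total trips by State, as a tsibble keyed by State and indexed by Quarter
state_trips <- tour %>%
  group_by(State) %>%
  summarise(Trips = sum(Trips))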

8) Monthly Australian retail data is provided in aus_retail.

Select one of the time series as follows (choose your own seed value):

set.seed(624)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

Explore your chosen retail time series using the following functions:

autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF() %>% autoplot()

autoplot(myseries, .vars = Turnover)

myseries %>%
  gg_season(Turnover)

myseries %>%
  gg_subseries(Turnover)

Use quarterly lags over a two-year window

myseries %>%
  gg_lag(Turnover, geom = "point", lags = seq(3,24,3))

myseries %>%
  ACF(Turnover, lag_max = 24) %>%   # the feasts argument is lag_max, not maxLag
  autoplot()

Can you spot any seasonality, cyclicity and trend? What do you learn about the series?

The trend is upward, and not merely linear: the rate of increase itself grows over time.
autoplot() and gg_season() show this long-term trend most clearly. The points where the trend appears to change slope may be a sign of cyclicity: each begins with an unusually high run of values (1996-1998, 2010-2012), followed by a downturn, and then an extended period of more sharply increasing years. autoplot() and gg_subseries() show this possible cyclicity most clearly.

Although there is clear seasonality, with December's turnover the highest every year (presumably Christmas shopping, which also spills over into January a little), the ACF plot suggests that the trend is stronger than the seasonality. One false appearance of seasonality in these plots comes from the fact that months have different numbers of days, producing a misleading dip in February and, to a lesser extent, in June, September, and November. It would be less misleading to look at average turnover per day for each month, as sketched below.
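A sketch of that per-day adjustment (TurnoverPerDay is a hypothetical column name; Month is the series' yearmonth index):

# Normalise each month's turnover by its length in days before
# plotting the seasonal pattern, removing the month-length artifact
myseries %>%
  mutate(TurnoverPerDay = Turnover / days_in_month(as.Date(Month))) %>%
  gg_season(TurnoverPerDay)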

Checking which industry was randomly selected here:

unique(myseries$Industry)
## [1] "Takeaway food services"

Knowing this series covers takeaway food makes the Christmas-shopping hypothesis seem a bit less likely, but without knowing Australians' December habits, it remains safe to say that the seasonal pattern is real.