Data 624 Homework 1

library(fpp3)

## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.4
## ✔ dplyr       1.1.2     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.3.1
## ✔ lubridate   1.9.2     ✔ fable       0.3.3
## ✔ ggplot2     3.4.4     ✔ fabletools  0.3.4

## Warning: package 'tsibble' was built under R version 4.3.2

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(USgas)
library(knitr)
library(DT)
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.3.2

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Set knitr's table output format option to HTML for the tables below.
options(knitr.table.format = "html")

Ask

Exercises 2.1, 2.2, 2.3, 2.4, 2.5 and 2.8 from the Hyndman online Forecasting book.

2.1

Explore the following four time series: Bricks from aus_production, Lynx from pelt, Close from gafa_stock, Demand from vic_elec.

What is the time interval of each series?
Use autoplot() to produce a time plot of each series.
For the last plot, modify the axis labels and title.

Aus Production - Bricks

# Browse the aus_production data as a DT table.
aus_production |> datatable()

The time intervals are quarters.

# Time plot of quarterly clay brick production.
# The series records production (in millions of bricks), so label it as such.
aus_production |>
  autoplot(Bricks) +
  labs(
    y = "Millions of bricks",
    title = "Australian Clay Brick Production"
  )

## Warning: Removed 20 rows containing missing values (`geom_line()`).

Pelt - lynx

# Browse the pelt data as a DT table.
pelt |> datatable()

The time interval is one year (annual data).

# Time plot of the annual Lynx series.
# Lynx is a count of pelts traded, not "million units", and the plot needs a
# descriptive title rather than an empty one.
pelt |>
  autoplot(Lynx) +
  labs(
    y = "Number of pelts",
    title = "Canadian Lynx Pelts Traded"
  )

gafa-stock - Close

gafa_stock is the historical stock prices from 2014-2018 for Google, Amazon, Facebook and Apple. The close variable is the closing price for the stock.

The time series is daily, but it is recorded on trading days only, so the interval is irregular.

# Browse the gafa_stock data as a DT table.
gafa_stock |> datatable()

# Time plot of the Close price in gafa_stock.
autoplot(gafa_stock, Close) +
  labs(
    title = "Historical Stock Closing Price 2014-2018",
    y = "$USD"
  )

vic_elec - Demand

This data is for operational demand, which is the demand met by local scheduled generating units, semi-scheduled generating units, and non-scheduled intermittent generating units of aggregate capacity larger than 30 MWh, and by generation imports to the region.

vic_elec is a half-hourly time series. It contains three values:

Demand: Total electricity demand in MWh.
Temperature: Temperature of Melbourne (BOM site 086071).
Holiday: Indicator for if that day is a public holiday.

# Show the first rows of vic_elec to inspect its columns and interval.
vic_elec |> head()

## # A tsibble: 6 x 5 [30m] <Australia/Melbourne>
##   Time                Demand Temperature Date       Holiday
##   <dttm>               <dbl>       <dbl> <date>     <lgl>  
## 1 2012-01-01 00:00:00  4383.        21.4 2012-01-01 TRUE   
## 2 2012-01-01 00:30:00  4263.        21.0 2012-01-01 TRUE   
## 3 2012-01-01 01:00:00  4049.        20.7 2012-01-01 TRUE   
## 4 2012-01-01 01:30:00  3878.        20.6 2012-01-01 TRUE   
## 5 2012-01-01 02:00:00  4036.        20.4 2012-01-01 TRUE   
## 6 2012-01-01 02:30:00  3866.        20.2 2012-01-01 TRUE

# Time plot of half-hourly electricity demand.
# Demand is measured in MWh (megawatt-hours; "mWh" would mean milliwatt-hours),
# and vic_elec covers the state of Victoria rather than all of Australia.
vic_elec |>
  autoplot(Demand) +
  labs(
    y = "MWh",
    title = "Victoria (Australia) Total Electricity Demand"
  )

### 2.2 Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock.

# Keep only the columns of interest, then retain, within each Symbol, the
# row(s) whose Close equals that stock's maximum Close.
gafa_stock |>
  select(Symbol, Date, Close) |>
  group_by(Symbol) |>
  filter(Close == max(Close))

## # A tsibble: 4 x 3 [!]
## # Key:       Symbol [4]
## # Groups:    Symbol [4]
##   Symbol Date       Close
##   <chr>  <date>     <dbl>
## 1 AAPL   2018-10-03  232.
## 2 AMZN   2018-09-04 2040.
## 3 FB     2018-07-25  218.
## 4 GOOG   2018-07-26 1268.

2.3

# Import the tute1 data from the CSV file in the working directory.
tute1 <- readr::read_csv(file = "tute1.csv")

## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (3): Sales, AdBudget, GDP
## date (1): Quarter
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the imported data to check its columns and types.
tute1

## # A tibble: 100 × 4
##    Quarter    Sales AdBudget   GDP
##    <date>     <dbl>    <dbl> <dbl>
##  1 1981-03-01 1020.     659.  252.
##  2 1981-06-01  889.     589   291.
##  3 1981-09-01  795      512.  291.
##  4 1981-12-01 1004.     614.  292.
##  5 1982-03-01 1058.     647.  279.
##  6 1982-06-01  944.     602   254 
##  7 1982-09-01  778.     531.  296.
##  8 1982-12-01  932.     608.  272.
##  9 1983-03-01  996.     638.  260.
## 10 1983-06-01  908.     582.  280.
## # ℹ 90 more rows

# Convert the Quarter dates to yearquarter values and declare that column as
# the tsibble index.
mytimeseries <- as_tsibble(
  mutate(tute1, Quarter = yearquarter(Quarter)),
  index = Quarter
)

# Reshape to long form and draw each series in its own panel, letting every
# panel use its own y-axis range.
mytimeseries |>
  pivot_longer(cols = -Quarter, names_to = "name", values_to = "value") |>
  ggplot(mapping = aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(rows = vars(name), scales = "free_y")

Check what happens when you don’t include facet_grid().

# Same long-form plot without faceting: all series share one panel and axis.
mytimeseries |>
  pivot_longer(cols = -Quarter, names_to = "name", values_to = "value") |>
  ggplot(mapping = aes(x = Quarter, y = value, colour = name)) +
  geom_line()

Without facet grid all the values are plotted on the same graph. This visually skews the values and makes it harder to read and compare.

2.4

Create a tsibble from us_total with year as the index and state as the key.

Plot the annual natural gas consumption by state for the New England area (comprising the states of Maine, Vermont, New Hampshire, Massachusetts, Connecticut and Rhode Island).

# Browse the raw us_total data, then print per-column summary statistics.
us_total |> datatable()

us_total |> summary()

##       year         state                 y           
##  Min.   :1949   Length:1266        Min.   :       0  
##  1st Qu.:2002   Class :character   1st Qu.:  142810  
##  Median :2008   Mode  :character   Median :  285244  
##  Mean   :2007                      Mean   : 1528206  
##  3rd Qu.:2014                      3rd Qu.:  651525  
##  Max.   :2020                      Max.   :31099061

# Build a tsibble with year as the index and state as the key.
# as_tsibble() is required here: as_tibble() does not use the key/index
# arguments, so the result would be a plain tibble with no time-series
# structure.
us_total_1 <- us_total |>
  as_tsibble(key = state, index = year)

# Browse the converted us_total_1 object.
us_total_1 |> datatable()

2.5

Create a tsibble which is identical to the tourism tsibble from the tsibble package.

# Import the tourism workbook from the working directory and browse the raw
# rows before any conversion.
tourism <- readxl::read_excel(path = "tourism.xlsx")
tourism |> datatable()

## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Turn Quarter into a yearquarter index and key the series by Region, State
# and Purpose, then browse the resulting tsibble.
tourism <- as_tsibble(
  mutate(tourism, Quarter = yearquarter(Quarter)),
  key = c(Region, State, Purpose),
  index = Quarter
)
tourism |> datatable()

## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Browse the reference tourism tsibble shipped with the tsibble package, for
# comparison with the one built above.
tsibble::tourism |> datatable()

## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Find what combination of Region and Purpose had the maximum number of overnight trips on average.

# Find the Region/Purpose combination with the highest average Trips.
# Convert to a plain tibble first: summarise() on a tsibble keeps the Quarter
# index, which yields one value per quarter instead of a single average over
# the whole period for each Region/Purpose pair.
tourism |>
  as_tibble() |>
  group_by(Region, Purpose) |>
  summarise(avg_trips = mean(Trips), .groups = "drop") |>
  filter(avg_trips == max(avg_trips)) |>
  datatable()

Create a new tsibble which combines the Purposes and Regions, and just has total trips by State.

# Sum Trips within each State, collapsing Region and Purpose.
# NOTE(review): tourism is a tsibble at this point, so summarise() presumably
# still returns one total per State per Quarter — confirm that is the intent.
tourism |>
  group_by(State) |>
  summarise(TotalTrips = sum(Trips)) |> 
  ungroup() |>
  datatable()

2.8

Use the following graphics functions: autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF() and explore features from the following time series:

Can you spot any seasonality, cyclicity and trend? What do you learn about the series? What can you say about the seasonal patterns? Can you identify any unusual years?

“Total Private” `Employed` from `us_employment`

# Restrict us_employment to the "Total Private" series and browse it.
us_employment_1 <- filter(us_employment, Title == "Total Private")
us_employment_1 |> datatable()

# Time plot.
us_employment_1 |>
  autoplot(Employed)

# Seasonal plot.
gg_season(us_employment_1, Employed)

# Seasonal subseries plot.
gg_subseries(us_employment_1, Employed)

# Lag plot drawn with points.
gg_lag(us_employment_1, Employed, geom = "point")

# Autocorrelation function plot.
autoplot(ACF(us_employment_1, Employed))

This time series shows that the “Total Private” employment numbers have generally increased from 1940 to 2020. From the ACF graph, we can see that the values are highly correlated.

Seasonal patterns show that employment is low at the beginning of the year but increases as the year goes on. There are no obvious outliers in the series.

`Bricks` from `aus_production`

The Bricks variable is the clay brick production in millions of bricks.

# Time plot.
autoplot(aus_production, Bricks)

## Warning: Removed 20 rows containing missing values (`geom_line()`).

# Seasonal plot.
gg_season(aus_production, Bricks)

## Warning: Removed 20 rows containing missing values (`geom_line()`).

# Seasonal subseries plot.
gg_subseries(aus_production, Bricks)

## Warning: Removed 5 rows containing missing values (`geom_line()`).

# Lag plot drawn with points.
gg_lag(aus_production, Bricks, geom = "point")

## Warning: Removed 20 rows containing missing values (gg_lag).

# Autocorrelation function plot.
autoplot(ACF(aus_production, Bricks))

The above graphs show that Bricks production increased over the years and peaked in the 1980s. Additionally, across all years Bricks saw the highest production in Quarter 3, while Quarter 1 had the lowest. After 1980 there is a sharp decline in production. The autocorrelations for the small lags are large and positive because the data has a trend, so observations nearby in time are also nearby in value.

`Hare` from `pelt`

The Hare variable is the number of Snowshoe Hare pelts that are traded.

# Browse the pelt data.
pelt |> datatable()

# Time plot.
autoplot(pelt, Hare)

# Subseries plot with a descriptive title.
gg_subseries(pelt, Hare) +
  labs(title = "Number of Snowshoe Hare pelts Traded")

# Lag plot drawn with points.
gg_lag(pelt, Hare, geom = "point")

# Autocorrelation function plot.
autoplot(ACF(pelt, Hare))

The time series shows cyclic behavior. The lag plot does not show an obvious pattern. The autocorrelation plot, however, shows a negative autocorrelation.

“H02” `Cost` from `PBS`

# Keep only the PBS rows whose ATC2 code is "H02" and browse them.
cost <- filter(PBS, ATC2 == "H02")

cost |> datatable()

# Time plot.
autoplot(cost, Cost)

# Seasonal plot.
gg_season(cost, Cost)

# Seasonal subseries plot.
gg_subseries(cost, Cost)

# Autocorrelation function plot.
autoplot(ACF(cost, Cost))

Concessional Copayments peaked in March, April, and May and had the highest Cost values.

Concessional Safety net had a negative autocorrelation. There was a peak in Cost in the January months across the years, and then a surprising drop in February. After that, there was an increase in the Cost.

General Copayments had a steady average across each month in each year. General Safety net saw the same drop in Cost in February, followed by an increase to the end of the year. It also had a negative autocorrelation.

`Barrels` from `us_gasoline`

The Barrels variable is weekly data, measured in millions of barrels per day.

# Time plot.
autoplot(us_gasoline, Barrels)

# Seasonal plot.
gg_season(us_gasoline, Barrels)

# Lag plot drawn with points.
gg_lag(us_gasoline, Barrels, geom = "point")

# Autocorrelation function plot.
autoplot(ACF(us_gasoline, Barrels))

The Barrels time series has an upward trend. The autocorrelation shows higher values at the lower lags. The lag plots show a linear correlation.

Data 624 Homework 1

Moiya Josephs

2024-01-31

Ask

2.1

Aus Production - Bricks

Pelt - lynx

gafa-stock - Close

vic_elec - Demand

2.3

2.4

2.5

2.8

“Total Private” `Employed` from `us_employment`

`Bricks` from `aus_production`

`Hare` from `pelt`

“H02” `Cost` from `PBS`

`Barrels` from `us_gasoline`

Data 624 Homework 1

Moiya Josephs

2024-01-31

Ask

2.1

Aus Production - Bricks

Pelt - lynx

gafa-stock - Close

vic_elec - Demand

2.3

2.4

2.5

2.8

“Total Private” Employed from us_employment

Bricks from aus_production

Hare from pelt

“H02” Cost from PBS

Barrels from us_gasoline

“Total Private” `Employed` from `us_employment`

`Bricks` from `aus_production`

`Hare` from `pelt`

“H02” `Cost` from `PBS`

`Barrels` from `us_gasoline`