Exercises FPP3, section 2.10

Q1:

?pelt
#Annual Hudson Bay Company trading records for Snowshoe Hare and Canadian Lynx pelts, 1845-1935
?aus_production
#Quarterly estimates of selected Australian production indicators, from 1956
?gafa_stock
#Daily stock prices (trading days only, so an irregular interval) for Google, Amazon, Facebook, and Apple, 2014-2018
?vic_elec
#Half-hourly electricity demand (MWh) for Victoria, Australia, 2012-2014

#Creating tsibble objects for each dataset, keeping only the variables of interest

lynx_ts <- pelt |>
  select(Year, Lynx) |>
  as_tsibble(index = Year)

lynx_ts
## # A tsibble: 91 x 2 [1Y]
##     Year  Lynx
##    <dbl> <dbl>
##  1  1845 30090
##  2  1846 45150
##  3  1847 49150
##  4  1848 39520
##  5  1849 21230
##  6  1850  8420
##  7  1851  5560
##  8  1852  5080
##  9  1853 10170
## 10  1854 19600
## # ℹ 81 more rows
bricks_ts <- aus_production |>
  select(Quarter, Bricks) |>
  as_tsibble(index = Quarter)

bricks_ts
## # A tsibble: 218 x 2 [1Q]
##    Quarter Bricks
##      <qtr>  <dbl>
##  1 1956 Q1    189
##  2 1956 Q2    204
##  3 1956 Q3    208
##  4 1956 Q4    197
##  5 1957 Q1    187
##  6 1957 Q2    214
##  7 1957 Q3    227
##  8 1957 Q4    222
##  9 1958 Q1    199
## 10 1958 Q2    229
## # ℹ 208 more rows
stockclose_ts <- gafa_stock |>
  select(Symbol, Date, Close) |>
  as_tsibble(key = Symbol, index = Date)
stockclose_ts
## # A tsibble: 5,032 x 3 [!]
## # Key:       Symbol [4]
##    Symbol Date       Close
##    <chr>  <date>     <dbl>
##  1 AAPL   2014-01-02  79.0
##  2 AAPL   2014-01-03  77.3
##  3 AAPL   2014-01-06  77.7
##  4 AAPL   2014-01-07  77.1
##  5 AAPL   2014-01-08  77.6
##  6 AAPL   2014-01-09  76.6
##  7 AAPL   2014-01-10  76.1
##  8 AAPL   2014-01-13  76.5
##  9 AAPL   2014-01-14  78.1
## 10 AAPL   2014-01-15  79.6
## # ℹ 5,022 more rows
demand_ts <- vic_elec |>
  select(Time, Demand) |>
  as_tsibble(index = Time)

demand_ts  
## # A tsibble: 52,608 x 2 [30m] <Australia/Melbourne>
##    Time                Demand
##    <dttm>               <dbl>
##  1 2012-01-01 00:00:00  4383.
##  2 2012-01-01 00:30:00  4263.
##  3 2012-01-01 01:00:00  4049.
##  4 2012-01-01 01:30:00  3878.
##  5 2012-01-01 02:00:00  4036.
##  6 2012-01-01 02:30:00  3866.
##  7 2012-01-01 03:00:00  3694.
##  8 2012-01-01 03:30:00  3562.
##  9 2012-01-01 04:00:00  3433.
## 10 2012-01-01 04:30:00  3359.
## # ℹ 52,598 more rows
autoplot(lynx_ts, Lynx)

autoplot(bricks_ts, Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

autoplot(stockclose_ts, Close)

#Note the + joining labs() to the plot; without it, labs() is evaluated on
#its own and just prints a list of labels instead of labelling the chart
autoplot(demand_ts, Demand) +
  labs(
    title = "Electricity Demand",
    x = "Time (30-minute intervals)",
    y = "Demand (MWh)"
  )

Q2:

#Initial test on a single ticker, kept for reference; the group_by() version below is more concise
#stockclose_ts |>
#  filter(Symbol == "GOOG")|>
#  filter(Close == max(Close)) |>
#  select(Symbol, Date, Close)

#Peak closing price, and the date it occurred, for each ticker
max_stock_pricegafa <- stockclose_ts |>
  group_by(Symbol) |>
  filter(Close == max(Close)) |>
  select(Symbol, Date, Close)
max_stock_pricegafa
## # A tsibble: 4 x 3 [!]
## # Key:       Symbol [4]
## # Groups:    Symbol [4]
##   Symbol Date       Close
##   <chr>  <date>     <dbl>
## 1 AAPL   2018-10-03  232.
## 2 AMZN   2018-09-04 2040.
## 3 FB     2018-07-25  218.
## 4 GOOG   2018-07-26 1268.
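
#An equivalent sketch: slice_max() pulls the top closing price per key group
#directly, avoiding the filter() comparison against max()
stockclose_ts |>
  group_by(Symbol) |>
  slice_max(Close, n = 1) |>
  ungroup()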

Q3:

#Data Download
tute1 <- readr::read_csv("tute1.csv")
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (3): Sales, AdBudget, GDP
## date (1): Quarter
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Convert Quarter to a yearquarter index and build the tsibble
tute1_ts <- tute1 |>
  mutate(Quarter = yearquarter(Quarter)) |>
  as_tsibble(index = Quarter)
#Plot all three series, one facet per column
tute1_ts |>
  pivot_longer(-Quarter) |>
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")

#Same plot without facet_grid(): all series on one chart and one scale
tute1_ts |>
  pivot_longer(-Quarter) |>
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() 

#Faceting splits the series out of a single chart into one panel per column, and scales = "free_y" gives each panel its own y-axis. Without facets the series share one scale, which can mislead when one column moves on a very different scale than the others.
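
#For comparison, a sketch of the same facets without free scales: all three
#panels share one y-axis, which hides movement in the smaller series
tute1_ts |>
  pivot_longer(-Quarter) |>
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ .)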

Q4:

#install.packages("USgas")
library(USgas)

usgas_consumption <- us_total |>
  select(state, year, y) |>
  as_tsibble(key = state, index = year)
usgas_consumption
## # A tsibble: 1,266 x 3 [1Y]
## # Key:       state [53]
##    state    year      y
##    <chr>   <int>  <int>
##  1 Alabama  1997 324158
##  2 Alabama  1998 329134
##  3 Alabama  1999 337270
##  4 Alabama  2000 353614
##  5 Alabama  2001 332693
##  6 Alabama  2002 379343
##  7 Alabama  2003 350345
##  8 Alabama  2004 382367
##  9 Alabama  2005 353156
## 10 Alabama  2006 391093
## # ℹ 1,256 more rows
northeast_ts <- usgas_consumption |>
  filter(state %in% c("Maine", "Vermont", "New Hampshire",
                      "Massachusetts", "Connecticut", "Rhode Island"))

northeast_ts |>
  ggplot(aes(x = year, y = y)) +
  geom_line() +
  facet_grid(state ~ ., scales = "free_y") +
  labs(
    title = "Annual Natural Gas Consumption in New England States",
    x = "Year",
    y = "Annual Consumption (Million Cubic Feet)"
  )
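
#Since northeast_ts is a keyed tsibble, autoplot() gives a quick sketch of the
#same data: one coloured line per state on a shared y scale
northeast_ts |>
  autoplot(y)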

Q5:

#Download the tourism .xlsx to a temporary file and read it with readxl. download.file() needs mode = "wb" so the binary .xlsx is not transformed in transit; the default text mode corrupted the file on my Windows work computer but not on my Mac, so I kept mode = "wb" to stay portable across OSes.

tourismdata_file <- tempfile(fileext = ".xlsx")
download.file("http://robjhyndman.com/data/tourism.xlsx", tourismdata_file, mode = "wb") 

tourism_rawdata <- read_excel(tourismdata_file)

tourism_rawdata
## # A tibble: 24,320 × 5
##    Quarter    Region   State           Purpose  Trips
##    <chr>      <chr>    <chr>           <chr>    <dbl>
##  1 1998-01-01 Adelaide South Australia Business  135.
##  2 1998-04-01 Adelaide South Australia Business  110.
##  3 1998-07-01 Adelaide South Australia Business  166.
##  4 1998-10-01 Adelaide South Australia Business  127.
##  5 1999-01-01 Adelaide South Australia Business  137.
##  6 1999-04-01 Adelaide South Australia Business  200.
##  7 1999-07-01 Adelaide South Australia Business  169.
##  8 1999-10-01 Adelaide South Australia Business  134.
##  9 2000-01-01 Adelaide South Australia Business  154.
## 10 2000-04-01 Adelaide South Australia Business  169.
## # ℹ 24,310 more rows
#Convert to a tsibble matching the tourism object in the book
maxovernight_tourismdata <- tourism_rawdata |>
  mutate(Quarter = yearquarter(Quarter)) |>
  as_tsibble(index = Quarter, key = c(Region, State, Purpose))

maxovernight_tourismdata
## # A tsibble: 24,320 x 5 [1Q]
## # Key:       Region, State, Purpose [304]
##    Quarter Region   State           Purpose  Trips
##      <qtr> <chr>    <chr>           <chr>    <dbl>
##  1 1998 Q1 Adelaide South Australia Business  135.
##  2 1998 Q2 Adelaide South Australia Business  110.
##  3 1998 Q3 Adelaide South Australia Business  166.
##  4 1998 Q4 Adelaide South Australia Business  127.
##  5 1999 Q1 Adelaide South Australia Business  137.
##  6 1999 Q2 Adelaide South Australia Business  200.
##  7 1999 Q3 Adelaide South Australia Business  169.
##  8 1999 Q4 Adelaide South Australia Business  134.
##  9 2000 Q1 Adelaide South Australia Business  154.
## 10 2000 Q2 Adelaide South Australia Business  169.
## # ℹ 24,310 more rows
#Initially I did not use .groups = "drop", which implicitly kept every Region/Purpose grouping, so the resulting table was far too large for its purpose. I also replaced filter(avg_trips == max(avg_trips)) with slice_max(avg_trips, n = 1) to return just the maximum combination.
max_region_purpose <- maxovernight_tourismdata |>
  group_by(Region, Purpose) |>
  summarise(avg_trips = mean(Trips), .groups = "drop") |>
  slice_max(avg_trips, n = 1)

max_region_purpose
## # A tsibble: 1 x 4 [1Q]
## # Key:       Region, Purpose [1]
##   Region    Purpose  Quarter avg_trips
##   <chr>     <chr>      <qtr>     <dbl>
## 1 Melbourne Visiting 2017 Q4      985.
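#Because maxovernight_tourismdata is a tsibble, summarise() above keeps the
#Quarter index, so avg_trips is computed per quarter and slice_max() returns
#the single busiest quarter. A sketch of the other reading of the question,
#averaging each Region/Purpose combo over the whole series (this may change
#the winning combination):
maxovernight_tourismdata |>
  as_tibble() |>
  group_by(Region, Purpose) |>
  summarise(avg_trips = mean(Trips), .groups = "drop") |>
  slice_max(avg_trips, n = 1)
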
state_totals <- maxovernight_tourismdata |>
  group_by(State) |>
  index_by(Quarter)|>
  summarise(Trips = sum(Trips, na.rm = TRUE), .groups = "drop") 

state_totals
## # A tsibble: 640 x 3 [1Q]
## # Key:       State [8]
##    State Quarter Trips
##    <chr>   <qtr> <dbl>
##  1 ACT   1998 Q1  551.
##  2 ACT   1998 Q2  416.
##  3 ACT   1998 Q3  436.
##  4 ACT   1998 Q4  450.
##  5 ACT   1999 Q1  379.
##  6 ACT   1999 Q2  558.
##  7 ACT   1999 Q3  449.
##  8 ACT   1999 Q4  595.
##  9 ACT   2000 Q1  600.
## 10 ACT   2000 Q2  557.
## # ℹ 630 more rows
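#A quick sketch plotting the new tsibble: autoplot() draws one line per state
state_totals |>
  autoplot(Trips)
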
#Additional table showing each state's maximum quarterly total and the quarter it occurred in. Done out of curiosity, but it feels like the logical next question to answer.
statetotals_max <- state_totals |>
  group_by(State) |>
  slice_max(Trips, n = 1, with_ties = TRUE) |>
  ungroup()

statetotals_max
## # A tsibble: 8 x 3 [1Q]
## # Key:       State [8]
##   State              Quarter Trips
##   <chr>                <qtr> <dbl>
## 1 ACT                2017 Q2  748.
## 2 New South Wales    2017 Q4 8542.
## 3 Northern Territory 2016 Q3  714.
## 4 Queensland         2017 Q3 6534.
## 5 South Australia    2016 Q1 1871.
## 6 Tasmania           2017 Q1 1135.
## 7 Victoria           2017 Q1 7270.
## 8 Western Australia  2016 Q1 2871.

Q8:

#us_employment: plots of Employed for the "Total Private" series
head(us_employment, n=10)
## # A tsibble: 10 x 4 [1M]
## # Key:       Series_ID [1]
##       Month Series_ID     Title         Employed
##       <mth> <chr>         <chr>            <dbl>
##  1 1939 Jan CEU0500000001 Total Private    25338
##  2 1939 Feb CEU0500000001 Total Private    25447
##  3 1939 Mar CEU0500000001 Total Private    25833
##  4 1939 Apr CEU0500000001 Total Private    25801
##  5 1939 May CEU0500000001 Total Private    26113
##  6 1939 Jun CEU0500000001 Total Private    26485
##  7 1939 Jul CEU0500000001 Total Private    26481
##  8 1939 Aug CEU0500000001 Total Private    26848
##  9 1939 Sep CEU0500000001 Total Private    27468
## 10 1939 Oct CEU0500000001 Total Private    27830
us_employment |>
  filter(Title == "Total Private") |>
  autoplot(Employed)

us_employment |>
  filter(Title == "Total Private") |>
  gg_season(Employed) + geom_point(alpha = 0.5, size = 1)
## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

us_employment |>
  filter(Title == "Total Private") |>
  gg_subseries(Employed)
## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

us_employment |>
  filter(Title == "Total Private") |>
  ACF(Employed)
## # A tsibble: 29 x 3 [1M]
## # Key:       Series_ID [1]
##    Series_ID          lag   acf
##    <chr>         <cf_lag> <dbl>
##  1 CEU0500000001       1M 0.997
##  2 CEU0500000001       2M 0.993
##  3 CEU0500000001       3M 0.990
##  4 CEU0500000001       4M 0.986
##  5 CEU0500000001       5M 0.983
##  6 CEU0500000001       6M 0.980
##  7 CEU0500000001       7M 0.977
##  8 CEU0500000001       8M 0.974
##  9 CEU0500000001       9M 0.971
## 10 CEU0500000001      10M 0.968
## # ℹ 19 more rows
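#The slow, near-linear decay is easier to see in a correlogram (a sketch):
us_employment |>
  filter(Title == "Total Private") |>
  ACF(Employed) |>
  autoplot()
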
#bricks from aus_production
head(bricks_ts, n=10)
## # A tsibble: 10 x 2 [1Q]
##    Quarter Bricks
##      <qtr>  <dbl>
##  1 1956 Q1    189
##  2 1956 Q2    204
##  3 1956 Q3    208
##  4 1956 Q4    197
##  5 1957 Q1    187
##  6 1957 Q2    214
##  7 1957 Q3    227
##  8 1957 Q4    222
##  9 1958 Q1    199
## 10 1958 Q2    229
bricks_ts |>
  autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

gg_season(bricks_ts, Bricks) +
  geom_point(alpha = 0.5, size = 1)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).

bricks_ts |>
  gg_subseries(Bricks)
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).

bricks_ts |>
  ACF(Bricks)
## # A tsibble: 22 x 2 [1Q]
##         lag   acf
##    <cf_lag> <dbl>
##  1       1Q 0.900
##  2       2Q 0.815
##  3       3Q 0.813
##  4       4Q 0.828
##  5       5Q 0.720
##  6       6Q 0.642
##  7       7Q 0.655
##  8       8Q 0.692
##  9       9Q 0.609
## 10      10Q 0.556
## # ℹ 12 more rows
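#A correlogram sketch shows both the trend decay and the seasonal bumps at
#lags 4, 8, ... visible in the table above
bricks_ts |>
  ACF(Bricks) |>
  autoplot()
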
#The bricks series shows obvious seasonality: Q1 is consistently the lowest quarter and Q3 the highest across all observable years. The subseries plot confirms this, with each quarter tracing a very similar pattern, just shifted up or down on the y-axis. Beyond the seasonality, the trend splits around 1980: a clear upward trend going into that period reverses abruptly over the second half of the series, which implies some event around or slightly before 1980. There is a visible recovery around 1990, but the overall trend continues downward.
#hare from pelt

hare_ts <- pelt |>
  select(Year, Hare) |>
  as_tsibble(index = Year)

head(hare_ts, n=10)
## # A tsibble: 10 x 2 [1Y]
##     Year  Hare
##    <dbl> <dbl>
##  1  1845 19580
##  2  1846 19600
##  3  1847 19610
##  4  1848 11990
##  5  1849 28040
##  6  1850 58000
##  7  1851 74600
##  8  1852 75090
##  9  1853 88480
## 10  1854 61280
autoplot(hare_ts, Hare)

hare_ts |> 
  gg_lag(Hare, lags = 1:9)
## Warning: `gg_lag()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_lag()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

hare_ts |> 
  ACF(Hare) 
## # A tsibble: 19 x 2 [1Y]
##         lag     acf
##    <cf_lag>   <dbl>
##  1       1Y  0.658 
##  2       2Y  0.214 
##  3       3Y -0.155 
##  4       4Y -0.401 
##  5       5Y -0.493 
##  6       6Y -0.401 
##  7       7Y -0.168 
##  8       8Y  0.113 
##  9       9Y  0.307 
## 10      10Y  0.340 
## 11      11Y  0.296 
## 12      12Y  0.206 
## 13      13Y  0.0372
## 14      14Y -0.153 
## 15      15Y -0.285 
## 16      16Y -0.295 
## 17      17Y -0.202 
## 18      18Y -0.0676
## 19      19Y  0.0956
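#Sketch: plotting the ACF makes the roughly 10-year cycle in the coefficients
#above easier to see
hare_ts |>
  ACF(Hare) |>
  autoplot()
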
#Seasonality cannot be assessed here because the data are annual. The lag plots look odd, but that is likely an artifact of connecting points across multi-year cycles. The ACF is telling: lag 1 shows the strongest relationship, while longer lags show only weakened linear correlation despite a persistent pattern. This makes me wonder whether prediction would be better served by a multivariable function of several non-linear lagged terms rather than the simple linear coefficients from the ACF. There is clearly a more complex cyclical interaction here, not easily captured by a linear relationship, perhaps driven by biological limiting factors that cap the absolute population.
#PBS dataset filtered to ATC2 == "H02" (corticosteroids for systemic use)
h02_ts <- PBS |>
  filter(ATC2 == "H02")

head(h02_ts)
## # A tsibble: 6 x 9 [1M]
## # Key:       Concession, Type, ATC1, ATC2 [1]
##      Month Concession   Type      ATC1  ATC1_desc ATC2  ATC2_desc Scripts   Cost
##      <mth> <chr>        <chr>     <chr> <chr>     <chr> <chr>       <dbl>  <dbl>
## 1 1991 Jul Concessional Co-payme… H     Systemic… H02   CORTICOS…   63261 317384
## 2 1991 Aug Concessional Co-payme… H     Systemic… H02   CORTICOS…   53528 269891
## 3 1991 Sep Concessional Co-payme… H     Systemic… H02   CORTICOS…   52822 269703
## 4 1991 Oct Concessional Co-payme… H     Systemic… H02   CORTICOS…   54016 280418
## 5 1991 Nov Concessional Co-payme… H     Systemic… H02   CORTICOS…   49281 268070
## 6 1991 Dec Concessional Co-payme… H     Systemic… H02   CORTICOS…   51798 277139
autoplot(h02_ts, Cost)

h02_ts |> 
  gg_season(Cost)

h02_ts |> 
  gg_subseries(Cost)

#Sum Cost across the four Concession/Type series before the lag plot
h02_ts |>
  index_by(Month) |>
  summarise(Cost = sum(Cost, na.rm = TRUE)) |>
  gg_lag(Cost)

h02_ts |> 
  ACF(Cost) 
## # A tsibble: 92 x 6 [1M]
## # Key:       Concession, Type, ATC1, ATC2 [4]
##    Concession   Type        ATC1  ATC2       lag   acf
##    <chr>        <chr>       <chr> <chr> <cf_lag> <dbl>
##  1 Concessional Co-payments H     H02         1M 0.834
##  2 Concessional Co-payments H     H02         2M 0.679
##  3 Concessional Co-payments H     H02         3M 0.514
##  4 Concessional Co-payments H     H02         4M 0.352
##  5 Concessional Co-payments H     H02         5M 0.264
##  6 Concessional Co-payments H     H02         6M 0.219
##  7 Concessional Co-payments H     H02         7M 0.253
##  8 Concessional Co-payments H     H02         8M 0.337
##  9 Concessional Co-payments H     H02         9M 0.464
## 10 Concessional Co-payments H     H02        10M 0.574
## # ℹ 82 more rows
h02_ts |> 
  ACF(Cost)|>
  autoplot()

#This dataset shows clear seasonality, especially in the concessional safety net and general safety net panels of the seasonal plot. January is the highest month in every year, with an immediate drop followed by a slow build-up in cost. Short-term lags are very strong, drop off, and then re-emerge in the ACF values, peaking again around 12 months, which suggests a yearly cycle; as with the previous dataset, I suspect a multivariable relationship would give stronger predictions. There are also multiple cost series, which I summed for the lag plot to make it clearer, and the roughly 12-month cycle is where I believe the correlation is strongest.
#Barrels from US gasoline dataset
head(us_gasoline, n=12)
## # A tsibble: 12 x 2 [1W]
##        Week Barrels
##      <week>   <dbl>
##  1 1991 W06    6.62
##  2 1991 W07    6.43
##  3 1991 W08    6.58
##  4 1991 W09    7.22
##  5 1991 W10    6.88
##  6 1991 W11    6.95
##  7 1991 W12    7.33
##  8 1991 W13    6.78
##  9 1991 W14    7.50
## 10 1991 W15    6.92
## 11 1991 W16    7.04
## 12 1991 W17    6.96
us_gasoline |>
  autoplot(Barrels)

us_gasoline |>
  gg_season(Barrels)

us_gasoline |> 
  gg_subseries(Barrels)

us_gasoline |> 
  gg_lag(Barrels)

us_gasoline |>
  ACF(Barrels)
## # A tsibble: 31 x 2 [1W]
##         lag   acf
##    <cf_lag> <dbl>
##  1       1W 0.893
##  2       2W 0.882
##  3       3W 0.873
##  4       4W 0.866
##  5       5W 0.847
##  6       6W 0.844
##  7       7W 0.832
##  8       8W 0.831
##  9       9W 0.822
## 10      10W 0.808
## # ℹ 21 more rows
us_gasoline |>
  ACF(Barrels) |>
  autoplot()

#The barrels series shows a strong upward trend into the early 2000s, followed by a stark flattening. With weekly data there is a lot of short-term movement, but the seasonality still shows through, with higher values in spring and summer than in winter. There is a strong dip around 2008, likely correlated with the financial crisis. The lag plots show an almost linear relationship, which is interesting; even at lag 9 we do not see the looping pattern the hare plots showed, still suggesting trend-driven non-stationarity. The ACF plot and table confirm this with a consistent, slow downward decay.
