DATA624_HW1_CameronGray

Author

Cameron Gray

library(fpp3)
library(tidyverse)
library(USgas)
library(readxl)

DATA624: Homework 1

2.1

aus_production: 1956 Q1 to 2010 Q2

pelt: 1845 to 1935

gafa_stock: 2014-01-02 to 2018-12-31

vic_elec: 2012-01-01 AEDT to 2014-12-31 23:30:00 AEDT

min(aus_production$Quarter)
<yearquarter[1]>
[1] "1956 Q1"
# Year starts on: January
max(aus_production$Quarter)
<yearquarter[1]>
[1] "2010 Q2"
# Year starts on: January
min(pelt$Year)
[1] 1845
max(pelt$Year)
[1] 1935
min(gafa_stock$Date)
[1] "2014-01-02"
max(gafa_stock$Date)
[1] "2018-12-31"
min(vic_elec$Time)
[1] "2012-01-01 AEDT"
max(vic_elec$Time)
[1] "2014-12-31 23:30:00 AEDT"
aus_production[,c('Quarter', 'Bricks')] |>
  autoplot()

pelt[,c('Year', 'Lynx')] |>
  autoplot()

gafa_stock |>
  autoplot(vars(Close))

vic_elec[,c('Time', 'Demand')] |>
  autoplot() +
  labs(title='Electricity Demand', subtitle = '(Victoria, Australia)') +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  xlab('Time (30 min)') +
  ylab('Electricity Demand (MWh)')

2.2

gafa_stock |> filter(Close == max(Close, na.rm=TRUE), .by = Symbol)
# A tsibble: 4 x 8 [!]
# Key:       Symbol [4]
  Symbol Date        Open  High   Low Close Adj_Close   Volume
  <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
1 AAPL   2018-10-03  230.  233.  230.  232.      230. 28654800
2 AMZN   2018-09-04 2026. 2050. 2013  2040.     2040.  5721100
3 FB     2018-07-25  216.  219.  214.  218.      218. 58954200
4 GOOG   2018-07-26 1251  1270. 1249. 1268.     1268.  2405600

2.3

tute1 <- readr::read_csv("tute1.csv")
# View(tute1)

mytimeseries <- tute1 |>
  mutate(Quarter = yearquarter(Quarter)) |>
  as_tsibble(index = Quarter)

mytimeseries |>
  pivot_longer(-Quarter) |>
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")

mytimeseries |>
  pivot_longer(-Quarter) |>
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line()

  # facet_grid(name ~ ., scales = "free_y")

2.4

ts_us_total <- tsibble(us_total, index = year, key = state)

print(ts_us_total)
# A tsibble: 1,266 x 3 [1Y]
# Key:       state [53]
    year state        y
   <int> <chr>    <int>
 1  1997 Alabama 324158
 2  1998 Alabama 329134
 3  1999 Alabama 337270
 4  2000 Alabama 353614
 5  2001 Alabama 332693
 6  2002 Alabama 379343
 7  2003 Alabama 350345
 8  2004 Alabama 382367
 9  2005 Alabama 353156
10  2006 Alabama 391093
# ℹ 1,256 more rows
ts_us_total |>
  filter(state %in% c("Maine", "Vermont", "New Hampshire", "Massachusetts", 
                      "Connectivut", "Rhode Island")) |>
  ggplot(aes(x=year, y=y, color = state)) +
  geom_line() +
  facet_wrap('state', scales = "free_y")

2.5

The Region and Purpose with the highest average of trips is Sydney/Visiting.

tourism_excel <- read_excel('tourism.xlsx') 

ts_tourism_excel <- tourism_excel |>
  mutate(Quarter = yearquarter(Quarter)) |>
  as_tsibble(index = Quarter, key = c(Region, State, Purpose)) |>
  print()
# A tsibble: 24,320 x 5 [1Q]
# Key:       Region, State, Purpose [304]
   Quarter Region   State           Purpose  Trips
     <qtr> <chr>    <chr>           <chr>    <dbl>
 1 1998 Q1 Adelaide South Australia Business  135.
 2 1998 Q2 Adelaide South Australia Business  110.
 3 1998 Q3 Adelaide South Australia Business  166.
 4 1998 Q4 Adelaide South Australia Business  127.
 5 1999 Q1 Adelaide South Australia Business  137.
 6 1999 Q2 Adelaide South Australia Business  200.
 7 1999 Q3 Adelaide South Australia Business  169.
 8 1999 Q4 Adelaide South Australia Business  134.
 9 2000 Q1 Adelaide South Australia Business  154.
10 2000 Q2 Adelaide South Australia Business  169.
# ℹ 24,310 more rows
tourism_tsibble <- tsibble::tourism |>
  print()
# A tsibble: 24,320 x 5 [1Q]
# Key:       Region, State, Purpose [304]
   Quarter Region   State           Purpose  Trips
     <qtr> <chr>    <chr>           <chr>    <dbl>
 1 1998 Q1 Adelaide South Australia Business  135.
 2 1998 Q2 Adelaide South Australia Business  110.
 3 1998 Q3 Adelaide South Australia Business  166.
 4 1998 Q4 Adelaide South Australia Business  127.
 5 1999 Q1 Adelaide South Australia Business  137.
 6 1999 Q2 Adelaide South Australia Business  200.
 7 1999 Q3 Adelaide South Australia Business  169.
 8 1999 Q4 Adelaide South Australia Business  134.
 9 2000 Q1 Adelaide South Australia Business  154.
10 2000 Q2 Adelaide South Australia Business  169.
# ℹ 24,310 more rows
tourism_excel |>
  group_by(Region, Purpose) |>
  summarise(mean = mean(Trips)) |>
  arrange(desc(mean)) |>
  # head(1) |>
  print()
# A tibble: 304 × 3
# Groups:   Region [76]
   Region          Purpose   mean
   <chr>           <chr>    <dbl>
 1 Sydney          Visiting  747.
 2 Melbourne       Visiting  619.
 3 Sydney          Business  602.
 4 North Coast NSW Holiday   588.
 5 Sydney          Holiday   550.
 6 Gold Coast      Holiday   528.
 7 Melbourne       Holiday   507.
 8 South Coast     Holiday   495.
 9 Brisbane        Visiting  493.
10 Melbourne       Business  478.
# ℹ 294 more rows
ts_tourism_excel |>
  group_by(State) |>
  summarise(Trips = sum(Trips)) |>
  print()
# A tsibble: 640 x 3 [1Q]
# Key:       State [8]
   State Quarter Trips
   <chr>   <qtr> <dbl>
 1 ACT   1998 Q1  551.
 2 ACT   1998 Q2  416.
 3 ACT   1998 Q3  436.
 4 ACT   1998 Q4  450.
 5 ACT   1999 Q1  379.
 6 ACT   1999 Q2  558.
 7 ACT   1999 Q3  449.
 8 ACT   1999 Q4  595.
 9 ACT   2000 Q1  600.
10 ACT   2000 Q2  557.
# ℹ 630 more rows

2.8

“Total Private” Employed from us_employment

The most applicable trend in this data can be found in the autoplot graph where it is clear that the trend is in the upward growth and not really in the seasonality or cyclicity. Based on all the other graphs there is barely any other cyclic trends. The gg_season plot has very similar lines for each year, gg_subseries the plots are almost identical month to month, gg_lag all the lags appear the same and the ACF has a very small downward trend. In general none of these have a noticeable trend which makes sense for employment since that tends to stay pretty consistent throughout the year with seasonal occupations canceling each other out. There don’t seem to be any unusual years in the data apart from the occasion dip in growth but that’s how employment tends to work, it waxes and wanes over time.

total_private <- us_employment |>
  filter(Title == "Total Private") |>
  print()
# A tsibble: 969 x 4 [1M]
# Key:       Series_ID [1]
      Month Series_ID     Title         Employed
      <mth> <chr>         <chr>            <dbl>
 1 1939 Jan CEU0500000001 Total Private    25338
 2 1939 Feb CEU0500000001 Total Private    25447
 3 1939 Mar CEU0500000001 Total Private    25833
 4 1939 Apr CEU0500000001 Total Private    25801
 5 1939 May CEU0500000001 Total Private    26113
 6 1939 Jun CEU0500000001 Total Private    26485
 7 1939 Jul CEU0500000001 Total Private    26481
 8 1939 Aug CEU0500000001 Total Private    26848
 9 1939 Sep CEU0500000001 Total Private    27468
10 1939 Oct CEU0500000001 Total Private    27830
# ℹ 959 more rows
total_private |>
  autoplot()

total_private |>
  gg_season()

total_private |>
  gg_subseries()

total_private |>
  gg_lag(geom = 'point')

total_private |>
  ACF(Employed) |>
  autoplot()

Bricks from aus_production

While there was a general increase in the number of bricks produced there is also noticeable cyclic and seasonal trends. You can see a bulge toward the middle of the month in the gg_season plot which indicates an increase in brick production for Q2 and Q3. The gg_subseries plot clearly shows how much more is produced in Q2 and Q3 while also showing that from year to year the pattern stays largely the same since they all tend to have the same shape. The ACF graph shows that there is indeed a cycle to it as the graph tends downward with a cyclic pattern as the lag is increased. There don’t seem to be any years that stand out other than the growth period at the beginning but after that the cycle seemed pretty stable with a bit of growth and decline over the years. There seemed to be a peak of production around 1980 but then a more stable amount produced before and after.

bricks <- aus_production |>
  select(Quarter, Bricks) |>
  print()
# A tsibble: 218 x 2 [1Q]
   Quarter Bricks
     <qtr>  <dbl>
 1 1956 Q1    189
 2 1956 Q2    204
 3 1956 Q3    208
 4 1956 Q4    197
 5 1957 Q1    187
 6 1957 Q2    214
 7 1957 Q3    227
 8 1957 Q4    222
 9 1958 Q1    199
10 1958 Q2    229
# ℹ 208 more rows
bricks |>
  autoplot()

bricks |>
  gg_season()

bricks |>
  gg_subseries()

bricks |>
  gg_lag(geom = 'point')

bricks |>
  ACF(Bricks) |>
  autoplot()

Hare from pelt

Since this data is yearly data there isn’t as much to discern seasonally since it isn’t granular enough to get anything seasonal out of it. But there does seem to be a weirdly cyclic trend to the data despite this. gg-season and gg_subseries as mentioned before don’t work with this data since the data is annual so their graphs either won’t generate or don’t provide any useful information. ACF on the other hand shows a very clear picture with it’s almost sine wave pattern. The data from year to year is extremely cyclic with a clear pattern, this most likely has an underlying cause that would require more insight into the data set to understand. Maybe there is a cyclic pattern to rabbit populations or prices for pelts changed or maybe there were legal years to hunt for pelts and others were not, there is most likely something driving this pattern but hard to discern from the data provided alone. There are a couple of abnormal peaks in ~1864 and ~1885 but other than that the data seems to follow the trend.

hare <- pelt |>
  select(Year, Hare) |>
  print()
# A tsibble: 91 x 2 [1Y]
    Year  Hare
   <dbl> <dbl>
 1  1845 19580
 2  1846 19600
 3  1847 19610
 4  1848 11990
 5  1849 28040
 6  1850 58000
 7  1851 74600
 8  1852 75090
 9  1853 88480
10  1854 61280
# ℹ 81 more rows
hare |>
  autoplot()

# gg_season() doesn't work with this data set because it is yearly data
# hare |>
#   gg_season()
hare |>
  gg_subseries()

hare |>
  gg_lag(geom = 'point')

hare |>
  ACF(Hare) |>
  autoplot()

“H02” Cost from PBS

This data is very interesting. Generally from year to year there is either an upward or downward trend by type but it is very consistent but the cycles are extremely consistent by type. This is well illustrated by the gg_season, gg_subseries, and the ACF plots. In the gg_season and gg_subseries plot you can clearly see that there is a seasonality to the data, especially in the “safety net” data where it increases throughout the year (likely due to people hitting their co-payment threshold as described in the data help). The “concessional co-payments” seem to follow the inverse of these by starting low increasing toward the middle and then decreasing again back down toward the end of the year. The ACF graph clearly shows that this is highly cyclic with it’s sine pattern for the types described previously. There don’t seem to be any strange years in this data.

cost <- PBS |>
  filter(ATC2 == "H02") |>
  print()
# A tsibble: 816 x 9 [1M]
# Key:       Concession, Type, ATC1, ATC2 [4]
      Month Concession   Type     ATC1  ATC1_desc ATC2  ATC2_desc Scripts   Cost
      <mth> <chr>        <chr>    <chr> <chr>     <chr> <chr>       <dbl>  <dbl>
 1 1991 Jul Concessional Co-paym… H     Systemic… H02   CORTICOS…   63261 317384
 2 1991 Aug Concessional Co-paym… H     Systemic… H02   CORTICOS…   53528 269891
 3 1991 Sep Concessional Co-paym… H     Systemic… H02   CORTICOS…   52822 269703
 4 1991 Oct Concessional Co-paym… H     Systemic… H02   CORTICOS…   54016 280418
 5 1991 Nov Concessional Co-paym… H     Systemic… H02   CORTICOS…   49281 268070
 6 1991 Dec Concessional Co-paym… H     Systemic… H02   CORTICOS…   51798 277139
 7 1992 Jan Concessional Co-paym… H     Systemic… H02   CORTICOS…   42436 221772
 8 1992 Feb Concessional Co-paym… H     Systemic… H02   CORTICOS…   52913 272345
 9 1992 Mar Concessional Co-paym… H     Systemic… H02   CORTICOS…   62908 325700
10 1992 Apr Concessional Co-paym… H     Systemic… H02   CORTICOS…   68499 349271
# ℹ 806 more rows
cost |>
  autoplot()

cost |>
  gg_season()

cost |>
  gg_subseries()

# cost |>
#   # this plot has trouble with the different categories in the other columns
#   select(Month, Cost) |>
#   gg_lag(geom = 'point')
cost |>
  ACF(Cost) |>
  autoplot()

Barrels from us_gasoline

This data is quite granular for the year which provides good detail but to the point of creating a lot of noise, making it hard to read the data (if I were doing my own analysis I would probably bin by month or quarter to make the graphs easier to read but I will answer the questions as they are asked for this assignment). There is very slight trend visible in the gg_subseries plot where there is a small increase toward the middle of the year, which is the summer time for the northern hemisphere, which makes sense since demand tends to be high for gasoline since a lot of people are driving more to spend more time outside and go on vacation. It follows naturally that the supply will increase to meet this demand. Other than that there is a slight decrease on the ACF graph pointing to a trend but it is most likely a small one and the same one that can be seen in the gg_subseries. There is clearly general growth from the beginning of the chart to the end with a bit of decline and stablization in there but that is most likely from political and economic forces that I am too young to remember.

us_gasoline |>
  print()
# A tsibble: 1,355 x 2 [1W]
       Week Barrels
     <week>   <dbl>
 1 1991 W06    6.62
 2 1991 W07    6.43
 3 1991 W08    6.58
 4 1991 W09    7.22
 5 1991 W10    6.88
 6 1991 W11    6.95
 7 1991 W12    7.33
 8 1991 W13    6.78
 9 1991 W14    7.50
10 1991 W15    6.92
# ℹ 1,345 more rows
us_gasoline |>
  autoplot()

us_gasoline |>
  gg_season()

us_gasoline |>
  gg_subseries()

us_gasoline |>
  gg_lag(geom = 'point')

us_gasoline |>
  ACF(Barrels) |>
  autoplot()