help(aus_production)
# Select series of interest from datasets
aus_production <- aus_production %>% select(Quarter, Bricks)
pelt <- pelt %>% select(Year, Lynx)
gafa_stock <- gafa_stock %>% select(Date, Close)
vic_elec <- vic_elec %>% select(Time, Demand)
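Before eyeballing the plots, we can ask each tsibble for its interval directly; interval() from the tsibble package reports the spacing of the index (the expected results are noted in comments):
# Check the reported interval of each series (interval() comes with tsibble)
interval(aus_production) # quarterly
interval(pelt)           # yearly
interval(gafa_stock)     # irregular (!), since only trading days are present
interval(vic_elec)       # half-hourly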
Looking at each of our datasets, the time interval of each is listed below:

aus_production - Quarterly
pelt - Yearly
gafa_stock - Daily (trading days)
vic_elec - Half-hourly

Now we can plot our data using autoplot; we’ll plot each of the time series of interest per dataset:
# Plotting each series using `autoplot`
autoplot(aus_production, Bricks)
## Warning: Removed 20 rows containing missing values (`geom_line()`).
autoplot(pelt, Lynx)
autoplot(gafa_stock, Close)
autoplot(vic_elec, Demand)
# Modify axis labels for the Victorian electricity demand plot
autoplot(vic_elec, Demand) + labs(x="Half-Hours", y="Electricity Demand")
To find the peak (maximum) closing price for each stock, we first group the data by the stock symbol (Symbol), then filter for the rows where Close equals its group maximum:
# First group by the stock, then find the max closing price for each symbol
gafa_stock %>% group_by(Symbol) %>% filter(Close == max(Close))
## # A tsibble: 4 x 3 [!]
## # Key: Symbol [4]
## # Groups: Symbol [4]
## Date Close Symbol
## <date> <dbl> <chr>
## 1 2018-10-03 232. AAPL
## 2 2018-09-04 2040. AMZN
## 3 2018-07-25 218. FB
## 4 2018-07-26 1268. GOOG
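As an aside, dplyr’s slice_max() should return the same rows without the explicit comparison; a minimal alternative sketch:
# Equivalent approach: keep the row(s) with the largest Close per Symbol
gafa_stock %>% group_by(Symbol) %>% slice_max(Close, n = 1)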
Let’s read in the sales data from the book website:
tute1 <- readr::read_csv("https://otexts.com/fpp3/extrafiles/tute1.csv")
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Sales, AdBudget, GDP
## date (1): Quarter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tute1)
## # A tibble: 6 × 4
## Quarter Sales AdBudget GDP
## <date> <dbl> <dbl> <dbl>
## 1 1981-03-01 1020. 659. 252.
## 2 1981-06-01 889. 589 291.
## 3 1981-09-01 795 512. 291.
## 4 1981-12-01 1004. 614. 292.
## 5 1982-03-01 1058. 647. 279.
## 6 1982-06-01 944. 602 254
# Now convert the data.frame to a tsibble object, and set up the TS index
mytimeseries <- tute1 |>
mutate(Quarter = yearquarter(Quarter)) |>
as_tsibble(index = Quarter)
Now let’s plot the 3 time series, first using facet_grid:
mytimeseries |>
pivot_longer(-Quarter) |>
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line() +
facet_grid(name ~ ., scales = "free_y")
Now let’s remove facet_grid:
mytimeseries |>
pivot_longer(-Quarter) |>
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line() #+
# facet_grid(name ~ ., scales = "free_y")
The plots are now drawn in the same panel and share the y-axis! Series with smaller ranges look artificially flat, since everything is forced onto one common scale. Depending on context, that shared scale can be a good or a bad thing.
First let’s convert the us_total dataset from the USgas package to a tsibble, then filter down to the states in New England.
us_total <- us_total |> as_tsibble(index = year, key = state)
# Filter down to the six New England states
new_england <- us_total %>%
  filter(state %in% c("Massachusetts", "Vermont", "New Hampshire",
                      "Maine", "Connecticut", "Rhode Island"))
# Now plot
autoplot(new_england, y) + labs(x="Year", y="Annual Gas Consumption (millions of cubic feet)")
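Since a stray character in a state name silently drops that state from the filter, a quick sanity check is worthwhile; this is just an illustrative check:
# Sanity check: expect exactly the six New England states
unique(new_england$state)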
Let’s read in the tourism data and then convert to a
tsibble object, with a quarterly index:
# Formatting quarter: 1998-01-01 => 1998 Q1
tourism_xl <- readxl::read_excel("../data/tourism.xlsx") |>
  mutate(Quarter = yearquarter(Quarter)) |>
  as_tsibble(index = Quarter, key = c(Region, State, Purpose))
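A quick structure check after the conversion is cheap; interval() and n_keys() (both from tsibble) report the index spacing and the number of distinct series:
# Expect a quarterly interval and one key per Region/State/Purpose combination
interval(tourism_xl)
n_keys(tourism_xl)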
Next we find which combination of Region and Purpose had the maximum number of overnight trips on average. Similar to above, we can group the data and then filter the aggregated result:
mean_trips <- tourism_xl %>%
  as_tibble() %>%   # drop the time index so the mean is taken over all quarters
  group_by(Region, Purpose) %>%
  summarise(trips = mean(Trips), .groups = "drop") %>%
  filter(trips == max(trips))
mean_trips
From our aggregated dataframe, the Region/Purpose combination with the highest average number of overnight trips is Visiting trips to Sydney.
Now we can get the total trips by state using similar
group_by functionality:
# Getting total trips by state using the group_by function
total_trips <- tourism_xl %>%
group_by(State) %>%
summarise(sum(Trips))
total_trips
## # A tsibble: 640 x 3 [1Q]
## # Key: State [8]
## State Quarter `sum(Trips)`
## <chr> <qtr> <dbl>
## 1 ACT 1998 Q1 551.
## 2 ACT 1998 Q2 416.
## 3 ACT 1998 Q3 436.
## 4 ACT 1998 Q4 450.
## 5 ACT 1999 Q1 379.
## 6 ACT 1999 Q2 558.
## 7 ACT 1999 Q3 449.
## 8 ACT 1999 Q4 595.
## 9 ACT 2000 Q1 600.
## 10 ACT 2000 Q2 557.
## # ℹ 630 more rows
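Note that Quarter is still a column: summarise() on a tsibble aggregates over the key variables but always preserves the index. To collapse over time as well and get a single grand total per state, one option is to drop the tsibble structure first; a sketch:
# One total per state: convert to a tibble so the index is not preserved
tourism_xl %>%
  as_tibble() %>%
  group_by(State) %>%
  summarise(Trips = sum(Trips))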
# Get the Total Private time series from the employment data
us_emp <- fpp3::us_employment %>% filter(Title == "Total Private") %>% select(Month, Employed)
# Plotting US employment for each plot type
autoplot(us_emp, Employed)
gg_season(us_emp, Employed)
gg_lag(us_emp, Employed)
gg_subseries(us_emp, Employed)
# Selecting needed features
pelt <- tsibbledata::pelt %>% select(Year, Hare)
us_gas <- us_gasoline %>% select(Week, Barrels)
pbs <- PBS %>% filter(ATC2 == 'H02') %>% index_by(Month) %>% summarise(Cost = sum(Cost))
Plotting the pelt data first. A lag of 5 shows an interesting negative relationship. In other words, hare numbers at a certain point in time are negatively correlated with the numbers from 5 years prior, which is consistent with the roughly 10-year population cycle visible in the basic TS plot (5 years is half a cycle, hence the negative sign).
autoplot(pelt, Hare)
# gg_season(pelt, Hare, period="5") # This is yearly data, so no seasonality
gg_lag(pelt, Hare, geom="point")
gg_subseries(pelt, Hare)
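To back up the lag-plot reading, the autocorrelation function makes the cycle explicit; a sketch using ACF() from feasts (loaded as part of fpp3):
# For a ~10-year cycle, expect negative ACF near lag 5 and positive ACF near lag 10
pelt %>% ACF(Hare) %>% autoplot()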
Now we can plot the aus_production dataset. Looking at this data, we definitely see seasonality across the quarters of the year. Rows with missing data are removed by gg_lag by default.
autoplot(aus_production, Bricks)
gg_season(aus_production, Bricks)
gg_lag(aus_production, Bricks)
gg_subseries(aus_production, Bricks)
This data is more granular (quarterly, rather than yearly like the pelt series), and seasonal cycles can be observed within a given year. There was also an outlier period in the early 80s, likely due to a larger economic downturn.
Now we can plot our summed PBS Cost data for the H02 ATC2 code. We definitely see seasonality within this series, as well as a general increase over the longer time scale. Feb - May also seems to be a down period for these safety net payments.
autoplot(pbs, Cost)
gg_season(pbs, Cost)
gg_lag(pbs, Cost)
gg_subseries(pbs, Cost)
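Because the seasonal swings grow with the level of the series, a log scale can make the within-year pattern easier to compare across years; a sketch, assuming the variation is roughly proportional to the level:
# Re-plot on a log scale to stabilise the growing seasonal amplitude
autoplot(pbs, log(Cost))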
Finally, we can plot the data on US gasoline supplied. Again, seasonal effects are present in this data. Notably, the amplitude of the seasonal swings is fairly small and stays consistent over time; in other words, the amount by which supply moves with the seasons doesn’t change much from year to year. No large outlier years jump out at us visually, either.
autoplot(us_gas, Barrels)
gg_season(us_gas, Barrels)
gg_lag(us_gas, Barrels)
gg_subseries(us_gas, Barrels)