library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble 1.1.6 ✔ feasts 0.4.2
## ✔ tsibbledata 0.4.1 ✔ fable 0.5.0
## Warning: package 'tsibble' was built under R version 4.5.2
## Warning: package 'tsibbledata' was built under R version 4.5.2
## Warning: package 'feasts' was built under R version 4.5.2
## Warning: package 'fabletools' was built under R version 4.5.2
## Warning: package 'fable' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
Please submit exercises 2.1, 2.2, 2.3, 2.4, 2.5 and 2.8 from the Hyndman online Forecasting book. Please submit both your Rpubs link as well as attach the .pdf file with your code.
2.1
Bricks in aus_production
?aus_production
# We can see, using glimpse() or View(), that the data is broken out by quarters.
glimpse(aus_production)
## Rows: 218
## Columns: 7
## $ Quarter <qtr> 1956 Q1, 1956 Q2, 1956 Q3, 1956 Q4, 1957 Q1, 1957 Q2, 1957…
## $ Beer <dbl> 284, 213, 227, 308, 262, 228, 236, 320, 272, 233, 237, 313…
## $ Tobacco <dbl> 5225, 5178, 5297, 5681, 5577, 5651, 5317, 6152, 5758, 5641…
## $ Bricks <dbl> 189, 204, 208, 197, 187, 214, 227, 222, 199, 229, 249, 234…
## $ Cement <dbl> 465, 532, 561, 570, 529, 604, 603, 582, 554, 620, 646, 637…
## $ Electricity <dbl> 3923, 4436, 4806, 4418, 4339, 4811, 5259, 4735, 4608, 5196…
## $ Gas <dbl> 5, 6, 7, 6, 5, 7, 7, 6, 5, 7, 8, 6, 5, 7, 8, 6, 6, 8, 8, 7…
aus_production |> select(Quarter, Bricks) |> print(n = 20)
## # A tsibble: 218 x 2 [1Q]
## Quarter Bricks
## <qtr> <dbl>
## 1 1956 Q1 189
## 2 1956 Q2 204
## 3 1956 Q3 208
## 4 1956 Q4 197
## 5 1957 Q1 187
## 6 1957 Q2 214
## 7 1957 Q3 227
## 8 1957 Q4 222
## 9 1958 Q1 199
## 10 1958 Q2 229
## 11 1958 Q3 249
## 12 1958 Q4 234
## 13 1959 Q1 208
## 14 1959 Q2 251
## 15 1959 Q3 267
## 16 1959 Q4 255
## 17 1960 Q1 242
## 18 1960 Q2 268
## 19 1960 Q3 290
## 20 1960 Q4 277
## # ℹ 198 more rows
aus_production$Bricks
## [1] 189 204 208 197 187 214 227 222 199 229 249 234 208 251 267 255 242 268
## [19] 290 277 241 253 265 236 229 265 275 258 231 263 308 313 293 328 349 340
## [37] 309 349 366 340 302 350 362 337 306 358 359 357 341 380 404 409 383 417
## [55] 454 428 386 428 434 417 385 433 453 436 399 461 476 477 452 461 534 516
## [73] 478 526 518 417 340 437 459 449 424 501 540 533 457 513 522 478 421 487
## [91] 470 482 458 526 573 563 513 551 589 564 519 581 581 578 501 560 512 412
## [109] 303 409 420 413 400 469 482 484 447 507 533 505 442 503 506 443 414 485
## [127] 495 458 428 519 555 538 510 571 556 509 458 510 494 460 372 436 422 423
## [145] 383 404 446 420 394 462 475 443 421 475 497 476 430 457 417 370 310 358
## [163] 379 369 330 390 416 383 339 394 412 420 376 401 430 417 416 447 421 379
## [181] 304 337 385 381 345 405 417 420 387 415 440 413 409 423 428 397 355 435
## [199] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [217] NA NA
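The tail of the series is all NA; a quick count confirms how many quarters are missing (a minimal check, not part of the exercise):
#Count the missing values in the Bricks series
sum(is.na(aus_production$Bricks))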
#autoplot of bricks produced by quarter
autoplot(aus_production, Bricks) +
labs(title = "Australian Brick Production by Quarter")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
Lynx from pelt
?pelt
#Lynx is broken down by year from 1845 to 1935.
glimpse(pelt)
## Rows: 91
## Columns: 3
## $ Year <dbl> 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855,…
## $ Hare <dbl> 19580, 19600, 19610, 11990, 28040, 58000, 74600, 75090, 88480, 61…
## $ Lynx <dbl> 30090, 45150, 49150, 39520, 21230, 8420, 5560, 5080, 10170, 19600…
#pelt$Lynx (I commented this out because the result is quite long.)
#autoplot of lynx pelts by year
autoplot(pelt, Lynx)
Close from gafa_stock
?gafa_stock
#The interval is daily (trading days only, so the tsibble is irregular), with open/high/low/close price columns
glimpse(gafa_stock)
## Rows: 5,032
## Columns: 8
## Key: Symbol [4]
## $ Symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAP…
## $ Date <date> 2014-01-02, 2014-01-03, 2014-01-06, 2014-01-07, 2014-01-08,…
## $ Open <dbl> 79.38286, 78.98000, 76.77857, 77.76000, 76.97285, 78.11429, …
## $ High <dbl> 79.57571, 79.10000, 78.11429, 77.99429, 77.93714, 78.12286, …
## $ Low <dbl> 78.86000, 77.20428, 76.22857, 76.84571, 76.95571, 76.47857, …
## $ Close <dbl> 79.01857, 77.28286, 77.70428, 77.14857, 77.63715, 76.64571, …
## $ Adj_Close <dbl> 66.96433, 65.49342, 65.85053, 65.37959, 65.79363, 64.95345, …
## $ Volume <dbl> 58671200, 98116900, 103152700, 79302300, 64632400, 69787200,…
#gafa_stock$Close (I've commented this out because it prints 5,000 lines of data)
#autoplot of stock prices by day
autoplot(gafa_stock, Close)
Demand from vic_elec
?vic_elec
# This is broken down into half-hour intervals.
glimpse(vic_elec)
## Rows: 52,608
## Columns: 5
## $ Time <dttm> 2012-01-01 00:00:00, 2012-01-01 00:30:00, 2012-01-01 01:0…
## $ Demand <dbl> 4382.825, 4263.366, 4048.966, 3877.563, 4036.230, 3865.597…
## $ Temperature <dbl> 21.40, 21.05, 20.70, 20.55, 20.40, 20.25, 20.10, 19.60, 19…
## $ Date <date> 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-0…
## $ Holiday <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…
#vic_elec$Demand
#autoplot of demand, with axes labeled
autoplot(vic_elec, Demand) +
labs(y = "Electricity Demand", x = "Time, in half-hour intervals", title = "Half-hourly Electricity Demand")
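If we actually wanted demand by day, we could aggregate the half-hours up to days first. A sketch, reusing the existing Date column as the new index via index_by():
#Aggregate half-hourly demand to daily totals, then plot
vic_elec |>
index_by(Date) |>
summarise(Demand = sum(Demand)) |>
autoplot(Demand) +
labs(y = "Total Daily Demand", title = "Daily Electricity Demand")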
2.2
The highest stock prices were all in the latter half of 2018, which matches the graph.
gafa_stock |>
group_by(Symbol) |>
filter(High == max(High)) |>
select(Symbol, High, Date)
## # A tsibble: 4 x 3 [!]
## # Key: Symbol [4]
## # Groups: Symbol [4]
## Symbol High Date
## <chr> <dbl> <date>
## 1 AAPL 233. 2018-10-03
## 2 AMZN 2050. 2018-09-04
## 3 FB 219. 2018-07-25
## 4 GOOG 1274. 2018-07-27
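Strictly, exercise 2.2 asks for the peak closing price; the same pattern applied to Close (only the column changes) would be:
#Days corresponding to the peak closing price for each stock
gafa_stock |>
group_by(Symbol) |>
filter(Close == max(Close)) |>
select(Symbol, Close, Date)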
2.3
Without facet_grid(), we get a single panel that shows AdBudget, GDP, and Sales together. (Looking this over made me realize that ggplot2 accepts both the British and American spellings, colour and color.)
#loading the CSV
tute1 <- readr::read_csv("tute1.csv")
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Sales, AdBudget, GDP
## date (1): Quarter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(tute1)
mytimeseries <- tute1 |>
mutate(Quarter = yearquarter(Quarter)) |>
as_tsibble(index = Quarter)
#with facet_grid
mytimeseries |>
pivot_longer(-Quarter) |>
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line() +
facet_grid(name ~ ., scales = "free_y")
#without facet_grid
mytimeseries |>
pivot_longer(-Quarter) |>
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line()
2.4
#install and load US gas
#install.packages("USgas")
#I left this code in but commented it out in case you're running it on your own computer.
library(USgas)
## Warning: package 'USgas' was built under R version 4.5.2
#Create a tsibble from us_total with year as the index and state as the key
us_tsibble <- us_total |>
as_tsibble(index = year, key = state)
#Plot the annual natural gas consumption by state for the New England area (comprising the states of Maine, Vermont, New Hampshire, Massachusetts, Connecticut and Rhode Island).
#filtered to include only the NE states
#I added labels to clarify
#Obviously we're seeing larger overall consumption in larger states (Vermont is one of the smallest), but it's interesting that Rhode Island's consumption drops before 2000 while Maine's goes up. Not all states experience a steady increase over time. I'd love to see how this correlates with temperature.
us_tsibble |>
filter(state %in% c("Rhode Island", "Maine", "Vermont", "New Hampshire", "Massachusetts", "Connecticut")) |>
autoplot(y) +
labs(y = "Natural Gas Consumption", x = "Year", title = "Annual Gas Consumption by State")
2.5
#reading tourism from excel (using forward slashes)
tourismdata <- readxl::read_excel("C:/Users/Sam Barbaro/Desktop/Data 624/tourism.xlsx")
#Create tsibble identical to tourism (convert dates to quarters)
tour_tsibble <- tourismdata |>
mutate(Quarter = yearquarter(Quarter)) |>
as_tsibble(key = c(Region, State, Purpose), index = Quarter)
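As a sanity check, we can compare against the tourism tsibble that ships with the tsibble package; all.equal() may still flag minor attribute differences left over from reading Excel:
#Compare our constructed tsibble with the packaged tourism data
all.equal(tour_tsibble, tourism)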
#Find what combination of Region and Purpose had the maximum number of overnight trips on average.
total_trips <- tourismdata |>
group_by(Region, Purpose) |>
summarise(Avg_Trips = mean(Trips, na.rm = TRUE), .groups = "drop") |>
arrange(desc(Avg_Trips))
#Sydney (visiting) had, on average, the max number of overnight trips.
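Rather than eyeballing the sorted table, the top row can be pulled programmatically (a small sketch using dplyr's slice_max()):
#Extract the Region/Purpose combination with the highest average trips
total_trips |> slice_max(Avg_Trips, n = 1)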
#Create a new tsibble which combines the Purposes and Regions, and just has total trips by State.
#One way I interpreted this necessarily loses the index and can't be a tsibble (keeping the Quarter column would leave the same number of rows), but here's everything grouped by state, with the Region and Purpose columns combined
tourism_by_state <- tourismdata |>
unite("Region_Purpose", Region, Purpose) |>
group_by(Region_Purpose, State) |>
summarise(Sum_Trips = sum(Trips, na.rm = TRUE), .groups = "drop")
#Here's a tsibble that just groups everything by state (which I *think* is what the question means to get at)
tourism_by_state2 <- tourismdata |>
mutate(Quarter = yearquarter(Quarter)) |>
group_by(Quarter, State) |>
summarise(Sum_Trips = sum(Trips, na.rm = TRUE), .groups = "drop") |>
as_tsibble(key = State, index = Quarter)
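A quick plot of the state-level tsibble confirms the structure (a sketch; one line per state):
#Plot total quarterly trips by state
tourism_by_state2 |> autoplot(Sum_Trips) +
labs(y = "Total Overnight Trips", title = "Quarterly Overnight Trips by State")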
2.8
Use the following graphics functions: autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF() and explore features from the following time series: “Total Private” Employed from us_employment, Bricks from aus_production, Hare from pelt, “H02” Cost from PBS, and Barrels from us_gasoline.
Can you spot any seasonality, cyclicity and trend? What do you learn about the series? What can you say about the seasonal patterns? Can you identify any unusual years?
#US employment
#filtering for Total Private and autoplotting the number of people employed over time
#This shows the number of private employees has gone up over time. There are dips during downturns (e.g., in the early/mid 80s) and a big dip around 2008-2010 for the Great Recession
us_employment |> filter(Title == "Total Private") |> autoplot(Employed)
#Here we look at seasonal patterns, which look similar in recent years: employment ticks up in the middle of the year, around the summer, and generally ends the year higher than it began. (I tried this without selecting a variable, and the function automatically selects Employed.) Again we see a few years that dip below the previous years, including the Great Recession.
us_employment |> filter(Title == "Total Private") |> gg_season(Employed)
## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
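To see the recent seasonal shape more clearly, we could restrict the plot to this century (a sketch; year() works on the yearmonth index):
#Seasonal plot for 2000 onward only
us_employment |> filter(Title == "Total Private", year(Month) >= 2000) |> gg_season(Employed)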
#Subseries: shows the general trend of private employment independent of seasonality (generally going upward over time, with some dips)
us_employment |> filter(Title == "Total Private") |> gg_subseries(Employed)
## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#The lag plot shows there is a very clear pattern in the employment data
us_employment |> filter(Title == "Total Private") |> gg_lag(Employed)
## Warning: `gg_lag()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_lag()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#The ACF shows extremely strong autocorrelations (close to 1)
us_employment |> filter(Title == "Total Private") |> ACF(Employed) |> autoplot()
Bricks from aus_production
# Autoplot shows some years with big dips in production (around 1970 and in the late 80s). Production trended upward through the 80s, then trended downward.
aus_production |> autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
#Production looks somewhat seasonal with a spike midyear that seems to vary between Q2 and Q3.
aus_production |> gg_season(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
#These panels all have a similar shape, which shows the data is seasonal. They show the trend over the years, and that Q1 production has tended to be lowest in general, followed by Q4 production.
aus_production |> gg_subseries(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
#The relationship is strongly positive for the first 4-ish lags, then becomes more random.
aus_production |> gg_lag(Bricks)
## Warning: Removed 20 rows containing missing values (gg_lag).
#Strong autocorrelation that gets weaker as the lag increases
aus_production |> ACF(Bricks) |> autoplot()
Hare from pelt
# Autoplot shows a couple of large spikes in the 1860s and 1880s; in general, some years spike while others show very low trading.
pelt |> autoplot(Hare)
#We cannot plot this data seasonally because it is only available by year, but here's the code.
#pelt |> gg_season(Hare)
#We don't have monthly or quarterly data, so this is just plotted by year; the blue line marks the average across the series.
pelt |> gg_subseries(Hare)
#The lag plot is much more scattered, which makes sense given the large differences between years.
pelt |> gg_lag(Hare)
#The data almost falls into white-noise territory, but there are some strong negative correlations, suggesting that heavy trading in some years could lead to less in following years. I wondered if the great variation had to do with fashion or weather.
pelt |> ACF(Hare) |> autoplot()
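Extending the number of lags makes the multi-year cycle easier to see (a sketch using ACF()'s lag_max argument):
#ACF over 30 lags to expose the roughly decade-long cycle
pelt |> ACF(Hare, lag_max = 30) |> autoplot()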
“H02” Cost from PBS
#The exercise asks for the "H02" series, so let's filter for it, calculate total cost, and name this object PBS2
PBS2 <- PBS |>
filter(ATC2 == "H02") |>
select(Month, Concession, Type, Cost) |>
summarise(TotalC = sum(Cost))
# Autoplot shows the spend generally trending upward as years go by
PBS2 |> autoplot(TotalC)
#The seasonal plot shows a spike in January, with sales tending to rise toward the end of the year and a pretty consistent drop in Feb/March. The drug-cost data is seasonal and tends to have a similar shape year-on-year.
PBS2 |> gg_season(TotalC)
#Monthly data reflects the overall trend of sales going up. January has the highest average.
PBS2 |> gg_subseries(TotalC)
#The lag plot shows that the data is pretty consistent and shows a clear pattern.
PBS2 |> gg_lag(TotalC)
#The ACF shows strong positive autocorrelations, indicating successive months are closely related.
PBS2 |> ACF(TotalC) |> autoplot()
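The raw totals are in dollars; scaling to millions (as the fpp3 text does for this series) makes the axis easier to read. A sketch, where TotalCM is just an illustrative name:
#Re-plot total cost in millions of dollars
PBS2 |> mutate(TotalCM = TotalC / 1e6) |> autoplot(TotalCM) +
labs(y = "Total Cost ($ millions)", title = "Monthly H02 Cost")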
Barrels from us_gasoline
#This is weekly data. The autoplot shows demand generally increasing from the 90s until the early 2010s, when it dips and then rises again. The dip could reflect electric vehicle sales in the early 2010s. I wonder if later data (post-2020) would show more of a dip.
us_gasoline |> autoplot(Barrels)
#The seasonal plot shows a consistent spike in summer (road trips?), suggesting seasonality; the pattern repeats in a similar shape each year.
us_gasoline |> gg_season(Barrels)
#Weekly data reinforces the consistent summertime spike and shows a slightly higher demand toward the end of the year.
us_gasoline |> gg_subseries(Barrels)
#The lag plot shows that the data is somewhat consistent and shows a clear pattern.
us_gasoline |> gg_lag(Barrels)
#The ACF shows strong positive autocorrelations, showing that the data is strongly related over time.
us_gasoline |> ACF(Barrels) |> autoplot()