library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.2
## ✔ lubridate 1.9.4 ✔ fable 0.5.0
## ✔ ggplot2 4.0.0
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'tsibbledata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
# Access help documentation
# ?aus_production
Quarterly production of selected commodities in Australia.
Quarterly estimates of selected indicators of manufacturing production in Australia.
Time series of class tsibble.
aus_production is a quarterly tsibble with six values:
Australian Bureau of Statistics, catalogue number 8301.0.55.001 table 1.
# Time plot of quarterly clay brick production (Bricks) from aus_production.
# Quarters with a missing Bricks value are dropped by geom_line() with a warning.
autoplot(aus_production, Bricks) +
  labs(
    title = "Australian Quarterly Clay Brick Production",
    y = "Millions of Bricks"
  )
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
### Lynx from pelt
# Access help documentation
# ?pelt
Annual pelt trading records.
Hudson Bay Company trading records for Snowshoe Hare and Canadian Lynx furs from 1845 to 1935. This data contains trade records for all areas of the company.
Time series of class tsibble
pelt is an annual tsibble with two values:
Hudson Bay Company
# Time plot of the annual Lynx pelt counts from the pelt tsibble.
autoplot(pelt, Lynx) +
  labs(
    title = "Canadian Annual Lynx Pelts Trade",
    y = "Number of Pelts"
  )
# Access help documentation
# ?gafa_stock
Daily GAFA stock prices.
Historical stock prices from 2014-2018 for Google, Amazon, Facebook and Apple. All prices are in $USD.
Time series of class tsibble
gafa_stock is a tsibble containing data on irregular trading days:
Each stock is uniquely identified by one key:
Yahoo Finance historical data
# Time plot of daily closing prices; gafa_stock is keyed by Symbol, so
# autoplot() draws one line per stock.
autoplot(gafa_stock, Close) +
  labs(
    title = "GAFA Daily Closing Stock Prices",
    y = "Price ($US)"
  )
### Demand from vic_elec
# Access help documentation
# ?vic_elec
Half-hourly electricity demand for Victoria, Australia
vic_elec is a half-hourly tsibble with three values:
Time series of class tsibble.
This data is for operational demand, which is the demand met by local scheduled generating units, semi-scheduled generating units, and non-scheduled intermittent generating units of aggregate capacity larger than 30 MWh, and by generation imports to the region. The operational demand excludes the demand met by non-scheduled non-intermittent generating units, non-scheduled intermittent generating units of aggregate capacity smaller than 30 MWh, exempt generation (e.g. rooftop solar, gas tri-generation, very small wind farms, etc), and demand of local scheduled loads. It also excludes some very large industrial users (such as mines or smelters).
Australian Energy Market Operator.
# Time plot of half-hourly electricity demand with a custom title and
# y-axis label (the x-axis keeps its default label).
autoplot(vic_elec, Demand) +
  labs(
    title = "Australia: Victoria Half-Hourly Electricity Demand",
    y = "MWh"
  )
# Same Demand time plot, this time with the title and both axis labels
# customised.
# The y-axis unit is MWh: that is the unit given for Demand in ?vic_elec and
# the one used on the preceding plot (the earlier "Megawatts" label was
# inconsistent with both).
vic_elec |>
  autoplot(Demand) +
  labs(
    title = "Victoria Electricity Half-Hourly Demand",
    y = "Demand (MWh)",
    x = "Time (Half-Hourly)"
  )
# view(gafa_stock)
# For each stock (Symbol), keep the trading day(s) on which the closing price
# reached its maximum, then order the stocks from highest to lowest peak.
# The grouping is only needed for the per-stock max(), so it is removed
# with ungroup() before the result is stored; otherwise later verbs applied to
# gafa_peak_closing_price would silently operate per Symbol.
gafa_peak_closing_price <- gafa_stock |>
  group_by(Symbol) |>
  filter(Close == max(Close)) |>
  ungroup() |>
  arrange(desc(Close))
# view(gafa_peak_closing_price)
# Read the tute1 data set straight from the course repository.
tute1 <- readr::read_csv("https://raw.githubusercontent.com/uzmabb182/Data_624_Predictive_Analytics/refs/heads/main/tute1.csv")
# View(tute1)
# Turn Quarter into a yearquarter value and use it as the tsibble index.
mytimeseries <- as_tsibble(
  mutate(tute1, Quarter = yearquarter(Quarter)),
  index = Quarter
)
# view(mytimeseries)
# Reshape to long form (one row per Quarter and variable) and draw one panel
# per variable, each with its own y-axis range.
mytimeseries |>
  pivot_longer(cols = -Quarter, names_to = "name", values_to = "value") |>
  ggplot(mapping = aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(rows = vars(name), scales = "free_y")
# Same data without facet_grid(): every variable shares a single panel and
# a single y-axis.
mytimeseries |>
  pivot_longer(cols = -Quarter, names_to = "name", values_to = "value") |>
  ggplot(mapping = aes(x = Quarter, y = value, colour = name)) +
  geom_line()
facet_grid() with scales = “free_y” gives each variable its own panel and
its own y-axis range. Without facet_grid(), all variables are drawn on the
same y-axis, so the variables with small values look like flat lines at the
bottom of the chart and their patterns are not displayed clearly.
library(USgas)
## Warning: package 'USgas' was built under R version 4.3.3
# view(us_total)
# Build a tsibble from us_total: year is the index and state is the key,
# giving one annual series per state.
us_total_tsibble <- as_tsibble(us_total, key = state, index = year)
# view(us_total_tsibble)
# The six New England states
new_england_states <- c(
  "Maine", "Vermont", "New Hampshire",
  "Massachusetts", "Connecticut", "Rhode Island"
)
# Keep only the New England series and plot consumption (column y) over time.
us_total_tsibble |>
  filter(is.element(state, new_england_states)) |>
  autoplot(y) +
  labs(
    title = "New England Annual Natural Gas Consumption",
    y = "Consumption (Million Cubic Feet)",
    x = "Year"
  )
Massachusetts consumes significantly more natural gas than the other states, which makes the lines for states like Vermont or Rhode Island appear quite low on the chart. This is a common scaling issue when comparing states of vastly different population sizes.
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
# Read the tourism spreadsheet.
# NOTE(review): this is an absolute, machine-specific path, so the chunk only
# runs on the author's computer; consider a project-relative path.
tourism_data <- readxl::read_excel("C:/Users/Uzma/CUNY-SPS-Assignments/Data_624/Data_624_Predictive_Analytics/tourism.xlsx")
# head(tourism_data)
library(tsibble)
# Build the tsibble from the spreadsheet that was just read. Previously the
# packaged `tourism` data set was converted instead, so tourism_data was
# never used.
# The index is 'Quarter' and the keys are Region, State, Purpose.
# Assumes the spreadsheet has Quarter, Region, State, Purpose and Trips
# columns, with Quarter parseable by yearquarter() -- TODO confirm.
tourism_tsibble <- tourism_data |>
  mutate(Quarter = yearquarter(Quarter)) |>
  as_tsibble(index = Quarter, key = c(Region, State, Purpose))
tourism_tsibble
## # A tsibble: 24,320 x 5 [1Q]
## # Key: Region, State, Purpose [304]
## Quarter Region State Purpose Trips
## <qtr> <chr> <chr> <chr> <dbl>
## 1 1998 Q1 Adelaide South Australia Business 135.
## 2 1998 Q2 Adelaide South Australia Business 110.
## 3 1998 Q3 Adelaide South Australia Business 166.
## 4 1998 Q4 Adelaide South Australia Business 127.
## 5 1999 Q1 Adelaide South Australia Business 137.
## 6 1999 Q2 Adelaide South Australia Business 200.
## 7 1999 Q3 Adelaide South Australia Business 169.
## 8 1999 Q4 Adelaide South Australia Business 134.
## 9 2000 Q1 Adelaide South Australia Business 154.
## 10 2000 Q2 Adelaide South Australia Business 169.
## # ℹ 24,310 more rows
# Average trips over all quarters for every Region/Purpose combination, then
# keep the combination(s) with the highest average. The tsibble is converted
# to a plain tibble first so that summarise() collapses the time index too.
as_tibble(tourism_tsibble) |>
  group_by(Region, Purpose) |>
  summarise(Avg_Trips = mean(Trips), .groups = "drop") |>
  # .groups = "drop" leaves the summary ungrouped, so max() below is taken
  # over all Region/Purpose rows at once
  filter(Avg_Trips == max(Avg_Trips))
## # A tibble: 1 × 3
## Region Purpose Avg_Trips
## <chr> <chr> <dbl>
## 1 Sydney Visiting 747.
When we use group_by(Region, Purpose), the data becomes
“grouped.”
If you don’t ungroup it, any future calculations like filter or mutate
will happen inside those groups, which can lead to weird errors or
unexpected results.
When .groups = “drop” is applied, it tells R that the averaging step is
finished, so it should remove all the groups and return a normal, flat
data frame.
# Total trips per State: grouping by State alone means Region and Purpose are
# summed over. summarise() on a tsibble keeps the Quarter index, so the result
# has one row per State and Quarter.
state_tourism <- summarise(
  group_by(tourism_tsibble, State),
  Trips = sum(Trips)
)
# View the result
state_tourism
## # A tsibble: 640 x 3 [1Q]
## # Key: State [8]
## State Quarter Trips
## <chr> <qtr> <dbl>
## 1 ACT 1998 Q1 551.
## 2 ACT 1998 Q2 416.
## 3 ACT 1998 Q3 436.
## 4 ACT 1998 Q4 450.
## 5 ACT 1999 Q1 379.
## 6 ACT 1999 Q2 558.
## 7 ACT 1999 Q3 449.
## 8 ACT 1999 Q4 595.
## 9 ACT 2000 Q1 600.
## 10 ACT 2000 Q2 557.
## # ℹ 630 more rows
library(fpp3)
#view(us_employment)
# Keep only the "Total Private" employment series
total_private <- filter(us_employment, Title == "Total Private")
# 1. Time plot (trend)
autoplot(total_private, Employed) +
  labs(title = "Total Private Employment Trend")
# 2. Seasonal plot (seasonality within years)
gg_season(total_private, Employed) +
  labs(title = "Seasonal Plot")
## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 3. Subseries plot (seasonality across years)
gg_subseries(total_private, Employed) +
  labs(title = "Subseries Plot")
## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 4. Lag plot (visual check of autocorrelation), drawn with points
gg_lag(total_private, Employed, geom = "point")
## Warning: `gg_lag()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_lag()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 5. ACF plot (autocorrelation as a statistic)
autoplot(ACF(total_private, Employed)) +
  labs(title = "ACF Plot")
There is a strong, consistent upward trend over time.
Yes, there is a clear annual seasonal pattern.
There is evidence of a business cycle, most notably the dip around the Global Financial Crisis of 2008.
The private sector of the US economy has grown steadily over the decades.
Employment consistently drops in January after the holiday season and
tends to peak in the summer/late year.
The seasonal plot shows lines that are very nearly parallel, indicating a
stable seasonal pattern.
2008-2009: You can see a significant break in the trend where employment drops sharply due to the recession.
# Drop the quarters with no Bricks observation (the NAs at the end)
bricks <- filter(aus_production, !is.na(Bricks))
# 1. Time plot
autoplot(bricks, Bricks) +
  labs(title = "Australian Clay Brick Production Trend")
# 2. Seasonal plot
gg_season(bricks, Bricks) +
  labs(title = "Bricks Seasonal Plot")
# 3. Subseries plot
gg_subseries(bricks, Bricks) +
  labs(title = "Bricks Subseries Plot")
# 4. Lag plot, drawn with points
gg_lag(bricks, Bricks, geom = "point")
# 5. ACF plot
autoplot(ACF(bricks, Bricks)) +
  labs(title = "Bricks ACF Plot")
There is an upward trend until about 1980 after which the trend flattens and slightly declines.
As you can see, the jagged pattern repeats every 4 points, showing strong quarterly seasonality.
Yes, there are clear boom and bust cycles in construction that span several years, superimposed on the trend.
Brick production is highly sensitive to the housing market cycles. It stopped growing around 1980.
Q1 is lowest: Production almost always drops in Quarter 1, possibly due to the Australian summer holidays.
Q3 is highest: Production tends to peak in Quarter 3.
The shift around 1980-1982 is notable and the long-term growth
stops.
This suggests a structural change in the industry, probably due to a
switch to other building materials.
SKIPPED - Annual Data:
We cannot plot seasonality if the data has no seasons.
The pelt dataset only has one observation per year.
gg_season() tries to put “Month” or “Quarter” on the x-axis to compare Jan 1845 vs. Jan 1846.
gg_subseries() tries to group all “Januarys” together.
Since there are no months or quarters in annual data, R has nothing to plot on the x-axis for a seasonal plot.
# 1. Time plot
autoplot(pelt, Hare) +
  labs(title = "Canadian Hare Pelts Trend")
# 2. Seasonal plot (SKIPPED - annual data)
# pelt |> gg_season(Hare)
# 3. Subseries plot (SKIPPED - annual data)
# pelt |> gg_subseries(Hare)
# 4. Lag plot, drawn with points
gg_lag(pelt, Hare, geom = "point")
# 5. ACF plot
autoplot(ACF(pelt, Hare)) +
  labs(title = "Hare ACF Plot")
There is no long-term upward or downward trend.
None. This is annual data, so it cannot have a seasonal pattern.
There is a very regular rise and fall roughly every 10 years.
The population follows a cycle. The peaks and troughs are massive going from near zero to huge numbers.
N/A (Annual data).
The cycles are fairly regular, but the magnitude of the peaks
varies.
The peak around 1865 was exceptionally high compared to the others.
# PBS holds several series per ATC2 code (Concession/Type breakdowns), so keep
# the H02 rows and add their Cost together to get one total series.
h02_cost <- summarise(
  filter(PBS, ATC2 == "H02"),
  Cost = sum(Cost)
)
# 1. Time plot
autoplot(h02_cost, Cost) +
  labs(title = "Total Cost of H02 Scripts Trend")
# 2. Seasonal plot
gg_season(h02_cost, Cost) +
  labs(title = "Seasonal Plot: H02 Cost")
# 3. Subseries plot
gg_subseries(h02_cost, Cost) +
  labs(title = "Subseries Plot: H02 Cost")
# 4. Lag plot, drawn with points
gg_lag(h02_cost, Cost, geom = "point")
# 5. ACF plot
autoplot(ACF(h02_cost, Cost)) +
  labs(title = "H02 Cost ACF Plot")
A steady upward trend in cost.
There is a very strong annual seasonality.
Not clearly visible; the series is dominated by the trend and seasonality.
The cost of H02 drugs is growing, and the seasonal swings are getting larger.
There is a sharp drop at the start of every year in January.
Costs rise throughout the year and peak in December.
The seasonal jumps become much larger in later years but the pattern itself is consistent.
# 1. Time plot
autoplot(us_gasoline, Barrels) +
  labs(title = "US Gasoline Production Trend")
# 2. Seasonal plot
# The data is weekly, so this plot is visually dense
gg_season(us_gasoline, Barrels) +
  labs(title = "Gasoline Seasonal Plot")
# 3. Subseries plot
gg_subseries(us_gasoline, Barrels) +
  labs(title = "Gasoline Subseries Plot")
# 4. Lag plot, drawn with points
gg_lag(us_gasoline, Barrels, geom = "point")
# 5. ACF plot
autoplot(ACF(us_gasoline, Barrels)) +
  labs(title = "Gasoline ACF Plot")
Upward trend until about 2008, then it flattens out.
Clear annual seasonality visible as a wave in the weekly data.
Minimal.
Gasoline production grew steadily but plateaued after 2008.
Production is highest in the middle of the year.
Production drops in the winter months.
If you look closely at late 2005, you might see a sharp, unusual drop in production.
In 2008 the trend clearly changes.