Explore the following four time series: Bricks from aus_production, Lynx from pelt, Close from gafa_stock, Demand from vic_elec.
# Bring the four example datasets into the workspace with a single call
data(aus_production, pelt, gafa_stock, vic_elec)
# Open the help page of each dataset to check its interval and description
help("aus_production")
help("pelt")
help("gafa_stock")
help("vic_elec")
| Series | Time Interval | Description |
|---|---|---|
| aus_production | Quarterly | Quarterly production of selected commodities in Australia. |
| pelt | Annual | Pelt trading records |
| gafa_stock | Daily | GAFA stock prices |
| vic_elec | Half-Hourly | Half-hourly electricity demand for Victoria, Australia |
# Using autoplot to plot charts
# The extra geom_line() layer overlays a thicker blue line on the autoplot()
# output. Line thickness is set with `linewidth`, because using `size` for
# lines was deprecated in ggplot2 3.4.0 and raises a warning.
autoplot(aus_production, Bricks) +
  ggtitle(
    "Quarterly Brick production of selected commodities in Australia."
  ) +
  geom_line(color = "blue", linewidth = 1.5) +
  theme(axis.text = element_text(size = 12))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Lynx pelt trading records, with a thicker red line layered on top.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
autoplot(pelt, Lynx) +
  ggtitle("Lynx Pelt trading records") +
  geom_line(color = "red", linewidth = 1.5) +
  theme(axis.text = element_text(size = 12))
# Closing prices from gafa_stock, with larger axis text
gafa_stock |>
  autoplot(Close) +
  ggtitle("GAFA stock Close prices") +
  theme(axis.text = element_text(size = 12))
# Half-hourly electricity demand, with a thin orange line layered on top.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
autoplot(vic_elec, Demand) +
  ggtitle("Half-hourly electricity demand for Victoria, Australia") +
  geom_line(color = "orange", linewidth = 0.5) +
  theme(axis.text = element_text(size = 12))
# Modifying chart legends and axis
# Same demand plot, now with a fixed aspect ratio and explicit axis labels.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
autoplot(vic_elec, Demand) +
  ggtitle("Half-hourly electricity demand for Victoria, Australia") +
  geom_line(color = "green", linewidth = 0.5) +
  theme(axis.text = element_text(size = 12), aspect.ratio = 0.5) +
  xlab("Half-Hour Interval") +
  ylab("Demand [MWh]")
Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock.
# Importing dplyr
library(dplyr)
# Preview the first rows of gafa_stock
head(gafa_stock)
# Within each stock symbol, keep only the row(s) whose closing price equals
# that symbol's maximum closing price
gafa_stock |>
  select(Symbol, Date, Close) |>
  group_by(Symbol) |>
  filter(Close == max(Close))
Download the file tute1.csv from the book website, open it in Excel (or some other spreadsheet application), and review its contents. You should find four columns of information. Columns B through D each contain a quarterly series, labelled Sales, AdBudget and GDP. Sales contains the quarterly sales for a small company over the period 1981-2005. AdBudget is the advertising budget and GDP is the gross domestic product. All series have been adjusted for inflation.
# importing readr
library(readr)
# Read tute1.csv directly from its URL
tute1 <- read_csv(
  file = "https://raw.githubusercontent.com/riverar9/cuny-msds/main/data624-predictive-analytics/homework/homework-1/tute1.csv"
)
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Sales, AdBudget, GDP
## date (1): Quarter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the imported data in the data viewer
View(tute1)
# Build the tsibble: convert Quarter with yearquarter(), then use it as the
# index
mytimeseries <- as_tsibble(
  mutate(tute1, Quarter = yearquarter(Quarter)),
  index = Quarter
)
head(mytimeseries)
# Reshape to long format and draw one panel per series, each panel with its
# own y scale
ggplot(
  pivot_longer(mytimeseries, -Quarter),
  aes(x = Quarter, y = value, colour = name)
) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")
Check what happens when you don’t include facet_grid()
# Same plot without facet_grid(): every series is drawn in a single panel
ggplot(
  pivot_longer(mytimeseries, -Quarter),
  aes(x = Quarter, y = value, colour = name)
) +
  geom_line()
Without facet_grid(), all of the series are drawn on the same chart. In my opinion this is more helpful, as it provides immediate insight into the relative values of these time series against each other.
The USgas package contains data on the demand for natural gas in the US.
# Installing the package.
#install.packages("USgas")
# Importing USgas, tsibble and tibble
library(USgas)
## Warning: package 'USgas' was built under R version 4.3.3
library(tsibble)
library(tibble)
# Open the us_total help page, then load the data and preview its first rows
help("us_total")
## starting httpd help server ... done
data("us_total")
us_total |>
  head()
# Convert us_total to a tsibble keyed by state, with year as the index
us_total_tsibble <- as_tsibble(us_total, key = state, index = year)
us_total_tsibble
# Keep only the rows belonging to the six states of interest
filtered_states <- filter(
  us_total_tsibble,
  state %in% c(
    "Maine", "Vermont", "New Hampshire",
    "Massachusetts", "Connecticut", "Rhode Island"
  )
)
# Plot the annual consumption by state
# The extra geom_line() layer thickens the lines. `linewidth` replaces `size`,
# which is deprecated for lines since ggplot2 3.4.0.
autoplot(filtered_states, y) +
  geom_line(linewidth = 1.5)
# Import the readxl and httr libraries
library(readxl)
library(httr)
# Specify the file URL
file_url <- "https://github.com/riverar9/cuny-msds/raw/main/data624-predictive-analytics/homework/homework-1/tourism.xlsx"
# Create the temporary .xlsx path up front rather than assigning it inside the
# write_disk() call, so temp_file is defined independently of how GET()
# evaluates its arguments
temp_file <- tempfile(fileext = ".xlsx")
# Download the file to the temporary path
response <- GET(file_url, write_disk(temp_file))
# Stop with an error on an HTTP failure instead of reading a bad download
stop_for_status(response)
# Print the response summary
response
## Response [https://raw.githubusercontent.com/riverar9/cuny-msds/main/data624-predictive-analytics/homework/homework-1/tourism.xlsx]
## Date: 2024-09-09 00:00
## Status: 200
## Content-Type: application/octet-stream
## Size: 679 kB
## <ON DISK> C:\Users\Richie\AppData\Local\Temp\Rtmpg9OTrd\file29f44cf17325.xlsx
# Load the downloaded workbook
tourism <- read_excel(path = temp_file)
# Remove the temporary download now that it has been read
file.remove(temp_file)
## [1] TRUE
# Preview the first rows
tourism |>
  head()
# Convert tourism into a tsibble: Quarter (as yearquarter) is the index, and
# Region, State and Purpose together form the key
tourism_ts <- as_tsibble(
  mutate(tourism, Quarter = yearquarter(Quarter)),
  key = c(Region, State, Purpose),
  index = Quarter
)
head(tourism_ts)
# List the key variables of the tsibble
key(tourism_ts)
## [[1]]
## Region
##
## [[2]]
## State
##
## [[3]]
## Purpose
# Show the index variable of the tsibble
index(tourism_ts)
## Quarter
# Using the tibble, we'll:
# 1. group by region and purpose
# 2. calculate the average trip by the group
# 3. drop the grouping in the same step with `.groups = "drop"`, which also
#    stops summarize() from emitting its "grouped output" message
# 4. filter to display the entry that has the maximum value of trip_avg
tourism |>
  group_by(Region, Purpose) |>
  summarize(trip_avg = mean(Trips), .groups = "drop") |>
  filter(trip_avg == max(trip_avg))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
# Using the tourism tsibble, total the trips for each State. Only State is a
# grouping variable, so Region and Purpose are combined in the sum.
summarize(
  group_by(tourism_ts, State),
  total_trips = sum(Trips)
)
Use the following graphics functions: autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF() and explore features from the following time series: “Total Private” Employed from us_employment, Bricks from aus_production, Hare from pelt, “H02” Cost from PBS, and Barrels from us_gasoline.
All of these are answered in their respective cells below
# Load the five datasets used in this exercise with a single call
data(us_employment, aus_production, pelt, PBS, us_gasoline)
# Inspect our datasets
# View() is called once per dataset to open it in the data viewer
View(us_employment)
View(aus_production)
View(pelt)
View(PBS)
View(us_gasoline)
# us_employment: Check for seasonality, cyclicality and trend.
# Build the "Total Private" series once and reuse it for all three plots,
# instead of repeating the same filter/select pipeline inside every call.
total_private <- us_employment |>
  filter(Title == "Total Private") |>
  select(Month, Employed)
# Time plot
autoplot(total_private, Employed)
# Seasonal plot
gg_season(total_private, Employed)
# Seasonal subseries plot
gg_subseries(total_private, Employed)
From the 1st plot above, we can see that the value of “Employed” increases as time goes on. In the seasonal plot, we can see that the rate of that growth seems to slow or even decrease in the summer months (June onward). This is especially true in the more recent years (values along the top of the plot).
# Time plot of quarterly brick production
aus_production |>
  select(Quarter, Bricks) |>
  autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Seasonal plot of quarterly brick production
aus_production |>
  select(Quarter, Bricks) |>
  gg_season(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Seasonal subseries plot of quarterly brick production
aus_production |>
  select(Quarter, Bricks) |>
  gg_subseries(Bricks)
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).
There seems to have been great growth from 1960 to 1980; since then there seems to be stagnation and a decrease that appears to begin in 1990. In the season plot, we can see that Q1 often sees the lowest values and Q3 sees the highest. We can also see this in the subseries plot, where the Q3 mean is notably higher than those of the other quarters.
# Build the Hare series once and reuse it for the three calls below
hare_series <- pelt |>
  select(Year, Hare)
# Time plot
autoplot(hare_series, Hare)
# Lag plot drawn with points
gg_lag(hare_series, Hare, geom = "point")
# Autocorrelation table. `geom` is a plotting option and is not passed to
# ACF(): there it was captured by `...` and triggered the deprecation and
# "only supports one column" warnings.
ACF(hare_series, Hare)
## Warning: The `...` argument of `PACF()` is deprecated as of feasts 0.2.2.
## ℹ ACF variables should be passed to the `y` argument. If multiple variables are
## to be used, specify them using `vars(...)`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ACF currently only supports one column, `Hare` will be used.
The hare pelts dataset seems to have a series of peaks and valleys. Because the data is annual, we won’t be able to see any within-year (seasonal) patterns, but we can see that over the years there are periods of strong year-over-year growth followed by periods of sharp decline. Looking at the lag plot, we can see that there seems to be a correlation between the data and lag 1, indicating that there may be a relationship between one year’s trades and the next. Looking at the results of ACF, we see that lag 1 is a bit of an exception, and the correlation is not very strong (65.8%).
# Build the H02 cost data once (rows of PBS where ATC2 is "H02") and reuse it
# for all three plots, instead of repeating the same filter/select pipeline
# inside every call.
h02_cost <- PBS |>
  filter(ATC2 == "H02") |>
  select(Month, Cost)
# Time plot
autoplot(h02_cost, Cost)
# Seasonal plot
gg_season(h02_cost, Cost)
# Seasonal subseries plot
gg_subseries(h02_cost, Cost)
From the first plot we can see that there is some cyclicality to each of these metrics and that some of them remain relatively constant while others increase over time. This cyclicality can be observed better using the season plot, where we see that the concessional and general safety nets have a sharp decline from January to February and then slowly ramp up for the remainder of the year. A new insight that we can see here is that the concessional co-payments seem to show an inverse relationship to the safety net metrics.
Lastly, from the sub-series we can see more evidence for the seasonality and trends we’ve noticed in the other plots.
# Time plot of Barrels from us_gasoline
us_gasoline |>
  autoplot(Barrels)
# Seasonal plot of the same series
us_gasoline |>
  gg_season(Barrels)
From the first plot, we can see that the number of barrels per day increases until around 2006 where the trend reverses and it seems to decrease a bit and then remain steady.
The values on a weekly basis jump greatly, making it difficult to see if there is any seasonality.
# Seasonal subseries plot of Barrels from us_gasoline
us_gasoline |>
  gg_subseries(Barrels)
By looking at the mean value (blue line) in the subseries plot, we can see that the number of barrels a day increases as the weeks go on until around week 35 which is where the mean begins to decrease.
# Lag plot of Barrels from us_gasoline
us_gasoline |>
  gg_lag(Barrels)
# Print the ACF table, allowing up to 100 rows to be displayed
us_gasoline |>
  ACF(Barrels) |>
  print(n = 100)
## # A tsibble: 31 x 2 [1W]
## lag acf
## <cf_lag> <dbl>
## 1 1W 0.893
## 2 2W 0.882
## 3 3W 0.873
## 4 4W 0.866
## 5 5W 0.847
## 6 6W 0.844
## 7 7W 0.832
## 8 8W 0.831
## 9 9W 0.822
## 10 10W 0.808
## 11 11W 0.801
## 12 12W 0.792
## 13 13W 0.783
## 14 14W 0.779
## 15 15W 0.769
## 16 16W 0.768
## 17 17W 0.763
## 18 18W 0.747
## 19 19W 0.736
## 20 20W 0.737
## 21 21W 0.724
## 22 22W 0.717
## 23 23W 0.709
## 24 24W 0.704
## 25 25W 0.701
## 26 26W 0.704
## 27 27W 0.699
## 28 28W 0.699
## 29 29W 0.700
## 30 30W 0.703
## 31 31W 0.708
From the lag plot and the ACF results, we can see that the correlation is consistently strong.