This assignment explores several time series datasets to uncover trends, seasonal patterns, and cycles. Through exploratory analysis and visualization, it highlights real-world fluctuations in production, stock prices, electricity demand, and animal populations, and it emphasizes the importance of proper data preparation for accurate insights.
# Check dataset structure
glimpse(aus_production)
## Rows: 218
## Columns: 7
## $ Quarter <qtr> 1956 Q1, 1956 Q2, 1956 Q3, 1956 Q4, 1957 Q1, 1957 Q2, 1957…
## $ Beer <dbl> 284, 213, 227, 308, 262, 228, 236, 320, 272, 233, 237, 313…
## $ Tobacco <dbl> 5225, 5178, 5297, 5681, 5577, 5651, 5317, 6152, 5758, 5641…
## $ Bricks <dbl> 189, 204, 208, 197, 187, 214, 227, 222, 199, 229, 249, 234…
## $ Cement <dbl> 465, 532, 561, 570, 529, 604, 603, 582, 554, 620, 646, 637…
## $ Electricity <dbl> 3923, 4436, 4806, 4418, 4339, 4811, 5259, 4735, 4608, 5196…
## $ Gas <dbl> 5, 6, 7, 6, 5, 7, 7, 6, 5, 7, 8, 6, 5, 7, 8, 6, 6, 8, 8, 7…
glimpse(pelt)
## Rows: 91
## Columns: 3
## $ Year <dbl> 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855,…
## $ Hare <dbl> 19580, 19600, 19610, 11990, 28040, 58000, 74600, 75090, 88480, 61…
## $ Lynx <dbl> 30090, 45150, 49150, 39520, 21230, 8420, 5560, 5080, 10170, 19600…
glimpse(gafa_stock)
## Rows: 5,032
## Columns: 8
## Key: Symbol [4]
## $ Symbol <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAP…
## $ Date <date> 2014-01-02, 2014-01-03, 2014-01-06, 2014-01-07, 2014-01-08,…
## $ Open <dbl> 79.38286, 78.98000, 76.77857, 77.76000, 76.97285, 78.11429, …
## $ High <dbl> 79.57571, 79.10000, 78.11429, 77.99429, 77.93714, 78.12286, …
## $ Low <dbl> 78.86000, 77.20428, 76.22857, 76.84571, 76.95571, 76.47857, …
## $ Close <dbl> 79.01857, 77.28286, 77.70428, 77.14857, 77.63715, 76.64571, …
## $ Adj_Close <dbl> 66.96433, 65.49342, 65.85053, 65.37959, 65.79363, 64.95345, …
## $ Volume <dbl> 58671200, 98116900, 103152700, 79302300, 64632400, 69787200,…
glimpse(vic_elec)
## Rows: 52,608
## Columns: 5
## $ Time <dttm> 2012-01-01 00:00:00, 2012-01-01 00:30:00, 2012-01-01 01:0…
## $ Demand <dbl> 4382.825, 4263.366, 4048.966, 3877.563, 4036.230, 3865.597…
## $ Temperature <dbl> 21.40, 21.05, 20.70, 20.55, 20.40, 20.25, 20.10, 19.60, 19…
## $ Date <date> 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-0…
## $ Holiday <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…
# Generate time series plots
autoplot(aus_production, Bricks) +
ggtitle("Bricks Production in Australia")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
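This warning is expected: the final quarters of aus_production have no Bricks observations. A quick check, sketched below, lists the affected quarters.
# List the quarters with missing Bricks values (the source of the warning above)
aus_production %>%
  filter(is.na(Bricks)) %>%
  select(Quarter, Bricks)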
autoplot(pelt, Lynx) +
ggtitle("Lynx Pelts Data")
autoplot(gafa_stock, Close) +
ggtitle("Stock Closing Prices")
autoplot(vic_elec, Demand) +
ggtitle("Electricity Demand in Victoria") +
xlab("Year") +
ylab("Electricity Demand (MW)") +
theme_minimal()
gafa_stock %>%
group_by(Symbol) %>%
filter(Close == max(Close)) %>%
select(Date, Symbol, Close)
## # A tsibble: 4 x 3 [!]
## # Key: Symbol [4]
## # Groups: Symbol [4]
## Date Symbol Close
## <date> <chr> <dbl>
## 1 2018-10-03 AAPL 232.
## 2 2018-09-04 AMZN 2040.
## 3 2018-07-25 FB 218.
## 4 2018-07-26 GOOG 1268.
# Load CSV file
tute1 <- readr::read_csv("tute1.csv")
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Sales, AdBudget, GDP
## date (1): Quarter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check column names
glimpse(tute1)
## Rows: 100
## Columns: 4
## $ Quarter <date> 1981-03-01, 1981-06-01, 1981-09-01, 1981-12-01, 1982-03-01, …
## $ Sales <dbl> 1020.2, 889.2, 795.0, 1003.9, 1057.7, 944.4, 778.5, 932.5, 99…
## $ AdBudget <dbl> 659.2, 589.0, 512.5, 614.1, 647.2, 602.0, 530.7, 608.4, 637.9…
## $ GDP <dbl> 251.8, 290.9, 290.8, 292.4, 279.1, 254.0, 295.6, 271.7, 259.6…
# Convert data to time series
tute1_ts <- tute1 %>%
mutate(Quarter = yearquarter(Quarter)) %>%
as_tsibble(index = Quarter)
# Plot all three series
autoplot(tute1_ts, vars(Sales, AdBudget, GDP)) +
ggtitle("Quarterly Time Series: Sales, AdBudget, and GDP")
# Reshape data for facet_grid()
tute1_ts_long <- tute1_ts %>%
pivot_longer(cols = c(Sales, AdBudget, GDP), names_to = "name", values_to = "value")
# Verify structure
glimpse(tute1_ts_long)
## Rows: 300
## Columns: 3
## Key: name [3]
## $ Quarter <qtr> 1981 Q1, 1981 Q1, 1981 Q1, 1981 Q2, 1981 Q2, 1981 Q2, 1981 Q3,…
## $ name <chr> "Sales", "AdBudget", "GDP", "Sales", "AdBudget", "GDP", "Sales…
## $ value <dbl> 1020.2, 659.2, 251.8, 889.2, 589.0, 290.9, 795.0, 512.5, 290.8…
# Plot with facet_grid()
tute1_ts_long %>%
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line() +
facet_grid(name ~ ., scales = "free_y") +
labs(title = "Quarterly Trends: Sales, AdBudget, and GDP",
x = "Year", y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
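For comparison, a minimal sketch of the same data without facet_grid(): all three series then share one y-axis, which compresses the smaller GDP movements relative to Sales and AdBudget.
# Same data on a single panel: one common y-axis hides the smaller GDP variation
tute1_ts_long %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  ggtitle("Sales, AdBudget, and GDP on a Common Scale")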
##### Exercise 2.4: US Natural Gas Consumption
# Convert the US natural gas data (us_total, from the USgas package) to a tsibble
us_gas_ts <- us_total %>%
as_tsibble(index = year, key = state)
# Check column names
glimpse(us_gas_ts)
## Rows: 1,266
## Columns: 3
## Key: state [53]
## $ year <int> 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"…
## $ y <int> 324158, 329134, 337270, 353614, 332693, 379343, 350345, 382367, …
# Filter for the six New England states and plot annual consumption
us_gas_ts %>%
filter(state %in% c("Maine", "Vermont", "New Hampshire", "Massachusetts", "Connecticut", "Rhode Island")) %>%
ggplot(aes(x = year, y = y, colour = state)) + # consumption values are stored in column "y"
geom_line() +
ggtitle("Annual Natural Gas Consumption in New England")
# Load tourism data
tourism_data <- readxl::read_excel("tourism.xlsx")
library(lubridate) # Ensure lubridate is loaded
# Convert Quarter to yearquarter format
tourism_data <- tourism_data %>%
mutate(Quarter = yearquarter(ymd(Quarter))) # Convert character to proper year-quarter format
# Convert to tsibble
tourism_ts <- tourism_data %>%
as_tsibble(index = Quarter, key = c(Region, State, Purpose))
# Verify structure
glimpse(tourism_ts)
## Rows: 24,320
## Columns: 5
## Key: Region, State, Purpose [304]
## $ Quarter <qtr> 1998 Q1, 1998 Q2, 1998 Q3, 1998 Q4, 1999 Q1, 1999 Q2, 1999 Q3,…
## $ Region <chr> "Adelaide", "Adelaide", "Adelaide", "Adelaide", "Adelaide", "A…
## $ State <chr> "South Australia", "South Australia", "South Australia", "Sout…
## $ Purpose <chr> "Business", "Business", "Business", "Business", "Business", "B…
## $ Trips <dbl> 135.0777, 109.9873, 166.0347, 127.1605, 137.4485, 199.9126, 16…
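As a quick sanity check on the new tsibble, a single key combination can be plotted on its own; the choice of Adelaide business trips below is arbitrary (a minimal sketch).
# Plot one Region/Purpose series to confirm the tsibble structure
tourism_ts %>%
  filter(Region == "Adelaide", Purpose == "Business") %>%
  autoplot(Trips) +
  ggtitle("Quarterly Business Trips to Adelaide")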
##### Exercise 2.8: Analyzing Time Series Features
#### Total Private Employment
glimpse(us_employment)
## Rows: 143,412
## Columns: 4
## Key: Series_ID [148]
## $ Month <mth> 1939 Jan, 1939 Feb, 1939 Mar, 1939 Apr, 1939 May, 1939 Jun, …
## $ Series_ID <chr> "CEU0500000001", "CEU0500000001", "CEU0500000001", "CEU05000…
## $ Title <chr> "Total Private", "Total Private", "Total Private", "Total Pr…
## $ Employed <dbl> 25338, 25447, 25833, 25801, 26113, 26485, 26481, 26848, 2746…
# Filter only "Total Private" employment data before plotting
us_employment %>%
filter(Title == "Total Private") %>% # Select only relevant data
autoplot(Employed) +
ggtitle("Total Private Employment Over Time")
#### Bricks Production
autoplot(aus_production, Bricks) +
ggtitle("Bricks Production Over Time")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
#### Hare Population
autoplot(pelt, Hare) +
ggtitle("Hare Population Over Time")
#### H02 Cost From PBS
PBS %>%
filter(ATC2 == "H02") %>%
autoplot(Cost) +
ggtitle("H02 Cost Over Time")
#### US Gasoline Barrels
autoplot(us_gasoline, Barrels) +
ggtitle("US Gasoline Barrels Over Time")
### Conclusion

This assignment helped me understand time series analysis by exploring trends, seasonality, and cycles in different datasets. I observed upward trends in employment and stock prices, seasonal patterns in electricity demand and bricks production, and cyclic fluctuations in animal populations.

I also learned the importance of data preprocessing, such as handling duplicates and converting dates to the correct temporal index. Moving forward, I would explore forecasting techniques to predict future trends. Overall, this assignment gave me a better understanding of how time series data can be used to analyze real-world patterns and inform decision-making.
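As a pointer toward that next step, here is a minimal forecasting sketch using the fable package (loaded with fpp3): a seasonal naive model fitted to quarterly beer production and forecast two years ahead.
# Minimal forecasting sketch: seasonal naive forecasts of quarterly Beer production
aus_production %>%
  model(snaive = SNAIVE(Beer)) %>%
  forecast(h = "2 years") %>%
  autoplot(aus_production) +
  ggtitle("Seasonal Naive Forecast of Beer Production")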