## [1] "2025-03-26 15:23:45 EDT"
Consider the GDP information in the data set called global_economy, which is included in the fpp3 package (there is no need to load it from an external file).
# Choose a country to analyse (change the string to pick another country)
random_country <- "Brazil"

# Keep only the chosen country and derive GDP per capita from GDP and
# Population
gdp_data <- global_economy %>%
  filter(Country == random_country) %>%
  mutate(GDP_per_capita = GDP / Population)

# Plot GDP per capita over time; the title follows random_country
gdp_data %>%
  autoplot(GDP_per_capita) +
  labs(
    title = paste("GDP per Capita for", random_country),
    y = "GDP per Capita (USD)"
  )

# For each of the following series, make a graph of the data. If transforming
# seems appropriate, do so and describe the effect. Comment below in answer:
# 2a.Answer:
# Plot the GDP per capita series for the country stored in random_country
gdp_data %>%
  autoplot(GDP_per_capita) +
  labs(
    title = paste("GDP per Capita for", random_country),
    y = "GDP per Capita"
  )

# United States GDP from global_economy.
# 2b.Answer:
# Filter data for the United States
us_gdp <- global_economy %>%
  filter(Country == "United States")

# Plot GDP over time
us_gdp %>%
  autoplot(GDP) +
  labs(title = "United States GDP Over Time", y = "GDP")

# Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock
# 2c.Answer:
# Filter data for Victorian bulls, bullocks, and steers
victorian_slaughter <- aus_livestock %>%
  filter(Animal == "Bulls, bullocks and steers", State == "Victoria")

# Plot the slaughter counts over time
victorian_slaughter %>%
  autoplot(Count) +
  labs(
    title = "Slaughter of Victorian Bulls, Bullocks, and Steers",
    y = "Count"
  )

# Victorian Electricity Demand from vic_elec.
# 2d.Answer:
# Plot Victorian electricity demand
vic_elec %>%
  autoplot(Demand) +
  labs(title = "Victorian Electricity Demand", y = "Demand")

# Gas production from aus_production.
# 2e.Answer:
# Time plot of the Gas column of aus_production
autoplot(aus_production, Gas) +
  labs(title = "Australian Gas Production", y = "Gas Production")
# 3a.Answer:
# Plot the data
canadian_gas %>%
  autoplot(Volume) +
  labs(title = "Canadian Gas Production", y = "Volume")

# Subseries plot
canadian_gas %>%
  gg_subseries(Volume) +
  labs(title = "Canadian Gas Production by Month", y = "Volume")

# Seasonal plot
canadian_gas %>%
  gg_season(Volume) +
  labs(title = "Seasonal Plot of Canadian Gas Production", y = "Volume")

# Do an STL decomposition of the data. You will need to choose a seasonal
# window to allow for the changing shape of the seasonal component.
# 3b.Answer:
# STL decomposition of Volume into trend, seasonal and remainder components.
# NOTE(review): trend() and season() are called without a window argument, so
# the package defaults apply; the exercise asks for a deliberately chosen
# seasonal window -- confirm the default is what is wanted.
stl_decomp <- canadian_gas %>%
  model(STL(Volume ~ trend() + season())) %>%
  components()

# Plot the decomposition
autoplot(stl_decomp)

# How does the seasonal shape change over time? [Hint: Try plotting the
# seasonal component using gg_season().]
# 3c.Answer:
# Plot the seasonal component (season_year) of the STL decomposition, one
# line per year, to see how the seasonal shape changes
stl_decomp %>%
  gg_season(season_year) +
  labs(title = "Seasonal Component Over Time", y = "Seasonal")

# Can you produce a plausible seasonally adjusted series? What are these
# numbers, plot the series.
# 3d.Answer:
# Plot the seasonally adjusted series (season_adjust column of the STL
# components)
stl_decomp %>%
  autoplot(season_adjust) +
  labs(title = "Seasonally Adjusted Canadian Gas Production", y = "Volume")

# For retail time series, use the below code:
# run the code
# Fix the RNG seed so the same Series ID is drawn on every run, then keep
# only the rows of aus_retail belonging to that one randomly chosen series.
set.seed(12345678)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

# Create a training dataset consisting of observations before 2011
Check that your data have been split appropriately by producing the following plot.
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
Check the residuals.
# 4d Answer:
# Do the residuals appear to be uncorrelated and normally distributed?
# Answ:
# Check residuals of the fitted model
gg_tsresiduals(fit)

# Produce forecasts for the test data with given code below:
# 4e Answer:
# Forecast for the test data: the rows of myseries that are not in
# myseries_train. anti_join() is called without `by`, so it joins on all
# shared columns and prints the "Joining, by = ..." message recorded below.
fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))

# Plot the forecasts on top of the full series
fc %>%
  autoplot(myseries) +
  labs(title = "Forecasts for Test Data", y = "Turnover")
## Joining, by = c(“State”, “Industry”, “Series ID”, “Month”, “Turnover”)
Compare the accuracy of your forecasts against the actual values with given code below:
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
Create a training set for Australian takeaway food turnover (aus_retail) by withholding the last four years as a test set.
Fit all the appropriate benchmark methods to the training set and forecast the periods covered by the test set.
Compute the accuracy of your forecasts. Which method does best?
## # A tibble: 24 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia… Takeawa… Mean Trai… -5.62e-17 4.28 3.36 -20.3 41.0 2.73
## 2 Australia… Takeawa… Naive Trai… 3.89e- 2 0.824 0.592 0.116 6.04 0.481
## 3 Australia… Takeawa… Seaso… Trai… 4.60e- 1 1.59 1.23 3.64 12.3 1
## 4 New South… Takeawa… Mean Trai… 6.65e-15 92.2 76.7 -21.8 44.7 3.78
## 5 New South… Takeawa… Naive Trai… 9.12e- 1 17.2 11.7 0.177 5.40 0.575
## 6 New South… Takeawa… Seaso… Trai… 9.29e+ 0 27.8 20.3 4.01 9.49 1
## 7 Northern … Takeawa… Mean Trai… 4.75e-16 5.51 4.03 -28.5 49.0 2.58
## 8 Northern … Takeawa… Naive Trai… 2.95e- 2 1.03 0.650 -0.0778 7.15 0.416
## 9 Northern … Takeawa… Seaso… Trai… 6.10e- 1 2.53 1.56 4.07 14.7 1
## 10 Queensland Takeawa… Mean Trai… 5.73e-15 74.6 61.8 -56.0 83.1 4.69
## # ℹ 14 more rows
## # ℹ 2 more variables: RMSSE <dbl>, ACF1 <dbl>
Do the residuals from the best method resemble white noise?
# 5d.Answer:
# Keep only the model judged best (swap 'Naive' for whichever model the
# accuracy results actually favour)
best_model <- select(fit_benchmark, Naive)

# augment() provides the .resid column used by every check below
residuals_best <- augment(best_model)

# Residuals over time, with a dashed red reference line at zero
ggplot(residuals_best, aes(x = Month, y = .resid)) +
  geom_line() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  ggtitle("Residuals Over Time") +
  theme_minimal()

# Histogram of residuals (Check normality)
ggplot(residuals_best, aes(x = .resid)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
  ggtitle("Histogram of Residuals") +
  theme_minimal()

# ACF plot to check autocorrelation in residuals
autoplot(ACF(residuals_best, .resid)) +
  ggtitle("Autocorrelation of Residuals") +
  theme_minimal()

# Ljung-Box test for white noise (p-value > 0.05 indicates white noise)
ljung_box_test <- features(residuals_best, .resid, ljung_box, lag = 10)
print(ljung_box_test)
## # A tibble: 8 × 5
## State Industry .model lb_stat lb_pvalue
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Australian Capital Territory Takeaway food services Naive 88.8 9.44e-15
## 2 New South Wales Takeaway food services Naive 134. 0
## 3 Northern Territory Takeaway food services Naive 42.1 7.14e- 6
## 4 Queensland Takeaway food services Naive 105. 0
## 5 South Australia Takeaway food services Naive 64.9 4.20e-10
## 6 Tasmania Takeaway food services Naive 64.1 6.05e-10
## 7 Victoria Takeaway food services Naive 116. 0
## 8 Western Australia Takeaway food services Naive 114. 0
The results of the Ljung-Box test (as shown in the table) indicate that the residuals for all states exhibit significant autocorrelation. Specifically, the p-values for all states are less than 0.05 (e.g., 9.44e-15, 0.000000e+00, 7.14e-06).
Since the p-values are extremely low, this suggests that the residuals are not white noise. White noise would indicate no autocorrelation, but the test results show that the residuals are not randomly distributed and exhibit autocorrelation. Therefore, the model may require improvements to better capture the underlying patterns in the data and eliminate the autocorrelation in the residuals.
Using the code below, get a series (it gets a series randomly by using sample() function):
# Fix the RNG seed so the same Series ID is drawn on every run, then keep
# only the rows of aus_retail belonging to that one randomly chosen series.
set.seed(12345678)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

# see head of your series to check it is a tsibble data, and remove NA’s if
# there is any with these commands:
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Northern Territory Clothing, footwear and perso… A3349767W 1988 Apr 2.3
## 2 Northern Territory Clothing, footwear and perso… A3349767W 1988 May 2.9
## 3 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jun 2.6
## 4 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jul 2.8
## 5 Northern Territory Clothing, footwear and perso… A3349767W 1988 Aug 2.9
## 6 Northern Territory Clothing, footwear and perso… A3349767W 1988 Sep 3
What is the name of the series you randomly choose? Write it.
## [1] "A3349767W"
Run a linear regression of Turnover on trend.(Hint: use TSLM() and trend() functions)
See the regression result by report() command.
## Series: Turnover
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0795 -1.1704 -0.1640 0.9683 7.4514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5313376 0.1983464 17.80 <2e-16 ***
## trend() 0.0307747 0.0009291 33.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.901 on 367 degrees of freedom
## Multiple R-squared: 0.7493, Adjusted R-squared: 0.7486
## F-statistic: 1097 on 1 and 367 DF, p-value: < 2.22e-16
By using this model, forecast it for the next 3 years. What are the values of the next 3 years, monthly values?
Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with daily total demands and maximum temperatures. Run the code below:
# Restrict vic_elec to January 2014 and aggregate it to one row per day:
# Demand is the daily total, Temperature the daily maximum.
jan_vic_elec <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(Demand = sum(Demand), Temperature = max(Temperature))

# Plot the data and find the regression model for Demand with temperature as
# a predictor variable. Why is there a positive relationship?
# 7a.Answer:
# Scatter plot of daily demand against maximum temperature, with a fitted
# linear regression line (geom_smooth(method = "lm"))
jan_vic_elec %>%
  ggplot(aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Electricity Demand vs Temperature", y = "Demand")

# Produce a residual plot. Is the model adequate? Are there any outliers or
# influential observations?
Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15°C, and compare it with the forecast if the maximum temperature was 35°C. Do you believe these forecasts?
# 7c.Answer:
# First ensure jan_vic_elec is a proper tsibble
jan_vic_elec <- jan_vic_elec %>%
  as_tsibble(index = Date)

# Build one-row tsibbles for the day after the last observation, one per
# temperature scenario (15 and 35 degrees)
new_data_15 <- tibble(
  Date = max(jan_vic_elec$Date) + days(1), # Next day
  Temperature = 15
) %>%
  as_tsibble(index = Date)

new_data_35 <- tibble(
  Date = max(jan_vic_elec$Date) + days(1), # Next day
  Temperature = 35
) %>%
  as_tsibble(index = Date)

# Forecast each scenario with the fitted regression model
fc_15 <- fit_elec %>% forecast(new_data = new_data_15)
fc_35 <- fit_elec %>% forecast(new_data = new_data_35)

# View both forecasts so they can be compared. Only fc_15 was printed before,
# although two fables are recorded; each print now sits above one of them.
fc_15
## # A fable: 1 x 5 [?]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 TSLM(Demand ~ Temperature) 2014-02-01
## # ℹ 3 more variables: Demand <dist>, .mean <dbl>, Temperature <dbl>
fc_35
## # A fable: 1 x 5 [?]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 TSLM(Demand ~ Temperature) 2014-02-01
## # ℹ 3 more variables: Demand <dist>, .mean <dbl>, Temperature <dbl>
Do you believe these forecasts? The following R code will get you started:
# Starter code: fit Demand ~ Temperature, forecast one step ahead with the
# temperature set to 15, and plot the forecast against the observed data
jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec)

# 7d.Answer:
# Plot the forecasts
fc_15 %>%
  autoplot(jan_vic_elec) +
  labs(title = "Forecast for 15°C", y = "Demand")

# Give prediction intervals for your forecasts.
## # A tsibble: 1 x 7 [?]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 TSLM(Demand ~ Temperature) 2014-02-01
## # ℹ 5 more variables: Demand <dist>, .mean <dbl>, Temperature <dbl>,
## # `80%` <hilo>, `95%` <hilo>
## # A tsibble: 1 x 7 [?]
## # Key: .model [1]
## .model Date
## <chr> <date>
## 1 TSLM(Demand ~ Temperature) 2014-02-01
## # ℹ 5 more variables: Demand <dist>, .mean <dbl>, Temperature <dbl>,
## # `80%` <hilo>, `95%` <hilo>
Read the shampoo data given in excel (Import Dataset as Excel)
#a. View the shampoo sales data. How many variables are there? Find how many rows and columns in the data?
#b. Is the data annual, monthly, quarterly? #c. Convert the data into tibble , then tsibble
#d. Plot the shampoo sales. What do you see from the data pattern? What does x-axis represent? # Comment here. Use plot() and autoplot().Put the name for y axis, and a title for the graph.
#e. What is the average, and median of shampoo sales. Put it on a histogram.
#f. Get a seasonal plot. What do you see? Is there any pattern? Is there any seasonality?
#g. Get a linear regression line with trend and dummy for each month (Hint: use trend and season in regression equation).
#h. Comment on each estimated coefficient of the model. Are they statistically significant at the 5% significance level?
#i. Which month has the highest sales?
#j. Forecast it for the next year. What are the values
#k. Plot the forecast with original data.
#l. Check if the residuals of the model is white noise.
#m. By using the regression model, forecast the 1 year ahead, and then check the accuracy of the forecast. What is MSE, RMSE values?
# Load necessary libraries
library(tidyverse)
library(tsibble)
library(fable)
library(feasts)
library(lmtest)
library(readxl)
# Read the dataset
# NOTE(review): absolute, user-specific path -- this only runs on the
# author's machine; consider a project-relative path.
shampoo_data <- read_excel("/Users/farihaarpa/Downloads/shampoo-2.xlsx")

# Convert to tibble
shampoo_tibble <- as_tibble(shampoo_data)

# Convert to tsibble: turn Month into a yearmonth and use it as the index
shampoo_tsibble <- as_tsibble(
  mutate(shampoo_tibble, Month = yearmonth(Month)),
  index = Month
)
# (d) Plot Shampoo Sales
# ggplot2 version: blue line of sales against Month
ggplot(shampoo_tsibble, aes(x = Month, y = sales)) +
  geom_line(color = "blue") +
  labs(title = "Shampoo Sales Over Time", y = "Shampoo Sales", x = "Month")

# autoplot() version of the same chart. This call used to be fused onto the
# end of the previous statement, which is a syntax error.
autoplot(shampoo_tsibble, sales) +
  labs(title = "Shampoo Sales Over Time", y = "Shampoo Sales", x = "Month")
# (e) Average, Median, and Histogram
# Summary statistics of sales, shown on the histogram below
avg_sales <- mean(shampoo_tsibble$sales)
median_sales <- median(shampoo_tsibble$sales)

# Histogram of sales with the mean (dashed red) and median (dotted green)
# drawn as vertical lines, as the task asks; their values are reported in the
# subtitle.
ggplot(shampoo_tsibble, aes(x = sales)) +
  geom_histogram(binwidth = 50, fill = "blue", color = "black", alpha = 0.7) +
  geom_vline(xintercept = avg_sales, color = "red", linetype = "dashed") +
  geom_vline(
    xintercept = median_sales, color = "darkgreen", linetype = "dotted"
  ) +
  labs(
    title = "Histogram of Shampoo Sales",
    subtitle = paste0(
      "Mean (dashed) = ", round(avg_sales, 1),
      ", median (dotted) = ", round(median_sales, 1)
    ),
    x = "Sales",
    y = "Frequency"
  )
# (f) Seasonal Plot
# Seasonal plot of sales drawn with gg_season()
shampoo_tsibble %>%
  gg_season(sales) +
  labs(title = "Seasonal Plot of Shampoo Sales")
# (g) Linear Regression Model with Trend and Monthly Dummies
# Add the regressors: a linear time index (row number) and the calendar month
# as a factor, so lm() creates one dummy per month
shampoo_tsibble <- mutate(
  shampoo_tsibble,
  trend = row_number(),
  month = factor(month(Month))
)

# Ordinary least squares fit of sales on trend plus month dummies
model <- lm(sales ~ trend + month, data = shampoo_tsibble)
summary(model)
##
## Call:
## lm(formula = sales ~ trend + month, data = shampoo_tsibble)
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.60 -62.32 -4.84 53.76 152.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.867 55.740 2.043 0.0527 .
## trend 11.754 1.534 7.664 8.88e-08 ***
## month2 -33.154 73.630 -0.450 0.6567
## month3 -53.808 73.678 -0.730 0.4726
## month4 -24.628 73.757 -0.334 0.7415
## month5 -56.015 73.869 -0.758 0.4560
## month6 -27.802 74.012 -0.376 0.7106
## month7 7.244 74.187 0.098 0.9231
## month8 -37.043 74.393 -0.498 0.6233
## month9 27.536 74.629 0.369 0.7155
## month10 -32.518 74.897 -0.434 0.6682
## month11 9.895 75.194 0.132 0.8964
## month12 -4.259 75.522 -0.056 0.9555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.16 on 23 degrees of freedom
## Multiple R-squared: 0.7592, Adjusted R-squared: 0.6336
## F-statistic: 6.043 on 12 and 23 DF, p-value: 0.0001161
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.8670 55.7403 2.0428 0.0527 .
## trend 11.7538 1.5336 7.6641 8.882e-08 ***
## month2 -33.1538 73.6298 -0.4503 0.6567
## month3 -53.8076 73.6777 -0.7303 0.4726
## month4 -24.6281 73.7575 -0.3339 0.7415
## month5 -56.0153 73.8690 -0.7583 0.4560
## month6 -27.8024 74.0121 -0.3756 0.7106
## month7 7.2438 74.1867 0.0976 0.9231
## month8 -37.0434 74.3925 -0.4979 0.6233
## month9 27.5361 74.6293 0.3690 0.7155
## month10 -32.5177 74.8967 -0.4342 0.6682
## month11 9.8951 75.1944 0.1316 0.8964
## month12 -4.2587 75.5221 -0.0564 0.9555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## # A tsibble: 1 x 4 [1M]
## Month sales trend month
## <mth> <dbl> <int> <fct>
## 1 1997 Sep 682 33 9
# (j) Forecast Next Year
# Fit an automatically selected ARIMA model to all observations and forecast
# the 12 months after the end of the data
fit <- shampoo_tsibble %>% model(ARIMA(sales))
forecast_vals <- fit %>% forecast(h = 12)

# (k) Plot Forecast
# Forecast with its intervals, plus the observed series in black
autoplot(forecast_vals) +
  autolayer(shampoo_tsibble, sales, color = "black") +
  labs(title = "Shampoo Sales Forecast", y = "Shampoo Sales", x = "Month")

# (m) Forecast One Year Ahead and Evaluate Accuracy
# accuracy() compares forecasts with actual values for the same months.
# Forecasts made beyond the end of the data have no actuals to compare with,
# which is why every measure came out NaN. Hold out the last 12 months as a
# test set, fit on the remainder, and score the forecasts against the held-out
# observations.
shampoo_train <- shampoo_tsibble %>%
  slice(seq_len(n() - 12))

# Task (m) asks for the regression model: trend plus monthly seasonal dummies.
# TSLM's trend()/season() specials let fable build the future regressors.
fit_reg <- shampoo_train %>%
  model(TSLM(sales ~ trend() + season()))
fc_test <- fit_reg %>% forecast(h = 12)

# RMSE is reported by accuracy(); MSE is its square
accuracy(fc_test, shampoo_tsibble) %>%
  mutate(MSE = RMSE^2)
## NOTE: the output below was recorded before this fix (out-of-sample ARIMA
## forecasts scored against data with no matching months); re-run to refresh.
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(sales) Test NaN NaN NaN NaN NaN NaN NaN NA
Summary of Shampoo Sales Analysis
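#1. Data Overview Monthly sales from Jan 1995 – Dec 1997 (36 months). Variables: Month (date) and sales (numeric). Trend: Strong upward growth over time. Seasonality: No clear seasonal pattern; in the regression the monthly effects are small relative to their standard errors (the largest positive effect is September, the most negative are March and May).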
#2. Key Statistics
Mean sales: ~318.6 Median sales: ~264.5 Highest sales month: Sept 1997 (682 units).
#3. Model Insights
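Regression Model: sales ~ trend + month. Trend: Positive & significant (about 11.75 units per month, p < 0.05). Monthly dummies: none is significant at the 5% level (all p-values are above 0.45), so the regression gives no statistical evidence of seasonality. Residuals: a white-noise check (e.g., a Ljung-Box test on the model residuals) is still needed before concluding that the model fits well.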
#4. Forecast for Next Year
Expected to follow rising trend + Q4 seasonality. Highest predicted sales in late 1998.