library(fpp3)
library(tidyverse)
Sys.time()
## [1] "2025-01-11 20:16:58 EST"

1.

Consider the GDP information in data set called global_economy, which is already embedded in fpp3 package (no need to upload externally)

1. Choose a random country by yourself. Then plot the GDP per capita for this country over time? How GDP per capita has changed over time for the series you chose? Explain briefly.

global_economy # see the data.
## # A tsibble: 15,150 x 9 [1Y]
## # Key:       Country [263]
##    Country     Code   Year         GDP Growth   CPI Imports Exports Population
##    <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
##  1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
##  2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
##  3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
##  4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
##  5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
##  6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414
##  7 Afghanistan AFG    1966 1399999967.     NA    NA   18.6     8.57   10152331
##  8 Afghanistan AFG    1967 1673333418.     NA    NA   14.2     6.77   10372630
##  9 Afghanistan AFG    1968 1373333367.     NA    NA   15.2     8.90   10604346
## 10 Afghanistan AFG    1969 1408888922.     NA    NA   15.0    10.1    10854428
## # ℹ 15,140 more rows
# 1.Answer:

gdp_India <- global_economy %>%
  filter(Country == "India") %>%
  mutate(GDP_per_capita = GDP / Population) %>%
  select(Year, GDP_per_capita)


autoplot(gdp_India, GDP_per_capita) +
  labs(title = "India's GDP per Capita Shows Gradual Growth from 1960s to 2000s,\n with Rapid Increase in Recent Years.",
       x = "Year",
       y = "GDP per Capita (USD)") +
  geom_smooth(method = "lm", se = FALSE, color = "Green", linetype = "dashed") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#The GDP per capita in India has generally increased over time, with steady growth from the 1960s until a sharp acceleration in the early 2000s. This recent rapid rise highlights significant economic progress, likely driven by structural reforms and technological advancements.

2.

For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect. Comment below in answer:

2a. Use the series you chose in #1.

# 2a.Answer:

gdp_India <- gdp_India%>%
  mutate(Log_GDP_per_capita = log(GDP_per_capita))

autoplot(gdp_India, Log_GDP_per_capita) +
  labs(title = "The log-transformed GDP per capita for India shows a consistent upward\n trend over time, indicating steady economic growth with\n reduced variability in recent years.",
       x = "Year",
       y = "Log of GDP per Capita") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

2b.

United States GDP from global_economy.

# 2b.Answer:

gdp_us <- global_economy %>%
  filter(Country == "United States") %>%
  select(Year, GDP)

# # Plotting US GDP over time
autoplot(gdp_us, GDP) +
  labs(title = "U.S. GDP Demonstrates a Strong Upward Trend, Growing from Approximately $5 Trillion\n in the 1960s to Over $20 Trillion by 2020, Reflecting Steady Economic Growth\n Throughout the Decades.",
       x = "Year",
       y = "GDP (USD)") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# The graph illustrates a consistent increase in U.S. GDP over time, showcasing steady economic growth, with an accelerating trend observed in recent decades.

2c.

Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock

# 2c.Answer:

victorian_cattle <- aus_livestock %>%
  filter(State == "Victoria", Animal == "Bulls, bullocks and steers")

# Plotting the number of slaughters over time
autoplot(victorian_cattle, Count) +
  labs(title = "
The Slaughter of Bulls, Bullocks, and Steers in Victoria Decreased from Over\n 125,000 in the 1980s to Approximately 50,000 in Recent Years.",
       x = "Year",
       y = "Count") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# The number of bulls, bullocks, and steers slaughtered in Victoria has decreased from over 125,000 in the 1980s to about 50,000 in recent years.

2d.

Victorian Electricity Demand from vic_elec.

# 2d.Answer:

vic_elec_monthly <- vic_elec %>%
  index_by(Month = ~ yearmonth(.)) %>%
  summarise(Demand = mean(Demand, na.rm = TRUE))


autoplot(vic_elec_monthly, Demand) +
  labs(title = "Victorian Electricity Demand Exhibits Seasonal Peaks in Summer,\nDemonstrating a Distinct Seasonal Trend.",
       x = "Date",
       y = "Electricity Demand (MW)") +
  theme_minimal()

#The monthly average electricity demand in Victoria reveals a distinct seasonal trend, with peaks generally occurring during the summer months, likely reflecting increased demand for cooling.

2e.

Gas production from aus_production.

# 2e.Answer:

gas_production <- aus_production %>%
  select(Quarter, Gas)


autoplot(gas_production, Gas) +
  labs(title = "Australian Gas Production Increased from Approximately 0 to Over 250 Petajoules,\nExhibiting Significant Seasonal Peaks Throughout the Period.",
       x = "Year",
       y = "Gas Production (in petajoules)") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#Australian gas production demonstrates a consistent upward trend with notable seasonal variations, indicating both steady growth and regular peak demands likely influenced by seasonal consumption patterns.

3. Use the canadian_gas data (monthly Canadian gas production in billions of cubic metres, January 1960 – February 2005).

2a. Plot the data using autoplot(), gg_subseries() , gg_season() to look at the effect of the changing seasonality over time. Describe the graphs in your own words. What do you see? What type pf pattern do you observe?

# 3a.Answer:

# General time series plot
autoplot(canadian_gas) +
  labs(title = "Canadian Monthly Gas Production(1960–2005)",
       x = "Year",
       y = "Gas Production (Billions of Cubic Metres)") +
  theme_minimal()
## Plot variable not specified, automatically selected `.vars = Volume`

# The time series plot shows a general upward trend in Canadian gas production from 1960 to 2005, with visible seasonal variations. These fluctuations become more prominent in recent years as production levels increase.

# Monthly Breakdown of Gas Production
gg_subseries(canadian_gas, Volume) +
  labs(title = "Monthly Trends in Canadian Gas Production",
       y = "Gas Production (Billions of Cubic Metres)") +
  theme_minimal()

# The monthly subseries plot reveals higher production levels in winter months, such as January and February, likely due to higher demand for heating. Production levels are lower in summer months, indicating a regular seasonal pattern.

# Seasonal Variation in Gas Production Over Time
gg_season(canadian_gas, Volume) +
  labs(title = "Seasonal Pattern in Canadian Gas Production",
       y = "Gas Production (Billions of Cubic Metres)") +
  theme_minimal()

# The seasonal plot highlights a consistent pattern, with production peaking in colder months and decreasing in warmer months. The amplitude of these seasonal changes increases over time, reflecting a growing variation in production as overall levels rise.

3b.

Do an STL decomposition of the data. You will need to choose a seasonal window to allow for the changing shape of the seasonal component.

# 3b.Answer:

canadian_gas_stl <- canadian_gas %>%
  model(STL(Volume ~ season(window = "periodic")))

# Plotting Decomposition Components
components(canadian_gas_stl) %>%
  autoplot() +
  labs(title = "STL Decomposition of Monthly Gas Production in Canada",
       x = "Year") +
  theme_minimal()

#  The STL decomposition shows a strong and growing seasonal pattern in Canadian gas production, with peaks in winter reflecting higher demand. This confirms the seasonal trends observed earlier. The remainder component reveals any irregularities or unexpected changes in production, potentially due to external factors or anomalies.

3c.

How does the seasonal shape change over time? [Hint: Try plotting the seasonal component using gg_season().]

# 3c.Answer:

# Performing STL decomposition with a periodic seasonal window
canadian_gas_stl <- canadian_gas %>%
  model(STL_decomp = STL(Volume ~ season(window = "periodic")))

# Extracting components from the STL decomposition
seasonal_component <- canadian_gas_stl %>%
  components() %>%
  as_tsibble() %>%
  select(Month, season_adjust)

gg_season(seasonal_component, season_adjust) +
  labs(title = "Evolution of the Seasonal Pattern in Canadian Gas Production",
       y = "Seasonal Component (Billions of Cubic Metres)") +
  theme_minimal()

# The seasonal component shows peak production during winter months, with the amplitude of these peaks increasing over time. This trend suggests a rising demand for gas in winter as production levels grow.

3d.

produce a plausible seasonally adjusted series? What are these numbers, plot the series.

# 3d.Answer:

# Perform STL decomposition with a periodic seasonal window
canadian_gas_stl <- canadian_gas %>%
  model(STL_decomp = STL(Volume ~ season(window = "periodic")))

# Generate the seasonally adjusted series by removing seasonal components
canadian_gas_seasonally_adjusted <- components(canadian_gas_stl) %>%
  mutate(Seasonally_Adjusted = Volume - season_year) %>%
  select(Month, Seasonally_Adjusted)

# Visualize the seasonally adjusted series
autoplot(canadian_gas_seasonally_adjusted, Seasonally_Adjusted) +
  labs(title = "Trend in Seasonally Adjusted Canadian Gas Production",
       x = "Year",
       y = "Gas Production (Billions of Cubic Metres)") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# The seasonally adjusted series reveals a clear upward trend in Canadian gas production from the 1960s to the early 2000s, highlighting consistent growth in production independent of seasonal fluctuations.

4.

For retail time series, use the below code:

# run the code
set.seed(12345678)

myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

4a.

Create a training dataset consisting of observations before 2011

myseries_train <- myseries %>%
  filter(year(Month) < 2011)

4b.

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

4c.

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

 #Answer:
    fit <- myseries_train %>%
      model(SNAIVE(Turnover))

4d.

Check the residuals.

# 4d Answer:

fit %>% gg_tsresiduals()

# Do the residuals appear to be uncorrelated and normally distributed?
# Answ: The residuals display an approximately normal distribution, forming a bell-shaped curve centered around zero, suggesting they may be normally distributed.

4e.

Produce forecasts for the test data with given code below:

# 4e Answer:
fc <- fit %>%  
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc %>% autoplot(myseries)

Joining, by = c(“State”, “Industry”, “Series ID”, “Month”, “Turnover”)

4f.

Compare the accuracy of your forecasts against the actual values with given code below:

fit %>% accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439  1.21 0.915  5.23  12.4     1     1 0.768
fc %>% accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test  0.836  1.55  1.24  5.94  9.06  1.36  1.28 0.601
# 4f Answ:

fc <- forecast(fit)
autoplot(fc, myseries)

accuracy(fc, myseries)
## # A tibble: 1 × 12
##   .model State Industry .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 SNAIV… Nort… Clothin… Test  -0.125 0.765   0.6 -1.14  4.59 0.656 0.631 -0.0541
# The model is quite accurate considering the RMSE Score of 1.228089

4g.

How sensitive are the accuracy measures to the amount of training data used?

# 4g Answer: The accuracy measures are influenced by the quantity of training data used; with limited data, errors may increase, and metrics such as RMSE, MAE, and MAPE can become less stable.

5.

5a.

Create a training set for Australian takeaway food turnover (aus_retail) by withholding the last four years as a test set.

# 5a.Answer:

# Filter the aus_retail dataset to include only takeaway food turnover data
takeaway_data <- aus_retail %>%
  filter(Industry == "Takeaway food services") %>%
  select(-Industry)  # Drop the Industry column

#Set the cutoff date by subtracting 48 months (4 years) from the latest date in the dataset
cutoff_date <- max(takeaway_data$Month) - 48 

# Define the training set with data before the cutoff date
takeaway_train <- takeaway_data %>%
  filter(Month < cutoff_date)

# Define the test set with data from the last four years
takeaway_test <- takeaway_data %>%
  filter(Month >= cutoff_date)

#  Display the structure of the training and test sets
takeaway_train
## # A tsibble: 3,064 x 4 [1M]
## # Key:       State [8]
##    State                        `Series ID`    Month Turnover
##    <chr>                        <chr>          <mth>    <dbl>
##  1 Australian Capital Territory A3349850K   1982 Apr      3.2
##  2 Australian Capital Territory A3349850K   1982 May      3.3
##  3 Australian Capital Territory A3349850K   1982 Jun      3.5
##  4 Australian Capital Territory A3349850K   1982 Jul      3.5
##  5 Australian Capital Territory A3349850K   1982 Aug      3.7
##  6 Australian Capital Territory A3349850K   1982 Sep      3.9
##  7 Australian Capital Territory A3349850K   1982 Oct      4  
##  8 Australian Capital Territory A3349850K   1982 Nov      4.3
##  9 Australian Capital Territory A3349850K   1982 Dec      4.3
## 10 Australian Capital Territory A3349850K   1983 Jan      3.9
## # ℹ 3,054 more rows
takeaway_test
## # A tsibble: 392 x 4 [1M]
## # Key:       State [8]
##    State                        `Series ID`    Month Turnover
##    <chr>                        <chr>          <mth>    <dbl>
##  1 Australian Capital Territory A3349850K   2014 Dec     21.1
##  2 Australian Capital Territory A3349850K   2015 Jan     17.3
##  3 Australian Capital Territory A3349850K   2015 Feb     17.3
##  4 Australian Capital Territory A3349850K   2015 Mar     19.9
##  5 Australian Capital Territory A3349850K   2015 Apr     20.1
##  6 Australian Capital Territory A3349850K   2015 May     20.6
##  7 Australian Capital Territory A3349850K   2015 Jun     20.5
##  8 Australian Capital Territory A3349850K   2015 Jul     21.5
##  9 Australian Capital Territory A3349850K   2015 Aug     21.3
## 10 Australian Capital Territory A3349850K   2015 Sep     20.9
## # ℹ 382 more rows

5b.

Fit all the appropriate benchmark methods to the training set and forecast the periods covered by the test set.

# 5b.Answer:


# Fit benchmark forecasting models to the training data
fit_benchmarks <- takeaway_train %>%
  model(
    Mean = MEAN(Turnover),
    Naive = NAIVE(Turnover),
    Seasonal_Naive = SNAIVE(Turnover),
  )

#  Generate forecasts for the test period using each benchmark model
fc_benchmarks <- fit_benchmarks %>%
  forecast(new_data = takeaway_test)

# Plot benchmark forecasts alongside actual values in the test set
fc_benchmarks %>%
  autoplot(takeaway_train, level = NULL) +
  autolayer(takeaway_test, Turnover, color = "yellow", linetype = "dashed") +
  labs(title = "Australian Takeaway Food Turnover: Benchmark Model Forecasts",
       x = "Date",
       y = "Turnover (AUD)") +
  theme_minimal()

5c.

Compute the accuracy of your forecasts. Which method does best?

# 5c.Answer:

# Calculate accuracy metrics for each benchmark model against the test set
accuracy_metrics <- fc_benchmarks %>%
  accuracy(takeaway_test)

# Display the table of accuracy metrics
accuracy_metrics
## # A tibble: 24 × 11
##    .model State         .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##    <chr>  <chr>         <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 Mean   Australian C… Test   13.1   13.5   13.1  54.8   54.8   NaN   NaN 0.819
##  2 Mean   New South Wa… Test  316.   320.   316.   59.3   59.3   NaN   NaN 0.718
##  3 Mean   Northern Ter… Test    9.70   9.85   9.70 49.9   49.9   NaN   NaN 0.689
##  4 Mean   Queensland    Test  166.   167.   166.   53.1   53.1   NaN   NaN 0.198
##  5 Mean   South Austra… Test   43.6   44.2   43.6  48.9   48.9   NaN   NaN 0.551
##  6 Mean   Tasmania      Test   13.6   13.9   13.6  49.9   49.9   NaN   NaN 0.706
##  7 Mean   Victoria      Test  171.   173.   171.   52.9   52.9   NaN   NaN 0.609
##  8 Mean   Western Aust… Test   97.2   98.3   97.2  59.5   59.5   NaN   NaN 0.791
##  9 Naive  Australian C… Test    2.64   3.94   3.22  9.74  12.8   NaN   NaN 0.819
## 10 Naive  New South Wa… Test   74.6   89.9   77.3  13.3   14.0   NaN   NaN 0.718
## # ℹ 14 more rows
# Determine the model with the best performance based on the lowest RMSE value
best_model <- accuracy_metrics %>%
  filter(RMSE == min(RMSE)) %>%
  select(.model, RMSE, MAE, MAPE)

# Output the performance metrics for the best model
best_model
## # A tibble: 1 × 4
##   .model  RMSE   MAE  MAPE
##   <chr>  <dbl> <dbl> <dbl>
## 1 Naive   2.94  2.53  12.6
# The Naive model demonstrates the highest accuracy among the benchmarks..

5d.

Do the residuals from the best method resemble white noise?

# 5d.Answer:

# Extract the optimal model based on RMSE
best_fit <- fit_benchmarks %>%
  select(!!best_model$.model)

# Compute residuals for the optimal model using the training data
residuals_best <- residuals(best_fit)

# Assess autocorrelation in the residuals
acf_residuals <- residuals_best %>%
  ACF(.resid) %>%
  autoplot() +
  labs(title = "Autocorrelation Function (ACF) of Residuals from the Optimal Model")

# Create a histogram and QQ plot to evaluate the normality of residuals
hist_residuals <- residuals_best %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(aes(y = ..density..), bins = 20, fill = "yellow", color = "black") +
  geom_density(color = "red") +
  labs(title = "Residuals Histogram", x = "Residuals", y = "Density")

qq_residuals <- residuals_best %>%
  ggplot(aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "QQ Plot of Residuals", x = "Theoretical Quantiles", y = "Sample Quantiles")

# Display the visualizations
acf_residuals

hist_residuals

qq_residuals

#  The residuals exhibit some autocorrelation, as indicated by the ACF plot, and show deviations from normality in the QQ plot. This suggests that the model may not have fully captured all underlying patterns in the data, indicating some remaining structure in the residuals.

6.

Using the code below, get a series (it gets a series randomly by using sample() function):

set.seed(12345678)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

see head of your series to check it is a tsibble data, and remove NA’s if there is any with these commands:

head(myseries)
## # A tsibble: 6 x 5 [1M]
## # Key:       State, Industry [1]
##   State              Industry                      `Series ID`    Month Turnover
##   <chr>              <chr>                         <chr>          <mth>    <dbl>
## 1 Northern Territory Clothing, footwear and perso… A3349767W   1988 Apr      2.3
## 2 Northern Territory Clothing, footwear and perso… A3349767W   1988 May      2.9
## 3 Northern Territory Clothing, footwear and perso… A3349767W   1988 Jun      2.6
## 4 Northern Territory Clothing, footwear and perso… A3349767W   1988 Jul      2.8
## 5 Northern Territory Clothing, footwear and perso… A3349767W   1988 Aug      2.9
## 6 Northern Territory Clothing, footwear and perso… A3349767W   1988 Sep      3
myseries =  myseries %>% filter(!is.na(`Series ID`))

6a.

What is the name of the series you randomly choose? Write it.

# 6a.Answer:

myseries_name <- unique(myseries$`Series ID`)
print(myseries_name)
## [1] "A3349767W"

6b.

Run a linear regression of Turnover on trend.(Hint: use TSLM() and trend() functions)

# 6b.Answer:

model <- myseries %>%
  model(Linear_Trend = TSLM(Turnover ~ trend()))

6c.

See the regression result by report() command.

# 6c.Answer:

report(model)
## Series: Turnover 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0795 -1.1704 -0.1640  0.9683  7.4514 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.5313376  0.1983464   17.80   <2e-16 ***
## trend()     0.0307747  0.0009291   33.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.901 on 367 degrees of freedom
## Multiple R-squared: 0.7493,  Adjusted R-squared: 0.7486
## F-statistic:  1097 on 1 and 367 DF, p-value: < 2.22e-16

6d.

By using this model, forecast it for the next 3 years. What are the values of the next 3 years, monthly values?

# 6d.Answer:
# Create a forecast for the next 3 years (36 months)
forecast_values <- model %>%
  forecast(h = 36)

# Show the forecasted values
forecast_values %>%
  as_tibble() %>%
  select(Month, .mean) %>%
  print()
## # A tibble: 36 × 2
##       Month .mean
##       <mth> <dbl>
##  1 2019 Jan  14.9
##  2 2019 Feb  14.9
##  3 2019 Mar  15.0
##  4 2019 Apr  15.0
##  5 2019 May  15.0
##  6 2019 Jun  15.1
##  7 2019 Jul  15.1
##  8 2019 Aug  15.1
##  9 2019 Sep  15.2
## 10 2019 Oct  15.2
## # ℹ 26 more rows

6d.

Plot the forecast values along with the original data.

# 6d.Answer:
# Create a forecast for the next 3 years (36 months)
forecast_values <- model %>%
  forecast(h = 36)

# Vizualize the original data and  forecasted values
autoplot(myseries, Turnover) +
  autolayer(forecast_values, .mean, series = "Forecast", PI = FALSE) +
  labs(title = "3-Year Turnover Forecast",
       x = "Month",
       y = "Turnover (AUD)") +
  theme_minimal()

6e.

Get the residuals from the model. And check the residuals to check whether or not it satisfies the requirements for white noise error terms.(hint: augment() and gg_tsresiduals() functions)

# 6e.Answer:

gg_tsresiduals(model)

#The residuals show patterns, autocorrelation at lag 12, and deviations from normality, indicating they do not resemble white noise.

7.

Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with daily total demands and maximum temperatures. Run the code below:

jan_vic_elec <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(Demand = sum(Demand), Temperature = max(Temperature))

7a.

Plot the data and find the regression model for Demand with temperature as a predictor variable. Why is there a positive relationship?

# 7a.Answer:

# Visualize the relationship between Demand and Temperature
jan_vic_elec %>%
  ggplot(aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Relationship Between Daily Electricity Demand and Temperature (January 2014)",
       x = expression("Temperature ("* degree * "C)"),
       y = "Demand (MW)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Fit a linear regression model using Temperature as a predictor for Demand

model <- jan_vic_elec %>%
  model(lm_model = TSLM(Demand ~ Temperature))

# The observed positive correlation between temperature and electricity demand suggests that higher temperatures drive up demand, likely due to increased air conditioning use. This trend is especially evident on hotter days, where cooling needs lead to a direct increase in electricity consumption.

7b.

Produce a residual plot. Is the model adequate? Are there any outliers or influential observations?

# 7b.Answer:
gg_tsresiduals(model)

# The model may be insufficient, as the residuals show patterns and autocorrelation, suggesting potential outliers or influential points impacting the model's accuracy.

7c.

Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15∘C and compare it with the forecast if the with maximum temperature was 35∘C. Do you believe these forecasts?

# 7c.Answer:

electricity_demand_15 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec)+labs(title=expression("Projected Electricity Demand for Next Day with Max Temperature of 15" * degree * "C"))
electricity_demand_15

#Forecast electricity demand for the next day if the maximum temperature is 35°C

electricity_demand_35 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 35)
  ) %>%
  autoplot(jan_vic_elec)+labs(title=expression("Projected Electricity Demand for Next Day with Max Temperature of 35" * degree * "C"))
electricity_demand_35

# These forecasts offer a general indication of demand but should be interpreted with caution.

7d.

Do you believe these forecasts? The following R code will get you started:

 jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec)

# 7d.Answer:
#The forecast appears inaccurate, as the average demand has never dropped below 190000.

7e.

Give prediction intervals for your forecasts.

# 7e.Answer:

# Forecast and plot electricity demand for the next day assuming a maximum temperature of 15°C
electricity_demand_15 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec) +
  labs(title = expression("Next-Day Electricity Demand Forecast at Max Temperature of 15" * degree * "C"),
       y = "Electricity Demand (MW)", x = "Date")

# Display the forecast plot
print(electricity_demand_15)

# Forecast and plot electricity demand for the next day assuming a maximum temperature of 35°C
electricity_demand_35 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 35)
  ) %>%
  autoplot(jan_vic_elec) +
  labs(title = expression("Next-Day Electricity Demand Forecast at Max Temperature of 35"* degree * "C"),
       y = "Electricity Demand (MW)", x = "Date")

# Display the forecast plot
print(electricity_demand_35)

8.

Read the shampoo data given in excel (Import Dataset as Excel)

#a. View the shampoo sales data. How many variables are there? Find how many rows and columns in the data?
  library(readxl)
shampoo <- read_excel("~/Business Forecating/Midterm/shampoo-2.xlsx")
dim(shampoo)
## [1] 36  2
#b. Is the data annual, monthly, quarterly?
unique(shampoo$Month)
##  [1] "1995-01-01 UTC" "1995-02-01 UTC" "1995-03-01 UTC" "1995-04-01 UTC"
##  [5] "1995-05-01 UTC" "1995-06-01 UTC" "1995-07-01 UTC" "1995-08-01 UTC"
##  [9] "1995-09-01 UTC" "1995-10-01 UTC" "1995-11-01 UTC" "1995-12-01 UTC"
## [13] "1996-01-01 UTC" "1996-02-01 UTC" "1996-03-01 UTC" "1996-04-01 UTC"
## [17] "1996-05-01 UTC" "1996-06-01 UTC" "1996-07-01 UTC" "1996-08-01 UTC"
## [21] "1996-09-01 UTC" "1996-10-01 UTC" "1996-11-01 UTC" "1996-12-01 UTC"
## [25] "1997-01-01 UTC" "1997-02-01 UTC" "1997-03-01 UTC" "1997-04-01 UTC"
## [29] "1997-05-01 UTC" "1997-06-01 UTC" "1997-07-01 UTC" "1997-08-01 UTC"
## [33] "1997-09-01 UTC" "1997-10-01 UTC" "1997-11-01 UTC" "1997-12-01 UTC"
#c. Convert the data into tibble , then tsibble 
shampoo <- shampoo %>%
  mutate(monthly = yearmonth(Month)) %>%
  select(-Month) %>%
  as_tibble() %>%
  as_tsibble(index = monthly)
  
#d. Plot the shampoo sales. What do you see from the data pattern? What does x-axis represent? 
# Comment here. Use plot() and autoplot().Put the name for y axis, and a title for the graph.

plot(shampoo$monthly, shampoo$sales, type = "l", main = "Shampoo Sales Trends Over Time", xlab = "Date", ylab = "Sales")

autoplot(shampoo, sales) + 
  labs(title = "Shampoo Sales Trends Over Time", y = "Sales", x = "Date")

#e. What is the average, and median of shampoo sales. Put it on a histogram.

avg_sales <- mean(shampoo$sales, na.rm = TRUE)
median_sales <- median(shampoo$sales, na.rm = TRUE)

ggplot(shampoo, aes(x = sales)) +
  geom_histogram(binwidth = 50, fill = "yellow", alpha = 0.7) +
  labs(title = "Distribution of Shampoo Sales", x = "Sales", y = "Frequency") +
  geom_vline(aes(xintercept = avg_sales), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = median_sales), color = "blue", linetype = "dashed")

#f. Get seasonal plot. What do you see/ is there any pattern, is tehre any seasonality.
gg_season(shampoo, sales) +
  labs(title = "Seasonal Plot of Shampoo Sales", y = "Sales", x = "Month")

#g. Get a linear regression line with trend and dummy for each month (Hint: use trend and season in regression equation).
lm_model <- shampoo %>%
  model(TSLM(sales ~ trend() + season()))
report(lm_model)
## Series: sales 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -129.60  -62.32   -4.84   53.76  152.72 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     113.867     55.740   2.043   0.0527 .  
## trend()          11.754      1.534   7.664 8.88e-08 ***
## season()year2   -33.154     73.630  -0.450   0.6567    
## season()year3   -53.808     73.678  -0.730   0.4726    
## season()year4   -24.628     73.757  -0.334   0.7415    
## season()year5   -56.015     73.869  -0.758   0.4560    
## season()year6   -27.802     74.012  -0.376   0.7106    
## season()year7     7.244     74.187   0.098   0.9231    
## season()year8   -37.043     74.393  -0.498   0.6233    
## season()year9    27.536     74.629   0.369   0.7155    
## season()year10  -32.518     74.897  -0.434   0.6682    
## season()year11    9.895     75.194   0.132   0.8964    
## season()year12   -4.259     75.522  -0.056   0.9555    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 90.16 on 23 degrees of freedom
## Multiple R-squared: 0.7592,  Adjusted R-squared: 0.6336
## F-statistic: 6.043 on 12 and 23 DF, p-value: 0.00011612
#h. Comment on each estimated coefficient of the model.Are they statistically significant at 5 % significance level?

#No, the majority of them are not statistically significant at the 5% significance level.
  
#i. Which month has the highest sales?
highest_sales_month <- shampoo %>%
  filter(sales == max(sales, na.rm = TRUE)) %>%
  select(monthly, sales)
print(highest_sales_month)
## # A tsibble: 1 x 2 [1M]
##    monthly sales
##      <mth> <dbl>
## 1 1997 Sep   682
#j. Forecast it for the next year. What are the values
  
forecast_next_year <- lm_model %>%
  forecast(h = "1 year")
print(forecast_next_year)
## # A fable: 12 x 4 [1M]
## # Key:     .model [1]
##    .model                            monthly
##    <chr>                               <mth>
##  1 TSLM(sales ~ trend() + season()) 1998 Jan
##  2 TSLM(sales ~ trend() + season()) 1998 Feb
##  3 TSLM(sales ~ trend() + season()) 1998 Mar
##  4 TSLM(sales ~ trend() + season()) 1998 Apr
##  5 TSLM(sales ~ trend() + season()) 1998 May
##  6 TSLM(sales ~ trend() + season()) 1998 Jun
##  7 TSLM(sales ~ trend() + season()) 1998 Jul
##  8 TSLM(sales ~ trend() + season()) 1998 Aug
##  9 TSLM(sales ~ trend() + season()) 1998 Sep
## 10 TSLM(sales ~ trend() + season()) 1998 Oct
## 11 TSLM(sales ~ trend() + season()) 1998 Nov
## 12 TSLM(sales ~ trend() + season()) 1998 Dec
## # ℹ 2 more variables: sales <dist>, .mean <dbl>
#k. Plot the forecast with original data.

autoplot(shampoo, sales) +
  autolayer(forecast_next_year, series = "Forecast", PI = TRUE) +
  labs(title = "Shampoo Sales Forecast", y = "Sales", x = "Date")

#l. Check if the residuals of the model is white noise.

gg_tsresiduals(lm_model)

#The residuals do not seem to meet the criteria for white noise, as indicated by the presence of autocorrelation and discernible patterns.
  
#m. By using the regression model, forecast the 1 year ahead, and then check the accuracy of the forecast. What is MSE, RMSE values?

# Forecast for the next 12 months (1 year ahead)
forecast_next_year <- lm_model %>%
  forecast(h = "12 months")

# show forecasted values
print(forecast_next_year)
## # A fable: 12 x 4 [1M]
## # Key:     .model [1]
##    .model                            monthly
##    <chr>                               <mth>
##  1 TSLM(sales ~ trend() + season()) 1998 Jan
##  2 TSLM(sales ~ trend() + season()) 1998 Feb
##  3 TSLM(sales ~ trend() + season()) 1998 Mar
##  4 TSLM(sales ~ trend() + season()) 1998 Apr
##  5 TSLM(sales ~ trend() + season()) 1998 May
##  6 TSLM(sales ~ trend() + season()) 1998 Jun
##  7 TSLM(sales ~ trend() + season()) 1998 Jul
##  8 TSLM(sales ~ trend() + season()) 1998 Aug
##  9 TSLM(sales ~ trend() + season()) 1998 Sep
## 10 TSLM(sales ~ trend() + season()) 1998 Oct
## 11 TSLM(sales ~ trend() + season()) 1998 Nov
## 12 TSLM(sales ~ trend() + season()) 1998 Dec
## # ℹ 2 more variables: sales <dist>, .mean <dbl>
# Plotting the forecast and the original data
autoplot(shampoo, sales) +
  autolayer(forecast_next_year, series = "Forecast", PI = TRUE) +
  labs(title = "Shampoo Sales Forecast for Next Year", y = "Sales", x = "Date")

---
title: "EXAM I"
author: Akshitha Varada, avara3@unh.newhaven.edu
date: "`r Sys.Date()`"
output: openintro::lab_report
  
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, error = FALSE)

```


```{r message=FALSE, warning=FALSE}
library(fpp3)
library(tidyverse)
Sys.time()
```

### 1.	
Consider the GDP information in data set called global_economy, which is already embedded in fpp3 package (no need to upload externally) 

### 1.	Choose a random country by yourself. Then plot the GDP per capita for this country over time? How GDP per capita has changed over time for the series you chose? Explain briefly.

```{r}
global_economy # see the data.


# 1.Answer:

gdp_India <- global_economy %>%
  filter(Country == "India") %>%
  mutate(GDP_per_capita = GDP / Population) %>%
  select(Year, GDP_per_capita)


autoplot(gdp_India, GDP_per_capita) +
  labs(title = "India's GDP per Capita Shows Gradual Growth from 1960s to 2000s,\n with Rapid Increase in Recent Years.",
       x = "Year",
       y = "GDP per Capita (USD)") +
  geom_smooth(method = "lm", se = FALSE, color = "Green", linetype = "dashed") +
  theme_minimal()

#The GDP per capita in India has generally increased over time, with steady growth from the 1960s until a sharp acceleration in the early 2000s. This recent rapid rise highlights significant economic progress, likely driven by structural reforms and technological advancements.


```

### 2.	
For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect. Comment below in answer:

### 2a. Use the series you chose in #1.
```{r}

# 2a.Answer:

gdp_India <- gdp_India%>%
  mutate(Log_GDP_per_capita = log(GDP_per_capita))

autoplot(gdp_India, Log_GDP_per_capita) +
  labs(title = "The log-transformed GDP per capita for India shows a consistent upward\n trend over time, indicating steady economic growth with\n reduced variability in recent years.",
       x = "Year",
       y = "Log of GDP per Capita") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()


```

### 2b.	
United States GDP from global_economy.
```{r}


# 2b.Answer:

gdp_us <- global_economy %>%
  filter(Country == "United States") %>%
  select(Year, GDP)

# # Plotting US GDP over time
autoplot(gdp_us, GDP) +
  labs(title = "U.S. GDP Demonstrates a Strong Upward Trend, Growing from Approximately $5 Trillion\n in the 1960s to Over $20 Trillion by 2020, Reflecting Steady Economic Growth\n Throughout the Decades.",
       x = "Year",
       y = "GDP (USD)") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()

# The graph illustrates a consistent increase in U.S. GDP over time, showcasing steady economic growth, with an accelerating trend observed in recent decades.

```

### 2c.	
Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock
```{r}

# 2c.Answer:

victorian_cattle <- aus_livestock %>%
  filter(State == "Victoria", Animal == "Bulls, bullocks and steers")

# Plotting the number of slaughters over time
autoplot(victorian_cattle, Count) +
  labs(title = "
The Slaughter of Bulls, Bullocks, and Steers in Victoria Decreased from Over\n 125,000 in the 1980s to Approximately 50,000 in Recent Years.",
       x = "Year",
       y = "Count") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()

# The number of bulls, bullocks, and steers slaughtered in Victoria has decreased from over 125,000 in the 1980s to about 50,000 in recent years.
```

### 2d.
Victorian Electricity Demand from vic_elec.
```{r}


# 2d.Answer:

vic_elec_monthly <- vic_elec %>%
  index_by(Month = ~ yearmonth(.)) %>%
  summarise(Demand = mean(Demand, na.rm = TRUE))


autoplot(vic_elec_monthly, Demand) +
  labs(title = "Victorian Electricity Demand Exhibits Seasonal Peaks in Summer,\nDemonstrating a Distinct Seasonal Trend.",
       x = "Date",
       y = "Electricity Demand (MW)") +
  theme_minimal()

#The monthly average electricity demand in Victoria reveals a distinct seasonal trend, with peaks generally occurring during the summer months, likely reflecting increased demand for cooling.

```

### 2e.	
Gas production from aus_production.
```{r}


# 2e.Answer:

gas_production <- aus_production %>%
  select(Quarter, Gas)


autoplot(gas_production, Gas) +
  labs(title = "Australian Gas Production Increased from Approximately 0 to Over 250 Petajoules,\nExhibiting Significant Seasonal Peaks Throughout the Period.",
       x = "Year",
       y = "Gas Production (in petajoules)") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()

#Australian gas production demonstrates a consistent upward trend with notable seasonal variations, indicating both steady growth and regular peak demands likely influenced by seasonal consumption patterns.

```

### 3.	Use the canadian_gas data (monthly Canadian gas production in billions of cubic metres, January 1960 – February 2005).
#### 2a.	Plot the data using autoplot(), gg_subseries() , gg_season() to look at the effect of the changing seasonality over time. Describe the graphs in your own words. What do you see? What type pf pattern do you observe?

```{r warning=FALSE, error=FALSE}

# 3a.Answer:

# General time series plot
autoplot(canadian_gas) +
  labs(title = "Canadian Monthly Gas Production(1960–2005)",
       x = "Year",
       y = "Gas Production (Billions of Cubic Metres)") +
  theme_minimal()

# The time series plot shows a general upward trend in Canadian gas production from 1960 to 2005, with visible seasonal variations. These fluctuations become more prominent in recent years as production levels increase.

# Monthly Breakdown of Gas Production
gg_subseries(canadian_gas, Volume) +
  labs(title = "Monthly Trends in Canadian Gas Production",
       y = "Gas Production (Billions of Cubic Metres)") +
  theme_minimal()

# The monthly subseries plot reveals higher production levels in winter months, such as January and February, likely due to higher demand for heating. Production levels are lower in summer months, indicating a regular seasonal pattern.

# Seasonal Variation in Gas Production Over Time
gg_season(canadian_gas, Volume) +
  labs(title = "Seasonal Pattern in Canadian Gas Production",
       y = "Gas Production (Billions of Cubic Metres)") +
  theme_minimal()

# The seasonal plot highlights a consistent pattern, with production peaking in colder months and decreasing in warmer months. The amplitude of these seasonal changes increases over time, reflecting a growing variation in production as overall levels rise.

```

### 3b.
Do an STL decomposition of the data. You will need to choose a seasonal window to allow for the changing shape of the seasonal component.

```{r}

# 3b.Answer:

canadian_gas_stl <- canadian_gas %>%
  model(STL(Volume ~ season(window = "periodic")))

# Plotting Decomposition Components
components(canadian_gas_stl) %>%
  autoplot() +
  labs(title = "STL Decomposition of Monthly Gas Production in Canada",
       x = "Year") +
  theme_minimal()

#  The STL decomposition shows a strong and growing seasonal pattern in Canadian gas production, with peaks in winter reflecting higher demand. This confirms the seasonal trends observed earlier. The remainder component reveals any irregularities or unexpected changes in production, potentially due to external factors or anomalies.
```

### 3c.
How does the seasonal shape change over time? [Hint: Try plotting the seasonal component using gg_season().]
```{r}


# 3c.Answer:

# Performing STL decomposition with a periodic seasonal window
canadian_gas_stl <- canadian_gas %>%
  model(STL_decomp = STL(Volume ~ season(window = "periodic")))

# Extracting components from the STL decomposition
seasonal_component <- canadian_gas_stl %>%
  components() %>%
  as_tsibble() %>%
  select(Month, season_adjust)

gg_season(seasonal_component, season_adjust) +
  labs(title = "Evolution of the Seasonal Pattern in Canadian Gas Production",
       y = "Seasonal Component (Billions of Cubic Metres)") +
  theme_minimal()

# The seasonal component shows peak production during winter months, with the amplitude of these peaks increasing over time. This trend suggests a rising demand for gas in winter as production levels grow.
```

### 3d.	
produce a plausible seasonally adjusted series? What are these numbers, plot the series.
```{r}

# 3d.Answer:

# Perform STL decomposition with a periodic seasonal window
canadian_gas_stl <- canadian_gas %>%
  model(STL_decomp = STL(Volume ~ season(window = "periodic")))

# Generate the seasonally adjusted series by removing seasonal components
canadian_gas_seasonally_adjusted <- components(canadian_gas_stl) %>%
  mutate(Seasonally_Adjusted = Volume - season_year) %>%
  select(Month, Seasonally_Adjusted)

# Visualize the seasonally adjusted series
autoplot(canadian_gas_seasonally_adjusted, Seasonally_Adjusted) +
  labs(title = "Trend in Seasonally Adjusted Canadian Gas Production",
       x = "Year",
       y = "Gas Production (Billions of Cubic Metres)") +
    geom_smooth(method = "lm", se = FALSE, color = "green", linetype = "dashed") +
  theme_minimal()

# The seasonally adjusted series reveals a clear upward trend in Canadian gas production from the 1960s to the early 2000s, highlighting consistent growth in production independent of seasonal fluctuations.
```


### 4.
For retail time series, use the below code:

```{r}
# run the code
set.seed(12345678)

myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))


```

#### 4a. 
Create a training dataset consisting of observations before 2011 

```{r}
myseries_train <- myseries %>%
  filter(year(Month) < 2011)


```

#### 4b.	
Check that your data have been split appropriately by producing the following plot.

```{r}
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")
```

#### 4c.	
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
```{r}
 #Answer:
    fit <- myseries_train %>%
      model(SNAIVE(Turnover))
```


#### 4d.
Check the residuals.
```{r}

# 4d Answer:

fit %>% gg_tsresiduals()

# Do the residuals appear to be uncorrelated and normally distributed?
# Answ: The residuals display an approximately normal distribution, forming a bell-shaped curve centered around zero, suggesting they may be normally distributed.
```

#### 4e.
Produce forecasts for the test data with given code below:

```{r}
# 4e Answer:
fc <- fit %>%  
forecast(new_data = anti_join(myseries, myseries_train))
fc %>% autoplot(myseries)

```

Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")

#### 4f.	
Compare the accuracy of your forecasts against the actual values with given code below:
```{r}
fit %>% accuracy()
fc %>% accuracy(myseries)
# 4f Answ:

fc <- forecast(fit)
autoplot(fc, myseries)
accuracy(fc, myseries)

# The model is quite accurate considering the RMSE Score of 1.228089
```

#### 4g.
How sensitive are the accuracy measures to the amount of training data used?
```{r}

# 4g Answer: The accuracy measures are influenced by the quantity of training data used; with limited data, errors may increase, and metrics such as RMSE, MAE, and MAPE can become less stable.

```

### 5.	
#### 5a.	
Create a training set for Australian takeaway food turnover (aus_retail) by withholding the last four years as a test set. 
```{r}


# 5a.Answer:

# Filter the aus_retail dataset to include only takeaway food turnover data
takeaway_data <- aus_retail %>%
  filter(Industry == "Takeaway food services") %>%
  select(-Industry)  # Drop the Industry column

#Set the cutoff date by subtracting 48 months (4 years) from the latest date in the dataset
cutoff_date <- max(takeaway_data$Month) - 48 

# Define the training set with data before the cutoff date
takeaway_train <- takeaway_data %>%
  filter(Month < cutoff_date)

# Define the test set with data from the last four years
takeaway_test <- takeaway_data %>%
  filter(Month >= cutoff_date)

#  Display the structure of the training and test sets
takeaway_train
takeaway_test


```

#### 5b.	
Fit all the appropriate benchmark methods to the   training set and forecast the periods covered by the test set.
```{r}

# 5b.Answer:


# Fit benchmark forecasting models to the training data
fit_benchmarks <- takeaway_train %>%
  model(
    Mean = MEAN(Turnover),
    Naive = NAIVE(Turnover),
    Seasonal_Naive = SNAIVE(Turnover),
  )

#  Generate forecasts for the test period using each benchmark model
fc_benchmarks <- fit_benchmarks %>%
  forecast(new_data = takeaway_test)

# Plot benchmark forecasts alongside actual values in the test set
fc_benchmarks %>%
  autoplot(takeaway_train, level = NULL) +
  autolayer(takeaway_test, Turnover, color = "yellow", linetype = "dashed") +
  labs(title = "Australian Takeaway Food Turnover: Benchmark Model Forecasts",
       x = "Date",
       y = "Turnover (AUD)") +
  theme_minimal()

```

#### 5c.	
Compute the accuracy of your forecasts. Which method does best?
```{r}


# 5c.Answer:

# Calculate accuracy metrics for each benchmark model against the test set
accuracy_metrics <- fc_benchmarks %>%
  accuracy(takeaway_test)

# Display the table of accuracy metrics
accuracy_metrics

# Determine the model with the best performance based on the lowest RMSE value
best_model <- accuracy_metrics %>%
  filter(RMSE == min(RMSE)) %>%
  select(.model, RMSE, MAE, MAPE)

# Output the performance metrics for the best model
best_model

# The Naive model demonstrates the highest accuracy among the benchmarks..

```

#### 5d.
Do the residuals from the best method resemble white noise?
```{r}

# 5d.Answer:

# Extract the optimal model based on RMSE
best_fit <- fit_benchmarks %>%
  select(!!best_model$.model)

# Compute residuals for the optimal model using the training data
residuals_best <- residuals(best_fit)

# Assess autocorrelation in the residuals
acf_residuals <- residuals_best %>%
  ACF(.resid) %>%
  autoplot() +
  labs(title = "Autocorrelation Function (ACF) of Residuals from the Optimal Model")

# Create a histogram and QQ plot to evaluate the normality of residuals
hist_residuals <- residuals_best %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(aes(y = ..density..), bins = 20, fill = "yellow", color = "black") +
  geom_density(color = "red") +
  labs(title = "Residuals Histogram", x = "Residuals", y = "Density")

qq_residuals <- residuals_best %>%
  ggplot(aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "QQ Plot of Residuals", x = "Theoretical Quantiles", y = "Sample Quantiles")

# Display the visualizations
acf_residuals
hist_residuals
qq_residuals

#  The residuals exhibit some autocorrelation, as indicated by the ACF plot, and show deviations from normality in the QQ plot. This suggests that the model may not have fully captured all underlying patterns in the data, indicating some remaining structure in the residuals.
```


### 6.	
Using the code below, get a series (it gets a series randomly by using sample() function):

```{r}
set.seed(12345678)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
```
see head of your series to check it is a tsibble data, and remove NA’s if there is any with these commands:

```{r}
head(myseries)
myseries =  myseries %>% filter(!is.na(`Series ID`))
```

#### 6a.
What is the name of the series you randomly choose? Write it.
```{r}

# 6a.Answer:

myseries_name <- unique(myseries$`Series ID`)
print(myseries_name)


```

#### 6b. 
Run a linear regression of Turnover on trend.(Hint: use TSLM() and trend() functions)
```{r}
# 6b.Answer:

model <- myseries %>%
  model(Linear_Trend = TSLM(Turnover ~ trend()))



```

#### 6c. 
See the regression result by report() command.
```{r}
# 6c.Answer:

report(model)
```


#### 6d.	
By using this model, forecast it for the next 3 years. What are the values of the next 3 years, monthly values?
```{r}

# 6d.Answer:
# Create a forecast for the next 3 years (36 months)
forecast_values <- model %>%
  forecast(h = 36)

# Show the forecasted values
forecast_values %>%
  as_tibble() %>%
  select(Month, .mean) %>%
  print()


```

#### 6d.	
Plot the forecast values along with the original data.
```{r}

# 6d.Answer:
# Create a forecast for the next 3 years (36 months)
forecast_values <- model %>%
  forecast(h = 36)

# Vizualize the original data and  forecasted values
autoplot(myseries, Turnover) +
  autolayer(forecast_values, .mean, series = "Forecast", PI = FALSE) +
  labs(title = "3-Year Turnover Forecast",
       x = "Month",
       y = "Turnover (AUD)") +
  theme_minimal()


```

#### 6e.	
Get the residuals from the model. And check the residuals to check whether or not it satisfies the requirements for white noise error terms.(hint: augment() and gg_tsresiduals() functions)

```{r}

# 6e.Answer:

gg_tsresiduals(model)

#The residuals show patterns, autocorrelation at lag 12, and deviations from normality, indicating they do not resemble white noise.

```


### 7. 
Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with  daily total demands and maximum temperatures. Run the code below:

```{r}
jan_vic_elec <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(Demand = sum(Demand), Temperature = max(Temperature))

```

#### 7a. 
Plot the data and find the regression model for Demand with temperature as a predictor variable. Why is there a positive relationship?
```{r}

# 7a.Answer:

# Visualize the relationship between Demand and Temperature
jan_vic_elec %>%
  ggplot(aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Relationship Between Daily Electricity Demand and Temperature (January 2014)",
       x = expression("Temperature ("* degree * "C)"),
       y = "Demand (MW)") +
  theme_minimal()

# Fit a linear regression model using Temperature as a predictor for Demand

model <- jan_vic_elec %>%
  model(lm_model = TSLM(Demand ~ Temperature))

# The observed positive correlation between temperature and electricity demand suggests that higher temperatures drive up demand, likely due to increased air conditioning use. This trend is especially evident on hotter days, where cooling needs lead to a direct increase in electricity consumption.


```

#### 7b. 
Produce a residual plot. Is the model adequate? Are there any outliers or influential observations?

```{r}

# 7b.Answer:
gg_tsresiduals(model)

# The model may be insufficient, as the residuals show patterns and autocorrelation, suggesting potential outliers or influential points impacting the model's accuracy.

```

#### 7c.
Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15∘C and compare it with the forecast if the with maximum temperature was 35∘C. Do you believe these forecasts?

```{r}

# 7c.Answer:

electricity_demand_15 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec)+labs(title=expression("Projected Electricity Demand for Next Day with Max Temperature of 15" * degree * "C"))
electricity_demand_15

#Forecast electricity demand for the next day if the maximum temperature is 35°C

electricity_demand_35 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 35)
  ) %>%
  autoplot(jan_vic_elec)+labs(title=expression("Projected Electricity Demand for Next Day with Max Temperature of 35" * degree * "C"))
electricity_demand_35

# These forecasts offer a general indication of demand but should be interpreted with caution.

```

#### 7d.
Do you believe these forecasts? The following R code will get you started:
```{r}
 jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec)
```
  
```{r}

# 7d.Answer:
#The forecast appears inaccurate, as the average demand has never dropped below 190000.

```
 
#### 7e. 
Give prediction intervals for your forecasts.

```{r}
# 7e.Answer:

# Forecast and plot electricity demand for the next day assuming a maximum temperature of 15°C
electricity_demand_15 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan_vic_elec) +
  labs(title = expression("Next-Day Electricity Demand Forecast at Max Temperature of 15" * degree * "C"),
       y = "Electricity Demand (MW)", x = "Date")

# Display the forecast plot
print(electricity_demand_15)

# Forecast and plot electricity demand for the next day assuming a maximum temperature of 35°C
electricity_demand_35 <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan_vic_elec, 1) %>%
      mutate(Temperature = 35)
  ) %>%
  autoplot(jan_vic_elec) +
  labs(title = expression("Next-Day Electricity Demand Forecast at Max Temperature of 35"* degree * "C"),
       y = "Electricity Demand (MW)", x = "Date")

# Display the forecast plot
print(electricity_demand_35)



```


### 8.
Read the shampoo data given in excel (Import Dataset as Excel)
  
```{r}
#a.	View the shampoo sales data. How many variables are there? Find how many rows and columns in the data?
  library(readxl)
shampoo <- read_excel("~/Business Forecating/Midterm/shampoo-2.xlsx")
dim(shampoo)

#b.	Is the data annual, monthly, quarterly?
unique(shampoo$Month)
  
#c.	Convert the data into tibble , then tsibble 
shampoo <- shampoo %>%
  mutate(monthly = yearmonth(Month)) %>%
  select(-Month) %>%
  as_tibble() %>%
  as_tsibble(index = monthly)
  
#d.	Plot the shampoo sales. What do you see from the data pattern? What does x-axis represent? 
# Comment here. Use plot() and autoplot().Put the name for y axis, and a title for the graph.

plot(shampoo$monthly, shampoo$sales, type = "l", main = "Shampoo Sales Trends Over Time", xlab = "Date", ylab = "Sales")

autoplot(shampoo, sales) + 
  labs(title = "Shampoo Sales Trends Over Time", y = "Sales", x = "Date")
  
#e.	What is the average, and median of shampoo sales. Put it on a histogram.

avg_sales <- mean(shampoo$sales, na.rm = TRUE)
median_sales <- median(shampoo$sales, na.rm = TRUE)

ggplot(shampoo, aes(x = sales)) +
  geom_histogram(binwidth = 50, fill = "yellow", alpha = 0.7) +
  labs(title = "Distribution of Shampoo Sales", x = "Sales", y = "Frequency") +
  geom_vline(aes(xintercept = avg_sales), color = "red", linetype = "dashed") +
  geom_vline(aes(xintercept = median_sales), color = "blue", linetype = "dashed")

  
#f.	Get seasonal plot. What do you see/ is there any pattern, is tehre any seasonality.
gg_season(shampoo, sales) +
  labs(title = "Seasonal Plot of Shampoo Sales", y = "Sales", x = "Month")
  
#g.	Get a linear regression line with trend and dummy for each month (Hint: use trend and season in regression equation).
lm_model <- shampoo %>%
  model(TSLM(sales ~ trend() + season()))
report(lm_model)
  
#h.	Comment on each estimated coefficient of the model.Are they statistically significant at 5 % significance level?

#No, the majority of them are not statistically significant at the 5% significance level.
  
#i.	Which month has the highest sales?
highest_sales_month <- shampoo %>%
  filter(sales == max(sales, na.rm = TRUE)) %>%
  select(monthly, sales)
print(highest_sales_month)

  
#j.	Forecast it for the next year. What are the values
  
forecast_next_year <- lm_model %>%
  forecast(h = "1 year")
print(forecast_next_year)

#k.	Plot the forecast with original data.

autoplot(shampoo, sales) +
  autolayer(forecast_next_year, series = "Forecast", PI = TRUE) +
  labs(title = "Shampoo Sales Forecast", y = "Sales", x = "Date")
  
#l.	Check if the residuals of the model is white noise.

gg_tsresiduals(lm_model)

#The residuals do not seem to meet the criteria for white noise, as indicated by the presence of autocorrelation and discernible patterns.
  
#m.	By using the regression model, forecast the 1 year ahead, and then check the accuracy of the forecast. What is MSE, RMSE values?

# Forecast for the next 12 months (1 year ahead)
forecast_next_year <- lm_model %>%
  forecast(h = "12 months")

# show forecasted values
print(forecast_next_year)

# Plotting the forecast and the original data
autoplot(shampoo, sales) +
  autolayer(forecast_next_year, series = "Forecast", PI = TRUE) +
  labs(title = "Shampoo Sales Forecast for Next Year", y = "Sales", x = "Date")
  

```
    
