data624 assignment 2

#install and load library
#install.packages("fpp3")
library(fpp3)

## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──

## ✔ tibble      3.1.6     ✔ tsibble     1.1.3
## ✔ dplyr       1.0.7     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.1.4     ✔ feasts      0.3.0
## ✔ lubridate   1.8.0     ✔ fable       0.3.2
## ✔ ggplot2     3.3.5     ✔ fabletools  0.3.2

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(tsibble)
library(seasonal)

## 
## Attaching package: 'seasonal'

## The following object is masked from 'package:tibble':
## 
##     view

3.1 Consider the GDP information in global_economy. Plot the GDP per capita for each country over time. Which country has the highest GDP per capita? How has this changed over time?

# Column-by-column summary statistics for the global_economy data set.
global_economy %>% summary()

##            Country           Code            Year           GDP           
##  Afghanistan   :   58   ABW    :   58   Min.   :1960   Min.   :8.824e+06  
##  Albania       :   58   AFG    :   58   1st Qu.:1974   1st Qu.:2.155e+09  
##  Algeria       :   58   AGO    :   58   Median :1989   Median :1.483e+10  
##  American Samoa:   58   ALB    :   58   Mean   :1989   Mean   :1.034e+12  
##  Andorra       :   58   AND    :   58   3rd Qu.:2003   3rd Qu.:1.839e+11  
##  Angola        :   58   ARB    :   58   Max.   :2017   Max.   :8.074e+13  
##  (Other)       :14802   (Other):14802                  NA's   :3322       
##      Growth             CPI             Imports          Exports      
##  Min.   :-64.047   Min.   :   0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  1.617   1st Qu.:  13.46   1st Qu.: 21.61   1st Qu.: 17.54  
##  Median :  3.913   Median :  53.86   Median : 31.13   Median : 26.95  
##  Mean   :  3.909   Mean   :  56.93   Mean   : 38.03   Mean   : 33.39  
##  3rd Qu.:  6.242   3rd Qu.:  90.28   3rd Qu.: 49.08   3rd Qu.: 42.25  
##  Max.   :149.973   Max.   :4583.70   Max.   :427.58   Max.   :433.22  
##  NA's   :3756      NA's   :7480      NA's   :4554     NA's   :4563    
##    Population       
##  Min.   :4.279e+03  
##  1st Qu.:9.251e+05  
##  Median :6.345e+06  
##  Mean   :2.060e+08  
##  3rd Qu.:4.220e+07  
##  Max.   :7.530e+09  
##  NA's   :3

# I would like to check what kind of data global_economy contains.

# Preview the first rows to see the raw observations.
global_economy %>% head()

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country     Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
## 2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
## 3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
## 4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
## 5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
## 6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414

# look a little bit deeper to see the actual data

# One line per country showing GDP divided by Population over time.
# The legend is switched off via show.legend = FALSE.
autoplot(global_economy, GDP / Population, show.legend = FALSE) +
  labs(
    title = "GDP per capita for each country",
    y = "Dollar",
    x = "change over year"
  )

## Warning: Removed 3242 row(s) containing missing values (geom_path).

#set up a formula to get GDP per capita for each country

# GDP per capita for every country-year, sorted from highest to lowest.
# mutate() adds the ratio row by row and leaves Country as the only key;
# grouping by GDP and Population (as before) turned those measurements into
# key variables of the tsibble, which is not what they are.
GDPpercapita <- global_economy %>%
  mutate(GDPPerCap = GDP / Population) %>%
  # Keep only the columns needed to read off the ranking.
  select(Country, GDP, Population, Year, GDPPerCap) %>%
  arrange(desc(GDPPerCap))

#sorting from the highest to lowest

head(GDPpercapita)

## # A tsibble: 6 x 5 [1Y]
## # Key:       Country, GDP, Population [6]
## # Groups:    Country, GDP [6]
##   Country               GDP Population  Year GDPPerCap
##   <fct>               <dbl>      <dbl> <dbl>     <dbl>
## 1 Monaco        7060236168.      38132  2014   185153.
## 2 Monaco        6476490406.      35853  2008   180640.
## 3 Liechtenstein 6657170923.      37127  2014   179308.
## 4 Liechtenstein 6391735894.      36834  2013   173528.
## 5 Monaco        6553372278.      37971  2013   172589.
## 6 Monaco        6468252212.      38499  2016   168011.

#getting the top one: we can see Monaco has the highest GDPPerCap, followed by Liechtenstein.

3.2 For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect.

United States GDP from global_economy. Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock. Victorian Electricity Demand from vic_elec. Gas production from aus_production.

#United States GDP from global_economy

# Raw United States GDP series.
autoplot(
  filter(global_economy, Country == "United States"),
  GDP
) +
  labs(title = "United States GDP from global_economy")

# Same series divided by Population (a per-capita adjustment).
autoplot(
  filter(global_economy, Country == "United States"),
  GDP / Population
) +
  labs(title = "Edited United States GDP from global_economy")

#trying to see the GDP per Person like question 3.1


#Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock.

# Monthly slaughter counts for Victorian bulls, bullocks and steers.
# The title now closes the curly quotation mark that was left open.
aus_livestock %>%
  filter(State == "Victoria", Animal == "Bulls, bullocks and steers") %>%
  autoplot(Count) +
  labs(title = "Slaughter of Victorian “Bulls, bullocks and steers”")

# Same series aggregated to yearly totals. index_by() creates the Year
# grouping directly, so a separate mutate() step is not needed.
aus_livestock %>%
  filter(State == "Victoria", Animal == "Bulls, bullocks and steers") %>%
  index_by(Year = year(Month)) %>%
  summarise(Count = sum(Count)) %>%
  autoplot(Count) +
  labs(title = "Edited Slaughter of Victorian “Bulls, bullocks and steers”")

#trying to see the change by year, since the monthly plot is a bit too busy.


#Victorian Electricity Demand from vic_elec.

# Demand as recorded in vic_elec.
autoplot(vic_elec, Demand) +
  labs(title = "Victorian Electricity Demand")

# Demand summed within each calendar month: Date is converted to a
# year-month value and then used as the aggregation index.
vic_elec |>
  mutate(Date = yearmonth(Date)) |>
  index_by(Date) |>
  summarise(Demand = sum(Demand)) |>
  autoplot(Demand) +
  labs(title = "Edited Victorian Electricity Demand")

#try to use summarise(Demand = sum(Demand)) to make it more readable.

#Gas production from aus_production

# Untransformed gas production.
autoplot(aus_production, Gas) +
  labs(title = "Gas production")

# Box-Cox parameter chosen by the guerrero feature.
lambda <- pull(
  features(aus_production, Gas, features = guerrero),
  lambda_guerrero
)

# Gas production after applying the Box-Cox transformation with that lambda.
autoplot(aus_production, box_cox(Gas, lambda)) +
  labs(title = "Gas production")

#A good value of λ is one that makes the size of the seasonal variation about the same across the whole series.

3.3 Why is a Box-Cox transformation unhelpful for the canadian_gas data?

# Untransformed Canadian gas volume.
autoplot(canadian_gas, Volume)

# Box-Cox parameter chosen by the guerrero feature.
lambda <- pull(
  features(canadian_gas, Volume, features = guerrero),
  lambda_guerrero
)

# Volume after the Box-Cox transformation, for comparison with the raw plot.
autoplot(canadian_gas, box_cox(Volume, lambda))

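#the value of λ for canadian_gas does not make the size of the seasonal variation about the same across the whole series: the seasonal variation first grows and then shrinks as the level rises, so no single power transformation can stabilise it.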

3.4 What Box-Cox transformation would you select for your retail data (from Exercise 8 in Section 2.10)?

#from ex.8 in section 2.10
# Fix the seed so the randomly sampled retail series is reproducible.
set.seed(12345678)
myseries <- filter(
  aus_retail,
  `Series ID` == sample(aus_retail$`Series ID`, 1)
)

# Raw turnover for the sampled series.
myseries %>% autoplot(Turnover)

# Box-Cox parameter chosen by the guerrero feature.
lambda <- pull(
  features(myseries, Turnover, features = guerrero),
  lambda_guerrero
)

# Turnover after the Box-Cox transformation.
autoplot(myseries, box_cox(Turnover, lambda))

# Print the selected lambda.
lambda

## [1] 0.08303631

#the λ is 0.083

3.5 For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance. Tobacco from aus_production, Economy class passengers between Melbourne and Sydney from ansett, and Pedestrian counts at Southern Cross Station from pedestrian.

#Tobacco from aus_production
# Raw Tobacco series.
aus_production %>% autoplot(Tobacco)

## Warning: Removed 24 row(s) containing missing values (geom_path).

# Box-Cox parameter for Tobacco, chosen by the guerrero feature.
lambda <- pull(
  features(aus_production, Tobacco, features = guerrero),
  lambda_guerrero
)

# Tobacco after the Box-Cox transformation.
autoplot(aus_production, box_cox(Tobacco, lambda)) +
  labs(title = "Edited aus_production")

## Warning: Removed 24 row(s) containing missing values (geom_path).

#Economy class passengers between Melbourne and Sydney from ansett

# Economy-class rows for the MEL-SYD airport pair.
econfrommeltosyd <- filter(
  ansett,
  Class == "Economy",
  Airports == "MEL-SYD"
)
autoplot(econfrommeltosyd, Passengers)

# Box-Cox parameter chosen by the guerrero feature.
lambda <- pull(
  features(econfrommeltosyd, Passengers, features = guerrero),
  lambda_guerrero
)

# Passengers after the Box-Cox transformation.
autoplot(econfrommeltosyd, box_cox(Passengers, lambda)) +
  labs(title = "Edited econfrommeltosyd")

#Pedestrian counts at Southern Cross Station from pedestrian

# Weekly pedestrian totals for the Southern Cross Station sensor.
# Grouping by Sensor keeps it in the result while Count is summed per week.
Southerncount <- pedestrian |>
  filter(Sensor == "Southern Cross Station") |>
  group_by(Sensor) |>
  index_by(Week = yearweek(Date_Time)) |>
  summarise(Count = sum(Count))
autoplot(Southerncount, Count)

# Box-Cox parameter chosen by the guerrero feature.
lambda <- pull(
  features(Southerncount, Count, features = guerrero),
  lambda_guerrero
)

# Weekly counts after the Box-Cox transformation.
autoplot(Southerncount, box_cox(Count, lambda)) +
  labs(title = "Edited Southerncount")

3.7 Consider the last five years of the Gas data from aus_production.

`gas <- tail(aus_production, 5*4) |> select(Gas)` a. Plot the time series. Can you identify seasonal fluctuations and/or a trend-cycle? b. Use classical_decomposition with type=multiplicative to calculate the trend-cycle and seasonal indices. c. Do the results support the graphical interpretation from part a? d. Compute and plot the seasonally adjusted data. e. Change one observation to be an outlier (e.g., add 300 to one observation), and recompute the seasonally adjusted data. What is the effect of the outlier? f. Does it make any difference if the outlier is near the end rather than in the middle of the time series?

# Last 5 * 4 rows of aus_production, keeping the Gas column.
gas <- select(tail(aus_production, 5 * 4), Gas)
#get it from the question

 # Part a: time plot of the Gas subset.
 gas %>%
   autoplot(Gas) +
   labs(title = "plot for a.")

 #there is a seasonal pattern: production is low at the start of each year and rises to a peak in Q3.
 
 # Part b: fit a multiplicative classical decomposition to Gas.
 gas_multi <- model(
   gas,
   classical_decomposition(Gas, type = "multiplicative")
 )

 # Plot the components extracted from the fitted decomposition.
 autoplot(components(gas_multi)) +
   labs(title = "plot for b.")

## Warning: Removed 2 row(s) containing missing values (geom_path).

#ans for c: yes, the increasing trend in the plot for b matches the plot for a.

 # Part d: original Gas series with the seasonally adjusted series overlaid.
 # The two lines get different colours; previously both used the default
 # style and could not be told apart on the plot.
 components(gas_multi) %>%
   as_tsibble() %>%
   autoplot(Gas, colour = "gray") +
   geom_line(aes(y = season_adjust), colour = "#0072B2") +
   labs(title = "plot for d.")

 # Part e: add 300 to the highest Gas observation (252 in this window) and
 # recompute the seasonally adjusted series. The observation is selected with
 # max(Gas) rather than the literal 252, so the code still matches the comment
 # if the window changes.
 # NOTE(review): the maximum appears to fall late in the window rather than in
 # the middle -- confirm this is the position the question intends.
 gas %>%
   mutate(Gas = ifelse(Gas == max(Gas), Gas + 300, Gas)) %>%
   model(classical_decomposition(Gas, type = "multiplicative")) %>%
   components() %>%
   as_tsibble() %>%
   # Gray = series with the outlier, blue = seasonally adjusted series.
   autoplot(Gas, colour = "gray") +
   geom_line(aes(y = season_adjust), colour = "#0072B2")

# one line is not affected as much as the other.
 
 # Part f: add 300 to the final observation (2010 Q2, value 236) so the
 # outlier sits at the end of the series. The row is selected by position,
 # not by matching the value 236, so exactly one observation is changed even
 # if another quarter happens to share that value.
 gas %>%
   mutate(Gas = ifelse(row_number() == n(), Gas + 300, Gas)) %>%
   model(classical_decomposition(Gas, type = "multiplicative")) %>%
   components() %>%
   as_tsibble() %>%
   # Gray = series with the outlier, blue = seasonally adjusted series.
   autoplot(Gas, colour = "gray") +
   geom_line(aes(y = season_adjust), colour = "#0072B2")

  #2 lines are really close

3.8 Recall your retail time series data (from Exercise 8 in Section 2.10). Decompose the series using X-11. Does it reveal any outliers, or unusual features that you had not noticed previously?

# Re-create the sampled retail series with the same seed as before.
set.seed(12345678)
myseries <- filter(
  aus_retail,
  `Series ID` == sample(aus_retail$`Series ID`, 1)
)

# Fit the X-11 decomposition of Turnover and extract its components.
x11_decomposition <- components(
  model(myseries, x11 = X_13ARIMA_SEATS(Turnover ~ x11()))
)

# Plot the decomposition.
x11_decomposition %>%
  autoplot() +
  labs(title = "X11")

#from 2000 to 2010, it has less noise.

3.9 Figures 3.19 and 3.20 show the result of decomposing the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995.

Decomposition of the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995. Figure 3.19: Decomposition of the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995.

Seasonal component from the decomposition shown in the previous figure. Figure 3.20: Seasonal component from the decomposition shown in the previous figure.

Write about 3–5 sentences describing the results of the decomposition. Pay particular attention to the scales of the graphs in making your interpretation.

ans: We can see an increasing trend, apart from a drop around 1991; the trend is flatter during that period.

Is the recession of 1991/1992 visible in the estimated components?

ans: Yes, as mentioned earlier. I would be interested to see whether 2019 to 2023 counts as a recession if data and a plot like this one were available.

data624 assignment 2

Jay Lee

2/11/2023