Question 2
# a.
ggplot(data = olympic_running, mapping = aes(x = Year, y = Time, color = Sex, size = Length))+
geom_point() +
labs(y = "Time (in seconds)", title = "Winning Time Against the Year by Sex and Length")
## Warning: Removed 31 rows containing missing values (geom_point).

# I used scatter plot since length and sex is categorical
# b.
# Since I am running a regression line , I am going to have to split
# the data
length1 <- olympic_running %>%
filter(Length == 100)
length1
## # A tsibble: 54 x 4 [4Y]
## # Key: Length, Sex [2]
## Year Length Sex Time
## <int> <int> <chr> <dbl>
## 1 1896 100 men 12
## 2 1900 100 men 11
## 3 1904 100 men 11
## 4 1908 100 men 10.8
## 5 1912 100 men 10.8
## 6 1916 100 men NA
## 7 1920 100 men 10.8
## 8 1924 100 men 10.6
## 9 1928 100 men 10.8
## 10 1932 100 men 10.3
## # … with 44 more rows
## # ℹ Use `print(n = ...)` to see more rows
ggplot(length1, aes(x = Year, y = Time, color = Sex)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

# The decreasing average of time per year can be found by running the regression:
reg1 <- length1 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3443995 -0.1081360 0.0007715 0.0750701 0.9015537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.011581 2.347437 14.91 2.94e-14 ***
## Year -0.012612 0.001198 -10.52 7.24e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2321 on 26 degrees of freedom
## Multiple R-squared: 0.8099, Adjusted R-squared: 0.8026
## F-statistic: 110.8 on 1 and 26 DF, p-value: 7.2403e-11
# Thus, for men, there has been an average of -0.012612 decline of time in seconds as
# each year progresses for the 100 length race.
# Now for women:
reg2 <- length1 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44833 -0.11159 0.05775 0.11731 0.36057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.188164 3.697686 10.598 2.05e-09 ***
## Year -0.014185 0.001872 -7.577 3.72e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2232 on 19 degrees of freedom
## Multiple R-squared: 0.7513, Adjusted R-squared: 0.7382
## F-statistic: 57.4 on 1 and 19 DF, p-value: 3.7226e-07
# Therefore, for women, there has been a average decline of -0.014185 time in seconds
# for the 100 length race.
# Now for 200 length race:
length2 <- olympic_running %>%
filter(Length == 200)
ggplot(length2, aes(x = Year, y = Time, color = Sex)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# And now for the average decline
# men:
reg3 <- length2 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6112 -0.1872 -0.1046 0.1935 0.6959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.376498 3.553448 19.52 < 2e-16 ***
## Year -0.024881 0.001812 -13.73 3.8e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3315 on 25 degrees of freedom
## Multiple R-squared: 0.8829, Adjusted R-squared: 0.8782
## F-statistic: 188.5 on 1 and 25 DF, p-value: 3.7956e-13
# The average decline for 200 length race for men is -0.024881 seconds
# women:
reg4 <- length2 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93598 -0.39344 0.08043 0.37624 0.78230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.137613 11.267724 7.911 6.41e-07 ***
## Year -0.033633 0.005685 -5.916 2.17e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5005 on 16 degrees of freedom
## Multiple R-squared: 0.6863, Adjusted R-squared: 0.6667
## F-statistic: 35 on 1 and 16 DF, p-value: 2.1706e-05
# The average decline is -0.033633 seconds
# Now, for the 400 length race
length3 <- olympic_running %>%
filter(Length == 400)
ggplot(length3, aes(x = Year, y = Time, color = Sex))+
geom_point()+
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).

# And so
reg5 <- length3 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6001 -0.5747 -0.2858 0.5751 4.1505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 172.481477 11.487522 15.02 2.52e-14 ***
## Year -0.064574 0.005865 -11.01 2.75e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.136 on 26 degrees of freedom
## Multiple R-squared: 0.8234, Adjusted R-squared: 0.8166
## F-statistic: 121.2 on 1 and 26 DF, p-value: 2.7524e-11
# The average decline is -0.064574 seconds each year
reg6 <- length3 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1593 -1.0142 0.1024 0.7749 1.4797
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.39165 33.89755 3.817 0.00245 **
## Year -0.04008 0.01703 -2.353 0.03652 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.028 on 12 degrees of freedom
## Multiple R-squared: 0.3157, Adjusted R-squared: 0.2587
## F-statistic: 5.536 on 1 and 12 DF, p-value: 0.036523
# Average decline for women is -0.04008 seconds for the 400 length race
# Now, the 800
length4 <- olympic_running %>%
filter(Length == 800)
ggplot(length4, aes(x = Year, y = Time, color = Sex)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).

# And so the average decline for Time is:
reg7 <- length4 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7306 -1.8734 -0.8449 0.6851 12.9408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 405.8457 34.6506 11.712 1.21e-11 ***
## Year -0.1518 0.0177 -8.576 6.47e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.387 on 25 degrees of freedom
## Multiple R-squared: 0.7463, Adjusted R-squared: 0.7361
## F-statistic: 73.54 on 1 and 25 DF, p-value: 6.4705e-09
# -0.1518 seconds for men and
reg8 <- length4 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.90197 -1.54437 -0.08512 1.55387 7.10393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 511.36950 76.14729 6.716 9.85e-06 ***
## Year -0.19796 0.03837 -5.159 0.000145 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.401 on 14 degrees of freedom
## Multiple R-squared: 0.6553, Adjusted R-squared: 0.6307
## F-statistic: 26.61 on 1 and 14 DF, p-value: 0.00014512
# -0.19796 seconds for women in the 800 length race as each year has
# progressed
# Now, for the 1500
length5 <- olympic_running %>%
filter(Length == 1500)
ggplot(length5, aes(x = Year, y = Time, color = Sex)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# There is a decline for men, but surprisingly an increase
# for women
# The average for men is
reg9 <- length5 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.302 -4.585 -1.215 1.925 27.133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 843.43666 81.81773 10.309 1.12e-10 ***
## Year -0.31507 0.04177 -7.543 5.23e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.09 on 26 degrees of freedom
## Multiple R-squared: 0.6864, Adjusted R-squared: 0.6743
## F-statistic: 56.9 on 1 and 26 DF, p-value: 5.2345e-08
# -0.31507 each year and the average for women is a
reg10 <- length5 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8003 -3.9012 0.7371 3.3202 6.4808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -50.9926 211.3851 -0.241 0.814
## Year 0.1468 0.1060 1.384 0.196
##
## Residual standard error: 5.071 on 10 degrees of freedom
## Multiple R-squared: 0.1608, Adjusted R-squared: 0.07691
## F-statistic: 1.917 on 1 and 10 DF, p-value: 0.19635
# 0.1468 increase in seconds per year
# Now for the 5000 race
length6 <- olympic_running %>%
filter(Length == 5000)
length6
## # A tsibble: 33 x 4 [4Y]
## # Key: Length, Sex [2]
## Year Length Sex Time
## <int> <int> <chr> <dbl>
## 1 1912 5000 men 877.
## 2 1916 5000 men NA
## 3 1920 5000 men 896.
## 4 1924 5000 men 871.
## 5 1928 5000 men 878
## 6 1932 5000 men 870
## 7 1936 5000 men 862.
## 8 1940 5000 men NA
## 9 1944 5000 men NA
## 10 1948 5000 men 858.
## # … with 23 more rows
## # ℹ Use `print(n = ...)` to see more rows
ggplot(length6, aes(x = Year, y = Time, color = Sex)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).

# There is an obvious negative association between time and year
# for men, however, because of small quantities of data for women,
# the associate isn't very clear. We will calculate the averages
# nonetheless.
# For men, the average decline is
reg10 <- length6 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.311 -11.668 -1.096 7.515 40.596
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2853.1995 205.1246 13.910 2.22e-12 ***
## Year -1.0299 0.1042 -9.881 1.50e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.66 on 22 degrees of freedom
## Multiple R-squared: 0.8161, Adjusted R-squared: 0.8078
## F-statistic: 97.64 on 1 and 22 DF, p-value: 1.4995e-09
# -1.0299 seconds per year and for women the average declien is
reg11 <- length6 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1504.175 3467.321 0.434 0.687
## Year -0.303 1.728 -0.175 0.869
##
## Residual standard error: 28.92 on 4 degrees of freedom
## Multiple R-squared: 0.007624, Adjusted R-squared: -0.2405
## F-statistic: 0.03073 on 1 and 4 DF, p-value: 0.86936
# -0.303 seocnds per year for the 1500 length race
# Finally, last but not least, the 10000
length7 <- olympic_running %>%
filter(Length == 10000)
ggplot(length7, aes(x = Year, y = Time, color = Sex)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).

# and so the average decline for men is
reg12 <- length7 %>%
filter(Sex == "men") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.964 -24.222 -4.091 11.911 58.859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6965.4594 378.4635 18.41 7.52e-15 ***
## Year -2.6659 0.1923 -13.86 2.37e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.89 on 22 degrees of freedom
## Multiple R-squared: 0.8973, Adjusted R-squared: 0.8926
## F-statistic: 192.2 on 1 and 22 DF, p-value: 2.3727e-12
# -2.6659 seconds per year and the average decline for women is
reg13 <- length7 %>%
filter(Sex == "women") %>%
model(TSLM(Time ~ Year)) %>%
report()
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.550 -11.594 -2.285 7.731 29.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8825.2600 1402.4211 6.293 0.00075 ***
## Year -3.4962 0.7005 -4.991 0.00247 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.16 on 6 degrees of freedom
## Multiple R-squared: 0.8059, Adjusted R-squared: 0.7735
## F-statistic: 24.91 on 1 and 6 DF, p-value: 0.0024746
# -3.4962 seconds per year for the 10000 length race.
# c.
# Now, I am plotting the residuals against the year. I am
# starting to think that I shouldn't have plotted each
# length individually, so I am going to use facet_wrap to now plot
# the residuals.
# I am putting them all together on the same graph now :).
# THIS TOOK 5 MINUTES COMPARED TO 2 HOURS!! Grrrr
ggplot(data = olympic_running, mapping = aes(x = Year, y = Time, color = Sex))+
geom_point() +
labs(y = "Time (in seconds)", title = "Winning Time Against the Year by Sex and Length") +
facet_wrap(~ Length, nrow = 3, scales = "free")
## Warning: Removed 31 rows containing missing values (geom_point).

# I am plotting the residuals for each Race now
reg1 %>%
gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).

reg2 %>%
gg_tsresiduals()
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_bin).

reg3 %>%
gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).

reg4 %>%
gg_tsresiduals()

reg5 %>%
gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).

reg6 %>%
gg_tsresiduals()

reg7 %>%
gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).

reg8 %>%
gg_tsresiduals()
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing non-finite values (stat_bin).

reg9 %>%
gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).

reg10 %>%
gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).

reg11 %>%
gg_tsresiduals()

reg12 %>%
gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).

reg13 %>%
gg_tsresiduals()

# All of the residuals are pretty random. Every graph
# seems to have a random pattern. The residuals are centered
# around the mean. They are not autocorrelated which
# is expected.
# d.
fc_reg1 <- reg1 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg1
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 100 men TSLM(Time ~ Yea… 2020 N(9.5, 0.061) 9.53 [9.217343, 9.851671]80
fc_reg2 <- reg2 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg2
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 100 women TSLM(Time ~ Year) 2020 N(11, 0.059) 10.5 [10.22224, 10.84658]80
fc_reg3 <- reg3 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg3
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 200 men TSLM(Time ~ Year) 2020 N(19, 0.13) 19.1 [18.66343, 19.57145]80
fc_reg4 <- reg4 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg4
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 200 women TSLM(Time ~ Year) 2020 N(21, 0.31) 21.2 [20.48494, 21.91454]80
fc_reg5 <- reg5 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg5
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 400 men TSLM(Time ~ Year) 2020 N(42, 1.5) 42.0 [40.49022, 43.5944]80
fc_reg6 <- reg6 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg6
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 400 women TSLM(Time ~ Year) 2020 N(48, 1.4) 48.4 [46.9239, 49.94863]80
fc_reg7 <- reg7 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg7
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 800 men TSLM(Time ~ Year) 2020 N(99, 13) 99.2 [94.59505, 103.8803]80
fc_reg8 <- reg8 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg8
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 800 women TSLM(Time ~ Year) 2020 N(111, 14) 111. [106.659, 116.3079]80
fc_reg9 <- reg9 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg9
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 1500 men TSLM(Time ~ Year) 2020 N(207, 74) 207. [195.9438, 218.0527]80
fc_reg10 <- reg10 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg10
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 5000 men TSLM(Time ~ Year) 2020 N(773, 285) 773. [751.1888, 794.4603]80
fc_reg11 <- reg11 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg11
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 5000 women TSLM(Time ~ Year) 2020 N(892, 1562) 892. [841.4729, 942.7565]80
fc_reg12 <- reg12 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg12
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 10000 men TSLM(Time ~ Year) 2020 N(1580, 970) 1580. [1540.432, 1620.27]80
fc_reg13 <- reg13 %>%
forecast(h = 1) %>%
hilo(level = 80)
fc_reg13
## # A tsibble: 1 x 7 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean `80%`
## <int> <chr> <chr> <dbl> <dist> <dbl> <hilo>
## 1 10000 women TSLM(Time ~ Year) 2020 N(1763, 530) 1763. [1733.513, 1792.518]80
# My mean is my point forecast (my prediction) and the prediction
# intervals can be seen from above.
Question 3
# a.
autoplot(souvenirs)
## Plot variable not specified, automatically selected `.vars = Sales`

# As the description says, there is seasonality within the data from
# tourists. The spikes are from tourists visiting around Christmas.
# There is an increasing trend because of the expantion of its premises,
# range of products, and staff.
# b.
souvenirs %>%
features(Sales, features = guerrero)
## # A tibble: 1 × 1
## lambda_guerrero
## <dbl>
## 1 0.00212
# The lambda is very close to zero, meaning that the log will make
# the size of the seasonal variation about the same across the whole
# series, making the forecasting model simpler.
souvenirs %>%
autoplot(log(Sales)) +
labs(y = "Log Sales")

# Compared to
souvenirs %>%
autoplot(box_cox(Sales, 0.00212)) +
labs(y = "Box-Cox transformed Sales")

# They are the same graph.
# c.
# linear trend
sales_reg <- souvenirs %>%
autoplot(log(Sales)) +
labs(y = "Log Sales") +
geom_smooth(method = "lm", se = FALSE)
sales_reg
## `geom_smooth()` using formula 'y ~ x'

# We use 3 seasonal dummies for quarterly data
t <- nrow(souvenirs)
trend <- seq(1:t)
souvenirs1 <- souvenirs %>%
mutate(Quarter = yearquarter(Month)) %>%
mutate(q2 = ifelse(quarter(Month) == 2,1,0)) %>%
mutate(q3 = ifelse(quarter(Month) == 3,1,0)) %>%
mutate(q4 = ifelse(quarter(Month) == 4,1,0)) %>%
mutate(td = trend)
View(souvenirs1)
# Fitting the regression to my seasonal dummies
sales_reg1 <- souvenirs1 %>%
autoplot(log(Sales)) +
geom_smooth(method = "lm", se = FALSE)
sales_reg1
## `geom_smooth()` using formula 'y ~ x'

# I am guessing that the "surfing festival" is during
# Australia's summers, so the months of Jan, Feb, and
# March (or quarter 1).
souvenirs2 <- souvenirs1 %>%
mutate(Surfing_festival = ifelse(quarter(Month) == 1,1,0))
View(souvenirs2)
# Thus, the fitted regression line for all three is
sales_reg3 <- souvenirs2 %>%
autoplot(log(Sales)) +
geom_smooth(method = "lm", se = FALSE)
sales_reg3
## `geom_smooth()` using formula 'y ~ x'

# Now, the fitting the values to the actual values in one
# graph. I included trend into the model since there is an increasing
# trend for the data.
fit_sales <- souvenirs2 %>%
model(lm = TSLM(Sales ~ q2+q3+q4+td+Surfing_festival)) %>%
report()
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -19813.1 -4578.2 -572.2 4047.9 64023.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4601.90 3050.77 -1.508 0.135
## q2 -148.11 3386.77 -0.044 0.965
## q3 2634.71 3396.65 0.776 0.440
## q4 17944.85 3413.04 5.258 1.21e-06 ***
## td 324.93 49.81 6.523 6.01e-09 ***
## Surfing_festival NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10960 on 79 degrees of freedom
## Multiple R-squared: 0.5387, Adjusted R-squared: 0.5154
## F-statistic: 23.07 on 4 and 79 DF, p-value: 1.1873e-12
aug_sales <- augment(fit_sales)
ggplot(aug_sales, aes(x = Month, y = .fitted, color = "Fitted")) +
geom_line() +
geom_line(aes(y = Sales, color = "Data")) +
labs(y = "Sales")

# d.
# I can't figure out if this question wants me to use the model
# I just used or not. So, I am assuming that we plotting the residuals
# based on the model I just created.
fit_sales %>%
gg_tsresiduals()

# There seems to be a couple of outliers and some autocorrelation.
# The residuals also have a seasonal pattern.
aug_sales %>%
ggplot(aes(x = .fitted, y = .resid, color = factor(quarter(Month)))) +
geom_point()

# Again, most are centered around zero. However, there remains to be outliers
# that all seem to be in quarter 4 since it's close to Christmas time??
# e.
aug2 <- aug_sales %>%
mutate(month = factor(month(Month)))
View(aug2)
boxplot <- aug2 %>%
ggplot(aes(x = month, y = .resid)) +
geom_boxplot()
boxplot

# The numbers represent each month. There is are outliers for
# months November and December (around Christmas time). I also am
# guessing that the residuals are so large for these times because
# the actual data has an increasing trend.
# f.
coe <- souvenirs2 %>%
model(TSLM(Sales ~ q2+q3+q4+td+Surfing_festival)) %>%
report()
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -19813.1 -4578.2 -572.2 4047.9 64023.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4601.90 3050.77 -1.508 0.135
## q2 -148.11 3386.77 -0.044 0.965
## q3 2634.71 3396.65 0.776 0.440
## q4 17944.85 3413.04 5.258 1.21e-06 ***
## td 324.93 49.81 6.523 6.01e-09 ***
## Surfing_festival NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10960 on 79 degrees of freedom
## Multiple R-squared: 0.5387, Adjusted R-squared: 0.5154
## F-statistic: 23.07 on 4 and 79 DF, p-value: 1.1873e-12
# The coefficients of each dummy variable represents the increase/decrease
# of a shift in the intercept of sales when they are equal to one (aka are true).
# For example, during quarter two (q2 == 1), the graph shifts down by
# -148.11 (the value of its coefficient). The trend has an increasing slope
# average of 324.93 per Month.
# g.
aug3 <- souvenirs2 %>%
model(TSLM(Sales ~ q2+q3+q4+td+Surfing_festival)) %>%
augment()
lun <- aug3 %>%
features(.innov, ljung_box, lag = 12, dof = 0)
lun
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Sales ~ q2 + q3 + q4 + td + Surfing_festival) 47.4 0.00000395
# This is a very small value of Q*, meaning that the autocorrelation
# of residuals come from white noise.
# h.
# I couldn't find the forecast using my new model, but
# I forecast the values based on my original data.
sales_fc1 <- souvenirs2 %>%
model(TSLM(Sales ~ Month)) %>%
forecast(h = 36) %>%
autoplot(souvenirs2)
sales_fc1

# i. I can use the special fucntions trend and season that
# are within the TSLM model.
sales_fc2 <- souvenirs2 %>%
model(TSLM(Sales ~ Month + trend() + season())) %>%
forecast(h = 36) %>%
autoplot(souvenirs2)
sales_fc2

# Seems pretty accurate to me.