Load packages and data

rm(list = ls())

library(ggplot2)
library(tidyverse)
library(tidyquant)
library(forecast)
library(dplyr)
library(feasts)
library(fpp3)
library(seasonal)
library(fable)

Question 1

# Manipulating the data

jan <- vic_elec %>% 
  mutate(Day = as_date(Time)) %>% 
  filter(yearmonth(Day) == yearmonth("Jan 2014")) %>% 
  index_by(Day) %>% 
  summarise(Demand = sum(Demand), Temperature = max(Temperature))
jan
## # A tsibble: 31 x 3 [1D]
##    Day         Demand Temperature
##    <date>       <dbl>       <dbl>
##  1 2014-01-01 175185.        26  
##  2 2014-01-02 188351.        23  
##  3 2014-01-03 189086.        22.2
##  4 2014-01-04 173798.        20.3
##  5 2014-01-05 169733.        26.1
##  6 2014-01-06 195241.        19.6
##  7 2014-01-07 199770.        20  
##  8 2014-01-08 205339.        27.4
##  9 2014-01-09 227334.        32.4
## 10 2014-01-10 258111.        34  
## # … with 21 more rows
## # ℹ Use `print(n = ...)` to see more rows
# a.

reg <- ggplot(data = jan, mapping = aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
reg
## `geom_smooth()` using formula 'y ~ x'

# As temperature increases, the demand for electricity (A/C) increases.

# b. 

# Here is the plot of the residuals 

resid <- jan %>% 
  model(TSLM(Demand ~ Temperature)) %>% 
  gg_tsresiduals()
resid

# I wanted to see the actual values

jan %>% 
  model(TSLM(Demand ~ Temperature)) %>% 
  augment()
## # A tsibble: 31 x 6 [1D]
## # Key:       .model [1]
##    .model                     Day         Demand .fitted  .resid  .innov
##    <chr>                      <date>       <dbl>   <dbl>   <dbl>   <dbl>
##  1 TSLM(Demand ~ Temperature) 2014-01-01 175185. 219096. -43911. -43911.
##  2 TSLM(Demand ~ Temperature) 2014-01-02 188351. 200633. -12282. -12282.
##  3 TSLM(Demand ~ Temperature) 2014-01-03 189086. 195709.  -6624.  -6624.
##  4 TSLM(Demand ~ Temperature) 2014-01-04 173798. 184016. -10219. -10219.
##  5 TSLM(Demand ~ Temperature) 2014-01-05 169733. 219711. -49978. -49978.
##  6 TSLM(Demand ~ Temperature) 2014-01-06 195241. 179708.  15533.  15533.
##  7 TSLM(Demand ~ Temperature) 2014-01-07 199770. 182170.  17601.  17601.
##  8 TSLM(Demand ~ Temperature) 2014-01-08 205339. 227712. -22373. -22373.
##  9 TSLM(Demand ~ Temperature) 2014-01-09 227334. 258483. -31149. -31149.
## 10 TSLM(Demand ~ Temperature) 2014-01-10 258111. 268330. -10219. -10219.
## # … with 21 more rows
## # ℹ Use `print(n = ...)` to see more rows
# The variance is not centered around zero and the 
# there are x outliers where the temperature is above 40. 
# Depending on how the regression line shifts/pivots when 
# you remove these outliers determines whether or not they
# are influential observations. In this case, there seems to
# residual or regression outliers at Jan 1st, 5th, Jan 26th, 
# Jan 27th, Jan 28th, and Jan 30th. To be an influential observation, 
# it must be an x and regression outlier. Thus, the influential 
# observations are Jan 15th and 16th.

# c.


jan1 <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(Demand = sum(Demand), Temperature = max(Temperature))


# When temperature equals 15

jan_fc <- jan1 %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(new_data(jan1, 1) %>%
      mutate(Temperature = 15)) %>%
  autoplot(jan1)
jan_fc

# When temperature equals 35 

jan_fc1 <- jan1 %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(new_data(jan1, 1) %>%
      mutate(Temperature = 35)) %>%
  autoplot(jan1)
jan_fc1

# The forecast for when the temperature was 35 degrees is much 
# more accurate than when the temperature is equal to 15 
# degrees. But why are they so different from one another? Makes 
# think that I cannot rely on either.

# d. 

jan1 %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(new_data(jan1, 1) %>%
      mutate(Temperature = 15)) %>% 
  hilo(level = 80)
## # A tsibble: 1 x 6 [1D]
## # Key:       .model [1]
##   .model     Date                   Demand  .mean Tempe…¹                  `80%`
##   <chr>      <date>                 <dist>  <dbl>   <dbl>                 <hilo>
## 1 TSLM(Dema… 2014-02-01 N(151398, 6.8e+08) 1.51e5      15 [117908.1, 184888.6]80
## # … with abbreviated variable name ¹​Temperature
jan_fc1 <- jan1 %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(new_data(jan1, 1) %>%
      mutate(Temperature = 35)) %>% 
  hilo(level = 80)
      

# e. 


agg <- vic_elec %>% 
  mutate(Day = as_date(Time)) %>% 
  index_by(Day) %>% 
  summarise(Demand = sum(Demand), Temperature = max(Temperature))
agg
## # A tsibble: 1,096 x 3 [1D]
##    Day         Demand Temperature
##    <date>       <dbl>       <dbl>
##  1 2012-01-01 222438.        32.7
##  2 2012-01-02 257965.        39.6
##  3 2012-01-03 267099.        31.8
##  4 2012-01-04 222742.        25.1
##  5 2012-01-05 210585.        21.2
##  6 2012-01-06 210247.        23.6
##  7 2012-01-07 202526.        29  
##  8 2012-01-08 193413.        27.8
##  9 2012-01-09 213804.        24  
## 10 2012-01-10 215020.        19.6
## # … with 1,086 more rows
## # ℹ Use `print(n = ...)` to see more rows
ggplot(agg, aes(x = Temperature, y = Demand)) +
  geom_point()

# The highest demand for electricity is at the extreme levels of 
# temperature which makes since. This graph also displays that our 
# model of max temperatures will be inaccurate since it is doesn't
# follow only one trend. It is a nonlinear trend and, therefore, we
# should try another method to forecast it. 

Question 2

# a. 


ggplot(data = olympic_running, mapping = aes(x = Year, y = Time, color = Sex, size = Length))+
  geom_point() +
  labs(y = "Time (in seconds)", title = "Winning Time Against the Year by Sex and Length")
## Warning: Removed 31 rows containing missing values (geom_point).

# I used scatter plot since length and sex is categorical 



# b. 



# Since I am running a regression line , I am going to have to split
# the data 

length1 <- olympic_running %>% 
  filter(Length == 100)
length1
## # A tsibble: 54 x 4 [4Y]
## # Key:       Length, Sex [2]
##     Year Length Sex    Time
##    <int>  <int> <chr> <dbl>
##  1  1896    100 men    12  
##  2  1900    100 men    11  
##  3  1904    100 men    11  
##  4  1908    100 men    10.8
##  5  1912    100 men    10.8
##  6  1916    100 men    NA  
##  7  1920    100 men    10.8
##  8  1924    100 men    10.6
##  9  1928    100 men    10.8
## 10  1932    100 men    10.3
## # … with 44 more rows
## # ℹ Use `print(n = ...)` to see more rows
ggplot(length1, aes(x = Year, y = Time, color = Sex)) + 
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

# The decreasing average of time per year can be found by running the regression:

reg1 <- length1 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.3443995 -0.1081360  0.0007715  0.0750701  0.9015537 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 35.011581   2.347437   14.91 2.94e-14 ***
## Year        -0.012612   0.001198  -10.52 7.24e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2321 on 26 degrees of freedom
## Multiple R-squared: 0.8099,  Adjusted R-squared: 0.8026
## F-statistic: 110.8 on 1 and 26 DF, p-value: 7.2403e-11
# Thus, for men, there has been an average of -0.012612 decline of time in seconds as
# each year progresses for the 100 length race. 

# Now for women:

reg2 <- length1 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44833 -0.11159  0.05775  0.11731  0.36057 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.188164   3.697686  10.598 2.05e-09 ***
## Year        -0.014185   0.001872  -7.577 3.72e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2232 on 19 degrees of freedom
## Multiple R-squared: 0.7513,  Adjusted R-squared: 0.7382
## F-statistic:  57.4 on 1 and 19 DF, p-value: 3.7226e-07
# Therefore, for women, there has been a average decline of -0.014185 time in seconds
# for the 100 length race. 

# Now for 200 length race:

length2 <- olympic_running %>% 
  filter(Length == 200)

ggplot(length2, aes(x = Year, y = Time, color = Sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# And now for the average decline 

# men:

reg3 <- length2 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6112 -0.1872 -0.1046  0.1935  0.6959 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 69.376498   3.553448   19.52  < 2e-16 ***
## Year        -0.024881   0.001812  -13.73  3.8e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3315 on 25 degrees of freedom
## Multiple R-squared: 0.8829,  Adjusted R-squared: 0.8782
## F-statistic: 188.5 on 1 and 25 DF, p-value: 3.7956e-13
# The average decline for 200 length race for men is -0.024881 seconds 

# women:

reg4 <- length2 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93598 -0.39344  0.08043  0.37624  0.78230 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 89.137613  11.267724   7.911 6.41e-07 ***
## Year        -0.033633   0.005685  -5.916 2.17e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5005 on 16 degrees of freedom
## Multiple R-squared: 0.6863,  Adjusted R-squared: 0.6667
## F-statistic:    35 on 1 and 16 DF, p-value: 2.1706e-05
# The average decline is -0.033633 seconds 

# Now, for the 400 length race 

length3 <- olympic_running %>% 
  filter(Length == 400)

ggplot(length3, aes(x = Year, y = Time, color = Sex))+
         geom_point()+
         geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).

# And so

reg5 <- length3 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6001 -0.5747 -0.2858  0.5751  4.1505 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 172.481477  11.487522   15.02 2.52e-14 ***
## Year         -0.064574   0.005865  -11.01 2.75e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.136 on 26 degrees of freedom
## Multiple R-squared: 0.8234,  Adjusted R-squared: 0.8166
## F-statistic: 121.2 on 1 and 26 DF, p-value: 2.7524e-11
# The average decline is -0.064574 seconds each year

reg6 <- length3 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1593 -1.0142  0.1024  0.7749  1.4797 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 129.39165   33.89755   3.817  0.00245 **
## Year         -0.04008    0.01703  -2.353  0.03652 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.028 on 12 degrees of freedom
## Multiple R-squared: 0.3157,  Adjusted R-squared: 0.2587
## F-statistic: 5.536 on 1 and 12 DF, p-value: 0.036523
# Average decline for women is -0.04008 seconds for the 400 length race

# Now, the 800 

length4 <- olympic_running %>% 
  filter(Length == 800)

ggplot(length4, aes(x = Year, y = Time, color = Sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).

# And so the average decline for Time is:

reg7 <- length4 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7306 -1.8734 -0.8449  0.6851 12.9408 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 405.8457    34.6506  11.712 1.21e-11 ***
## Year         -0.1518     0.0177  -8.576 6.47e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.387 on 25 degrees of freedom
## Multiple R-squared: 0.7463,  Adjusted R-squared: 0.7361
## F-statistic: 73.54 on 1 and 25 DF, p-value: 6.4705e-09
# -0.1518 seconds for men and 

reg8 <- length4 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5.90197 -1.54437 -0.08512  1.55387  7.10393 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 511.36950   76.14729   6.716 9.85e-06 ***
## Year         -0.19796    0.03837  -5.159 0.000145 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.401 on 14 degrees of freedom
## Multiple R-squared: 0.6553,  Adjusted R-squared: 0.6307
## F-statistic: 26.61 on 1 and 14 DF, p-value: 0.00014512
# -0.19796 seconds for women in the 800 length race as each year has 
# progressed 

# Now, for the 1500

length5 <- olympic_running %>% 
  filter(Length == 1500)

ggplot(length5, aes(x = Year, y = Time, color = Sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# There is a decline for men, but surprisingly an increase 
# for women 

# The average for men is 

reg9 <- length5 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.302  -4.585  -1.215   1.925  27.133 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 843.43666   81.81773  10.309 1.12e-10 ***
## Year         -0.31507    0.04177  -7.543 5.23e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.09 on 26 degrees of freedom
## Multiple R-squared: 0.6864,  Adjusted R-squared: 0.6743
## F-statistic:  56.9 on 1 and 26 DF, p-value: 5.2345e-08
# -0.31507 each year and the average for women is a

reg10 <- length5 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8003 -3.9012  0.7371  3.3202  6.4808 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -50.9926   211.3851  -0.241    0.814
## Year          0.1468     0.1060   1.384    0.196
## 
## Residual standard error: 5.071 on 10 degrees of freedom
## Multiple R-squared: 0.1608,  Adjusted R-squared: 0.07691
## F-statistic: 1.917 on 1 and 10 DF, p-value: 0.19635
# 0.1468 increase in seconds per year 

# Now for the 5000 race

length6 <- olympic_running %>% 
  filter(Length == 5000)
length6
## # A tsibble: 33 x 4 [4Y]
## # Key:       Length, Sex [2]
##     Year Length Sex    Time
##    <int>  <int> <chr> <dbl>
##  1  1912   5000 men    877.
##  2  1916   5000 men     NA 
##  3  1920   5000 men    896.
##  4  1924   5000 men    871.
##  5  1928   5000 men    878 
##  6  1932   5000 men    870 
##  7  1936   5000 men    862.
##  8  1940   5000 men     NA 
##  9  1944   5000 men     NA 
## 10  1948   5000 men    858.
## # … with 23 more rows
## # ℹ Use `print(n = ...)` to see more rows
ggplot(length6, aes(x = Year, y = Time, color = Sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).

# There is an obvious negative association between time and year 
# for men, however, because of small quantities of data for women, 
# the associate isn't very clear. We will calculate the averages 
# nonetheless. 

# For men, the average decline is

reg10 <- length6 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.311 -11.668  -1.096   7.515  40.596 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2853.1995   205.1246  13.910 2.22e-12 ***
## Year          -1.0299     0.1042  -9.881 1.50e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.66 on 22 degrees of freedom
## Multiple R-squared: 0.8161,  Adjusted R-squared: 0.8078
## F-statistic: 97.64 on 1 and 22 DF, p-value: 1.4995e-09
# -1.0299 seconds per year and for women the average declien is 

reg11 <- length6 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1504.175   3467.321   0.434    0.687
## Year          -0.303      1.728  -0.175    0.869
## 
## Residual standard error: 28.92 on 4 degrees of freedom
## Multiple R-squared: 0.007624,    Adjusted R-squared: -0.2405
## F-statistic: 0.03073 on 1 and 4 DF, p-value: 0.86936
# -0.303 seocnds per year for the 1500 length race

# Finally, last but not least, the 10000

length7 <- olympic_running %>% 
  filter(Length == 10000)

ggplot(length7, aes(x = Year, y = Time, color = Sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).

# and so the average decline for men is 

reg12 <- length7 %>% 
  filter(Sex == "men") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.964 -24.222  -4.091  11.911  58.859 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6965.4594   378.4635   18.41 7.52e-15 ***
## Year          -2.6659     0.1923  -13.86 2.37e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.89 on 22 degrees of freedom
## Multiple R-squared: 0.8973,  Adjusted R-squared: 0.8926
## F-statistic: 192.2 on 1 and 22 DF, p-value: 2.3727e-12
# -2.6659 seconds per year and the average decline for women is 

reg13 <- length7 %>% 
  filter(Sex == "women") %>% 
  model(TSLM(Time ~ Year)) %>% 
  report()
## Series: Time 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.550 -11.594  -2.285   7.731  29.765 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8825.2600  1402.4211   6.293  0.00075 ***
## Year          -3.4962     0.7005  -4.991  0.00247 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.16 on 6 degrees of freedom
## Multiple R-squared: 0.8059,  Adjusted R-squared: 0.7735
## F-statistic: 24.91 on 1 and 6 DF, p-value: 0.0024746
# -3.4962 seconds per year for the 10000 length race.




# c. 



# Now, I am plotting the residuals against the year. I am 
# starting to think that I shouldn't have plotted each
# length individually, so I am going to use facet_wrap to now plot
# the residuals. 


# I am putting them all together on the same graph now :). 
# THIS TOOK 5 MINUTES COMPARED TO 2 HOURS!! Grrrr


ggplot(data = olympic_running, mapping = aes(x = Year, y = Time, color = Sex))+
  geom_point() +
  labs(y = "Time (in seconds)", title = "Winning Time Against the Year by Sex and Length") +
  facet_wrap(~ Length, nrow = 3, scales = "free") 
## Warning: Removed 31 rows containing missing values (geom_point).

# I am plotting the residuals for each Race now 

reg1 %>% 
  gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).

reg2 %>% 
  gg_tsresiduals()
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_bin).

reg3 %>% 
  gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).

reg4 %>% 
  gg_tsresiduals()

reg5 %>% 
  gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).

reg6 %>% 
  gg_tsresiduals()

reg7 %>% 
  gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).

reg8 %>% 
  gg_tsresiduals()
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing non-finite values (stat_bin).

reg9 %>% 
  gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).

reg10 %>% 
  gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).

reg11 %>% 
  gg_tsresiduals()

reg12 %>% 
  gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).

reg13 %>% 
  gg_tsresiduals()

# All of the residuals are pretty random. Every graph 
# seems to have a random pattern. The residuals are centered 
# around the mean. They are not autocorrelated which
# is expected. 

# d. 


fc_reg1 <- reg1 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg1
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model            Year          Time .mean                  `80%`
##    <int> <chr> <chr>            <dbl>        <dist> <dbl>                 <hilo>
## 1    100 men   TSLM(Time ~ Yea…  2020 N(9.5, 0.061)  9.53 [9.217343, 9.851671]80
fc_reg2 <- reg2 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg2
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year         Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>       <dist> <dbl>                 <hilo>
## 1    100 women TSLM(Time ~ Year)  2020 N(11, 0.059)  10.5 [10.22224, 10.84658]80
fc_reg3 <- reg3 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg3
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year        Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>      <dist> <dbl>                 <hilo>
## 1    200 men   TSLM(Time ~ Year)  2020 N(19, 0.13)  19.1 [18.66343, 19.57145]80
fc_reg4 <- reg4 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg4
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year        Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>      <dist> <dbl>                 <hilo>
## 1    200 women TSLM(Time ~ Year)  2020 N(21, 0.31)  21.2 [20.48494, 21.91454]80
fc_reg5 <- reg5 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg5
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year       Time .mean                 `80%`
##    <int> <chr> <chr>             <dbl>     <dist> <dbl>                <hilo>
## 1    400 men   TSLM(Time ~ Year)  2020 N(42, 1.5)  42.0 [40.49022, 43.5944]80
fc_reg6 <- reg6 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg6
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year       Time .mean                 `80%`
##    <int> <chr> <chr>             <dbl>     <dist> <dbl>                <hilo>
## 1    400 women TSLM(Time ~ Year)  2020 N(48, 1.4)  48.4 [46.9239, 49.94863]80
fc_reg7 <- reg7 %>%
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg7
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year      Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>    <dist> <dbl>                 <hilo>
## 1    800 men   TSLM(Time ~ Year)  2020 N(99, 13)  99.2 [94.59505, 103.8803]80
fc_reg8 <- reg8 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg8
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year       Time .mean                 `80%`
##    <int> <chr> <chr>             <dbl>     <dist> <dbl>                <hilo>
## 1    800 women TSLM(Time ~ Year)  2020 N(111, 14)  111. [106.659, 116.3079]80
fc_reg9 <- reg9 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg9
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year       Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>     <dist> <dbl>                 <hilo>
## 1   1500 men   TSLM(Time ~ Year)  2020 N(207, 74)  207. [195.9438, 218.0527]80
fc_reg10 <- reg10 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg10
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year        Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>      <dist> <dbl>                 <hilo>
## 1   5000 men   TSLM(Time ~ Year)  2020 N(773, 285)  773. [751.1888, 794.4603]80
fc_reg11 <- reg11 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg11
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year         Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>       <dist> <dbl>                 <hilo>
## 1   5000 women TSLM(Time ~ Year)  2020 N(892, 1562)  892. [841.4729, 942.7565]80
fc_reg12 <- reg12 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg12
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year         Time .mean                 `80%`
##    <int> <chr> <chr>             <dbl>       <dist> <dbl>                <hilo>
## 1  10000 men   TSLM(Time ~ Year)  2020 N(1580, 970) 1580. [1540.432, 1620.27]80
fc_reg13 <- reg13 %>% 
  forecast(h = 1) %>% 
  hilo(level = 80)
fc_reg13
## # A tsibble: 1 x 7 [4Y]
## # Key:       Length, Sex, .model [1]
##   Length Sex   .model             Year         Time .mean                  `80%`
##    <int> <chr> <chr>             <dbl>       <dist> <dbl>                 <hilo>
## 1  10000 women TSLM(Time ~ Year)  2020 N(1763, 530) 1763. [1733.513, 1792.518]80
# My mean is my point forecast (my prediction) and the prediction 
# intervals can be seen from above. 

Question 3

# a. 

autoplot(souvenirs)
## Plot variable not specified, automatically selected `.vars = Sales`

# As the description says, there is seasonality within the data from 
# tourists. The spikes are from tourists visiting around Christmas. 
# There is an increasing trend because of the expantion of its premises, 
# range of products, and staff. 

# b. 

souvenirs %>% 
  features(Sales, features = guerrero)
## # A tibble: 1 × 1
##   lambda_guerrero
##             <dbl>
## 1         0.00212
# The lambda is very close to zero, meaning that the log will make
# the size of the seasonal variation about the same across the whole
# series, making the forecasting model simpler. 

souvenirs %>% 
  autoplot(log(Sales)) +
  labs(y = "Log Sales")

# Compared to 

souvenirs %>% 
  autoplot(box_cox(Sales, 0.00212)) +
  labs(y = "Box-Cox transformed Sales")

# They are the same graph. 

# c. 

# linear trend

sales_reg <- souvenirs %>% 
  autoplot(log(Sales)) +
  labs(y = "Log Sales") +
  geom_smooth(method = "lm", se = FALSE)
sales_reg
## `geom_smooth()` using formula 'y ~ x'

# We use 3 seasonal dummies for quarterly data 

t <- nrow(souvenirs)
trend <- seq(1:t)

souvenirs1 <- souvenirs %>% 
  mutate(Quarter = yearquarter(Month)) %>% 
  mutate(q2 = ifelse(quarter(Month) == 2,1,0)) %>% 
  mutate(q3 = ifelse(quarter(Month) == 3,1,0)) %>% 
  mutate(q4 = ifelse(quarter(Month) == 4,1,0)) %>% 
  mutate(td = trend)
View(souvenirs1)

# Fitting the regression to my seasonal dummies

sales_reg1 <- souvenirs1 %>% 
  autoplot(log(Sales)) +
  geom_smooth(method = "lm", se = FALSE)
sales_reg1
## `geom_smooth()` using formula 'y ~ x'

# I am guessing that the "surfing festival" is during 
# Australia's summers, so the months of Jan, Feb, and 
# March (or quarter 1).

souvenirs2 <- souvenirs1 %>% 
  mutate(Surfing_festival = ifelse(quarter(Month) == 1,1,0))
View(souvenirs2)

# Thus, the fitted regression line for all three is  

sales_reg3 <- souvenirs2 %>% 
  autoplot(log(Sales)) +
  geom_smooth(method = "lm", se = FALSE)
sales_reg3
## `geom_smooth()` using formula 'y ~ x'

# Now, the fitting the values to the actual values in one 
# graph. I included trend into the model since there is an increasing
# trend for the data. 


fit_sales <- souvenirs2 %>% 
  model(lm = TSLM(Sales ~ q2+q3+q4+td+Surfing_festival)) %>% 
  report()
## Series: Sales 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19813.1  -4578.2   -572.2   4047.9  64023.4 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4601.90    3050.77  -1.508    0.135    
## q2                -148.11    3386.77  -0.044    0.965    
## q3                2634.71    3396.65   0.776    0.440    
## q4               17944.85    3413.04   5.258 1.21e-06 ***
## td                 324.93      49.81   6.523 6.01e-09 ***
## Surfing_festival       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10960 on 79 degrees of freedom
## Multiple R-squared: 0.5387,  Adjusted R-squared: 0.5154
## F-statistic: 23.07 on 4 and 79 DF, p-value: 1.1873e-12
aug_sales <- augment(fit_sales)


ggplot(aug_sales, aes(x = Month, y = .fitted, color = "Fitted")) +
  geom_line() +
  geom_line(aes(y = Sales, color = "Data")) +
  labs(y = "Sales")

# d. 

# I can't figure out if this question wants me to use the model 
# I just used or not. So, I am assuming that we plotting the residuals 
# based on the model I just created. 

fit_sales %>% 
  gg_tsresiduals()

# There seems to be a couple of outliers and some autocorrelation. 
# The residuals also have a seasonal pattern.

aug_sales %>% 
  ggplot(aes(x = .fitted, y = .resid, color = factor(quarter(Month)))) +
  geom_point()

# Again, most are centered around zero. However, there remains to be outliers
# that all seem to be in quarter 4 since it's close to Christmas time??

# e. 

aug2 <- aug_sales %>% 
  mutate(month = factor(month(Month)))
View(aug2)

boxplot <- aug2 %>% 
  ggplot(aes(x = month, y = .resid)) +
  geom_boxplot()
boxplot

# The numbers represent each month. There is are outliers for 
# months November and December (around Christmas time). I also am
# guessing that the residuals are so large for these times because
# the actual data has an increasing trend. 


# f. 

coe <- souvenirs2 %>% 
  model(TSLM(Sales ~ q2+q3+q4+td+Surfing_festival)) %>% 
  report()
## Series: Sales 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19813.1  -4578.2   -572.2   4047.9  64023.4 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4601.90    3050.77  -1.508    0.135    
## q2                -148.11    3386.77  -0.044    0.965    
## q3                2634.71    3396.65   0.776    0.440    
## q4               17944.85    3413.04   5.258 1.21e-06 ***
## td                 324.93      49.81   6.523 6.01e-09 ***
## Surfing_festival       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10960 on 79 degrees of freedom
## Multiple R-squared: 0.5387,  Adjusted R-squared: 0.5154
## F-statistic: 23.07 on 4 and 79 DF, p-value: 1.1873e-12
# The coefficients of each dummy variable represents the increase/decrease
# of a shift in the intercept of sales when they are equal to one (aka are true). 
# For example, during quarter two (q2 == 1), the graph shifts down by 
# -148.11 (the value of its coefficient). The trend has an increasing slope
# average of 324.93 per Month.

# g. 

aug3 <- souvenirs2 %>% 
  model(TSLM(Sales ~ q2+q3+q4+td+Surfing_festival)) %>% 
  augment()
  
lun <- aug3 %>% 
  features(.innov, ljung_box, lag = 12, dof = 0)
lun
## # A tibble: 1 × 3
##   .model                                             lb_stat  lb_pvalue
##   <chr>                                                <dbl>      <dbl>
## 1 TSLM(Sales ~ q2 + q3 + q4 + td + Surfing_festival)    47.4 0.00000395
# This is a very small value of Q*, meaning that the autocorrelation 
# of residuals come from white noise.

# h. 

# I couldn't find the forecast using my new model, but 
# I forecast the values based on my original data. 

sales_fc1 <- souvenirs2 %>% 
  model(TSLM(Sales ~ Month)) %>% 
  forecast(h = 36) %>% 
  autoplot(souvenirs2)
sales_fc1

# i. I can use the special fucntions trend and season that 
# are within the TSLM model. 


sales_fc2 <- souvenirs2 %>% 
  model(TSLM(Sales ~ Month + trend() + season())) %>% 
  forecast(h = 36) %>% 
  autoplot(souvenirs2)
sales_fc2

# Seems pretty accurate to me. 

Question 4

# Manipulating the problem 

gas <- us_gasoline %>% 
  filter(year(Week) <= 2004)
View(gas)

# Getting a visualization

 autoplot(gas)+
   labs(y = "Thousand barrel per day")
## Plot variable not specified, automatically selected `.vars = Barrels`

 # a. 
 
 # The seasonality occurs about every year so m = 52, making 
 # fourier terms useful compared to seasonal dummy variables 
 
 # I am doing a bunch of K values. The max is k = m/2 (so k = 26).
 # I am going to choose only 6 for now to experiment with.
 
 
 fit <- gas %>%
   model(K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
         K2 = TSLM(Barrels ~ trend() + fourier(K = 2)),
         K3 = TSLM(Barrels ~ trend() + fourier(K = 3)),
         K4 = TSLM(Barrels ~ trend() + fourier(K = 4)),
         K5 = TSLM(Barrels ~ trend() + fourier(K = 5)),
         K6 = TSLM(Barrels ~ trend() + fourier(K = 6)))
 
 # Now, I am plotting them seperately to the true effects 
 
 aug_fit <- augment(fit)
 
 # When K = 1:
 
 fit_plot <- aug_fit %>% 
   filter(.model == "K1")
 View(fit_plot)
 
 ggplot(fit_plot, aes(x = Week)) +
   geom_line(aes(y = Barrels, color = "Data")) +
   geom_line(aes(y = .fitted, color = "Fitted")) +
   scale_color_manual(values = c(Data = "grey", Fitted = "red"))

 # K = 2:
 
 fit_plot1 <- aug_fit %>% 
   filter(.model == "K2")
 View(fit_plot1)
 
 ggplot(fit_plot1, aes(x = Week)) +
   geom_line(aes(y = Barrels, color = "Data")) +
   geom_line(aes(y = .fitted, color = "Fitted")) +
   scale_color_manual(values = c(Data = "grey", Fitted = "red"))

 # K = 6
 
 fit_plot2 <- aug_fit %>% 
   filter(.model == "K6")
 View(fit_plot)
 
 ggplot(fit_plot2, aes(x = Week)) +
   geom_line(aes(y = Barrels, color = "Data")) +
   geom_line(aes(y = .fitted, color = "Fitted")) +
   scale_color_manual(values = c(Data = "grey", Fitted = "red"))

 # It is clear that as the K value increases, the fitted becomes more
 # precise. If we chose k = 26, it would fit the data exactly. 
 
# b. 
 
 # CV is for cross-valadtion, so I will be finding the AIC. The model
 # with the minimum value of the AIC is often the best model
 # for forecasting. 
 
 
 glance(fit) %>% select(.model, r_squared, adj_r_squared, AICc)
## # A tibble: 6 × 4
##   .model r_squared adj_r_squared   AICc
##   <chr>      <dbl>         <dbl>  <dbl>
## 1 K1         0.825         0.824 -1809.
## 2 K2         0.827         0.826 -1814.
## 3 K3         0.837         0.835 -1853.
## 4 K4         0.839         0.837 -1856.
## 5 K5         0.841         0.838 -1862.
## 6 K6         0.846         0.844 -1884.
 # The AIC is increasing as K increases. Therefore, K1 is our best
 # bet. 
 
 # c. 
 
 barrels_fit <- gas %>% 
   model(TSLM(Barrels ~ trend() + fourier(K = 1)))
 barrels_fit
## # A mable: 1 x 1
##   `TSLM(Barrels ~ trend() + fourier(K = 1))`
##                                      <model>
## 1                                     <TSLM>
 barrels_fit %>% 
   gg_tsresiduals()

 # The residuals are centered around the mean for the most part
 # There seems to be an outlier. The variance seems to be pretty 
 # constant across. 
 
 barrels_fit %>% 
   augment() %>% 
   features(.innov, ljung_box, lag = 52, dof = 0)
## # A tibble: 1 × 3
##   .model                                   lb_stat  lb_pvalue
##   <chr>                                      <dbl>      <dbl>
## 1 TSLM(Barrels ~ trend() + fourier(K = 1))    113. 0.00000198
 # The p value is very small. Thus, the autocorrelation of residuals are 
 # distinguishable from a white noise noise series. 
 
 
 # d. 
 
# gas is the training data, so we need to create test data of 2015
 
gas1 <- us_gasoline %>% 
  filter(year(Week) <= 2005)

# Now, the forecast :)
 
 barrels_fc <- gas %>% 
  model(TSLM(Barrels ~ trend() + fourier(K = 1))) %>% 
  forecast(h = 52) %>% 
  autoplot(gas1)
barrels_fc 

# The PI seems to be pretty accurate, maybe a little too accurate. I 
# want to check the accuracy of it 

barrels_fc1 <- gas %>% 
  model(TSLM(Barrels ~ trend() + fourier(K = 1))) %>% 
  forecast(h = 52)

accuracy(barrels_fc1, gas1) 
## # A tibble: 1 × 10
##   .model                .type      ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                 <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSLM(Barrels ~ trend… Test  -0.0515 0.228 0.154 -0.608  1.70 0.470 0.568 0.770
# The RMSE is pretty low! Seems a little too accurate!  

Question 5

# The annual population of Afghanistan:

afg <- global_economy %>% 
  filter(Country == "Afghanistan") %>% 
  select(Population)
afg
## # A tsibble: 58 x 2 [1Y]
##    Population  Year
##         <dbl> <dbl>
##  1    8996351  1960
##  2    9166764  1961
##  3    9345868  1962
##  4    9533954  1963
##  5    9731361  1964
##  6    9938414  1965
##  7   10152331  1966
##  8   10372630  1967
##  9   10604346  1968
## 10   10854428  1969
## # … with 48 more rows
## # ℹ Use `print(n = ...)` to see more rows
# a. 

autoplot(afg)
## Plot variable not specified, automatically selected `.vars = Population`

# The Soviet-Afghan war was during the years 1979 to 1989. 
# This is the same time as when Afghanistan's population declines 
# on the graph. 

# b. 

# Linear trend model 

ggplot(afg, aes(x = Year, y = Population)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# And now a piecewise linear trend model with knots at 1980 and 
# 1989. 

fit_afg <- afg %>% 
  model(piecewise = TSLM(Population ~ trend(knots = c(1980, 1989))))
fit_afg
## # A mable: 1 x 1
##   piecewise
##     <model>
## 1    <TSLM>
ggplot(afg, aes(x = Year, y = Population)) +
  geom_line() +
  geom_line(data = fitted(fit_afg), aes(y = .fitted, color = .model))

# The piecewise is obviously much more accurate than the linear regression 
# model. 

# c. 

# For the linear model 

afg %>% 
  model(TSLM(Population ~ Year)) %>% 
  forecast(h = 5) %>% 
  autoplot(afg)

# For the piecewise model 

fit_afg %>% 
  forecast(h = 5) %>% 
  autoplot(afg)

# The piecewise forecast is much more accurate than the linear model 

Question 6

# a. 


# The OLS Assumptions

# 1. We assume that the model is a reasonable approximation to
# reality (linear). So basically, there's correct specification 
# and the model is linear in the coefficients. 

# 2. No multicollinearity - highly (or perfectly) correlated predictors.
# This is because it would be hard to measure the variables independently 
# of one another, making the standard error of the coefficients increase. 

# 3. The sum of the errors is zero. This is because they are treated as 
# the sum of the standard devuiations from the mean zero, without squaring 
# the nagative values. 

# 4. Errors have a constant variance (homoskedasticity) - if not, it gives 
# biased standard errors of the coefficent. 

# 5. The errors are not autocorrelated; otherwise the forecasts will be 
# inefficient, as there is more information in the data that can be exploited. 
# This usually happens when the data is a time series. 

# 6. Errors are unrelated to the predictor variables; otherwise there would 
# be more information that should be included in the systematic part of the
# model, aka ommitted variables, and would cause bias for the coefficients.

# 7. Errors follow a normal distribution. So, there are no outliers and there's 
# a constant variance. 


# b. 



# An unbiased estimator means that the expected value of the estimation 
# equals the true value of the coefficient. Consistency means that as you 
# increase the sample size of the estimators (even to infinity), it still 
# equals the true value of the coefficient. 



# c. 



# the difference between R squared and adjusted R squared is that 
# the adjusted R squared is adjusted to the degrees of freedom.