Regression (Model Prediction) Exercises

Model 1: Plotting data & testing assumptions (Exercise 0)

# loading datasets and saving them as objects
wine <- read.csv("QRS_winedata.csv", row.names=1) # use indent key to see all data in working directory
runners <- read.csv("QRS_100meterSprint.csv")

# checking the 1,2,3! (mean, spread, distribution)
summary(wine)

##       temp             time            type          
##  Min.   : 9.547   Min.   :   0.0   Length:72         
##  1st Qu.:11.010   1st Qu.: 613.2   Class :character  
##  Median :12.125   Median :1226.5   Mode  :character  
##  Mean   :12.858   Mean   :1226.5                     
##  3rd Qu.:14.020   3rd Qu.:1839.7                     
##  Max.   :18.691   Max.   :2453.0

str(wine)

## 'data.frame':    72 obs. of  3 variables:
##  $ temp: num  9.81 10 9.95 10.67 10.97 ...
##  $ time: num  0 70.1 140.2 210.3 280.3 ...
##  $ type: chr  "Bowl" "Bowl" "Bowl" "Bowl" ...

summary(runners)

##   olympic_year       time          event          
##  Min.   :1900   Min.   : 9.63   Length:62         
##  1st Qu.:1929   1st Qu.:10.08   Class :character  
##  Median :1960   Median :10.73   Mode  :character  
##  Mean   :1960   Mean   :10.63                     
##  3rd Qu.:1991   3rd Qu.:11.00                     
##  Max.   :2020   Max.   :12.20                     
##                 NA's   :12

str(runners)

## 'data.frame':    62 obs. of  3 variables:
##  $ olympic_year: int  1900 1904 1908 1912 1916 1920 1924 1928 1932 1936 ...
##  $ time        : num  11 11 10.8 10.8 NA 10.8 10.6 10.8 10.3 10.3 ...
##  $ event       : chr  "mens" "mens" "mens" "mens" ...

# creating a simple model for each dataset
wine$type <- as.factor(wine$type) # turning holding type category into factor
runners$event <- as.factor(runners$event) # turning mens vs womens event category into factor
colnames(runners)[1] <- "year" # making the olympic_year column name shorter for ease of use
wineMod1 <- lm(temp~time, data=wine)
runnersMod1 <- lm(time~year, data=runners)

# extracting intercept coefficients from simple model summaries
coefficients(wineMod1)[2] # slope of time

##       time 
## 0.00262675

summary(wineMod1) # or: gives full wine model summary

## 
## Call:
## lm(formula = temp ~ time, data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.52440 -1.06482  0.02575  1.20746  2.61117 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.6364788  0.3254305   29.61   <2e-16 ***
## time        0.0026267  0.0002282   11.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.41 on 70 degrees of freedom
## Multiple R-squared:  0.6544, Adjusted R-squared:  0.6494 
## F-statistic: 132.5 on 1 and 70 DF,  p-value: < 2.2e-16

coefficients(runnersMod1)[2] # slope of year

##         year 
## -0.008445314

summary(runnersMod1) # or: gives full runners model summary

## 
## Call:
## lm(formula = time ~ year, data = runners)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6898 -0.4961 -0.2276  0.4774  1.2224 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.260162   4.656643   5.854 4.19e-07 ***
## year        -0.008445   0.002365  -3.572 0.000819 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5568 on 48 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:   0.21,  Adjusted R-squared:  0.1935 
## F-statistic: 12.76 on 1 and 48 DF,  p-value: 0.0008192

# checking plots at the same time
par(mfrow=c(1,2))
# wine temperature data:
wineCols <- c("#66c2a5","#fc8d62")
names(wineCols) <- c("stem", "bowl")
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
     col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
     xlab="Time (s)", ylab=expression("Temperature ("*degree*"C)"), main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod1 <- coef(wineMod1)
abline(wineParamMod1[1], wineParamMod1[2], col="darkblue", lwd=2, lty=2)
# olympics 100m sprint data:
runCols <- c("#2b8cbe", "#8856a7")
names(runCols) <- c("mens", "womens")
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
     col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
     xlab="Olympic Year", ylab=expression("Time (s)"), main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod1 <- coef(runnersMod1)
abline(runnersParamMod1[1], runnersParamMod1[2], col="darkblue", lwd=2, lty=2)

# testing assumptions for both first models
par(mfrow=c(2,2))
require(car)

## Loading required package: car

## Loading required package: carData

# wine temperature data:
wineRes1 <- residuals(wineMod1) # extracting residuals
wineFit1 <- fitted(wineMod1) # extracting fitted values (predictions)
plot(wineFit1, wineRes1, xlab="Fitted values", ylab="Residuals", 
     main="Wine Temperature and Hold Mod. I") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(wineRes1, xlab="Theoretical quantiles", ylab="Standardized quantiles",
       main="QQ-Plot: Wine Temperature Mod. I") # testing for normality

## 36 94 
## 36 70

# olympics 100m sprint data:
runnersRes1 <- residuals(runnersMod1)
runnersFit1 <- fitted(runnersMod1)
plot(runnersFit1, runnersRes1, xlab="Fitted values", ylab="Residuals", 
     main="Olympics 100m Sprint Time Mod. I") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(runnersRes1, xlab="Theoretical quantiles", ylab="Standardized quantiles",
       main="QQ-Plot: 100m Sprint Time Mod. I") # testing for normality

## 39 44 
## 29 32

Hypotheses based on Model 1

Evidently, model 1 does not fit the data well, since the data seems to contain two groups (respectively). Testing the assumptions also reveals that model 1 is not a good model, since the assumptions of both homoscedasticity and linearity are clearly violated. While testing results for normality remain within acceptable limits (though more questionable for the olympic 100m sprint data), there is still deviation at the tails as well as some clustering as seen on the QQ-Plots.

Hypotheses for the wine temperature by hold type scenario -

H0: Holding a wine glass by the bowl versus stem has no effect on the rate at which the temperature of the wine increases.
HA: Holding a wine glass by the bowl significantly increases the rate at which the temperature of the wine increases.

Hypotheses for the Olympic 100m sprint scenario -

H0: There is no difference in the rate of improvement in 100m sprint times between men and women over time.
HA: The rate of improvement in 100m sprint times differs between men and women over time.

Model 2: Plotting data (Exercise 1)

Which statistic, that you can extract from a regression model, measures the rate at which wine temperature increases over time?

# creating interaction model for each dataset
wineMod2 <- lm(temp~time*type,data=wine)
runnersMod2 <- lm(time~year*event,data=runners)

# plotting them again with the new regression lines
par(mfrow=c(2,1))
# wine temperature data:
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
     col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
     xlab="Time (s)", ylab=expression("Temperature ("*degree*"C)"), main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod2 <- coef(wineMod2)
abline(wineParamMod2[1], wineParamMod2[2], col="#fc8d62", lwd=2, lty=2)
abline(wineParamMod2[1] + wineParamMod2[3], wineParamMod2[2] + wineParamMod2[4], 
       col="#66c2a5", lwd=2, lty=2)
# olympics 100m sprint data:
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
     col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
     xlab="Olympic Year", ylab=expression("Time (s)"), main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod2 <- coef(runnersMod2)
abline(runnersParamMod2[1], runnersParamMod2[2], col="#2b8cbe", lwd=2, lty=2)
abline(runnersParamMod2[1] + runnersParamMod2[3], runnersParamMod2[2] + runnersParamMod2[4], 
       col="#8856a7", lwd=2, lty=2)

Answer: The slope $beta1$! In a linear regression, the slope informs how much Y changes as X increases. As temperature is the response variable (Y), and time is the independent variable (X), the slope therefore measures how much temperature changes as time changes.

\[ Y\sim\beta0+\beta1*X+\epsilon \] Here y is the response, and x the predictor variable. The regression coefficients are $beta0$ and $beta1$, which are the typical symbols statisticians use to refer to the intercept and slope respectively. The final symbol $epsilon$ refers to the regression error - and not the residuals. This is because $epsilon$ is a property of the population, and not the sample (like the residuals). It is the expected error for a prediction in y - which is assumed to be normally distributed with a mean of zero and a standard error of $epsilon$.

Model 2: Testing assumptions (Exercise 2)

Are the assumptions of the second model violated?

\[ y_i \sim (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x + \epsilon \]

# testing assumptions for both second models
par(mfrow=c(2,2))
require(car)

# wine data
wineRes2 <- residuals(wineMod2) # extracting residuals
wineFit2 <- fitted(wineMod2) # extracting fitted values (predictions)
plot(wineFit2, wineRes2, xlab="Fitted values", ylab="Residuals", 
     main="Wine Temperature and Hold Mod. II") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(wineRes2, xlab="Theoretical quantiles", ylab="Standardized quantiles",
       main="QQ-Plot: Wine Temperature Mod. II") # testing for normality

## [1] 27 13

# runners data
runnersRes2 <- residuals(runnersMod2)
runnersFit2 <- fitted(runnersMod2)
plot(runnersFit2, runnersRes2, xlab="Fitted values", ylab="Residuals", 
     main="Olympics 100m Sprint Time Mod. II") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(runnersRes2, xlab="Theoretical quantiles", ylab="Standardized quantiles",
       main="QQ-Plot: 100m Sprint Time Mod. II") # testing for normality

## 54 47 
## 42 35

Answer: …

Brief note on prediction vs confidence intervals

Confidence Interval (CI):
- Uncertainty about the average/mean response
- “Where is the true regression LINE?”
- Narrower
Prediction Interval (PI):
- Uncertainty about an individual observation
- “Where will a NEW data point fall?”
- Wider (because individual points scatter around the line)

For example, if predicting wine temperature at 10mins, the average temperature of all wines at 10mins is 15 degrees Celsius ± 0.5 degrees Celsius (confidence interval), while the expected temperature of a single wine glass at 10mins will be 15 degrees Celsius ± 2 degrees Celsius (prediction interval).

Model 2: Extracting information from the summary table (Exercise 3)

Answer the list of questions on the regression summary table statistics in this weeks online quiz.

Wine temperature data:

What was the temperature of the wine when the experiment started (time = 0 i.e. x = 0)?
Was the starting temperature of the wine in the glass held by the stem (“typeStem”) significantly colder or warmer than that of the one held by the bowl?
Does the temperature of the wine in the glass held by the stem increase at a significantly slower rate compared to one held by the bowl?

Olympic 100m sprint data:

Is the gold medal finalist time for woman decreasing significantly faster than that for men?

# extracting information from model summaries
summary(wineMod2) # or: gives full wine model summary

## 
## Call:
## lm(formula = temp ~ time * type, data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49384 -0.16574  0.01764  0.17268  0.52854 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.656e+00  8.001e-02  120.69   <2e-16 ***
## time           3.575e-03  5.610e-05   63.73   <2e-16 ***
## typeStem      -3.958e-02  1.131e-01   -0.35    0.728    
## time:typeStem -1.896e-03  7.933e-05  -23.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.245 on 68 degrees of freedom
## Multiple R-squared:  0.9899, Adjusted R-squared:  0.9894 
## F-statistic:  2211 on 3 and 68 DF,  p-value: < 2.2e-16

summary(runnersMod2) # or: gives full runners model summary

## 
## Call:
## lm(formula = time ~ year * event, data = runners)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43447 -0.06877  0.00188  0.10829  0.35216 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      30.9592503  1.7114584  18.089  < 2e-16 ***
## year             -0.0105586  0.0008718 -12.112 6.55e-16 ***
## eventwomens       8.9531569  3.1154356   2.874  0.00612 ** 
## year:eventwomens -0.0039977  0.0015791  -2.532  0.01484 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1674 on 46 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.9316, Adjusted R-squared:  0.9271 
## F-statistic: 208.8 on 3 and 46 DF,  p-value: < 2.2e-16

# wine temperature model:
# question 1 --> intercept
cat("Answer Q1: The temperature at time = 0 was ", coef(wineMod2)[1], ".")

## Answer Q1: The temperature at time = 0 was  9.656267 .

# question 2 --> interaction term (typeStem)
cat("Answer Q2: There is no significant difference.")

## Answer Q2: There is no significant difference.

# question 3 --> interaction term (time:typeStem)
cat("Answer Q3: Yes, because when held by the stem, temperature increases by ",
    (coef(wineMod2)[2]+coef(wineMod2)[4]),".")

## Answer Q3: Yes, because when held by the stem, temperature increases by  0.001678682 .

# olympic 100m sprint model:
# question 1 --> interaction term (year:eventwomens) p-value - it is negative, so yes!
cat("Answer Q3: ", (coef(runnersMod2)[2]+coef(runnersMod2)[4]))

## Answer Q3:  -0.01455631

Answer: All questions successfully answered on this session’s quiz.

Model 2 - Wine: Graphically predicting future values (Exercise 4)

Answer the following questions by making predictions with the second model.

How much earlier - on average and in whole minutes - does a wine glass held by the bowl reach room temperature compared to if it was held by the stem? To be precise, we are asking what is the difference in minutes between the points (on the x-axis) at which the bowl line and the stem line intersect with the temperature = 20 line.
What is the 5% earliest observable time to reach 20 oC for a glass held by the bowl (in whole minutes)? To answer B, decide if you should use the prediction or confidence interval - and think about the correct interval (the level argument in predict) including the upper and lower limits of this interval.
Calculate after how many minutes the temperature of the glass held by the bowl is predicted to reach boiling point (100 oC) on average?

# first checking the graphs
par(mfrow=c(2,1))
# extending y range to 20 degrees C (room temperature) and x range to 6,500 seconds
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
     col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
     xlab="Time (s)", xlim=c(0,6500), ylab=expression("Temperature ("*degree*"C)"), ylim=c(10,24),
     main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod2 <- coef(wineMod2)
abline(wineParamMod2[1], wineParamMod2[2], col="#fc8d62", lwd=2)
abline(wineParamMod2[1] + wineParamMod2[3], wineParamMod2[2] + wineParamMod2[4], 
       col="#66c2a5", lwd=2)
abline(h=20, col="red", lwd=2)

# extending y range to 100 degrees C (boiling temperature)
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
     col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
     xlab="Time (s)", xlim=c(0,60000), ylab=expression("Temperature ("*degree*"C)"), ylim=c(10,120),
     main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod2 <- coef(wineMod2)
abline(wineParamMod2[1], wineParamMod2[2], col="#fc8d62", lwd=2)
abline(wineParamMod2[1] + wineParamMod2[3], wineParamMod2[2] + wineParamMod2[4], 
       col="#66c2a5", lwd=2)
abline(h=100, col="red", lwd=2)

Model 2 - Wine: Algebraically predicting future values

How much earlier - on average and in whole minutes - does a wine glass held by the bowl reach room temperature compared to if it was held by the stem? To be precise, we are asking what is the difference in minutes between the points (on the x-axis) at which the bowl line and the stem line intersect with the temperature = 20 line.

For bowl type (z equals 0):
\[ \begin{align} 20 &= (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x \\ 20 &= \beta_0 + \beta_1 x \\ 20 - \beta_0 &= \beta_1 x \\ \frac{20 - \beta_0}{\beta_1} & = x \end{align} \]
For stem type (z equals 1):
\[ \begin{align} 20 &= (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x \\ 20 &= (\beta_0 + \alpha_0) + (\beta_1 + \alpha_1)x \\ 20 - (\beta_0 + \alpha_0) &= (\beta_1 + \alpha_1)x \\ \frac{20 - (\beta_0 + \alpha_0)}{\beta_1 + \alpha_1} &= x \end{align} \]

# wine temperature model:
coefficients(wineMod2)

##   (Intercept)          time      typeStem time:typeStem 
##   9.656266924   0.003574818  -0.039576190  -0.001896136

wineBowlb0 <- coef(wineMod2)[1] # intercept
wineBowlb1 <- coef(wineMod2)[2] # time slope (holding by bowl)
wineStemb0 <- coef(wineMod2)[3] # intercept  (holding by stem)
wineStemb1 <- coef(wineMod2)[4] # interaction term slope
# time comparison holding by bowl vs stem at temperature = 20 degrees C
stemTime <- ((20 - (wineBowlb0 + wineStemb0)) / (wineBowlb1 + wineStemb1)) / 60
bowlTime <- ((20 - wineBowlb0) / wineBowlb1) / 60
answerWineQ1 <- round(stemTime - bowlTime)
cat("Answer to Q1: ", answerWineQ1)

## Answer to Q1:  55

What is the 5% earliest observable time to reach 20 oC for a glass held by the bowl (in whole minutes)? To answer B, decide if you should use the prediction or confidence interval - and think about the correct interval (the level argument in predict) including the upper and lower limits of this interval.

This refers to the earliest observable time, not to an average time, therefore requiring prediction intervals and not confidence intervals.

# making a new dataset to extrapolate beyond data range!

# making a sequence xnew of potential times (in mins)
timeNew <- seq(1, 450, length.out=10000) # creating new entries for time
wineNew=data.frame(time=timeNew*60, type="Bowl")

# making predictions with prediction intervals (using seconds as in the data!)
wineyhat <- predict(wineMod2, wineNew, interval="prediction", level=0.1)
# interval="prediction" gets prediction interval with lower (lwr), fitted and upper (upr) bounds
#     --> uses 95% confidence by default (alpha = 0.05) - this means 2.5% in each tail

# finding when the upper PI bound crosses 20 degrees C (at or above 20)
rownumW <- which(wineyhat[,"upr"] >= 20)[1] # [,"upr] gets upper bound of prediction interval results
#     --> "upr" represents the 97.5th percentile (only 2.5% of glasses would be hotter)
answerWineQ2 <- round(timeNew[rownumW]) # uses row number to get corresponding time in mins
cat("Answer to Q2: ", answerWineQ2)

## Answer to Q2:  48

Calculate after how many minutes the temperature of the glass held by the bowl is predicted to reach boiling point (100 oC) on average?
\[ \begin{align} 100 &= (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x \\ 100 &= \beta_0 + \beta_1 x \\ 100 - \beta_0 &= \beta_1 x \\ \frac{100 - \beta_0}{\beta_1} & = x \end{align} \]

# time holding by bowl at temperature = 100 degrees C
answerWineQ3 <- round(((100 - wineBowlb0) / wineBowlb1) / 60)
cat("Answer to Q3: ", answerWineQ3)

## Answer to Q3:  421

Model 2 - Runners: Graphically predicting future values

Based on a model fit to the data, at which Olympic games (give the year) can we expect women to be faster than men on average?
Predict the first possible Olympic games (give the year) at which the woman’s event will be won instantaneously?

# olympic game year when women's time exceeds men's
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
     col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
     xlab="Olympic Year", xlim=c(1900,2250), ylab=expression("Time (s)"), ylim=c(0,13),
     main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod2 <- coef(runnersMod2)
abline(runnersParamMod2[1], runnersParamMod2[2], col="#2b8cbe", lwd=2)
abline(runnersParamMod2[1] + runnersParamMod2[3], runnersParamMod2[2] + runnersParamMod2[4], 
       col="#8856a7", lwd=2)

# first possible olympic game year when women's will be won after 0 seconds
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
     col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
     xlab="Olympic Year", xlim=c(1900,3000), ylab=expression("Time (s)"), ylim=c(0,13),
     main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod2 <- coef(runnersMod2)
abline(runnersParamMod2[1], runnersParamMod2[2], col="#2b8cbe", lwd=2)
abline(runnersParamMod2[1] + runnersParamMod2[3], runnersParamMod2[2] + runnersParamMod2[4], 
       col="#8856a7", lwd=2)

Model 2 - Runners: Algebraically predicting future values

Based on a model fit to the data, at which Olympic games (give the year) can we expect women to be faster than men on average?

men vs women times (in seconds):

\[ \begin{align} \text{Men: } time &= \beta_0 + \beta_1 \cdot year \\ \text{Women: } time &= (\beta_0 + \beta_2) + (\beta_1 + \beta_3) \cdot year \end{align} \] - setting them equal to find crossover point:

\[\begin{align} \beta_0 + \beta_1 \cdot year &= (\beta_0 + \beta_2) + (\beta_1 + \beta_3) \cdot year \\ \beta_0 + \beta_1 \cdot year &= \beta_0 + \beta_2 + \beta_1 \cdot year + \beta_3 \cdot year \\ 0 &= \beta_2 + \beta_3 \cdot year \\ year &= \frac{-\beta_2}{\beta_3} \end{align}\] $$

# saving the model coefficients into objects
coefficients(runnersMod2)

##      (Intercept)             year      eventwomens year:eventwomens 
##     30.959250279     -0.010558570      8.953156887     -0.003997738

runnersMenb0 <- coef(runnersMod2)[1]
runnersMenb1 <- coef(runnersMod2)[2]
runnersWomb0 <- coef(runnersMod2)[3]
runnersWomb1 <- coef(runnersMod2)[4]

# calculating in which olympic year men's and women's times are equal
answerRunQ1 <- round(-runnersWomb0/runnersWomb1)
cat("Answer Q1: ", answerRunQ1)

## Answer Q1:  2240

Predict the first possible Olympic games (give the year) at which the woman’s event will be won instantaneously?

# general prediction
newYear <- seq(1900,2800,4) # olympic years occur every 4 years, not every 1 year!
runnersNew <- data.frame(year=newYear, event="womens")
goldTime <- predict(runnersMod2, runnersNew, interval="prediction")

# calculating the first year
instant<-head(runnersNew[goldTime[,"lwr"]<=0,], 1)
instant

##     year  event
## 182 2624 womens