# loading datasets and saving them as objects
wine <- read.csv("QRS_winedata.csv", row.names=1) # use indent key to see all data in working directory
runners <- read.csv("QRS_100meterSprint.csv")
# checking the 1,2,3! (mean, spread, distribution)
summary(wine)
## temp time type
## Min. : 9.547 Min. : 0.0 Length:72
## 1st Qu.:11.010 1st Qu.: 613.2 Class :character
## Median :12.125 Median :1226.5 Mode :character
## Mean :12.858 Mean :1226.5
## 3rd Qu.:14.020 3rd Qu.:1839.7
## Max. :18.691 Max. :2453.0
str(wine)
## 'data.frame': 72 obs. of 3 variables:
## $ temp: num 9.81 10 9.95 10.67 10.97 ...
## $ time: num 0 70.1 140.2 210.3 280.3 ...
## $ type: chr "Bowl" "Bowl" "Bowl" "Bowl" ...
summary(runners)
## olympic_year time event
## Min. :1900 Min. : 9.63 Length:62
## 1st Qu.:1929 1st Qu.:10.08 Class :character
## Median :1960 Median :10.73 Mode :character
## Mean :1960 Mean :10.63
## 3rd Qu.:1991 3rd Qu.:11.00
## Max. :2020 Max. :12.20
## NA's :12
str(runners)
## 'data.frame': 62 obs. of 3 variables:
## $ olympic_year: int 1900 1904 1908 1912 1916 1920 1924 1928 1932 1936 ...
## $ time : num 11 11 10.8 10.8 NA 10.8 10.6 10.8 10.3 10.3 ...
## $ event : chr "mens" "mens" "mens" "mens" ...
# creating a simple model for each dataset
wine$type <- as.factor(wine$type) # turning holding type category into factor
runners$event <- as.factor(runners$event) # turning mens vs womens event category into factor
colnames(runners)[1] <- "year" # making the olympic_year column name shorter for ease of use
wineMod1 <- lm(temp~time, data=wine)
runnersMod1 <- lm(time~year, data=runners)
# extracting intercept coefficients from simple model summaries
coefficients(wineMod1)[2] # slope of time
## time
## 0.00262675
summary(wineMod1) # or: gives full wine model summary
##
## Call:
## lm(formula = temp ~ time, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.52440 -1.06482 0.02575 1.20746 2.61117
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6364788 0.3254305 29.61 <2e-16 ***
## time 0.0026267 0.0002282 11.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 70 degrees of freedom
## Multiple R-squared: 0.6544, Adjusted R-squared: 0.6494
## F-statistic: 132.5 on 1 and 70 DF, p-value: < 2.2e-16
coefficients(runnersMod1)[2] # slope of year
## year
## -0.008445314
summary(runnersMod1) # or: gives full runners model summary
##
## Call:
## lm(formula = time ~ year, data = runners)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6898 -0.4961 -0.2276 0.4774 1.2224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.260162 4.656643 5.854 4.19e-07 ***
## year -0.008445 0.002365 -3.572 0.000819 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5568 on 48 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.21, Adjusted R-squared: 0.1935
## F-statistic: 12.76 on 1 and 48 DF, p-value: 0.0008192
# checking plots at the same time
par(mfrow=c(1,2))
# wine temperature data:
wineCols <- c("#66c2a5","#fc8d62")
names(wineCols) <- c("stem", "bowl")
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
xlab="Time (s)", ylab=expression("Temperature ("*degree*"C)"), main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod1 <- coef(wineMod1)
abline(wineParamMod1[1], wineParamMod1[2], col="darkblue", lwd=2, lty=2)
# olympics 100m sprint data:
runCols <- c("#2b8cbe", "#8856a7")
names(runCols) <- c("mens", "womens")
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
xlab="Olympic Year", ylab=expression("Time (s)"), main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod1 <- coef(runnersMod1)
abline(runnersParamMod1[1], runnersParamMod1[2], col="darkblue", lwd=2, lty=2)
# testing assumptions for both first models
par(mfrow=c(2,2))
require(car)
## Loading required package: car
## Loading required package: carData
# wine temperature data:
wineRes1 <- residuals(wineMod1) # extracting residuals
wineFit1 <- fitted(wineMod1) # extracting fitted values (predictions)
plot(wineFit1, wineRes1, xlab="Fitted values", ylab="Residuals",
main="Wine Temperature and Hold Mod. I") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(wineRes1, xlab="Theoretical quantiles", ylab="Standardized quantiles",
main="QQ-Plot: Wine Temperature Mod. I") # testing for normality
## 36 94
## 36 70
# olympics 100m sprint data:
runnersRes1 <- residuals(runnersMod1)
runnersFit1 <- fitted(runnersMod1)
plot(runnersFit1, runnersRes1, xlab="Fitted values", ylab="Residuals",
main="Olympics 100m Sprint Time Mod. I") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(runnersRes1, xlab="Theoretical quantiles", ylab="Standardized quantiles",
main="QQ-Plot: 100m Sprint Time Mod. I") # testing for normality
## 39 44
## 29 32
Evidently, model 1 does not fit the data well, since the data seems to contain two groups (respectively). Testing the assumptions also reveals that model 1 is not a good model, since the assumptions of both homoscedasticity and linearity are clearly violated. While testing results for normality remain within acceptable limits (though more questionable for the olympic 100m sprint data), there is still deviation at the tails as well as some clustering as seen on the QQ-Plots.
Which statistic, that you can extract from a regression model, measures the rate at which wine temperature increases over time?
# creating interaction model for each dataset
wineMod2 <- lm(temp~time*type,data=wine)
runnersMod2 <- lm(time~year*event,data=runners)
# plotting them again with the new regression lines
par(mfrow=c(2,1))
# wine temperature data:
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
xlab="Time (s)", ylab=expression("Temperature ("*degree*"C)"), main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod2 <- coef(wineMod2)
abline(wineParamMod2[1], wineParamMod2[2], col="#fc8d62", lwd=2, lty=2)
abline(wineParamMod2[1] + wineParamMod2[3], wineParamMod2[2] + wineParamMod2[4],
col="#66c2a5", lwd=2, lty=2)
# olympics 100m sprint data:
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
xlab="Olympic Year", ylab=expression("Time (s)"), main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod2 <- coef(runnersMod2)
abline(runnersParamMod2[1], runnersParamMod2[2], col="#2b8cbe", lwd=2, lty=2)
abline(runnersParamMod2[1] + runnersParamMod2[3], runnersParamMod2[2] + runnersParamMod2[4],
col="#8856a7", lwd=2, lty=2)
Answer: The slope \(beta1\)! In a linear regression, the slope informs how much Y changes as X increases. As temperature is the response variable (Y), and time is the independent variable (X), the slope therefore measures how much temperature changes as time changes.
\[ Y\sim\beta0+\beta1*X+\epsilon \] Here y is the response, and x the predictor variable. The regression coefficients are \(beta0\) and \(beta1\), which are the typical symbols statisticians use to refer to the intercept and slope respectively. The final symbol \(epsilon\) refers to the regression error - and not the residuals. This is because \(epsilon\) is a property of the population, and not the sample (like the residuals). It is the expected error for a prediction in y - which is assumed to be normally distributed with a mean of zero and a standard error of \(epsilon\).
Are the assumptions of the second model violated?
\[ y_i \sim (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x + \epsilon \]
# testing assumptions for both second models
par(mfrow=c(2,2))
require(car)
# wine data
wineRes2 <- residuals(wineMod2) # extracting residuals
wineFit2 <- fitted(wineMod2) # extracting fitted values (predictions)
plot(wineFit2, wineRes2, xlab="Fitted values", ylab="Residuals",
main="Wine Temperature and Hold Mod. II") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(wineRes2, xlab="Theoretical quantiles", ylab="Standardized quantiles",
main="QQ-Plot: Wine Temperature Mod. II") # testing for normality
## [1] 27 13
# runners data
runnersRes2 <- residuals(runnersMod2)
runnersFit2 <- fitted(runnersMod2)
plot(runnersFit2, runnersRes2, xlab="Fitted values", ylab="Residuals",
main="Olympics 100m Sprint Time Mod. II") # testing for homoscedasticity and linearity
abline(h=0, lty=2, col="orangered")
qqPlot(runnersRes2, xlab="Theoretical quantiles", ylab="Standardized quantiles",
main="QQ-Plot: 100m Sprint Time Mod. II") # testing for normality
## 54 47
## 42 35
Answer: …
Confidence Interval (CI):
Prediction Interval (PI):
For example, if predicting wine temperature at 10mins, the average temperature of all wines at 10mins is 15 degrees Celsius ± 0.5 degrees Celsius (confidence interval), while the expected temperature of a single wine glass at 10mins will be 15 degrees Celsius ± 2 degrees Celsius (prediction interval).
Answer the list of questions on the regression summary table statistics in this weeks online quiz.
Wine temperature data:
Olympic 100m sprint data:
# extracting information from model summaries
summary(wineMod2) # or: gives full wine model summary
##
## Call:
## lm(formula = temp ~ time * type, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49384 -0.16574 0.01764 0.17268 0.52854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.656e+00 8.001e-02 120.69 <2e-16 ***
## time 3.575e-03 5.610e-05 63.73 <2e-16 ***
## typeStem -3.958e-02 1.131e-01 -0.35 0.728
## time:typeStem -1.896e-03 7.933e-05 -23.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.245 on 68 degrees of freedom
## Multiple R-squared: 0.9899, Adjusted R-squared: 0.9894
## F-statistic: 2211 on 3 and 68 DF, p-value: < 2.2e-16
summary(runnersMod2) # or: gives full runners model summary
##
## Call:
## lm(formula = time ~ year * event, data = runners)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43447 -0.06877 0.00188 0.10829 0.35216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.9592503 1.7114584 18.089 < 2e-16 ***
## year -0.0105586 0.0008718 -12.112 6.55e-16 ***
## eventwomens 8.9531569 3.1154356 2.874 0.00612 **
## year:eventwomens -0.0039977 0.0015791 -2.532 0.01484 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1674 on 46 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.9316, Adjusted R-squared: 0.9271
## F-statistic: 208.8 on 3 and 46 DF, p-value: < 2.2e-16
# wine temperature model:
# question 1 --> intercept
cat("Answer Q1: The temperature at time = 0 was ", coef(wineMod2)[1], ".")
## Answer Q1: The temperature at time = 0 was 9.656267 .
# question 2 --> interaction term (typeStem)
cat("Answer Q2: There is no significant difference.")
## Answer Q2: There is no significant difference.
# question 3 --> interaction term (time:typeStem)
cat("Answer Q3: Yes, because when held by the stem, temperature increases by ",
(coef(wineMod2)[2]+coef(wineMod2)[4]),".")
## Answer Q3: Yes, because when held by the stem, temperature increases by 0.001678682 .
# olympic 100m sprint model:
# question 1 --> interaction term (year:eventwomens) p-value - it is negative, so yes!
cat("Answer Q3: ", (coef(runnersMod2)[2]+coef(runnersMod2)[4]))
## Answer Q3: -0.01455631
Answer: All questions successfully answered on this session’s quiz.
Answer the following questions by making predictions with the second model.
# first checking the graphs
par(mfrow=c(2,1))
# extending y range to 20 degrees C (room temperature) and x range to 6,500 seconds
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
xlab="Time (s)", xlim=c(0,6500), ylab=expression("Temperature ("*degree*"C)"), ylim=c(10,24),
main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod2 <- coef(wineMod2)
abline(wineParamMod2[1], wineParamMod2[2], col="#fc8d62", lwd=2)
abline(wineParamMod2[1] + wineParamMod2[3], wineParamMod2[2] + wineParamMod2[4],
col="#66c2a5", lwd=2)
abline(h=20, col="red", lwd=2)
# extending y range to 100 degrees C (boiling temperature)
plot(temp~time, data=wine, pch=ifelse(wine$type=="Bowl", 2, 1), # if bowl pch 2, if stem pch 16
col=ifelse(wine$type=="Bowl", wineCols["bowl"], wineCols["stem"]), # if bowl teal, if stem terracotta
xlab="Time (s)", xlim=c(0,60000), ylab=expression("Temperature ("*degree*"C)"), ylim=c(10,120),
main="Wine Temperature and Hold")
legend("topleft", legend=c("Holding by Stem","Holding by Bowl"), pch=c(1,2), col=wineCols, bty="n")
grid() # insert a grid to more easily read
wineParamMod2 <- coef(wineMod2)
abline(wineParamMod2[1], wineParamMod2[2], col="#fc8d62", lwd=2)
abline(wineParamMod2[1] + wineParamMod2[3], wineParamMod2[2] + wineParamMod2[4],
col="#66c2a5", lwd=2)
abline(h=100, col="red", lwd=2)
For bowl type (z equals 0):
\[
\begin{align}
20 &= (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x \\
20 &= \beta_0 + \beta_1 x \\
20 - \beta_0 &= \beta_1 x \\
\frac{20 - \beta_0}{\beta_1} & = x
\end{align}
\]
For stem type (z equals 1):
\[
\begin{align}
20 &= (\beta_0 + \alpha_0 z) + (\beta_1 + \alpha_1 z)x \\
20 &= (\beta_0 + \alpha_0) + (\beta_1 + \alpha_1)x \\
20 - (\beta_0 + \alpha_0) &= (\beta_1 + \alpha_1)x \\
\frac{20 - (\beta_0 + \alpha_0)}{\beta_1 + \alpha_1} &= x
\end{align}
\]
# wine temperature model:
coefficients(wineMod2)
## (Intercept) time typeStem time:typeStem
## 9.656266924 0.003574818 -0.039576190 -0.001896136
wineBowlb0 <- coef(wineMod2)[1] # intercept
wineBowlb1 <- coef(wineMod2)[2] # time slope (holding by bowl)
wineStemb0 <- coef(wineMod2)[3] # intercept (holding by stem)
wineStemb1 <- coef(wineMod2)[4] # interaction term slope
# time comparison holding by bowl vs stem at temperature = 20 degrees C
stemTime <- ((20 - (wineBowlb0 + wineStemb0)) / (wineBowlb1 + wineStemb1)) / 60
bowlTime <- ((20 - wineBowlb0) / wineBowlb1) / 60
answerWineQ1 <- round(stemTime - bowlTime)
cat("Answer to Q1: ", answerWineQ1)
## Answer to Q1: 55
# making a new dataset to extrapolate beyond data range!
# making a sequence xnew of potential times (in mins)
timeNew <- seq(1, 450, length.out=10000) # creating new entries for time
wineNew=data.frame(time=timeNew*60, type="Bowl")
# making predictions with prediction intervals (using seconds as in the data!)
wineyhat <- predict(wineMod2, wineNew, interval="prediction", level=0.1)
# interval="prediction" gets prediction interval with lower (lwr), fitted and upper (upr) bounds
# --> uses 95% confidence by default (alpha = 0.05) - this means 2.5% in each tail
# finding when the upper PI bound crosses 20 degrees C (at or above 20)
rownumW <- which(wineyhat[,"upr"] >= 20)[1] # [,"upr] gets upper bound of prediction interval results
# --> "upr" represents the 97.5th percentile (only 2.5% of glasses would be hotter)
answerWineQ2 <- round(timeNew[rownumW]) # uses row number to get corresponding time in mins
cat("Answer to Q2: ", answerWineQ2)
## Answer to Q2: 48
# time holding by bowl at temperature = 100 degrees C
answerWineQ3 <- round(((100 - wineBowlb0) / wineBowlb1) / 60)
cat("Answer to Q3: ", answerWineQ3)
## Answer to Q3: 421
# olympic game year when women's time exceeds men's
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
xlab="Olympic Year", xlim=c(1900,2250), ylab=expression("Time (s)"), ylim=c(0,13),
main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod2 <- coef(runnersMod2)
abline(runnersParamMod2[1], runnersParamMod2[2], col="#2b8cbe", lwd=2)
abline(runnersParamMod2[1] + runnersParamMod2[3], runnersParamMod2[2] + runnersParamMod2[4],
col="#8856a7", lwd=2)
# first possible olympic game year when women's will be won after 0 seconds
plot(time~year, data=runners, pch=ifelse(runners$event=="mens", 5, 2), # if mens pch 5, if womens pch 2
col=ifelse(runners$event=="mens", runCols["mens"], runCols["womens"]), # if mens blue, if womens purple
xlab="Olympic Year", xlim=c(1900,3000), ylab=expression("Time (s)"), ylim=c(0,13),
main="Olympics 100m Sprint Time")
legend("topright", legend=c("Men's 100m","Women's 100m"), pch=c(1,2), col=runCols, bty="n")
grid() # insert a grid to more easily read
runnersParamMod2 <- coef(runnersMod2)
abline(runnersParamMod2[1], runnersParamMod2[2], col="#2b8cbe", lwd=2)
abline(runnersParamMod2[1] + runnersParamMod2[3], runnersParamMod2[2] + runnersParamMod2[4],
col="#8856a7", lwd=2)
\[ \begin{align} \text{Men: } time &= \beta_0 + \beta_1 \cdot year \\ \text{Women: } time &= (\beta_0 + \beta_2) + (\beta_1 + \beta_3) \cdot year \end{align} \] - setting them equal to find crossover point:
$$
\[\begin{align} \beta_0 + \beta_1 \cdot year &= (\beta_0 + \beta_2) + (\beta_1 + \beta_3) \cdot year \\ \beta_0 + \beta_1 \cdot year &= \beta_0 + \beta_2 + \beta_1 \cdot year + \beta_3 \cdot year \\ 0 &= \beta_2 + \beta_3 \cdot year \\ year &= \frac{-\beta_2}{\beta_3} \end{align}\] $$
# saving the model coefficients into objects
coefficients(runnersMod2)
## (Intercept) year eventwomens year:eventwomens
## 30.959250279 -0.010558570 8.953156887 -0.003997738
runnersMenb0 <- coef(runnersMod2)[1]
runnersMenb1 <- coef(runnersMod2)[2]
runnersWomb0 <- coef(runnersMod2)[3]
runnersWomb1 <- coef(runnersMod2)[4]
# calculating in which olympic year men's and women's times are equal
answerRunQ1 <- round(-runnersWomb0/runnersWomb1)
cat("Answer Q1: ", answerRunQ1)
## Answer Q1: 2240
# general prediction
newYear <- seq(1900,2800,4) # olympic years occur every 4 years, not every 1 year!
runnersNew <- data.frame(year=newYear, event="womens")
goldTime <- predict(runnersMod2, runnersNew, interval="prediction")
# calculating the first year
instant<-head(runnersNew[goldTime[,"lwr"]<=0,], 1)
instant
## year event
## 182 2624 womens