TextBook Page 699

Forbes studied the linear relationship between boiling point and pressure.

We would like to explain the pressure by using the boiling point as an explanatory variable.

library(MASS) # this library contains many datasets and functions
dataset <- data.frame(forbes)
# Conversion Fahrenheit to Celsius
dataset$bp_c <- (data$bp-32)*5/9
# Transform the pressure to 100*log(pressure) 
dataset$lnpres <- 100*log10(dataset$pres)
show(dataset)
      bp  pres      bp_c   lnpres
1  194.5 20.79  90.27778 131.7854
2  194.3 20.79  90.16667 131.7854
3  197.9 22.40  92.16667 135.0248
4  198.4 22.67  92.44444 135.5452
5  199.4 23.15  93.00000 136.4551
6  199.9 23.35  93.27778 136.8287
7  200.9 23.89  93.83333 137.8216
8  201.1 23.99  93.94444 138.0030
9  201.4 24.02  94.11111 138.0573
10 201.3 24.01  94.05556 138.0392
11 203.6 25.14  95.33333 140.0365
12 204.6 26.57  95.88889 142.4392
13 209.5 28.49  98.61111 145.4692
14 208.6 27.76  98.11111 144.3419
15 210.7 29.04  99.27778 146.2997
16 211.9 29.88  99.94444 147.5381
17 212.2 30.06 100.11111 147.7989
bp: boiling point (degrees Farenheit)
bp_c: boiling point (degrees Celsius)
pres: barometric pressure in inches of mercury
# Create scatter plot 
plot(bp_c~pres, data=data)

Run regression pres~bp

# Use optimization to find regression coefficients
ls <- function(dataset, par)
{with(dataset, sum((pres-par[1]-par[2]*bp)^2))}
result <- optim(par=c(0,0),ls,data=dataset)
# starting values for the parameters to be optimised (0,0,0)
coef <- result$par
coef
[1] -81.1864866   0.5234972

Run regression pres~bp_c

ls2 <- function(dataset, par)
{with(dataset, sum((pres-par[1]-par[2]*bp_c)^2))}
result2 <- optim(par=c(0,0),ls,data=dataset)
# starting values for the parameters to be optimised (0,0,0)
coef2 <- result2$par
coef2
[1] -81.1864866   0.5234972

Run regression pres~bp_c

ls <- function(dataset, par)
{with(dataset, sum((lnpres-par[1]-par[2]*bp)^2))}
result3 <- optim(par=c(0,0),ls,data=dataset)
# starting values for the parameters to be optimised (0,0,0)
coef3 <- result2$par
coef3
[1] -81.1864866   0.5234972

From the above, we can observe that linear transformation of independent variable(s) does not change the coefficients.

# Fit a linear regression model and check out the output
fit <- lm(pres~bp, data=dataset)
summary(fit)

Call:
lm(formula = pres ~ bp, data = dataset)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.25717 -0.11246 -0.05102  0.14283  0.64994 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -81.06373    2.05182  -39.51   <2e-16 ***
bp            0.52289    0.01011   51.74   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2328 on 15 degrees of freedom
Multiple R-squared:  0.9944,    Adjusted R-squared:  0.9941 
F-statistic:  2677 on 1 and 15 DF,  p-value: < 2.2e-16
# build a regression line by constructing the line
line <- coef[1]+coef[2]*dataset$bp
plot(pres~bp, data=dataset)
curve(coef[1]+coef[2]*x, from=190,to=215, add=TRUE, col="red")

# Do same thing with lm function
plot(pres~bp, data=dataset)
abline(lm(pres~bp,data=dataset),col="red")

Let’s calculate a 95% Confidence Interval for the slope

To construct the 95% CI we need mean, standard error, and criticacl value.

coef.table <- summary(fit)$coefficients
bp.mean <- coef.table[2,1]
bp.se <- coef.table[2,2]
df <- fit$df.residual # df =15
critval <- abs(qt(0.025,df)) # absolute value of 2.5th percentiles of the Student t distribution with 15 d.f
# 2.13145
print(coef.table)
               Estimate Std. Error   t value     Pr(>|t|)
(Intercept) -81.0637271 2.05182108 -39.50819 1.405140e-16
bp            0.5228924 0.01010601  51.74075 2.527702e-18
qt(c(.025, .975), df=15) # 2.5 percentile and 97.5 percentile of the Student t distribution with df = 15
[1] -2.13145  2.13145
# Calculate 95% CI
CIlow <- bp.mean - critval*bp.se
CIup <- bp.mean + critval*bp.se
interval <- c(CIlow,CIup)
interval
[1] 0.5013520 0.5444328

A 95% CI for the slope is (0.501,0.5444)

Since it didn’t contain 0, we can reject the null hypothesis

# Check with built in fucntion
confint(fit,"bp",level=0.95)
      2.5 %    97.5 %
bp 0.501352 0.5444328

Diagnostics and Residual Analysis

Is there evidence that the residuals do not follow a normal distribution?

fit$residuals
           1            2            3            4            5            6            7            8            9 
 0.151155176  0.255733656 -0.016678987 -0.008125187 -0.051017588 -0.112463788 -0.095356189 -0.099934669 -0.226802389 
          10           11           12           13           14           15           16           17 
-0.184513149 -0.257165671  0.649941928  0.007769164 -0.251627675 -0.069701717  0.142827402  0.165959682 
# plot residuals. We can spot an outlier
plot(fit$residuals)

#create normal probability plot with qqnorm function
qqnorm(fit$residuals)
qqline(fit$residuals)

fit.stdres = rstandard(fit)
plot(fit$fitted, fit.stdres, xlab="fitted values", ylab="standardized residuals")
abline(h=0, col=2,lwd=2)

In the presence of outliers, it is useful to do the analysis with and without the point.

Let’s remove the large residuals which deviates more than 2 s.d from the mean 0

index <- which(abs(fit.stdres) > 2)
index
12 
12 
dataset1 <- dataset[-index,]
str(dataset1)
'data.frame':   16 obs. of  4 variables:
 $ bp    : num  194 194 198 198 199 ...
 $ pres  : num  20.8 20.8 22.4 22.7 23.1 ...
 $ bp_c  : num  90.3 90.2 92.2 92.4 93 ...
 $ lnpres: num  132 132 135 136 136 ...
#Use dataset1 to run the same regression
fit1 <- lm(pres~bp, data=dataset1)
summary(fit1)

Call:
lm(formula = pres ~ bp, data = dataset1)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.21493 -0.09546 -0.01500  0.09048  0.27793 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -80.667294   1.419984  -56.81   <2e-16 ***
bp            0.520738   0.006997   74.42   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1608 on 14 degrees of freedom
Multiple R-squared:  0.9975,    Adjusted R-squared:  0.9973 
F-statistic:  5538 on 1 and 14 DF,  p-value: < 2.2e-16

The regression fit and coefficient did not change much

fit.stdres = rstandard(fit1)
plot(fit1$fitted, fit.stdres, xlab="Fitted values", ylab="Standardized residuals")
abline(h=0, col=2, lwd=2)

LS0tDQp0aXRsZTogIlJlbGF0aW9uc2hpcCBiZXR3ZWVuIEJvaWxpbmcgcG9pbnQgYW5kIFByZXNzdXJlIg0KYXV0aG9yOiBTdGF0aXN0aWNzMiBDbGFzcw0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyMjIyBUZXh0Qm9vayBQYWdlIDY5OQ0KIyMjIyBGb3JiZXMgc3R1ZGllZCB0aGUgbGluZWFyIHJlbGF0aW9uc2hpcCBiZXR3ZWVuIGJvaWxpbmcgcG9pbnQgYW5kIHByZXNzdXJlLiANCiMjIyMgV2Ugd291bGQgbGlrZSB0byBleHBsYWluIHRoZSBwcmVzc3VyZSBieSB1c2luZyB0aGUgYm9pbGluZyBwb2ludCBhcyBhbiBleHBsYW5hdG9yeSB2YXJpYWJsZS4NCg0KDQpgYGB7cn0NCmxpYnJhcnkoTUFTUykgIyB0aGlzIGxpYnJhcnkgY29udGFpbnMgbWFueSBkYXRhc2V0cyBhbmQgZnVuY3Rpb25zDQpkYXRhc2V0IDwtIGRhdGEuZnJhbWUoZm9yYmVzKQ0KIyBDb252ZXJzaW9uIEZhaHJlbmhlaXQgdG8gQ2Vsc2l1cw0KZGF0YXNldCRicF9jIDwtIChkYXRhJGJwLTMyKSo1LzkNCiMgVHJhbnNmb3JtIHRoZSBwcmVzc3VyZSB0byAxMDAqbG9nKHByZXNzdXJlKSANCmRhdGFzZXQkbG5wcmVzIDwtIDEwMCpsb2cxMChkYXRhc2V0JHByZXMpDQpzaG93KGRhdGFzZXQpDQpgYGANCg0KIyMjIyNicDogYm9pbGluZyBwb2ludCAoZGVncmVlcyBGYXJlbmhlaXQpDQojIyMjI2JwX2M6IGJvaWxpbmcgcG9pbnQgKGRlZ3JlZXMgQ2Vsc2l1cykNCiMjIyMjcHJlczogYmFyb21ldHJpYyBwcmVzc3VyZSBpbiBpbmNoZXMgb2YgbWVyY3VyeQ0KDQpgYGB7cn0NCiMgQ3JlYXRlIHNjYXR0ZXIgcGxvdCANCnBsb3QoYnBfY35wcmVzLCBkYXRhPWRhdGEpDQpgYGANCg0KIyMjIFJ1biByZWdyZXNzaW9uIHByZXN+YnANCmBgYHtyfQ0KIyBVc2Ugb3B0aW1pemF0aW9uIHRvIGZpbmQgcmVncmVzc2lvbiBjb2VmZmljaWVudHMNCmxzIDwtIGZ1bmN0aW9uKGRhdGFzZXQsIHBhcikNCnt3aXRoKGRhdGFzZXQsIHN1bSgocHJlcy1wYXJbMV0tcGFyWzJdKmJwKV4yKSl9DQpyZXN1bHQgPC0gb3B0aW0ocGFyPWMoMCwwKSxscyxkYXRhPWRhdGFzZXQpDQojIHN0YXJ0aW5nIHZhbHVlcyBmb3IgdGhlIHBhcmFtZXRlcnMgdG8gYmUgb3B0aW1pc2VkICgwLDAsMCkNCmNvZWYgPC0gcmVzdWx0JHBhcg0KY29lZg0KYGBgDQojIyMjIyANCiMjIyBSdW4gcmVncmVzc2lvbiBwcmVzfmJwX2MNCmBgYHtyfQ0KbHMyIDwtIGZ1bmN0aW9uKGRhdGFzZXQsIHBhcikNCnt3aXRoKGRhdGFzZXQsIHN1bSgocHJlcy1wYXJbMV0tcGFyWzJdKmJwX2MpXjIpKX0NCnJlc3VsdDIgPC0gb3B0aW0ocGFyPWMoMCwwKSxscyxkYXRhPWRhdGFzZXQpDQojIHN0YXJ0aW5nIHZhbHVlcyBmb3IgdGhlIHBhcmFtZXRlcnMgdG8gYmUgb3B0aW1pc2VkICgwLDAsMCkNCmNvZWYyIDwtIHJlc3VsdDIkcGFyDQpjb2VmMg0KYGBgDQojIyMjIyANCiMjIyBSdW4gcmVncmVzc2lvbiBwcmVzfmJwX2MNCmBgYHtyfQ0KbHMgPC0gZnVuY3Rpb24oZGF0YXNldCwgcGFyKQ0Ke3dpdGgoZGF0YXNldCwgc3VtKChsbnByZXMtcGFyWzFdLXBhclsyXSpicCleMikpfQ0KcmVzdWx0MyA8LSBvcHRpbShwYXI9YygwLDApLGxzLGRhdGE9ZGF0YXNldCkNCiMgc3RhcnRpbmcgdmFsdWVzIGZvciB0aGUgcGFyYW1ldGVycyB0byBiZSBvcHRpbWlzZWQgKDAsMCwwKQ0KY29lZjMgPC0gcmVzdWx0MiRwYXINCmNvZWYzDQpgYGANCg0KIyMjIEZyb20gdGhlIGFib3ZlLCB3ZSBjYW4gb2JzZXJ2ZSB0aGF0IGxpbmVhciB0cmFuc2Zvcm1hdGlvbiBvZiBpbmRlcGVuZGVudCB2YXJpYWJsZShzKSBkb2VzIG5vdCBjaGFuZ2UgdGhlIGNvZWZmaWNpZW50cy4NCg0KYGBge3J9DQojIEZpdCBhIGxpbmVhciByZWdyZXNzaW9uIG1vZGVsIGFuZCBjaGVjayBvdXQgdGhlIG91dHB1dA0KZml0IDwtIGxtKHByZXN+YnAsIGRhdGE9ZGF0YXNldCkNCnN1bW1hcnkoZml0KQ0KYGBgDQoNCmBgYHtyfQ0KIyBidWlsZCBhIHJlZ3Jlc3Npb24gbGluZSBieSBjb25zdHJ1Y3RpbmcgdGhlIGxpbmUNCiMgbGluZSA8LSBjb2VmWzFdK2NvZWZbMl0qZGF0YXNldCRicA0KcGxvdChwcmVzfmJwLCBkYXRhPWRhdGFzZXQpDQpjdXJ2ZShjb2VmWzFdK2NvZWZbMl0qeCwgZnJvbT0xOTAsdG89MjE1LCBhZGQ9VFJVRSwgY29sPSJyZWQiKQ0KYGBgDQoNCmBgYHtyfQ0KIyBEbyBzYW1lIHRoaW5nIHdpdGggbG0gZnVuY3Rpb24NCnBsb3QocHJlc35icCwgZGF0YT1kYXRhc2V0KQ0KYWJsaW5lKGxtKHByZXN+YnAsZGF0YT1kYXRhc2V0KSxjb2w9InJlZCIpDQpgYGANCg0KIyMjIyBMZXQncyBjYWxjdWxhdGUgYSA5NSUgQ29uZmlkZW5jZSBJbnRlcnZhbCBmb3IgdGhlIHNsb3BlDQojIyMjIFRvIGNvbnN0cnVjdCB0aGUgOTUlIENJIHdlIG5lZWQgbWVhbiwgc3RhbmRhcmQgZXJyb3IsIGFuZCBjcml0aWNhY2wgdmFsdWUuDQpgYGB7cn0NCmNvZWYudGFibGUgPC0gc3VtbWFyeShmaXQpJGNvZWZmaWNpZW50cw0KYnAubWVhbiA8LSBjb2VmLnRhYmxlWzIsMV0NCmJwLnNlIDwtIGNvZWYudGFibGVbMiwyXQ0KZGYgPC0gZml0JGRmLnJlc2lkdWFsICMgZGYgPTE1DQpjcml0dmFsIDwtIGFicyhxdCgwLjAyNSxkZikpICMgYWJzb2x1dGUgdmFsdWUgb2YgMi41dGggcGVyY2VudGlsZXMgb2YgdGhlIFN0dWRlbnQgdCBkaXN0cmlidXRpb24gd2l0aCAxNSBkLmYNCiMgMi4xMzE0NQ0KcHJpbnQoY29lZi50YWJsZSkNCmBgYA0KYGBge3J9DQpxdChjKC4wMjUsIC45NzUpLCBkZj0xNSkgIyAyLjUgcGVyY2VudGlsZSBhbmQgOTcuNSBwZXJjZW50aWxlIG9mIHRoZSBTdHVkZW50IHQgZGlzdHJpYnV0aW9uIHdpdGggZGYgPSAxNQ0KYGBgDQpgYGB7cn0NCiMgQ2FsY3VsYXRlIDk1JSBDSQ0KQ0lsb3cgPC0gYnAubWVhbiAtIGNyaXR2YWwqYnAuc2UNCkNJdXAgPC0gYnAubWVhbiArIGNyaXR2YWwqYnAuc2UNCmludGVydmFsIDwtIGMoQ0lsb3csQ0l1cCkNCmludGVydmFsDQpgYGANCg0KIyMjIEEgOTUlIENJIGZvciB0aGUgc2xvcGUgaXMgKDAuNTAxLDAuNTQ0NCkNCiMjIyBTaW5jZSBpdCBkaWRuJ3QgY29udGFpbiAwLCB3ZSBjYW4gcmVqZWN0IHRoZSBudWxsIGh5cG90aGVzaXMNCg0KYGBge3J9DQojIENoZWNrIHdpdGggYnVpbHQgaW4gZnVjbnRpb24NCmNvbmZpbnQoZml0LCJicCIsbGV2ZWw9MC45NSkNCmBgYA0KDQojIyBEaWFnbm9zdGljcyBhbmQgUmVzaWR1YWwgQW5hbHlzaXMNCiMjIyBJcyB0aGVyZSBldmlkZW5jZSB0aGF0IHRoZSByZXNpZHVhbHMgZG8gbm90IGZvbGxvdyBhIG5vcm1hbCBkaXN0cmlidXRpb24/DQpgYGB7cn0NCmZpdCRyZXNpZHVhbHMNCmBgYA0KYGBge3J9DQojIHBsb3QgcmVzaWR1YWxzLiBXZSBjYW4gc3BvdCBhbiBvdXRsaWVyDQpwbG90KGZpdCRyZXNpZHVhbHMpDQpgYGANCmBgYHtyfQ0KI2NyZWF0ZSBub3JtYWwgcHJvYmFiaWxpdHkgcGxvdCB3aXRoIHFxbm9ybSBmdW5jdGlvbg0KcXFub3JtKGZpdCRyZXNpZHVhbHMpDQpxcWxpbmUoZml0JHJlc2lkdWFscykNCmBgYA0KDQpgYGB7cn0NCmZpdC5zdGRyZXMgPSByc3RhbmRhcmQoZml0KQ0KcGxvdChmaXQkZml0dGVkLCBmaXQuc3RkcmVzLCB4bGFiPSJmaXR0ZWQgdmFsdWVzIiwgeWxhYj0ic3RhbmRhcmRpemVkIHJlc2lkdWFscyIpDQphYmxpbmUoaD0wLCBjb2w9Mixsd2Q9MikNCmBgYA0KIyMjIEluIHRoZSBwcmVzZW5jZSBvZiBvdXRsaWVycywgaXQgaXMgdXNlZnVsIHRvIGRvIHRoZSBhbmFseXNpcyB3aXRoIGFuZCB3aXRob3V0IHRoZSBwb2ludC4NCiMjIyBMZXQncyByZW1vdmUgdGhlIGxhcmdlIHJlc2lkdWFscyB3aGljaCBkZXZpYXRlcyBtb3JlIHRoYW4gMiBzLmQgZnJvbSB0aGUgbWVhbiAwDQoNCmBgYHtyfQ0KaW5kZXggPC0gd2hpY2goYWJzKGZpdC5zdGRyZXMpID4gMikNCmluZGV4DQpkYXRhc2V0MSA8LSBkYXRhc2V0Wy1pbmRleCxdDQpzdHIoZGF0YXNldDEpDQpgYGANCg0KYGBge3J9DQojVXNlIGRhdGFzZXQxIHRvIHJ1biB0aGUgc2FtZSByZWdyZXNzaW9uDQpmaXQxIDwtIGxtKHByZXN+YnAsIGRhdGE9ZGF0YXNldDEpDQpzdW1tYXJ5KGZpdDEpDQpgYGANCiMjIyBUaGUgcmVncmVzc2lvbiBmaXQgYW5kIGNvZWZmaWNpZW50IGRpZCBub3QgY2hhbmdlIG11Y2gNCg0KYGBge3J9DQpmaXQuc3RkcmVzID0gcnN0YW5kYXJkKGZpdDEpDQpwbG90KGZpdDEkZml0dGVkLCBmaXQuc3RkcmVzLCB4bGFiPSJGaXR0ZWQgdmFsdWVzIiwgeWxhYj0iU3RhbmRhcmRpemVkIHJlc2lkdWFscyIpDQphYmxpbmUoaD0wLCBjb2w9MiwgbHdkPTIpDQpgYGANCg0KDQo=