RPub URL:
Y=b0+b1Xi+Ei ... where Ei are iid N(0,σ2)
b1=Cor(Y,X) * (Sd(Y)/Sd(X))
b0=Y−b1X
library(UsingR)
data(mtcars)
[, 1] mpg Miles/(US) gallon
[, 2] cyl Number of cylinders
[, 3] disp Displacement (cu.in.)
[, 4] hp Gross horsepower
[, 5] drat Rear axle ratio
[, 6] wt Weight (lb/1000)
[, 7] qsec 1/4 mile time
[, 8] vs V/S
[, 9] am Transmission (0 = automatic, 1 = manual)
[,10] gear Number of forward gears
[,11] carb Number of carburetors
Interpreting Data
x<-mtcars$wt
y<-mtcars$mpg
fit<-lm(y ~ x)
plot(x,y,xlab='Weight (1000lb)',ylab='MPG')
abline(fit,col="red")
Let’s look at slope and intercept
coef(fit)
## (Intercept) x
## 37.285 -5.344
This means, “For every 1000lb weight increases, MPG decreases by 5.3 miles per gallon” and “If there is 0lb car, the MPG is 37.2 miles per gallon”
How about if we need to scale / convert the unit of weight in Kg instead of lb? One way is to convert everything before applying lm() function.
# 1lb is 0.45kg
x<-mtcars$wg * 0.45
or, use I() function to scale within formula
fit2<-lm(y~I(x*0.45))
coef(fit2)
## (Intercept) I(x * 0.45)
## 37.29 -11.88
plot(x*0.45,y,xlab='Weight (1000Kg)',ylab='MPG')
abline(fit2,col="red")
This means, “For every 1000kg weight increases, MPG decreases by -11.88 miles per gallon”.
In same way, you can apply centering on the formula itself. Center and scale them with the standard deviation.
fit3<-lm( I( (y - mean(y))/sd(y) ) ~ I( (x - mean(x))/sd(x) ) )
coef(fit3)
## (Intercept) I((x - mean(x))/sd(x))
## 8.053e-17 -8.677e-01
plot((x - mean(x))/sd(x),(y - mean(y))/sd(y))
abline(fit3,col="red")
abline(h=0,col="gray")
abline(v=0,col="gray")
Use predict() function. You can also calculate the values from slope and intercept from coef() function.
est<-predict(fit,data.frame(x))
plot(x,y,xlab='Weight (1000lb)',ylab='MPG')
abline(fit,col="red")
points(x,est,col="blue",pch=3)
Prediction Interval
p2<-predict(fit,data.frame(x), interval="prediction")
plot(x,y,xlab='Weight (1000lb)',ylab='MPG')
abline(fit,col="red")
lines(x,p2[,2],col="lightblue")
lines(x,p2[,3],col="lightblue")
Use resid() function.
est<-predict(fit,newdata=data.frame(x))
err<-resid(fit)
Both should match:
head(y-est)
## 1 2 3 4 5 6
## -2.2826 -0.9198 -2.0860 1.2973 -0.2001 -0.6933
head(err)
## 1 2 3 4 5 6
## -2.2826 -0.9198 -2.0860 1.2973 -0.2001 -0.6933
Draw the residuals in green bar
plot(x,y,xlab='Weight (1000lb)',ylab='MPG')
segments(x,y,x,est, col="darkgreen", lwd = 2)
abline(fit,col="red")
points(x,est, col="blue", pch=3)
What is the residual variation, sigma? This indicates how good the fit is.
summary(fit)$sigma
## [1] 3.046
Which is calculated in this way (same result)
n<-length(x)
sqrt( sum(resid(fit)^2) / (n-2) )
## [1] 3.046
Also, see appendix below for comparing different models and misleading examples.
Use summary() function for p-value.
p-value is the probability to have a value equal to or greater than the observed value.
If alpha (first type error) is equal to 0.05, then:
summary(fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.543 -2.365 -0.125 1.410 6.873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.285 1.878 19.86 < 2e-16 ***
## x -5.344 0.559 -9.56 1.3e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.05 on 30 degrees of freedom
## Multiple R-squared: 0.753, Adjusted R-squared: 0.745
## F-statistic: 91.4 on 1 and 30 DF, p-value: 1.29e-10
#direct access to p-value
#names(summary(fit))
summary(fit)$coefficients[2,4]
## [1] 1.294e-10
t.test(x,y)$p.value
## [1] 1.028e-16
R Squared Value
summary(fit)$r.squared
## [1] 0.7528
tbl<-summary(fit)$coefficients
tbl
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.285 1.8776 19.858 8.242e-19
## x -5.344 0.5591 -9.559 1.294e-10
mn<-tbl[2,1] #mean is the estimated slope
std_err<-tbl[2,2] #standard error
deg_fr<-fit$df #degree of freedom
#Two sides T-Tests
mn + c(-1,1) * qt(0.975,df=deg_fr) * std_err
## [1] -6.486 -4.203
Conclusion:
With 95% confidence, we estimate that every 1000lb weight increases, MPG decreases within -6.486 and -4.203 MGP
p1<-predict(fit,data.frame(x), interval="confidence")
plot(x,y,xlab='Weight (1000lb)',ylab='MPG')
abline(fit,col="red")
lines(x,p1[,2],col="purple")
lines(x,p1[,3],col="purple")
The actual values of upper and lowers are displayed below:
p1<-predict(fit,data.frame(x), interval="confidence")
#combine predictor, outcome, estimate, lower and upper
head(cbind(x,y,p1)[order(x),])
## x y fit lwr upr
## 28 1.513 30.4 29.20 26.96 31.43
## 19 1.615 30.4 28.65 26.52 30.79
## 20 1.835 33.9 27.48 25.55 29.40
## 26 1.935 27.3 26.94 25.11 28.77
## 27 2.140 26.0 25.85 24.20 27.50
## 18 2.200 32.4 25.53 23.93 27.13
What about specific value to predict? What is the mpg when the weight is 2000lb?
predict(fit,data.frame(x=2), interval="confidence")
## fit lwr upr
## 1 26.6 24.82 28.37
x<-mtcars$wt
x2<-mtcars$hp
x3<-mtcars$disp
y<-mtcars$mpg
par(mfrow=c(2,2))
plot(x,y)
plot(x2,y)
plot(x3,y)
Find residuals
#-1 removes the intercept term
coef(lm(y ~ x + x2 + x3 - 1))
## x x2 x3
## 12.06391 0.04382 -0.11686
Which is same as
ex<-resid(lm(x ~ x2 + x3 - 1))
ey<-resid(lm(y ~ x2 + x3 - 1))
sum(ex * ey) / sum(ex^2)
## [1] 12.06
Fit without slope (Horizontal)
fit_no_slope<-lm(y ~ 1)
Fit without intercept = force to go throuh (0,0)
fit_no_intercept<-lm(y ~ x - 1)
plot(x,y,xlim=c(0,7),ylim=c(0,40))
abline(fit,col="red")
abline(fit_no_slope,col="blue")
abline(fit_no_intercept,col="green")
How can we measure which one is the best model? It’s obvious, but let’s verify it. Use Sigma.
c(summary(fit)$sigma,
summary(fit_no_slope)$sigma,
summary(fit_no_intercept)$sigma)
## [1] 3.046 6.027 11.269
Sum Sq Values Comparison
c(anova(fit)[1,'Sum Sq'],
anova(fit_no_slope)[1,'Sum Sq'],
anova(fit_no_intercept)[1,'Sum Sq'])
## [1] 847.7 1126.0 10105.7
anova(fit,fit_no_slope,fit_no_intercept)
## Analysis of Variance Table
##
## Model 1: y ~ x
## Model 2: y ~ 1
## Model 3: y ~ x - 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 30 278
## 2 31 1126 -1 -848 91.4 1.3e-10 ***
## 3 31 3937 0 -2811
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Check R Squared Values
c(summary(fit)$r.squared,
summary(fit_no_slope)$r.squared,
summary(fit_no_intercept)$r.squared)
## [1] 0.7528 0.0000 0.7197
Note: the linear model without slope shows 0 for sum of redistuals. Residual values are canceled out each other; however sigma is larger than the best fit one.
Below 4 different datasets show the same slope, and also sigmas and Sum Sq are same.
data(anscombe)
par(mfrow=c(2,2))
a<-anscombe
f1<-lm(a$y1~a$x1)
f2<-lm(a$y2~a$x2)
f3<-lm(a$y3~a$x3)
f4<-lm(a$y4~a$x4)
plot(a$x1,a$y1)
abline(f1,col="red")
plot(a$x2,a$y2)
abline(f2,col="red")
plot(a$x3,a$y3)
abline(f3,col="red")
plot(a$x4,a$y4)
abline(f4,col="red")
You can also call example(anscombe)
Compare Sigmas. They are same.
c(summary(f1)$sigma,
summary(f2)$sigma,
summary(f3)$sigma,
summary(f4)$sigma)
## [1] 1.237 1.237 1.236 1.236
Compare Sum Sq. They are same.
c(anova(f1)[1,'Sum Sq'],
anova(f2)[1,'Sum Sq'],
anova(f3)[1,'Sum Sq'],
anova(f4)[1,'Sum Sq'])
## [1] 27.51 27.50 27.47 27.49
Compare R-Squared Values. They are same.
c(summary(f1)$r.squared,
summary(f2)$r.squared,
summary(f3)$r.squared,
summary(f4)$r.squared)
## [1] 0.6665 0.6662 0.6663 0.6667