library(UsingR)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.0.3
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.3
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
# library(manipulate)
data(galton)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
freqData<-as.data.frame(table(galton$child, galton$parent))
names(freqData)<-c("child", "parent", "freq")
freqData$child<-as.numeric(as.character(freqData$child))
freqData$parent<-as.numeric(as.character(freqData$parent))
Splot<-function(beta){# beta comes from the manipulate slider below, but is not used in the plot as written
g <- ggplot(filter(freqData, freq>0), aes(x=parent, y=child))
g <- g + scale_size(range=c(2,20),guide="none")
g <- g + geom_point(colour="grey50", aes(size=freq+20))# show_guide is deprecated, so it was dropped
g <- g + geom_point(aes(colour=freq,size=freq))
g <- g + scale_colour_gradient(low="lightblue",high="white")
g
}
#manipulate(Splot(beta),beta = slider(0.6,1.2, step=0.02))
set.seed(-123)
x<-rnorm(100)
y<-rnorm(100)
odr<-order(x)
x[odr[100]]#The maximum X
## [1] 2.6273
y[odr[100]]#The maximum Y
## [1] 0.233102
library(UsingR)
data(father.son)
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
rho <- cor(x, y)
library(ggplot2)
g = ggplot(data.frame(x, y), aes(x = x, y = y))
g = g + geom_point(size = 5, alpha = .2, colour = "black")
g = g + geom_point(size = 4, alpha = .2, colour = "red")
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = 1)# the identity line; 'position' is not a geom_abline parameter
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")
g
library(UsingR)
data(diamond)
library(ggplot2)
g = ggplot(diamond, aes(x = carat, y = price))
g = g + xlab("Mass (carats)")
g = g + ylab("Price (SIN $)")
g = g + geom_point(size = 7, colour = "black", alpha=0.5)
g = g + geom_point(size = 5, colour = "blue", alpha=0.2)
g = g + geom_smooth(method = "lm", colour = "black")
g
## `geom_smooth()` using formula 'y ~ x'
fit<-lm(price~carat,data=diamond)
coef(fit)
## (Intercept) carat
## -259.6259 3721.0249
summary(fit)
##
## Call:
## lm(formula = price ~ carat, data = diamond)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.63 17.32 -14.99 <2e-16 ***
## carat 3721.02 81.79 45.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
coef(fit)
## (Intercept) carat
## -259.6259 3721.0249
fit2<-lm(price~I(carat-mean(carat)),data=diamond)
coef(fit2)
## (Intercept) I(carat - mean(carat))
## 500.0833 3721.0249
The slope remains the same, but the intercept changes to 500, which is the expected price for an average-sized diamond in the data (0.2042 carats). A one-carat increase is actually quite large, so how about rescaling to a one-tenth-carat increase?
fit3<-lm(price~I(carat*10),data=diamond)
coef(fit3)
## (Intercept) I(carat * 10)
## -259.6259 372.1025
This shows a SIN$ 372 increase in price per one-tenth-carat increase in mass. Now imagine someone came to you with three diamonds of the following carats:
newx<-c(0.16, 0.27, 0.34)
They want to know the estimated prices of those diamonds. You could do it manually by grabbing the two coefficients and computing the intercept plus the slope times each new value.
coef(fit)[1]+coef(fit)[2]*newx
## [1] 335.7381 745.0508 1005.5225
Based on the fitted regression model, the prices of those diamonds would be about SIN 336, 745, and 1006. Often you don't want to do even that much coding; you want a more general method, especially once you have lots of regression variables. That general method is predict, a generic function that takes the output from several different kinds of model fits, linear models being one example. So we predict from the output of our lm fit and give it some new data: a data.frame containing the new values of the carat variable. When we do that it gives, of course, the same answer, but in a way that scales up when we have lots of regressors in much more complicated settings. So you generally want to predict using the predict function. (If you omit the newdata argument and just call predict(fit), it returns the fitted values at the observed x values.)
predict(fit,newdata=data.frame(carat=newx))
## 1 2 3
## 335.7381 745.0508 1005.5225
When we call predict without new data, the fitted values are the points on the line associated with each of the observed x values; if we were to draw vertical lines from the observed data points onto the fitted line, they would land on those fitted points. When we predict at a new value of x, we find that point along the horizontal axis (here, the three values we want: 0.16, 0.27 and 0.34), draw a line up to the fitted regression line, and then over to dollars, and those are our predicted dollar amounts.
# library(dplyr)
# library(UsingR)
# data(diamond)
# plot(diamond$carat, diamond$price, xlab = "Mass (carats)", ylab = "Price (SIN $)",
#      bg = "lightblue", col = "black", cex = 1.1, pch = 21, frame = FALSE)
# abline(lm(price ~ carat, data = diamond), lwd = 2)
Residuals represent variation left unexplained by our model. We emphasize the difference between residuals and errors: the errors are the unobservable true deviations from the true coefficients, while the residuals are the observable deviations from the estimated coefficients. In a sense, the residuals are estimates of the errors.
Let's start by talking about our motivating example, the diamond data set. Remember, in this data set the outcome is the diamond price in Singapore dollars, and the explanatory variable is the weight of the diamond in carats.
Generating a Plot
library(UsingR)
data(diamond)
library(ggplot2)
g= ggplot(diamond, aes(x=carat,y=price))
g= g + xlab("Mass(Carats)")
g= g + ylab("Price(SIN $)")
g= g + geom_point(size=8, colour="black", alpha=0.5)
g= g + geom_point(size=5, colour="red", alpha=0.2)
g= g + geom_smooth(method="lm", color="blue")
g
## `geom_smooth()` using formula 'y ~ x'
Looking at the plot, we can see that not all points lie on the regression line. The variation around the regression line is residual variation: variation left unexplained after having accounted for mass. So we start out with a lot of variation, we explain a substantial fraction of it by the linear relationship with mass, and there is some left-over variation, which we call residual variation. The distances that make up the residual variation are called the residuals. The residuals are quite useful for diagnosing many things, including poor model fit.
Let's remind ourselves of the model that we're considering. Our outcome in this example, price, is Yi, which we assume follows a line plus Gaussian error: Yi = β0 + β1 Xi + ϵi, where Xi is the mass and ϵi is assumed to be normally distributed with mean zero and variance σ^2. So Yi is a price and Xi is a mass, and the point on the line corresponding to Xi is the fitted value Yi hat.
We can figure out Yi hat just by plugging in: Yi hat = β0 hat + β1 hat * Xi. The residual is nothing other than the vertical distance between the observed outcome Yi and the fitted value Yi hat, so ei = Yi - Yi hat. Remember, the least squares criterion minimizes the sum of the squared vertical distances, so in essence it minimizes the sum of the squared residuals, the sum of ei^2. One way to think about the residuals is as estimates of ϵi, though you have to be careful with that because, as we'll see later on, we can decrease the residuals just by adding irrelevant regressors into the equation. Let's talk about some aspects of residuals that will help us interpret them.
The same thing in words: first, the residuals' population expected value is zero. Also, their empirical sum, and hence their empirical mean, is zero if you include an intercept; if you don't include an intercept, this property doesn't have to hold. The generalization of this property is that if you include any regressor in a linear model, the sum of the residuals times that regressor is zero, as the code below verifies.
library(UsingR)
library(ggplot2)
library(dplyr)
data(diamond)
y<-diamond$price;
x<-diamond$carat;
n<-length(y)
fit<-lm(y~x)
e<-resid(fit)#the easiest way to get residual
e
## 1 2 3 4 5 6
## -17.9483176 -7.7380691 -22.9483176 -85.1585661 -28.6303057 6.2619309
## 7 8 9 10 11 12
## 23.4721795 37.6311854 -38.7893116 24.4721795 51.8414339 40.7389488
## 13 14 15 16 17 18
## 0.2619309 13.4209369 -1.2098087 40.5287002 36.1029250 -44.8405542
## 19 20 21 22 23 24
## 79.3696943 -25.0508027 57.8414339 9.2619309 -20.9483176 -3.7380691
## 25 26 27 28 29 30
## -19.9483176 27.8414339 -54.9483176 8.8414339 -26.9483176 16.4721795
## 31 32 33 34 35 36
## -22.9483176 -13.1020453 -12.1020453 -0.5278205 3.2619309 2.2619309
## 37 38 39 40 41 42
## -1.2098087 -43.2098087 -27.9483176 -23.3122938 -15.6303057 43.2672091
## 43 44 45 46 47 48
## 32.8414339 7.3696943 4.3696943 -11.5278205 -14.8405542 17.4721795
yhat<-predict(fit)# the fitted values: if we don't give predict() new data, just the lm output, it predicts at the observed x values
max(abs(e-(y-yhat)))#other way of getting residual
## [1] 9.485746e-13
max(abs(e-(y-coef(fit)[1]-coef(fit)[2]*x)))#if we manually even calculate the fitted values, coef fit 1 and then coef fit 2 times x, that of course gives exactly the same numbers.
## [1] 9.485746e-13
So the way we want to get the residuals is resid(), but hopefully showing the other code illustrates what's going on in the background, i.e., the actual calculation that resid() is doing.
sum(e)
## [1] -1.865175e-14
sum(e*x)#and also the sum of the residual times x variable should also be zero
## [1] 6.959711e-15
So the residuals are the signed lengths of the red lines shown in the following plot. This can be done with base R graphics, which are sometimes handy alongside ggplot.
plot(diamond$carat, diamond$price,xlab="Mass(carats)",ylab="Price(SIN $)",bg="lightblue",col="black", cex=1.1, pch=21, frame=FALSE)
We then add the fitted line. In base R, we can just call abline with the lm fit object as its argument, and it adds the regression line.
plot(diamond$carat, diamond$price,xlab="Mass(carats)",ylab="Price(SIN $)",bg="lightblue",col="black", cex=1.1, pch=21, frame=FALSE)
abline(fit, lwd=2)
for(i in 1:n)
lines(c(x[i],x[i]),c(y[i],yhat[i]),col="red", lwd=2)
This scatter plot isn't particularly useful for assessing residual variation because the overall trend dominates, leaving a lot of white space above and below the fitted line. We can make it more useful by plotting the residuals on the vertical axis versus mass on the horizontal axis.
plot(x, e,
xlab="Mass(Carats)",
ylab="Residuals(SIN$)",
bg="lightblue",
col="black", cex=2, pch=21, frame= FALSE)
abline(h=0, lwd=2)
for(i in 1:n)
lines(c(x[i],x[i]),c(e[i],0),col="red", lwd=2)
Now we can see the residual variation much more clearly. When we look at the residual plot, we’re looking for any form of pattern. The residuals should be mostly patternless. Also, remember that if we’ve included an intercept, residuals have to sum to zero. So they have to lie above and below the horizontal line at zero, and we’d like them to spread nicely in a random looking fashion distributed both above and below zero. In this plot particularly, we can see some interesting patterns. For example, there were lots of diamonds of exactly the same mass.
Next, we’re going to go through some pathological residual plots, just to highlight what residual plots can do for us. So, we’ve concocted some examples that help us understand how residuals can highlight model fit.
x=runif(100,-3,3);#x is going to be 100 different numbers between -3 and +3
y=x+sin(x)+rnorm(100,sd=0.2);#y is equal to x, an identity line plus 'sin(x)' which makes it oscillate around it a little bit plus some normal noise.
library(ggplot2)
g=ggplot(data.frame(x=x,y=y),aes(x=x,y=y))
g=g+geom_smooth(method="lm",colour="black")
g=g+geom_point(size=7,colour="black", alpha=0.4)
g=g+geom_point(size=5,colour="red",alpha=0.4)
g
## `geom_smooth()` using formula 'y ~ x'
It's a little difficult to see the non-linearity; the sin(x) term is only faintly apparent. Looking at the plot we might notice some pattern, but nonetheless it's a little hard to see. This model is actually not the correct model for this data, and that can happen in practice. It doesn't mean the model is unimportant: there is a linear trend and the model accounts for it; it just doesn't account for the secondary variation from the sin(x) term. The fitted line represents the average relationship between y and x and explains a lot of the variation. So it is essential to remember that in regression, having exactly the right model is not always the primary goal; we can get meaningful information about trends from imperfect models.
Now we are going to plot the residuals versus x and see if the ‘sin x’ term seems more apparent.
g=ggplot(data.frame(x=x, y=resid(lm(y~x))),aes(x=x, y=y))#x is the same x variable as before, but y is now the residual from the linear model fit rather than the original outcome
g=g+geom_hline(yintercept=0,size=2);#We want to put the horizontal line at zero
g=g+geom_point(size=7,colour="black",alpha=0.4)#And add the points
g=g+geom_point(size=5,colour="red",alpha=0.4)
g=g+xlab("X")+ylab("Residual")
g
The above plot makes the sin(x) term extremely apparent. That's what the residual plot has done: it has zoomed in on this part of the model inadequacy and highlighted it, and that's one thing that residual plots are especially good at.
We are going to create another example where, by all appearances, the points fall perfectly on a line, but when we zoom in on the residuals, it looks quite different.
x<-runif(100, 0, 6);#x variable is a bunch of uniform random variables
y<-x+rnorm(100, mean=0, sd=0.001*x);#y is the x variable (an identity line) plus errors whose standard deviation grows with x; this is how we generate heteroscedastic data
g=ggplot(data.frame(x=x,y=y),aes(x=x,y=y))
g=g+geom_smooth(method="lm", colour="black")
g=g+geom_point(size=7,colour="black", alpha=0.4)
g=g+geom_point(size=5,colour="red",alpha=0.4)
g
## `geom_smooth()` using formula 'y ~ x'
So there's the plot; it seems like the points fall exactly on an identity line. When we zoom in on the residuals (in the figure below), we see a trend toward greater variability as we move along the x variable. That property, where the variability increases with x, is called heteroscedasticity. Heteroscedasticity is one of those things that residual plots are quite good at diagnosing: in the plot above we can't see it at all, but zoom in on the residuals and there it is.
g=ggplot(data.frame(x=x,y=resid(lm(y~x))),aes(x=x,y=y))
g=g+geom_hline(yintercept=0, size=2);
g=g+geom_point(size=7,colour="black", alpha=0.4)
g=g+geom_point(size=5,colour="red",alpha=0.4)
g=g+xlab("X")+ylab("Residual")
g
diamond$e<-resid(lm(price~carat, data=diamond))
head(diamond)#Checking the dataframe
g=ggplot(diamond,aes(x=carat,y=e))
g=g+xlab("Mass(carats)")
g=g+ylab("Residual Price(SIN$)")
g=g+geom_hline(yintercept=0, size=2);
g=g+geom_point(size=7,colour="black", alpha=0.5)
g=g+geom_point(size=5,colour="blue",alpha=0.2)
g
So there doesn’t appear to be a lot of pattern in the plot, so this is good. It seems like it’s a pretty good fit.
Now, let's illustrate something about variability in the diamond dataset that will help set the stage for defining some new properties of our regression model fit. We are going to create two residual vectors. The first comes from a model with just an intercept, so its residuals are the deviations around the average price. The second comes from the model that adds carat, so its residuals are the deviations around the regression line. So the first captures variation around the average price, and the second captures variation around the regression line, with carat as the explanatory variable and price as the outcome.
e<-c(resid(lm(price~1,data=diamond)),
resid(lm(price~carat, data=diamond)))
Creating a factor variable: the first set of residuals will be labeled as coming from the intercept-only model and the second set from the intercept-and-slope model.
fit=factor(c(rep("Itc",nrow(diamond)),
rep("Itc,slope", nrow(diamond))))
Finally, creating the ggplot: the y variable is the residuals and the x variable is the fitting approach, i.e., the linear model with just an intercept versus the linear model with carat (mass) as the predictor.
g=ggplot(data.frame(e=e, fit=fit),aes(y=e,x=fit, fill=fit))
g=g+xlab("Fitting Approach")
g=g+ylab("Residual Price(SIN$)")
g=g+geom_dotplot(binaxis="y", stackdir="center", binwidth=30)# 'size' is not a geom_dotplot parameter, so it was dropped
g
What we see in the left-hand plot, with just the intercept, is the variation in diamond prices around the average diamond price: basically the raw variation of diamond prices in the sample.
What we see in the right-hand plot is the variation around the regression line; we have explained a lot of the original variation by the relationship with mass.
What we're going to talk about next is R squared, which says that we can decompose the total variation of the y's around their mean into the variation explained by the regression model and the variation left over after accounting for the regression model. The right-hand plot shows the variation left over after accounting for the regression, and the difference between the two is the variation explained by the regression model. There's a formula for that, and we're going to dive into it next.
Residual variation is the variation around the regression line. Residuals are the vertical distances between the outcomes and the fitted regression line. If we include an intercept, the residuals have to sum to zero, which means their mean is zero. The model is Yi = β0 + β1 Xi + ϵi, where ϵi ∼ N(0, σ^2).
So if we want the variance of the residuals, it's just the average of their squares: the sum of the squared residuals times 1/n is an estimate of σ^2, the true population variation around the regression line.
Now, most people use (n-2) instead of n, so it's not exactly the average squared residual. For large n the difference between 1/n and 1/(n-2) is irrelevant, but for small n it can make a difference. The way to think about it is this: if we include the intercept, the residuals have to sum to zero, which puts one constraint on them, so knowing (n-1) of them determines the last one. If we also have a slope term (i.e., a covariate), that puts a second constraint on the residuals, so we lose 2 degrees of freedom; with another regression variable we would have another constraint and lose 3. In that sense we really don't have n free residuals but n-2 of them, because if we knew n-2 of them we could figure out the last two. That's why we divide by (n-2).
y <- diamond$price;
x <- diamond$carat;
n <- length(y)
fit <- lm(y ~ x)
#shows Intercepts, slopes, estimated values, and so on, and we see the
#residual standard deviation estimate among other elements in the printout.
summary(fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.63 17.32 -14.99 <2e-16 ***
## x 3721.02 81.79 45.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
#However, if we want to grab it as an object, then we can assign it
#to something by putting dollar sign sigma.
summary(fit)$sigma
## [1] 31.84052
# So if we do *resid fit*, it grabs the residuals. We can square it
#then sum it and add up the squared values. And finally, divide by
#(n-2), it takes the average of the unique residuals. The answer is
#same in both cases.
sqrt(sum(resid(fit)^2) / (n - 2))
## [1] 31.84052
We might call the regression variability the component of the total variability that gets explained by the regression line: take the points on the regression line (the fitted heights) and see how much they deviate around the average. The total variability in diamond prices, disregarding everything except where they're centered, equals the regression variability (the variability explained by the model) plus the residual variability (the variability left over, not explained by the model).
R squared is the percentage of the variation in the response explained by the linear relationship with the predictor. R squared has to be between 0 and 1 because the regression variability and the residual variability are both non-negative and their sums of squares add up to the total sum of squares. If we define R as the sample correlation between the predictor and the outcome, then R squared is literally that sample correlation squared. R squared can be a misleading summary of model fit. For example, if we have somewhat noisy data and delete all the points in the middle, we get a much higher R squared; or if we just add arbitrary regression variables into a linear model fit, we increase R squared and decrease mean squared error. These things have to be kept in mind when using R squared to assess model fit.
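As a quick numerical check of this decomposition (a sketch added here, not part of the original notes), we can verify on the diamond fit that the regression and residual sums of squares add up to the total sum of squares, and that their ratio reproduces the R squared reported by summary():
fitD <- lm(price ~ carat, data = diamond)
ybar <- mean(diamond$price)
totSS <- sum((diamond$price - ybar)^2) # total variability around the mean price
regSS <- sum((fitted(fitD) - ybar)^2)  # variability explained by the regression
resSS <- sum(resid(fitD)^2)            # leftover residual variability
all.equal(totSS, regSS + resSS)        # the decomposition holds
regSS / totSS                          # matches summary(fitD)$r.squared, about 0.978 here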
data(anscombe)
example(anscombe)
##
## anscmb> require(stats); require(graphics)
##
## anscmb> summary(anscombe)
## x1 x2 x3 x4 y1
## Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8 Min. : 4.260
## 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8 1st Qu.: 6.315
## Median : 9.0 Median : 9.0 Median : 9.0 Median : 8 Median : 7.580
## Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9 Mean : 7.501
## 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8 3rd Qu.: 8.570
## Max. :14.0 Max. :14.0 Max. :14.0 Max. :19 Max. :10.840
## y2 y3 y4
## Min. :3.100 Min. : 5.39 Min. : 5.250
## 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
## Median :8.140 Median : 7.11 Median : 7.040
## Mean :7.501 Mean : 7.50 Mean : 7.501
## 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
## Max. :9.260 Max. :12.74 Max. :12.500
##
## anscmb> ##-- now some "magic" to do the 4 regressions in a loop:
## anscmb> ff <- y ~ x
##
## anscmb> mods <- setNames(as.list(1:4), paste0("lm", 1:4))
##
## anscmb> for(i in 1:4) {
## anscmb+ ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
## anscmb+ ## or ff[[2]] <- as.name(paste0("y", i))
## anscmb+ ## ff[[3]] <- as.name(paste0("x", i))
## anscmb+ mods[[i]] <- lmi <- lm(ff, data = anscombe)
## anscmb+ print(anova(lmi))
## anscmb+ }
## Analysis of Variance Table
##
## Response: y1
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 27.510 27.5100 17.99 0.00217 **
## Residuals 9 13.763 1.5292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
##
## Response: y2
## Df Sum Sq Mean Sq F value Pr(>F)
## x2 1 27.500 27.5000 17.966 0.002179 **
## Residuals 9 13.776 1.5307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
##
## Response: y3
## Df Sum Sq Mean Sq F value Pr(>F)
## x3 1 27.470 27.4700 17.972 0.002176 **
## Residuals 9 13.756 1.5285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
##
## Response: y4
## Df Sum Sq Mean Sq F value Pr(>F)
## x4 1 27.490 27.4900 18.003 0.002165 **
## Residuals 9 13.742 1.5269
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## anscmb> ## See how close they are (numerically!)
## anscmb> sapply(mods, coef)
## lm1 lm2 lm3 lm4
## (Intercept) 3.0000909 3.000909 3.0024545 3.0017273
## x1 0.5000909 0.500000 0.4997273 0.4999091
##
## anscmb> lapply(mods, function(fm) coef(summary(fm)))
## $lm1
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0000909 1.1247468 2.667348 0.025734051
## x1 0.5000909 0.1179055 4.241455 0.002169629
##
## $lm2
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.000909 1.1253024 2.666758 0.025758941
## x2 0.500000 0.1179637 4.238590 0.002178816
##
## $lm3
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0024545 1.1244812 2.670080 0.025619109
## x3 0.4997273 0.1178777 4.239372 0.002176305
##
## $lm4
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017273 1.1239211 2.670763 0.025590425
## x4 0.4999091 0.1178189 4.243028 0.002164602
##
##
## anscmb> ## Now, do what you should have done in the first place: PLOTS
## anscmb> op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma = c(0, 0, 2, 0))
##
## anscmb> for(i in 1:4) {
## anscmb+ ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
## anscmb+ plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
## anscmb+ xlim = c(3, 19), ylim = c(3, 13))
## anscmb+ abline(mods[[i]], col = "blue")
## anscmb+ }
##
## anscmb> mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
##
## anscmb> par(op)
The first panel is a nice regression fit: a straight-line relationship with just a little noise. The second clearly shows a missing term needed to address the curvature in the data. In the third there's an outlier, and in the fourth all the data are stacked up at one particular location with a single point way out at the end; we could imagine getting this from the first example by deleting a lot of the points in the middle. In all these cases the R squared is equivalent, but the summary down to a single number has thrown out a lot of the important information we would get from a simple scatter plot.
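As a small check (a sketch, not in the original notes), we can confirm numerically that the four Anscombe fits share essentially the same R squared even though the scatter plots look completely different:
sapply(1:4, function(i) {
  fitA <- lm(anscombe[[paste0("y", i)]] ~ anscombe[[paste0("x", i)]])
  summary(fitA)$r.squared # each is roughly 0.67
})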
Inference is the process of drawing conclusions about a population using a sample. In statistical inference, we must account for the uncertainty in our estimates in a principled way. Hypothesis tests and confidence intervals are among the most common forms of statistical inference.
These statements apply generally, and, of course, to the regression setting that we’ve been studying. In the next few lectures, we’ll cover inference in regression where we make some Gaussian assumptions about the errors.
In the regression case, with IID sampling assumptions for our normally distributed errors, inference will follow along these lines very similarly to what we see in that class. But remember, it’s not mandatory for the errors to be Gaussian for our statistical inferences in regression to hold. You can appeal to large sample theory, though it’s a little bit more complicated.
If our regressors, our predictors, are all packed in very tightly together, then it's clear we're not going to estimate the line very well: it could pivot around that cloud of points very easily, and quite different lines would fit nearly equally well. On the other hand, if we spread our x's out, we'll get a better estimated regression line, with a lower variance for the slope.
The diamond data set
library(UsingR)
data(diamond)
y <- diamond$price
x <- diamond$carat
n <- length(y)
beta1 <- cor(y,x)*sd(y)/sd(x) #beta1 is the correlation between y and the x times the standard deviation of y divided by the standard deviation of x.
beta0<- mean(y)-beta1*mean(x) #beta0 is the mean of the y's minus beta1 times the mean of the x's
e <- y - beta0 - beta1 * x #residuals (e) are the response y minus the predicted values beta0 + beta1*x
sigma <- sqrt(sum(e^2) / (n-2))
ssx <- sum((x - mean(x))^2)#sums of squares of x's is just the numerator of the variance calculation,
seBeta0 <- (1 / n + mean(x) ^ 2 / ssx) ^ .5 * sigma #standard error for beta0
seBeta1 <- sigma / sqrt(ssx)#standard error for beta1
tBeta0 <- beta0 / seBeta0; tBeta1 <- beta1 / seBeta1 #t-statistics for testing the hypotheses that beta0 and beta1 are zero
pBeta0 <- 2 * pt(abs(tBeta0), df = n - 2, lower.tail = FALSE)#p-values for beta0
pBeta1 <- 2 * pt(abs(tBeta1), df = n - 2, lower.tail = FALSE)##p-values for beta1
coefTable <- rbind(c(beta0, seBeta0, tBeta0, pBeta0), c(beta1, seBeta1, tBeta1, pBeta1))
colnames(coefTable) <- c("Estimate", "Std. Error", "t value", "P(>|t|)")
rownames(coefTable) <- c("(Intercept)", "x")
coefTable
## Estimate Std. Error t value P(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
fit<-lm(y~x);
summary(fit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
# or
sumCoef <- summary(fit)$coefficients
summary(fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.63 17.32 -14.99 <2e-16 ***
## x 3721.02 81.79 45.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
sumCoef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.6259 17.31886 -14.99094 2.523271e-19
## x 3721.0249 81.78588 45.49715 6.751260e-40
##Getting Confidence Interval
sumCoef[1,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[1, 2]
## [1] -294.4870 -224.7649
## the matrix shows which value to grab, for example [1,1] refers to the value in row and column 1
sumCoef[2,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[2, 2]
## [1] 3556.398 3885.651
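The same intervals can be obtained in a single call with the base R confint() function; the manual calculation above shows what it is doing under the hood:
confint(fit) # 95% confidence intervals for the intercept and slope, matching the lines above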
Regression and generalized linear models are among the most core techniques for performing prediction. They often produce very good predictions, they're parsimonious and interpretable, and as an added bonus we can get inference on top of our predictions without doing any sort of data re-sampling. Inference here refers to, for example, confidence intervals around our predictions to evaluate the uncertainty in those predictions. It is very easy to get such intervals in regression and generalized linear models, but quite difficult in some more advanced machine learning algorithms.
We may want to predict a response, for example the price of a diamond at a particular mass in carats, or a child's height for a particular value of the parent's height. We will go through the diamond data set for demonstration purposes.
But, if we want to be good statisticians, we need to evaluate some uncertainty in that prediction, so, it would be nice to have a prediction interval.
There's a small intricacy in that there is a distinction between an interval for the regression line at a particular value of x and a prediction interval for a new response at that value of x; the two standard errors in the code below correspond to these two cases.
plot(x, y, frame=FALSE,
xlab="Carat",
ylab="Dollars",
pch=21,
col="black",
bg="lightblue",
cex=2)
abline(fit, lwd = 2)
xVals <- seq(min(x),
max(x), by = .01)
yVals <- beta0 + beta1 * xVals
se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx)# standard error of the estimated line at xVals (confidence interval)
se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx)# standard error for predicting a new y at xVals (prediction interval)
lines(xVals, yVals + 2 * se1)
lines(xVals, yVals - 2 * se1)
lines(xVals, yVals + 2 * se2)
lines(xVals, yVals - 2 * se2)
Note a. The more variable the Xs are (the larger the sum of squares ssx is), the smaller the (xVals - mean(x))^2/ssx term is, and hence the narrower both intervals are.
So let's go through some pictures to solidify these concepts. This is an essential part of using regression for prediction: we get easy and convenient statements of the uncertainty associated with parsimonious predictions.
We are going to generate these intervals for the diamond data set using the predict function applied to the output of lm. For a lot of prediction algorithms in R, especially linear models, generalized linear models, random forests and others, predict is a generic method, so we give it the model output, in this case from lm, along with a nice dense grid of new x values, and ask for either a confidence interval (for the line) or a prediction interval (for a new observation).
library(ggplot2)
newx = data.frame(x = seq(min(x), max(x), length = 100))
p1 = data.frame(predict(fit,newdata = newx, interval = ("confidence")))
p2 = data.frame(predict(fit,newdata = newx, interval = ("prediction")))
p1$interval = "confidence"
p2$interval = "prediction"
p1$x = newx$x
p2$x = newx$x
dat = rbind(p1, p2)
names(dat)[1] = "y"
#Creating the plot
g = ggplot(dat, aes(x=x, y=y))
g = g + geom_ribbon(aes(ymin = lwr, ymax = upr, fill = interval), alpha = 0.2)
g = g + geom_line()
g = g+ geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y), size = 4)
g
The blue band is the prediction interval, and the salmon-colored band is the confidence interval for the line at particular values of x. The confidence interval is much narrower than the prediction interval. If we collected an infinite amount of data at all the different values of x, the salmon-colored confidence interval would get narrower and narrower around the regression line, to the point where it was just the line itself, which is what we would expect. The prediction interval, on the other hand, has a component of variability that has nothing to do with how well we estimated β0 or β1. Even if we knew the correct β0 and β1, there would still be variability in the Ys because of the error term. So if we want to predict a new y, there's some uncertainty that just doesn't go away with better estimation; it's inherent in the leftover residual variation around the regression line. That variability is always present in the prediction interval, and that's why the 1 shows up in its standard error formula: it doesn't go away with n, and it doesn't go away as we collect more x's. It is the reason the prediction interval keeps a certain amount of width that never vanishes.
Finally, you can see (a bit more clearly in the confidence interval than in the prediction interval) that both bands get narrower toward the center of the data cloud and wider as we head out into the tails, reflecting that we're more confident in our predictions near the center of the data.
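As a small illustration (a sketch, not in the original notes; the mass value of 0.3 carats is chosen arbitrarily for the example), here are the two kinds of intervals at a single point, showing how much wider the prediction interval is:
predict(fit, newdata = data.frame(x = 0.3), interval = "confidence") # interval for the line at 0.3 carats
predict(fit, newdata = data.frame(x = 0.3), interval = "prediction") # interval for a new diamond's price at 0.3 carats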
Question 1. Consider the following data with x as the predictor and y as the outcome. x <- c(0.61, 0.93, 0.83, 0.35, 0.54, 0.16, 0.91, 0.62, 0.62) y <- c(0.67, 0.84, 0.6, 0.18, 0.85, 0.47, 1.1, 0.65, 0.36) Give a P-value for the two-sided hypothesis test of whether β1 from a linear regression model is 0 or not.
Solution 1
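A sketch of one way to compute this (the solution code is not from the original notes):
x <- c(0.61, 0.93, 0.83, 0.35, 0.54, 0.16, 0.91, 0.62, 0.62)
y <- c(0.67, 0.84, 0.6, 0.18, 0.85, 0.47, 1.1, 0.65, 0.36)
fitQ1 <- lm(y ~ x)
summary(fitQ1)$coefficients # the row for x gives the slope's t value and two-sided p-value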
Question 2. Consider the previous problem, give the estimate of the residual standard deviation.
Solution
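A sketch of one way to get this, continuing with the x and y defined in the Question 1 sketch (not from the original notes):
fitQ2 <- lm(y ~ x) # same data as Question 1
summary(fitQ2)$sigma # the residual standard deviation
sqrt(sum(resid(fitQ2)^2) / (length(y) - 2)) # equivalent manual calculation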
Question 3. In the mtcars dataset, fit a linear regression model with weight as the predictor and mpg as the outcome. Get a 95% confidence interval for the expected mpg at the average weight. What is the lower endpoint?
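A sketch of one way to approach this (not from the original notes): recentering weight makes the intercept the expected mpg at the average weight, or equivalently predict() can be used with a confidence interval.
data(mtcars)
fitQ3 <- lm(mpg ~ I(wt - mean(wt)), data = mtcars)
confint(fitQ3)[1, ] # CI for the intercept, i.e., the expected mpg at the average weight
# equivalently, without recentering:
predict(lm(mpg ~ wt, data = mtcars), newdata = data.frame(wt = mean(mtcars$wt)), interval = "confidence")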
Question 4. Refer to the previous question. Read the help file for mtcars. What is the weight coefficient interpreted as?
Solution
Question 5. Consider again the mtcars dataset and a linear regression model with mpg as predicted by weight (1,000 lbs). A new car is coming weighing 3000 pounds. Construct a 95% prediction interval for its mpg. What is the upper endpoint?
Solution
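A sketch (not from the original notes); note that wt is in units of 1,000 lbs, so a 3,000 lb car corresponds to wt = 3:
fitQ5 <- lm(mpg ~ wt, data = mtcars)
predict(fitQ5, newdata = data.frame(wt = 3), interval = "prediction") # the upr column is the upper endpoint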
Question 6. Consider again the mtcars dataset and a linear regression model with mpg as predicted by weight (in 1,000 lbs). A “short” ton is defined as 2,000 lbs. Construct a 95% confidence interval for the expected change in mpg per 1 short ton increase in weight. Give the lower endpoint.
Solution
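A sketch (not from the original notes): a short ton is 2,000 lbs, i.e., two units of wt, so the per-short-ton slope is twice the per-1,000-lb slope, and its confidence interval scales the same way.
fitQ6 <- lm(mpg ~ wt, data = mtcars)
2 * confint(fitQ6)[2, ] # CI for the expected change in mpg per short ton increase in weight
# equivalently: confint(lm(mpg ~ I(wt / 2), data = mtcars))[2, ]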
Question 7. If my X from a linear regression is measured in centimeters and I convert it to meters what would happen to the slope coefficient?
Solution
Question 8. I have an outcome Y and a predictor X and fit a linear regression model Y = β0 + β1 X + ϵ to obtain β0(hat) and β1(hat). What would be the consequence for the subsequent slope and intercept if I were to refit the model with a new regressor, X + c, for some constant c?
Solution
Question 9. Refer back to the mtcars data set with mpg as an outcome and weight (wt) as the predictor. About what is the ratio of the sum of the squared errors, ∑_{i=1}^n (Yi - Yi hat)^2, when comparing a model with just an intercept (denominator) to the model with the intercept and slope (numerator)?
Solution
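A sketch (not from the original notes): fit both models and take the ratio of their residual sums of squares; this ratio is also one minus the R squared of the full model.
fitFull <- lm(mpg ~ wt, data = mtcars) # intercept and slope
fitInt <- lm(mpg ~ 1, data = mtcars)   # intercept only
sum(resid(fitFull)^2) / sum(resid(fitInt)^2) # equals 1 - summary(fitFull)$r.squared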
Question 10.Do the residuals always have to sum to 0 in linear regression?
Solution
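A quick numerical check (a sketch, not from the original notes): with an intercept the residuals sum to (numerically) zero, but in regression through the origin they generally do not.
sum(resid(lm(mpg ~ wt, data = mtcars)))     # essentially zero
sum(resid(lm(mpg ~ wt - 1, data = mtcars))) # not zero in general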
We now extend linear regression so that our models can contain more variables. A natural first approach is to assume additive effects, basically extending our line to a plane, or generalized version of a plane as we add more variables. Multivariable regression represents one of the most widely used and successful methods in statistics.
In any instance where we're using a predictor X to predict a response Y and we find a nice relationship, if that predictor hasn't been randomized to the subjects or units we're looking at, then there's always a concern that another variable, one that we either know about or don't, might explain the relationship.
Multivariable regression is a sort of automated way to look at the relationship between a predictor and a response while having, at some level, accounted for other variables. It is also a very good prediction model. There are multiple parts to this. One is model search: (a) how do we pick which predictors to include? The other is (b) avoiding overfitting, because if we put enough variables into a multivariable regression model, we'll get zero residuals just by virtue of having included enough regressors, even random vectors. So there are certainly consequences both to throwing lots of non-useful predictors into a model and to omitting important predictors.
In the practical machine learning class, we learn a lot about model selection strategies as they relate to prediction. In this class, we want to generate parsimonious models, where we're deeply interested in interpreting the coefficients from the linear model. Multivariable regression is a pretty good starting point for any prediction algorithm. It often gets you very close to the winning entry, and machine learning methods such as random forests and boosting add only minor bumps on top. To get huge drops in prediction error, well-thought-out linear models suffice, and to get really minor increments beyond that you have to throw in a lot of computing. To be fair, those methods do improve your chances and can move you up a little on the leaderboard, but it is remarkable how far you can get with just well-done linear models. So, our multivariable linear model looks an awful lot like our simple linear model; it's just that we have more variables.
Least squares looks at the differences between the outcome and the prediction from the linear model, i.e., the sum of the predictors times their coefficients. Because a difference isn't necessarily positive, we square it and add it up, so that the difference for every observation weighs equally into the loss function. Least squares simply minimizes this quantity, a direct extension of the criterion we minimized in simple linear regression.
If we only have two regressors (an intercept and a slope), the least squares criterion amounts to minimizing the sum of the squared vertical distances between the outcomes, the Ys, and the predictions on the line. If we have three regressors (an intercept and two regression variables), the fitted construct is no longer a line but a plane, and the least squares criterion minimizes the sum of the squared vertical distances between the outcomes and the plane. Finally, with more variables (an intercept and several covariates) we can't visualize it any more, but it's the generalized idea of the vertical distance between the Ys and a generalized version of a plane.
Let’s Demonstrate How the Multivariable Regression Work
set.seed(123)
n=100;#We have 100 observations
x=rnorm(n); #X is a random normal variable
x2=rnorm(n); #x2 is also a random normal variable
x3=rnorm(n)
y=1+x+x2+x3+rnorm(n,sd=.1)#Y is an intercept 1, plus predictor variables x, x2, and x3, plus some random noise, i.e., the error term
ey=resid(lm(y~x2+x3))#residual of y after removing the linear effect of x2 and x3 ('lm' includes an intercept by default)
head(ey)
## 1 2 3 4 5 6
## -0.47609542 -0.22591191 1.32345301 -0.07664928 -0.10183861 1.58749673
ex=resid(lm(x~x2+x3))#residual of x after removing the linear effect of x2 and x3 (and the intercept)
head(ex)
## 1 2 3 4 5 6
## -0.42163562 -0.15852256 1.41495859 0.02167515 -0.06314274 1.55407845
sum(ey*ex)/sum(ex^2)#regression through the origin of ey on ex gives the multivariable coefficient for x
## [1] 0.9944549
coef(lm(ey~ex-1))#the same thing with 'lm' function
## ex
## 0.9944549
coef(lm(y~x+x2+x3))#This equation gives pretty much the same thing, y regressed on x, x2, and x3 and the intercept.
## (Intercept) x x2 x3
## 0.9980674 0.9944549 1.0046218 0.9942613
So, the interpretation of a multivariable regression coefficient is the expected change in the response for a unit change in that regressor, holding all the other variables constant. "Holding the other variables constant" is the essential part of interpreting a multivariable regression relationship, and in that sense we can think of the interpretation as being adjusted for the other variables, because we have to hold them constant.
Before we do any machine learning or any complex algorithm, we need to understand linear models. They offer parsimonious and well-understood relationships between predictors and a response. To be fair, some modern machine learning algorithms can beat them on some fronts, such as the imposed linearity; nonetheless, linear models should always be our starting point. And they can do some amazing things. For example,
a. We can take a time series, like a music sound, and decompose it into its harmonics; the so-called discrete Fourier transform can be thought of as the fit from a linear model.
b. We can flexibly fit rather complicated functions and curves and things like that using linear models.
c. We can fit factor variables as predictors; ANOVA and ANCOVA are special cases of linear models.
d. We can uncover complex multivariate relationships within a response and we can build fairly accurate prediction models.
The dataset is a standardized fertility measure and some socio-economic status indicators for 47 French speaking provinces of Switzerland in the year 1888. The variables include
i. Fertility (a common standardised fertility measure),
ii. Agriculture (% of the males involved in agriculture as an occupation),
iii. Examination (% of draftees that scored a particular mark on a military examination in the province)
iv. Education (% of the population that made it beyond primary school)
v. Catholic in this region during the period of data collection (% of Catholic as opposed to Protestant), and
vi. Infant mortality (% of live births who live less than one year) from the province.
We're interested in looking into what explains fertility in these provinces. Before we go any further, let's do some basic scatter plots of the data. Fertility is on the x-axis for all the plots in the first column; Agriculture is on the x-axis, the horizontal axis, for all the plots in the second column; and so on.
library(datasets);
data(swiss);
require(stats);
require(graphics)
pairs(swiss, panel = panel.smooth,
main = "Swiss Data Pairwise Scatter Plot",
col = 3 + (swiss$Catholic > 50))
Each off-diagonal panel of this matrix shows the relationship between a pair of variables (and in the ggpairs version below, the upper triangle reports their correlation). For example, for Fertility and Agriculture the relationship looks fairly linear.
Let’s see it using the plots from ggplot2 as well
require(datasets);
data(swiss);
require(GGally);
## Loading required package: GGally
## Warning: package 'GGally' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
require(ggplot2);
g=ggpairs(swiss,lower=list(continuous=wrap("smooth", method="lm")))
g
Now, 'Fertility' is going to be the outcome. In the first model we use all the other variables as predictors, and the formula shows how to fit them all at once: Fertility ~ . , where the tilde separates the outcome from the predictors and the period adds all the other variables linearly (no squared terms, no interaction terms). Below are the results:
summary(lm(Fertility~ . , data=swiss))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.9151817 10.70603759 6.250229 1.906051e-07
## Agriculture -0.1721140 0.07030392 -2.448142 1.872715e-02
## Examination -0.2580082 0.25387820 -1.016268 3.154617e-01
## Education -0.8709401 0.18302860 -4.758492 2.430605e-05
## Catholic 0.1041153 0.03525785 2.952969 5.190079e-03
## Infant.Mortality 1.0770481 0.38171965 2.821568 7.335715e-03
So, let's focus on the second row, i.e., -0.17. This number can be interpreted as: we expect a 0.17 decrease in standardized fertility for every 1% increase in the percentage of males involved in agriculture, holding the other variables (Examination, Education, percent Catholic, and Infant Mortality) constant. The second column, the standard error of 0.07, says how precise that estimate is; in other words, it describes the statistical variability of the coefficient. The third column gives the t value, which is nothing other than the estimate divided by its standard error. From that t statistic we can calculate the probability of getting a t statistic as extreme as -2.448 and, because we're doing a two-sided test, double that tail probability. Remember, we can calculate the p-value ourselves: the degrees of freedom are n minus the number of coefficients, including the intercept. R does that on our behalf, and it works out to about 0.019. So by standard thresholding rules, with a type I error rate of, say, 5%, that is statistically significant.
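As a sketch of the manual calculation mentioned above (not shown in the original notes), the Agriculture p-value can be reproduced from its t statistic and n minus the number of coefficients degrees of freedom:
fitAll <- lm(Fertility ~ ., data = swiss)
tAgri <- summary(fitAll)$coefficients["Agriculture", "t value"]
dfRes <- nrow(swiss) - length(coef(fitAll)) # 47 - 6 = 41 degrees of freedom
2 * pt(abs(tAgri), df = dfRes, lower.tail = FALSE) # about 0.019, matching the table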
Now let’s contrast the model with a model having just ‘agriculture’ as a ‘predictor’ and have a look at their association.
summary(lm(Fertility~Agriculture, data=swiss))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.3043752 4.25125562 14.185074 3.216304e-18
## Agriculture 0.1942017 0.07671176 2.531577 1.491720e-02
Interestingly, we can see that the estimate has changed from -0.17 to +0.194, which in the unadjusted model suggests that Agriculture has a positive effect on Fertility. Why are they different? The reason is that adjusting for the other variables changes the direction of the estimated effect of Agriculture on Fertility. This is an instance of what is often called Simpson's paradox, though it is not difficult to understand why it happens here. An important thing to note is that, in both cases, Agriculture is statistically significantly associated with Fertility.
Let's create a simulation in which an effect reverses itself and see what happens. Remember, regression is a dynamic process where we have to think about which variables to include and make the scientific arguments for them. If there hasn't been randomization to protect us from confounding, we have to go through the dynamic scientific process of putting confounders in and out and thinking about what they do to our effect of interest in order to evaluate it.
n <- 100; #total 100 data points
x2 <- 1:n;## the values 1 to n
x1 <- .01 * x2 + runif(n, -.1, .1);#x1 depends on x2 and random noise. It grows linearly
head(x1) #lets see the sample of these values
## [1] 0.004136367 -0.006830905 -0.045745589 -0.050601264 0.002559261
## [6] 0.153728234
y = -x1 + x2 + rnorm(n, sd=.01);#y is negatively associated with x1 and positively with x2, plus random noise added to them
head(y)#lets see the sample of these values
## [1] 1.010168 2.017297 3.050098 4.057753 5.006612 5.819663
summary(lm(y~x1))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.583778 1.119024 2.308956 2.304623e-02
## x1 94.433779 1.918806 49.214857 6.519966e-71
Let's look at the finding. We get an enormous coefficient, about 94, which is clearly wrong: it's nowhere near the -1 that it's supposed to be, or that we would hope it would be; in fact it has the opposite sign. What's happening is that x1 is picking up the residual effect of x2, which is a big driver of y. If we model it correctly, let's see what happens:
summary(lm(y~x1+x2))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.002620358 0.0018513605 1.415369 1.601630e-01
## x1 -1.017052864 0.0158406252 -64.205349 2.795876e-81
## x2 1.000147781 0.0001627882 6143.857819 6.123761e-273
Looking at the results, we now get the right values: once we adjust the model correctly, it works out. Now let's do some plots to highlight what we have just said:
dat=data.frame(y=y, x1=x1, x2=x2, ey=resid(lm(y~x2)),ex1=resid(lm(x1~x2)))
summary(dat)
## y x1 x2 ey
## Min. : 1.01 Min. :-0.0506 Min. : 1.00 Min. :-0.12934
## 1st Qu.:25.54 1st Qu.: 0.2503 1st Qu.: 25.75 1st Qu.:-0.04607
## Median :50.05 Median : 0.5033 Median : 50.50 Median :-0.01138
## Mean :50.00 Mean : 0.5021 Mean : 50.50 Mean : 0.00000
## 3rd Qu.:74.44 3rd Qu.: 0.7515 3rd Qu.: 75.25 3rd Qu.: 0.05207
## Max. :98.93 Max. : 1.0596 Max. :100.00 Max. : 0.11471
## ex1
## Min. :-0.096764
## 1st Qu.:-0.051216
## Median : 0.005274
## Mean : 0.000000
## 3rd Qu.: 0.048134
## Max. : 0.103195
head(dat)#Let's check how this data frame looks
library(ggplot2)
g=ggplot(dat,aes(y=y, x=x1, colour=x2))
g=g+geom_point(colour="grey70", size=5)
g=g+geom_smooth(method=lm,se=FALSE, colour="black")
g=g+geom_point(size=3)
g
## `geom_smooth()` using formula 'y ~ x'
There we can see the linear relationship: y goes up as x1 goes up, and both y and x1 go up as x2 goes up (the colour gradient). Now let's see what happens when we plot the residuals.
g2=ggplot(dat, aes(y=ey, x=ex1, colour=x2))
g2=g2+geom_point(colour="grey50", size=5)
g2=g2+geom_smooth(method=lm, se= FALSE, colour="black")
g2=g2+geom_point(size=3)
g2
## `geom_smooth()` using formula 'y ~ x'
Now we can see that, between the residual of y and the residual of x1, there is clearly a negative relationship; the slope of this line should be around -1. And x2 is clearly unrelated to the residual of x1. That's what the linear model is doing: it is removing the effect of x2 from both x1 and y, and that's why we get the correct relationship when we fit both variables. It doesn't mean, however, that throwing every variable into our regression model is the right thing to do; there are consequences to throwing extra, unnecessary variables into the regression relationship.
Let's go back to our dataset, i.e., the Swiss dataset.
Finally, let's see what happens if we include a completely unnecessary variable in the model; by that we mean a variable that is simply a linear combination of other variables we've already included. We create a z value that is simply the sum of the Agriculture and Education variables, which adds no new information since both are already in the model. As seen in the code, we fit all the variables plus the newly created z.
z<-swiss$Agriculture + swiss$Education
lm(Fertility ~ . + z, data = swiss)
## Warning in terms.formula(formula, data = data): 'varlist' has changed (from
## nvar=6) to new 7 after EncodeVars() -- should no longer happen!
##
## Call:
## lm(formula = Fertility ~ . + z, data = swiss)
##
## Coefficients:
## (Intercept) Agriculture Examination Education
## 66.9152 -0.1721 -0.2580 -0.8709
## Catholic Infant.Mortality z
## 0.1041 1.0770 NA
Based on the result, the coefficient for Agriculture doesn't change, but z gets an NA. Why? R detects that the completely unnecessary variable included in the model is redundant and gives it an NA coefficient. That's an important thing to remember: if we see an NA after fitting a linear model in R, the most likely culprit is that we've included a variable that is numerically or exactly a linear combination of the other variables.
require(datasets)
data("InsectSprays")
require(stats)
g=ggplot(data=InsectSprays, aes(y=count, x=spray, fill=spray))
g=g+geom_violin(colour="black", size=2)
g=g+xlab("Type of Spray")
g=g+ylab("Insect Count")
g
Let’s talk about how we can test the difference between different factor levels in this case using linear models. And then, at the end we’ll talk about some shortcomings of the approach that we’re proposing here.
So, let's fit our model. Our outcome is the count of insects and our predictor is which spray was used, as a factor variable. To keep the printout a little more self-contained, we grab just the coefficient table.
summary(lm(count~spray, data=InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.5000000 1.132156 12.8074279 1.470512e-19
## sprayB 0.8333333 1.601110 0.5204724 6.044761e-01
## sprayC -12.4166667 1.601110 -7.7550382 7.266893e-11
## sprayD -9.5833333 1.601110 -5.9854322 9.816910e-08
## sprayE -11.0000000 1.601110 -6.8702352 2.753922e-09
## sprayF 2.1666667 1.601110 1.3532281 1.805998e-01
Here's the table we are looking for: the Intercept, sprayB, sprayC, sprayD, sprayE, and sprayF, with sprayA conspicuously missing. Everything here is in comparison with spray A. So the value 0.833 for sprayB is the difference in mean count between spray B and spray A, and the intercept, 14.5, is the mean for spray A.
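A quick check (a sketch, not in the original notes) that the intercept really is the mean count for spray A:
mean(InsectSprays$count[InsectSprays$spray == "A"]) # 14.5, the intercept above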
The same results can be obtained manually (i.e., by hard-coding the dummy variables), though it is quite tedious. We force A to be the reference level by defining indicator variables for every level except A.
summary(lm(count~
I(1*(spray=='B'))+I(1*(spray=='C'))+
I(1*(spray=='D'))+I(1*(spray=='E'))+
I(1*(spray=='F'))
, data=InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.5000000 1.132156 12.8074279 1.470512e-19
## I(1 * (spray == "B")) 0.8333333 1.601110 0.5204724 6.044761e-01
## I(1 * (spray == "C")) -12.4166667 1.601110 -7.7550382 7.266893e-11
## I(1 * (spray == "D")) -9.5833333 1.601110 -5.9854322 9.816910e-08
## I(1 * (spray == "E")) -11.0000000 1.601110 -6.8702352 2.753922e-09
## I(1 * (spray == "F")) 2.1666667 1.601110 1.3532281 1.805998e-01
We can see exactly the same values; in the automatic case, R picked the spray level that comes first alphanumerically (A) as the reference. Let's keep exploring this. Don't mistake A, B, C, D, E, F for separate factor variables; they are levels of the same factor. Now, what happens if we include indicators for all six levels? Let's see:
summary(lm(count~
I(1*(spray=='B'))+I(1*(spray=='C'))+
I(1*(spray=='D'))+I(1*(spray=='E'))+
I(1*(spray=='F'))+I(1*(spray=='A'))
, data=InsectSprays))
##
## Call:
## lm(formula = count ~ I(1 * (spray == "B")) + I(1 * (spray ==
## "C")) + I(1 * (spray == "D")) + I(1 * (spray == "E")) + I(1 *
## (spray == "F")) + I(1 * (spray == "A")), data = InsectSprays)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.333 -1.958 -0.500 1.667 9.333
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.5000 1.1322 12.807 < 2e-16 ***
## I(1 * (spray == "B")) 0.8333 1.6011 0.520 0.604
## I(1 * (spray == "C")) -12.4167 1.6011 -7.755 7.27e-11 ***
## I(1 * (spray == "D")) -9.5833 1.6011 -5.985 9.82e-08 ***
## I(1 * (spray == "E")) -11.0000 1.6011 -6.870 2.75e-09 ***
## I(1 * (spray == "F")) 2.1667 1.6011 1.353 0.181
## I(1 * (spray == "A")) NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.922 on 66 degrees of freedom
## Multiple R-squared: 0.7244, Adjusted R-squared: 0.7036
## F-statistic: 34.7 on 5 and 66 DF, p-value: < 2.2e-16
R gives NA for the A coefficient, and the reason is that it's redundant: once the intercept and the other five indicators are in the model, the A indicator is an exact linear combination of them.
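To see the redundancy concretely (a small sketch, not part of the original output): once the intercept is in the model, the A indicator is exactly one minus the sum of the other five indicators.
with(InsectSprays,
     all((spray == "A") ==
           1 - ((spray == "B") + (spray == "C") + (spray == "D") +
                  (spray == "E") + (spray == "F"))))  # TRUE: the A column carries no new information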
Now, what if we don't want the coefficients to be interpreted as differences from a reference level? What if we want each coefficient to be the mean for its group? We can do that, but we have to remove the intercept. So let's do just that: watch what happens when count is our outcome and spray is our predictor, but we remove the intercept.
summary(lm(count~spray - 1, data = InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## sprayA 14.500000 1.132156 12.807428 1.470512e-19
## sprayB 15.333333 1.132156 13.543487 1.001994e-20
## sprayC 2.083333 1.132156 1.840148 7.024334e-02
## sprayD 4.916667 1.132156 4.342749 4.953047e-05
## sprayE 3.500000 1.132156 3.091448 2.916794e-03
## sprayF 16.666667 1.132156 14.721181 1.573471e-22
We get a different set of parameters. The estimates are simply the means of the six spray groups. Let's check that they are.
summarise(group_by(InsectSprays, spray), mn=mean(count))
## `summarise()` ungrouping output (override with `.groups` argument)
Yes. They are.
We usually want one of the levels to be a reference level because we want to do tests. In the reference-level parameterization, the p-values are for t-tests of whether A differs from B, A differs from C, A differs from D, and so on. In the no-intercept parameterization, the p-values test whether each group mean differs from 0, which is a very different test. How we handle factor variables in lm matters a great deal for interpretation. It's not just a conceptual or theoretical thing to worry about; it's a very practical one. What our intercept means changes dramatically depending on what our reference level is, or on whether or not we include an intercept at all.
Let's do one last thing: reordering the reference level.
spray2<-relevel(InsectSprays$spray,"C")
summary(lm(count~spray2, data=InsectSprays))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.083333 1.132156 1.840148 7.024334e-02
## spray2A 12.416667 1.601110 7.755038 7.266893e-11
## spray2B 13.250000 1.601110 8.275511 8.509776e-12
## spray2D 2.833333 1.601110 1.769606 8.141205e-02
## spray2E 1.416667 1.601110 0.884803 3.794750e-01
## spray2F 14.583333 1.601110 9.108266 2.794343e-13
The intercept is now interpreted as the mean for sprayC, i.e., 2.0833. The coefficient 12.41 is the comparison of sprayA to sprayC, and 13.25 is the comparison of sprayB to sprayC. So, if we want to test sprayA versus sprayC, we look at the p-value on spray2A.
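If we wanted to compare two non-reference levels directly, say B versus D, one option (a sketch, not in the original notes) is to relevel again and read off the relevant coefficient:
spray3 <- relevel(InsectSprays$spray, "D")  # hypothetical helper variable, D as reference
summary(lm(count ~ spray3, data = InsectSprays))$coef["spray3B", ]  # B versus D comparison and its p-value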
Recap
If we include a factor variable like 'spray' in a model formula, R automatically includes an intercept and treats the first level of the factor as the reference level, so the intercept is interpreted as the mean for that reference level and the remaining coefficients as differences from it.
What we did here is not a complete data analysis. Modeling these data as if they were normally distributed is questionable: they are count data, bounded below by zero, so it would probably be more natural to model them as Poisson, or at least overdispersed Poisson, which we cover in the GLM portion of the class.
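As a preview of that GLM material (a hedged sketch, not the analysis done in these notes), a count model could be fit with glm(); the quasipoisson family also allows for overdispersion:
fit_pois <- glm(count ~ spray, family = quasipoisson, data = InsectSprays)
summary(fit_pois)$coef  # coefficients are on the log-mean scale, so interpretation differs from lm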
In addition, the variance (the variance around the mean) is clearly not constant, as our regression model assumes. So this is a potential problem: our mean estimates are probably fine, but our inferences are almost surely not, and that issue needs to be handled at some level.
Later on in the class we'll talk about some ways of handling this; some of the rest, like the more advanced treatment of unequal variances (so-called heteroscedasticity), is left for further statistical inference classes.
Let's move on to another example of multivariable regression, a very important one that underlies the topic of so-called ANCOVA: fitting multiple lines with different intercepts and different slopes, using the swiss data set. Earlier, we modeled fertility as the outcome with agriculture, the percent of a province working in agriculture, as the predictor. Now let's see how to fit different lines.
library(datasets)
data(swiss)
head(swiss)#Just seeing what's in the data
library(dplyr)
hist(swiss$Catholic)#Checking how the Catholic data are distributed
swiss=mutate(swiss, CatholicBin=1*(Catholic>50))#Creating a new binary Catholic variable: Protestant-majority provinces get 0 and Catholic-majority provinces get 1
head(swiss)#checking if the CatholicBin was created
Now, let's plot the data.
library(ggplot2)
g=ggplot(swiss, aes(x=Agriculture, y=Fertility, colour=factor(CatholicBin)))
g=g+geom_point(size=6, colour="black")
g=g+geom_point(size = 4)
g=g+xlab("% in Agriculture")+ ylab("Fertility")
g
We can see the two levels of the CatholicBin variable (0 for Protestant-majority provinces, 1 for Catholic-majority provinces). Clearly there are some outliers, which we will discuss later. We are now going to take the previous scatter plot and add two lines to it, one for the majority-Catholic provinces and one for the majority-Protestant provinces. In this first case, the lines have different intercepts but the same slope. Let's consider Model 1:
fit=lm(Fertility~Agriculture+factor(CatholicBin), data=swiss)
g1=g
g1=g1+geom_abline(intercept=coef(fit)[1],slope=coef(fit)[2], size=2)
g1=g1+geom_abline(intercept=coef(fit)[1]+coef(fit)[3],slope=coef(fit)[2], size=2)
g1
This model fits two parallel lines, one for each level of the binary variable; the coefficient in front of the binary variable is the change in intercept going from non-Catholic-majority to Catholic-majority provinces. If the factor variable had 4 levels, the same formula would fit 4 parallel lines, where the factor coefficients are changes in intercept relative to the reference level. Thus 7.8843 is the estimated change in the intercept of the relationship between Agriculture and Fertility going from a non-Catholic-majority province to a Catholic-majority one.
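For reference, the 7.8843 quoted above is the coefficient on factor(CatholicBin)1 in the parallel-lines fit we just plotted:
coef(fit)  # (Intercept), Agriculture, factor(CatholicBin)1; the last one is the intercept shift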
We can also fit lines that have different intercepts and different slopes, depending on whether the province is majority Catholic.
fit=lm(Fertility~Agriculture*factor(CatholicBin), data=swiss)#The asterisk includes the main effects and their interaction
summary(fit)$coef#checking the coefficient table
## Estimate Std. Error t value
## (Intercept) 62.04993019 4.78915566 12.9563402
## Agriculture 0.09611572 0.09881204 0.9727127
## factor(CatholicBin)1 2.85770359 10.62644275 0.2689238
## Agriculture:factor(CatholicBin)1 0.08913512 0.17610660 0.5061430
## Pr(>|t|)
## (Intercept) 1.919379e-16
## Agriculture 3.361364e-01
## factor(CatholicBin)1 7.892745e-01
## Agriculture:factor(CatholicBin)1 6.153416e-01
g1=g
g1=g1+geom_abline(intercept=coef(fit)[1],slope=coef(fit)[2], size=2)
g1=g1+geom_abline(intercept=coef(fit)[1]+coef(fit)[3],
slope=coef(fit)[2]+coef(fit)[4], size=2)
g1
Thus 2.8577 is the estimated change in the intercept of the linear relationship between Agriculture and Fertility going from non-Catholic-majority to Catholic-majority provinces, and the interaction term, 0.08914, is the estimated change in the slope. The estimated intercept in non-Catholic-majority provinces is 62.04993, while the estimated intercept in Catholic-majority provinces is 62.04993 + 2.85770. The estimated slope in non-Catholic-majority provinces is 0.09612, while it is 0.09612 + 0.08914 in Catholic-majority provinces. If the factor has more than two levels, all of the main effects are changes in intercept from the reference level, while all of the interaction terms are changes in slope (again relative to the reference level).
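A small sketch to extract the group-specific intercepts and slopes directly from the interaction fit above:
cf <- coef(fit)
c(intercept_nonCatholic = unname(cf[1]), intercept_Catholic = unname(cf[1] + cf[3]))
c(slope_nonCatholic = unname(cf[2]), slope_Catholic = unname(cf[2] + cf[4]))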
Adjustment is the idea of putting regressors into a linear model to investigate the role of a third variable on the relationship between two others, since it is often the case that a third variable can distort, or confound, that relationship.
As an example, consider looking at lung cancer rates and breath mint usage. For the sake of completeness, imagine if you were looking at forced expiratory volume (a measure of lung function) and breath mint usage. If you found a statistically significant regression relationship, it wouldn’t be wise to rush off to the newspapers with the headline “Breath mint usage causes shortness of breath!”, for a variety of reasons. First off, even if the association is sound, you don’t know that it’s causal. But, more importantly in this case, the likely culprit is smoking habits. Smoking rates are likely related to both breath mint usage rates and lung function. How would you defend your finding against the accusation that it’s just variability in smoking habits?
If your finding held up among non-smokers and smokers analyzed separately, then you might have something. In other words, people wouldn’t even begin to believe this finding unless it held up while holding smoking status constant. That is the idea of adding a regression variable into a model as adjustment. The coefficient of interest is interpreted as the effect of the predictor on the response, holding the adjustment variable constant.
In this part, we’ll use simulation to investigate how adding a regressor into a model addresses the idea of adjustment.
The easiest way to see this is to look at a two-group variable, for example a treatment versus a control group. Let's simulate some data and create some plots.
n <- 100 #total of 100 data points
t <- rep(c(0, 1), c(n/2, n/2))
x <- c(runif(n/2), runif(n/2))
beta0 <- 0
beta1 <- 2
tau <- 1
sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
head(y)#checking how 'y' looks
## [1] 1.8166381 1.4500560 0.4274299 1.1852635 1.8913845 1.0166647
Now let’s plot the data.
plot(x,y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)
Here are the results of the first simulation: one group colored red and another colored blue. It looks pretty straightforward, with a clear linear relationship between the outcome and the regressor. If, for example, y were blood pressure, then ignoring x, the two horizontal lines would be the means for the treated and control groups. Instead we fit a model of the form \(Y = \beta_0 + \beta_1 T + \beta_2 X + \epsilon\), which fits two parallel lines: \(\beta_1\) is the change in intercepts between the groups, while \(\beta_2\) is the common slope shared by the two groups.
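To see the adjusted estimates numerically (a sketch using the fit object created in the code above), print the coefficient table; the coefficient on t should be near the simulated tau of 1 and the coefficient on x near beta1 = 2:
round(summary(fit)$coef, 3)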
n <- 100;
t <- rep(c(0, 1), c(n/2, n/2));
x <- c(runif(n/2), 1.5 + runif(n/2));
beta0 <- 0;
beta1 <- 2;
tau <- 0;
sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)
In this experiment, the X variable is highly related to group status: knowing X, we could easily predict group membership, because the control group has X values of 1 or less while the treatment group has values above 1.5, with no overlap whatsoever. If we disregard X, there is an apparent strong relationship between the group variable and Y; if we account for X, there is basically none. In this case the apparent effect of group on Y is entirely explained by X. Our regression model would likely show a strongly significant group effect if group were included by itself, and this effect would vanish once X is included. Further, notice that there are no data with which to directly compare the groups at any particular value of X (there is no vertical overlap between the blue and red points). Thus the adjusted effect is entirely based on the model, specifically on the assumption of linearity. This is closely related to the idea behind propensity scores, and it is essentially the exact opposite of what would happen if we had randomized.
If we try to draw curves on this plot assuming non-linear relationships outside of the clouds of points for the blue and red groups, we will quickly conclude that many relationships are possible that would differ from this model's conclusions. Worse still, we have no data to check the assumptions. Of course, R will churn forward without complaint, fitting this model and reporting no significant difference between the groups. It's worth noting that these experiments just show how the data can yield different effects depending on whether X is included. In a real application, it may be that X should be included, or it may be that it shouldn't.
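A quick numerical check of that claim (a sketch reusing x, y, and t from this simulation): the group effect is large when X is ignored, and the estimate essentially vanishes once X is included.
round(summary(lm(y ~ t))$coef, 3)      # marginal model: strong apparent group effect
round(summary(lm(y ~ x + t))$coef, 3)  # adjusted model: the group estimate is near zero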
n <- 100;
t <- rep(c(0, 1), c(n/2, n/2));
x <- c(runif(n/2), .9 + runif(n/2));
beta0 <- 0;
beta1 <- 2;
tau <- -1;
sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)
In this experiment, we simulated data where the marginal (ignoring X) and conditional (using X) associations differ. First note that if X is ignored, one would estimate a higher marginal mean for Y in the red group than in the blue group. However, if we look at the intercepts in the fitted model, the blue group has the higher intercept. In other words, fitting lm(Y ~ Group) gives one answer and lm(Y ~ Group + X) gives the exact opposite answer, and in both cases the group effect is highly statistically significant!
Also in this setting there isn't a lot of overlap between the groups at any given X, so there isn't much direct evidence for comparing the groups without relying heavily on the model; group status is again strongly related to X (though not as strongly as in the previous example). The adjusted relationship suggests that the blue group is higher than the red group. The reversal of the effect happens because larger X values are more likely to be red and larger X also means higher Y. Once again, randomization would likely have eliminated this problem.
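A sketch verifying the reversal with the variables just simulated:
round(summary(lm(y ~ t))$coef, 3)      # marginal: the group coefficient should be positive
round(summary(lm(y ~ x + t))$coef, 3)  # adjusted: the group coefficient should be near tau = -1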
n <- 100;
t <- rep(c(0, 1), c(n/2, n/2));
x <- c(.5 + runif(n/2), runif(n/2));
beta0 <- 0;
beta1 <- 2;
tau <- 1;
sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)
Now that we have things a bit more under control, we can see how marginal and conditional associations can differ. Experiment 4 is a case where the marginal association is minimal yet the conditional association is large: by adding X to the model, the group effect becomes much more statistically significant.
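Again, a sketch checking the claim numerically with the current simulated variables:
round(summary(lm(y ~ t))$coef, 3)      # marginal: the group effect is near zero
round(summary(lm(y ~ x + t))$coef, 3)  # conditional: the group effect is near the simulated tau = 1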
n <- 100;
t <- rep(c(0, 1), c(n/2, n/2));
x <- c(runif(n/2, -1, 1), runif(n/2, -1, 1));
beta0 <- 0;
beta1 <- 2;
tau <- 0;
tau1 <- -4;
sigma <- .2
y <- beta0 + x * beta1 + t * tau + t * x * tau1 + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t + I(x * t))
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2] + coef(fit)[4], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)
Let's look at this plot; it obviously looks weird. In this case, the best-fitting model has both a group main effect and an interaction with X. The main point is that there is no single group effect: the effect of group depends on the level of X. At small values of X the red group is higher, at large values of X the blue group is higher, and at intermediate values they're about the same. Thus it makes no sense to talk about a group effect in this example; group and X are intrinsically linked in their impact on Y. This model can be written as \(Y = \beta_0 + \beta_1 T + \beta_2 X + \beta_3 T X + \epsilon\).
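A sketch printing the interaction fit from the code above; the I(x * t) coefficient should be near the simulated tau1 = -4, confirming that the effect of group depends on x:
round(summary(fit)$coef, 3)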
p <- 1
n <- 100;
x2 <- runif(n);
x1 <- p * runif(n) - (1 - p) * x2
beta0 <- 0;
beta1 <- 1;
tau <- 4 ;
sigma <- .01
y <- beta0 + x1 * beta1 + tau * x2 + rnorm(n, sd = sigma)
plot(x1, y, type = "n", frame = FALSE)
abline(lm(y ~ x1), lwd = 2)
co.pal <- heat.colors(n)
points(x1, y, pch = 21, col = "black", bg = co.pal[round((n - 1) * x2 + 1)], cex = 2)
Nothing we've talked about so far is specific to having a binary treatment and a continuous X. Here we have the same outcome y, but x1 and x2 are both continuous; because it's hard to show three variables at once, x2 is color coded: lighter points correspond to higher values of x2 and darker red points to lower values. Looking at the graph, we don't see much of a relationship between y and x1.
However, let's look at this in three dimensions. We are going to create a three-dimensional plot, which we can rotate, using the rgl package.
library(rgl)
## Warning: package 'rgl' was built under R version 4.0.3
p <- 1
x2 <- runif(n);
x1 <- p * runif(n) - (1 - p) * x2
beta0 <- 0;
beta1 <- 1;
tau <- 4 ;
sigma <- .01
y <- beta0 + x1 * beta1 + tau * x2 + rnorm(n, sd = sigma)
plot3d(x1, x2, y)
Looking at the picture, we can see that there is very little residual scatter around the plane defined by x1 and x2. We can confirm the strong adjusted relationship by checking the residual plot as well.
plot(resid(lm(x1 ~ x2)), resid(lm(y ~ x2)), frame = FALSE, col = "black", bg = "lightblue", pch = 21, cex = 2)
abline(lm(I(resid(lm(y ~ x2))) ~ I(resid(lm(x1 ~ x2)))), lwd = 2) # regress the y residuals (vertical axis) on the x1 residuals (horizontal axis)
We can see a very strong effect of x1 after having removed the effect of x2. The main point of both the binary and the continuous examples is that there is nothing special about the binary case other than being easy to visualize. With two or more regressors, adjusted effects can reverse sign, get bigger, get smaller, go from significant to not significant, or from not significant to significant, in every possible permutation.
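A numeric version of the residual-on-residual idea (a sketch): regress the y residuals on the x1 residuals, both taken after removing x2; the slope should be close to the simulated beta1 = 1.
rx1 <- resid(lm(x1 ~ x2))  # x1 with the effect of x2 removed
ry  <- resid(lm(y ~ x2))   # y with the effect of x2 removed
round(summary(lm(ry ~ rx1))$coef, 4)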
Probably the best way to think about it is that we have to bring specific subject-matter or clinical/scientific expertise into the model-building exercise. Sometimes the treatment and the other regressor are highly related, but the mere fact that they are related does not by itself make it interesting or appropriate to adjust for x.
The vertical distances between the observed data points and the fitted regression line are called residuals. We can generalize this idea to the vertical distances between the observed data and the fitted surface in multivariable settings.
Obtaining and plotting residuals in R is particularly easy. The function resid returns the residuals of a model fit with lm, and some useful plots, including a residual plot, are produced by calling plot on the output of an lm fit. Let's consider the swiss dataset from the previous chapters; in this section we'll go over the basics of what these plots are.
data(swiss)
par(mfrow = c(2, 2))#To fit all the plots in one page
fit <- lm(Fertility ~ . ,data = swiss)#just includes all the terms in the data frame
plot(fit) #just plots the fit and co-creates a series of residual and diagnostic plots for us
One of the basic things to understand is the distinction between influential points and outlying points. To illustrate it, after walking through the diagnostic plots we'll create a plot with a clear linear relationship between the predictor and the response.
In the first plot (Residuals vs Fitted), residuals that spread out in a cone shape as we move away from the origin, or the reverse pattern, suggest heteroscedasticity. Here the plot doesn't look too bad, because we don't see many signs of model departures.
The Q-Q plot is specifically designed to evaluate the normality of the error terms: points falling along the 45-degree line support normality.
The third plot, the Scale-Location plot, shows the standardized residuals (ordinary residuals rescaled by their standard deviation, so they are on a comparable, roughly t-like scale across subjects and experiments) against the fitted values.
The final plot shows the relationship between the residuals and leverage, with standardized residuals on the y-axis and leverage on the x-axis. We look for any systematic pattern and try to reason about why points with higher leverage have particularly large or small residuals, as in this fourth plot.
n <- 100;
x <- rnorm(n);
y <- x + rnorm(n, sd = .3)
plot(c(-3, 6), c(-3, 6), type = "n", frame = FALSE, xlab = "X", ylab = "Y")
abline(lm(y ~ x), lwd = 2)
points(x, y, cex = 2, bg = "lightblue", col = "black", pch = 21)
points(0, 0, cex = 2, bg = "darkorange", col = "black", pch = 21)
points(0, 5, cex = 2, bg = "darkorange", col = "black", pch = 21)
points(5, 5, cex = 2, bg = "darkorange", col = "black", pch = 21)
points(5, 0, cex = 2, bg = "darkorange", col = "black", pch = 21)
There are four orange-colored points; they illustrate influence, leverage, and outlyingness, and we concentrate our discussion around them.
Calling a point an outlier is vague.
* Outliers can be the result of spurious or real processes. If the process is spurious we want to remove that point from our fitted model; if it's part of a real process, we don't want to drop the point and pretend it never happened.
* Outliers can have varying degrees of influence.
* Outliers can conform to the regression relationship (i.e. be marginally outlying in X or Y, but not outlying given the regression relationship).
* The upper left hand point has low leverage and low influence, and outlies in a way that does not conform to the regression relationship.
* The lower left hand point has low leverage, low influence, and is not an outlier in any sense.
* The upper right hand point has high leverage but chooses not to exert it, and thus has low actual influence because it conforms to the regression relationship of the other points.
* The lower right hand point has high leverage and would exert it if it were included in the fit.
Residuals have the same units as the outcome: if Y is measured in pounds, the residuals are in pounds. So they are not necessarily comparable across settings, and we can't, for example, just set a universal residual threshold. The list of outlier types above shows why we also need standardized measures of influence.
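A small sketch contrasting ordinary and standardized residuals for the swiss fit above (rstandard() is in the stats package):
head(cbind(ordinary = resid(fit), standardized = rstandard(fit)))  # same pattern, different scales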
?influence.measures
Run the command above to see the full suite of influence measures in the stats package. The measures include:
* rstandard - standardized residuals: residuals divided by their standard deviations
* rstudent - standardized residuals where the \(i^{th}\) data point was deleted in the calculation of the standard deviation for its residual, so that it follows a t distribution
* hatvalues - measures of leverage
* dffits - change in the predicted response when the \(i^{th}\) point is deleted in fitting the model
* dfbetas - change in individual coefficients when the \(i^{th}\) point is deleted in fitting the model
* cooks.distance - overall change in the coefficients when the \(i^{th}\) point is deleted
* resid - returns the ordinary residuals
* resid(fit) / (1 - hatvalues(fit)) - where fit is the linear model fit, returns the PRESS residuals, i.e. the leave-one-out cross-validation residuals: the difference between the response and the predicted response at data point \(i\), where point \(i\) was not included in the model fitting
So, let's go through some simulation experiments to understand how some of these diagnostic measures work.
In this case there's a big cloud of uncorrelated data, generated from pairs of independent random normals, plus a single outlying point added at (10, 10).
x <- c(10, rnorm(n))#create a bunch of random standard normal and add a point at 10
y <- c(10, c(rnorm(n)))
plot(x, y, frame = FALSE, cex = 2, pch = 21, bg = "lightblue", col = "black")
abline(lm(y ~ x))
fit<-lm(y~x)
round(dfbetas(fit)[1:10,2],3)
## 1 2 3 4 5 6 7 8 9 10
## 8.624 -0.081 -0.018 0.005 0.055 -0.011 -0.082 -0.038 0.012 -0.172
round(hatvalues(fit)[1:10],3)
## 1 2 3 4 5 6 7 8 9 10
## 0.460 0.019 0.012 0.010 0.020 0.011 0.046 0.010 0.015 0.016
The plot shows that a strong correlation is estimated from the data merely because of the existence of this single point; otherwise the correlation would be estimated to be near zero.
The isolated point at (10, 10), which is the first observation, has a far larger dfbeta magnitude (8.624) than the rest of the points, and its hatvalue (0.460) is much larger than the others as well. Hatvalues always lie between 0 and 1.
Now let's look at another instance where there is a clear regression relationship and the data points lie along the regression line. Here, too, we add an outlier that sits farther away, isolated from the rest of the data points; however, this point adheres nicely to the regression line. Let's see how our diagnostic values look in this case.
x<-rnorm(n)
y<-x+rnorm(n, sd=.3)
x<-c(5,x)
y<-c(5,y)
plot(x,y, frame=FALSE, cex=2, pch=21, bg="lightblue", col="black")
fit2<-lm(y~x)
abline(fit2)
round(dfbetas(fit2)[1:10,2],3) #dfbetas for the slope, rounded to three decimal places
## 1 2 3 4 5 6 7 8 9 10
## 0.284 -0.032 -0.017 -0.017 -0.006 0.084 0.072 -0.014 0.018 -0.026
round(hatvalues(fit2)[1:10],3)
## 1 2 3 4 5 6 7 8 9 10
## 0.267 0.010 0.012 0.018 0.010 0.013 0.021 0.010 0.011 0.011
Looking at the dfbetas, the value for the added point is still relatively large (0.284), as is its hatvalue (0.267), but neither is as big as in the first case. The point has only a small influence on the regression line.
The next example, described by Stefanski (TAS 2007, Vol. 61), shows why we need to look at residual plots: they zoom in on potential problems with our model.
dat <- read.table('http://www4.stat.ncsu.edu/~stefanski/NSF_Supported/Hidden_Images/orly_owl_files/orly_owl_Lin_4p_5_flat.txt', header = FALSE)
pairs(dat)
summary(lm(V1 ~ . -1, data = dat))$coef #the intercept has been omitted (-1)
## Estimate Std. Error t value Pr(>|t|)
## V2 0.9856157 0.12798121 7.701253 1.989126e-14
## V3 0.9714707 0.12663829 7.671225 2.500259e-14
## V4 0.8606368 0.11958267 7.197003 8.301184e-13
## V5 0.9266981 0.08328434 11.126919 4.778110e-28
If we look at the model fit, every single p-value is strongly statistically significant. Is that okay? Let's check the residuals.
fit <- lm(V1 ~ . - 1, data = dat); plot(predict(fit), resid(fit), pch = '.')
We see a clever little picture that we wouldn't have noticed had we not checked the residuals. It shows a clear pattern, suggesting a very poor model fit.
This section addresses a challenging question: how do we choose which variables to include in a regression model? Sadly, no single easy answer exists; the most reasonable answer is "it depends." These concepts bleed into ideas of machine learning, which is largely focused on high-dimensional variable selection and weighting. In this part we cover some of the basics and, most importantly, the consequences of over- and under-fitting a model.
Here it is important to distinguish between multivariable regression and the prediction class that is part of the Data Science specialization. In prediction, we're less concerned with interpretability, so we're far more tolerant of complex models, lots of interactions, and automated search algorithms and rules to come up with the best prediction with respect to a specific loss function. In contrast, in a lot of regression examples we want simple, interpretable models, which means we place a heavy emphasis on parsimony.
Model selection has broad consequences. If a variable in a regression model explains a tiny bit of extra variation but really harms interpretability, we omit it, even though it does explain some positive amount of variability. Scott Zeger likes to think of a model as a lens through which we look at the data. In that context, a model is neither right nor wrong; any model that teaches us something valuable and true about our data is useful. To see super-fine detail we need a microscope, while to see the big picture we need a telescope; the right model, likewise, highlights the features that teach us something true.
Let's cite Donald Rumsfeld's famous remark about "known knowns, known unknowns, and unknown unknowns."
This quote was heavily criticized, but for regression it makes a lot of sense. Known knowns are regressors that we know we should include in our model and that we have recorded, so we just put them in. Known unknowns are regressors that we know we should include but, unfortunately, didn't collect. Unknown unknowns are regressors that we should include but don't even know about, so we couldn't have collected them.
Now let's simulate some data and see how the \(R^2\) keeps increasing as the number of variables in the model grows. We redo the simulation for every number of regressors and include them all in the model; the \(R^2\) just keeps going up even though all we are doing is generating random normal vectors and putting them into a regression model.
n <- 100#one hundred data points
plot(c(1, n), 0 : 1, type = "n", frame = FALSE, xlab = "p", ylab = "R^2")
r <- sapply(1 : n, function(p)
{
y <- rnorm(n); x <- matrix(rnorm(n * p), n, p)
summary(lm(y ~ x))$r.squared
}
)
lines(1 : n, r, lwd = 2)
abline(h = 1)
We talked a little about how excluding important regressors results in bias. Now let's dig a little further into the other side of this phenomenon: including unnecessary regressors causes so-called variance inflation. Let's see it with a little simulation. The reason we use simulation is that in a fitted model the variance is estimated rather than known, which has somewhat conflicting properties, so we don't necessarily see the estimated standard error go up when we include an unnecessary regressor in a single model; we have to look at it through simulation. Let's go ahead and do this.
n <- 100; #100 data points
nosim <- 1000 #1000 simulations
## Then 3 independent normal vectors (x1, x2, & x3)
x1 <- rnorm(n);
x2 <- rnorm(n);
x3 <- rnorm(n);
betas <- sapply(1 : nosim, function(i){
y <- x1 + rnorm(n, sd = .3)#model where the outcome y only depends on x1, and an error (standard random normals with SD of 0.3)
c(coef(lm(y ~ x1))[2],#Going to check the SE for the model that just includes the x1
coef(lm(y ~ x1 + x2))[2],#Going to check the SE for the model that just includes the x1 and x2
coef(lm(y ~ x1 + x2 + x3))[2])#Going to check the SE for the model that includes all x1, x2, & x3
})#They are with in a function suggesting that we are going to do it over and over again, and look at the standard deviation of the simulated estimated coefficients.
round(apply(betas, 1, sd), 5)
## x1 x1 x1
## 0.02824 0.02824 0.02866
These values are about the same: including the extra variables caused almost no variance inflation, because the standard deviations of the simulated coefficient estimates are virtually identical. The reason is that the other two variables, x2 and x3, have nothing to do with x1, since we simulated them independently.
Let's now look at a case where we simulate x2 and x3 so that they depend heavily on x1.
n <- 100; #100 data points
nosim <- 1000 #1000 simulations
## Now x2 and x3 are constructed to be correlated with x1
x1 <- rnorm(n);
x2 <- x1/sqrt(2) + rnorm(n)/sqrt(2);
x3 <- x1* 0.95+ rnorm(n)*sqrt(1-0.95^2);
betas <- sapply(1 : nosim, function(i){
y <- x1 + rnorm(n, sd = .3)#model where the outcome y only depends on x1, and an error (standard random normals with SD of 0.3)
c(coef(lm(y ~ x1))[2],#Going to check the SE for the model that just includes the x1
coef(lm(y ~ x1 + x2))[2],#Going to check the SE for the model that just includes the x1 and x2
coef(lm(y ~ x1 + x2 + x3))[2])#Going to check the SE for the model that includes all x1, x2, & x3
})#They are with in a function suggesting that we are going to do it over and over again, and look at the standard deviation of the simulated estimated coefficients.
round(apply(betas, 1, sd), 5)
## x1 x1 x1
## 0.03042 0.04129 0.10171
Now we see a huge amount of variance inflation, especially for the third model that includes both x2 and x3: the standard error for the x1 coefficient has gone up by roughly a factor of three.
For example, suppose we have height in a model and we also include weight, which is highly related; height and weight tend to track pretty well with one another. If weight really needs to be in the model, we just have to bite the bullet and accept the extra standard error; if it's unnecessary, we don't want to include it. So the main point is: the more correlated the added covariates are with the regressor we're interested in, the worse off we are in terms of the penalty of increased standard error.
The examples above show that there is no free lunch. If we omit variables that are important, we get bias. If we include variables that are unnecessary, we inflate standard errors, especially when the unnecessary variables are correlated with our variable of interest. This again reiterates why randomization is so important.
Let’s check variance inflation with the Swiss data.
library(datasets)
data(swiss)
library(car)
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
fit<-lm(Fertility~., data=swiss)
vif(fit)
## Agriculture Examination Education Catholic
## 2.284129 3.675420 2.774943 1.937160
## Infant.Mortality
## 1.107542
So we obtained the variance inflation factors above, and their square roots below. The VIF for Agriculture is 2.28, suggesting that the variance of the Agriculture coefficient is 2.28 times what it would be if Agriculture were orthogonal (mathematics: at right angles; statistics: statistically independent) to all the other regressors; its standard error is inflated by sqrt(2.28), about 1.51. Some VIFs are quite high, like Examination at approximately 3.68, meaning nearly four times the variance (almost twice the standard error) compared with the orthogonal case. Examination and Education are highly correlated with each other, which is why their VIFs are the largest; Infant.Mortality is not as correlated with the other regressors, so its VIF is much smaller, 1.107542.
sqrt(vif(fit))#square roots put the inflation on the standard-deviation (standard error) scale
## Agriculture Examination Education Catholic
## 1.511334 1.917138 1.665816 1.391819
## Infant.Mortality
## 1.052398
That gets us a little headway into what happens when we include unnecessary variables; variance inflation factors are one way to look at it, and they're incredibly useful little tools.
Up to this point we've talked about the impact of including unnecessary variables or omitting necessary ones on the regression coefficients. But there's one other parameter we estimate in a regression model: the residual variance, \(\sigma^2\). We can mathematically describe the impact of omitting necessary variables or including unnecessary ones, and it follows the same lines as before. If we underfit the model, i.e. omit important variables, the variance estimate is biased upward, because variation that is actually systematically explained by the omitted covariates gets attributed to the error term. On the other hand, if we fit the model correctly or overfit it, the variance estimate is unbiased; however, the variance of the variance estimate gets inflated if we include unnecessary variables. So it's roughly the same rule: omit needed variables and we get bias, include unnecessary ones and we get a less reliable estimate.
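A minimal simulation sketch (not from the original notes) of the underfitting claim: when an important regressor is omitted, the residual standard deviation estimate is biased upward.
n_sim <- 1000
x1s <- rnorm(n_sim); x2s <- rnorm(n_sim)        # two needed regressors
ys <- x1s + x2s + rnorm(n_sim)                  # true residual sd is 1
c(correct  = summary(lm(ys ~ x1s + x2s))$sigma, # close to 1
  underfit = summary(lm(ys ~ x1s))$sigma)       # inflated toward sqrt(2), since the x2s variation is absorbed into the error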
Now let's talk about model selection in general, and automated model selection. At one point this was a purely statistical topic, but it has largely moved into the realm of machine learning. Even for relatively simple linear regression models, the space of model terms we have to search explodes quickly once we start including interactions and polynomial terms such as the square of a regressor. If we have many regressors and are interested in reducing this space, factor-analytic tools like principal components are available; however, they come with consequences, for example the principal components or factors we obtain may be less interpretable than the original data.
A good design can often eliminate the need for a lot of model discussion. We've talked about how randomization can prevent many of these problems by making our variable of interest unrelated to nuisance variables, including nuisance variables we don't even know about. Other aspects of design can serve the same purpose, for example stratifying and randomizing within strata. The classic example, from when these ideas were developed, is R.A. Fisher's work on field crop experiments: if we were trying a different kind of seed, we might block on different areas of the field and randomize the seeds to those areas. So we might have two kinds of seed distributed in a systematic way that is fair across the field, with randomization within that design.
It's often the case that good, thoughtful experimental design can eliminate the need for many of the considerations we would otherwise have to work through in model building with observational data. The last thing in this topic is one very useful automated model search technique: looking at nested models. For example, we are often interested in a particular variable and how the other variables we've collected affect its estimated effect. We may be interested in a treatment, but worry that the treatment groups are imbalanced. So we look at the model that includes the treatment by itself, then the model that adds, say, age if the ages weren't balanced between the groups, then one that also adds gender if the genders weren't balanced, and so on. Creating nested models, where every successive model contains all the terms of the previous one, leads to a very easy way of testing each successive model, and these nested model tests are very easy to do.
Let's check how this works. We fit three linear models to the swiss dataset. The first includes only Agriculture; let's pretend that's the variable we're interested in. The second adds Examination and Education, which we think may be measuring similar things, and the third includes all the terms. We are interested in what happens to the Agriculture effect as we move through these three models, and we can test whether the inclusion of each additional set of terms is necessary with the anova function.
fit1 <- lm(Fertility ~ Agriculture, data = swiss)
fit3 <- update(fit, Fertility ~ Agriculture + Examination + Education)
fit5 <- update(fit, Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality)
anova(fit1, fit3, fit5)
What we get is a listing of the models (Model 1, Model 2, Model 3), the residual degrees of freedom (the number of data points minus the number of parameters fit), the residual sums of squares (RSS), and the extra degrees of freedom going from Model 1 to Model 2 and from Model 2 to Model 3. We added two parameters going from Model 1 to Model 2, so that Df is 2, and two more going from Model 2 to Model 3. Then come the sums of squares attributable to the added terms, an F statistic, and a p-value. We see that including Examination and Education (fit3) appears necessary compared with the model containing Agriculture alone (fit1). Likewise, the inclusion of Catholic and Infant.Mortality (fit5) appears necessary beyond Examination, Education, and Agriculture.
To sum up, if the way we're interested in looking at the data naturally falls into a nested model search, as it often does, then some kind of nested model search is a reasonable thing to do. It doesn't work if the models we're looking at aren't nested. For example, if the second model had Examination but not Education, and the third model had Education but not Examination, this approach wouldn't apply, and we'd have to do something else. There we get into the harder world of automated model selection with things like information criteria.
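For non-nested models, one common option (a sketch, not covered in detail here) is to compare information criteria such as AIC, where smaller is better:
fitA <- lm(Fertility ~ Agriculture + Examination, data = swiss)  # hypothetical non-nested pair
fitB <- lm(Fertility ~ Agriculture + Education, data = swiss)
AIC(fitA, fitB)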
fit <- lm(mpg ~ factor(cyl) + wt, data = mtcars)
summary(fit)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.990794 1.8877934 18.005569 6.257246e-17
## factor(cyl)6 -4.255582 1.3860728 -3.070244 4.717834e-03
## factor(cyl)8 -6.070860 1.6522878 -3.674214 9.991893e-04
## wt -3.205613 0.7538957 -4.252065 2.130435e-04
Correct: Holding weight constant, cylinder appears to have less of an impact on mpg than if weight is disregarded.
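For comparison (a sketch), the unadjusted model drops weight; its cylinder coefficients are larger in magnitude than the adjusted ones above:
summary(lm(mpg ~ factor(cyl), data = mtcars))$coef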
fit1 <- lm(mpg ~ factor(cyl) + wt, data = mtcars)
fit2 <- lm(mpg ~ factor(cyl) * wt, data = mtcars)
summary(fit1)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.990794 1.8877934 18.005569 6.257246e-17
## factor(cyl)6 -4.255582 1.3860728 -3.070244 4.717834e-03
## factor(cyl)8 -6.070860 1.6522878 -3.674214 9.991893e-04
## wt -3.205613 0.7538957 -4.252065 2.130435e-04
summary(fit2)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.571196 3.193940 12.3894599 2.058359e-12
## factor(cyl)6 -11.162351 9.355346 -1.1931522 2.435843e-01
## factor(cyl)8 -15.703167 4.839464 -3.2448150 3.223216e-03
## wt -5.647025 1.359498 -4.1537586 3.127578e-04
## factor(cyl)6:wt 2.866919 3.117330 0.9196716 3.661987e-01
## factor(cyl)8:wt 3.454587 1.627261 2.1229458 4.344037e-02
anova(fit1, fit2)
lm(mpg ~ I(wt * 0.5) + factor(cyl), data = mtcars)
##
## Call:
## lm(formula = mpg ~ I(wt * 0.5) + factor(cyl), data = mtcars)
##
## Coefficients:
## (Intercept) I(wt * 0.5) factor(cyl)6 factor(cyl)8
## 33.991 -6.411 -4.256 -6.071
x <- c(0.586, 0.166, -0.042, -0.614, 11.72)
y <- c(0.549, -0.026, -0.127, -0.751, 1.344)
influence(lm(y ~ x))$hat
## 1 2 3 4 5
## 0.2286650 0.2438146 0.2525027 0.2804443 0.9945734
## showing how it's actually calculated
xm <- cbind(1, x)
diag(xm %*% solve(t(xm) %*% xm) %*% t(xm))
## [1] 0.2286650 0.2438146 0.2525027 0.2804443 0.9945734
x <- c(0.586, 0.166, -0.042, -0.614, 11.72)
y <- c(0.549, -0.026, -0.127, -0.751, 1.344)
influence.measures(lm(y ~ x))
## Influence measures of
## lm(formula = y ~ x) :
##
## dfb.1_ dfb.x dffit cov.r cook.d hat inf
## 1 1.0621 -3.78e-01 1.0679 0.341 2.93e-01 0.229 *
## 2 0.0675 -2.86e-02 0.0675 2.934 3.39e-03 0.244
## 3 -0.0174 7.92e-03 -0.0174 3.007 2.26e-04 0.253 *
## 4 -1.2496 6.73e-01 -1.2557 0.342 3.91e-01 0.280 *
## 5 0.2043 -1.34e+02 -149.7204 0.107 2.70e+02 0.995 *
Correct: It is possible for the coefficient to reverse sign after adjustment. For example, it can be strongly significant and positive before adjustment and strongly significant and negative after adjustment.