# install.packages("swirl") # library(swirl) # install_from_swirl("Regression Models") # swirl()
That code coincides with a Coursera course that is like a more R focused version of ECO 380.
October 2, 2016
# install.packages("swirl") # library(swirl) # install_from_swirl("Regression Models") # swirl()
That code installs swirl lessons that accompany a Coursera course, which is like a more R-focused version of ECO 380.
In general we're looking for a line that estimates a value of \(y\) based on an observed value of \(x\).
The line that turns out to perform best is the least squares estimator, which minimizes the sum of squared residuals:
\[ \min_{\beta_0,\, \beta_1} \sum_{i=1}^{n} \left( y_i - (\beta_0 + \beta_1 x_i) \right)^2 \]
Which yields…
\[\hat{\beta_1} = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i-\bar{y})}{\sum_{i=1}^{n}(x_i - \bar{x})(x_i - \bar{x})} = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i-\bar{y})}{\sum_{i=1}^{n}(x_i - \bar{x})^2}\]
Which is the same as:
\[ \hat{\beta_1} = \frac{(n-1)\,\mathrm{cov}(x,y)}{(n-1)\,\mathrm{var}(x)} = \frac{\mathrm{cov}(x,y)}{\mathrm{var}(x)} \]
Our line will go through the point \((\bar{x},\bar{y})\), so we can rearrange our model to find:
\[\hat{\beta_0} = \bar{y} - \hat{\beta_1} \bar{x}\]
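A quick sanity check of those formulas (a sketch; it loads the galton data that also appears below):

```r
# Hand-compute the least squares slope and intercept, then compare to lm()
library(UsingR) # galton data in here
data(galton)
x <- galton$parent
y <- galton$child
b1 <- cov(x, y) / var(x)     # slope: cov(x, y) / var(x)
b0 <- mean(y) - b1 * mean(x) # intercept: y-bar minus slope times x-bar
c(b0, b1)
coef(lm(child ~ parent, data = galton)) # should match
```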
What happens if you demean your data, then run the regression through the origin?
```r
x <- x - mean(x)
y <- y - mean(y)
```
```r
library(tidyverse) # tidyverse includes dplyr, ggplot2, etc. Very convenient!
library(UsingR)    # galton data in here
data(galton)
galton.demean <- data.frame(
  child  = galton$child  - mean(galton$child),
  parent = galton$parent - mean(galton$parent)
)
out1 <- lm(formula = child ~ parent - 1, data = galton.demean)
```
Notice: some of our commands take place over multiple lines. If a line of code ends with `(` or `,`, R will wait for something to complete the statement. On the console it will give a line that starts with `+` instead of `>`.
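For example, at the console (a sketch of what you'd see):

```r
# > mean(        # the line ends with "(" ...
# + c(1, 2, 3)   # ... so R prompts with "+" until the call is complete
# + )
# [1] 2
```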
```
## 
## Call: lm(formula = child ~ parent - 1, data = galton.demean)
##        Estimate Std. Error t value Pr(>|t|)    
## parent  0.64629    0.04111   15.72   <1e-04 ***
## 
## Residual standard error: 2.237 on 927 degrees of freedom
## Multiple R-squared:  0.2105, Adjusted R-squared:  0.2096 
## F-statistic: 247.1 on 1 and 927 DF,  p-value: 1e-04
```
```
## 
## Call:
## lm(formula = child ~ parent, data = galton)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8050 -1.3661  0.0487  1.6339  5.9264 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.94153    2.81088   8.517   <2e-16 ***
## parent       0.64629    0.04114  15.711   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.239 on 926 degrees of freedom
## Multiple R-squared:  0.2105, Adjusted R-squared:  0.2096 
## F-statistic: 246.8 on 1 and 926 DF,  p-value: 0
```
```r
print(h + geom_smooth(method = "lm"))
```
```r
origin.story <- lm(child ~ parent - 1, galton)
print(h + geom_abline(slope = origin.story$coefficients[[1]], color = "red"))
```
```r
origin.story <- lm(child ~ parent - 1, galton)
h + coord_cartesian(xlim = c(0, 72), ylim = c(0, 72)) +
  geom_abline(slope = origin.story$coefficients[[1]], color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_vline(aes(xintercept = mean(galton$parent)), color = "green") + # mean of x
  geom_hline(aes(yintercept = mean(galton$child)), color = "green")   # mean of y
```
Here comes the hyperplane
Set a timer to avoid getting bogged down here.
Don't Panic! If you miss anything, just check the slides later. If it still doesn't make sense, ask next class.
data("mtcars") #?mtcars # Uncomment to get help on this dataset dim(mtcars) # how many rows and columns of data?
```
## [1] 32 11
```
```r
# What does the data look like?
# Not statistically, but specifically...
head(mtcars)
```
```
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
```
```r
str(mtcars) # structure of mtcars
```
```
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
```
```r
pairs(mtcars[, 1:7], upper.panel = NULL)
```
```r
library(GGally) # install.packages("GGally")
mtcars <- mutate(mtcars, cyl = as.factor(cyl))
mainplot <- ggpairs(mtcars[, c(1:4, 6, 7, 9)],
                    columns = c(1, 3:6),
                    upper = list(continuous = "smooth_loess",
                                 combo = "box",
                                 discrete = ""),
                    title = "mtcars")
```
```r
mainplot
```
What sort of questions might we ask?
We want to create a model where `mpg` is a function of `hp`. But we want to hold relevant factors constant (as best we can).
So let's create a linear model:
\[mpg = \beta_0 + \beta_1 hp + \beta_2 wt + ... \]
This ability to hold other factors constant is why linear regression is still the go-to statistical tool for most economists and statisticians.
```r
# Simple regression
fit0 <- lm(mpg ~ hp, mtcars)
# Multiple regression
fit1 <- lm(mpg ~ hp + wt + disp, mtcars)
```
```
## 
## Call: lm(formula = mpg ~ hp, data = mtcars)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.09886    1.63392  18.421   <1e-04 ***
## hp          -0.06823    0.01012  -6.742   <1e-04 ***
## 
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
## F-statistic: 45.46 on 1 and 30 DF,  p-value: 1e-04
```
```
## 
## Call: lm(formula = mpg ~ hp + wt + disp, data = mtcars)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.105505   2.110815  17.579  < 1e-04 ***
## hp          -0.031157   0.011436  -2.724  0.01097 *  
## wt          -3.800891   1.066191  -3.565  0.00133 ** 
## disp        -0.000937   0.010350  -0.091  0.92851    
## 
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8083 
## F-statistic: 44.57 on 3 and 28 DF,  p-value: 1e-04
```
We should be skeptical of overly complex models and small datasets. Humans have an important weakness: we often see patterns that aren't really there.
Degrees of freedom is a score for a model that balances these two desires: fitting the data closely while keeping the model simple enough that we aren't just fitting noise.
Estimating \(\mu_x\) as \(\bar{x}\) and \(\sigma_x\) as \(s_x\) isn't doing "anything fancy," so even with only 30 observations you can feel reasonably good about what you're doing.
More complex models \(\implies\) more assumptions.
If we make too many assumptions, the relationship we draw might be the result of those assumptions rather than some true underlying relationship between our variables of interest.
Degrees of Freedom is (sort of) about how much the data could vary within our model.
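In R you can read this score straight off a fitted model; a minimal sketch using fit1 from above:

```r
# Residual degrees of freedom: observations minus estimated coefficients
nobs(fit1) - length(coef(fit1)) # 32 observations - 4 coefficients = 28
df.residual(fit1)               # the same number, straight from the model
```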
Now just the confidence intervals:
```r
confint(fit1)
```
```
##                   2.5 %       97.5 %
## (Intercept) 32.78169625 41.429314293
## hp          -0.05458171 -0.007731388
## wt          -5.98488310 -1.616898063
## disp        -0.02213750  0.020263482
```
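Each row is just estimate plus or minus a t quantile times the standard error; a sketch reproducing the wt row by hand:

```r
# Rebuild the 95% interval for wt from the pieces lm() reports
est <- coef(summary(fit1))["wt", "Estimate"]
se  <- coef(summary(fit1))["wt", "Std. Error"]
est + c(-1, 1) * qt(0.975, df.residual(fit1)) * se # matches the wt row above
```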
Unsurprisingly, power, weight, and engine displacement are negatively associated with mpg.
The effect of displacement does not appear to be statistically significant. That is, when we do a hypothesis test to see if its slope parameter is different from 0, we fail to reject the null hypothesis.
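One way to follow up (a sketch, not something from the original notes): refit without disp and let an F-test compare the nested models:

```r
fit.nodisp <- lm(mpg ~ hp + wt, mtcars) # hypothetical model that drops disp
anova(fit.nodisp, fit1)                 # F-test: does adding disp improve fit?
```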
The Student's t distribution was introduced by William Gosset in 1908 but published under the pseudonym "Student" because (it's said) his employer, Guinness Breweries, wouldn't allow him to publish the paper for fear of giving their competitors an advantage.
As the sample size increases, it approaches the normal distribution.
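You can watch that convergence happen (a sketch):

```r
# 97.5th percentile of the t distribution as degrees of freedom grow
sapply(c(2, 5, 30, 1000), function(df) qt(0.975, df))
qnorm(0.975) # about 1.96, the normal limit
```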
Recall z-tests from MTH 110. If you're told:

> Men's height is normally distributed with mean 70 inches and standard deviation 3 inches. Find the probability of meeting a man who is at least 7 feet tall.
```r
h.bar <- 70                   # mean height in inches
h.sd  <- 3                    # standard deviation in inches
h.1   <- 12 * 7               # 7 feet, converted to inches
z.1   <- (h.1 - h.bar) / h.sd # z-score of an 84-inch man
print(1 - pnorm(z.1))         # probability of being at least that tall
```
```
## [1] 1.530627e-06
```
R lets us skip some steps by entering `1 - pnorm(12*7, mean = 70, sd = 3)`.
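Equivalently, `lower.tail = FALSE` asks for the upper tail directly:

```r
1 - pnorm(12 * 7, mean = 70, sd = 3)                 # skip computing z by hand
pnorm(12 * 7, mean = 70, sd = 3, lower.tail = FALSE) # same answer, no subtraction
```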
The t-test works the same way, except we have to consider the degrees of freedom.
If we're testing something normally distributed but we had to estimate the standard deviation from the data (and might be underestimating it), we use the t-test.
Under \(H_0: \beta_i = 0\) and the assumption that our random variable (e.g. \(\hat{\beta_{wt}}\)) is normally distributed, we have to use the t-test because we don't know the true variance of this particular estimate.
Define:
\[ t = \frac{\hat{\beta_i} - \beta_{NULL}}{se(\hat{\beta_i})} \]
where the standard error \(se(\hat{\beta_i})\) is our estimate of the standard deviation of \(\hat{\beta_i}\).
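To tie the formula to the output below, a sketch reproducing the t value and p-value for wt in fit1:

```r
b.wt  <- coef(summary(fit1))["wt", "Estimate"]   # -3.8009
se.wt <- coef(summary(fit1))["wt", "Std. Error"] #  1.0662
t.wt  <- (b.wt - 0) / se.wt                      # beta_NULL is 0 here
t.wt                                             # about -3.565
2 * pt(-abs(t.wt), df = df.residual(fit1))       # two-sided p, about 0.00133
```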
```r
print(summary(fit1), concise = TRUE)
```
```
## 
## Call: lm(formula = mpg ~ hp + wt + disp, data = mtcars)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.105505   2.110815  17.579  < 1e-04 ***
## hp          -0.031157   0.011436  -2.724  0.01097 *  
## wt          -3.800891   1.066191  -3.565  0.00133 ** 
## disp        -0.000937   0.010350  -0.091  0.92851    
## 
## Residual standard error: 2.639 on 28 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8083 
## F-statistic: 44.57 on 3 and 28 DF,  p-value: 1e-04
```
```r
g <- ggplot(
  data = data.frame(yhat = fit1$fitted.values,
                    residuals = fit1$residuals),
  mapping = aes(x = yhat, y = residuals)
)
g + geom_point() + geom_smooth() + geom_hline(yintercept = 0, color = "red")
```
What graph are we drawing here?
```r
g <- ggplot(
  data = data.frame(yhat = fit1$fitted.values,
                    residuals = fit1$residuals),
  mapping = aes(x = yhat, y = residuals)
)
g + geom_point() + geom_smooth() + geom_hline(yintercept = 0, color = "red")
```
```r
reshape::tips
```
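Note that the raw data has no tip.pct column; it was presumably created earlier with something like this sketch (mutate comes from dplyr, loaded with the tidyverse above):

```r
tips <- reshape::tips                            # hypothetical reconstruction
tips <- mutate(tips, tip.pct = tip / total_bill) # tip as a share of the bill
```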
```r
ggpairs(tips, c(1, 8, 3), columnLabels = c("Total Bill", "Tip", "Sex"))
```
```r
head(tips)
```
```
##   total_bill  tip    sex smoker day   time size    tip.pct
## 1      16.99 1.01 Female     No Sun Dinner    2 0.05944673
## 2      10.34 1.66   Male     No Sun Dinner    3 0.16054159
## 3      21.01 3.50   Male     No Sun Dinner    3 0.16658734
## 4      23.68 3.31   Male     No Sun Dinner    2 0.13978041
## 5      24.59 3.61 Female     No Sun Dinner    4 0.14680765
## 6      25.29 4.71   Male     No Sun Dinner    4 0.18623962
```
```r
fit2 <- lm(tip.pct * 100 ~ total_bill + sex + smoker + time, tips)
print(summary(fit2), concise = TRUE)
```
```
## 
## Call: lm(formula = tip.pct * 100 ~ total_bill + sex + smoker + time, 
##     data = tips)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 20.82327    1.11236  18.720   <1e-04 ***
## total_bill  -0.23713    0.04275  -5.547   <1e-04 ***
## sexMale     -0.33116    0.79415  -0.417    0.677    
## smokerYes    0.73781    0.76519   0.964    0.336    
## timeLunch   -0.42812    0.85453  -0.501    0.617    
## 
## Residual standard error: 5.778 on 239 degrees of freedom
## Multiple R-squared:  0.1196, Adjusted R-squared:  0.1049 
## F-statistic: 8.117 on 4 and 239 DF,  p-value: 1e-04
```
```r
confint(fit2)
```
```
##                  2.5 %     97.5 %
## (Intercept) 18.6319977 23.0145485
## total_bill  -0.3213460 -0.1529132
## sexMale     -1.8955811  1.2332592
## smokerYes   -0.7695682  2.2451812
## timeLunch   -2.1114920  1.2552507
```