Thirty-nine MBA students were asked about happiness and how this related to their income and social life. The data are found in ‘happy’. Fit a regression model with happy
as the response and the other four variables as predictors.
# Loading data
data(happy, package='faraway')
lmod <- lm(happy ~ ., happy)
summary(lmod)
##
## Call:
## lm(formula = happy ~ ., data = happy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7186 -0.5779 -0.1172 0.6340 2.0651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.072081 0.852543 -0.085 0.9331
## money 0.009578 0.005213 1.837 0.0749 .
## sex -0.149008 0.418525 -0.356 0.7240
## love 1.919279 0.295451 6.496 1.97e-07 ***
## work 0.476079 0.199389 2.388 0.0227 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.058 on 34 degrees of freedom
## Multiple R-squared: 0.7102, Adjusted R-squared: 0.6761
## F-statistic: 20.83 on 4 and 34 DF, p-value: 9.364e-09
Only love was significant at the 1% level.
table
function to produce a numerical summary of the response. What assumption used to perform the t-tests seems questionable in light of this summary?# Page 36 (Not sure if this is correct.)
nullmod <- lm(happy ~ 1, happy)
anova(nullmod, lmod)
## Analysis of Variance Table
##
## Model 1: happy ~ 1
## Model 2: happy ~ money + sex + love + work
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 38 131.436
## 2 34 38.087 4 93.349 20.833 9.364e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(happy ~ ., data=happy)
## Call:
## aov(formula = happy ~ ., data = happy)
##
## Terms:
## money sex love work Residuals
## Sum of Squares 9.64865 1.93974 75.37413 6.38635 38.08703
## Deg. of Freedom 1 1 1 1 34
##
## Residual standard error: 1.058398
## Estimated effects may be unbalanced
anova1 <- summary(aov(happy ~ ., data = happy))
anova1
## Df Sum Sq Mean Sq F value Pr(>F)
## money 1 9.65 9.65 8.613 0.00594 **
## sex 1 1.94 1.94 1.732 0.19701
## love 1 75.37 75.37 67.286 1.43e-09 ***
## work 1 6.39 6.39 5.701 0.02266 *
## Residuals 34 38.09 1.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I don’t know the answer to this question.
money
predictor.lmod <- lm(happy ~ money, happy)
lms <- summary(lmod)
nreps <- 4000
set.seed(123)
fstats <- numeric(nreps)
for(i in 1:nreps){
lmods <- lm(sample(happy) ~ money, happy)
fstats[i] <- summary(lmods)$fstat[1]
}
mean(fstats > lms$fstat[1])
## [1] 0.09425
Estimated p-value using the permutation test is 0.09425.
hist(fstats, prob = TRUE, xlim = c(0,5), breaks = 500)
grid <- seq(-3, 3, length = 300)
to create a grid of values, then use the dt
function to compute the t-density on this grid and the lines
function to superimpose the result.grid <- seq(-3, 3, length = 300)
hist(fstats, prob = TRUE, xlim = c(0,5), breaks = 500)
lines(dt(grid,df=34), col='blue')
???? I don’t know the answer to this question. Why isn’t there a solutions manual?
lmod <- lm(happy ~ ., happy)
set.seed(123)
nb <- 4000
coefmat <- matrix(NA, nb, 5)
resids <- residuals(lmod)
preds <- fitted(lmod)
for(i in 1:nb){
Happy <- preds + sample(resids, rep=TRUE)
bmod <- update(lmod, Happy ~ .)
coefmat[i,] <- coef(bmod)
}
colnames(coefmat) <- c("Intercept", colnames(happy[2:5]))
coefmat <- data.frame(coefmat)
apply(coefmat,2,function(x) quantile(x,c(0.025,0.975)))
## Intercept money sex love work
## 2.5% -1.657002 0.000297174 -0.9016588 1.383265 0.1178916
## 97.5% 1.450177 0.019075282 0.6205241 2.459571 0.8484568
I believe so.
For the fat
data used in this chapter, a smaller model using only age
, weight
, height
, and abdom
was proposed on the grounds that these predictors are either known by the individual or easily measured.
data(fat, package='faraway')
lmod_fat <- lm(brozek ~ age + weight + height + abdom, fat)
summary(lmod_fat)
##
## Call:
## lm(formula = brozek ~ age + weight + height + abdom, data = fat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5105 -2.9346 0.0087 2.8942 9.4179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.769636 6.541902 -5.009 1.04e-06 ***
## age -0.007051 0.024342 -0.290 0.772
## weight -0.123722 0.025046 -4.940 1.44e-06 ***
## height -0.116694 0.082727 -1.411 0.160
## abdom 0.889704 0.067267 13.226 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.126 on 247 degrees of freedom
## Multiple R-squared: 0.7211, Adjusted R-squared: 0.7166
## F-statistic: 159.7 on 4 and 247 DF, p-value: < 2.2e-16
lmod_fat_full <- lm(brozek ~ ., fat)
anova(lmod_fat, lmod_fat_full)
## Analysis of Variance Table
##
## Model 1: brozek ~ age + weight + height + abdom
## Model 2: brozek ~ siri + density + age + weight + height + adipos + free +
## neck + chest + abdom + hip + thigh + knee + ankle + biceps +
## forearm + wrist
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 247 4205.0
## 2 234 6.8 13 4198.2 11096 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There appears to be a statistical difference. ?Perhaps we cannot justify the smaller model.?
x <- model.matrix(lmod_fat_full)
(x0 <- apply(x,2,median))
## (Intercept) siri density age weight height
## 1.0000 19.2000 1.0549 43.0000 176.5000 70.0000
## adipos free neck chest abdom hip
## 25.0500 141.5500 38.0000 99.6500 90.9500 99.3000
## thigh knee ankle biceps forearm wrist
## 59.0000 38.5000 22.8000 32.0500 28.7000 18.3000
(y0 <- sum(x0 * coef(lmod_fat_full)))
## [1] 18.99539
Predicted value for the median predictor values: 18.99539
predict(lmod_fat_full, new=data.frame(t(x0)))
## 1
## 18.99539
predict(lmod_fat_full, new=data.frame(t(x0)), interval="prediction")
## fit lwr upr
## 1 18.99539 18.65831 19.33247
predict(lmod_fat_full, new=data.frame(t(x0)), interval="confidence")
## fit lwr upr
## 1 18.99539 18.96985 19.02094
They do not differ a substantial amount.
new_df_25_50 <- fat[25:50,c(4,5,6,11)]
predict(lmod_fat, new=data.frame(new_df_25_50), interval="prediction")
## fit lwr upr
## 25 8.298418 0.08103281 16.51580
## 26 9.903086 1.71322935 18.09294
## 27 9.216292 1.01552526 17.41706
## 28 19.740864 11.48411519 27.99761
## 29 8.747253 0.49903716 16.99547
## 30 13.376012 5.19875861 21.55326
## 31 14.797935 6.62619278 22.96968
## 32 14.065015 5.88455369 22.24548
## 33 8.315872 0.10061987 16.53112
## 34 21.038046 12.84083747 29.23525
## 35 30.623842 22.38906643 38.85862
## 36 36.201628 27.83841989 44.56484
## 37 23.528151 15.36925601 31.68705
## 38 22.473944 14.31151961 30.63637
## 39 45.310482 36.47199166 54.14897
## 40 30.120799 21.92013711 38.32146
## 41 38.663109 30.34468421 46.98153
## 42 30.910811 20.40248479 41.41914
## 43 30.810797 22.61170204 39.00989
## 44 25.164766 16.99699489 33.33254
## 45 11.141535 2.93651578 19.34655
## 46 10.568910 2.38586739 18.75195
## 47 8.125807 -0.07361926 16.32523
## 48 10.999713 2.82955826 19.16987
## 49 16.325612 8.12181572 24.52941
## 50 5.970276 -2.26886915 14.20942
There are two negative values in observation 47 and 50. You can’t have negative fat.
new_fat <- fat[c(25:46, 48, 49),]
predict(lmod_fat, new=data.frame(new_fat), interval="prediction")
## fit lwr upr
## 25 8.298418 0.08103281 16.51580
## 26 9.903086 1.71322935 18.09294
## 27 9.216292 1.01552526 17.41706
## 28 19.740864 11.48411519 27.99761
## 29 8.747253 0.49903716 16.99547
## 30 13.376012 5.19875861 21.55326
## 31 14.797935 6.62619278 22.96968
## 32 14.065015 5.88455369 22.24548
## 33 8.315872 0.10061987 16.53112
## 34 21.038046 12.84083747 29.23525
## 35 30.623842 22.38906643 38.85862
## 36 36.201628 27.83841989 44.56484
## 37 23.528151 15.36925601 31.68705
## 38 22.473944 14.31151961 30.63637
## 39 45.310482 36.47199166 54.14897
## 40 30.120799 21.92013711 38.32146
## 41 38.663109 30.34468421 46.98153
## 42 30.910811 20.40248479 41.41914
## 43 30.810797 22.61170204 39.00989
## 44 25.164766 16.99699489 33.33254
## 45 11.141535 2.93651578 19.34655
## 46 10.568910 2.38586739 18.75195
## 48 10.999713 2.82955826 19.16987
## 49 16.325612 8.12181572 24.52941
Use the odor
dataset with odor as the response and temp
as a predictor. Consider all possible models that also include all, some or none of the other two predictors. Report the coefficience for temperature, its standard error, t-statistics and p-value in each case. Discuss what stays the same, what changes and why. Which model is best?
data(odor, package='faraway')
require(faraway)
## Loading required package: faraway
odor_lm_1 <- lm(odor ~ temp, odor)
odor_lm_all <- lm(odor ~ ., odor)
odor_lm_2 <- lm(odor ~ temp + gas, odor)
odor_lm_none <- lm(odor ~ 1, odor)
sumary(odor_lm_none)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.200 10.097 1.5054 0.1544
##
## n = 15, p = 1, Residual SE = 39.10462, R-Squared = 0
sumary(odor_lm_1)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.200 10.186 1.4922 0.1595
## temp -12.125 13.948 -0.8693 0.4004
##
## n = 15, p = 2, Residual SE = 39.45033, R-Squared = 0.05
sumary(odor_lm_2)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.2000 9.9778 1.5234 0.1536
## temp -12.1250 13.6627 -0.8875 0.3923
## gas -17.0000 13.6627 -1.2443 0.2372
##
## n = 15, p = 3, Residual SE = 38.64397, R-Squared = 0.16
sumary(odor_lm_all)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.2000 9.2981 1.6347 0.1304
## temp -12.1250 12.7320 -0.9523 0.3614
## gas -17.0000 12.7320 -1.3352 0.2088
## pack -21.3750 12.7320 -1.6788 0.1213
##
## n = 15, p = 4, Residual SE = 36.01155, R-Squared = 0.33
Intercept and coefficient stays the same. The p-values and t-values appear to change as more independent variables are added on. Simply going by R-Squared, the full model appears to be doing the best.
Using the infmort
data, find a simple model for the infant mortality in terms of the other variables. Be alert for transformations and unusual points. Interpret your model by explaining what the regresion parameter estimates mean.
data(infmort, package='faraway')
inf_lm <- lm(mortality ~ ., infmort)
sumary(inf_lm)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1516e+02 2.9743e+01 7.2340 1.193e-10
## regionEurope -1.0149e+02 3.0728e+01 -3.3027 0.0013506
## regionAsia -4.5885e+01 2.0144e+01 -2.2779 0.0249765
## regionAmericas -8.3649e+01 2.1798e+01 -3.8374 0.0002239
## income -5.2900e-03 7.4041e-03 -0.7145 0.4766846
## oilno oil exports -7.8335e+01 2.8910e+01 -2.7096 0.0079918
##
## n = 101, p = 6, Residual SE = 77.35705, R-Squared = 0.31
Mortality improves in areas like Asia and Americas, and as well as being from a no oil export country (statistically significant). However, can’t infer causation.
The manager of the purchasing department of a large company would like to develop a regression model to predict the average amount of time it takes to process a given number of invoices. Over a 30-day period, data are collected on the number of invoices processed and the total time taken (in hours). The data are available on the book web site in the file invoices.txt. The following model was fit to the data: \(Y = \beta_0 + \beta_1 x + e\) where Y is the processing time and x is the number of invoices. A plot of the data and the fitted model can be found in Figure 2.7. Utilizing the output from the fit of this model provided below, complete the following tasks.
invoices <- read.table('http://www.stat.tamu.edu/~sheather/book/docs/datasets/invoices.txt', header = TRUE)
plot(invoices$Invoices, invoices$Time, xlab="Number of Invoice", ylab="Processing Time")
invoice_lm <- lm(Time ~ Invoices, invoices)
abline(invoice_lm)
head(invoices)
## Day Invoices Time
## 1 1 149 2.1
## 2 2 60 1.8
## 3 3 188 2.3
## 4 4 23 0.8
## 5 5 201 2.7
## 6 6 58 1.0
summary(invoice_lm)
##
## Call:
## lm(formula = Time ~ Invoices, data = invoices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59516 -0.27851 0.03485 0.19346 0.53083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6417099 0.1222707 5.248 1.41e-05 ***
## Invoices 0.0112916 0.0008184 13.797 5.17e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3298 on 28 degrees of freedom
## Multiple R-squared: 0.8718, Adjusted R-squared: 0.8672
## F-statistic: 190.4 on 1 and 28 DF, p-value: 5.175e-14
((confint(invoice_lm, '(Intercept)', level=0.95)))
## 2.5 % 97.5 %
## (Intercept) 0.3912496 0.8921701
#Solution from jolars github responsitory - solution manual
beta0 <- 0.6417099
beta0_se <- 0.122707
beta0_t <- 5.248
beta0_margin <- 1.96 * beta0_se
(beta0_95 <- c(beta0 - beta0_margin, beta0 + beta0_margin))
## [1] 0.4012042 0.8822156
confint(invoice_lm, 'Invoices', levels=0.95)
## 2.5 % 97.5 %
## Invoices 0.009615224 0.01296806
If alpha = 0.05, then 95% confidence interval for Invoices parameter is 0.009615224 to 0.01296806. Thus the value 0.01 falls within this, thus we reject the null hypothesis.
# Solutions from jolars github responsitory - solution manual
beta <- 0.01
beta_obs_se <- 0.0008184
beta_obs <- 0.0112916
tval <- (beta - beta_obs) / beta_obs_se
(pobs <- 2 * pt(abs(tval), 30 - 1, lower.tail = FALSE))
## [1] 0.1253666
(predict(invoice_lm, data.frame(Invoices = 130), interval="prediction"))
## fit lwr upr
## 1 2.109624 1.422947 2.7963
# Solutions from jolars github responsitory - solution manual
beta0 <- 0.6417099
beta1 <- 0.0112916
rse <- 0.3298
n <- 30
df <- n - 2
rss <- rse^2 * df
mse <- rss / n
time <- beta0 + beta1 * 130
err <- qt(0.975, 28) * sqrt(mse) * sqrt(1 + 1 / n) # since x0 = xbar
upr <- time + err
lwr <- time - err
(lwr)
## [1] 1.446172
(upr)
## [1] 2.773064