library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Attach the Stat2Data package and load its LeafWidth data set.
library("Stat2Data")
data("LeafWidth")

1.15

  1. Fit the regression of Width on Year. What is the fitted regression model?
# Regress Width on Year using the LeafWidth data and print the model summary.
library("Stat2Data")
data(LeafWidth)
fm <- lm(formula = Width ~ Year, data = LeafWidth)
summary(fm)
## 
## Call:
## lm(formula = Width ~ Year, data = LeafWidth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1214 -1.1253 -0.3136  0.9320  5.4144 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.723091   8.574977   4.399 1.61e-05 ***
## Year        -0.017560   0.004358  -4.029 7.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.424 on 250 degrees of freedom
## Multiple R-squared:  0.06098,    Adjusted R-squared:  0.05723 
## F-statistic: 16.24 on 1 and 250 DF,  p-value: 7.425e-05
# Scatterplot of Width against Year with the least-squares line (no confidence band).
ggplot(fm, aes(x = Year, y = Width)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

\[ \hat{Width}=37.723 - 0.01756*Year \]

  1. Interpret the coefficient of Year in the context of this setting
  1. What is the predicted width of these leaves in the year 1966?

\[ \begin{aligned} \hat{Width} &= 37.723 - 0.017560*(1966) \\ &\approx 3.2 \end{aligned} \]

1.23

  1. Scatterplot that includes the least squares line. Are there any obvious outliers or influential points in this plot?
# Regress Weight on WingLength for the Sparrows data and print the model summary.
library("Stat2Data")
data(Sparrows)
gm <- lm(formula = Weight ~ WingLength, data = Sparrows)
summary(gm)
## 
## Call:
## lm(formula = Weight ~ WingLength, data = Sparrows)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5440 -0.9935  0.0809  1.0559  3.4168 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.36549    0.95731   1.426    0.156    
## WingLength   0.46740    0.03472  13.463   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.4 on 114 degrees of freedom
## Multiple R-squared:  0.6139, Adjusted R-squared:  0.6105 
## F-statistic: 181.3 on 1 and 114 DF,  p-value: < 2.2e-16
# Scatterplot of Weight against WingLength with the least-squares line overlaid.
ggplot(gm, aes(x = WingLength, y = Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

  1. Histogram of the residuals
# Histogram of the residuals from `gm`.
# The residuals are placed in their own data frame so the aesthetic is looked
# up in the plot data, rather than pulling `gm$residuals` from the global
# environment and assuming it lines up row-for-row with `Sparrows`.
# The bin count is left at the ggplot2 default.
ggplot(data.frame(residual = residuals(gm)), aes(x = residual)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. Normal probability plot of the residuals
# Diagnostic panel 2 of plot.lm: normal Q-Q plot of the residuals of `gm`.
plot(x = gm, which = 2)

1.29

  1. Produce a scatterplot for predicting WetFrass based on Mass. Comment on any patterns.
# Regress WetFrass on Mass for the Caterpillars data and print the model summary.
library("Stat2Data")
data(Caterpillars)
hm <- lm(formula = WetFrass ~ Mass, data = Caterpillars)
summary(hm)
## 
## Call:
## lm(formula = WetFrass ~ Mass, data = Caterpillars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65454 -0.04796 -0.03336 -0.01014  1.50828 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.033198   0.027436    1.21    0.227    
## Mass        0.247696   0.007463   33.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3885 on 265 degrees of freedom
## Multiple R-squared:  0.8061, Adjusted R-squared:  0.8054 
## F-statistic:  1102 on 1 and 265 DF,  p-value: < 2.2e-16
# Scatterplot of WetFrass against Mass on the original (untransformed) scale.
ggplot(hm, aes(x = Mass, y = WetFrass)) +
  geom_point()

  1. Produce a similar plot using the log (base 10) transformed variables, LogWetFrass versus LogMass. Again, comment on any patterns.
# Scatterplot on the log (base 10) scale, as the exercise asks.
# The original used log(), which is the natural log; here the data set's own
# LogWetFrass and LogMass columns (named in the exercise and used for the
# model fit in the next part) are plotted instead, so plot and model agree.
ggplot(Caterpillars, aes(x = LogMass, y = LogWetFrass)) +
  geom_point()

  1. Would you prefer the plot in part (a) or part (b) to predict the amount of wet frass produced for caterpillars? Fit a linear regression model for the plot you chose and write down the prediction equation.
# Regress LogWetFrass on LogMass and print the model summary.
# Columns are named bare in the formula and resolved through `data =`, so the
# coefficient is labelled `LogMass` and predict(jm, newdata = ...) works;
# writing `Caterpillars$LogMass` in the formula bypasses `data` entirely.
jm <- lm(LogWetFrass ~ LogMass, data = Caterpillars)
summary(jm)
## 
## Call:
## lm(formula = LogWetFrass ~ LogMass, data = Caterpillars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5392 -0.2063  0.1525  0.2906  0.9517 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.73861    0.02977  -24.81   <2e-16 ***
## LogMass      1.05361    0.02054   51.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4089 on 265 degrees of freedom
## Multiple R-squared:  0.9085, Adjusted R-squared:  0.9082 
## F-statistic:  2632 on 1 and 265 DF,  p-value: < 2.2e-16
# Scatterplot of the log-transformed variables with the least-squares line;
# aesthetics refer to columns of the plot data rather than `Caterpillars$...`.
ggplot(Caterpillars, aes(x = LogMass, y = LogWetFrass)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

\[ \hat{LogWetFrass} = -0.73861 + 1.05361*LogMass \]

  1. Add a plotting symbol for the grouping variable Instar to the scatterplot that you chose in (c). Does the linear trend appear consistent for all five stages of a caterpillar’s life?
# Log-log scatterplot with one colour and plotting symbol per Instar stage.
# Instar is wrapped in factor() so it is treated as a grouping variable with
# discrete colours/shapes; a numeric code would get a continuous gradient.
# NOTE(review): assumes Instar is stored as a numeric 1-5 code - confirm.
# Aesthetics name columns of the plot data instead of `Caterpillars$...`.
ggplot(
  Caterpillars,
  aes(x = LogMass, y = LogWetFrass,
      colour = factor(Instar), shape = factor(Instar))
) +
  geom_point() +
  labs(colour = "Instar", shape = "Instar")

  1. Repeat part (d) using plotting symbols (or colors) for the groups defined by the free growth period variable Fgp. Does the linear trend appear to be better when the caterpillars are in a free growth period?
# Log-log scatterplot coloured by the free-growth-period variable Fgp.
# Aesthetics name columns of the plot data instead of `Caterpillars$...`,
# so the mapping stays tied to the data frame given to ggplot().
ggplot(Caterpillars, aes(x = LogMass, y = LogWetFrass, colour = Fgp)) +
  geom_point()

1.33

  1. Plot price (in cents) versus Year and comment on any patterns
# Fit Price on Year for all of USstamps and plot the points used by the fit.
library("Stat2Data")
data(USstamps)
km <- lm(formula = Price ~ Year, data = USstamps)
ggplot(km, aes(x = Year, y = Price)) +
  geom_point()

  1. Regular increases in the postal rates started in 1958. Remove the first four observations from the dataset and fit a regression line for predicting Price from Year. What is the equation of the regression line?
# Drop the first four observations, as the exercise asks.
# A negative index removes exactly those rows whatever the length of the data;
# the original `slice(5:45)` hard-coded an upper bound beyond the data and
# relied on slice() ignoring the out-of-range positions.
miniUSstamps <- USstamps %>%
  slice(-(1:4))
# Fit Price on Year for the reduced data and plot it with the least-squares line.
km1 <- lm(Price ~ Year, data = miniUSstamps)
ggplot(km1, aes(x = Year, y = Price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Fit Price on Year for the reduced stamp data and print the model summary.
qm <- lm(formula = Price ~ Year, data = miniUSstamps)
summary(qm)
## 
## Call:
## lm(formula = Price ~ Year, data = miniUSstamps)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9232 -0.9478  0.1195  1.1899  4.5325 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.647e+03  4.686e+01  -35.15   <2e-16 ***
## Year         8.410e-01  2.357e-02   35.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.737 on 19 degrees of freedom
## Multiple R-squared:  0.9853, Adjusted R-squared:  0.9845 
## F-statistic:  1273 on 1 and 19 DF,  p-value: < 2.2e-16

\[ \hat{Price} = -1647 + 0.841*Year \]

  1. Analyze appropriate residual plots for the linear model relating stamp price and year. Are the conditions for the regression met?
# Diagnostic panel 1 of plot.lm for `qm`: residuals versus fitted values.
plot(x = qm, which = 1)

# Diagnostic panel 2 of plot.lm for `qm`: normal Q-Q plot of the residuals.
plot(x = qm, which = 2)

  1. Identify any unusual residuals

1.35

  1. Construct a scatterplot to examine the relationship between the initial height in 1990 and the height in 1996. Comment on any relationship seen.
# Fit Hgt96 on Hgt90 for the Pines data and plot the points used by the fit.
library("Stat2Data")
data(Pines)
wm <- lm(formula = Hgt96 ~ Hgt90, data = Pines)
ggplot(wm, aes(x = Hgt90, y = Hgt96)) +
  geom_point()

  1. Fit a least squares line for predicting the height in 1996 from the initial height in 1990.
# Print the coefficient table and fit statistics for `wm`.
summary(object = wm)
## 
## Call:
## lm(formula = Hgt96 ~ Hgt90, data = Pines)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -275.293  -42.798    7.208   46.332  181.457 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 241.2846     8.6209   27.99  < 2e-16 ***
## Hgt90         2.2504     0.4311    5.22 2.28e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.02 on 805 degrees of freedom
##   (193 observations deleted due to missingness)
## Multiple R-squared:  0.03274,    Adjusted R-squared:  0.03154 
## F-statistic: 27.25 on 1 and 805 DF,  p-value: 2.276e-07
# Scatterplot of Hgt96 against Hgt90 with the least-squares line overlaid.
ggplot(wm, aes(x = Hgt90, y = Hgt96)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

\[ \hat{Hgt96} = 241.2846 + 2.2504*Hgt90 \]

  1. Are you satisfied with the fit of this simple linear model? Explain.