Notes 463 Mar 3

Harold Nelson

3/3/2021

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(socviz)

Check mpg

Examine the dataframe mpg, which is in ggplot2 using glimpse and summary.

Answer

glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", …
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", …
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2…
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 20…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8,…
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "aut…
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "…
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, …
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, …
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "…
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "…
summary(mpg)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00

Demo Basic Regression

Create model da using mpg. Use disp to predict cty.

Answer

da = lm(cty~displ,data = mpg)

Model Structure

Examine the structure of da.

Answer

str(da)
## List of 12
##  $ coefficients : Named num [1:2] 25.99 -2.63
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "displ"
##  $ residuals    : Named num [1:234] -3.257 -0.257 -0.731 0.269 -2.626 ...
##   ..- attr(*, "names")= chr [1:234] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:234] -257.893 -51.875 -0.527 0.473 -2.425 ...
##   ..- attr(*, "names")= chr [1:234] "(Intercept)" "displ" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:234] 21.3 21.3 20.7 20.7 18.6 ...
##   ..- attr(*, "names")= chr [1:234] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:234, 1:2] -15.2971 0.0654 0.0654 0.0654 0.0654 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:234] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "displ"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.07 1.08
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 232
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = cty ~ displ, data = mpg)
##  $ terms        :Classes 'terms', 'formula'  language cty ~ displ
##   .. ..- attr(*, "variables")= language list(cty, displ)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "cty" "displ"
##   .. .. .. ..$ : chr "displ"
##   .. ..- attr(*, "term.labels")= chr "displ"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(cty, displ)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "cty" "displ"
##  $ model        :'data.frame':   234 obs. of  2 variables:
##   ..$ cty  : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##   ..$ displ: num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language cty ~ displ
##   .. .. ..- attr(*, "variables")= language list(cty, displ)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "cty" "displ"
##   .. .. .. .. ..$ : chr "displ"
##   .. .. ..- attr(*, "term.labels")= chr "displ"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(cty, displ)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "cty" "displ"
##  - attr(*, "class")= chr "lm"

Note that the model is a list. We can see a few potentially useful item - residuals and fitted.values. Get these out of the list using the $ operator and use str() to see what they are. Call the new vectors resa and fa,

Answer

resa = da$residuals
str(resa)
##  Named num [1:234] -3.257 -0.257 -0.731 0.269 -2.626 ...
##  - attr(*, "names")= chr [1:234] "1" "2" "3" "4" ...
fa = da$fitted.values
str(fa)
##  Named num [1:234] 21.3 21.3 20.7 20.7 18.6 ...
##  - attr(*, "names")= chr [1:234] "1" "2" "3" "4" ...

Augmentation

Create a new dataframe dfa using cbind to combine mpg with resa and fa. We want to examine the relationships among the residuals and the other items in this augmented dataframe. Look at the first few observation to verify your work.

Answer

dfa = cbind(mpg,resa,fa)
head(dfa)
##   manufacturer model displ year cyl      trans drv cty hwy fl   class
## 1         audi    a4   1.8 1999   4   auto(l5)   f  18  29  p compact
## 2         audi    a4   1.8 1999   4 manual(m5)   f  21  29  p compact
## 3         audi    a4   2.0 2008   4 manual(m6)   f  20  31  p compact
## 4         audi    a4   2.0 2008   4   auto(av)   f  21  30  p compact
## 5         audi    a4   2.8 1999   6   auto(l5)   f  16  26  p compact
## 6         audi    a4   2.8 1999   6 manual(m5)   f  18  26  p compact
##         resa       fa
## 1 -3.2566002 21.25660
## 2 -0.2566002 21.25660
## 3 -0.7305039 20.73050
## 4  0.2694961 20.73050
## 5 -2.6261185 18.62612
## 6 -0.6261185 18.62612

It is useful to keep a basic equation in mind.

\[ Actual = Fitted + Residual\]

From this it follows that

\[ Residual = Actual - Fitted\] So a positive residual means that the actual value is greater than the fitted value.

The term “predicted” is just another word for “fitted”.

Residual and Fitted

Visualize the relationship between the fitted values and the residuals in this cases.

Answer

dfa %>% 
  ggplot(aes(fa,resa)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

The issue we see is that the value of the residual depends on the fitted value. This means that there is unexploited information in the simple linear model.

We might try a quadratic term in the model. Call this second model db. Create it and the corresponding dfb.

Answer

db = lm(cty~displ+I(displ^2),data = mpg)

resb = db$residuals
fb = db$fitted.values
dfb = cbind(mpg,resb,fb)

dfb %>% 
  ggplot(aes(fb,resb)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

This is better since I wouldn’t feel so strongly that I could improve on the fitted value.

What about the other variables in the data. Could I predict the residual using the variable class?

Visualize the relationship between the categorical variable class and the residuals using one or more appropriate methods.

Answer

dfb %>% ggplot(aes(factor(class),resb)) + geom_boxplot() +
geom_jitter() +
geom_hline(yintercept = 0,color="red")

I would be inclined to do something about the 2seaters and the pickups. I would not add the categorical variable class since most types don’t have aproblem.

Create dummy variables for these two types and use them to create a new dataframe mpgc. Run tables to compare class with the new variables.

Answer

mpgc = mpg %>% 
  mutate(d2seat = class == "2seater",
         dpu = class == "pickup")

table(mpgc$class,mpgc$d2seat)
##             
##              FALSE TRUE
##   2seater        0    5
##   compact       47    0
##   midsize       41    0
##   minivan       11    0
##   pickup        33    0
##   subcompact    35    0
##   suv           62    0
table(mpgc$class,mpgc$dpu)
##             
##              FALSE TRUE
##   2seater        5    0
##   compact       47    0
##   midsize       41    0
##   minivan       11    0
##   pickup         0   33
##   subcompact    35    0
##   suv           62    0

Now create the c version of our work with the new dummies.

Answer

dc = lm(cty~displ+
        I(displ^2) + 
          d2seat +
          dpu,
        data = mpgc)

resc = dc$residuals
fc = dc$fitted.values
dfc = cbind(mpg,resc,fc)

dfc %>% 
  ggplot(aes(fc,resc)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Now visualize the relationship between the categorical variable class and the residuals again.

Answer

dfc %>% ggplot(aes(factor(class),resc)) + geom_boxplot() +
geom_jitter() +
geom_hline(yintercept = 0,color="red")