Library

library(tidyverse)

## ── Attaching packages ──────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ─────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(broom)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Data

After downloading from Moodle, import state_year_TFR.Rdata, long_PCINC_real.Rdata, state_region.Rdata.

Answer

load("state_year_TFR.Rdata")
load("long_PCINC_real.Rdata")
load("state_region.Rdata")

Task 1

Join these three dataframes into TFR_analysis. Make the following changes:

Make Region a factor
Subtract 2000 from Year
Divide PCINC_real by 1000.

glimpse TFR_analysis.

Answer

TFR_analysis =
state_year_TFR %>% 
   left_join(long_PCINC_real) %>% 
   left_join(state_region) %>% 
   mutate(Region = factor(Region),
          Year = Year - 2000,
          PCINC_real = PCINC_real/1000)

## Joining, by = c("State", "Year")

## Joining, by = "State"

glimpse(TFR_analysis)

## Rows: 800
## Columns: 5
## $ State      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Al…
## $ Year       <dbl> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 3…
## $ TFR        <dbl> 1.92205, 1.91695, 1.93785, 2.00555, 2.04290, 2.02050, 1.94…
## $ PCINC_real <dbl> 14.50707, 15.04169, 15.27757, 15.55976, 15.72169, 15.49470…
## $ Region     <fct> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4…

Model 1

Create Model 1 to predict TFR on the basis of the numerical variable Year. Use tidy() and glance() from the broom package to see your results.

Answer

Model_1 = lm(TFR~Year,data=TFR_analysis)
tidy(Model_1)

## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   2.18     0.0158      138.  0.      
## 2 Year         -0.0218   0.00138     -15.8 3.95e-49

glance(Model_1)

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.238         0.237 0.180      250. 3.95e-49     2   237. -468. -454.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

Model 1a

Repeat Model 1 but use factor(Year) instead of Year itself. Compare this model with Model 1.

Answer

Model_1a = lm(TFR~factor(Year),data=TFR_analysis)
tidy(Model_1a)

## # A tibble: 16 x 5
##    term           estimate std.error statistic  p.value
##    <chr>             <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)     2.04       0.0250    81.6   0.      
##  2 factor(Year)4   0.00668    0.0353     0.189 8.50e- 1
##  3 factor(Year)5   0.0116     0.0353     0.327 7.43e- 1
##  4 factor(Year)6   0.0692     0.0353     1.96  5.03e- 2
##  5 factor(Year)7   0.0803     0.0353     2.27  2.32e- 2
##  6 factor(Year)8   0.0399     0.0353     1.13  2.59e- 1
##  7 factor(Year)9  -0.0281     0.0353    -0.796 4.26e- 1
##  8 factor(Year)10 -0.0911     0.0353    -2.58  1.00e- 2
##  9 factor(Year)11 -0.125      0.0353    -3.53  4.33e- 4
## 10 factor(Year)12 -0.138      0.0353    -3.92  9.67e- 5
## 11 factor(Year)13 -0.156      0.0353    -4.42  1.11e- 5
## 12 factor(Year)14 -0.152      0.0353    -4.32  1.78e- 5
## 13 factor(Year)15 -0.166      0.0353    -4.71  2.95e- 6
## 14 factor(Year)16 -0.192      0.0353    -5.44  7.12e- 8
## 15 factor(Year)17 -0.245      0.0353    -6.94  8.01e-12
## 16 factor(Year)18 -0.283      0.0353    -8.01  4.08e-15

glance(Model_1a)

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.282         0.268 0.177      20.5 4.75e-47    16   260. -487. -407.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

Model 2

Use the single explanatory variable Region. Compare the results with Model 1a.

Answer

Model_2 = lm(TFR~Region,data=TFR_analysis)
tidy(Model_2)

## # A tibble: 4 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    1.72     0.0143     120.  0.      
## 2 Region2        0.293    0.0190      15.5 1.90e-47
## 3 Region3        0.221    0.0179      12.3 3.63e-32
## 4 Region4        0.330    0.0186      17.7 2.52e-59

glance(Model_2)

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.308         0.305 0.172      118. 2.91e-63     4   275. -540. -517.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

Model 3

Use PCINC_real as the single explanatory variable. How do the results compare with the earlier models? Does the p-value for the independent variable indicate significance?

Answer

Model_3 = lm(TFR~PCINC_real,data=TFR_analysis)
tidy(Model_3)

## # A tibble: 2 x 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   2.43     0.0420       57.8 2.60e-287
## 2 PCINC_real   -0.0252   0.00220     -11.5 2.57e- 28

glance(Model_3)

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.142         0.141 0.191      132. 2.57e-28     2   189. -372. -358.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

Model_4

Include all of the explanatory variables, with the factor version of Year. Is PCINC_real significant?

Answer

Model_4 = lm(TFR~PCINC_real + factor(Year) + Region,data=TFR_analysis)

tidy(Model_4)

## # A tibble: 20 x 5
##    term            estimate std.error statistic   p.value
##    <chr>              <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)     1.83       0.0458     39.9   2.15e-190
##  2 PCINC_real     -0.000874   0.00198    -0.441 6.59e-  1
##  3 factor(Year)4   0.00699    0.0268      0.261 7.94e-  1
##  4 factor(Year)5   0.0121     0.0268      0.451 6.52e-  1
##  5 factor(Year)6   0.0702     0.0268      2.61  9.10e-  3
##  6 factor(Year)7   0.0815     0.0269      3.03  2.51e-  3
##  7 factor(Year)8   0.0411     0.0269      1.53  1.27e-  1
##  8 factor(Year)9  -0.0274     0.0268     -1.02  3.06e-  1
##  9 factor(Year)10 -0.0902     0.0268     -3.36  8.08e-  4
## 10 factor(Year)11 -0.123      0.0269     -4.59  5.19e-  6
## 11 factor(Year)12 -0.137      0.0270     -5.06  5.17e-  7
## 12 factor(Year)13 -0.155      0.0269     -5.74  1.36e-  8
## 13 factor(Year)14 -0.150      0.0271     -5.55  3.89e-  8
## 14 factor(Year)15 -0.164      0.0274     -5.98  3.33e-  9
## 15 factor(Year)16 -0.189      0.0274     -6.92  9.26e- 12
## 16 factor(Year)17 -0.242      0.0275     -8.81  8.23e- 18
## 17 factor(Year)18 -0.280      0.0277    -10.1   1.52e- 22
## 18 Region2         0.290      0.0163     17.8   5.60e- 60
## 19 Region3         0.217      0.0168     12.9   1.44e- 34
## 20 Region4         0.327      0.0160     20.4   2.10e- 74

glance(Model_4)

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.590         0.580 0.134      59.1 1.09e-136    20   485. -927. -829.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

Model_5

Drop PCINC_real from the model and rerun it. Did the overall performance of the model change?

Answer

Model_5 = lm(TFR~factor(Year) + Region,data=TFR_analysis)

tidy(Model_5)

## # A tibble: 19 x 5
##    term           estimate std.error statistic  p.value
##    <chr>             <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)     1.81       0.0214    84.5   0.      
##  2 factor(Year)4   0.00668    0.0267     0.250 8.03e- 1
##  3 factor(Year)5   0.0116     0.0267     0.432 6.66e- 1
##  4 factor(Year)6   0.0692     0.0267     2.59  9.81e- 3
##  5 factor(Year)7   0.0803     0.0267     3.00  2.76e- 3
##  6 factor(Year)8   0.0399     0.0267     1.49  1.36e- 1
##  7 factor(Year)9  -0.0281     0.0267    -1.05  2.93e- 1
##  8 factor(Year)10 -0.0911     0.0267    -3.41  6.83e- 4
##  9 factor(Year)11 -0.125      0.0267    -4.67  3.58e- 6
## 10 factor(Year)12 -0.138      0.0267    -5.18  2.89e- 7
## 11 factor(Year)13 -0.156      0.0267    -5.84  7.60e- 9
## 12 factor(Year)14 -0.152      0.0267    -5.70  1.67e- 8
## 13 factor(Year)15 -0.166      0.0267    -6.22  8.16e-10
## 14 factor(Year)16 -0.192      0.0267    -7.18  1.57e-12
## 15 factor(Year)17 -0.245      0.0267    -9.17  4.09e-19
## 16 factor(Year)18 -0.283      0.0267   -10.6   1.52e-24
## 17 Region2         0.293      0.0147    19.9   1.06e-71
## 18 Region3         0.221      0.0139    15.9   1.87e-49
## 19 Region4         0.330      0.0145    22.8   2.18e-88

glance(Model_5)

## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.590         0.580 0.134      62.4 1.49e-137    19   485. -929. -835.
## # … with 2 more variables: deviance <dbl>, df.residual <int>

Detailed Results

Use augment() from the broom package to create detailed results in a dataframe Results. Add the variable State from TFR_analysis since augment only includes variables used in the model

Answer

Results = augment(Model_5)
Results$State = TFR_analysis$State
summary(Results)

##       TFR         factor.Year. Region     .fitted         .se.fit       
##  Min.   :1.440   3      : 50   1:144   Min.   :1.526   Min.   :0.02012  
##  1st Qu.:1.810   4      : 50   2:192   1st Qu.:1.858   1st Qu.:0.02012  
##  Median :1.940   5      : 50   3:256   Median :1.964   Median :0.02051  
##  Mean   :1.951   6      : 50   4:208   Mean   :1.951   Mean   :0.02059  
##  3rd Qu.:2.069   7      : 50           3rd Qu.:2.075   3rd Qu.:0.02069  
##  Max.   :2.679   8      : 50           Max.   :2.219   Max.   :0.02142  
##                  (Other):500                                            
##      .resid              .hat             .sigma          .cooksd         
##  Min.   :-0.37041   Min.   :0.02266   Min.   :0.1326   Min.   :6.000e-09  
##  1st Qu.:-0.08319   1st Qu.:0.02266   1st Qu.:0.1336   1st Qu.:1.328e-04  
##  Median :-0.02503   Median :0.02356   Median :0.1337   Median :4.950e-04  
##  Mean   : 0.00000   Mean   :0.02375   Mean   :0.1336   Mean   :1.277e-03  
##  3rd Qu.: 0.08079   3rd Qu.:0.02396   3rd Qu.:0.1337   3rd Qu.:1.433e-03  
##  Max.   : 0.48548   Max.   :0.02569   Max.   :0.1337   Max.   :1.716e-02  
##                                                                           
##    .std.resid         State          
##  Min.   :-2.8048   Length:800        
##  1st Qu.:-0.6297   Class :character  
##  Median :-0.1894   Mode  :character  
##  Mean   : 0.0000                     
##  3rd Qu.: 0.6118                     
##  Max.   : 3.6761                     
##

TFR Regressions

Library

Data

Answer

Task 1

Answer

Model 1

Answer

Model 1a

Answer

Model 2

Answer

Model 3

Answer

Model_4

Answer

Model_5

Answer

Detailed Results

Answer

Big Misses?