Advanced Statistical Methods in Policy Analysis

Amit Patel, Jose Magallanes, Scott Atherley

Session 1: Regression Basics

  1. GET DATA
# Location of the course materials and name of the Stata data file to load.
folder <- "~/Documents/CoursesMPP/AdvStats/s1"
filename <- "salary.dta"

library(foreign)  # provides read.dta() for Stata .dta files

# The file is read relative to the course folder.
setwd(folder)
data <- read.dta(filename)
# attach() exposes the columns as bare names (salnow, sex, jobcat, ...);
# the statements further down rely on that. Columns added to `data` after this
# point are NOT picked up by the attached copy.
attach(data)
  2. SEE WHAT'S IN THE DATA
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Compact overview of the data frame: one line per column showing its name,
# storage type and the first few values.
glimpse(data)
## Variables:
## $ id       (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ salbeg   (int) 8400, 24000, 10200, 8700, 17400, 12996, 6900, 5400, 5...
## $ sex      (fctr) males, males, males, males, males, males, males, mal...
## $ time     (int) 81, 73, 83, 93, 83, 80, 79, 67, 96, 77, 84, 88, 93, 9...
## $ age      (dbl) 28.50, 40.33, 31.08, 31.17, 41.92, 29.50, 28.00, 28.7...
## $ salnow   (int) 16080, 41400, 21960, 19200, 28350, 27250, 16080, 1410...
## $ edlevel  (int) 16, 16, 15, 16, 19, 18, 15, 15, 15, 12, 15, 12, 17, 1...
## $ work     (dbl) 0.25, 12.50, 4.08, 1.83, 13.00, 2.42, 3.17, 0.50, 1.1...
## $ jobcat   (fctr) college trainee, exempt employee, exempt employee, c...
## $ minority (fctr) white, white, white, white, white, white, white, whi...
## $ sexrace  (fctr) white males, white males, white males, white males, ...
  3. Describing the data
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Per-variable summary: n, missing and unique counts for every column, plus
# mean and quantiles for numeric columns and level frequencies for factors.
describe(data)
## data 
## 
##  11  Variables      474  Observations
## ---------------------------------------------------------------------------
## id 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0     474   237.5   24.65   48.30  119.25  237.50  355.75 
##     .90     .95 
##  426.70  450.35 
## 
## lowest :   1   2   3   4   5, highest: 470 471 472 473 474 
## ---------------------------------------------------------------------------
## salbeg 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0      90    6806    4080    4380    4995    6000    6996 
##     .90     .95 
##   11000   13200 
## 
## lowest :  3600  3900  4020  4080  4200
## highest: 18000 18996 21000 24000 31992 
## ---------------------------------------------------------------------------
## sex 
##       n missing  unique 
##     474       0       2 
## 
## males (258, 54%), females (216, 46%) 
## ---------------------------------------------------------------------------
## time 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0      36   81.11      65      67      72      81      90 
##     .90     .95 
##      94      97 
## 
## lowest : 63 64 65 66 67, highest: 94 95 96 97 98 
## ---------------------------------------------------------------------------
## age 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0     259   37.19   24.42   25.19   28.50   32.00   45.98 
##     .90     .95 
##   56.84   60.67 
## 
## lowest : 23.00 23.25 23.33 23.42 23.58
## highest: 63.75 63.83 63.92 64.25 64.50 
## ---------------------------------------------------------------------------
## salnow 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0     221   13768    7797    8418    9600   11550   14775 
##     .90     .95 
##   23757   28000 
## 
## lowest :  6300  6360  6480  6540  6600
## highest: 40000 41400 41500 44250 54000 
## ---------------------------------------------------------------------------
## edlevel 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0      10   13.49       8       8      12      12      15 
##     .90     .95 
##      17      19 
## 
##            8  12 14  15 16 17 18 19 20 21
## Frequency 53 190  6 116 59 11  9 27  2  1
## %         11  40  1  24 12  2  2  6  0  0
## ---------------------------------------------------------------------------
## work 
##       n missing  unique    Mean     .05     .10     .25     .50     .75 
##     474       0     208   7.989  0.1105  0.4200  1.6025  4.5800 11.5600 
##     .90     .95 
## 21.6750 26.7855 
## 
## lowest :  0.00  0.17  0.25  0.33  0.42
## highest: 36.50 37.00 37.58 38.33 39.67 
## ---------------------------------------------------------------------------
## jobcat 
##       n missing  unique 
##     474       0       7 
## 
## clerical (227, 48%), office trainee (136, 29%) 
## security officer (27, 6%) 
## college trainee (41, 9%) 
## exempt employee (32, 7%), mba trainee (5, 1%) 
## technical (6, 1%) 
## ---------------------------------------------------------------------------
## minority 
##       n missing  unique 
##     474       0       2 
## 
## white (370, 78%), nonwhite (104, 22%) 
## ---------------------------------------------------------------------------
## sexrace 
##       n missing  unique 
##     474       0       4 
## 
## white males (194, 41%), minority males (64, 14%) 
## white females (176, 37%) 
## minority females (40, 8%) 
## ---------------------------------------------------------------------------
# Absolute frequency of each job category.
table(jobcat)
## jobcat
##         clerical   office trainee security officer  college trainee 
##              227              136               27               41 
##  exempt employee      mba trainee        technical 
##               32                5                6
# Cumulative frequency, accumulated in factor-level order.
cumsum(table(jobcat))
##         clerical   office trainee security officer  college trainee 
##              227              363              390              431 
##  exempt employee      mba trainee        technical 
##              463              468              474
# Relative frequency (share of all observations) of each job category.
prop.table(table(jobcat))
## jobcat
##         clerical   office trainee security officer  college trainee 
##          0.47890          0.28692          0.05696          0.08650 
##  exempt employee      mba trainee        technical 
##          0.06751          0.01055          0.01266
# Absolute, cumulative and relative frequencies of jobcat side by side.
# The table is tabulated once and reused; local() keeps the helper variable
# out of the global workspace while still printing the resulting matrix.
local({
  counts <- table(jobcat)
  cbind(
    Freq = counts,
    Cumul = cumsum(counts),
    relative = prop.table(counts)
  )
})
##                  Freq Cumul relative
## clerical          227   227  0.47890
## office trainee    136   363  0.28692
## security officer   27   390  0.05696
## college trainee    41   431  0.08650
## exempt employee    32   463  0.06751
## mba trainee         5   468  0.01055
## technical           6   474  0.01266
# graph box salnow, over(jobcat)
# Distribution of current salary within each job category. Because jobcat is a
# factor, the formula method of plot() draws one box plot per level.
plot(salnow ~ jobcat)

Figure (plot of chunk unnamed-chunk-5): box plots of salnow for each jobcat category.

# gen jobcat_L = (jobcat == 1 | jobcat == 2 | jobcat == 3)
#
# Collapse the seven job categories into three 0/1 indicator columns based on
# the factor's integer codes (i.e. the level order shown by table(jobcat)):
#   jobcat_L: codes 1-3, jobcat_M: codes 4-6, jobcat_H: code 7.
# The columns are stored in `data`; the attached copy does not see them.
data[["jobcat_L"]] <- as.numeric(as.integer(jobcat) <= 3L)
data[["jobcat_M"]] <- as.numeric(as.integer(jobcat) %in% 4:6)
data[["jobcat_H"]] <- as.numeric(as.integer(jobcat) == 7L)
# Sanity check: the 1s should equal the first three category counts
# (227 + 136 + 27 = 390).
table(data$jobcat_L)
## 
##   0   1 
##  84 390
# Sanity check: the 1s should equal categories four to six (41 + 32 + 5 = 78).
table(data$jobcat_M)
## 
##   0   1 
## 396  78
# Sanity check: the 1s should equal the seventh category count (6).
table(data$jobcat_H)
## 
##   0   1 
## 468   6

Regression

# regress salnow sex age work minority salbeg time jobcat_L jobcat_M
#
# OLS of current salary on sex, age, work experience, minority status,
# starting salary, time and the low/medium job-category dummies. jobcat_H is
# left out of the formula, so it acts as the reference category.
# `data` is passed explicitly because the dummy columns live only in the data
# frame, not in the attached copy.
reg1 <- lm(
  salnow ~ sex + age + work + minority + salbeg + time + jobcat_L + jobcat_M,
  data = data
)
summary(reg1)
## 
## Call:
## lm(formula = salnow ~ sex + age + work + minority + salbeg + 
##     time + jobcat_L + jobcat_M, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10508  -1307   -272    994  18624 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.18e+03   2.38e+03    2.60  0.00958 ** 
## sexfemales       -1.08e+03   3.17e+02   -3.42  0.00069 ***
## age              -3.73e+01   1.94e+01   -1.92  0.05500 .  
## work             -4.94e+01   2.68e+01   -1.85  0.06549 .  
## minoritynonwhite -3.27e+02   3.21e+02   -1.02  0.30789    
## salbeg            1.36e+00   8.09e-02   16.84  < 2e-16 ***
## time              6.42e+01   1.29e+01    4.97  9.2e-07 ***
## jobcat_L         -5.42e+03   1.59e+03   -3.40  0.00073 ***
## jobcat_M         -5.66e+02   1.36e+03   -0.42  0.67754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2780 on 465 degrees of freedom
## Multiple R-squared:  0.837,  Adjusted R-squared:  0.835 
## F-statistic:  299 on 8 and 465 DF,  p-value: <2e-16

Descriptive Analysis Practice

# by sex, sort : ttest salnow == 0
# One-sample t-test of H0: mean(salnow) = 0, restricted to the males group.
t.test(salnow[sex == "males"], mu = 0)
## 
##  One Sample t-test
## 
## data:  salnow[sex == "males"]
## t = 34.14, df = 257, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  15620 17533
## sample estimates:
## mean of x 
##     16577
# Second group of Stata's `by sex`: the same one-sample t-test of
# H0: mean(salnow) = 0, this time for the females group (the original call
# repeated "males").
# NOTE(review): the echoed output that follows was captured from the
# duplicated males call; re-knit the document to refresh it.
t.test(salnow[sex == "females"], mu = 0)
## 
##  One Sample t-test
## 
## data:  salnow[sex == "males"]
## t = 34.14, df = 257, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  15620 17533
## sample estimates:
## mean of x 
##     16577
# ttest salnow, by(sex)
# Two-sample comparison of mean salnow between the two sex groups.
# t.test() runs the Welch (unequal-variance) version by default, as the output
# header shows. Stata's `ttest ..., by()` assumes equal variances unless
# `unequal` is given, so pass var.equal = TRUE to reproduce the Stata figures.
t.test(salnow ~ sex, mu = 0)
## 
##  Welch Two Sample t-test
## 
## data:  salnow by sex
## t = 11.69, df = 344.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5127 7201
## sample estimates:
##   mean in group males mean in group females 
##                 16577                 10413
# tabulate sex jobcat, chi2
library(gmodels)
# Cross-tabulate sex by job category with a Pearson chi-squared test.
# Several cells are very sparse (e.g. no females in two categories), which is
# presumably why R warns that the chi-squared approximation may be incorrect.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
CrossTable(sex, jobcat, chisq = TRUE)
## Warning: Chi-squared approximation may be incorrect
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  474 
## 
##  
##              | jobcat 
##          sex |         clerical |   office trainee | security officer |  college trainee |  exempt employee |      mba trainee |        technical |        Row Total | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##        males |              110 |               47 |               27 |               34 |               30 |                4 |                6 |              258 | 
##              |            1.488 |            9.866 |           10.301 |            6.117 |            9.089 |            0.601 |            2.289 |                  | 
##              |            0.426 |            0.182 |            0.105 |            0.132 |            0.116 |            0.016 |            0.023 |            0.544 | 
##              |            0.485 |            0.346 |            1.000 |            0.829 |            0.938 |            0.800 |            1.000 |                  | 
##              |            0.232 |            0.099 |            0.057 |            0.072 |            0.063 |            0.008 |            0.013 |                  | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##      females |              117 |               89 |                0 |                7 |                2 |                1 |                0 |              216 | 
##              |            1.777 |           11.785 |           12.304 |            7.306 |           10.857 |            0.717 |            2.734 |                  | 
##              |            0.542 |            0.412 |            0.000 |            0.032 |            0.009 |            0.005 |            0.000 |            0.456 | 
##              |            0.515 |            0.654 |            0.000 |            0.171 |            0.062 |            0.200 |            0.000 |                  | 
##              |            0.247 |            0.188 |            0.000 |            0.015 |            0.004 |            0.002 |            0.000 |                  | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## Column Total |              227 |              136 |               27 |               41 |               32 |                5 |                6 |              474 | 
##              |            0.479 |            0.287 |            0.057 |            0.086 |            0.068 |            0.011 |            0.013 |                  | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  87.23     d.f. =  6     p =  1.139e-16 
## 
## 
## 
# tabulate sex minority, chi2
# Counts-only cross-tabulation of sex by minority status: row, column and
# table proportions and the per-cell chi-square contribution are switched off,
# and the Pearson chi-squared test is requested.
# Arguments are written out in full (chisq, not the partial match `chi`) and
# use TRUE/FALSE rather than the reassignable T/F.
CrossTable(
  sex, minority,
  prop.r = FALSE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE,
  chisq = TRUE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  474 
## 
##  
##              | minority 
##          sex |     white |  nonwhite | Row Total | 
## -------------|-----------|-----------|-----------|
##        males |       194 |        64 |       258 | 
## -------------|-----------|-----------|-----------|
##      females |       176 |        40 |       216 | 
## -------------|-----------|-----------|-----------|
## Column Total |       370 |       104 |       474 | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  2.714     d.f. =  1     p =  0.09948 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  2.359     d.f. =  1     p =  0.1245 
## 
##