Advanced Statistical Methods in Policy Analysis

Amit Patel, Jose Magallanes, Scott Atherley

Session 1: Regression Basics

GET DATA

# define variables that will be used later
folder="C:/Users/Administrator/Downloads"
filename="salary.dta"

library(foreign) # needed for .dat files

## Warning: package 'foreign' was built under R version 3.0.3

setwd(folder)
# sets the R working directory, telling the computer where to look for Files

# read in data
data=read.dta(filename)

# attach the data-frame; R will automatically look for variables in this data frame while attached
# attach(data)

A few things to take note of: you should now see the variables you created (folder and filename) on the right, listed under values. You can use them again if you want, or erase them. Similarly, you will see the data you’ve loaded at top right under data. The default R object for data files is a ‘data-frame’ - this is essentially a matrix with column and row names (it corresponds nicely to an Excel spreadsheet, or a loaded datafile in a program such as SPSS or STATA). Note that you can load multiple data-frames in R - you just have to refer to each one when you want to use them. This is something we will take advantage of throughout the course.

For now, we can remove the filename and folder variables, just for practice.

rm(filename, folder)

SEE WHAT’S IN THE DATA

require(dplyr)

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 3.0.3

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# use a user-developed function to look at the data
# we will also be using a great many of these
# if you don't have them, you'll have to install using install.packages("package")
# you load the packages, after installing, using library(package)
glimpse(data)

## Variables:
## $ id       (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ salbeg   (int) 8400, 24000, 10200, 8700, 17400, 12996, 6900, 5400, 5...
## $ sex      (fctr) males, males, males, males, males, males, males, mal...
## $ time     (int) 81, 73, 83, 93, 83, 80, 79, 67, 96, 77, 84, 88, 93, 9...
## $ age      (dbl) 28.50, 40.33, 31.08, 31.17, 41.92, 29.50, 28.00, 28.7...
## $ salnow   (int) 16080, 41400, 21960, 19200, 28350, 27250, 16080, 1410...
## $ edlevel  (int) 16, 16, 15, 16, 19, 18, 15, 15, 15, 12, 15, 12, 17, 1...
## $ work     (dbl) 0.25, 12.50, 4.08, 1.83, 13.00, 2.42, 3.17, 0.50, 1.1...
## $ jobcat   (fctr) college trainee, exempt employee, exempt employee, c...
## $ minority (fctr) white, white, white, white, white, white, white, whi...
## $ sexrace  (fctr) white males, white males, white males, white males, ...

Describing the data

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 3.0.3

## Loading required package: grid
## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.0.3

## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula

## Warning: package 'Formula' was built under R version 3.0.3

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

describe(data)

## data 
## 
##  11  Variables      474  Observations
## ---------------------------------------------------------------------------
## id 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0     474       1   237.5   24.65   48.30  119.25  237.50 
##     .75     .90     .95 
##  355.75  426.70  450.35 
## 
## lowest :   1   2   3   4   5, highest: 470 471 472 473 474 
## ---------------------------------------------------------------------------
## salbeg 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0      90       1    6806    4080    4380    4995    6000 
##     .75     .90     .95 
##    6996   11000   13200 
## 
## lowest :  3600  3900  4020  4080  4200
## highest: 18000 18996 21000 24000 31992 
## ---------------------------------------------------------------------------
## sex 
##       n missing  unique 
##     474       0       2 
## 
## males (258, 54%), females (216, 46%) 
## ---------------------------------------------------------------------------
## time 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0      36       1   81.11      65      67      72      81 
##     .75     .90     .95 
##      90      94      97 
## 
## lowest : 63 64 65 66 67, highest: 94 95 96 97 98 
## ---------------------------------------------------------------------------
## age 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0     259       1   37.19   24.42   25.19   28.50   32.00 
##     .75     .90     .95 
##   45.98   56.84   60.67 
## 
## lowest : 23.00 23.25 23.33 23.42 23.58
## highest: 63.75 63.83 63.92 64.25 64.50 
## ---------------------------------------------------------------------------
## salnow 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0     221       1   13768    7797    8418    9600   11550 
##     .75     .90     .95 
##   14775   23757   28000 
## 
## lowest :  6300  6360  6480  6540  6600
## highest: 40000 41400 41500 44250 54000 
## ---------------------------------------------------------------------------
## edlevel 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0      10    0.92   13.49       8       8      12      12 
##     .75     .90     .95 
##      15      17      19 
## 
##            8  12 14  15 16 17 18 19 20 21
## Frequency 53 190  6 116 59 11  9 27  2  1
## %         11  40  1  24 12  2  2  6  0  0
## ---------------------------------------------------------------------------
## work 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0     208       1   7.989  0.1105  0.4200  1.6025  4.5800 
##     .75     .90     .95 
## 11.5600 21.6750 26.7855 
## 
## lowest :  0.00  0.17  0.25  0.33  0.42
## highest: 36.50 37.00 37.58 38.33 39.67 
## ---------------------------------------------------------------------------
## jobcat 
##       n missing  unique 
##     474       0       7 
## 
## clerical (227, 48%), office trainee (136, 29%) 
## security officer (27, 6%) 
## college trainee (41, 9%) 
## exempt employee (32, 7%), mba trainee (5, 1%) 
## technical (6, 1%) 
## ---------------------------------------------------------------------------
## minority 
##       n missing  unique 
##     474       0       2 
## 
## white (370, 78%), nonwhite (104, 22%) 
## ---------------------------------------------------------------------------
## sexrace 
##       n missing  unique 
##     474       0       4 
## 
## white males (194, 41%), minority males (64, 14%) 
## white females (176, 37%) 
## minority females (40, 8%) 
## ---------------------------------------------------------------------------

Might also want to describe specific variables.

describe(data$salbeg)

## data$salbeg 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##     474       0      90       1    6806    4080    4380    4995    6000 
##     .75     .90     .95 
##    6996   11000   13200 
## 
## lowest :  3600  3900  4020  4080  4200
## highest: 18000 18996 21000 24000 31992

# the '$' operator allows you to refer to a named column (a variable) within a data frame

Describe gives you a lot of useful descriptive statistics, such as the total N (number of observations), missing values, unique values, the mean and percentiles for the data, and a list of the 5 lowest and highest values.

Other ways of describing data are also possible. We can generate % Frequency Distribution Tables of variables as follows.

attach(data)
table(jobcat)

## jobcat
##         clerical   office trainee security officer  college trainee 
##              227              136               27               41 
##  exempt employee      mba trainee        technical 
##               32                5                6

cumsum(table(jobcat))

##         clerical   office trainee security officer  college trainee 
##              227              363              390              431 
##  exempt employee      mba trainee        technical 
##              463              468              474

prop.table(table(jobcat))

## jobcat
##         clerical   office trainee security officer  college trainee 
##       0.47890295       0.28691983       0.05696203       0.08649789 
##  exempt employee      mba trainee        technical 
##       0.06751055       0.01054852       0.01265823

cbind(Freq=table(jobcat), Cumul=cumsum(table(jobcat)), relative=prop.table(table(jobcat)))

##                  Freq Cumul   relative
## clerical          227   227 0.47890295
## office trainee    136   363 0.28691983
## security officer   27   390 0.05696203
## college trainee    41   431 0.08649789
## exempt employee    32   463 0.06751055
## mba trainee         5   468 0.01054852
## technical           6   474 0.01265823

detach(data)

There are more steps to this process than in a program like STATA. ‘Table’ will give you counts for each value of jobcat, ‘cumsum’ will give you the cumulative sum of counts, and prop.table will give you the proportion of observations in each category. These three columns together make up a standard % frequency distribution table, which is defined in the last row above.

The R command ‘cbind’ stands for column-bind - it combines two column vectors into a single data object (they must be the same length). In the above example, we put the column of counts next to the column of cumulative counts next to the column of proportions. This gives us the table we want.

Exercises: - Store the table as a variable - Generate a 4th column, the cumulative proportion for each category, and add it to the table.

Graphical descriptions of data. Box-plots are another good way to describe variables.

plot(data$salnow~data$jobcat)

The basic R ‘plot’ command is quite flexible and, generally, pretty good at giving you the right plot by default. The syntax above is structured like an R formula: salnow is the DV, ‘~’ is the formula-format equal sign, and jobcat is the IV (grouping variable).

# the help command will give you information about commands and describe possible options
help(plot)

## starting httpd help server ... done

Descriptive Analysis Practice

Run some basic bivariate statistical analyses (t-test) and cross-tabulations of variables of interest

attach(data)
# test that the mean of salary for males is not equal to 0
t.test(salnow[sex=='males'],mu=0)

## 
##  One Sample t-test
## 
## data:  salnow[sex == "males"]
## t = 34.1375, df = 257, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  15620.48 17532.95
## sample estimates:
## mean of x 
##  16576.71

# test if males salaries are statistically different from females
t.test(salnow~sex,mu=0)

## 
##  Welch Two Sample t-test
## 
## data:  salnow by sex
## t = 11.6883, df = 344.262, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5126.691 7201.198
## sample estimates:
##   mean in group males mean in group females 
##              16576.71              10412.77

# mu is the null hypothesis; in the first case, we theorize that the mean salary for males is non-0
# in the second case, we hypothesize that the mean difference in salary between males and non-males is non-0

#tabulate sex jobcat, chi2
library(gmodels)

## Warning: package 'gmodels' was built under R version 3.0.3

CrossTable(sex,jobcat,chisq=T)

## Warning in chisq.test(t, correct = FALSE, ...): Chi-squared approximation
## may be incorrect

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  474 
## 
##  
##              | jobcat 
##          sex |         clerical |   office trainee | security officer |  college trainee |  exempt employee |      mba trainee |        technical |        Row Total | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##        males |              110 |               47 |               27 |               34 |               30 |                4 |                6 |              258 | 
##              |            1.488 |            9.866 |           10.301 |            6.117 |            9.089 |            0.601 |            2.289 |                  | 
##              |            0.426 |            0.182 |            0.105 |            0.132 |            0.116 |            0.016 |            0.023 |            0.544 | 
##              |            0.485 |            0.346 |            1.000 |            0.829 |            0.938 |            0.800 |            1.000 |                  | 
##              |            0.232 |            0.099 |            0.057 |            0.072 |            0.063 |            0.008 |            0.013 |                  | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##      females |              117 |               89 |                0 |                7 |                2 |                1 |                0 |              216 | 
##              |            1.777 |           11.785 |           12.304 |            7.306 |           10.857 |            0.717 |            2.734 |                  | 
##              |            0.542 |            0.412 |            0.000 |            0.032 |            0.009 |            0.005 |            0.000 |            0.456 | 
##              |            0.515 |            0.654 |            0.000 |            0.171 |            0.062 |            0.200 |            0.000 |                  | 
##              |            0.247 |            0.188 |            0.000 |            0.015 |            0.004 |            0.002 |            0.000 |                  | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## Column Total |              227 |              136 |               27 |               41 |               32 |                5 |                6 |              474 | 
##              |            0.479 |            0.287 |            0.057 |            0.086 |            0.068 |            0.011 |            0.013 |                  | 
## -------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  87.23029     d.f. =  6     p =  1.138509e-16 
## 
## 
##

#tabulate sex minority, chi2
CrossTable(sex,minority, prop.r=F, prop.c=F, prop.t=F, prop.chisq=F,chi = TRUE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  474 
## 
##  
##              | minority 
##          sex |     white |  nonwhite | Row Total | 
## -------------|-----------|-----------|-----------|
##        males |       194 |        64 |       258 | 
## -------------|-----------|-----------|-----------|
##      females |       176 |        40 |       216 | 
## -------------|-----------|-----------|-----------|
## Column Total |       370 |       104 |       474 | 
## -------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  2.713926     d.f. =  1     p =  0.0994759 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  2.359218     d.f. =  1     p =  0.1245446 
## 
##

detach(data)

Finally, we’ll run a basic linear regression predicting a person’s salary with several explanatory variables. This has a lot of practical applications; for example, one might want to determine whether men and women are paid equally, after controlling for other factors that could influence salary outcomes.

We already have most of the control variables we need in the data, but will generate one more. Because we have limited sample size in several of the job categories, we might want to combine categories to help generate more meaningful results.

# generate some categorical variables for analysis
# divide jobs into 3 categories (low, medium and high skill levels)

data$jobcat_L <- ifelse(as.numeric(data$jobcat)<=3,1,0)
data$jobcat_M <- ifelse(as.numeric(data$jobcat) %in% c(4,5,6),1,0)
# the %in% category is a good shortcut for complicated categorical statements 
# it tells R to set jobcat_m to 1 if it is in the list (4,5,6)
# the c() format is a common way to feed lists to functions
data$jobcat_H <- ifelse(as.numeric(data$jobcat)==7,1,0)
table (data$jobcat_L)

## 
##   0   1 
##  84 390

table (data$jobcat_M)

## 
##   0   1 
## 396  78

table (data$jobcat_H)

## 
##   0   1 
## 468   6

Regression

Now we can run our regression.

reg1=lm(salnow~sex + age + work + minority + salbeg + time + jobcat_L + jobcat_M, data)
summary(reg1)

## 
## Call:
## lm(formula = salnow ~ sex + age + work + minority + salbeg + 
##     time + jobcat_L + jobcat_M, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10507.6  -1306.8   -271.6    994.2  18623.8 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.181e+03  2.376e+03   2.602 0.009577 ** 
## sexfemales       -1.085e+03  3.174e+02  -3.417 0.000689 ***
## age              -3.735e+01  1.941e+01  -1.924 0.055004 .  
## work             -4.945e+01  2.678e+01  -1.846 0.065489 .  
## minoritynonwhite -3.274e+02  3.208e+02  -1.021 0.307891    
## salbeg            1.363e+00  8.092e-02  16.844  < 2e-16 ***
## time              6.423e+01  1.291e+01   4.975 9.21e-07 ***
## jobcat_L         -5.417e+03  1.593e+03  -3.400 0.000733 ***
## jobcat_M         -5.663e+02  1.361e+03  -0.416 0.677539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2778 on 465 degrees of freedom
## Multiple R-squared:  0.8374, Adjusted R-squared:  0.8346 
## F-statistic: 299.4 on 8 and 465 DF,  p-value: < 2.2e-16