Measurement Paper Revisions

Check difference between excluded sample and subsample for certain variables

Race

##    
##     White AfrAm
##   0   304   419
##   1   878  1199
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl2
## X-squared = 0.0038454373, df = 1, p-value = 0.9505536

Sex

##    
##     Women  Men
##   0   391  332
##   1  1183  894
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl3
## X-squared = 1.6884265, df = 1, p-value = 0.1938085

Income

##    
##     Above 20,000 Below 20,000
##   0           62           70
##   1          981         1096
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl4
## X-squared = 4.3634385e-29, df = 1, p-value = 1

Employment

##    
##      Yes   No
##   0  333  320
##   1 1204  873
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl5
## X-squared = 9.5364426, df = 1, p-value = 0.002014318

Education

##    
##     8th grade or below Bachelor's degree HS graduate/GED
##   0                 65                21             204
##   1                118               137             705
##    
##     Some college/Associate Degree
##   0                           138
##   1                           490
##    
##     Some Graduate School/Graduate Degree (MA,PhD,MD,JD) Some HS
##   0                                                  33     189
##   1                                                 104     523
## 
##  Pearson's Chi-squared test
## 
## data:  tbl6
## X-squared = 28.49065, df = 5, p-value = 2.918025e-05

Poverty Status

##    
##     Above Below
##   0   440   283
##   1  1214   863
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl7
## X-squared = 1.1884274, df = 1, p-value = 0.275647

Age

## 
##  Welch Two Sample t-test
## 
## data:  y by x
## t = 1.8462909, df = 1255.168, p-value = 0.06508529
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0461739977  1.5215464328
## sample estimates:
## mean in group 0 mean in group 1 
##     48.58506224     47.84737602

Education Frequency Data

summary(MeasurementData1$Edcat)
##                                  8th grade or below 
##                                                 118 
##                                   Bachelor's degree 
##                                                 137 
##                                     HS graduate/GED 
##                                                 705 
##                       Some college/Associate Degree 
##                                                 490 
## Some Graduate School/Graduate Degree (MA,PhD,MD,JD) 
##                                                 104 
##                                             Some HS 
##                                                 523
# Histogram of years of education for the analysis sample.
# NOTE(review): histogram() is not a base R function; presumably lattice::histogram - confirm which package is attached.
histogram(MeasurementData1$Educ)

plot of chunk unnamed-chunk-9

Education Frequency by Race

# Side-by-side histograms of years of education for each race group.
# Race is stored with the labels "African American" and "White", so it has to
# be compared against those labels. Comparing against 0/1 selects no rows and
# makes hist() stop with "invalid number of 'breaks'".
op <- par(mfrow = c(1, 2))
hist(MeasurementData1$Educ[MeasurementData1$Race == "African American"],
     main = "African American", xlab = "Years of Education")
hist(MeasurementData1$Educ[MeasurementData1$Race == "White"],
     main = "White", xlab = "Years of Education")
par(op) # restore the previous panel layout so later plots are not drawn 1 x 2

Run Regression Model Diagnostics

# Linear model for MCLcountry (labelled "MCL: SES Standing in country" in the
# model summary) on demographic, socioeconomic and health covariates.
# Each `a * b` term expands to a + b + a:b, so the four products add the
# Race:Educ, Race:Employment01, Sex:Employment01 and Race:Sex interactions on
# top of the main effects already listed.
# x = TRUE keeps the design matrix in the returned fit object.
fit <- lm(
  MCLcountry ~ Race + Sex + Educ + Employment01 + acasiincomx01 +
    Neighborhood02 + CES + sHealthNum + Race * Educ + Race * Employment01 +
    Sex * Employment01 + Race * Sex,
  data = MeasurementData1,
  x = TRUE
)

Assessing Outliers

outlierTest(fit) # Bonferroni-adjusted p-value for the most extreme studentized residual
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##        rstudent unadjusted p-value Bonferonni p
## 601 3.645496042         0.00027348      0.56802
qqPlot(fit, main="QQ Plot") #qq plot for studentized resid 

plot of chunk unnamed-chunk-12

leveragePlots(fit) # leverage plots

plot of chunk unnamed-chunk-12

plot of chunk unnamed-chunk-12

Assessing Non-normality

# Normality of Residuals
# qq plot for studentized resid
qqPlot(fit, main="QQ Plot")

plot of chunk unnamed-chunk-13

# Distribution of studentized residuals, drawn on the density scale with a
# standard normal curve overlaid for visual comparison.
library(MASS)
sresid <- studres(fit)
hist(sresid, freq = FALSE, main = "Distribution of Studentized Residuals")
# Evaluate the N(0, 1) density on a 40-point grid spanning the residual range.
# `length.out` is written in full rather than relying on partial matching of
# the abbreviated `length`.
xfit <- seq(min(sresid), max(sresid), length.out = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)

plot of chunk unnamed-chunk-13

Non-constant Error Variance

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 45.08660365    Df = 1     p = 1.885105553e-11
# plot studentized residuals vs. fitted values 
spreadLevelPlot(fit)

plot of chunk unnamed-chunk-14

## 
## Suggested power transformation:  1.868632856

Multi-collinearity

# Evaluate Collinearity
vif(fit) # variance inflation factors 
##              Race               Sex              Educ      Employment01 
##      24.259507886       2.547697591       2.748440810       3.368656430 
##     acasiincomx01    Neighborhood02               CES        sHealthNum 
##       1.387073133       1.100131360       1.208089679       1.209289174 
##         Race:Educ Race:Employment01  Sex:Employment01          Race:Sex 
##      22.918430035       2.550543515       3.307577387       3.139138713
# Flag terms whose square-root VIF exceeds 2 (equivalently, VIF > 4).
sqrt(vif(fit)) > 2 # problem?
##              Race               Sex              Educ      Employment01 
##              TRUE             FALSE             FALSE             FALSE 
##     acasiincomx01    Neighborhood02               CES        sHealthNum 
##             FALSE             FALSE             FALSE             FALSE 
##         Race:Educ Race:Employment01  Sex:Employment01          Race:Sex 
##              TRUE             FALSE             FALSE             FALSE

Non-independence of Errors

# Test for Autocorrelated Errors
durbinWatsonTest(fit)
##  lag Autocorrelation D-W Statistic p-value
##    1   0.03843837402   1.922005271   0.072
##  Alternative hypothesis: rho != 0

Global test of model assumptions

# Run gvlma() on the fitted model and print its summary, which shows the lm
# coefficient table followed by a decision for each tested assumption.
library(gvlma)
gvmodel <- gvlma(fit) 
summary(gvmodel)
## 
## Call:
## lm(formula = MCLcountry ~ Race + Sex + Educ + Employment01 + 
##     acasiincomx01 + Neighborhood02 + CES + sHealthNum + Race * 
##     Educ + Race * Employment01 + Sex * Employment01 + Race * 
##     Sex, data = MeasurementData1, x = T)
## 
## Residuals:
## MCL: SES Standing in country 
##        Min         1Q     Median         3Q        Max 
## -5.1129700 -1.1779336  0.2989504  1.9761218  6.6717081 
## 
## Coefficients:
##                                      Estimate   Std. Error  t value
## (Intercept)                       4.825348372  0.379893060 12.70186
## RaceWhite                        -0.840511672  0.403304645 -2.08406
## SexWomen                         -0.075600086  0.130395285 -0.57978
## Educ                              0.026229490  0.022372069  1.17242
## Employment01Unemployed           -0.277576321  0.150403199 -1.84555
## acasiincomx01Below 20,000        -0.328536933  0.095424299 -3.44291
## Neighborhood02                   -0.125862875  0.039799930 -3.16239
## CES                              -0.036803866  0.003921716 -9.38463
## sHealthNum                        0.212745163  0.058485155  3.63759
## RaceWhite:Educ                    0.071920283  0.028647758  2.51050
## RaceWhite:Employment01Unemployed -0.437636335  0.174592700 -2.50661
## SexWomen:Employment01Unemployed   0.616750455  0.167815293  3.67517
## RaceWhite:SexWomen               -0.384755428  0.167063807 -2.30304
##                                    Pr(>|t|)
## (Intercept)                      < 2.22e-16
## RaceWhite                        0.03727729
## SexWomen                         0.56212881
## Educ                             0.24116325
## Employment01Unemployed           0.06510101
## acasiincomx01Below 20,000        0.00058698
## Neighborhood02                   0.00158754
## CES                              < 2.22e-16
## sHealthNum                       0.00028195
## RaceWhite:Educ                   0.01213185
## RaceWhite:Employment01Unemployed 0.01226575
## SexWomen:Employment01Unemployed  0.00024375
## RaceWhite:SexWomen               0.02137533
## 
## Residual standard error: 1.843448 on 2064 degrees of freedom
## Multiple R-squared:  0.1540766,  Adjusted R-squared:  0.1491585 
## F-statistic: 31.32811 on 12 and 2064 DF,  p-value: < 2.2204e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fit) 
## 
##                         Value      p-value                   Decision
## Global Stat        22.4206500 0.0001652513 Assumptions NOT satisfied!
## Skewness           14.7761517 0.0001210568 Assumptions NOT satisfied!
## Kurtosis            3.0792498 0.0792970942    Assumptions acceptable.
## Link Function       0.5396667 0.4625708771    Assumptions acceptable.
## Heteroscedasticity  4.0255818 0.0448151599 Assumptions NOT satisfied!

ACASI Income Variable for Race and Sex

RACE

# Cross-tabulate income category (rows) by race (columns).
# NOTE: the original numeric coding was 0 = African American; 1 = White, but
# Race carries the labels "African American" and "White" here, so subset on
# the labels rather than on 0/1.
tblRACE <- table(MeasurementData1$acasiincomx01, MeasurementData1$Race)

Frequencies

tblRACE
##               
##                African American White
##   Above 20,000              531   565
##   Below 20,000              668   313

Proportions

# No margin is given, so each cell is its share of the grand total
# (all four cells sum to 1), not a within-row or within-column proportion.
prop.table(tblRACE)
##               
##                African American        White
##   Above 20,000     0.2556571979 0.2720269620
##   Below 20,000     0.3216177179 0.1506981223

SEX

# Cross-tabulate income category (rows) by sex (columns).
# NOTE: the original numeric coding was 0 = Women; 1 = Men, but the table is
# indexed by the labels "Men" and "Women", in that column order.
tblSEX <- table(MeasurementData1$acasiincomx01, MeasurementData1$Sex)

Frequencies

tblSEX
##               
##                Men Women
##   Above 20,000 509   587
##   Below 20,000 385   596

Proportions

# No margin is given, so each cell is its share of the grand total
# (all four cells sum to 1), not a within-row or within-column proportion.
prop.table(tblSEX)
##               
##                         Men        Women
##   Above 20,000 0.2450649976 0.2826191623
##   Below 20,000 0.1853635051 0.2869523351

Bar Plots

Education

# Enlarge the bottom and left margins so the rotated category labels fit.
# The previous graphical settings are kept in `op` for a later par(op).
op <- par(mar = 0.1 + c(10, 12, 1, 2))
counts <- table(MeasurementData1$Edcat)
barplot(
  counts,
  beside = TRUE,
  las = 2, # axis labels perpendicular to the axes
  cex.names = 0.7,
  main = "Frequency of Education"
)

plot of chunk unnamed-chunk-25

par(op)

Employment Status by Race

# Grouped bar chart of employment status with one bar per race in each group.
counts <- table(MeasurementData1$Race, MeasurementData1$Employment01)
barplot(
  counts,
  main = "Employment by Race",
  xlab = "Employment",
  col = c("black", "gray"),
  ylim = c(0, 700),
  # Full argument name; the abbreviated `legend` only worked through partial
  # matching of barplot()'s `legend.text`.
  legend.text = rownames(counts),
  beside = TRUE
)

plot of chunk unnamed-chunk-26

Employment Status by Sex

# Grouped bar chart of employment status with one bar per sex in each group.
counts <- table(MeasurementData1$Sex, MeasurementData1$Employment01)
barplot(
  counts,
  main = "Employment by Sex",
  xlab = "Employment",
  col = c("black", "gray"),
  ylim = c(0, 800),
  # Full argument name; the abbreviated `legend` only worked through partial
  # matching of barplot()'s `legend.text`.
  legend.text = rownames(counts),
  beside = TRUE
)

plot of chunk unnamed-chunk-27

Education by Race

# Grouped bar chart of education category with one bar per race in each group.
# Margins are enlarged for the rotated labels; `op` keeps the old settings.
op <- par(mar = c(10, 12, 1, 2) + 0.1)
# NOTE(review): `ylabels` is never passed to barplot() below, and it lists
# seven labels while Edcat has six categories. Confirm whether it can be
# dropped, or correct it before using it as names.arg.
ylabels <- c("8th grade or below", "Bachelor's Degree", "HS graduate/GED", "Graduate Degree", "Some College/Associate Degree", "Some grad school/Graduate degree", "Some HS")
counts <- table(MeasurementData1$Race, MeasurementData1$Edcat)
barplot(
  counts,
  main = "Education by Race",
  col = c("black", "gray"),
  ylim = c(0, 500),
  # Full argument name; the abbreviated `legend` only worked through partial
  # matching of barplot()'s `legend.text`.
  legend.text = rownames(counts),
  las = 2,
  beside = TRUE,
  cex.names = 0.7
)

plot of chunk unnamed-chunk-28

par(op)

Education by Sex

# Grouped bar chart of education category with one bar per sex in each group.
# Margins are enlarged for the rotated labels; `op` keeps the old settings.
op <- par(mar = c(10, 12, 1, 2) + 0.1)
counts <- table(MeasurementData1$Sex, MeasurementData1$Edcat)
barplot(
  counts,
  main = "Education by Sex",
  col = c("black", "gray"),
  ylim = c(0, 400),
  # Full argument name; the abbreviated `legend` only worked through partial
  # matching of barplot()'s `legend.text`.
  legend.text = rownames(counts),
  las = 2,
  beside = TRUE,
  cex.names = 0.7
)

plot of chunk unnamed-chunk-29

par(op)

Income by Race

# Grouped bar chart of income category with one bar per race in each group.
counts <- table(MeasurementData1$Race, MeasurementData1$acasiincomx01)
barplot(
  counts,
  main = "Income by Race",
  xlab = "Income",
  col = c("black", "gray"),
  ylim = c(0, 800),
  # Full argument name; the abbreviated `legend` only worked through partial
  # matching of barplot()'s `legend.text`.
  legend.text = rownames(counts),
  beside = TRUE
)

plot of chunk unnamed-chunk-30

Income by Sex

# Grouped bar chart of income category with one bar per sex in each group.
counts <- table(MeasurementData1$Sex, MeasurementData1$acasiincomx01)
barplot(
  counts,
  main = "Income by Sex",
  ylim = c(0, 700),
  xlab = "Income",
  col = c("black", "gray"),
  # Full argument name; the abbreviated `legend` only worked through partial
  # matching of barplot()'s `legend.text`.
  legend.text = rownames(counts),
  beside = TRUE
)

plot of chunk unnamed-chunk-31

Education (numeric) by Race

# Two panels, one per race group, showing years of education on shared
# x (0-21) and y (0-600) ranges so the panels are directly comparable.
par(mfrow = c(1, 2))
with(
  MeasurementData1,
  hist(Educ[Race == "African American"],
       main = "Years of Education", xlab = "African American",
       xlim = c(0, 21), ylim = c(0, 600))
)
with(
  MeasurementData1,
  hist(Educ[Race == "White"],
       main = "Years of Education", xlab = "White",
       xlim = c(0, 21), ylim = c(0, 600))
)

plot of chunk unnamed-chunk-32

Education (numeric) by Sex

# Two panels, one per sex, showing years of education on shared
# x (0-21) and y (0-600) ranges so the panels are directly comparable.
par(mfrow = c(1, 2))
with(
  MeasurementData1,
  hist(Educ[Sex == "Women"],
       main = "Years of Education", xlab = "Women",
       xlim = c(0, 21), ylim = c(0, 600))
)
with(
  MeasurementData1,
  hist(Educ[Sex == "Men"],
       main = "Years of Education", xlab = "Men",
       xlim = c(0, 21), ylim = c(0, 600))
)

plot of chunk unnamed-chunk-33