Importing Data

library(readr)
AHS <- read_csv("/Users/rafaelareavlo/Desktop/household.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 37653540 parsing failures.
##  row        col               expected actual                                         file
## 1893 JPORCH     no trailing characters    '   '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JPOVLVLINC a double                  '0' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JRADONACT  a double                  '0' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JRADONSUG  a double                  '0' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JRADONTEST a double                  '0' '/Users/rafaelareavlo/Desktop/household.csv'
## .... .......... ...................... ...... ............................................
## See problems(...) for more details.

Selecting Variables

library(dplyr)
library(tidyverse)
library(tibble)
AHS1 <- select(AHS, "HHRACE", "HHSPAN", "HHSEX", "HINCP")

Renaming Selected Variables

AHS1 <- AHS1 %>%
  rename('Race' = HHRACE,
         'Hispanic' = HHSPAN,
         'Gender' = HHSEX,
         'Income' = HINCP)

Recoding Some Values

AHS1$Race[AHS1$Race == -6] <- "0"
AHS1$Hispanic[AHS1$Hispanic == -6] <- "0"
AHS1$Income[AHS1$Income == -6] <- "0"
AHS1$Gender[AHS1$Gender == -6] <- "0"

Converting to Appropriate Variable Type

Removing NA’s

AHS1 <- na.omit(AHS1)

Grouping Income Levels

AHS1$Income.Class <- as.numeric(AHS1$Income)
AHS1$Income.Class[AHS1$Income <= 25000] <- '1'
AHS1$Income.Class[AHS1$Income > 25000 & AHS1$Income < 45000] <- '2'
AHS1$Income.Class[AHS1$Income >= 45000 & AHS1$Income <= 140000 ] <- '3'
AHS1$Income.Class[AHS1$Income > 140000] <- '4'

Categorizing Race

AHS1$Race.Categorized <- as.character(AHS1$Race)
AHS1$Race.Categorized[AHS1$Race.Categorized == 1] <- "White"
AHS1$Race.Categorized[AHS1$Race.Categorized == 2] <- "Black"
AHS1$Race.Categorized[AHS1$Race.Categorized == 3] <- "Asian"
AHS1$Race.Categorized[AHS1$Race.Categorized == 4] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 5] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 0] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 6] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 7] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 9] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 10] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 14] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 15] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 18] <- "Other"

Arranging Data by Race

AHS1 = AHS1 %>%
  arrange(Race.Categorized)

Deleting Unessecasry Rows

AHS2 <- AHS1[-c(370:770),]

Research Question & Analysis

For this week’s assignment, I will be using the 2015 American Housing Survey which is composed of over 100 variables and 66K observations. Using this dataset, I will mostly be focusing on the following question:

The exploratory variable is Income and the dependent variable is Gender. The control variables are composed of: * Race * Hispanic * Income Class

Race

  • 1 = White
  • 2 = Black
  • 3 = Asian

Hispanic

  • 1 = Yes
  • 2 = No

Gender

  • 1 = Male
  • 2 = Female

Income Class

  • 1 = Poverty
  • 2 = Working Class
  • 3 = Middle Class
  • 4 = Upper Middle

Starting Analysis

Building The Models

Model 1

a1 <- glm(Gender ~ Income, family = 'binomial', data = AHS2)
summary(a1)
## 
## Call:
## glm(formula = Gender ~ Income, family = "binomial", data = AHS2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.303  -1.257   1.058   1.083   4.090  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.907e-01  6.691e-02   4.344  1.4e-05 ***
## Income      -1.901e-06  7.129e-07  -2.667  0.00765 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2055.8  on 1490  degrees of freedom
## Residual deviance: 2043.9  on 1489  degrees of freedom
## AIC: 2047.9
## 
## Number of Fisher Scoring iterations: 5

Model 1 indicates that on average, females make an income that is 1.9 less than makes.

Model 2

a2 <- glm(Gender ~ Income + Hispanic, family = 'binomial', data = AHS2)
summary(a2)
## 
## Call:
## glm(formula = Gender ~ Income + Hispanic, family = "binomial", 
##     data = AHS2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.331  -1.266   1.033   1.065   4.194  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -7.505e-02  1.474e-01  -0.509  0.61063   
## Income      -2.010e-06  7.234e-07  -2.779  0.00546 **
## Hispanic2    4.295e-01  1.541e-01   2.787  0.00532 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2055.8  on 1490  degrees of freedom
## Residual deviance: 2036.1  on 1488  degrees of freedom
## AIC: 2042.1
## 
## Number of Fisher Scoring iterations: 5

Model 2 adds the variable of hispanic and indicates that non-hispanic make an income average of 4.29 higher than compared to their hispanic conunter-parts.

Model 3

a3 <- glm(Gender ~ Income + Hispanic + Race, family = 'binomial', data = AHS2)
summary(a3)
## 
## Call:
## glm(formula = Gender ~ Income + Hispanic + Race, family = "binomial", 
##     data = AHS2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5081  -1.2078   0.8848   1.1224   3.6484  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.386e-01  1.481e-01  -0.936   0.3493    
## Income      -1.499e-06  6.771e-07  -2.214   0.0268 *  
## Hispanic2    3.073e-01  1.571e-01   1.956   0.0505 .  
## Race2        5.816e-01  1.316e-01   4.420 9.89e-06 ***
## Race3        5.981e-01  4.742e-01   1.261   0.2072    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2055.8  on 1490  degrees of freedom
## Residual deviance: 2015.0  on 1486  degrees of freedom
## AIC: 2025
## 
## Number of Fisher Scoring iterations: 5

Model 3 takes the variable of race into consideration and indicates non-hispanic blacks, on average make an income thats 3.07 higher when compared to the income earned by black hispanics.

Regression Models

Model 1 Model 2 Model 3
(Intercept) 0.291*** -0.075 -0.139
(0.067) (0.147) (0.148)
Income -0.000** -0.000** -0.000*
(0.000) (0.000) (0.000)
Hispanic2 0.429** 0.307
(0.154) (0.157)
Race2 0.582***
(0.132)
Race3 0.598
(0.474)
AIC 2047.888 2042.082 2024.970
BIC 2058.503 2058.004 2051.506
Log Likelihood -1021.944 -1018.041 -1007.485
Deviance 2043.888 2036.082 2014.970
Num. obs. 1491 1491 1491
p < 0.001, p < 0.01, p < 0.05

Based on both the AIC and BIC, Model 3 is conisdered to be the best model to be used for further analysis.

Model 3 - Visualization

library(visreg)
visreg(a3, "Income", by = "Hispanic", scale = "response")

Upon looking at the graph for measuring income by gender and controlling for hispanic, we can conclude that not only do female make considerably less money than their male counter parts, but that hispanic females tend to make less than their non-hispanic counterparts.

Multinomial Analysis using Zelig

analysis <- zelig(Income.Class ~ Hispanic + Gender + Race, model = 'mlogit', data = AHS2)
## How to cite this model in Zelig:
##   Thomas W. Yee. 2007.
##   mlogit: Multinomial Logistic Regression for Dependent Variables with Unordered Categorical Values
##   in Christine Choirat, Christopher Gandrud, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
##   "Zelig: Everyone's Statistical Software," http://zeligproject.org/
summary(analysis)
## Model: 
## 
## Call:
## z5$zelig(formula = Income.Class ~ Hispanic + Gender + Race, data = AHS2)
## 
## Pearson residuals:
##                       Min      1Q  Median      3Q   Max
## log(mu[,1]/mu[,4]) -3.538 -0.5663 -0.2974  1.0175 1.651
## log(mu[,2]/mu[,4]) -3.082 -0.2886 -0.2676 -0.2227 2.156
## log(mu[,3]/mu[,4]) -3.381 -0.5508 -0.4340  1.0407 1.837
## 
## Coefficients: 
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept):1  1.75955    0.38063   4.623 3.79e-06
## (Intercept):2  1.24763    0.39953   3.123  0.00179
## (Intercept):3  2.15483    0.37720   5.713 1.11e-08
## Hispanic2:1   -1.25023    0.38979  -3.207  0.00134
## Hispanic2:2   -1.15331    0.41023  -2.811  0.00493
## Hispanic2:3   -1.02937    0.38671  -2.662  0.00777
## Gender2:1      0.99521    0.19919   4.996 5.85e-07
## Gender2:2      0.57581    0.21825   2.638  0.00833
## Gender2:3      0.29303    0.19699   1.488  0.13688
## Race2:1        1.17166    0.26733   4.383 1.17e-05
## Race2:2        0.70706    0.29239   2.418  0.01560
## Race2:3        0.09934    0.27908   0.356  0.72188
## Race3:1        0.87447    1.07074   0.817  0.41410
## Race3:2        0.79248    1.13017   0.701  0.48318
## Race3:3        0.36649    1.08835   0.337  0.73632
## 
## Names of linear predictors: log(mu[,1]/mu[,4]), log(mu[,2]/mu[,4]), 
## log(mu[,3]/mu[,4])
## 
## Residual deviance: 3616.616 on 4458 degrees of freedom
## 
## Log-likelihood: -1808.308 on 4458 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Reference group is level  4  of the response
## Next step: Use 'setx' method

The independent variable used for the multinominal analysis is income.class and the dependent variables are: * Hispanic * Gender * Race

Similar to the regression analysis, the primary focus of this analysis is to determine if the income level that hispanics/non-hispanics would fall in. Under the Hispanics 2, the data indicates that non-hispanics are least likely to fall within the poverty income class. Upon further analysis of the Race variables, we can intepret the data as stating that the log odds of being within the poverty income class for blacks is 1.17 higher than the other race categories. Upom analyzing gender, we can state that the log odds of females falling within the poverty income class is 1.0 higher than that of their male counter parts.

Looking at Income Class & Hispanic

hispanic <- setx(analysis, Hispanic = 1)
non.hispanic <- setx(analysis, Hispanic = 2)
analysis1 <- sim(analysis, x=hispanic, x1=non.hispanic)
summary(analysis1)
## 
##  sim x :
##  -----
## ev
##               mean         sd        50%       2.5%      97.5%
## Pr(Y=1) 0.45590329 0.03978101 0.45256101 0.38780288 0.53635060
## Pr(Y=2) 0.17925905 0.02886955 0.17848975 0.12828808 0.23966611
## Pr(Y=3) 0.33367070 0.03643288 0.33271969 0.26472402 0.41048671
## Pr(Y=4) 0.03116696 0.01176287 0.02923002 0.01416031 0.05890367
## pv
##          1     2     3     4
## [1,] 0.463 0.169 0.331 0.037
## 
##  sim x1 :
##  -----
## ev
##               mean         sd        50%      2.5%     97.5%
## Pr(Y=1) 0.38960515 0.01976312 0.38895779 0.3530582 0.4287812
## Pr(Y=2) 0.16803049 0.01483874 0.16764603 0.1409012 0.1978972
## Pr(Y=3) 0.35565436 0.01949100 0.35516026 0.3178026 0.3943087
## Pr(Y=4) 0.08671001 0.01218567 0.08618024 0.0654986 0.1126532
## pv
##          1     2     3     4
## [1,] 0.389 0.161 0.353 0.097
## fd
##                mean         sd         50%        2.5%       97.5%
## Pr(Y=1) -0.06629814 0.04037739 -0.06588589 -0.14561358 0.009067949
## Pr(Y=2) -0.01122856 0.02906754 -0.01046869 -0.07418551 0.040683246
## Pr(Y=3)  0.02198365 0.03642315  0.02280510 -0.05050093 0.090661203
## Pr(Y=4)  0.05554305 0.01514309  0.05545652  0.02447837 0.084939039

Based on the data above, we can conclude that the log odds of white hispanics falling within the poverty income class is .065 less than their non-white hispanic counterparts. Furhtermore, the log odds of white hispanics falling within the upper middle class income class os .056 higher than their non-white hispanic counterparts.

Plot - Income Class & Hispanics

par(mar=c(2,2,2,2))
plot(analysis1)