library(readr)
AHS <- read_csv("/Users/rafaelareavlo/Desktop/household.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 37653540 parsing failures.
## row col expected actual file
## 1893 JPORCH no trailing characters ' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JPOVLVLINC a double '0' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JRADONACT a double '0' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JRADONSUG a double '0' '/Users/rafaelareavlo/Desktop/household.csv'
## 1893 JRADONTEST a double '0' '/Users/rafaelareavlo/Desktop/household.csv'
## .... .......... ...................... ...... ............................................
## See problems(...) for more details.
library(dplyr)
library(tidyverse)
library(tibble)
AHS1 <- select(AHS, "HHRACE", "HHSPAN", "HHSEX", "HINCP")
AHS1 <- AHS1 %>%
rename('Race' = HHRACE,
'Hispanic' = HHSPAN,
'Gender' = HHSEX,
'Income' = HINCP)
AHS1$Race[AHS1$Race == -6] <- "0"
AHS1$Hispanic[AHS1$Hispanic == -6] <- "0"
AHS1$Income[AHS1$Income == -6] <- "0"
AHS1$Gender[AHS1$Gender == -6] <- "0"
AHS1 <- na.omit(AHS1)
AHS1$Income.Class <- as.numeric(AHS1$Income)
AHS1$Income.Class[AHS1$Income <= 25000] <- '1'
AHS1$Income.Class[AHS1$Income > 25000 & AHS1$Income < 45000] <- '2'
AHS1$Income.Class[AHS1$Income >= 45000 & AHS1$Income <= 140000 ] <- '3'
AHS1$Income.Class[AHS1$Income > 140000] <- '4'
AHS1$Race.Categorized <- as.character(AHS1$Race)
AHS1$Race.Categorized[AHS1$Race.Categorized == 1] <- "White"
AHS1$Race.Categorized[AHS1$Race.Categorized == 2] <- "Black"
AHS1$Race.Categorized[AHS1$Race.Categorized == 3] <- "Asian"
AHS1$Race.Categorized[AHS1$Race.Categorized == 4] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 5] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 0] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 6] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 7] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 9] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 10] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 14] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 15] <- "Other"
AHS1$Race.Categorized[AHS1$Race.Categorized == 18] <- "Other"
AHS1 = AHS1 %>%
arrange(Race.Categorized)
AHS2 <- AHS1[-c(370:770),]
For this week’s assignment, I will be using the 2015 American Housing Survey which is composed of over 100 variables and 66K observations. Using this dataset, I will mostly be focusing on the following question:
The exploratory variable is Income and the dependent variable is Gender. The control variables are composed of: * Race * Hispanic * Income Class
a1 <- glm(Gender ~ Income, family = 'binomial', data = AHS2)
summary(a1)
##
## Call:
## glm(formula = Gender ~ Income, family = "binomial", data = AHS2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.303 -1.257 1.058 1.083 4.090
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.907e-01 6.691e-02 4.344 1.4e-05 ***
## Income -1.901e-06 7.129e-07 -2.667 0.00765 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2055.8 on 1490 degrees of freedom
## Residual deviance: 2043.9 on 1489 degrees of freedom
## AIC: 2047.9
##
## Number of Fisher Scoring iterations: 5
Model 1 indicates that on average, females make an income that is 1.9 less than makes.
a2 <- glm(Gender ~ Income + Hispanic, family = 'binomial', data = AHS2)
summary(a2)
##
## Call:
## glm(formula = Gender ~ Income + Hispanic, family = "binomial",
## data = AHS2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.331 -1.266 1.033 1.065 4.194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.505e-02 1.474e-01 -0.509 0.61063
## Income -2.010e-06 7.234e-07 -2.779 0.00546 **
## Hispanic2 4.295e-01 1.541e-01 2.787 0.00532 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2055.8 on 1490 degrees of freedom
## Residual deviance: 2036.1 on 1488 degrees of freedom
## AIC: 2042.1
##
## Number of Fisher Scoring iterations: 5
Model 2 adds the variable of hispanic and indicates that non-hispanic make an income average of 4.29 higher than compared to their hispanic conunter-parts.
a3 <- glm(Gender ~ Income + Hispanic + Race, family = 'binomial', data = AHS2)
summary(a3)
##
## Call:
## glm(formula = Gender ~ Income + Hispanic + Race, family = "binomial",
## data = AHS2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5081 -1.2078 0.8848 1.1224 3.6484
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.386e-01 1.481e-01 -0.936 0.3493
## Income -1.499e-06 6.771e-07 -2.214 0.0268 *
## Hispanic2 3.073e-01 1.571e-01 1.956 0.0505 .
## Race2 5.816e-01 1.316e-01 4.420 9.89e-06 ***
## Race3 5.981e-01 4.742e-01 1.261 0.2072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2055.8 on 1490 degrees of freedom
## Residual deviance: 2015.0 on 1486 degrees of freedom
## AIC: 2025
##
## Number of Fisher Scoring iterations: 5
Model 3 takes the variable of race into consideration and indicates non-hispanic blacks, on average make an income thats 3.07 higher when compared to the income earned by black hispanics.
| Model 1 | Model 2 | Model 3 | ||
|---|---|---|---|---|
| (Intercept) | 0.291*** | -0.075 | -0.139 | |
| (0.067) | (0.147) | (0.148) | ||
| Income | -0.000** | -0.000** | -0.000* | |
| (0.000) | (0.000) | (0.000) | ||
| Hispanic2 | 0.429** | 0.307 | ||
| (0.154) | (0.157) | |||
| Race2 | 0.582*** | |||
| (0.132) | ||||
| Race3 | 0.598 | |||
| (0.474) | ||||
| AIC | 2047.888 | 2042.082 | 2024.970 | |
| BIC | 2058.503 | 2058.004 | 2051.506 | |
| Log Likelihood | -1021.944 | -1018.041 | -1007.485 | |
| Deviance | 2043.888 | 2036.082 | 2014.970 | |
| Num. obs. | 1491 | 1491 | 1491 | |
| p < 0.001, p < 0.01, p < 0.05 | ||||
Based on both the AIC and BIC, Model 3 is conisdered to be the best model to be used for further analysis.
library(visreg)
visreg(a3, "Income", by = "Hispanic", scale = "response")
Upon looking at the graph for measuring income by gender and controlling for hispanic, we can conclude that not only do female make considerably less money than their male counter parts, but that hispanic females tend to make less than their non-hispanic counterparts.
analysis <- zelig(Income.Class ~ Hispanic + Gender + Race, model = 'mlogit', data = AHS2)
## How to cite this model in Zelig:
## Thomas W. Yee. 2007.
## mlogit: Multinomial Logistic Regression for Dependent Variables with Unordered Categorical Values
## in Christine Choirat, Christopher Gandrud, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
## "Zelig: Everyone's Statistical Software," http://zeligproject.org/
summary(analysis)
## Model:
##
## Call:
## z5$zelig(formula = Income.Class ~ Hispanic + Gender + Race, data = AHS2)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## log(mu[,1]/mu[,4]) -3.538 -0.5663 -0.2974 1.0175 1.651
## log(mu[,2]/mu[,4]) -3.082 -0.2886 -0.2676 -0.2227 2.156
## log(mu[,3]/mu[,4]) -3.381 -0.5508 -0.4340 1.0407 1.837
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 1.75955 0.38063 4.623 3.79e-06
## (Intercept):2 1.24763 0.39953 3.123 0.00179
## (Intercept):3 2.15483 0.37720 5.713 1.11e-08
## Hispanic2:1 -1.25023 0.38979 -3.207 0.00134
## Hispanic2:2 -1.15331 0.41023 -2.811 0.00493
## Hispanic2:3 -1.02937 0.38671 -2.662 0.00777
## Gender2:1 0.99521 0.19919 4.996 5.85e-07
## Gender2:2 0.57581 0.21825 2.638 0.00833
## Gender2:3 0.29303 0.19699 1.488 0.13688
## Race2:1 1.17166 0.26733 4.383 1.17e-05
## Race2:2 0.70706 0.29239 2.418 0.01560
## Race2:3 0.09934 0.27908 0.356 0.72188
## Race3:1 0.87447 1.07074 0.817 0.41410
## Race3:2 0.79248 1.13017 0.701 0.48318
## Race3:3 0.36649 1.08835 0.337 0.73632
##
## Names of linear predictors: log(mu[,1]/mu[,4]), log(mu[,2]/mu[,4]),
## log(mu[,3]/mu[,4])
##
## Residual deviance: 3616.616 on 4458 degrees of freedom
##
## Log-likelihood: -1808.308 on 4458 degrees of freedom
##
## Number of Fisher scoring iterations: 5
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Reference group is level 4 of the response
## Next step: Use 'setx' method
The independent variable used for the multinominal analysis is income.class and the dependent variables are: * Hispanic * Gender * Race
Similar to the regression analysis, the primary focus of this analysis is to determine if the income level that hispanics/non-hispanics would fall in. Under the Hispanics 2, the data indicates that non-hispanics are least likely to fall within the poverty income class. Upon further analysis of the Race variables, we can intepret the data as stating that the log odds of being within the poverty income class for blacks is 1.17 higher than the other race categories. Upom analyzing gender, we can state that the log odds of females falling within the poverty income class is 1.0 higher than that of their male counter parts.
hispanic <- setx(analysis, Hispanic = 1)
non.hispanic <- setx(analysis, Hispanic = 2)
analysis1 <- sim(analysis, x=hispanic, x1=non.hispanic)
summary(analysis1)
##
## sim x :
## -----
## ev
## mean sd 50% 2.5% 97.5%
## Pr(Y=1) 0.45590329 0.03978101 0.45256101 0.38780288 0.53635060
## Pr(Y=2) 0.17925905 0.02886955 0.17848975 0.12828808 0.23966611
## Pr(Y=3) 0.33367070 0.03643288 0.33271969 0.26472402 0.41048671
## Pr(Y=4) 0.03116696 0.01176287 0.02923002 0.01416031 0.05890367
## pv
## 1 2 3 4
## [1,] 0.463 0.169 0.331 0.037
##
## sim x1 :
## -----
## ev
## mean sd 50% 2.5% 97.5%
## Pr(Y=1) 0.38960515 0.01976312 0.38895779 0.3530582 0.4287812
## Pr(Y=2) 0.16803049 0.01483874 0.16764603 0.1409012 0.1978972
## Pr(Y=3) 0.35565436 0.01949100 0.35516026 0.3178026 0.3943087
## Pr(Y=4) 0.08671001 0.01218567 0.08618024 0.0654986 0.1126532
## pv
## 1 2 3 4
## [1,] 0.389 0.161 0.353 0.097
## fd
## mean sd 50% 2.5% 97.5%
## Pr(Y=1) -0.06629814 0.04037739 -0.06588589 -0.14561358 0.009067949
## Pr(Y=2) -0.01122856 0.02906754 -0.01046869 -0.07418551 0.040683246
## Pr(Y=3) 0.02198365 0.03642315 0.02280510 -0.05050093 0.090661203
## Pr(Y=4) 0.05554305 0.01514309 0.05545652 0.02447837 0.084939039
Based on the data above, we can conclude that the log odds of white hispanics falling within the poverty income class is .065 less than their non-white hispanic counterparts. Furhtermore, the log odds of white hispanics falling within the upper middle class income class os .056 higher than their non-white hispanic counterparts.
par(mar=c(2,2,2,2))
plot(analysis1)