In the first part of this week’s assignment I will first recreate the titanic example slides using Zelig5 syntax. In the second part of this week’s assignment I will use logit regression models and the Zelig package with Zelig4 syntax to look at income differences using extracted data from the 1994 US Census Data set. I will look at gender, race, number of years of education to analyze at differences within the categories.
library(Zelig)
library(readr)
library(pander)
library(radiant.data)
library(texreg)
library(lmtest)
library(visreg)
library(tidyverse)
library(sjmisc)
data(titanic)
head(titanic)
titanic2 <- titanic%>%
mutate(age = as.integer(age))%>%
mutate(surv = as.integer(survived))%>%
mutate(survival = sjmisc::rec(surv, rec = "2=0; 1=1")) %>%
select(pclass, survived, survival, everything())
head(titanic2)
z5titanic <- zlogit$new()
z5titanic$zelig(survival ~ age + sex*pclass + fare, data = titanic2)
summary(z5titanic)
Model:
Call:
z5titanic$zelig(formula = survival ~ age + sex * pclass + fare,
data = titanic2)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0628 -0.6636 -0.4941 0.4337 2.4940
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.8959649 0.6128145 7.989 0.00000000000000136
age -0.0386781 0.0067926 -5.694 0.00000001239977237
sexmale -3.9001038 0.5015680 -7.776 0.00000000000000750
pclass2nd -1.5922712 0.6024689 -2.643 0.00822
pclass3rd -4.1381922 0.5601346 -7.388 0.00000000000014922
fare -0.0009074 0.0020510 -0.442 0.65820
sexmale:pclass2nd -0.0603255 0.6373047 -0.095 0.92459
sexmale:pclass3rd 2.5016703 0.5479814 4.565 0.00000498908018340
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1409.99 on 1042 degrees of freedom
Residual deviance: 931.42 on 1035 degrees of freedom
AIC: 947.42
Number of Fisher Scoring iterations: 5
Next step: Use 'setx' method
z5titanic$setrange(age = min(titanic2$age):max(titanic2$age))
z5titanic$sim()
ci.plot(z5titanic)
z5titanicfare <- zlogit$new()
z5titanicfare$zelig(survival ~ age + sex*pclass + fare, data = titanic2)
z5titanicfare$setrange(fare = min(titanic2$fare):max(titanic2$fare))
z5titanicfare$sim()
ci.plot(z5titanicfare)
z5titanicgender <- zlogit$new()
z5titanicgender$zelig(survival ~ age + sex*pclass + fare, data = titanic2)
z5titanicgender$setx(sex = "male")
z5titanicgender$setx1(sex = "female")
z5titanicgender$sim()
summary(z5titanicgender)
sim x :
-----
ev
mean sd 50% 2.5% 97.5%
[1,] 0.1407152 0.02003904 0.1394777 0.105417 0.1836181
pv
0 1
[1,] 0.869 0.131
sim x1 :
-----
ev
mean sd 50% 2.5% 97.5%
[1,] 0.3977725 0.0451144 0.3966921 0.3143706 0.4929217
pv
0 1
[1,] 0.583 0.417
fd
mean sd 50% 2.5% 97.5%
[1,] 0.2570573 0.04502188 0.2554332 0.1756974 0.3481262
plot(z5titanicgender)
z5gender1<- zlogit$new()
z5gender1$zelig(survival ~ age + sex*pclass + fare, data = titanic2)
z5gender1$setx(sex = "male", pclass = "1st")
z5gender1$setx1(sex = "female", pclass = "1st")
z5gender1$sim()
z5gender2 <- zlogit$new()
z5gender2$zelig(survival ~ age + sex*pclass + fare, data = titanic2)
z5gender2$setx(sex = "male", pclass = "2nd")
z5gender2$setx1(sex = "female", pclass = "2nd")
z5gender2$sim()
z5gender3 <- zlogit$new()
z5gender3$zelig(survival ~ age + sex*pclass + fare, data = titanic2)
z5gender3$setx(sex = "male", pclass = "1st")
z5gender3$setx1(sex = "female", pclass = "1st")
z5gender3$sim()
plot(z5gender1)
plot(z5gender2)
plot(z5gender3)
d1 <- z5gender1$get_qi(xvalue="x1", qi="fd")
d2 <- z5gender2$get_qi(xvalue="x1", qi="fd")
d3 <- z5gender3$get_qi(xvalue="x1", qi="fd")
dfd <- as.data.frame(cbind(d1, d2, d3))
dfd
tidd <- dfd %>%
gather(class, simv, 1:3)
head(tidd)
tidd %>%
group_by(class) %>%
summarise(mean = mean(simv), sd = sd(simv))
library(ggplot2)
ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~class)
Income<- read_csv("C:/Users/Papa/Desktop/Soc 712 -R/adult.csv", col_names = TRUE)
head(Income)
Income2<-Income %>% mutate(newsex = factor(ifelse(sex == "Female",1, ifelse(sex =="Male",0, "no")))) %>%
mutate(newrace = factor(ifelse(race == "White", 1,
ifelse(race == "Asian-Pac-Islander", 2,
ifelse(race == "Black", 3,
ifelse(race == "Other", 4,
ifelse(race == "Amer-Indian-Eskimo", 5, "error")))))))%>%
mutate(newincome= ifelse(income ==">50K",1,0))%>%
mutate(incomelevel = capital.gain - capital.loss)
Income2
z_income <- zelig(newincome ~ age + newsex*newrace + education.num, model = "logit", data = Income2, cite = F)
summary(z_income)
Model:
Call: z5$zelig(formula = newincome ~ age + newsex * newrace + education.num, data = Income2)
Deviance Residuals: Min 1Q Median 3Q Max
-2.4255 -0.6737 -0.4392 -0.1347 3.2191
Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -6.351948 0.090912 -69.869 < 0.0000000000000002 age 0.042181 0.001133 37.221 < 0.0000000000000002 newsex1 -1.303943 0.039625 -32.907 < 0.0000000000000002 newrace2 -0.289713 0.091746 -3.158 0.001590 newrace3 -0.410500 0.071930 -5.707 0.0000000115 newrace4 -0.908946 0.266658 -3.409 0.000653 newrace5 -0.801809 0.233548 -3.433 0.000597 education.num 0.366546 0.006585 55.664 < 0.0000000000000002 newsex1:newrace2 0.320664 0.197456 1.624 0.104381 newsex1:newrace3 -0.167422 0.137700 -1.216 0.224045 newsex1:newrace4 0.539736 0.515293 1.047 0.294899 newsex1:newrace5 0.791635 0.397953 1.989 0.046672
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 35948 on 32560 degrees of freedom
Residual deviance: 28622 on 32549 degrees of freedom AIC: 28646
Number of Fisher Scoring iterations: 5
Next step: Use ‘setx’ method
htmlreg(list(z_income))
| Model 1 | ||
|---|---|---|
| (Intercept) | -6.35*** | |
| (0.09) | ||
| age | 0.04*** | |
| (0.00) | ||
| newsex1 | -1.30*** | |
| (0.04) | ||
| newrace2 | -0.29** | |
| (0.09) | ||
| newrace3 | -0.41*** | |
| (0.07) | ||
| newrace4 | -0.91*** | |
| (0.27) | ||
| newrace5 | -0.80*** | |
| (0.23) | ||
| education.num | 0.37*** | |
| (0.01) | ||
| newsex1:newrace2 | 0.32 | |
| (0.20) | ||
| newsex1:newrace3 | -0.17 | |
| (0.14) | ||
| newsex1:newrace4 | 0.54 | |
| (0.52) | ||
| newsex1:newrace5 | 0.79* | |
| (0.40) | ||
| AIC | 28646.35 | |
| BIC | 28747.04 | |
| Log Likelihood | -14311.17 | |
| Deviance | 28622.35 | |
| Num. obs. | 32561 | |
| p < 0.001, p < 0.01, p < 0.05 | ||
summary(Income2$age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
17.00 28.00 37.00 38.58 48.00 90.00
age.range = min(Income2$age):max(Income2$age,age =95)
x <- setx(z_income, age = age.range)
s <- sim(z_income, x = x)
ci.plot(s)
numeducation.range = min(Income2$education.num):max(Income2$education.num)
x <- setx(z_income, education.num = numeducation.range)
s <- sim(z_income, x = x)
ci.plot(s)
x <- setx(z_income, newsex = "0")
x1 <- setx(z_income, newsex = "1")
s <- sim(z_income, x = x, x1 = x1)
summary(s)
sim x :
-----
ev
mean sd 50% 2.5% 97.5%
[1,] 0.26296 0.003574392 0.2628967 0.256062 0.2702805
pv
0 1
[1,] 0.738 0.262
sim x1 :
-----
ev
mean sd 50% 2.5% 97.5%
[1,] 0.08853573 0.003002269 0.0884863 0.08252058 0.09416025
pv
0 1
[1,] 0.932 0.068
fd
mean sd 50% 2.5% 97.5%
[1,] -0.1744243 0.0044224 -0.1744116 -0.18287 -0.1657943
fd <- s$get_qi(xvalue="x1", qi="fd")
summary(fd)
V1
Min. :-0.1891
1st Qu.:-0.1775
Median :-0.1744
Mean :-0.1744
3rd Qu.:-0.1716
Max. :-0.1584
plot(s)
r1x <- setx(z_income, newsex = "0", newrace = "1")
r1x1 <- setx(z_income, newsex = "1", newrace = "1")
r1s <- sim(z_income, x = r1x, x1 = r1x1)
r2x <- setx(z_income, newsex = "0", newrace = "2")
r2x1 <- setx(z_income, newsex = "1", newrace = "2")
r2s <- sim(z_income, x = r2x, x1 = r2x1)
r3x <- setx(z_income, newsex = "0", newrace = "3")
r3x1 <- setx(z_income, newsex = "1", newrace = "3")
r3s <- sim(z_income, x = r3x, x1 = r3x1)
r4x <- setx(z_income, newsex = "0", newrace = "4")
r4x1 <- setx(z_income, newsex = "1", newrace = "4")
r4s <- sim(z_income, x = r4x, x1 = r4x1)
r5x <- setx(z_income, newsex = "0", newrace = "4")
r5x1 <- setx(z_income, newsex = "1", newrace = "4")
r5s <- sim(z_income, x = r5x, x1 = r5x1)
plot(r1s)
plot(r2s)
plot(r3s)
plot(r4s)
plot(r5s)
d1 <- r1s$get_qi(xvalue="x1", qi="fd")
d2 <- r2s$get_qi(xvalue="x1", qi="fd")
d3 <- r3s$get_qi(xvalue="x1", qi="fd")
d4 <- r4s$get_qi(xvalue="x1", qi="fd")
d5 <- r5s$get_qi(xvalue="x1", qi="fd")
dfd <- as.data.frame(cbind(d1, d2, d3, d4, d5))
head(dfd)
In the second part of the homework we used a logit regression model to look at income differences. From the graphs we can see that years of education and age do affect income positively. From our predicted and expectected values we can also see that there are significant differences in income between men and women and that income tends to vary significantly among the races.