Ricardo Lowe DEM 7223 – Event History Analysis Homework 5
library(car)
## Loading required package: carData
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(muhaz)
library(eha)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ stringr 1.4.0
## ✓ tidyr 1.1.1 ✓ forcats 0.5.0
## ✓ readr 1.3.1
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
## Loading required package: ipumsr
usa_00128 <- read_ipums_ddi("usa_00128.xml")
usa_00128_ <- read_ipums_micro(usa_00128)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
ipums_view(usa_00128_)
usa_00122_ <- usa_00128_
usa_00122_$Female <- ifelse(usa_00122_$SEX == 1, 0, 1)
usa_00122_$MAR <- ifelse(usa_00122_$MARST <= 2, 0, 1)
usa_00122_$HighEd <- ifelse(usa_00122_$EDUC <= 6, 0, 1)
usa_00122_$Rev <- ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG >= 1960 & usa_00122_$YRIMMIG <= 1978, "Pre-",
ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG >= 1979 & usa_00122_$YRIMMIG <= 1983, "During",
ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG > 1983 & usa_00122_$YRIMMIG <= 1990, "Post-",NA)))
usa_00122_$Rev2 <- ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG < 1979, 1,
ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG >= 1979 & usa_00122_$YRIMMIG <= 1983, 2,
ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG > 1983, 3,NA)))
usa_00122_$Rev <-as.factor(usa_00122_$Rev)
usa_00122_$Rev2 <-as.numeric(usa_00122_$Rev2)
usa_00122_$ImmAge <- usa_00122_$YRIMMIG-usa_00122_$BIRTHYR
usa_00122_$AGEC<-cut(usa_00122_$AGE, breaks=c(0,24,39,59,79,99))
usa_00122_$AGEW<-ifelse(usa_00122_$AGE >=15 & usa_00122_$AGE <=24, "15_24",
ifelse(usa_00122_$AGE >=25 & usa_00122_$AGE <=54, "25_54",
ifelse(usa_00122_$AGE >=55 & usa_00122_$AGE <=64, "55_64","Other")))
usa_00122_$AGEW <-as.factor(usa_00122_$AGEW)
usa_00122_$AGEW<-relevel(usa_00122_$AGEW,ref="25_54")
n2 <- usa_00122_%>%
filter(YRIMMIG < 9000 &
BIRTHYR < 9000 &
YRSUSA1 > 5) #&
#RACE == 2
#n2$YRIMMIG3 <- as.numeric(ifelse(n2$YRIMMIG >= 1990,"",n2$YRIMMIG))
n2$YRNATUR3 <- as.numeric(ifelse(n2$YRNATUR >= 9997,NA,n2$YRNATUR))
n2$YRIMMIG3 <- as.numeric((n2$YEAR - n2$BIRTHYR))
n2$YEAR <- as.factor(n2$YEAR)
n2$secbi<-ifelse(is.na(n2$YRNATUR3)==T,
n2$YRIMMIG3,
(n2$YRNATUR3-n2$BIRTHYR))
n2$secbi2<-ifelse(is.na(n2$YRIMMIG)<=1990,
n2$YRIMMIG3,
(n2$YRIMMIG-n2$BIRTHYR))
n2$b2event<-ifelse(is.na(n2$YRNATUR3)==T,0,1)
n2$b2event2<-ifelse(n2$YRIMMIG<=1990,0,1)
n2$b2event<-as.numeric(n2$b2event)
n2$secbi<-as.numeric(n2$secbi)
fit<-survfit(Surv(secbi, b2event)~1, n2)
fit
## Call: survfit(formula = Surv(secbi, b2event) ~ 1, data = n2)
##
## n events median 0.95LCL 0.95UCL
## 911 539 46 43 49
I limited the dataset to Afro-Caribbean immigrants and created my observed and censored duration variables. Here is test output:
n2 %>%
select(RACE, HISPAN,LANGUAGED,BPLD,BIRTHYR,YRIMMIG,YRNATUR,YRSUSA1,YRNATUR3,b2event,secbi) %>%
filter(b2event == 1)
## # A tibble: 539 x 11
## RACE HISPAN LANGUAGED BPLD BIRTHYR YRIMMIG YRNATUR YRSUSA1
## <int+l> <int+l> <int+lbl> <int+lbl> <dbl+l> <int+lbl> <int+l> <int+l>
## 1 2 [Bla… 0 [Not… 100 [Eng… 21070 [Pan… 1959 1986 [198… 1991 27
## 2 2 [Bla… 0 [Not… 100 [Eng… 21070 [Pan… 1935 1957 1957 59
## 3 2 [Bla… 0 [Not… 100 [Eng… 21070 [Pan… 1937 1960 1998 56
## 4 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan… 1940 1962 1986 54
## 5 2 [Bla… 0 [Not… 100 [Eng… 21070 [Pan… 1976 1978 1980 35
## 6 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan… 1923 1948 1958 65
## 7 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan… 1959 1977 1987 36
## 8 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan… 1943 1960 1966 53
## 9 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan… 1934 1968 1968 45
## 10 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan… 1954 1972 1992 41
## # … with 529 more rows, and 3 more variables: YRNATUR3 <dbl>, b2event <dbl>,
## # secbi <dbl>
n2 %>%
group_by(BPLD)%>%
filter(YEAR == 2018)%>%
summarise(Total = sum(PERWT))%>%
arrange(desc(Total))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 2
## BPLD Total
## <int+lbl> <dbl>
## 1 21070 [Panama] 11032
levels(n2$YEAR)
## [1] "2017" "2018"
n2$LANG <- as.factor(ifelse(n2$LANGUAGE == 1, 1, ifelse(n2$LANGUAGE == 12, 0, NA)))
#make person period file
pp<-survSplit(Surv(secbi, b2event)~. , data = n2[n2$secbi>0,],
cut=c(0,12, 24, 36, 48, 60, 72), episode="year_nat")
pp$year <- pp$year_nat-1
pp<-pp[order(pp$CBSERIAL, pp$year_nat),]
head(pp[, c("CBSERIAL", "secbi", "STRATA", "b2event", "year")], n=50)
## CBSERIAL secbi STRATA b2event year
## 678 2.013e+12 12 401536 0 1
## 683 2.013e+12 12 401536 0 1
## 689 2.013e+12 12 401536 0 1
## 679 2.013e+12 24 401536 0 2
## 684 2.013e+12 24 401536 0 2
## 690 2.013e+12 24 401536 0 2
## 680 2.013e+12 36 401536 0 3
## 685 2.013e+12 36 401536 0 3
## 691 2.013e+12 36 401536 0 3
## 681 2.013e+12 48 401536 0 4
## 686 2.013e+12 48 401536 0 4
## 692 2.013e+12 48 401536 0 4
## 682 2.013e+12 51 401536 1 5
## 687 2.013e+12 60 401536 0 5
## 693 2.013e+12 52 401536 0 5
## 688 2.013e+12 65 401536 0 6
## 624 2.013e+12 12 90434 0 1
## 625 2.013e+12 21 90434 1 2
## 694 2.013e+12 12 410536 0 1
## 695 2.013e+12 24 410536 0 2
## 696 2.013e+12 28 410536 1 3
## 697 2.013e+12 12 370136 0 1
## 698 2.013e+12 24 370136 0 2
## 699 2.013e+12 36 370136 0 3
## 700 2.013e+12 48 370136 0 4
## 701 2.013e+12 60 370136 0 5
## 702 2.013e+12 72 370136 0 6
## 703 2.013e+12 73 370136 1 7
## 1579 2.013e+12 12 231848 0 1
## 1580 2.013e+12 24 231848 0 2
## 1581 2.013e+12 36 231848 0 3
## 1582 2.013e+12 41 231848 0 4
## 25 2.013e+12 4 530206 1 1
## 704 2.013e+12 12 410536 0 1
## 705 2.013e+12 18 410536 1 2
## 706 2.013e+12 12 410536 0 1
## 709 2.013e+12 12 410536 0 1
## 707 2.013e+12 24 410536 0 2
## 710 2.013e+12 24 410536 0 2
## 708 2.013e+12 33 410536 1 3
## 711 2.013e+12 25 410536 1 3
## 509 2.013e+12 12 50224 0 1
## 510 2.013e+12 24 50224 0 2
## 511 2.013e+12 25 50224 0 3
## 712 2.013e+12 12 400936 0 1
## 716 2.013e+12 12 400936 0 1
## 713 2.013e+12 24 400936 0 2
## 717 2.013e+12 24 400936 0 2
## 714 2.013e+12 36 400936 0 3
## 718 2.013e+12 34 400936 1 3
pp%>% group_by(year)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line()+
ggtitle(label = "Hazard of naturalizing by year after eligibility")
## `summarise()` ungrouping output (override with `.groups` argument)
pp%>%
group_by(year, MAR)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line(aes(group=factor(MAR), color=factor(MAR) ))+
ggtitle(label = "Hazard")
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
#generate survey design
des<-svydesign(ids=~SERIAL, strata = ~STRATA, weights=~PERWT, data=pp)
#https://r-survey.r-forge.r-project.org/survey/exmample-lonely.html
options(survey.lonely.psu="adjust")
fit.0<-svyglm(b2event~as.factor(year)-1, design=des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year) - 1, design = des,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~SERIAL, strata = ~STRATA, weights = ~PERWT,
## data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year)1 -3.90391 0.16627 -23.479 < 2e-16 ***
## as.factor(year)2 -2.13160 0.10934 -19.495 < 2e-16 ***
## as.factor(year)3 -1.21131 0.08252 -14.679 < 2e-16 ***
## as.factor(year)4 -1.21087 0.09640 -12.561 < 2e-16 ***
## as.factor(year)5 -1.36102 0.12462 -10.921 < 2e-16 ***
## as.factor(year)6 -1.55716 0.18041 -8.631 < 2e-16 ***
## as.factor(year)7 -1.49149 0.21297 -7.003 7.99e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.00028)
##
## Number of Fisher Scoring iterations: 6
#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,7,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Natrualization")
fit.1<-svyglm(b2event~as.factor(year)+MAR-1,design=des , family=binomial(link="cloglog"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.1)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year) + MAR - 1, design = des,
## family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~SERIAL, strata = ~STRATA, weights = ~PERWT,
## data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year)1 -3.79447 0.16992 -22.332 < 2e-16 ***
## as.factor(year)2 -2.06711 0.12803 -16.145 < 2e-16 ***
## as.factor(year)3 -1.22300 0.09475 -12.907 < 2e-16 ***
## as.factor(year)4 -1.22070 0.10520 -11.604 < 2e-16 ***
## as.factor(year)5 -1.35644 0.13052 -10.392 < 2e-16 ***
## as.factor(year)6 -1.51902 0.18533 -8.196 2.05e-15 ***
## as.factor(year)7 -1.45964 0.22778 -6.408 3.38e-10 ***
## MAR -0.20584 0.09664 -2.130 0.0337 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9958998)
##
## Number of Fisher Scoring iterations: 6
dat4<-expand.grid(year=1:7, MAR=c(0,1))
dat4$pred<-as.numeric(predict(fit.1, newdata=dat4, type="response"))
dat4%>%
ggplot(aes(x=year, y=pred,color=factor(MAR), group=factor(MAR) ))+ geom_line()+
ggtitle(label="Hazard of Naturalization by Marital Status and Time since Elgibility")