Ricardo Lowe DEM 7223 – Event History Analysis Homework 5

library(car)
## Loading required package: carData
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(muhaz)
library(eha)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ tidyr   1.1.1     ✓ forcats 0.5.0
## ✓ readr   1.3.1
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
## Loading required package: ipumsr
usa_00128 <- read_ipums_ddi("usa_00128.xml")
usa_00128_ <- read_ipums_micro(usa_00128)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
ipums_view(usa_00128_)
usa_00122_ <- usa_00128_

usa_00122_$Female <-   ifelse(usa_00122_$SEX == 1, 0, 1)
usa_00122_$MAR <-   ifelse(usa_00122_$MARST <= 2, 0, 1)

usa_00122_$HighEd <-   ifelse(usa_00122_$EDUC <= 6, 0, 1)

usa_00122_$Rev <-     ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG >= 1960 & usa_00122_$YRIMMIG <= 1978, "Pre-",
                      ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG >= 1979 & usa_00122_$YRIMMIG <= 1983, "During",
                      ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG > 1983 & usa_00122_$YRIMMIG <= 1990, "Post-",NA)))

usa_00122_$Rev2 <-     ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG < 1979, 1,
                      ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG >= 1979 & usa_00122_$YRIMMIG <= 1983, 2,
                      ifelse(usa_00122_$BPLD == 26055 & usa_00122_$YRIMMIG > 1983, 3,NA)))


usa_00122_$Rev <-as.factor(usa_00122_$Rev)
usa_00122_$Rev2 <-as.numeric(usa_00122_$Rev2)



usa_00122_$ImmAge <-  usa_00122_$YRIMMIG-usa_00122_$BIRTHYR

usa_00122_$AGEC<-cut(usa_00122_$AGE, breaks=c(0,24,39,59,79,99))

usa_00122_$AGEW<-ifelse(usa_00122_$AGE >=15 & usa_00122_$AGE <=24, "15_24",
                 ifelse(usa_00122_$AGE >=25 & usa_00122_$AGE <=54, "25_54",
                 ifelse(usa_00122_$AGE >=55 & usa_00122_$AGE <=64, "55_64","Other")))
                        
usa_00122_$AGEW <-as.factor(usa_00122_$AGEW)
usa_00122_$AGEW<-relevel(usa_00122_$AGEW,ref="25_54")
n2 <- usa_00122_%>%
  filter(YRIMMIG < 9000 & 
         BIRTHYR < 9000 &
         YRSUSA1 > 5) #& 
         #RACE == 2
         


#n2$YRIMMIG3 <- as.numeric(ifelse(n2$YRIMMIG >= 1990,"",n2$YRIMMIG))
n2$YRNATUR3 <- as.numeric(ifelse(n2$YRNATUR >= 9997,NA,n2$YRNATUR))
n2$YRIMMIG3 <- as.numeric((n2$YEAR - n2$BIRTHYR))


n2$YEAR <- as.factor(n2$YEAR)

n2$secbi<-ifelse(is.na(n2$YRNATUR3)==T, 
                 n2$YRIMMIG3, 
                 (n2$YRNATUR3-n2$BIRTHYR))
                 
n2$secbi2<-ifelse(is.na(n2$YRIMMIG)<=1990, 
                 n2$YRIMMIG3, 
                 (n2$YRIMMIG-n2$BIRTHYR))
                                  


n2$b2event<-ifelse(is.na(n2$YRNATUR3)==T,0,1) 
n2$b2event2<-ifelse(n2$YRIMMIG<=1990,0,1)

n2$b2event<-as.numeric(n2$b2event)
n2$secbi<-as.numeric(n2$secbi)


fit<-survfit(Surv(secbi, b2event)~1, n2)
fit
## Call: survfit(formula = Surv(secbi, b2event) ~ 1, data = n2)
## 
##       n  events  median 0.95LCL 0.95UCL 
##     911     539      46      43      49

I limited the dataset to Afro-Caribbean immigrants and created my observed and censored duration variables. Here is test output:

n2 %>% 
  select(RACE, HISPAN,LANGUAGED,BPLD,BIRTHYR,YRIMMIG,YRNATUR,YRSUSA1,YRNATUR3,b2event,secbi) %>%
  filter(b2event == 1)
## # A tibble: 539 x 11
##       RACE  HISPAN  LANGUAGED        BPLD BIRTHYR    YRIMMIG YRNATUR YRSUSA1
##    <int+l> <int+l>  <int+lbl>   <int+lbl> <dbl+l>  <int+lbl> <int+l> <int+l>
##  1 2 [Bla… 0 [Not…  100 [Eng… 21070 [Pan…    1959 1986 [198…    1991      27
##  2 2 [Bla… 0 [Not…  100 [Eng… 21070 [Pan…    1935 1957          1957      59
##  3 2 [Bla… 0 [Not…  100 [Eng… 21070 [Pan…    1937 1960          1998      56
##  4 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan…    1940 1962          1986      54
##  5 2 [Bla… 0 [Not…  100 [Eng… 21070 [Pan…    1976 1978          1980      35
##  6 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan…    1923 1948          1958      65
##  7 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan…    1959 1977          1987      36
##  8 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan…    1943 1960          1966      53
##  9 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan…    1934 1968          1968      45
## 10 2 [Bla… 0 [Not… 1200 [Spa… 21070 [Pan…    1954 1972          1992      41
## # … with 529 more rows, and 3 more variables: YRNATUR3 <dbl>, b2event <dbl>,
## #   secbi <dbl>
n2 %>% 
     group_by(BPLD)%>% 
     filter(YEAR == 2018)%>%
     summarise(Total = sum(PERWT))%>%
     arrange(desc(Total))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 2
##             BPLD Total
##        <int+lbl> <dbl>
## 1 21070 [Panama] 11032
levels(n2$YEAR)
## [1] "2017" "2018"
n2$LANG <- as.factor(ifelse(n2$LANGUAGE == 1, 1, ifelse(n2$LANGUAGE == 12, 0, NA)))

#make person period file
pp<-survSplit(Surv(secbi, b2event)~. , data = n2[n2$secbi>0,], 
              cut=c(0,12, 24, 36, 48, 60, 72), episode="year_nat")

pp$year <- pp$year_nat-1
pp<-pp[order(pp$CBSERIAL, pp$year_nat),]
head(pp[, c("CBSERIAL", "secbi", "STRATA", "b2event", "year")], n=50)
##       CBSERIAL secbi STRATA b2event year
## 678  2.013e+12    12 401536       0    1
## 683  2.013e+12    12 401536       0    1
## 689  2.013e+12    12 401536       0    1
## 679  2.013e+12    24 401536       0    2
## 684  2.013e+12    24 401536       0    2
## 690  2.013e+12    24 401536       0    2
## 680  2.013e+12    36 401536       0    3
## 685  2.013e+12    36 401536       0    3
## 691  2.013e+12    36 401536       0    3
## 681  2.013e+12    48 401536       0    4
## 686  2.013e+12    48 401536       0    4
## 692  2.013e+12    48 401536       0    4
## 682  2.013e+12    51 401536       1    5
## 687  2.013e+12    60 401536       0    5
## 693  2.013e+12    52 401536       0    5
## 688  2.013e+12    65 401536       0    6
## 624  2.013e+12    12  90434       0    1
## 625  2.013e+12    21  90434       1    2
## 694  2.013e+12    12 410536       0    1
## 695  2.013e+12    24 410536       0    2
## 696  2.013e+12    28 410536       1    3
## 697  2.013e+12    12 370136       0    1
## 698  2.013e+12    24 370136       0    2
## 699  2.013e+12    36 370136       0    3
## 700  2.013e+12    48 370136       0    4
## 701  2.013e+12    60 370136       0    5
## 702  2.013e+12    72 370136       0    6
## 703  2.013e+12    73 370136       1    7
## 1579 2.013e+12    12 231848       0    1
## 1580 2.013e+12    24 231848       0    2
## 1581 2.013e+12    36 231848       0    3
## 1582 2.013e+12    41 231848       0    4
## 25   2.013e+12     4 530206       1    1
## 704  2.013e+12    12 410536       0    1
## 705  2.013e+12    18 410536       1    2
## 706  2.013e+12    12 410536       0    1
## 709  2.013e+12    12 410536       0    1
## 707  2.013e+12    24 410536       0    2
## 710  2.013e+12    24 410536       0    2
## 708  2.013e+12    33 410536       1    3
## 711  2.013e+12    25 410536       1    3
## 509  2.013e+12    12  50224       0    1
## 510  2.013e+12    24  50224       0    2
## 511  2.013e+12    25  50224       0    3
## 712  2.013e+12    12 400936       0    1
## 716  2.013e+12    12 400936       0    1
## 713  2.013e+12    24 400936       0    2
## 717  2.013e+12    24 400936       0    2
## 714  2.013e+12    36 400936       0    3
## 718  2.013e+12    34 400936       1    3
pp%>% group_by(year)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line()+
ggtitle(label = "Hazard of naturalizing by year after eligibility")
## `summarise()` ungrouping output (override with `.groups` argument)

pp%>%
group_by(year, MAR)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line(aes(group=factor(MAR), color=factor(MAR) ))+
ggtitle(label = "Hazard")
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

#generate survey design
des<-svydesign(ids=~SERIAL, strata = ~STRATA, weights=~PERWT, data=pp)

#https://r-survey.r-forge.r-project.org/survey/exmample-lonely.html
options(survey.lonely.psu="adjust")
fit.0<-svyglm(b2event~as.factor(year)-1, design=des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)
## 
## Call:
## svyglm(formula = b2event ~ as.factor(year) - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~SERIAL, strata = ~STRATA, weights = ~PERWT, 
##     data = pp)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1 -3.90391    0.16627 -23.479  < 2e-16 ***
## as.factor(year)2 -2.13160    0.10934 -19.495  < 2e-16 ***
## as.factor(year)3 -1.21131    0.08252 -14.679  < 2e-16 ***
## as.factor(year)4 -1.21087    0.09640 -12.561  < 2e-16 ***
## as.factor(year)5 -1.36102    0.12462 -10.921  < 2e-16 ***
## as.factor(year)6 -1.55716    0.18041  -8.631  < 2e-16 ***
## as.factor(year)7 -1.49149    0.21297  -7.003 7.99e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00028)
## 
## Number of Fisher Scoring iterations: 6
#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,7,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Natrualization")

fit.1<-svyglm(b2event~as.factor(year)+MAR-1,design=des , family=binomial(link="cloglog"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.1)
## 
## Call:
## svyglm(formula = b2event ~ as.factor(year) + MAR - 1, design = des, 
##     family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~SERIAL, strata = ~STRATA, weights = ~PERWT, 
##     data = pp)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1 -3.79447    0.16992 -22.332  < 2e-16 ***
## as.factor(year)2 -2.06711    0.12803 -16.145  < 2e-16 ***
## as.factor(year)3 -1.22300    0.09475 -12.907  < 2e-16 ***
## as.factor(year)4 -1.22070    0.10520 -11.604  < 2e-16 ***
## as.factor(year)5 -1.35644    0.13052 -10.392  < 2e-16 ***
## as.factor(year)6 -1.51902    0.18533  -8.196 2.05e-15 ***
## as.factor(year)7 -1.45964    0.22778  -6.408 3.38e-10 ***
## MAR              -0.20584    0.09664  -2.130   0.0337 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9958998)
## 
## Number of Fisher Scoring iterations: 6
dat4<-expand.grid(year=1:7, MAR=c(0,1)) 
dat4$pred<-as.numeric(predict(fit.1, newdata=dat4, type="response"))

dat4%>%
ggplot(aes(x=year, y=pred,color=factor(MAR), group=factor(MAR) ))+ geom_line()+
ggtitle(label="Hazard of Naturalization by Marital Status and Time since Elgibility")