Homework 4 10-3

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

Running Code

When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:

1 + 1
[1] 2

You can add options to executable code like this

[1] 4

The echo: false option disables the printing of code (only output is displayed).

##load libraries
##load libraries
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.1
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(haven)
library(survival)
library(car)
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
library(survey)
Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart
library(muhaz)
library(eha)
library(foreign)

###
##load data
usa2 <-Import("usa_00011.dta.gz")
Loading required namespace: rio
usa2<-zap_labels(usa2)

usa2 <- read_stata("C:/Users/okabe/OneDrive/Pictures/Stats 2/usa_00011.dta.gz")
usa2<-zap_labels(usa2)
##recodes fro variables
##Citizen
usa2$mycitizen<-Recode (usa2$citizen, recodes =" 1:2='Citizen'; 3:4= 'Not citizen';else=NA" ,as.factor=T)

usa2$ mycitizen<-relevel(usa2$mycitizen, ref='Citizen')

#English speaking
usa2$english <-Recode (usa2$speakeng, recodes =" 1='No'; 2:5='Speaks English'; 6='Not well'; else=NA" ,as.factor=T)
usa2$ english<-relevel(usa2$english, ref='Speaks English')


##educ
usa2$education<-Recode(usa2$educd, recodes="000:002='noschool';010:026='0Prim'; 030:061='1somehs'; 062:063='2hsgrad'; 065:090='3somecol'; 100:113='4colgrad';114:116='5masteranddoc'; else= NA", as.factor=T)

##Employed

usa2$employed<-Recode(usa2$empstat, recodes ="1= 'Employed';2:3='Unemployed';else=NA")

##sex
usa2$male<-as.factor(ifelse(usa2$sex==1, "Male", "Female"))
usa2$male<-relevel(usa2$male, ref='Male')

#Age cut into intervals 

usa2$agec<-cut(usa2$age, breaks = c(18, 30, 40, 50, 60, 70, 80, 100), include.lowest = T, na.rm=T)







#usa$agec<-cut(usa$age, breaks=c(0,24,39,59,79,99))



##race

usa2$raceg<-Recode(usa2$race, recodes="1='White'; 2='Black';4:6='Asian'; else='other'", as.factor=T)
usa2$raceg<-relevel(usa2$raceg, ref='White')


##born

usa2$born<-Recode(usa2$bpl, recodes=" 200='Mexico' ; 210='Central America'; 300='South America';400:499='Europe'; 548:599= 'Asia';547='Middle East'; 600='Africa'; else= NA", as.factor=T)
usa2$ born<-relevel(usa2$born, ref='Europe')



##Poverty
usa2$poor<- Recode(usa2$poverty, recodes = "000= 'NA'; 001= 'Below poverty threshold';501= 'Above poverty threshold'", as.factor=T)

##income grouping

#usa1$inc<-Recode(usa1$incwage, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor = T)

#usa1$inc<-Recode(usa1$incwage, recodes = "1:1='1_lt10k'; 1960:1970='2_25-50k';1980='3_75-80k';1990:2002='4_100kplus';else= NA", as.factor = T)

#usa1$wage<- car::recode(usa1$incwage, recodes = "1:10000= '1_lt10k';10000:25000='2_10-25k';25000:35000='3_25-35k';35000:50000='4_35-50k';50000:200000= '5_50kplus'", as.factor = T)


##Occupation

usa2$occupation<- Recode(usa2$ind1990, recodes = "010:032= 'Agriculture'; 040:050= 'Mining';060= 'Construction'; 100:392='Manufacturing';400:472 ='Transportaion/Communication/Publicutilities';500:691= 'Wholesale&RetaiTrade';700:712= 'Finance/Ins/RealEst.'; 721:760= 'Business/Repair'; 761:791='PersonalServices'; 800:810= 'Ent.&Rec';812:893= 'Prof.services';900:932= 'Pub. Admin'; 940:960= 'Military'; else= NA ", as.factor=T)

##year naturalized

usa2$yrnatur <- ifelse(usa2$yrnatur>2020, NA, usa2$yrnatur)

##birthyear
usa2$birthyr<- ifelse(usa2$birthyr>2000,NA, usa2$birthyr)

Homework 4- Cox regression questions: 1) Carry out the following analysis: Define your outcome as in HW 1. Also consider what covariates are hypothesized to affect the outcome variable. Define these and construct a parametric model for your outcome. My outcome variable is citizenship. Two covariates that will be analyzed are sex and English proficiency to see if sex and English proficiency influence the length of time an immigrant takes to attain their citizenship in the United States.

usa1<- usa2 %>%
 
    
 
          mutate(cit_age = ifelse( mycitizen == "Citizen", 
                                     yrnatur - (birthyr), 
                                     year - (birthyr)), 
                 d.event = ifelse(mycitizen == "Citizen", 1, 0))%>%
  filter(complete.cases(agec,education,poor,male,english,yrnatur))

library(survival)

        age_fit <- survfit(Surv(cit_age, d.event)~ 1, 
                           data = usa1)

        library(ggsurvfit)

        age_fit %>%
          ggsurvfit() +
          add_confidence_interval(type = "ribbon") +
          add_quantile() 

Fit the Cox model the data. a.Include all main effects in the model

b.Test for an interaction between at least two of the predictors

Interpretation of results:

Hazard model:

When fitted into a Cox model, the results show that 325408 received their citizenship.

Additionally, compared to immigrants that speak English, immigrants who do not speak English well or at all have a lower risk of attaining their citizenship. Immigrants who do not speak English have a 70% lower risk of receiving their citizenship. While those who speak some English but not well have a 52% lower risk of receiving their citizenship.

##fit Cox model
##Cox model

library(dplyr)

usa3 <- usa1 %>%
  
   filter( complete.cases(.))
  
 

options(survey.lonely.psu = "adjust")
 

des<-svydesign (ids=~1, strata=~strata,
               data=usa3,
               weight=~ perwt )




#day<- 1/365


fit.5<-svycoxph(Surv(cit_age, d.event)~english+male,
                  design = des)
             
             


summary(fit.5)
Stratified Independent Sampling design (with replacement)
svydesign(ids = ~1, strata = ~strata, data = usa3, weight = ~perwt)
Call:
svycoxph(formula = Surv(cit_age, d.event) ~ english + male, design = des)

  n= 325408, number of events= 325408 

                     coef exp(coef)  se(coef) robust se       z Pr(>|z|)    
englishNo       -0.698791  0.497186  0.012635  0.013988 -49.955   <2e-16 ***
englishNot well -0.523684  0.592334  0.005792  0.006099 -85.860   <2e-16 ***
maleFemale       0.041980  1.042874  0.003510  0.004261   9.852   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                exp(coef) exp(-coef) lower .95 upper .95
englishNo          0.4972     2.0113    0.4837    0.5110
englishNot well    0.5923     1.6882    0.5853    0.5995
maleFemale         1.0429     0.9589    1.0342    1.0516

Concordance= 0.544  (se = 0.001 )
Likelihood ratio test= NA  on 3 df,   p=NA
Wald test            = 9164  on 3 df,   p=<2e-16
Score (logrank) test = NA  on 3 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

c.Produce plots of the survival function and the cumulative hazard function for different “risk profiles”, as done in the notes

Interpretation of plots:

The plots show how the risk lowers based on the covariates mentioned above

When looking at the age at which an immigrant is receives their citizenship, while using the survival function we see that survival to acquire citizenship reduces based on age. Where, the older an immigrant gets, the lower their survival is of acquiring citizenship

##survivalship
plot(survfit(fit.5),xlim=c(6,9),
     main="survivorship cox regression citiizenship")

Summary for Coxreg model results:

Shows that immigrants that do not speak English have a lower likelihood of receiving their citizenship when compared to immigrants that speak English well.In terms of age, we see that likelihood of receiving citizenship lowers as immigrants get older. While females have a higher likelihood of attaining citizenship compared to males.

##hazard function
fit.6<-coxreg(Surv(cit_age, d.event)~english+male+agec,
                  data =usa3)
summary(fit.6)
Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
english                                                      0.000 
  Speaks English      0.852     0         1 (reference)
              No      0.024    -0.492     0.611     0.013
        Not well      0.124    -0.355     0.701     0.006
male                                                         0.001 
            Male      0.515     0         1 (reference)
          Female      0.485     0.011     1.011     0.004
agec                                                         0.000 
         [18,30]      0.059     0         1 (reference)
         (30,40]      0.138    -1.220     0.295     0.007
         (40,50]      0.251    -2.062     0.127     0.007
         (50,60]      0.298    -2.650     0.071     0.008
         (60,70]      0.194    -3.132     0.044     0.009
         (70,80]      0.050    -3.489     0.031     0.012
        (80,100]      0.010    -3.705     0.025     0.023

Events                    325408 
Total time at risk        10598983 
Max. log. likelihood      -3716632 
LR test statistic         176634.29 
Degrees of freedom        9 
Overall p-value           0
plot(survfit(fit.6),xlim=c(6,40),
     main="survivorship cox regression citiizenship")

d.Evaluate the assumptions of the Cox model, including proportionality of hazards of covariates, and if you use a continuous variable, evaluate linearity of the effect using Martingale residuals

mart<- resid (fit.5, type = "martingale")

plot(mart)

##linearity
scatter.smooth( des$variable$agec, mart,degree = 2,
               span = 1, ylab="Martingale Residual",
               col=1,  cex=.25, lpars=list(col = "pink", lwd = 3))
title(main="Martingale residuals for citizenship status based on age")

When looking at the function form of the covariate,I decide to use the age of an immigrant because both English proficiency and sex will not show the in order to see if the covariate is being modeled correctly. In this case, we see that the line is somewhat flat, which hopefully means that the form of the covariate is correct