1 + 1[1] 2
Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.
When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:
1 + 1[1] 2
You can add options to executable code like this
[1] 4
The echo: false option disables the printing of code (only output is displayed).
##load libraries
##load libraries
library(tidyverse)── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.8 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.1
✔ readr 2.1.2 ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(haven)
library(survival)
library(car)Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
library(survey)Loading required package: grid
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Attaching package: 'survey'
The following object is masked from 'package:graphics':
dotchart
library(muhaz)
library(eha)
library(foreign)
#####load data
usa2 <-Import("usa_00011.dta.gz")Loading required namespace: rio
usa2<-zap_labels(usa2)
usa2 <- read_stata("C:/Users/okabe/OneDrive/Pictures/Stats 2/usa_00011.dta.gz")
usa2<-zap_labels(usa2)##recodes fro variables
##Citizen
usa2$mycitizen<-Recode (usa2$citizen, recodes =" 1:2='Citizen'; 3:4= 'Not citizen';else=NA" ,as.factor=T)
usa2$ mycitizen<-relevel(usa2$mycitizen, ref='Citizen')
#English speaking
usa2$english <-Recode (usa2$speakeng, recodes =" 1='No'; 2:5='Speaks English'; 6='Not well'; else=NA" ,as.factor=T)
usa2$ english<-relevel(usa2$english, ref='Speaks English')
##educ
usa2$education<-Recode(usa2$educd, recodes="000:002='noschool';010:026='0Prim'; 030:061='1somehs'; 062:063='2hsgrad'; 065:090='3somecol'; 100:113='4colgrad';114:116='5masteranddoc'; else= NA", as.factor=T)
##Employed
usa2$employed<-Recode(usa2$empstat, recodes ="1= 'Employed';2:3='Unemployed';else=NA")
##sex
usa2$male<-as.factor(ifelse(usa2$sex==1, "Male", "Female"))
usa2$male<-relevel(usa2$male, ref='Male')
#Age cut into intervals
usa2$agec<-cut(usa2$age, breaks = c(18, 30, 40, 50, 60, 70, 80, 100), include.lowest = T, na.rm=T)
#usa$agec<-cut(usa$age, breaks=c(0,24,39,59,79,99))
##race
usa2$raceg<-Recode(usa2$race, recodes="1='White'; 2='Black';4:6='Asian'; else='other'", as.factor=T)
usa2$raceg<-relevel(usa2$raceg, ref='White')
##born
usa2$born<-Recode(usa2$bpl, recodes=" 200='Mexico' ; 210='Central America'; 300='South America';400:499='Europe'; 548:599= 'Asia';547='Middle East'; 600='Africa'; else= NA", as.factor=T)
usa2$ born<-relevel(usa2$born, ref='Europe')
##Poverty
usa2$poor<- Recode(usa2$poverty, recodes = "000= 'NA'; 001= 'Below poverty threshold';501= 'Above poverty threshold'", as.factor=T)
##income grouping
#usa1$inc<-Recode(usa1$incwage, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor = T)
#usa1$inc<-Recode(usa1$incwage, recodes = "1:1='1_lt10k'; 1960:1970='2_25-50k';1980='3_75-80k';1990:2002='4_100kplus';else= NA", as.factor = T)
#usa1$wage<- car::recode(usa1$incwage, recodes = "1:10000= '1_lt10k';10000:25000='2_10-25k';25000:35000='3_25-35k';35000:50000='4_35-50k';50000:200000= '5_50kplus'", as.factor = T)
##Occupation
usa2$occupation<- Recode(usa2$ind1990, recodes = "010:032= 'Agriculture'; 040:050= 'Mining';060= 'Construction'; 100:392='Manufacturing';400:472 ='Transportaion/Communication/Publicutilities';500:691= 'Wholesale&RetaiTrade';700:712= 'Finance/Ins/RealEst.'; 721:760= 'Business/Repair'; 761:791='PersonalServices'; 800:810= 'Ent.&Rec';812:893= 'Prof.services';900:932= 'Pub. Admin'; 940:960= 'Military'; else= NA ", as.factor=T)
##year naturalized
usa2$yrnatur <- ifelse(usa2$yrnatur>2020, NA, usa2$yrnatur)
##birthyear
usa2$birthyr<- ifelse(usa2$birthyr>2000,NA, usa2$birthyr)Homework 4- Cox regression questions: 1) Carry out the following analysis: Define your outcome as in HW 1. Also consider what covariates are hypothesized to affect the outcome variable. Define these and construct a parametric model for your outcome. My outcome variable is citizenship. Two covariates that will be analyzed are sex and English proficiency to see if sex and English proficiency influence the length of time an immigrant takes to attain their citizenship in the United States.
usa1<- usa2 %>%
mutate(cit_age = ifelse( mycitizen == "Citizen",
yrnatur - (birthyr),
year - (birthyr)),
d.event = ifelse(mycitizen == "Citizen", 1, 0))%>%
filter(complete.cases(agec,education,poor,male,english,yrnatur))
library(survival)
age_fit <- survfit(Surv(cit_age, d.event)~ 1,
data = usa1)
library(ggsurvfit)
age_fit %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile() Fit the Cox model the data. a.Include all main effects in the model
b.Test for an interaction between at least two of the predictors
Interpretation of results:
Hazard model:
When fitted into a Cox model, the results show that 325408 received their citizenship.
Additionally, compared to immigrants that speak English, immigrants who do not speak English well or at all have a lower risk of attaining their citizenship. Immigrants who do not speak English have a 70% lower risk of receiving their citizenship. While those who speak some English but not well have a 52% lower risk of receiving their citizenship.
##fit Cox model
##Cox model
library(dplyr)
usa3 <- usa1 %>%
filter( complete.cases(.))
options(survey.lonely.psu = "adjust")
des<-svydesign (ids=~1, strata=~strata,
data=usa3,
weight=~ perwt )
#day<- 1/365
fit.5<-svycoxph(Surv(cit_age, d.event)~english+male,
design = des)
summary(fit.5)Stratified Independent Sampling design (with replacement)
svydesign(ids = ~1, strata = ~strata, data = usa3, weight = ~perwt)
Call:
svycoxph(formula = Surv(cit_age, d.event) ~ english + male, design = des)
n= 325408, number of events= 325408
coef exp(coef) se(coef) robust se z Pr(>|z|)
englishNo -0.698791 0.497186 0.012635 0.013988 -49.955 <2e-16 ***
englishNot well -0.523684 0.592334 0.005792 0.006099 -85.860 <2e-16 ***
maleFemale 0.041980 1.042874 0.003510 0.004261 9.852 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
englishNo 0.4972 2.0113 0.4837 0.5110
englishNot well 0.5923 1.6882 0.5853 0.5995
maleFemale 1.0429 0.9589 1.0342 1.0516
Concordance= 0.544 (se = 0.001 )
Likelihood ratio test= NA on 3 df, p=NA
Wald test = 9164 on 3 df, p=<2e-16
Score (logrank) test = NA on 3 df, p=NA
(Note: the likelihood ratio and score tests assume independence of
observations within a cluster, the Wald and robust score tests do not).
c.Produce plots of the survival function and the cumulative hazard function for different “risk profiles”, as done in the notes
Interpretation of plots:
The plots show how the risk lowers based on the covariates mentioned above
When looking at the age at which an immigrant is receives their citizenship, while using the survival function we see that survival to acquire citizenship reduces based on age. Where, the older an immigrant gets, the lower their survival is of acquiring citizenship
##survivalship
plot(survfit(fit.5),xlim=c(6,9),
main="survivorship cox regression citiizenship")Summary for Coxreg model results:
Shows that immigrants that do not speak English have a lower likelihood of receiving their citizenship when compared to immigrants that speak English well.In terms of age, we see that likelihood of receiving citizenship lowers as immigrants get older. While females have a higher likelihood of attaining citizenship compared to males.
##hazard function
fit.6<-coxreg(Surv(cit_age, d.event)~english+male+agec,
data =usa3)
summary(fit.6)Covariate Mean Coef Rel.Risk S.E. LR p
english 0.000
Speaks English 0.852 0 1 (reference)
No 0.024 -0.492 0.611 0.013
Not well 0.124 -0.355 0.701 0.006
male 0.001
Male 0.515 0 1 (reference)
Female 0.485 0.011 1.011 0.004
agec 0.000
[18,30] 0.059 0 1 (reference)
(30,40] 0.138 -1.220 0.295 0.007
(40,50] 0.251 -2.062 0.127 0.007
(50,60] 0.298 -2.650 0.071 0.008
(60,70] 0.194 -3.132 0.044 0.009
(70,80] 0.050 -3.489 0.031 0.012
(80,100] 0.010 -3.705 0.025 0.023
Events 325408
Total time at risk 10598983
Max. log. likelihood -3716632
LR test statistic 176634.29
Degrees of freedom 9
Overall p-value 0
plot(survfit(fit.6),xlim=c(6,40),
main="survivorship cox regression citiizenship")d.Evaluate the assumptions of the Cox model, including proportionality of hazards of covariates, and if you use a continuous variable, evaluate linearity of the effect using Martingale residuals
mart<- resid (fit.5, type = "martingale")
plot(mart)##linearity
scatter.smooth( des$variable$agec, mart,degree = 2,
span = 1, ylab="Martingale Residual",
col=1, cex=.25, lpars=list(col = "pink", lwd = 3))
title(main="Martingale residuals for citizenship status based on age")When looking at the function form of the covariate,I decide to use the age of an immigrant because both English proficiency and sex will not show the in order to see if the covariate is being modeled correctly. In this case, we see that the line is somewhat flat, which hopefully means that the form of the covariate is correct