A) Choose a data set and an event, with an associated duration; define as many predictors as you need to do your analysis.

For this analysis the National Longitudinal Study of Adolescent to Adult Health (ADD Health) is used to determine if the likelihood of being jumped or beaten up for LGB individuals is greater and earlier then the liklihood of being beaten up for straight people.. ADD Health is a nationally representative longitudinal study of adolescents in the United States who are between 7 and 12 years old. The survey was conducted in 1994 with follow up surveys in 1996, 2008, and 2016.

B) Define the key variables: episodes, events, duration, censoring, predictors

Episode - calculated from the start time, or age at first interview, and end time, or age when event occured, and censoring

Event - being jumped and beaten up

Duration - the time until the even of being beaten up occurs

Censoring - if someone is not beaten up they are right censored at the end of the study, or wave 4 (survival time)

Predictors - sex, sexual orientation, race/ethnicity, education, residence, immigrant status

library(haven)
#download datasets - AddHealth
wave1<-read_dta("/projects/add_health/data/11613344/ICPSR_27021/DS0001/da27021p1.dta")
wave3<-read_dta("/projects/add_health/data/11613344/ICPSR_27021/DS0003/da27021p3.dta")
wave4<-read_dta("/projects/add_health/data/11613344/ICPSR_27021/DS0012/da27021p12.dta")
addhealth.wghtW1<-read_dta("/projects/add_health/data/11613344/ICPSR_27021/DS0025/da27021p25.dta")
#addhealth<-merge(wave1,wave3,by="aid")
addhealth1<-merge(wave1,wave3,by="aid")
addhealth1<-merge(addhealth1,wave4,by="aid")
addhealth1<-merge(addhealth1,addhealth.wghtW1,by="aid")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#select variables to use
addhealth<-addhealth1%>%
  select(GSWGT1,region,psuscid,BIO_SEX,H3SE13,H1GI6A,H1GI6B,H1GI4,H1GI6D,H1GI6E,H4ED2,iyear,imonth,H1GI1M,H1GI1Y,IYEAR3,IMONTH3,H3OD1Y,H3OD1M,H4OD1M,H4OD1Y,IMONTH4,IYEAR4,H4MH22,H3GH1,H4GH1,H1GI6C,H1FS6,H3SP9,H4DS18,H3DS18F,H4MH22,H4GH1,H3DS18G,H1FV6,H4LM1,H3HR2,H4OD4)
#complete cases
addhealth<-addhealth%>%
 filter(complete.cases(.))
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
#UScitizen
addhealth$UScitizen<-Recode(addhealth$H4OD4,recodes="0='a_no';1='b_yes';else=NA",as.factor=T)
#residence
addhealth$w3.residence<-Recode(addhealth$H3HR2,recodes="1='a_withparents';2='b_withsomeone';3='c_myownplace';4='d_group.qrts';5='e_homeless';6='f_other';else=NA",as.factor=T)
#birth year w1
addhealth$birthyearw1 <- factor(ifelse(addhealth$H1GI1Y=="74",1974,
                                             ifelse(addhealth$H1GI1Y=="75",1975,
                                                           ifelse(addhealth$H1GI1Y=="76",1976,
                                                                         ifelse(addhealth$H1GI1Y=="77",1977,
                                                                                       ifelse(addhealth$H1GI1Y=="78",1978,
                                                                                                     ifelse(addhealth$H1GI1Y=="79",1979,
                                                                                                                   ifelse(addhealth$H1GI1Y=="80",1980,
                                                                                                                                ifelse(addhealth$H1GI1Y=="81",1981,
                                                                                                                                               ifelse(addhealth$H1GI1Y=="82",1982,
                                                                                                                                                             ifelse(addhealth$H1GI1Y=="83",1983,1979)))))))))))
#birth month w1
addhealth$birthmonthw1 <- Recode(addhealth$H1GI1M,recodes="1=1;2=2;3=3;4=4;5=5;6=6;7=7;8=8;9=9;10=10;11=11;12=12;else=NA")
#combine year and month for a birth date wave 1
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
addhealth$birthdatew1 <- as.yearmon(paste(addhealth$birthyearw1, addhealth$birthmonthw1), "%Y %m")
#birth month wave 4
addhealth$birthmonthw4 <- Recode(addhealth$H4OD1M,recodes="1=1;2=2;3=3;4=4;5=5;6=6;7=7;8=8;9=9;10=10;11=11;12=12")
#birth year wave 4
addhealth$birthyearw4<-addhealth$H4OD1Y
addhealth$birthyearw4 <- Recode(addhealth$H4OD1Y,recodes="1974=1974;1975=1975;1976=1976;1977=1977;1978=1978;1979=1979;1980=1980;1981=1981;1982=1982;1983=1983")
#combine year and month for birth date wave 4
addhealth$birthdatew4 <- as.yearmon(paste(addhealth$birthyearw4, addhealth$birthmonthw4), "%Y %m")
#birth date wave 3
addhealth$birthdatew3<-as.yearmon(paste(addhealth$H3OD1Y,addhealth$H3OD1M),"%Y %m")
#interview date w1
addhealth$iyearfix <- Recode(addhealth$iyear,recodes="'94'='1994';'95'='1995'")
addhealth$interviewdatew1 <- as.yearmon(paste(addhealth$iyearfix, addhealth$imonth), "%Y %m")
#interview date w4
addhealth$interviewmonthw4 <- Recode(addhealth$IMONTH4,recodes="1=1;2=2;3=3;4=4;5=5;6=6;7=7;8=8;9=9;10=10;11=11;12=12")
addhealth$interviewyearw4 <- Recode(addhealth$IYEAR4,recodes="2007=2007;2008=2008;2009=2009")
addhealth$interviewdatew4<-as.yearmon(paste(addhealth$interviewyearw4,addhealth$interviewmonthw4),"%Y %m")
#interview date w3
addhealth$interviewdatew3<-as.yearmon(paste(addhealth$IYEAR3,addhealth$IMONTH3 ),"%Y %m")
#age - derived from date of birth subtracted from wave interview date 
addhealth$agew4<-(as.numeric(round(addhealth$interviewdatew4 - addhealth$birthdatew4)))
addhealth$agew3<-(as.numeric(round(addhealth$interviewdatew3 - addhealth$birthdatew3)))
addhealth$agew1<-(as.numeric(round(addhealth$interviewdatew1 - addhealth$birthdatew1)))

#sex variable - Male is reference group
addhealth$sex <- ifelse(addhealth$BIO_SEX==2,2,1)
addhealth$sex<-Recode(addhealth$sex, recodes="1='a_male'; 2='b_female'",as.factor=T)   
#sexual orientation from wave 3.  Reference group is straight people
addhealth$sexorient <- factor(ifelse(addhealth$H3SE13=="(1) 100% heterosexual (straight)",1,
                                       ifelse(addhealth$H3SE13=="(2) Mostly #heterosexual.somewhat attracted to people of own",2, 
                                           ifelse(addhealth$H3SE13=="(3) Bisexual-attracted #to men and women equally",3,
                                                   ifelse(addhealth$H3SE13=="(4) Mostly #homosexual.somewhat attracted to opposite sex",4,
                                                          ifelse(addhealth$H3SE13=="(5) 100% #homosexual (gay)",5,NA))))))

addhealth$sexorient<-Recode(addhealth$H3SE13, recodes="1:2='a_straight'; 3='b_bisexual';4:5='c_LGB';else=NA",as.factor=T)
#race variables - Reference group is nhwhite
#nhwhite
addhealth$nhwhite <- ifelse(addhealth$H1GI6A==1,1,0)
addhealth$nhwhite<-Recode(addhealth$nhwhite, recodes="1='nhwhite'; 0='nonnhwhite'; 6:8=NA",as.factor=T)
#nhblack
addhealth$nhblack <- ifelse(addhealth$H1GI6B==1,1,0)
addhealth$nhblack<-Recode(addhealth$nhblack, recodes="1='nhblack'; 0='nonnhblack'; 6:8=NA",as.factor=T)
#Hispanic
addhealth$hispanic <- ifelse(addhealth$H1GI4==1,1,0)
addhealth$hispanic<-Recode(addhealth$hispanic, recodes="1='hispanic'; 0='nonhispanic'; 6:8=NA",as.factor=T)
#Asian
addhealth$asian <- ifelse(addhealth$H1GI6D==1,1,0)
addhealth$asian<-Recode(addhealth$asian, recodes="1='asian'; 0='nonasian'; 6:8=NA",as.factor=T)
#Native American
addhealth$native_american <- ifelse(addhealth$H1GI6C==1,1,0)
addhealth$native_american<-Recode(addhealth$native_american, recodes="1='native_american'; 0='nonnative_american'; 6:8=NA",as.factor=T)
#other
addhealth$other <- ifelse(addhealth$H1GI6E==1,1,0)
addhealth$other<-Recode(addhealth$other, recodes="1='other'; 0='nonother'; 6:8=NA",as.factor=T)
#combine race to one variable
addhealth$racethnic <- factor(ifelse(addhealth$nhwhite=="nhwhite","a-nhwhite", 
                                            ifelse(addhealth$nhblack=="nhblack", "b-nhblack", 
                                            ifelse(addhealth$hispanic=="hispanic", "c-hispanic",
                                                   ifelse(addhealth$asian=="asian","d-asian",
                                                          ifelse(addhealth$native_american=="native_american","e-native_american",
                                                                 ifelse(addhealth$other=="other","f-other",NA)))))))

#Education.  Less then high school is reference group
addhealth$educ<-Recode(addhealth$H4ED2,recodes="1:2='a_less_highschool';3:6='b_highschool_grad';7='c_college_bach';8:13='d_college+';else=NA",as.factor=T)
#transition
addhealth$W1.jumped.beatenup<-Recode(addhealth$H1FV6, recodes="0='a.no'; 1:2='b.yes';else=NA")
addhealth$W3.jumped1<-ifelse(addhealth$H3DS18F==1,1,
                                  ifelse(addhealth$H3DS18F==0,0,NA))
addhealth$W3.jumped2<-ifelse(addhealth$H3DS18G==1,1,
                                  ifelse(addhealth$H3DS18G==0,0,NA))
addhealth$transition.w1<-ifelse(addhealth$W1.jumped.beatenup=="a.no"&addhealth$W3.jumped1==1|addhealth$W3.jumped2==1,1,0)
addhealth$W4.jumped.beatenup<-Recode(addhealth$H4DS18, recodes="0='a.no'; 1='b.yes';else=NA")   
addhealth$transition<-ifelse(addhealth$transition.w1==0&addhealth$W4.jumped.beatenup=="b.yes",1,0)
table(addhealth$transition)                                             
## 
##     0     1 
## 10880  1364
#select variables to use
addhealth<-addhealth%>%
  select(psuscid,region,GSWGT1,agew1,agew3, agew4, sex,sexorient,racethnic,educ,transition,transition.w1,w3.residence,UScitizen)
#filter complete cases
addhealth<-addhealth%>%
 filter(complete.cases(.))
adlong<-reshape(data.frame(addhealth), idvar = 'aid', varying=list(c('agew1', 'agew3'), c('agew3','agew4')),
                v.names = c('age_enter', 'age_exit'), 
                times = 1:2, direction='long')

adlong<-adlong[order(adlong$aid, adlong$time),]
head(adlong)
##     psuscid region   GSWGT1      sex  sexorient racethnic
## 1.1     371      1  69.1244   a_male a_straight b-nhblack
## 1.2     371      1  69.1244   a_male a_straight b-nhblack
## 2.1     078      1 175.6404   a_male a_straight a-nhwhite
## 2.2     078      1 175.6404   a_male a_straight a-nhwhite
## 3.1     044      3 318.6804 b_female a_straight a-nhwhite
## 3.2     044      3 318.6804 b_female a_straight a-nhwhite
##                  educ transition transition.w1  w3.residence UScitizen
## 1.1 b_highschool_grad          0             0 a_withparents     b_yes
## 1.2 b_highschool_grad          0             0 a_withparents     b_yes
## 2.1    c_college_bach          0             0  c_myownplace     b_yes
## 2.2    c_college_bach          0             0  c_myownplace     b_yes
## 3.1 b_highschool_grad          0             0  c_myownplace     b_yes
## 3.2 b_highschool_grad          0             0  c_myownplace     b_yes
##     time age_enter age_exit aid
## 1.1    1        16       22   1
## 1.2    2        22       29   1
## 2.1    1        14       20   2
## 2.2    2        20       27   2
## 3.1    1        14       20   3
## 3.2    2        20       26   3
adlong$trans<-NA
adlong$trans[adlong$transition.w1==0& adlong$time==1]<-0
adlong$trans[adlong$transition.w1==1& adlong$time==1]<-1
adlong$trans[adlong$transition==0& adlong$time==2]<-0
adlong$trans[adlong$transition==1& adlong$time==2]<-1
#recode race/ethnicity
adlong$racethnic_new<-as.factor(ifelse(adlong$racethnic%in%c("d-asian","e-native_american","f-other"), "other", adlong$racethnic))
adlong$racethnic_new<-Recode(adlong$racethnic_new, recodes = "1='a_white'; 2='b_black'; 3='c_hispanic'", as.factor=T)
library(survminer)
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Loading required package: ggpubr
## Loading required package: magrittr
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(survival)
#Survey design
library(survey)
des2<-svydesign(ids=~psuscid,
                strata = ~region,
                weights=~GSWGT1,
                data=adlong,
                nest=T)

C) Carry out the following analysis

a. Fit the Cox model to the data.

#fit to cox model
library(eha)
fit.cox<-coxreg(Surv(time = time,event = trans)~age_enter-1,data = adlong)
summary(fit.cox)
## Call:
## coxreg(formula = Surv(time = time, event = trans) ~ age_enter - 
##     1, data = adlong)
## 
## Covariate           Mean       Coef     Rel.Risk   S.E.    Wald p
## age_enter          20.320    -0.101     0.904     0.012     0.000 
## 
## Events                    1614 
## Total time at risk         36150 
## Max. log. likelihood      -15235 
## LR test statistic         76.92 
## Degrees of freedom        1 
## Overall p-value           0

i. Include all main effects in the model

#Cox model with all main effects in the model
fit1<-svycoxph(Surv(time=time,event=trans)~sex+sexorient+racethnic_new+educ+w3.residence+UScitizen ,design=des2)
summary(fit1)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (132) clusters.
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## Call:
## svycoxph(formula = Surv(time = time, event = trans) ~ sex + sexorient + 
##     racethnic_new + educ + w3.residence + UScitizen, design = des2)
## 
##   n= 24100, number of events= 1614 
## 
##                               coef exp(coef) se(coef)      z Pr(>|z|)    
## sexb_female                0.02851   1.02892  0.07102  0.401  0.68808    
## sexorientb_bisexual        0.34674   1.41445  0.19777  1.753  0.07956 .  
## sexorientc_LGB             0.12652   1.13488  0.23405  0.541  0.58879    
## racethnic_newb_black       0.32833   1.38865  0.07783  4.218 2.46e-05 ***
## racethnic_newc_hispanic    0.29407   1.34188  0.10934  2.690  0.00715 ** 
## racethnic_newother        -0.01568   0.98444  0.20098 -0.078  0.93780    
## educb_highschool_grad     -0.21941   0.80300  0.12459 -1.761  0.07823 .  
## educc_college_bach        -0.45523   0.63430  0.15205 -2.994  0.00275 ** 
## educd_college+            -0.45818   0.63244  0.16971 -2.700  0.00694 ** 
## w3.residenceb_withsomeone  0.17733   1.19403  0.13674  1.297  0.19469    
## w3.residencec_myownplace   0.06340   1.06546  0.06977  0.909  0.36348    
## w3.residenced_group.qrts   0.09031   1.09451  0.17776  0.508  0.61142    
## w3.residencee_homeless     0.88251   2.41696  0.85849  1.028  0.30396    
## w3.residencef_other        0.13876   1.14885  0.30732  0.452  0.65161    
## UScitizenb_yes             0.03497   1.03559  0.18184  0.192  0.84749    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                           exp(coef) exp(-coef) lower .95 upper .95
## sexb_female                  1.0289     0.9719    0.8952    1.1826
## sexorientb_bisexual          1.4144     0.7070    0.9599    2.0842
## sexorientc_LGB               1.1349     0.8812    0.7173    1.7954
## racethnic_newb_black         1.3887     0.7201    1.1922    1.6175
## racethnic_newc_hispanic      1.3419     0.7452    1.0830    1.6626
## racethnic_newother           0.9844     1.0158    0.6639    1.4597
## educb_highschool_grad        0.8030     1.2453    0.6290    1.0251
## educc_college_bach           0.6343     1.5765    0.4708    0.8545
## educd_college+               0.6324     1.5812    0.4535    0.8820
## w3.residenceb_withsomeone    1.1940     0.8375    0.9133    1.5610
## w3.residencec_myownplace     1.0655     0.9386    0.9293    1.2216
## w3.residenced_group.qrts     1.0945     0.9136    0.7725    1.5507
## w3.residencee_homeless       2.4170     0.4137    0.4493   13.0023
## w3.residencef_other          1.1489     0.8704    0.6290    2.0982
## UScitizenb_yes               1.0356     0.9656    0.7251    1.4790
## 
## Concordance= 0.561  (se = 0.011 )
## Likelihood ratio test= NA  on 15 df,   p=NA
## Wald test            = 73.8  on 15 df,   p=9e-10
## Score (logrank) test = NA  on 15 df,   p=NA

ii. Test for an interaction between at least two of the predictors

#Cox model with all main effects in the model and an interaction between sex and sexual orientation
fit.interact1<-svycoxph(Surv(time=time,event=trans)~sex*sexorient+racethnic_new+educ,design=des2)
summary(fit.interact1)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (132) clusters.
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## Call:
## svycoxph(formula = Surv(time = time, event = trans) ~ sex * sexorient + 
##     racethnic_new + educ, design = des2)
## 
##   n= 24100, number of events= 1614 
## 
##                                      coef exp(coef)  se(coef)      z
## sexb_female                      0.012349  1.012426  0.071874  0.172
## sexorientb_bisexual             -1.147407  0.317459  0.923974 -1.242
## sexorientc_LGB                   0.007104  1.007129  0.311442  0.023
## racethnic_newb_black             0.333245  1.395490  0.078020  4.271
## racethnic_newc_hispanic          0.279851  1.322933  0.096535  2.899
## racethnic_newother              -0.027401  0.972971  0.175507 -0.156
## educb_highschool_grad           -0.226442  0.797365  0.124389 -1.820
## educc_college_bach              -0.458449  0.632264  0.148205 -3.093
## educd_college+                  -0.455843  0.633913  0.164004 -2.779
## sexb_female:sexorientb_bisexual  1.678033  5.355010  0.949777  1.767
## sexb_female:sexorientc_LGB       0.288495  1.334418  0.487847  0.591
##                                 Pr(>|z|)    
## sexb_female                      0.86358    
## sexorientb_bisexual              0.21430    
## sexorientc_LGB                   0.98180    
## racethnic_newb_black            1.94e-05 ***
## racethnic_newc_hispanic          0.00374 ** 
## racethnic_newother               0.87593    
## educb_highschool_grad            0.06869 .  
## educc_college_bach               0.00198 ** 
## educd_college+                   0.00544 ** 
## sexb_female:sexorientb_bisexual  0.07727 .  
## sexb_female:sexorientc_LGB       0.55428    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                 exp(coef) exp(-coef) lower .95 upper .95
## sexb_female                        1.0124     0.9877    0.8794    1.1656
## sexorientb_bisexual                0.3175     3.1500    0.0519    1.9417
## sexorientc_LGB                     1.0071     0.9929    0.5470    1.8543
## racethnic_newb_black               1.3955     0.7166    1.1976    1.6261
## racethnic_newc_hispanic            1.3229     0.7559    1.0949    1.5985
## racethnic_newother                 0.9730     1.0278    0.6898    1.3724
## educb_highschool_grad              0.7974     1.2541    0.6249    1.0175
## educc_college_bach                 0.6323     1.5816    0.4729    0.8454
## educd_college+                     0.6339     1.5775    0.4597    0.8742
## sexb_female:sexorientb_bisexual    5.3550     0.1867    0.8324   34.4518
## sexb_female:sexorientc_LGB         1.3344     0.7494    0.5129    3.4717
## 
## Concordance= 0.559  (se = 0.011 )
## Likelihood ratio test= NA  on 11 df,   p=NA
## Wald test            = 96.71  on 11 df,   p=8e-16
## Score (logrank) test = NA  on 11 df,   p=NA

iii. Perform a likelihood ratio test for two nested models

AIC(fit1)
##        eff.p          AIC     deltabar 
##    23.296748 29591.390052     1.553117
AIC(fit.interact1)
##        eff.p          AIC     deltabar 
##    17.264477 29579.325510     1.569498

D) Assess proportionality of hazards for each covariate

a. What method did you use?

#Grambsch and Therneau
fit.test<-cox.zph(fit1)
fit.test
##                                rho    chisq        p
## sexb_female                0.16900 108.4905 2.10e-25
## sexorientb_bisexual        0.06253  12.4875 4.10e-04
## sexorientc_LGB            -0.00320   0.0279 8.67e-01
## racethnic_newb_black       0.03770   4.6112 3.18e-02
## racethnic_newc_hispanic   -0.03341   3.5483 5.96e-02
## racethnic_newother        -0.00381   0.0889 7.66e-01
## educb_highschool_grad     -0.04283   7.1561 7.47e-03
## educc_college_bach         0.03596   5.3255 2.10e-02
## educd_college+             0.04033   6.5763 1.03e-02
## w3.residenceb_withsomeone  0.02434   1.9380 1.64e-01
## w3.residencec_myownplace   0.09095  24.5983 7.06e-07
## w3.residenced_group.qrts   0.01064   0.4029 5.26e-01
## w3.residencee_homeless    -0.01593   0.9828 3.22e-01
## w3.residencef_other       -0.05727   9.6886 1.85e-03
## UScitizenb_yes            -0.05787  15.6383 7.67e-05
## GLOBAL                          NA 236.7351 6.63e-42

b If you find non-proportional hazards, use an appropriate method to fit an extended Cox model that addresses the non-proportionality

#I am going to stratify by the variables that showed significance. 
fit.stratify<-svycoxph(Surv(time=time,event=trans)~sex+sexorient+strata(racethnic_new)+strata(educ),design=des2)
summary(fit.stratify)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (132) clusters.
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## Call:
## svycoxph(formula = Surv(time = time, event = trans) ~ sex + sexorient + 
##     strata(racethnic_new) + strata(educ), design = des2)
## 
##   n= 24100, number of events= 1614 
## 
##                        coef exp(coef) se(coef)     z Pr(>|z|)  
## sexb_female         0.03360   1.03417  0.07128 0.471   0.6374  
## sexorientb_bisexual 0.34289   1.40901  0.19754 1.736   0.0826 .
## sexorientc_LGB      0.14291   1.15363  0.23836 0.600   0.5488  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## sexb_female             1.034     0.9670    0.8993     1.189
## sexorientb_bisexual     1.409     0.7097    0.9567     2.075
## sexorientc_LGB          1.154     0.8668    0.7231     1.841
## 
## Concordance= 0.495  (se = 0.013 )
## Likelihood ratio test= NA  on 3 df,   p=NA
## Wald test            = 3.61  on 3 df,   p=0.3
## Score (logrank) test = NA  on 3 df,   p=NA
fit.stratify
## Call:
## svycoxph(formula = Surv(time = time, event = trans) ~ sex + sexorient + 
##     strata(racethnic_new) + strata(educ), design = des2)
## 
##                        coef exp(coef) se(coef)     z      p
## sexb_female         0.03360   1.03417  0.07128 0.471 0.6374
## sexorientb_bisexual 0.34289   1.40901  0.19754 1.736 0.0826
## sexorientc_LGB      0.14291   1.15363  0.23836 0.600 0.5488
## 
## Likelihood ratio test=  on 3 df, p=
## n= 24100, number of events= 1614
#Grambsch and Therneau
fit.test<-cox.zph(fit.stratify)
fit.test
##                        rho  chisq        p
## sexb_female         0.1804 112.12 3.37e-26
## sexorientb_bisexual 0.0275   2.11 1.46e-01
## sexorientc_LGB      0.0411   3.72 5.36e-02
## GLOBAL                  NA 116.08 5.39e-25

E) Fit the discrete time hazard model to your outcome

i. You must form a person-period data set

ii. Consider both the general model and other time specifications

#linear term for time
fit.1<-svyglm(trans~factor(age_enter)-1,design=des2,family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.1)
## 
## Call:
## svyglm(formula = trans ~ factor(age_enter) - 1, design = des2, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## factor(age_enter)11 -12.10136    1.00001 -12.101  < 2e-16 ***
## factor(age_enter)12  -1.88615    1.07535  -1.754   0.0822 .  
## factor(age_enter)13  -3.13709    0.21153 -14.830  < 2e-16 ***
## factor(age_enter)14  -3.77107    0.26933 -14.002  < 2e-16 ***
## factor(age_enter)15  -3.82761    0.21212 -18.045  < 2e-16 ***
## factor(age_enter)16  -3.58145    0.15886 -22.544  < 2e-16 ***
## factor(age_enter)17  -4.03806    0.23611 -17.102  < 2e-16 ***
## factor(age_enter)18  -4.47073    0.27626 -16.183  < 2e-16 ***
## factor(age_enter)19  -2.88874    0.16890 -17.104  < 2e-16 ***
## factor(age_enter)20  -2.20200    0.10179 -21.633  < 2e-16 ***
## factor(age_enter)21  -2.21029    0.11061 -19.983  < 2e-16 ***
## factor(age_enter)22  -2.19125    0.07782 -28.157  < 2e-16 ***
## factor(age_enter)23  -2.03204    0.10756 -18.892  < 2e-16 ***
## factor(age_enter)24  -2.03765    0.06571 -31.011  < 2e-16 ***
## factor(age_enter)25  -2.07051    0.12490 -16.577  < 2e-16 ***
## factor(age_enter)26  -1.72465    0.27309  -6.315 5.73e-09 ***
## factor(age_enter)27  -0.96099    0.56662  -1.696   0.0927 .  
## factor(age_enter)28  -0.34386    1.29933  -0.265   0.7918    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9999273)
## 
## Number of Fisher Scoring iterations: 10

iii. Include all main effects in the model

fit.00<-svyglm(trans~age_enter+sex+sexorient+racethnic_new+educ,design=des2,family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.00)
## 
## Call:
## svyglm(formula = trans ~ age_enter + sex + sexorient + racethnic_new + 
##     educ, design = des2, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -6.12221    0.27816 -22.009  < 2e-16 ***
## age_enter                0.18241    0.01177  15.501  < 2e-16 ***
## sexb_female              0.07078    0.07606   0.931  0.35398    
## sexorientb_bisexual      0.43284    0.21752   1.990  0.04891 *  
## sexorientc_LGB           0.08110    0.24846   0.326  0.74468    
## racethnic_newb_black     0.29823    0.09274   3.216  0.00168 ** 
## racethnic_newc_hispanic  0.26559    0.10487   2.533  0.01264 *  
## racethnic_newother      -0.05640    0.18572  -0.304  0.76190    
## educb_highschool_grad   -0.27619    0.13058  -2.115  0.03652 *  
## educc_college_bach      -0.48279    0.15323  -3.151  0.00206 ** 
## educd_college+          -0.54858    0.16632  -3.298  0.00129 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00188)
## 
## Number of Fisher Scoring iterations: 6
fit.000<-svyglm(trans~age_enter+sex+sexorient+racethnic_new+educ+w3.residence+UScitizen,design=des2,family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.000)
## 
## Call:
## svyglm(formula = trans ~ age_enter + sex + sexorient + racethnic_new + 
##     educ + w3.residence + UScitizen, design = des2, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -6.33712    0.32122 -19.728  < 2e-16 ***
## age_enter                  0.18672    0.01173  15.924  < 2e-16 ***
## sexb_female                0.08540    0.07553   1.131  0.26061    
## sexorientb_bisexual        0.43159    0.21684   1.990  0.04898 *  
## sexorientc_LGB             0.09202    0.24700   0.373  0.71018    
## racethnic_newb_black       0.28146    0.09306   3.025  0.00309 ** 
## racethnic_newc_hispanic    0.28979    0.11666   2.484  0.01447 *  
## racethnic_newother        -0.02816    0.20983  -0.134  0.89349    
## educb_highschool_grad     -0.27973    0.13022  -2.148  0.03386 *  
## educc_college_bach        -0.50535    0.15741  -3.210  0.00173 ** 
## educd_college+            -0.56602    0.17302  -3.271  0.00142 ** 
## w3.residenceb_withsomeone  0.07928    0.14139   0.561  0.57609    
## w3.residencec_myownplace  -0.12972    0.07367  -1.761  0.08101 .  
## w3.residenced_group.qrts   0.24837    0.19233   1.291  0.19924    
## w3.residencee_homeless     0.62435    1.01484   0.615  0.53966    
## w3.residencef_other        0.09111    0.33589   0.271  0.78671    
## UScitizenb_yes             0.18349    0.18828   0.975  0.33190    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9970073)
## 
## Number of Fisher Scoring iterations: 6
AIC(fit.00)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
##        eff.p          AIC     deltabar 
##    16.996967 10939.567262     1.699697
AIC(fit.000)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
##        eff.p          AIC     deltabar 
##    25.863595 10943.142811     1.616475
anova(fit.00,fit.000)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
## Working (Rao-Scott+F) LRT for w3.residence UScitizen
##  in svyglm(formula = trans ~ age_enter + sex + sexorient + racethnic_new + 
##     educ + w3.residence + UScitizen, design = des2, family = "binomial")
## Working 2logLR =  9.488531 p= 0.16091 
## (scale factors:  1.3 1.3 1.1 0.96 0.7 0.65 );  denominator df= 112
logLik(fit.00)
## Warning in logLik.svyglm(fit.00): svyglm not fitted by maximum likelihood.
## [1] 10905.57
logLik(fit.000)
## Warning in logLik.svyglm(fit.000): svyglm not fitted by maximum likelihood.
## [1] 10891.42

iv. Test for an interaction between at least two of the predictors

fit.int<-svyglm(trans~age_enter+sex*sexorient+racethnic_new+educ,design=des2,family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.int)
## 
## Call:
## svyglm(formula = trans ~ age_enter + sex * sexorient + racethnic_new + 
##     educ, design = des2, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -6.10941    0.27713 -22.045  < 2e-16 ***
## age_enter                        0.18224    0.01176  15.491  < 2e-16 ***
## sexb_female                      0.05250    0.07645   0.687  0.49361    
## sexorientb_bisexual             -1.01910    0.95133  -1.071  0.28629    
## sexorientc_LGB                  -0.05567    0.32965  -0.169  0.86619    
## racethnic_newb_black             0.30217    0.09298   3.250  0.00151 ** 
## racethnic_newc_hispanic          0.26440    0.10446   2.531  0.01270 *  
## racethnic_newother              -0.05243    0.18525  -0.283  0.77765    
## educb_highschool_grad           -0.27806    0.12990  -2.141  0.03440 *  
## educc_college_bach              -0.48260    0.15227  -3.169  0.00195 ** 
## educd_college+                  -0.54305    0.16636  -3.264  0.00144 ** 
## sexb_female:sexorientb_bisexual  1.62251    0.98314   1.650  0.10158    
## sexb_female:sexorientc_LGB       0.31347    0.50503   0.621  0.53601    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000591)
## 
## Number of Fisher Scoring iterations: 6

v. Generate hazard plots for interesting cases highlighting the significant predictors in your analysis

#hazard for sexual orientation
fit.kaplan.LGB<-survfit(Surv(time = time , event=trans)~sexorient,data=adlong)
summary(fit.kaplan.LGB)
## Call: survfit(formula = Surv(time = time, event = trans) ~ sexorient, 
##     data = adlong)
## 
##                 sexorient=a_straight 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     1  23344     260    0.989 0.000687        0.988        0.990
##     2  11672    1289    0.880 0.002933        0.874        0.885
## 
##                 sexorient=b_bisexual 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    410       8    0.980 0.00683        0.967        0.994
##     2    205      33    0.823 0.02581        0.774        0.875
## 
##                 sexorient=c_LGB 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    346       1    0.997 0.00289        0.991        1.000
##     2    173      23    0.865 0.02586        0.815        0.917
ggsurvplot(fit.kaplan.LGB,risk.table = T,fun="cumhaz",title="Cumulative Hazard Function for violence transition by sexual orientation")

#hazard for race
fit.kaplan.race<-survfit(Surv(time = time , event=trans)~racethnic_new,data=adlong)
summary(fit.kaplan.race)
## Call: survfit(formula = Surv(time = time, event = trans) ~ racethnic_new, 
##     data = adlong)
## 
##                 racethnic_new=a_white 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     1  15510     167    0.989 0.000829        0.988        0.991
##     2   7755     805    0.887 0.003506        0.880        0.893
## 
##                 racethnic_new=b_black 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   4980      60    0.988 0.00155        0.985        0.991
##     2   2490     324    0.859 0.00680        0.846        0.873
## 
##                 racethnic_new=c_hispanic 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   1838      30    0.984 0.00296        0.978        0.989
##     2    919     115    0.861 0.01104        0.839        0.883
## 
##                 racethnic_new=other 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   1772      12    0.993 0.00195        0.989        0.997
##     2    886     101    0.880 0.01074        0.859        0.901
ggsurvplot(fit.kaplan.race,risk.table = T,fun="cumhaz",title="Cumulative Hazard Function for violence transition by race")

#hazard for education
fit.kaplan.educ<-survfit(Surv(time = time , event=trans)~educ,data=adlong)
summary(fit.kaplan.educ)
## Call: survfit(formula = Surv(time = time, event = trans) ~ educ, data = adlong)
## 
##                 educ=a_less_highschool 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   1728      33    0.981 0.00329        0.974        0.987
##     2    864     125    0.839 0.01207        0.816        0.863
## 
##                 educ=b_highschool_grad 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     1  14244     197    0.986 0.000979        0.984        0.988
##     2   7122     807    0.874 0.003804        0.867        0.882
## 
##                 educ=c_college_bach 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   4972      29    0.994 0.00108        0.992        0.996
##     2   2486     257    0.891 0.00615        0.879        0.904
## 
##                 educ=d_college+ 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   3156      10    0.997 0.00100        0.995        0.999
##     2   1578     156    0.898 0.00754        0.884        0.913
ggsurvplot(fit.kaplan.educ,risk.table = T,fun="cumhaz",title="Hazard for violence transition by education")

F) Provide the following as a single word document:

a. Code and output

b. Technical write up

For this analysis the National Longitudinal Study of Adolescent to Adult Health (ADD Health) is used to determine if the likelihood of being jumped or beaten up for LGB individuals is greater and earlier then the liklihood of being beaten up for straight people.. ADD Health is a nationally representative longitudinal study of adolescents in the United States who are between 7 and 12 years old. The survey was conducted in 1994 with follow up surveys in 1996, 2008, and 2016.

c. Descriptive write up consisting of

i. Tables of descriptive statistics

#Kaplan-Meier survival analysis of the outcome
fit.kaplan<-survfit(Surv(time = time , event=trans)~1,data=adlong)
summary(fit.kaplan)
## Call: survfit(formula = Surv(time = time, event = trans) ~ 1, data = adlong)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     1  24100     269    0.989 0.000677        0.988        0.990
##     2  12050    1345    0.878 0.002900        0.873        0.884
mean(adlong$trans)
## [1] 0.06697095
  fit0 <- coxph(Surv(time = time , event=trans) ~ factor(sexorient), data=adlong,
  iter=0, na.action=na.exclude)
  o.minus.e <- tapply(resid(fit0), adlong$sexorient, sum)
  obs       <- tapply(adlong$transition, adlong$sexorient, sum)
  cbind(observed=obs, expected= obs- o.minus.e, "o-e"=o.minus.e) 
##            observed   expected        o-e
## a_straight     2578 2593.20571 -15.205706
## b_bisexual       66   51.82881  14.171190
## c_LGB            46   44.96548   1.034517
summary(adlong)
##    psuscid              region          GSWGT1              sex       
##  Length:24100       Min.   :1.000   Min.   :  16.32   a_male  :11000  
##  Class :character   1st Qu.:2.000   1st Qu.: 206.10   b_female:13100  
##  Mode  :character   Median :2.000   Median : 948.81                   
##                     Mean   :2.382   Mean   :1174.81                   
##                     3rd Qu.:3.000   3rd Qu.:1780.76                   
##                     Max.   :4.000   Max.   :6649.36                   
##       sexorient                 racethnic                    educ      
##  a_straight:23344   a-nhwhite        :15510   a_less_highschool: 1728  
##  b_bisexual:  410   b-nhblack        : 4980   b_highschool_grad:14244  
##  c_LGB     :  346   c-hispanic       : 1838   c_college_bach   : 4972  
##                     d-asian          : 1470   d_college+       : 3156  
##                     e-native_american:  126                            
##                     f-other          :  176                            
##    transition     transition.w1            w3.residence   UScitizen    
##  Min.   :0.0000   Min.   :0.00000   a_withparents: 9644   a_no : 1488  
##  1st Qu.:0.0000   1st Qu.:0.00000   b_withsomeone: 1372   b_yes:22612  
##  Median :0.0000   Median :0.00000   c_myownplace :11784                
##  Mean   :0.1116   Mean   :0.02232   d_group.qrts : 1152                
##  3rd Qu.:0.0000   3rd Qu.:0.00000   e_homeless   :    8                
##  Max.   :1.0000   Max.   :1.00000   f_other      :  140                
##       time       age_enter        age_exit          aid       
##  Min.   :1.0   Min.   :11.00   Min.   :18.00   Min.   :    1  
##  1st Qu.:1.0   1st Qu.:16.00   1st Qu.:22.00   1st Qu.: 3013  
##  Median :1.5   Median :19.00   Median :26.00   Median : 6026  
##  Mean   :1.5   Mean   :19.26   Mean   :25.69   Mean   : 6026  
##  3rd Qu.:2.0   3rd Qu.:22.00   3rd Qu.:29.00   3rd Qu.: 9038  
##  Max.   :2.0   Max.   :28.00   Max.   :34.00   Max.   :12050  
##      trans            racethnic_new  
##  Min.   :0.00000   a_white   :15510  
##  1st Qu.:0.00000   b_black   : 4980  
##  Median :0.00000   c_hispanic: 1838  
##  Mean   :0.06697   other     : 1772  
##  3rd Qu.:0.00000                     
##  Max.   :1.00000

ii. Results of statistical tests

summary(fit.000)
## 
## Call:
## svyglm(formula = trans ~ age_enter + sex + sexorient + racethnic_new + 
##     educ + w3.residence + UScitizen, design = des2, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psuscid, strata = ~region, weights = ~GSWGT1, 
##     data = adlong, nest = T)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -6.33712    0.32122 -19.728  < 2e-16 ***
## age_enter                  0.18672    0.01173  15.924  < 2e-16 ***
## sexb_female                0.08540    0.07553   1.131  0.26061    
## sexorientb_bisexual        0.43159    0.21684   1.990  0.04898 *  
## sexorientc_LGB             0.09202    0.24700   0.373  0.71018    
## racethnic_newb_black       0.28146    0.09306   3.025  0.00309 ** 
## racethnic_newc_hispanic    0.28979    0.11666   2.484  0.01447 *  
## racethnic_newother        -0.02816    0.20983  -0.134  0.89349    
## educb_highschool_grad     -0.27973    0.13022  -2.148  0.03386 *  
## educc_college_bach        -0.50535    0.15741  -3.210  0.00173 ** 
## educd_college+            -0.56602    0.17302  -3.271  0.00142 ** 
## w3.residenceb_withsomeone  0.07928    0.14139   0.561  0.57609    
## w3.residencec_myownplace  -0.12972    0.07367  -1.761  0.08101 .  
## w3.residenced_group.qrts   0.24837    0.19233   1.291  0.19924    
## w3.residencee_homeless     0.62435    1.01484   0.615  0.53966    
## w3.residencef_other        0.09111    0.33589   0.271  0.78671    
## UScitizenb_yes             0.18349    0.18828   0.975  0.33190    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9970073)
## 
## Number of Fisher Scoring iterations: 6

iii. Interpretation of the results

Bisexuals are more likely to beaten up than heterosexuals. Non-Hispanic blacks and Hispanics are more likely to be beaten up than non-Hispanic white. People who have a high school, college or higher degree are less likely to be beaten up than people with less than a high school education.