library(foreign)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(labelled)
library(lattice)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(WriteXLS)
library(writexl)
brfss21 = read.xport("/Users/briansurratt/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/DEM 7283 Stats II/Assignment 4/LLCP2021.XPT_")
#codebook: https://www.cdc.gov/brfss/annual_data/annual_2021.html
head(brfss21)
## X_STATE FMONTH IDATE IMONTH IDAY IYEAR DISPCODE SEQNO X_PSU
## 1 1 1 01192021 01 19 2021 1100 2021000001 2.021e+09
## 2 1 1 01212021 01 21 2021 1100 2021000002 2.021e+09
## 3 1 1 01212021 01 21 2021 1100 2021000003 2.021e+09
## 4 1 1 01172021 01 17 2021 1100 2021000004 2.021e+09
## 5 1 1 01152021 01 15 2021 1100 2021000005 2.021e+09
## 6 1 1 01142021 01 14 2021 1100 2021000006 2.021e+09
## CTELENM1 PVTRESD1 COLGHOUS STATERE1 CELPHON1 LADULT1 COLGSEX NUMADULT LANDSEX
## 1 1 1 NA 1 2 1 NA 2 NA
## 2 1 1 NA 1 2 1 NA 2 NA
## 3 1 1 NA 1 2 1 NA 2 NA
## 4 1 1 NA 1 2 1 NA 2 NA
## 5 1 1 NA 1 2 1 NA 2 NA
## 6 1 1 NA 1 2 1 NA 1 1
## NUMMEN NUMWOMEN RESPSLCT SAFETIME CTELNUM1 CELLFON5 CADULT1 CELLSEX PVTRESD3
## 1 1 1 2 NA NA NA NA NA NA
## 2 1 1 2 NA NA NA NA NA NA
## 3 1 1 2 NA NA NA NA NA NA
## 4 1 1 2 NA NA NA NA NA NA
## 5 1 1 1 NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## CCLGHOUS CSTATE1 LANDLINE HHADULT SEXVAR GENHLTH PHYSHLTH MENTHLTH POORHLTH
## 1 NA NA NA NA 2 5 20 10 88
## 2 NA NA NA NA 2 3 88 88 NA
## 3 NA NA NA NA 2 2 88 88 NA
## 4 NA NA NA NA 2 2 88 10 88
## 5 NA NA NA NA 1 5 30 88 30
## 6 NA NA NA NA 1 3 88 88 NA
## PRIMINSR PERSDOC3 MEDCOST1 CHECKUP1 EXERANY2 BPHIGH6 BPMEDS CHOLCHK3 TOLDHI3
## 1 3 1 2 2 2 3 NA 2 1
## 2 1 2 2 1 1 1 1 2 1
## 3 2 2 2 1 2 1 1 2 2
## 4 2 1 2 1 1 1 1 2 1
## 5 3 1 2 1 1 4 NA 2 1
## 6 3 1 2 1 2 3 NA 2 2
## CHOLMED3 CVDINFR4 CVDCRHD4 CVDSTRK3 ASTHMA3 ASTHNOW CHCSCNCR CHCOCNCR
## 1 1 2 2 2 1 1 2 2
## 2 1 2 1 2 2 NA 2 2
## 3 2 2 1 2 2 NA 2 2
## 4 2 2 2 2 2 NA 2 2
## 5 1 1 7 1 2 NA 2 2
## 6 2 2 2 2 2 NA 2 2
## CHCCOPD3 ADDEPEV3 CHCKDNY2 DIABETE4 DIABAGE3 HAVARTH5 ARTHEXER ARTHEDU
## 1 1 2 2 3 NA 1 2 2
## 2 2 2 1 1 98 1 1 2
## 3 2 2 2 1 98 2 NA NA
## 4 2 2 2 1 56 2 NA NA
## 5 2 2 2 1 65 2 NA NA
## 6 1 2 2 3 NA 2 NA NA
## LMTJOIN3 ARTHDIS2 JOINPAI2 MARITAL EDUCA RENTHOM1 NUMHHOL3 NUMPHON3 CPDEMO1B
## 1 2 1 8 1 4 1 1 1 1
## 2 1 1 10 9 6 1 2 NA 1
## 3 NA NA NA 3 4 1 2 NA 1
## 4 NA NA NA 1 4 1 2 NA 1
## 5 NA NA NA 1 3 1 2 NA 8
## 6 NA NA NA 1 5 1 2 NA 1
## VETERAN3 EMPLOY1 CHILDREN INCOME3 PREGNANT WEIGHT2 HEIGHT3 DEAF BLIND DECIDE
## 1 2 7 88 5 NA 72 411 2 2 2
## 2 2 8 88 77 NA 7777 506 2 1 1
## 3 2 7 88 3 NA 170 505 2 2 2
## 4 2 7 88 7 NA 195 504 2 2 2
## 5 2 8 88 4 NA 206 511 1 2 2
## 6 2 7 88 6 NA 195 603 2 2 2
## DIFFWALK DIFFDRES DIFFALON SMOKE100 SMOKDAY2 USENOW3 ECIGNOW1 ALCDAY5
## 1 2 2 1 1 3 3 3 888
## 2 1 2 1 2 NA 3 3 888
## 3 2 2 2 2 NA 3 3 888
## 4 2 2 2 2 NA 3 3 101
## 5 1 2 2 2 NA 2 3 888
## 6 1 2 2 1 3 2 3 888
## AVEDRNK3 DRNK3GE5 MAXDRNKS FLUSHOT7 FLSHTMY3 IMFVPLA2 PNEUVAC4 HIVTST7
## 1 NA NA NA 1 92020 1 1 2
## 2 NA NA NA 2 NA NA 2 2
## 3 NA NA NA 2 NA NA 2 2
## 4 3 1 6 1 102020 1 2 2
## 5 NA NA NA 1 92020 1 1 1
## 6 NA NA NA 1 102020 5 1 2
## HIVTSTD3 FRUIT2 FRUITJU2 FVGREEN1 FRENCHF1 POTATOE1 VEGETAB2 PDIABTST
## 1 NA 101 555 204 203 201 101 2
## 2 NA 101 555 201 555 201 207 NA
## 3 NA 101 555 555 201 201 203 NA
## 4 NA 203 205 303 204 308 205 NA
## 5 777777 101 555 101 202 202 101 NA
## 6 NA 202 555 201 555 201 201 2
## PREDIAB1 INSULIN1 BLDSUGAR FEETCHK3 DOCTDIAB CHKHEMO3 FEETCHK EYEEXAM1
## 1 3 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 3 NA NA NA NA NA NA NA
## DIABEYE DIABEDU TOLDCFS HAVECFS WORKCFS TOLDHEPC TRETHEPC PRIRHEPC HAVEHEPC
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## HAVEHEPB MEDSHEPB HPVADVC4 HPVADSHT TETANUS1 SHINGLE2 LCSFIRST LCSLAST
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## LCSNUMCG LCSCTSCN HADMAM HOWLONG CERVSCRN CRVCLCNC CRVCLPAP CRVCLHPV HADHYST2
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## PSATEST1 PSATIME1 PCPSARS2 PCSTALK HADSIGM4 COLNSIGM COLNTES1 SIGMTES1
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## LASTSIG4 COLNCNCR VIRCOLO1 VCLNTES1 SMALSTOL STOLTEST STOOLDN1 BLDSTFIT
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## SDNATES1 CNCRDIFF CNCRAGE CNCRTYP1 CSRVTRT3 CSRVDOC1 CSRVSUM CSRVRTRN
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## CSRVINST CSRVINSR CSRVDEIN CSRVCLIN CSRVPAIN CSRVCTL2 HOMBPCHK HOMRGCHK
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## WHEREBP SHAREBP WTCHSALT DRADVISE CIMEMLOS CDHOUSE CDASSIST CDHELP CDSOCIAL
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## CDDISCUS CAREGIV1 CRGVREL4 CRGVLNG1 CRGVHRS1 CRGVPRB3 CRGVALZD CRGVPER1
## 1 NA 2 NA NA NA NA NA NA
## 2 NA 2 NA NA NA NA NA NA
## 3 NA 2 NA NA NA NA NA NA
## 4 NA 2 NA NA NA NA NA NA
## 5 NA 2 NA NA NA NA NA NA
## 6 NA 2 NA NA NA NA NA NA
## CRGVHOU1 CRGVEXPT ACEDEPRS ACEDRINK ACEDRUGS ACEPRISN ACEDIVRC ACEPUNCH
## 1 NA 2 2 2 2 2 1 1
## 2 NA 2 2 2 2 2 2 1
## 3 NA 2 2 2 2 2 2 1
## 4 NA 7 1 2 2 2 2 1
## 5 NA 2 2 2 2 2 2 1
## 6 NA 2 2 2 2 2 2 1
## ACEHURT1 ACESWEAR ACETOUCH ACETTHEM ACEHVSEX ACEADSAF ACEADNED MARIJAN1
## 1 1 1 1 1 1 5 3 NA
## 2 1 1 1 1 1 5 5 NA
## 3 1 1 1 1 1 5 5 NA
## 4 3 1 1 1 1 5 5 NA
## 5 1 1 1 1 1 5 5 NA
## 6 1 1 1 1 1 5 5 NA
## USEMRJN3 RSNMRJN2 LASTSMK2 STOPSMK2 FIREARM5 GUNLOAD LOADULK2 RCSGENDR
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## RCSRLTN2 CASTHDX2 CASTHNO2 BIRTHSEX SOMALE SOFEMALE TRNSGNDR QSTVER QSTLANG
## 1 NA NA NA NA NA NA NA 10 1
## 2 NA NA NA NA NA NA NA 10 1
## 3 NA NA NA NA NA NA NA 10 1
## 4 NA NA NA NA NA NA NA 10 1
## 5 NA NA NA NA NA NA NA 10 1
## 6 NA NA NA NA NA NA NA 10 1
## X_METSTAT X_URBSTAT MSCODE X_STSTR X_STRWT X_RAWRAKE X_WT2RAKE X_IMPRACE
## 1 1 1 1 11011 39.76616 2 79.53232 1
## 2 1 1 2 11011 39.76616 2 79.53232 2
## 3 1 1 1 11011 39.76616 2 79.53232 2
## 4 1 1 3 11011 39.76616 2 79.53232 1
## 5 2 1 2 11011 39.76616 2 79.53232 6
## 6 2 2 5 11011 39.76616 1 39.76616 1
## X_CHISPNC X_CRACE1 X_CPRACE1 CAGEG X_CLLCPWT X_DUALUSE X_DUALCOR X_LLCPWT2
## 1 9 NA NA NA NA 1 0.5190191 874.2429
## 2 9 NA NA NA NA 1 0.5190191 874.2429
## 3 9 NA NA NA NA 1 0.5190191 874.2429
## 4 9 NA NA NA NA 1 0.5190191 874.2429
## 5 9 NA NA NA NA 9 NA 2144.7690
## 6 9 NA NA NA NA 1 0.5190191 437.1215
## X_LLCPWT X_RFHLTH X_PHYS14D X_MENT14D X_HLTHPLN X_HCVU652 X_TOTINDA
## 1 744.7455 2 3 2 1 9 2
## 2 299.1374 1 1 1 1 9 1
## 3 587.8630 1 1 1 1 9 2
## 4 1099.6216 1 1 2 1 1 1
## 5 1711.8259 2 3 1 1 9 1
## 6 499.3652 1 1 1 1 9 2
## X_RFHYPE6 X_CHOLCH3 X_RFCHOL3 X_MICHD X_LTASTH1 X_CASTHM1 X_ASTHMS1 X_DRDXAR3
## 1 1 1 2 2 2 2 1 1
## 2 2 1 2 1 1 1 3 1
## 3 2 1 1 1 1 1 3 2
## 4 2 1 2 2 1 1 3 2
## 5 1 1 2 1 1 1 3 2
## 6 1 1 1 2 1 1 3 2
## X_LMTACT3 X_LMTWRK3 X_PRACE1 X_MRACE1 X_HISPANC X_RACE X_RACEG21 X_RACEGR3
## 1 2 1 1 1 2 1 1 1
## 2 1 1 2 2 2 2 2 2
## 3 3 3 2 2 2 2 2 2
## 4 3 3 1 1 2 1 1 1
## 5 3 3 1 7 2 7 2 4
## 6 3 3 1 1 2 1 1 1
## X_RACEPRV X_SEX X_AGEG5YR X_AGE65YR X_AGE80 X_AGE_G HTIN4 HTM4 WTKG3 X_BMI5
## 1 1 2 11 2 70 6 59 150 3266 1454
## 2 2 2 10 2 67 6 66 168 NA NA
## 3 2 2 11 2 72 6 65 165 7711 2829
## 4 1 2 9 1 62 5 64 163 8845 3347
## 5 7 1 12 2 76 6 71 180 9344 2873
## 6 1 1 13 2 80 6 75 191 8845 2437
## X_BMI5CAT X_RFBMI5 X_CHLDCNT X_EDUCAG X_INCOMG1 X_SMOKER3 X_RFSMOK3 X_CURECI1
## 1 1 1 1 2 3 3 1 1
## 2 NA 9 1 4 9 4 1 1
## 3 3 2 1 2 2 4 1 1
## 4 4 2 1 2 5 4 1 1
## 5 3 2 1 1 2 4 1 1
## 6 2 1 1 3 4 3 1 1
## DRNKANY5 DROCDY3_ X_RFBING5 X_DRNKWK1 X_RFDRHV7 X_FLSHOT7 X_PNEUMO3 X_AIDTST4
## 1 2 0 1 0 1 1 1 2
## 2 2 0 1 0 1 2 2 2
## 3 2 0 1 0 1 2 2 2
## 4 1 14 2 300 1 NA NA 2
## 5 2 0 1 0 1 1 1 1
## 6 2 0 1 0 1 1 1 2
## FTJUDA2_ FRUTDA2_ GRENDA1_ FRNCHDA_ POTADA1_ VEGEDA2_ X_MISFRT1 X_MISVEG1
## 1 0 100 57 43 14 100 0 0
## 2 0 100 14 0 14 100 0 0
## 3 0 100 0 14 14 43 0 0
## 4 71 43 10 57 27 71 0 0
## 5 0 100 100 29 29 100 0 0
## 6 0 29 14 0 14 14 0 0
## X_FRTRES1 X_VEGRES1 X_FRUTSU1 X_VEGESU1 X_FRTLT1A X_VEGLT1A X_FRT16A X_VEG23A
## 1 1 1 100 214 1 1 1 1
## 2 1 1 100 128 1 1 1 1
## 3 1 1 100 71 1 2 1 1
## 4 1 1 114 165 1 1 1 1
## 5 1 1 100 258 1 1 1 1
## 6 1 1 29 42 2 2 1 1
## X_FRUITE1 X_VEGETE1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
#checking sample size
nrow(brfss21)
## [1] 438693
# I found an "Own or Rent Home" variable to use as an IV. I can still use mental health as the DV. Is there are difference in mental health between owners, renters, and "other arrangement"?
look_for(brfss21, "RENTHOM1")
## pos variable label col_type values
## 67 RENTHOM1 — dbl
tabyl(brfss21$RENTHOM1)
## brfss21$RENTHOM1 n percent valid_percent
## 1 308963 7.042807e-01 0.704293514
## 2 103944 2.369402e-01 0.236944505
## 3 21187 4.829573e-02 0.048296614
## 7 928 2.115375e-03 0.002115413
## 9 3663 8.349803e-03 0.008349955
## NA 8 1.823599e-05 NA
# Building a working data frame with the variables I want to use: RENTHOM1, MENTHLTH, X_AGE80, X_AGEG5YR.
mh_home <- brfss21 %>%
select(RENTHOM1, MENTHLTH, X_AGE80, X_AGEG5YR)
head(mh_home)
## RENTHOM1 MENTHLTH X_AGE80 X_AGEG5YR
## 1 1 10 70 11
## 2 1 88 67 10
## 3 1 88 72 11
## 4 1 10 62 9
## 5 1 88 76 12
## 6 1 88 80 13
nrow(mh_home)
## [1] 438693
# Restricting sample to "Own", "Rent", "Other Arrangment". Filtering out "Don't know/Not sure", "Refused", and "Not asked or Missing."
mh_home <- mh_home %>%
filter(mh_home$RENTHOM1 %in% c("1","2","3"))
tabyl(mh_home$RENTHOM1)
## mh_home$RENTHOM1 n percent
## 1 308963 0.7117422
## 2 103944 0.2394504
## 3 21187 0.0488074
nrow(mh_home)
## [1] 434094
plot(mh_home$X_AGE80,mh_home$MENTHLTH)
tabyl(mh_home$MENTHLTH)
## mh_home$MENTHLTH n percent
## 1 12988 2.991979e-02
## 2 22553 5.195419e-02
## 3 14450 3.328772e-02
## 4 7369 1.697559e-02
## 5 18969 4.369791e-02
## 6 2215 5.102581e-03
## 7 7369 1.697559e-02
## 8 1553 3.577566e-03
## 9 249 5.736085e-04
## 10 14434 3.325086e-02
## 11 129 2.971707e-04
## 12 1070 2.464904e-03
## 13 160 3.685838e-04
## 14 2700 6.219851e-03
## 15 13371 3.080208e-02
## 16 229 5.275355e-04
## 17 195 4.492115e-04
## 18 285 6.565398e-04
## 19 27 6.219851e-05
## 20 7938 1.828636e-02
## 21 509 1.172557e-03
## 22 158 3.639765e-04
## 23 102 2.349721e-04
## 24 127 2.925634e-04
## 25 2933 6.756601e-03
## 26 114 2.626159e-04
## 27 206 4.745516e-04
## 28 809 1.863652e-03
## 29 397 9.145485e-04
## 30 24686 5.686787e-02
## 77 5603 1.290734e-02
## 88 268161 6.177487e-01
## 99 2036 4.690228e-03
# Coding those with none for number of days to zero. 77 is "Don't know/not sure". 88 is "None". 99 is "Refused".
mh_home$MENTHLTH<- recode(mh_home$MENTHLTH,"88=0")
mh_home <-filter(mh_home, MENTHLTH!=77 & MENTHLTH!=99)
nrow(mh_home)
## [1] 426455
tabyl(mh_home$MENTHLTH)
## mh_home$MENTHLTH n percent
## 0 268161 6.288143e-01
## 1 12988 3.045573e-02
## 2 22553 5.288483e-02
## 3 14450 3.388400e-02
## 4 7369 1.727967e-02
## 5 18969 4.448066e-02
## 6 2215 5.193983e-03
## 7 7369 1.727967e-02
## 8 1553 3.641650e-03
## 9 249 5.838834e-04
## 10 14434 3.384648e-02
## 11 129 3.024938e-04
## 12 1070 2.509057e-03
## 13 160 3.751861e-04
## 14 2700 6.331266e-03
## 15 13371 3.135384e-02
## 16 229 5.369851e-04
## 17 195 4.572581e-04
## 18 285 6.683003e-04
## 19 27 6.331266e-05
## 20 7938 1.861392e-02
## 21 509 1.193561e-03
## 22 158 3.704963e-04
## 23 102 2.391812e-04
## 24 127 2.978040e-04
## 25 2933 6.877631e-03
## 26 114 2.673201e-04
## 27 206 4.830521e-04
## 28 809 1.897035e-03
## 29 397 9.309306e-04
## 30 24686 5.788653e-02
nrow(mh_home)
## [1] 426455
# Creating focal dependent variable.
mh_home$BMH<- recode(mh_home$MENTHLTH, "14:30=1;77=NA; 99=NA; else=0")
tabyl(mh_home$BMH)
## mh_home$BMH n percent
## 0 371669 0.8715316
## 1 54786 0.1284684
tabyl(mh_home$X_AGEG5YR)
## mh_home$X_AGEG5YR n percent
## 1 25210 0.05911526
## 2 21119 0.04952222
## 3 25182 0.05904961
## 4 28001 0.06565992
## 5 28824 0.06758978
## 6 27863 0.06533632
## 7 33391 0.07829900
## 8 37207 0.08724719
## 9 43293 0.10151833
## 10 44774 0.10499115
## 11 41877 0.09819793
## 12 28875 0.06770937
## 13 33111 0.07764242
## 14 7728 0.01812149
# 18-24 to 80+
# Eliminating those with missing data on age.
mh_home <-filter(mh_home, X_AGEG5YR != 14)
nrow(mh_home)
## [1] 418727
tabyl(mh_home$X_AGE80)
## mh_home$X_AGE80 n percent
## 18 3093 0.007386674
## 19 3349 0.007998051
## 20 3465 0.008275081
## 21 3787 0.009044079
## 22 3730 0.008907952
## 23 3977 0.009497835
## 24 3809 0.009096619
## 25 4188 0.010001743
## 26 3914 0.009347379
## 27 4111 0.009817853
## 28 4506 0.010761188
## 29 4400 0.010508040
## 30 5296 0.012647859
## 31 4575 0.010925973
## 32 5206 0.012432922
## 33 5004 0.011950507
## 34 5101 0.012182162
## 35 5675 0.013552983
## 36 5442 0.012996535
## 37 5505 0.013146991
## 38 5839 0.013944647
## 39 5540 0.013230577
## 40 6724 0.016058195
## 41 5359 0.012798315
## 42 5992 0.014310040
## 43 5534 0.013216248
## 44 5215 0.012454415
## 45 5877 0.014035398
## 46 5213 0.012449639
## 47 5576 0.013316552
## 48 5513 0.013166096
## 49 5684 0.013574477
## 50 7553 0.018038006
## 51 6138 0.014658716
## 52 6838 0.016330449
## 53 6435 0.015368008
## 54 6427 0.015348903
## 55 7299 0.017431405
## 56 7019 0.016762712
## 57 7448 0.017787246
## 58 7775 0.018568184
## 59 7666 0.018307871
## 60 9169 0.021897322
## 61 7709 0.018410563
## 62 9039 0.021586857
## 63 8561 0.020445302
## 64 8815 0.021051903
## 65 9824 0.023461587
## 66 8746 0.020887117
## 67 8981 0.021448342
## 68 8682 0.020734273
## 69 8541 0.020397538
## 70 9242 0.022071660
## 71 7759 0.018529973
## 72 8561 0.020445302
## 73 8232 0.019659587
## 74 8083 0.019303747
## 75 6833 0.016318508
## 76 5706 0.013627017
## 77 5677 0.013557760
## 78 5820 0.013899271
## 79 4839 0.011556456
## 80 33111 0.079075388
tabyl(mh_home$X_AGEG5YR)
## mh_home$X_AGEG5YR n percent
## 1 25210 0.06020629
## 2 21119 0.05043620
## 3 25182 0.06013942
## 4 28001 0.06687173
## 5 28824 0.06883721
## 6 27863 0.06654216
## 7 33391 0.07974408
## 8 37207 0.08885742
## 9 43293 0.10339195
## 10 44774 0.10692886
## 11 41877 0.10001027
## 12 28875 0.06895901
## 13 33111 0.07907539
# Recoding to midpoint of interval
mh_home$NEWAGE <- recode(mh_home$X_AGEG5YR, "1=21;2=27;3=32;4=37;5=42;6=47;7=52;8=57;9=62;10=67;11=72;12=77;13=82")
tabyl(mh_home$NEWAGE)
## mh_home$NEWAGE n percent
## 21 25210 0.06020629
## 27 21119 0.05043620
## 32 25182 0.06013942
## 37 28001 0.06687173
## 42 28824 0.06883721
## 47 27863 0.06654216
## 52 33391 0.07974408
## 57 37207 0.08885742
## 62 43293 0.10339195
## 67 44774 0.10692886
## 72 41877 0.10001027
## 77 28875 0.06895901
## 82 33111 0.07907539
# observing association between two categorical variables
# 1 = Own, 2 = Rent, 3 = Other arrangement
chi_data = data.frame(mh_home$BMH, mh_home$RENTHOM1)
chi_data = table(mh_home$BMH, mh_home$RENTHOM1)
print(chi_data)
##
## 1 2 3
## 0 268880 79903 15820
## 1 29310 20303 4511
print(chisq.test(chi_data))
##
## Pearson's Chi-squared test
##
## data: chi_data
## X-squared = 8880.6, df = 2, p-value < 2.2e-16
# Calculating statistics for specific ages so that I can compare actual values to model-based values.
ageprobs<- mh_home %>%
group_by(X_AGE80) %>%
summarize(p=mean(BMH),n=n())
ageprobs$num <- ageprobs$p*(1-ageprobs$p)
ageprobs$sep <- sqrt(ageprobs$num/ageprobs$n)
ageprobs$me <- 2*ageprobs$sep
ggplot(data =ageprobs, aes(x = X_AGE80, y = p, ymin=p-me, ymax=p+me)) +
geom_line() + ylim(0,1) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Mental Health by Age",
subtitle="Home Owners, Renters, and Other Arrangement",
caption="Source: 2021 BRFSS Data") +
xlab(label="Age at Survey") +
ylab(label="Proportion of Those Who Have Frequent Poor Mental Health")
# Splitting out results into an Excel file.
write_xlsx(ageprobs,"/Users/briansurratt/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/DEM 7283 Stats II/Assignment 4/ageprobs.xlsx")
# Calculating statistics for age groups for greater precision since the numbers at specific ages is small.
agecatprobs<- mh_home %>%
group_by(NEWAGE) %>%
summarize(p=mean(BMH),n=n())
agecatprobs$num <- agecatprobs$p*(1-agecatprobs$p)
agecatprobs$sep <- sqrt(agecatprobs$num/agecatprobs$n)
agecatprobs$me <- 2*agecatprobs$sep
ggplot(data =agecatprobs, aes(x = NEWAGE, y = p, ymin=p-me, ymax=p+me)) +
geom_line() + ylim(0,1) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Mental Health by Age Category",
subtitle="Home Owners, Renters, and Other Arrangement",
caption="Source: 2021 BRFSS Data") +
xlab(label="Midpoint of 5-Year Age Category at Survey") +
ylab(label="Proportion of Those Who Have Frequent Poor Mental Health")
# Splitting out results into an Excel file
write_xlsx(agecatprobs,"/Users/briansurratt/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/DEM 7283 Stats II/Assignment 4/agecatprobs.xlsx")
#looking at association between categorical and numeric variable
boxplot(mh_home$X_AGE80 ~ mh_home$RENTHOM1,
col='steelblue',
xlab='Tenancy Subgroup',
ylab='Age at Survey')
# Logit models of bad mental health
mh_home$X_AGE80SQ <- mh_home$X_AGE80*mh_home$X_AGE80
model <- glm(BMH ~ X_AGE80, family = "binomial", data = mh_home)
summary(model)
##
## Call:
## glm(formula = BMH ~ X_AGE80, family = "binomial", data = mh_home)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7448 -0.5722 -0.4734 -0.4073 2.2865
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7348594 0.0136378 -53.88 <2e-16 ***
## X_AGE80 -0.0225403 0.0002594 -86.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 322399 on 418726 degrees of freedom
## Residual deviance: 314763 on 418725 degrees of freedom
## AIC: 314767
##
## Number of Fisher Scoring iterations: 5
model <- glm(BMH ~ X_AGE80 + X_AGE80SQ, family = "binomial", data = mh_home)
summary(model)
##
## Call:
## glm(formula = BMH ~ X_AGE80 + X_AGE80SQ, family = "binomial",
## data = mh_home)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7043 -0.5865 -0.4801 -0.3934 2.3329
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.135e+00 3.668e-02 -30.946 <2e-16 ***
## X_AGE80 -3.987e-03 1.594e-03 -2.501 0.0124 *
## X_AGE80SQ -1.874e-04 1.589e-05 -11.791 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 322399 on 418726 degrees of freedom
## Residual deviance: 314622 on 418724 degrees of freedom
## AIC: 314628
##
## Number of Fisher Scoring iterations: 5
# Creating dummy or indicator variables.
mh_home$own <- recode(mh_home$RENTHOM1, "1=1; else=0")
mh_home$rent <- recode(mh_home$RENTHOM1, "2=1; else=0")
mh_home$other <- recode(mh_home$RENTHOM1, "3=1; else=0")
# Renters as reference group.
model <- glm(BMH ~ own + other , family = "binomial", data = mh_home)
summary(model)
##
## Call:
## glm(formula = BMH ~ own + other, family = "binomial", data = mh_home)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7083 -0.4549 -0.4549 -0.4549 2.1540
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.370045 0.007859 -174.321 < 2e-16 ***
## own -0.846292 0.009980 -84.799 < 2e-16 ***
## other 0.115289 0.018619 6.192 5.94e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 322399 on 418726 degrees of freedom
## Residual deviance: 314156 on 418724 degrees of freedom
## AIC: 314162
##
## Number of Fisher Scoring iterations: 4
# Calculating odds ratios.
exp(coef(model))
## (Intercept) own other
## 0.2540956 0.4290029 1.1221973
# Owners as reference group.
model <- glm(BMH ~ rent + other , family = "binomial", data = mh_home)
summary(model)
##
## Call:
## glm(formula = BMH ~ rent + other, family = "binomial", data = mh_home)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7083 -0.4549 -0.4549 -0.4549 2.1540
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.216336 0.006151 -360.35 <2e-16 ***
## rent 0.846292 0.009980 84.80 <2e-16 ***
## other 0.961580 0.017964 53.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 322399 on 418726 degrees of freedom
## Residual deviance: 314156 on 418724 degrees of freedom
## AIC: 314162
##
## Number of Fisher Scoring iterations: 4
# Calculating odds ratios.
exp(coef(model))
## (Intercept) rent other
## 0.1090077 2.3309867 2.6158270
model <- glm(BMH ~ rent + other + X_AGE80 , family = "binomial", data = mh_home)
summary(model)
##
## Call:
## glm(formula = BMH ~ rent + other + X_AGE80, family = "binomial",
## data = mh_home)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8106 -0.5420 -0.4530 -0.4023 2.2980
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3256173 0.0169084 -78.40 <2e-16 ***
## rent 0.6293490 0.0107641 58.47 <2e-16 ***
## other 0.6604368 0.0189299 34.89 <2e-16 ***
## X_AGE80 -0.0155091 0.0002816 -55.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 322399 on 418726 degrees of freedom
## Residual deviance: 311079 on 418723 degrees of freedom
## AIC: 311087
##
## Number of Fisher Scoring iterations: 5
# Generating predicted probabilities for age by evaluating subgrouos ar=t the mean
# https://druedin.com/2016/01/16/predicted-probabilities-in-r/
dataforprobs <- with(mh_home, data.frame(X_AGE80 = 18:80, rent=mean(rent, na.rm=TRUE), other=mean(other, na.rm=TRUE)))
preds <- predict(model, dataforprobs, type="response", se.fit=TRUE)
predf <- preds$fit # predicted
lower <- preds$fit - (1.96*preds$se.fit) # lower bounds
upper <- preds$fit + (1.96*preds$se.fit) # upper bounds
plot(18:80, predf, type="l", ylab="Predicted Probability of Frequent Poor Health", xlab="Age at Interview", bty="n")
lines(18:80, lower, lty=2)
lines(18:80, upper, lty=2)
# exploring interactions
mh_home$rentage <- mh_home$rent*mh_home$X_AGE80
mh_home$otherage <- mh_home$other*mh_home$X_AGE80
model <- glm(BMH ~ rent + other + X_AGE80 + rentage + otherage, family = "binomial", data = mh_home)
summary(model)
##
## Call:
## glm(formula = BMH ~ rent + other + X_AGE80 + rentage + otherage,
## family = "binomial", data = mh_home)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7749 -0.5766 -0.4504 -0.3856 2.3451
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0610171 0.0221085 -47.991 < 2e-16 ***
## rent 0.1466822 0.0304953 4.810 1.51e-06 ***
## other 0.1905858 0.0423254 4.503 6.70e-06 ***
## X_AGE80 -0.0202814 0.0003861 -52.527 < 2e-16 ***
## rentage 0.0099063 0.0005955 16.636 < 2e-16 ***
## otherage 0.0103455 0.0009338 11.079 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 322399 on 418726 degrees of freedom
## Residual deviance: 310758 on 418721 degrees of freedom
## AIC: 310770
##
## Number of Fisher Scoring iterations: 5
# Another way to do this is to run models by subgroup.
# Now let's revisit original continuous variable since we are throwing information away by using a binary variable
boxplot(mh_home$MENTHLTH ~ mh_home$RENTHOM1,
col='steelblue',
xlab='Own/Rent/Other Subgroup',
ylab='Number of Days with Poor Mental Health')
In this exercise, I selected RENTHOME1 as the independent variable. The question is “do you own or rent your home”. I included the answers “Own,” “Rent,” and “Other Arrangement.” I exclued “Don’t know/Not sure,” “Refused,” and “Not asked or Missing.”
I used MENTHLTH as the dependent variable. I thought it would be interesting to see the relationship between owner/renter/other status and mental health.
After all restrictions, the sample size was 418,727.
For the two models holding “Renters” as the reference group and “Owners” as the reference group, the p values are approaching 0 for both IVs, so the models would appear to be a good fit.
The coefficients for IVs are equivalent to log(odds). After running regressions, I converted the coefficients to odds and found the following results.
When “Renters” are the reference group, the odds for “Owners” having 14 or more days of poor mental health in the past 30 days is 0.429, less than half the odds of “Renters”. The odds of “Other arrangement” tenants having 14 or more days of poor mental health is 1.12, slightly higher than “Renters”.
When “Owners” are the reference group, the odds of “Renters” having 14 or more days of poor mental health in the past 30 days is 2.33 and the odds for “Other arrangement” tenants is 2.62, both more than double the odds of “Owners”.
I could further experiment with regressions using different categories (in the same thematic area). It may be interesting to use income as an IV, converting income brackets to categorical variables, and own/rent/other as the DV. It may be possible to estimate probabilities of owning or renting based on income brackets. I need to work on interpreting model fit. I need to experiment with converting odds ratios to probabilities.