Luttinen HW 6

#load libraries
library(haven)
library(gtsummary)
library(MASS)

## Warning: package 'MASS' was built under R version 4.1.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:gtsummary':
## 
##     select

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

#read in data
uganda16 <- read_dta("C:/Users/rlutt/Downloads/UGIR7BFL.DTA")
uganda16<-zap_labels(uganda16)

#recodes
#bank account
uganda16$bank<-car::Recode(uganda16$v170, recodes= "0= 'No'; 1= 'Yes'")
#age groups
uganda16$agegroup <- car::Recode(uganda16$v013, recodes = "1='15-19'; 2='20-24';3='25-29'; 4 = '30-34';5='35-39';6='40-44';7='45+' ", as.factor=T)
#fertility preferences
uganda16$wantanotherchild<-ifelse(uganda16$v602!=9&uganda16$v602==1,1,0)

#education level
uganda16$educationlevel <- car::recode(uganda16$v106, 
                            recodes = "0 = 'none'; 1 = 'primary'; 2:3='secondary and above' ",
                            as.factor=T)
#internet
uganda16$internet<-as.factor(uganda16$v171a)
uganda16$internet<-car::Recode(uganda16$v171a, recodes= "0='never'; 1= 'in the past year'; 2='over a year ago'; 3= 'yes, but unsure when'", as.factor=T)

#contraception
uganda16$contraception<-as.factor(uganda16$v313)
uganda16$contraception<-car::Recode(uganda16$v313, recodes= "0='none'; 1='folkloric method'; 2='traditional method' ;3= 'modern method'", as.factor=T)

#usingmoderncontraception
uganda16$moderncon<-car::Recode(uganda16$v313, recodes= "0='no'; 1='no'; 2='no'; 3='yes'", as.factor=T)

#has another kid
uganda16$currentchildren<-uganda16$v202+uganda16$v203


#regions
uganda16$region<-as.factor(uganda16$v024)
uganda16$regions<-car::Recode(uganda16$v024, recodes= "0='Kampala'; 1= 'South Buganda'; 2='North Buganda';3='Busoga';4='Bukedi'; 5='Bugisu'; 6='Teso';7='Karamoja'; 8='Lango'; 9='Acholi'; 10='West Nile'; 11='Bunyoro';12='Tooro'; 13='Ankole';14='Kigezi'")

# survey design variables
uganda16$psu <- uganda16$v021
uganda16$strata <- uganda16$v022
uganda16$pwt <- uganda16$v005/1000000
desi<-svydesign(ids = ~ psu, strata = ~ strata, weights =~ pwt, data=uganda16)

1) Define a count outcome for the dataset of your choosing, the Area Resource File used in class provides many options here

State a research question about your outcome

At what rate are women in Uganda actively using contraception? In other words: how many women per women in child-rearing years are actively using contraception per region?

Is an offset term necessary? why or why not? If an offset term is need, what is the appropriate offset?

Yes, the offset term that is necessary in this context is the population of women in Uganda in their child-rearing years (ages 15-49). Offset terms are used in the Poisson model, to convert a count outcome into a rate.

#offset term

#add person weight to get total population
uganda16$pwt <- uganda16$v005/1000000

#count number of respondents and apply weights
nrow(uganda16)

## [1] 18506

uganda16$offset2<-18506*uganda16$pwt


#get weighted # of women

library(survey)

svytable(~uganda16$moderncon+uganda16$regions, design=desi)

##                   uganda16$regions
## uganda16$moderncon     Acholi     Ankole     Bugisu     Bukedi    Bunyoro
##                no   706.90778 1084.96017  606.68150  836.62567  774.38142
##                yes  217.51421  412.69561  313.99111  332.74685  239.49458
##                   uganda16$regions
## uganda16$moderncon     Busoga    Kampala   Karamoja     Kigezi      Lango
##                no  1273.64691  731.79926  344.00124  510.59635  702.38910
##                yes  416.48729  292.95630   20.64405  220.96587  307.28332
##                   uganda16$regions
## uganda16$moderncon North Buganda South Buganda       Teso      Tooro  West Nile
##                no     1322.89924    1715.57354  840.77748  939.49550 1065.27710
##                yes     639.85610     778.13304  257.72262  417.34015  182.15685

Consider a Poisson regression model for the outcome
1. Evaluate the level of dispersion in the outcome
2. Is the Poisson model a good choice?

#filter na
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::select() masks MASS::select(), gtsummary::select()
## x tidyr::unpack() masks Matrix::unpack()

uganda16 %>% 
  drop_na()

## # A tibble: 0 x 4,986
## # ... with 4,986 variables: caseid <chr>, v000 <chr>, v001 <dbl>, v002 <dbl>,
## #   v003 <dbl>, v004 <dbl>, v005 <dbl>, v006 <dbl>, v007 <dbl>, v008 <dbl>,
## #   v008a <dbl>, v009 <dbl>, v010 <dbl>, v011 <dbl>, v012 <dbl>, v013 <dbl>,
## #   v014 <dbl>, v015 <dbl>, v016 <dbl>, v017 <dbl>, v018 <dbl>, v019 <dbl>,
## #   v019a <dbl>, v020 <dbl>, v021 <dbl>, v022 <dbl>, v023 <dbl>, v024 <dbl>,
## #   v025 <dbl>, v026 <dbl>, v027 <dbl>, v028 <dbl>, v029 <dbl>, v030 <dbl>,
## #   v031 <dbl>, v032 <dbl>, v034 <dbl>, v040 <dbl>, v042 <dbl>, v044 <dbl>, ...

options(na.action = "na.omit")

#Here I use the original variables because the model will not run with factor varibles, which I how I recoded them


#poisson model
glmp_s<-glm(v313 ~offset(log(offset2))+factor(regions), data=uganda16, family=poisson)


#check dispersion

1-pchisq(glmp_s$deviance, df=glmp_s$df.residual)

## [1] 0

#The p value is 0 which means this model does not fit the data.

#3) Consider a Negative binomial model

#negative binomial

init.coef = c(1, 14)
glmb<-glm.nb(v313 ~offset(log(offset2))+ factor(regions) + factor(agegroup) +currentchildren, data=uganda16)
glmb%>%
  tbl_regression(exp=T)

Characteristic	IRR¹	95% CI¹	p-value
factor(regions)
Acholi	—	—
Ankole	0.84	0.70, 1.01	0.064
Bugisu	1.49	1.22, 1.82	<0.001
Bukedi	1.16	0.97, 1.40	0.11
Bunyoro	1.23	1.02, 1.48	0.030
Busoga	1.51	1.25, 1.82	<0.001
Kampala	1.72	1.43, 2.07	<0.001
Karamoja	0.49	0.38, 0.64	<0.001
Kigezi	1.43	1.17, 1.74	<0.001
Lango	1.36	1.13, 1.64	<0.001
North Buganda	1.35	1.12, 1.62	<0.001
South Buganda	1.23	1.03, 1.48	0.017
Teso	1.10	0.91, 1.32	0.3
Tooro	1.17	0.97, 1.40	0.095
West Nile	0.54	0.45, 0.66	<0.001
factor(agegroup)
15-19	—	—
20-24	3.26	2.91, 3.65	<0.001
25-29	3.78	3.35, 4.28	<0.001
30-34	3.73	3.27, 4.27	<0.001
35-39	3.37	2.91, 3.91	<0.001
40-44	3.31	2.84, 3.86	<0.001
45+	1.86	1.57, 2.21	<0.001
sons at home	1.11	1.08, 1.13	<0.001
¹ IRR = Incidence Rate Ratio, CI = Confidence Interval

#4) Compare the model fits of the alternative models using AIC

#calculate AICs

AIC(glmp_s, glmb)

##        df      AIC
## glmp_s 15 57900.07
## glmb   23 44674.72

#In summary, the negative binomial is a better choice because of a lower AIC.

Luttinen HW 6

Rebecca Luttinen

3/10/2022

1) Define a count outcome for the dataset of your choosing, the Area Resource File used in class provides many options here