#libraries
library(car, quietly = T)
library(stargazer, quietly = T)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
tb19 <- read_dta("state_19_working_pudf.dta")
nams<-names(tb19)
head(nams, n=10)
## [1] "year" "ststr" "state" "fmonth" "dispcode" "orgseqno"
## [7] "c01q01" "fairpoor" "c02q01" "phyng"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(tb19)<-newnames
#recodevariables
#age
tb19$agegr4 = as.factor(tb19$agegr4)
tb19$age<-Recode(tb19$agegr4, recodes="1='18-19'; 2='30-44'; 3='45-64'; 4='65+'; else=NA", as.factor=T)
#insurance
tb19$c03q01 = as.factor(tb19$c03q01)
tb19$ins<-Recode(tb19$c03q01, recodes ="1=1;2=0")
#employment
tb19$c08q14 = as.factor(tb19$c08q14)
tb19$employ<-Recode(tb19$c08q14, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
tb19$employ<-relevel(tb19$employ, ref='Employed')
#education level
tb19$educat4 = as.factor(tb19$educat4)
tb19$educ<-Recode(tb19$educat4, recodes="1='leshs'; 2='hsgrad'; 3='sumcol'; 4='colgrad'", as.factor=T)
tb19$educ<-relevel(tb19$educ, ref='hsgrad')
#race/ethnicity
tb19$raceeth = as.factor(tb19$raceeth)
tb19$black<-Recode(tb19$raceeth, recodes="2=1; 9=NA; else=0")
tb19$white<-Recode(tb19$raceeth, recodes="1=1; 9=NA; else=0")
tb19$other<-Recode(tb19$raceeth, recodes="3:4=1; 9=NA; else=0")
tb19$hispanic<-Recode(tb19$raceeth, recodes="5=1; 9=NA; else=0")
tb19$race_eth<-Recode(tb19$raceeth, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)
#smoking currently
tb19$smoker2 = as.factor(tb19$smoker2)
tb19$smoke<-Recode(tb19$smoker2, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
tb19$smoke<-relevel(tb19$smoke, ref = "NeverSmoked")
#Poor or fair self rated health
tb19$fairpoor = as.factor(tb19$fairpoor)
tb19$genhlth<-Recode(tb19$fairpoor, recodes="1='poor'; 2='good'")
#sex
tb19$sex<-as.factor(ifelse(tb19$sex==1, "Male", "Female"))
#marital status
tb19$c08q05 = as.factor(tb19$c08q05)
tb19$marital<-Recode(tb19$c08q05, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
tb19$marital<-relevel(tb19$marital, ref='married')
#harveyhomedamage
tb19$tx04q04 = as.factor(tb19$tx04q04)
tb19$hdamage<-Recode(tb19$tx04q04, recodes="1='midly';2='moderate'; 3='severe';4='none'; else=NA", as.factor=T)
#Harvey_health
tb19$tx04q05 = as.factor(tb19$tx04q05)
tb19$h_hlth<-Recode(tb19$tx04q05, recodes="1=1;2=0", as.factor=T)
#harveychildasthma
tb19$tx05q01 = as.factor(tb19$tx05q01)
tb19$c_asth<-Recode(tb19$tx05q01, recodes="1=1;2=0; 3='notaffected'")
#harveymold
tb19$tx05q06 = as.factor(tb19$tx05q06)
tb19$mold<-Recode(tb19$tx05q06, recodes="1=1;2=0")
#harveyvermon
tb19$tx05q09 = as.factor(tb19$tx05q09)
tb19$vermon<-Recode(tb19$tx05q09, recodes="1=1;2=0")
#copd
tb19$c06q08 = as.factor(tb19$c06q08)
tb19$copd<-Recode(tb19$c06q08, recodes="1=1;2=0")
#My binary outcome is Chronic Obstructive Pulmonary Disease (COPD).
tb19$copd<-Recode(tb19$c06q08, recodes="1=1; 2=0;else=NA", as.factor=T)
summary(tb19$copd, na.rm = TRUE)
## 0 1 NA's
## 11188 1036 73
tb19_mod<-tb19 %>%
dplyr::select(age,ins,employ,educ,black,white,other,hispanic,race_eth,smoke,genhlth,sex,marital,hdamage,h_hlth,c_asth,mold,vermon,copd) %>%
filter (complete.cases(.))
knitr::kable(head(tb19_mod))
| age | ins | employ | educ | black | white | other | hispanic | race_eth | smoke | genhlth | sex | marital | hdamage | h_hlth | c_asth | mold | vermon | copd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 45-64 | 1 | Employed | hsgrad | 0 | 1 | 0 | 0 | nhwhite | NeverSmoked | good | Female | married | none | 0 | 0 | 0 | 0 | 0 |
| 65+ | 1 | retired | sumcol | 0 | 1 | 0 | 0 | nhwhite | NeverSmoked | good | Female | married | moderate | 0 | 0 | 0 | 0 | 0 |
| 45-64 | 1 | nilf | colgrad | 1 | 0 | 0 | 0 | nh black | NeverSmoked | good | Female | separated | moderate | 1 | 1 | 1 | 0 | 1 |
| 18-19 | 1 | Employed | hsgrad | 1 | 0 | 0 | 0 | nh black | NeverSmoked | good | Male | cohab | moderate | 0 | 0 | 1 | 0 | 0 |
| 30-44 | 1 | unable | hsgrad | 1 | 0 | 0 | 0 | nh black | NeverSmoked | poor | Female | married | none | 0 | 0 | 0 | 0 | 0 |
| 30-44 | 1 | Employed | sumcol | 0 | 1 | 0 | 0 | nhwhite | Current | good | Female | widowed | severe | 0 | 0 | 1 | 0 | 0 |
subj<-tb19%>%
select(age,ins,employ,educ,black,white,other,hispanic,race_eth,smoke,genhlth,sex,marital,hdamage,h_hlth,c_asth,mold,vermon,copd) %>%
filter( complete.cases(.))
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data =tb19)
#lm
fit.logit<-svyglm(copd~sex+ins+employ+educ+race_eth+smoke+genhlth+marital+hdamage+h_hlth+c_asth+mold+vermon+copd,
design= des,
family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): the response appeared on the
## right-hand side and was dropped
## Warning in model.matrix.default(mt, mf, contrasts): problem with term 14 in
## model.matrix: no columns are assigned
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in model.matrix.default(glm.object, data = structure(list(copd =
## structure(c(1L, : the response appeared on the right-hand side and was dropped
## Warning in model.matrix.default(glm.object, data = structure(list(copd =
## structure(c(1L, : problem with term 14 in model.matrix: no columns are assigned
summary(fit.logit)
##
## Call:
## svyglm(formula = copd ~ sex + ins + employ + educ + race_eth +
## smoke + genhlth + marital + hdamage + h_hlth + c_asth + mold +
## vermon + copd, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = tb19)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -57.584 3.873 -14.870 2.75e-13 ***
## sexMale 32.117 2.631 12.208 1.57e-11 ***
## ins1 2.725 1.212 2.249 0.034370 *
## employnilf 41.660 1.821 22.874 < 2e-16 ***
## employretired 113.784 3.516 32.358 < 2e-16 ***
## employunable -17.091 3.562 -4.798 7.71e-05 ***
## educcolgrad 1.165 1.414 0.824 0.418441
## educleshs 31.501 2.288 13.767 1.36e-12 ***
## educsumcol 6.376 1.354 4.708 9.64e-05 ***
## race_ethnh multirace -9.724 2.395 -4.060 0.000484 ***
## race_ethnh other -62.091 4.053 -15.319 1.47e-13 ***
## race_ethnhwhite -8.878 2.199 -4.038 0.000512 ***
## smokeCurrent 12.921 2.417 5.346 1.98e-05 ***
## smokeFormer -49.481 3.054 -16.202 4.50e-14 ***
## genhlthpoor 19.397 2.276 8.522 1.43e-08 ***
## maritalcohab 47.760 1.295 36.889 < 2e-16 ***
## maritaldivorced 8.584 1.534 5.595 1.08e-05 ***
## maritalnm -48.246 2.011 -23.990 < 2e-16 ***
## maritalseparated -1.721 1.839 -0.936 0.358911
## maritalwidowed 56.888 3.631 15.668 9.16e-14 ***
## hdamagemoderate -81.945 1.531 -53.520 < 2e-16 ***
## hdamagenone -72.190 2.340 -30.857 < 2e-16 ***
## hdamagesevere -70.863 2.718 -26.068 < 2e-16 ***
## h_hlth1 141.048 2.817 50.070 < 2e-16 ***
## c_asth1 -58.657 1.791 -32.744 < 2e-16 ***
## mold1 33.555 1.962 17.105 1.42e-14 ***
## vermon1 105.749 2.977 35.518 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.476337e-11)
##
## Number of Fisher Scoring iterations: 25
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
#tb19_mod <- na.omit(tb19)
set.seed(1115)
trainset<- createDataPartition(y = tb19_mod$copd,
p = .80,
list=F,)
tb19_train<-tb19_mod[trainset,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tb19_test<-tb19_mod[-trainset,]
table(tb19_train$copd)
##
## 0 1
## 55 7
prop.table(table(tb19_train$copd))
##
## 0 1
## 0.8870968 0.1129032
summary(tb19_train)
## age ins employ educ black white other hispanic
## 18-19: 4 0:15 Employed:32 hsgrad :13 0:51 0:21 0:52 0:62
## 30-44:34 1:47 nilf :19 colgrad:20 1:11 1:41 1:10
## 45-64:21 retired : 2 leshs : 9
## 65+ : 3 unable : 9 sumcol :20
##
##
## race_eth smoke genhlth sex marital
## nh black :11 NeverSmoked:39 good:46 Female:43 married :34
## nh multirace: 2 Current :11 poor:16 Male :19 cohab : 6
## nh other : 8 Former :12 divorced :10
## nhwhite :41 nm : 4
## separated: 4
## widowed : 4
## hdamage h_hlth c_asth mold vermon copd
## midly :16 0:53 0 :46 0:41 0:58 0:55
## moderate:18 1: 9 1 :16 1:21 1: 4 1: 7
## none :18 notaffected: 0
## severe :10
##
##
#attach(tb19_train)
#tb19_train$copd <- as.numeric(tb19_train$copd)
glm1<-glm(copd~factor(h_hlth)+factor(employ)+factor(educ)+factor(race_eth)+factor(smoke)+factor(genhlth)+factor(sex)+factor(marital)+factor(hdamage)+factor(c_asth)+factor(mold)+factor(vermon),
data=tb19_train[-1],
family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm1)
##
## Call:
## glm(formula = copd ~ factor(h_hlth) + factor(employ) + factor(educ) +
## factor(race_eth) + factor(smoke) + factor(genhlth) + factor(sex) +
## factor(marital) + factor(hdamage) + factor(c_asth) + factor(mold) +
## factor(vermon), family = binomial, data = tb19_train[-1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.330e-05 -3.558e-06 -2.110e-08 -2.110e-08 1.348e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.329e+01 6.245e+05 0 1
## factor(h_hlth)1 6.963e+01 5.884e+05 0 1
## factor(employ)nilf 1.867e+01 3.007e+05 0 1
## factor(employ)retired 2.990e+01 2.911e+05 0 1
## factor(employ)unable 2.940e+00 5.841e+05 0 1
## factor(educ)colgrad 2.799e+00 4.738e+05 0 1
## factor(educ)leshs 4.307e+01 1.147e+06 0 1
## factor(educ)sumcol 1.158e+01 6.925e+05 0 1
## factor(race_eth)nh multirace -4.309e+01 9.709e+05 0 1
## factor(race_eth)nh other -9.107e+01 5.976e+05 0 1
## factor(race_eth)nhwhite -1.133e+01 3.223e+05 0 1
## factor(smoke)Current 4.237e+01 5.320e+05 0 1
## factor(smoke)Former -4.573e+01 7.885e+05 0 1
## factor(genhlth)poor -1.516e+01 2.131e+05 0 1
## factor(sex)Male 7.256e+00 3.274e+05 0 1
## factor(marital)cohab 4.900e+01 1.697e+05 0 1
## factor(marital)divorced 1.566e+01 2.417e+05 0 1
## factor(marital)nm -4.893e+01 2.520e+05 0 1
## factor(marital)separated 2.565e+01 3.177e+05 0 1
## factor(marital)widowed 3.465e+00 3.137e+05 0 1
## factor(hdamage)moderate -5.883e+01 3.372e+05 0 1
## factor(hdamage)none -1.361e+01 3.392e+05 0 1
## factor(hdamage)severe -4.543e+01 3.867e+05 0 1
## factor(c_asth)1 -9.176e+00 3.624e+05 0 1
## factor(mold)1 7.660e+00 3.277e+05 0 1
## factor(vermon)1 2.553e+01 5.155e+05 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4.3715e+01 on 61 degrees of freedom
## Residual deviance: 1.6203e-09 on 36 degrees of freedom
## AIC: 52
##
## Number of Fisher Scoring iterations: 25
.5
tb19_pred<- predict(glm1,
newdata = tb19_train,
type = "response")
head(tb19_pred)
## 1 2 3 4 5 6
## 2.220446e-16 1.000000e+00 5.625563e-13 1.423922e-11 1.000000e+00 2.220446e-16
tb19_predcl<-factor(ifelse(tb19_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tb19_pred, gr=tb19_predcl, modcon=tb19_train$copd)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of COPD", subtitle = "Threshold = .5")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of COPD", subtitle = "Truth")+
geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table( tb19_predcl,tb19_train$copd)
##
## tb19_predcl 0 1
## 0 55 0
## 1 0 7
cm1<-confusionMatrix(data = tb19_predcl,
reference = tb19_train$copd )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 55 0
## 1 0 7
##
## Accuracy : 1
## 95% CI : (0.9422, 1)
## No Information Rate : 0.8871
## P-Value [Acc > NIR] : 0.0005946
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8871
## Detection Rate : 0.8871
## Detection Prevalence : 0.8871
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
tb19_predcl<-factor(ifelse(tb19_pred>mean(I(tb19$copd==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tb19_pred, gr=tb19_predcl, modcon=tb19_train$copd)
pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of COPD", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(tb19_train$copd==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred2%>%
ggplot(aes(x=pr, fill=modcon))+
geom_histogram(position="identity", alpha=.2)+
ggtitle(label = "Probability of COPD", subtitle = "Truth")+
geom_vline(xintercept=mean(I(tb19_train$copd==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pred_test<-predict(glm1,
newdata=tb19_test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(tb19_test$copd==1)), 1, 0))
table(tb19_test$copd,pred_cl)
## pred_cl
## 0 1
## 0 11 2
## 1 1 0
confusionMatrix(data = pred_cl,tb19_test$copd )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11 1
## 1 2 0
##
## Accuracy : 0.7857
## 95% CI : (0.492, 0.9534)
## No Information Rate : 0.9286
## P-Value [Acc > NIR] : 0.9854
##
## Kappa : -0.1053
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8462
## Specificity : 0.0000
## Pos Pred Value : 0.9167
## Neg Pred Value : 0.0000
## Prevalence : 0.9286
## Detection Rate : 0.7857
## Detection Prevalence : 0.8571
## Balanced Accuracy : 0.4231
##
## 'Positive' Class : 0
##
#The train and test models score diffrent. Looking at the copd observations they seem to be very low and therefore, the data for the models is not accurate. Both models had less than 100 observations, I dont know if it was my recoding...but i know I went wrong somewhere.I cant locate the error(s)