# install.packages('arm') install.packages('lmSupport')
library(arm)
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lattice
## Loading required package: lme4
## Attaching package: 'lme4'
## The following object(s) are masked from 'package:stats':
##
## AIC, BIC
## Loading required package: R2WinBUGS
## Loading required package: coda
## Attaching package: 'coda'
## The following object(s) are masked from 'package:lme4':
##
## HPDinterval
## Loading required package: abind
## Loading required package: foreign
## arm (Version 1.6-03, built: 2013-2-20)
## Working directory is E:/Quant/Assignments/CreateRJournal/Labs/Lab2
## Attaching package: 'arm'
## The following object(s) are masked from 'package:coda':
##
## traceplot
library(MASS)
library(car)
## Loading required package: nnet
library(lmSupport)
## Loading required package: psych
## Attaching package: 'psych'
## The following object(s) are masked from 'package:car':
##
## logit
## The following object(s) are masked from 'package:arm':
##
## rescale, sim
## Loading required package: gplots
## Loading required package: gtools
## Attaching package: 'gtools'
## The following object(s) are masked from 'package:psych':
##
## logit
## The following object(s) are masked from 'package:car':
##
## logit
## Loading required package: gdata
## gdata: Unable to locate valid perl interpreter gdata: gdata: read.xls()
## will be unable to read Excel XLS and XLSX files gdata: unless the 'perl='
## argument is used to specify the location gdata: of a valid perl
## intrpreter. gdata: gdata: (To avoid display of this message in the future,
## please gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls() gdata: to support
## 'XLX' (Excel 97-2004) files.
## ```
## gdata: Run the function 'installXLSXsupport()' gdata: to automatically
## download and install the perl gdata: libaries needed to support Excel XLS
## and XLSX formats.
## Attaching package: 'gdata'
## The following object(s) are masked from 'package:stats':
##
## nobs
## The following object(s) are masked from 'package:utils':
##
## object.size
## Loading required package: caTools
## Loading required package: grid
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
## Attaching package: 'gplots'
## The following object(s) are masked from 'package:stats':
##
## lowess
## Loading required package: gvlma
# Import Data
shotData <- read.table("E:\\Quant\\Assignments\\Lab02\\jh_gardasil.dat",
header = T)
names(shotData)
## [1] "Age" "AgeGroup" "Race" "Shots"
## [5] "Completed" "InsuranceType" "MedAssist" "Location"
## [9] "LocationType" "PracticeType"
# Create Dummy Variables Age Group
shotData$young <- ifelse(shotData$AgeGroup == 0, c(1), c(0))
# Race
shotData$white <- ifelse(shotData$Race == 0, c(1), c(0))
shotData$black <- ifelse(shotData$Race == 1, c(1), c(0))
shotData$hispanic <- ifelse(shotData$Race == 2, c(1), c(0))
shotData$r_other <- ifelse(shotData$Race == 3, c(1), c(0))
# No. Shots - this on is numeric - is it needed?
# All shots completed - no need to create dummy vars :)
# Insurance Type
shotData$medAsst <- ifelse(shotData$InsuranceType == 0, c(1), c(0))
shotData$privatePayer <- ifelse(shotData$InsuranceType == 1, c(1), c(0))
shotData$hospitalBased <- ifelse(shotData$InsuranceType == 2, c(1), c(0))
shotData$military <- ifelse(shotData$InsuranceType == 3, c(1), c(0))
# Medical Assistance - No need to create dummy vars here :)
shotData$noMedAsst <- ifelse(shotData$MedAssist == 0, c(1), c(0))
# Location
shotData$odenton <- ifelse(shotData$Location == 1, c(1), c(0))
shotData$whiteMarsh <- ifelse(shotData$Location == 2, c(1), c(0))
shotData$JHOC <- ifelse(shotData$Location == 3, c(1), c(0))
shotData$bayview <- ifelse(shotData$Location == 4, c(1), c(0))
# Location Type - no need to create dummy var here :)
shotData$suburban <- ifelse(shotData$LocationType == 0, c(1), c(0))
# Practice Type
shotData$pediatric <- ifelse(shotData$PracticeType == 0, c(1), c(0))
shotData$famPractice <- ifelse(shotData$PracticeType == 1, c(1), c(0))
shotData$OBGYN <- ifelse(shotData$PracticeType == 2, c(1), c(0))
names(shotData)
## [1] "Age" "AgeGroup" "Race" "Shots"
## [5] "Completed" "InsuranceType" "MedAssist" "Location"
## [9] "LocationType" "PracticeType" "young" "white"
## [13] "black" "hispanic" "r_other" "medAsst"
## [17] "privatePayer" "hospitalBased" "military" "noMedAsst"
## [21] "odenton" "whiteMarsh" "JHOC" "bayview"
## [25] "suburban" "pediatric" "famPractice" "OBGYN"
# Investigate which groups have higher rates of completion by plotting
# proportions...
plot(factor(shotData$Completed) ~ factor(shotData$AgeGroup), xlab = "Age\n (0=11 - 17,1=18 - 26)",
ylab = "(Completed=1, Not completed=0)")
# Bad... these are very similar
plot(factor(shotData$Completed) ~ factor(shotData$Race), xlab = "Race\n (0=white,1=black, 2=hispanic, 3=other)",
ylab = "(Completed=1, Not completed=0)")
# black appear to have more; white and other are lowest and about the same
plot(factor(shotData$Completed) ~ factor(shotData$MedAssist), xlab = "Medical Assistance\n (0=No, 1=Yes)",
ylab = "(Completed=1, Not completed=0)")
# having medical assistance is higher
plot(factor(shotData$Completed) ~ factor(shotData$InsuranceType), xlab = "Type of Insurance\n (0=medical assistance,1=private, 2=hospital-based, 3=military)",
ylab = "(Completed=1, Not completed=0)")
# medical assistance is highest; hospital based is lowest; private and
# military are about the same
plot(factor(shotData$Completed) ~ factor(shotData$Location), xlab = "Location\n (1=Odenton, 2=White Marsh, 3=John Hopkins 4=Bayview)",
ylab = "(Completed=1, Not completed=0)")
# John Hopkins highest; white marsh lowest
plot(factor(shotData$Completed) ~ factor(shotData$PracticeType), xlab = "Practice Type\n (0=pediatric,1=family practice, 2=OBGYN)",
ylab = "(Completed=1, Not completed=0)")
# All about the same
# Chi-Squared Get frequency of categories
# as.data.frame(table(shotData$Completed)) #Doesn't tell frequency for
# Chi-squared matrix
# Explore the data by computing odds ratios
grp_race <- glm(Completed ~ relevel(factor(Race), "0"), binomial, data = shotData)
grp_race <- glm(Completed ~ relevel(factor(Race), "1"), binomial, data = shotData)
grp_race <- glm(Completed ~ relevel(factor(Race), "2"), binomial, data = shotData)
grp_race <- glm(Completed ~ relevel(factor(Race), "3"), binomial, data = shotData)
summary(grp_race)
##
## Call:
## glm(formula = Completed ~ relevel(factor(Race), "3"), family = binomial,
## data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.982 -0.982 -0.736 1.386 1.697
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5744 0.1527 -3.76 0.00017 ***
## relevel(factor(Race), "3")0 0.0955 0.1706 0.56 0.57552
## relevel(factor(Race), "3")1 -0.5947 0.1892 -3.14 0.00168 **
## relevel(factor(Race), "3")2 -0.1477 0.3328 -0.44 0.65712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796 on 1412 degrees of freedom
## Residual deviance: 1768 on 1409 degrees of freedom
## AIC: 1776
##
## Number of Fisher Scoring iterations: 4
# AIC = 1776
exp(cbind(OR = coef(grp_race), confint(grp_race)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5630 0.4154 0.7568
## relevel(factor(Race), "3")0 1.1003 0.7897 1.5428
## relevel(factor(Race), "3")1 0.5518 0.3811 0.8009
## relevel(factor(Race), "3")2 0.8627 0.4414 1.6375
# COMPARED TO WHITE OR 2.5 % 97.5 % (Intercept) 0.6194690 0.5331816
# 0.7184667 relevel(factor(Race), '0')1 0.5014793 0.3837039 0.6519972
# relevel(factor(Race), '0')2 0.7840816 0.4215722 1.4058708
# relevel(factor(Race), '0')3 0.9088836 0.6481777 1.2663418
# COMPARED TO BLACK (Intercept) 0.3106509 0.2484495 0.3851648
# relevel(factor(Race), '1')0 1.9941003 1.5337489 2.6061764
# relevel(factor(Race), '1')2 1.5635374 0.8246541 2.8673785
# relevel(factor(Race), '1')3 1.8124050 1.2486287 2.6240660
# COMPARED TO HISPANIC (Intercept) 0.4857143 0.2657182 0.8538221
# relevel(factor(Race), '2')0 1.2753774 0.7113029 2.3720730
# relevel(factor(Race), '2')1 0.6395754 0.3487506 1.2126297
# relevel(factor(Race), '2')3 1.1591696 0.6106937 2.2656163
# COMPARED TO OTHER (Intercept) 0.5630252 0.4153814 0.7567774
# relevel(factor(Race), '3')0 1.1002510 0.7896762 1.5427868
# relevel(factor(Race), '3')1 0.5517531 0.3810880 0.8008786
# relevel(factor(Race), '3')2 0.8626866 0.4413810 1.6374822 Based on these
# results, we can order the likelyhood of a given race to complete the
# 3-shots White is most likely, then other, hispanic, black We can
# conclude that white is a potentially good predictor of having completed
# the shots
grp_ageGrp <- glm(Completed ~ factor(AgeGroup), binomial, data = shotData)
summary(grp_ageGrp)
##
## Call:
## glm(formula = Completed ~ factor(AgeGroup), family = binomial,
## data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.932 -0.932 -0.865 1.444 1.527
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6087 0.0791 -7.70 1.4e-14 ***
## factor(AgeGroup)1 -0.1830 0.1131 -1.62 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1793.4 on 1411 degrees of freedom
## AIC: 1797
##
## Number of Fisher Scoring iterations: 4
# AIC = 1797.4
exp(cbind(OR = coef(grp_ageGrp), confint(grp_ageGrp)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5441 0.4654 0.6345
## factor(AgeGroup)1 0.8328 0.6669 1.0393
# Compared to age11-17 OR 2.5 % 97.5 % (Intercept) 0.5440529 0.4653631
# 0.6345436 factor(AgeGroup)1 0.8327522 0.6669498 1.0392945 The age group
# of 11-17 yr olds is more likely to complete the 3 shots
grp_InsType <- glm(Completed ~ relevel(factor(InsuranceType), "0"), binomial,
data = shotData)
summary(grp_InsType)
##
## Call:
## glm(formula = Completed ~ relevel(factor(InsuranceType), "0"),
## family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.117 -0.928 -0.928 1.413 1.794
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.386 0.151 -9.20 < 2e-16
## relevel(factor(InsuranceType), "0")1 0.767 0.170 4.52 6.2e-06
## relevel(factor(InsuranceType), "0")2 1.243 0.266 4.68 2.9e-06
## relevel(factor(InsuranceType), "0")3 0.848 0.189 4.49 7.2e-06
##
## (Intercept) ***
## relevel(factor(InsuranceType), "0")1 ***
## relevel(factor(InsuranceType), "0")2 ***
## relevel(factor(InsuranceType), "0")3 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1763.1 on 1409 degrees of freedom
## AIC: 1771
##
## Number of Fisher Scoring iterations: 4
# AIC = 1771.1
exp(cbind(OR = coef(grp_InsType), confint(grp_InsType)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.250 0.1843 0.3332
## relevel(factor(InsuranceType), "0")1 2.153 1.5534 3.0247
## relevel(factor(InsuranceType), "0")2 3.467 2.0598 5.8496
## relevel(factor(InsuranceType), "0")3 2.335 1.6191 3.3996
# COMPARED TO MEDAssist OR 2.5 % 97.5 % (Intercept) 0.250000 0.1843109
# 0.3332127 relevel(factor(InsuranceType), '0')1 2.153191 1.5533828
# 3.0247079 relevel(factor(InsuranceType), '0')2 3.466667 2.0598141
# 5.8495818 relevel(factor(InsuranceType), '0')3 2.334928 1.6191330
# 3.3995684 Insurance type order: private payer, military, hospital,
# medical assistance A private providor of insurance is most likely likely
# to have a completed regiment
grp_MedAssist <- glm(Completed ~ MedAssist, binomial, data = shotData)
summary(grp_MedAssist)
##
## Call:
## glm(formula = Completed ~ MedAssist, family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.951 -0.951 -0.951 1.422 1.794
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5589 0.0616 -9.07 < 2e-16 ***
## MedAssist -0.8274 0.1629 -5.08 3.8e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1767.3 on 1411 degrees of freedom
## AIC: 1771
##
## Number of Fisher Scoring iterations: 4
# AIC = 1771.3
exp(cbind(OR = coef(grp_MedAssist), confint(grp_MedAssist)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5718 0.5064 0.6448
## MedAssist 0.4372 0.3152 0.5975
# COMPARED TO NO MED ASSIST OR 2.5 % 97.5 % (Intercept) 0.5718232
# 0.5064174 0.6448202 MedAssist 0.4371981 0.3152027 0.5974797 not having
# medical assistance is a better odds ratio of completion
grp_Loc <- glm(Completed ~ relevel(factor(Location), "3"), binomial, data = shotData)
summary(grp_Loc)
##
## Call:
## glm(formula = Completed ~ relevel(factor(Location), "3"), family = binomial,
## data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.152 -0.919 -0.781 1.460 1.757
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.304 0.259 -5.04 4.6e-07 ***
## relevel(factor(Location), "3")1 0.661 0.269 2.46 0.014 *
## relevel(factor(Location), "3")2 1.243 0.302 4.12 3.8e-05 ***
## relevel(factor(Location), "3")4 0.274 0.285 0.96 0.336
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1764.9 on 1409 degrees of freedom
## AIC: 1773
##
## Number of Fisher Scoring iterations: 4
# AIC = 1772.9
exp(cbind(OR = coef(grp_Loc), confint(grp_Loc)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.2714 0.1590 0.441
## relevel(factor(Location), "3")1 1.9372 1.1660 3.369
## relevel(factor(Location), "3")2 3.4675 1.9489 6.397
## relevel(factor(Location), "3")4 1.3158 0.7656 2.352
# COMPARED TO ODENTON OR 2.5 % 97.5 % (Intercept) 0.5258126 0.4538552
# 0.6078322 relevel(factor(Location), '1')2 1.7899465 1.2752501 2.5114681
# relevel(factor(Location), '1')3 0.5162078 0.2968655 0.8576605
# relevel(factor(Location), '1')4 0.6792208 0.5138153 0.8928173 (best)
# white marsh, Odenton, Bayview, John Hopkins (worst)
grp_LocType <- glm(Completed ~ LocationType, binomial, data = shotData)
summary(grp_LocType)
##
## Call:
## glm(formula = Completed ~ LocationType, family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.959 -0.959 -0.764 1.413 1.657
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5381 0.0668 -8.06 7.9e-16 ***
## LocationType -0.5429 0.1273 -4.26 2.0e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1777.1 on 1411 degrees of freedom
## AIC: 1781
##
## Number of Fisher Scoring iterations: 4
# AIC = 1781.1
exp(cbind(OR = coef(grp_LocType), confint(grp_LocType)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.5839 0.5118 0.6651
## LocationType 0.5811 0.4516 0.7441
# COMPARED TO SUBURBAN OR 2.5 % 97.5 % (Intercept) 0.5838816 0.5118243
# 0.6650824 LocationType 0.5810865 0.4515551 0.7440576 Suburban locations
# have higher odds of completion than rural.
grp_Practice <- glm(Completed ~ relevel(factor(PracticeType), "1"), binomial,
data = shotData)
summary(grp_Practice)
##
## Call:
## glm(formula = Completed ~ relevel(factor(PracticeType), "1"),
## family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.973 -0.869 -0.828 1.397 1.573
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.893 0.115 -7.75 9.3e-15
## relevel(factor(PracticeType), "1")0 0.115 0.149 0.77 0.4432
## relevel(factor(PracticeType), "1")2 0.392 0.146 2.68 0.0073
##
## (Intercept) ***
## relevel(factor(PracticeType), "1")0
## relevel(factor(PracticeType), "1")2 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1787.6 on 1410 degrees of freedom
## AIC: 1794
##
## Number of Fisher Scoring iterations: 4
# AIC = 1793.6
exp(cbind(OR = coef(grp_Practice), confint(grp_Practice)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.4093 0.3252 0.5113
## relevel(factor(PracticeType), "1")0 1.1213 0.8377 1.5048
## relevel(factor(PracticeType), "1")2 1.4793 1.1132 1.9728
# COMPARED TO PEDIATRIC OR 2.5 % 97.5 % (Intercept) 0.4589235 0.3801637
# 0.5516303 relevel(factor(PracticeType), '0')1 0.8917966 0.6645241
# 1.1937630 relevel(factor(PracticeType), '0')2 1.3192213 1.0222206
# 1.7043677 OBGYN, Pediatric, Family Practice
# The variables with the lowest AIC's include: Insurance Type: 1771.1
# Medical Assistance: 1771.3 Location: 1772.9 Race' 1776.0 Location Type:
# 1781.1 Practice Type: 1793.6 Age Group: 1797.4
# The categories with the higest odds include: Age Group: 11 - 17 (id 0)
# Race: white (id 0) InsuranceType: hospital based (id 2) MedAssist: none
# (id 0) Location: White Marsh (id 2) LocationType: Suburban (id 0)
# PracticeType: OBGYN (id2)
################################################################################
mle_fullVars_model <- glm(Completed ~ relevel(factor(InsuranceType), "0") +
relevel(factor(Location), "1"), binomial, data = shotData)
summary(mle_fullVars_model)
##
## Call:
## glm(formula = Completed ~ relevel(factor(InsuranceType), "0") +
## relevel(factor(Location), "1"), family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.339 -0.901 -0.734 1.415 1.957
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2719 0.2107 -6.04 1.6e-09
## relevel(factor(InsuranceType), "0")1 0.5806 0.2005 2.90 0.00378
## relevel(factor(InsuranceType), "0")2 1.0705 0.2760 3.88 0.00011
## relevel(factor(InsuranceType), "0")3 0.7295 0.2376 3.07 0.00214
## relevel(factor(Location), "1")2 0.5740 0.1852 3.10 0.00194
## relevel(factor(Location), "1")3 -0.4837 0.2877 -1.68 0.09274
## relevel(factor(Location), "1")4 -0.0956 0.1801 -0.53 0.59556
##
## (Intercept) ***
## relevel(factor(InsuranceType), "0")1 **
## relevel(factor(InsuranceType), "0")2 ***
## relevel(factor(InsuranceType), "0")3 **
## relevel(factor(Location), "1")2 **
## relevel(factor(Location), "1")3 .
## relevel(factor(Location), "1")4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1747.1 on 1406 degrees of freedom
## AIC: 1761
##
## Number of Fisher Scoring iterations: 4
# AIC = 1761.1 Likelihood Ratio Test to see if anything should be
# dropped...
anova(mle_fullVars_model, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Completed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 1412 1796
## relevel(factor(InsuranceType), "0") 3 32.9 1409 1763
## relevel(factor(Location), "1") 3 16.0 1406 1747
## Pr(>Chi)
## NULL
## relevel(factor(InsuranceType), "0") 3.4e-07 ***
## relevel(factor(Location), "1") 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Both terms significant at at least the 0.001 confidence interval.
# Examine the model by only using the variables that have the highest odds
# of completion
mle_fullCategories_model <- glm(Completed ~ young + white + hospitalBased +
noMedAsst + whiteMarsh + suburban + OBGYN, binomial, data = shotData)
summary(mle_fullCategories_model)
##
## Call:
## glm(formula = Completed ~ young + white + hospitalBased + noMedAsst +
## whiteMarsh + suburban + OBGYN, family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.568 -0.947 -0.732 1.290 2.111
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.114 0.208 -10.16 < 2e-16 ***
## young 0.610 0.139 4.38 1.2e-05 ***
## white 0.309 0.121 2.56 0.0103 *
## hospitalBased 0.471 0.246 1.92 0.0554 .
## noMedAsst 0.586 0.204 2.87 0.0041 **
## whiteMarsh 0.307 0.206 1.49 0.1362
## suburban 0.349 0.166 2.10 0.0355 *
## OBGYN 0.365 0.157 2.32 0.0202 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1722.4 on 1405 degrees of freedom
## AIC: 1738
##
## Number of Fisher Scoring iterations: 4
# AIC = 1738.4 Wald's Test suggests that privatePayer, whiteMarsh, and
# suburban have slopes that could be 0. Let's drop these variables...
# Compare Saturated Models...
anova(mle_fullVars_model, mle_fullCategories_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Completed ~ relevel(factor(InsuranceType), "0") + relevel(factor(Location),
## "1")
## Model 2: Completed ~ young + white + hospitalBased + noMedAsst + whiteMarsh +
## suburban + OBGYN
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1406 1747
## 2 1405 1722 1 24.7 6.8e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Using Pearson's Chi-Squared Test, (Ho = there is no difference between
# the models), we can reject the null hypothesis pr = 24.669; p-value =
# 1.694e-06 and declare that there is a significant difference between the
# models. Because the AIC is lower in the categories model, we can
# identify that it is better.
mle_model1 <- glm(Completed ~ young + white + noMedAsst + suburban + OBGYN,
binomial, data = shotData)
summary(mle_model1)
##
## Call:
## glm(formula = Completed ~ young + white + noMedAsst + suburban +
## OBGYN, family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.300 -0.956 -0.732 1.331 2.131
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.162 0.205 -10.54 < 2e-16 ***
## young 0.637 0.138 4.61 4.1e-06 ***
## white 0.330 0.119 2.77 0.00565 **
## noMedAsst 0.648 0.198 3.28 0.00104 **
## suburban 0.333 0.153 2.17 0.02987 *
## OBGYN 0.498 0.139 3.58 0.00035 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796.0 on 1412 degrees of freedom
## Residual deviance: 1729.3 on 1407 degrees of freedom
## AIC: 1741
##
## Number of Fisher Scoring iterations: 4
# AIC = 1744.1; all variables have coeffiecients that are not 0 with at
# least a 0.001 significance level Likelihood Ratio Test to see if
# anything should be dropped...
anova(mle_model1, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Completed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 1412 1796
## young 1 2.62 1411 1793 0.10547
## white 1 18.51 1410 1775 1.7e-05 ***
## noMedAsst 1 28.99 1409 1746 7.3e-08 ***
## suburban 1 3.61 1408 1742 0.05730 .
## OBGYN 1 12.95 1407 1729 0.00032 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# All variables significantly add information to the model at 0.05
mle_model2 <- glm(Completed ~ hospitalBased + noMedAsst + whiteMarsh, binomial,
data = shotData)
summary(mle_model2)
##
## Call:
## glm(formula = Completed ~ hospitalBased + noMedAsst + whiteMarsh,
## family = binomial, data = shotData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.294 -0.908 -0.908 1.473 1.800
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.399 0.151 -9.27 < 2e-16 ***
## hospitalBased 0.363 0.231 1.57 0.11657
## noMedAsst 0.726 0.165 4.39 1.1e-05 ***
## whiteMarsh 0.580 0.171 3.40 0.00067 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1796 on 1412 degrees of freedom
## Residual deviance: 1752 on 1409 degrees of freedom
## AIC: 1760
##
## Number of Fisher Scoring iterations: 4
# AIC = 1760
anova(mle_model2, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Completed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 1412 1796
## hospitalBased 1 6.73 1411 1789 0.00950 **
## noMedAsst 1 25.82 1410 1763 3.7e-07 ***
## whiteMarsh 1 11.42 1409 1752 0.00073 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# All parameters are significant at 0.001
anova(mle_model1, mle_model2, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Completed ~ young + white + noMedAsst + suburban + OBGYN
## Model 2: Completed ~ hospitalBased + noMedAsst + whiteMarsh
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1407 1729
## 2 1409 1752 -2 -22.7 1.2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Using Pearson's Chi-Squared Test, (Ho = there is no difference between
# the models), we can reject the null hypothesis pr = 22.713; p-value =
# 1.694e-06 and declare that there is a significant difference between the
# models. Because the AIC is much lower in the reduced model1, we can
# identify that it is better.
anova(mle_fullCategories_model, mle_model1, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Completed ~ young + white + hospitalBased + noMedAsst + whiteMarsh +
## suburban + OBGYN
## Model 2: Completed ~ young + white + noMedAsst + suburban + OBGYN
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1405 1722
## 2 1407 1729 -2 -6.9 0.032 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Using the Likelihood Ratio Test, (Ho = there is no difference between
# the models), we can not reject the null hypothesis pr = 6.899; p-value =
# 0.032 and declare that there is not a significant difference between the
# models. Because the reduced model is more generalized, we will use it
# instead.
# The GLM Model:
exp(cbind(OR = coef(mle_model1), confint(mle_model1)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.1151 0.07638 0.1708
## young 1.8905 1.44433 2.4838
## white 1.3908 1.10140 1.7577
## noMedAsst 1.9122 1.30325 2.8311
## suburban 1.3952 1.03516 1.8892
## OBGYN 1.6453 1.25373 2.1646
# y =
# (0.12+1.83(young)+1.40(white)+2.37(noMedAsst)+1.61(OBGYN))/(1+0.12+1.83(young)+1.40(white)+2.37(noMedAsst)+1.61(OBGYN))
# young = 0 white = 0 noMedAsst =0 OBGYN = 0
# (0.12+1.83*(young)+1.40*(white)+2.37*(noMedAsst)+1.61*(OBGYN))/(1+1.83*(young)+1.40*(white)+2.37*(noMedAsst)+1.61*(OBGYN))
# Ranges from .12 (all values 0) to .89 (all values 1)
# Test for multicolinearity
mean(vif(mle_fullVars_model)) #I don't understand this result.... mean = 1.97
## [1] 1.973
mean(vif(mle_fullCategories_model)) # 1.435
## [1] 1.435
mean(vif(mle_model1)) #1.344
## [1] 1.344
mean(vif(mle_model2)) #1.027
## [1] 1.027