gdata: Unable to load perl libaries needed by read.xls() gdata: to support

# install.packages('arm') install.packages('lmSupport')
library(arm)

## Loading required package: MASS

## Loading required package: Matrix

## Loading required package: lattice

## Loading required package: lme4

## Attaching package: 'lme4'

## The following object(s) are masked from 'package:stats':
## 
## AIC, BIC

## Loading required package: R2WinBUGS

## Loading required package: coda

## Attaching package: 'coda'

## The following object(s) are masked from 'package:lme4':
## 
## HPDinterval

## Loading required package: abind

## Loading required package: foreign

## arm (Version 1.6-03, built: 2013-2-20)

## Working directory is E:/Quant/Assignments/CreateRJournal/Labs/Lab2

## Attaching package: 'arm'

## The following object(s) are masked from 'package:coda':
## 
## traceplot

library(MASS)
library(car)

## Loading required package: nnet

library(lmSupport)

## Loading required package: psych

## Attaching package: 'psych'

## The following object(s) are masked from 'package:car':
## 
## logit

## The following object(s) are masked from 'package:arm':
## 
## rescale, sim

## Loading required package: gplots

## Loading required package: gtools

## Attaching package: 'gtools'

## The following object(s) are masked from 'package:psych':
## 
## logit

## The following object(s) are masked from 'package:car':
## 
## logit

## Loading required package: gdata

## gdata: Unable to locate valid perl interpreter gdata: gdata: read.xls()
## will be unable to read Excel XLS and XLSX files gdata: unless the 'perl='
## argument is used to specify the location gdata: of a valid perl
## intrpreter. gdata: gdata: (To avoid display of this message in the future,
## please gdata: ensure perl is installed and available on the executable
## gdata: search path.)

## gdata: Unable to load perl libaries needed by read.xls() gdata: to support
## 'XLX' (Excel 97-2004) files.

## ```

gdata: Unable to load perl libaries needed by read.xls() gdata: to support

'XLSX' (Excel 2007+) files.

“

## gdata: Run the function 'installXLSXsupport()' gdata: to automatically
## download and install the perl gdata: libaries needed to support Excel XLS
## and XLSX formats.

## Attaching package: 'gdata'

## The following object(s) are masked from 'package:stats':
## 
## nobs

## The following object(s) are masked from 'package:utils':
## 
## object.size

## Loading required package: caTools

## Loading required package: grid

## Loading required package: KernSmooth

## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009

## Attaching package: 'gplots'

## The following object(s) are masked from 'package:stats':
## 
## lowess

## Loading required package: gvlma


# Import Data
shotData <- read.table("E:\\Quant\\Assignments\\Lab02\\jh_gardasil.dat", 
    header = T)
names(shotData)

##  [1] "Age"           "AgeGroup"      "Race"          "Shots"        
##  [5] "Completed"     "InsuranceType" "MedAssist"     "Location"     
##  [9] "LocationType"  "PracticeType"


# Create Dummy Variables Age Group
shotData$young <- ifelse(shotData$AgeGroup == 0, c(1), c(0))

# Race
shotData$white <- ifelse(shotData$Race == 0, c(1), c(0))
shotData$black <- ifelse(shotData$Race == 1, c(1), c(0))
shotData$hispanic <- ifelse(shotData$Race == 2, c(1), c(0))
shotData$r_other <- ifelse(shotData$Race == 3, c(1), c(0))

# No. Shots - this on is numeric - is it needed?

# All shots completed - no need to create dummy vars :)

# Insurance Type
shotData$medAsst <- ifelse(shotData$InsuranceType == 0, c(1), c(0))
shotData$privatePayer <- ifelse(shotData$InsuranceType == 1, c(1), c(0))
shotData$hospitalBased <- ifelse(shotData$InsuranceType == 2, c(1), c(0))
shotData$military <- ifelse(shotData$InsuranceType == 3, c(1), c(0))

# Medical Assistance - No need to create dummy vars here :)
shotData$noMedAsst <- ifelse(shotData$MedAssist == 0, c(1), c(0))

# Location
shotData$odenton <- ifelse(shotData$Location == 1, c(1), c(0))
shotData$whiteMarsh <- ifelse(shotData$Location == 2, c(1), c(0))
shotData$JHOC <- ifelse(shotData$Location == 3, c(1), c(0))
shotData$bayview <- ifelse(shotData$Location == 4, c(1), c(0))

# Location Type - no need to create dummy var here :)
shotData$suburban <- ifelse(shotData$LocationType == 0, c(1), c(0))

# Practice Type
shotData$pediatric <- ifelse(shotData$PracticeType == 0, c(1), c(0))
shotData$famPractice <- ifelse(shotData$PracticeType == 1, c(1), c(0))
shotData$OBGYN <- ifelse(shotData$PracticeType == 2, c(1), c(0))

names(shotData)

##  [1] "Age"           "AgeGroup"      "Race"          "Shots"        
##  [5] "Completed"     "InsuranceType" "MedAssist"     "Location"     
##  [9] "LocationType"  "PracticeType"  "young"         "white"        
## [13] "black"         "hispanic"      "r_other"       "medAsst"      
## [17] "privatePayer"  "hospitalBased" "military"      "noMedAsst"    
## [21] "odenton"       "whiteMarsh"    "JHOC"          "bayview"      
## [25] "suburban"      "pediatric"     "famPractice"   "OBGYN"



# Investigate which groups have higher rates of completion by plotting
# proportions...
plot(factor(shotData$Completed) ~ factor(shotData$AgeGroup), xlab = "Age\n (0=11 - 17,1=18 - 26)", 
    ylab = "(Completed=1, Not completed=0)")

plot of chunk unnamed-chunk-1

# Bad... these are very similar
plot(factor(shotData$Completed) ~ factor(shotData$Race), xlab = "Race\n (0=white,1=black, 2=hispanic, 3=other)", 
    ylab = "(Completed=1, Not completed=0)")

plot of chunk unnamed-chunk-1

# black appear to have more; white and other are lowest and about the same
plot(factor(shotData$Completed) ~ factor(shotData$MedAssist), xlab = "Medical Assistance\n (0=No, 1=Yes)", 
    ylab = "(Completed=1, Not completed=0)")

plot of chunk unnamed-chunk-1

# having medical assistance is higher
plot(factor(shotData$Completed) ~ factor(shotData$InsuranceType), xlab = "Type of Insurance\n (0=medical assistance,1=private, 2=hospital-based, 3=military)", 
    ylab = "(Completed=1, Not completed=0)")

plot of chunk unnamed-chunk-1

# medical assistance is highest; hospital based is lowest; private and
# military are about the same
plot(factor(shotData$Completed) ~ factor(shotData$Location), xlab = "Location\n (1=Odenton, 2=White Marsh, 3=John Hopkins  4=Bayview)", 
    ylab = "(Completed=1, Not completed=0)")

plot of chunk unnamed-chunk-1

# John Hopkins highest; white marsh lowest
plot(factor(shotData$Completed) ~ factor(shotData$PracticeType), xlab = "Practice Type\n (0=pediatric,1=family practice, 2=OBGYN)", 
    ylab = "(Completed=1, Not completed=0)")

plot of chunk unnamed-chunk-1

# All about the same

# Chi-Squared Get frequency of categories
# as.data.frame(table(shotData$Completed)) #Doesn't tell frequency for
# Chi-squared matrix

# Explore the data by computing odds ratios
grp_race <- glm(Completed ~ relevel(factor(Race), "0"), binomial, data = shotData)
grp_race <- glm(Completed ~ relevel(factor(Race), "1"), binomial, data = shotData)
grp_race <- glm(Completed ~ relevel(factor(Race), "2"), binomial, data = shotData)
grp_race <- glm(Completed ~ relevel(factor(Race), "3"), binomial, data = shotData)
summary(grp_race)

## 
## Call:
## glm(formula = Completed ~ relevel(factor(Race), "3"), family = binomial, 
##     data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.982  -0.982  -0.736   1.386   1.697  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -0.5744     0.1527   -3.76  0.00017 ***
## relevel(factor(Race), "3")0   0.0955     0.1706    0.56  0.57552    
## relevel(factor(Race), "3")1  -0.5947     0.1892   -3.14  0.00168 ** 
## relevel(factor(Race), "3")2  -0.1477     0.3328   -0.44  0.65712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796  on 1412  degrees of freedom
## Residual deviance: 1768  on 1409  degrees of freedom
## AIC: 1776
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1776
exp(cbind(OR = coef(grp_race), confint(grp_race)))

## Waiting for profiling to be done...

##                                 OR  2.5 % 97.5 %
## (Intercept)                 0.5630 0.4154 0.7568
## relevel(factor(Race), "3")0 1.1003 0.7897 1.5428
## relevel(factor(Race), "3")1 0.5518 0.3811 0.8009
## relevel(factor(Race), "3")2 0.8627 0.4414 1.6375

# COMPARED TO WHITE OR 2.5 % 97.5 % (Intercept) 0.6194690 0.5331816
# 0.7184667 relevel(factor(Race), '0')1 0.5014793 0.3837039 0.6519972
# relevel(factor(Race), '0')2 0.7840816 0.4215722 1.4058708
# relevel(factor(Race), '0')3 0.9088836 0.6481777 1.2663418

# COMPARED TO BLACK (Intercept) 0.3106509 0.2484495 0.3851648
# relevel(factor(Race), '1')0 1.9941003 1.5337489 2.6061764
# relevel(factor(Race), '1')2 1.5635374 0.8246541 2.8673785
# relevel(factor(Race), '1')3 1.8124050 1.2486287 2.6240660

# COMPARED TO HISPANIC (Intercept) 0.4857143 0.2657182 0.8538221
# relevel(factor(Race), '2')0 1.2753774 0.7113029 2.3720730
# relevel(factor(Race), '2')1 0.6395754 0.3487506 1.2126297
# relevel(factor(Race), '2')3 1.1591696 0.6106937 2.2656163

# COMPARED TO OTHER (Intercept) 0.5630252 0.4153814 0.7567774
# relevel(factor(Race), '3')0 1.1002510 0.7896762 1.5427868
# relevel(factor(Race), '3')1 0.5517531 0.3810880 0.8008786
# relevel(factor(Race), '3')2 0.8626866 0.4413810 1.6374822 Based on these
# results, we can order the likelyhood of a given race to complete the
# 3-shots White is most likely, then other, hispanic, black We can
# conclude that white is a potentially good predictor of having completed
# the shots

grp_ageGrp <- glm(Completed ~ factor(AgeGroup), binomial, data = shotData)
summary(grp_ageGrp)

## 
## Call:
## glm(formula = Completed ~ factor(AgeGroup), family = binomial, 
##     data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.932  -0.932  -0.865   1.444   1.527  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.6087     0.0791   -7.70  1.4e-14 ***
## factor(AgeGroup)1  -0.1830     0.1131   -1.62     0.11    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1793.4  on 1411  degrees of freedom
## AIC: 1797
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1797.4
exp(cbind(OR = coef(grp_ageGrp), confint(grp_ageGrp)))

## Waiting for profiling to be done...

##                       OR  2.5 % 97.5 %
## (Intercept)       0.5441 0.4654 0.6345
## factor(AgeGroup)1 0.8328 0.6669 1.0393

# Compared to age11-17 OR 2.5 % 97.5 % (Intercept) 0.5440529 0.4653631
# 0.6345436 factor(AgeGroup)1 0.8327522 0.6669498 1.0392945 The age group
# of 11-17 yr olds is more likely to complete the 3 shots

grp_InsType <- glm(Completed ~ relevel(factor(InsuranceType), "0"), binomial, 
    data = shotData)
summary(grp_InsType)

## 
## Call:
## glm(formula = Completed ~ relevel(factor(InsuranceType), "0"), 
##     family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.117  -0.928  -0.928   1.413   1.794  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -1.386      0.151   -9.20  < 2e-16
## relevel(factor(InsuranceType), "0")1    0.767      0.170    4.52  6.2e-06
## relevel(factor(InsuranceType), "0")2    1.243      0.266    4.68  2.9e-06
## relevel(factor(InsuranceType), "0")3    0.848      0.189    4.49  7.2e-06
##                                         
## (Intercept)                          ***
## relevel(factor(InsuranceType), "0")1 ***
## relevel(factor(InsuranceType), "0")2 ***
## relevel(factor(InsuranceType), "0")3 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1763.1  on 1409  degrees of freedom
## AIC: 1771
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1771.1
exp(cbind(OR = coef(grp_InsType), confint(grp_InsType)))

## Waiting for profiling to be done...

##                                         OR  2.5 % 97.5 %
## (Intercept)                          0.250 0.1843 0.3332
## relevel(factor(InsuranceType), "0")1 2.153 1.5534 3.0247
## relevel(factor(InsuranceType), "0")2 3.467 2.0598 5.8496
## relevel(factor(InsuranceType), "0")3 2.335 1.6191 3.3996

# COMPARED TO MEDAssist OR 2.5 % 97.5 % (Intercept) 0.250000 0.1843109
# 0.3332127 relevel(factor(InsuranceType), '0')1 2.153191 1.5533828
# 3.0247079 relevel(factor(InsuranceType), '0')2 3.466667 2.0598141
# 5.8495818 relevel(factor(InsuranceType), '0')3 2.334928 1.6191330
# 3.3995684 Insurance type order: private payer, military, hospital,
# medical assistance A private providor of insurance is most likely likely
# to have a completed regiment

grp_MedAssist <- glm(Completed ~ MedAssist, binomial, data = shotData)
summary(grp_MedAssist)

## 
## Call:
## glm(formula = Completed ~ MedAssist, family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.951  -0.951  -0.951   1.422   1.794  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.5589     0.0616   -9.07  < 2e-16 ***
## MedAssist    -0.8274     0.1629   -5.08  3.8e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1767.3  on 1411  degrees of freedom
## AIC: 1771
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1771.3
exp(cbind(OR = coef(grp_MedAssist), confint(grp_MedAssist)))

## Waiting for profiling to be done...

##                 OR  2.5 % 97.5 %
## (Intercept) 0.5718 0.5064 0.6448
## MedAssist   0.4372 0.3152 0.5975

# COMPARED TO NO MED ASSIST OR 2.5 % 97.5 % (Intercept) 0.5718232
# 0.5064174 0.6448202 MedAssist 0.4371981 0.3152027 0.5974797 not having
# medical assistance is a better odds ratio of completion

grp_Loc <- glm(Completed ~ relevel(factor(Location), "3"), binomial, data = shotData)
summary(grp_Loc)

## 
## Call:
## glm(formula = Completed ~ relevel(factor(Location), "3"), family = binomial, 
##     data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.152  -0.919  -0.781   1.460   1.757  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -1.304      0.259   -5.04  4.6e-07 ***
## relevel(factor(Location), "3")1    0.661      0.269    2.46    0.014 *  
## relevel(factor(Location), "3")2    1.243      0.302    4.12  3.8e-05 ***
## relevel(factor(Location), "3")4    0.274      0.285    0.96    0.336    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1764.9  on 1409  degrees of freedom
## AIC: 1773
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1772.9
exp(cbind(OR = coef(grp_Loc), confint(grp_Loc)))

## Waiting for profiling to be done...

##                                     OR  2.5 % 97.5 %
## (Intercept)                     0.2714 0.1590  0.441
## relevel(factor(Location), "3")1 1.9372 1.1660  3.369
## relevel(factor(Location), "3")2 3.4675 1.9489  6.397
## relevel(factor(Location), "3")4 1.3158 0.7656  2.352

# COMPARED TO ODENTON OR 2.5 % 97.5 % (Intercept) 0.5258126 0.4538552
# 0.6078322 relevel(factor(Location), '1')2 1.7899465 1.2752501 2.5114681
# relevel(factor(Location), '1')3 0.5162078 0.2968655 0.8576605
# relevel(factor(Location), '1')4 0.6792208 0.5138153 0.8928173 (best)
# white marsh, Odenton, Bayview, John Hopkins (worst)

grp_LocType <- glm(Completed ~ LocationType, binomial, data = shotData)
summary(grp_LocType)

## 
## Call:
## glm(formula = Completed ~ LocationType, family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.959  -0.959  -0.764   1.413   1.657  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.5381     0.0668   -8.06  7.9e-16 ***
## LocationType  -0.5429     0.1273   -4.26  2.0e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1777.1  on 1411  degrees of freedom
## AIC: 1781
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1781.1
exp(cbind(OR = coef(grp_LocType), confint(grp_LocType)))

## Waiting for profiling to be done...

##                  OR  2.5 % 97.5 %
## (Intercept)  0.5839 0.5118 0.6651
## LocationType 0.5811 0.4516 0.7441

# COMPARED TO SUBURBAN OR 2.5 % 97.5 % (Intercept) 0.5838816 0.5118243
# 0.6650824 LocationType 0.5810865 0.4515551 0.7440576 Suburban locations
# have higher odds of completion than rural.

grp_Practice <- glm(Completed ~ relevel(factor(PracticeType), "1"), binomial, 
    data = shotData)
summary(grp_Practice)

## 
## Call:
## glm(formula = Completed ~ relevel(factor(PracticeType), "1"), 
##     family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.973  -0.869  -0.828   1.397   1.573  
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -0.893      0.115   -7.75  9.3e-15
## relevel(factor(PracticeType), "1")0    0.115      0.149    0.77   0.4432
## relevel(factor(PracticeType), "1")2    0.392      0.146    2.68   0.0073
##                                        
## (Intercept)                         ***
## relevel(factor(PracticeType), "1")0    
## relevel(factor(PracticeType), "1")2 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1787.6  on 1410  degrees of freedom
## AIC: 1794
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1793.6
exp(cbind(OR = coef(grp_Practice), confint(grp_Practice)))

## Waiting for profiling to be done...

##                                         OR  2.5 % 97.5 %
## (Intercept)                         0.4093 0.3252 0.5113
## relevel(factor(PracticeType), "1")0 1.1213 0.8377 1.5048
## relevel(factor(PracticeType), "1")2 1.4793 1.1132 1.9728

# COMPARED TO PEDIATRIC OR 2.5 % 97.5 % (Intercept) 0.4589235 0.3801637
# 0.5516303 relevel(factor(PracticeType), '0')1 0.8917966 0.6645241
# 1.1937630 relevel(factor(PracticeType), '0')2 1.3192213 1.0222206
# 1.7043677 OBGYN, Pediatric, Family Practice

# The variables with the lowest AIC's include: Insurance Type: 1771.1
# Medical Assistance: 1771.3 Location: 1772.9 Race' 1776.0 Location Type:
# 1781.1 Practice Type: 1793.6 Age Group: 1797.4

# The categories with the higest odds include: Age Group: 11 - 17 (id 0)
# Race: white (id 0) InsuranceType: hospital based (id 2) MedAssist: none
# (id 0) Location: White Marsh (id 2) LocationType: Suburban (id 0)
# PracticeType: OBGYN (id2)




################################################################################




mle_fullVars_model <- glm(Completed ~ relevel(factor(InsuranceType), "0") + 
    relevel(factor(Location), "1"), binomial, data = shotData)
summary(mle_fullVars_model)

## 
## Call:
## glm(formula = Completed ~ relevel(factor(InsuranceType), "0") + 
##     relevel(factor(Location), "1"), family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.339  -0.901  -0.734   1.415   1.957  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -1.2719     0.2107   -6.04  1.6e-09
## relevel(factor(InsuranceType), "0")1   0.5806     0.2005    2.90  0.00378
## relevel(factor(InsuranceType), "0")2   1.0705     0.2760    3.88  0.00011
## relevel(factor(InsuranceType), "0")3   0.7295     0.2376    3.07  0.00214
## relevel(factor(Location), "1")2        0.5740     0.1852    3.10  0.00194
## relevel(factor(Location), "1")3       -0.4837     0.2877   -1.68  0.09274
## relevel(factor(Location), "1")4       -0.0956     0.1801   -0.53  0.59556
##                                         
## (Intercept)                          ***
## relevel(factor(InsuranceType), "0")1 ** 
## relevel(factor(InsuranceType), "0")2 ***
## relevel(factor(InsuranceType), "0")3 ** 
## relevel(factor(Location), "1")2      ** 
## relevel(factor(Location), "1")3      .  
## relevel(factor(Location), "1")4         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1747.1  on 1406  degrees of freedom
## AIC: 1761
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1761.1 Likelihood Ratio Test to see if anything should be
# dropped...
anova(mle_fullVars_model, test = "LRT")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Completed
## 
## Terms added sequentially (first to last)
## 
## 
##                                     Df Deviance Resid. Df Resid. Dev
## NULL                                                 1412       1796
## relevel(factor(InsuranceType), "0")  3     32.9      1409       1763
## relevel(factor(Location), "1")       3     16.0      1406       1747
##                                     Pr(>Chi)    
## NULL                                            
## relevel(factor(InsuranceType), "0")  3.4e-07 ***
## relevel(factor(Location), "1")        0.0011 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Both terms significant at at least the 0.001 confidence interval.

# Examine the model by only using the variables that have the highest odds
# of completion
mle_fullCategories_model <- glm(Completed ~ young + white + hospitalBased + 
    noMedAsst + whiteMarsh + suburban + OBGYN, binomial, data = shotData)
summary(mle_fullCategories_model)

## 
## Call:
## glm(formula = Completed ~ young + white + hospitalBased + noMedAsst + 
##     whiteMarsh + suburban + OBGYN, family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.568  -0.947  -0.732   1.290   2.111  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.114      0.208  -10.16  < 2e-16 ***
## young            0.610      0.139    4.38  1.2e-05 ***
## white            0.309      0.121    2.56   0.0103 *  
## hospitalBased    0.471      0.246    1.92   0.0554 .  
## noMedAsst        0.586      0.204    2.87   0.0041 ** 
## whiteMarsh       0.307      0.206    1.49   0.1362    
## suburban         0.349      0.166    2.10   0.0355 *  
## OBGYN            0.365      0.157    2.32   0.0202 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1722.4  on 1405  degrees of freedom
## AIC: 1738
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1738.4 Wald's Test suggests that privatePayer, whiteMarsh, and
# suburban have slopes that could be 0.  Let's drop these variables...

# Compare Saturated Models...
anova(mle_fullVars_model, mle_fullCategories_model, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: Completed ~ relevel(factor(InsuranceType), "0") + relevel(factor(Location), 
##     "1")
## Model 2: Completed ~ young + white + hospitalBased + noMedAsst + whiteMarsh + 
##     suburban + OBGYN
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1      1406       1747                         
## 2      1405       1722  1     24.7  6.8e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Using Pearson's Chi-Squared Test, (Ho = there is no difference between
# the models), we can reject the null hypothesis pr = 24.669; p-value =
# 1.694e-06 and declare that there is a significant difference between the
# models.  Because the AIC is lower in the categories model, we can
# identify that it is better.


mle_model1 <- glm(Completed ~ young + white + noMedAsst + suburban + OBGYN, 
    binomial, data = shotData)
summary(mle_model1)

## 
## Call:
## glm(formula = Completed ~ young + white + noMedAsst + suburban + 
##     OBGYN, family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.300  -0.956  -0.732   1.331   2.131  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.162      0.205  -10.54  < 2e-16 ***
## young          0.637      0.138    4.61  4.1e-06 ***
## white          0.330      0.119    2.77  0.00565 ** 
## noMedAsst      0.648      0.198    3.28  0.00104 ** 
## suburban       0.333      0.153    2.17  0.02987 *  
## OBGYN          0.498      0.139    3.58  0.00035 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796.0  on 1412  degrees of freedom
## Residual deviance: 1729.3  on 1407  degrees of freedom
## AIC: 1741
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1744.1; all variables have coeffiecients that are not 0 with at
# least a 0.001 significance level Likelihood Ratio Test to see if
# anything should be dropped...
anova(mle_model1, test = "LRT")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Completed
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       1412       1796             
## young      1     2.62      1411       1793  0.10547    
## white      1    18.51      1410       1775  1.7e-05 ***
## noMedAsst  1    28.99      1409       1746  7.3e-08 ***
## suburban   1     3.61      1408       1742  0.05730 .  
## OBGYN      1    12.95      1407       1729  0.00032 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# All variables significantly add information to the model at 0.05

mle_model2 <- glm(Completed ~ hospitalBased + noMedAsst + whiteMarsh, binomial, 
    data = shotData)
summary(mle_model2)

## 
## Call:
## glm(formula = Completed ~ hospitalBased + noMedAsst + whiteMarsh, 
##     family = binomial, data = shotData)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.294  -0.908  -0.908   1.473   1.800  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.399      0.151   -9.27  < 2e-16 ***
## hospitalBased    0.363      0.231    1.57  0.11657    
## noMedAsst        0.726      0.165    4.39  1.1e-05 ***
## whiteMarsh       0.580      0.171    3.40  0.00067 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1796  on 1412  degrees of freedom
## Residual deviance: 1752  on 1409  degrees of freedom
## AIC: 1760
## 
## Number of Fisher Scoring iterations: 4

# AIC = 1760
anova(mle_model2, test = "LRT")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Completed
## 
## Terms added sequentially (first to last)
## 
## 
##               Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                           1412       1796             
## hospitalBased  1     6.73      1411       1789  0.00950 ** 
## noMedAsst      1    25.82      1410       1763  3.7e-07 ***
## whiteMarsh     1    11.42      1409       1752  0.00073 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# All parameters are significant at 0.001
anova(mle_model1, mle_model2, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: Completed ~ young + white + noMedAsst + suburban + OBGYN
## Model 2: Completed ~ hospitalBased + noMedAsst + whiteMarsh
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1      1407       1729                         
## 2      1409       1752 -2    -22.7  1.2e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Using Pearson's Chi-Squared Test, (Ho = there is no difference between
# the models), we can reject the null hypothesis pr = 22.713; p-value =
# 1.694e-06 and declare that there is a significant difference between the
# models.  Because the AIC is much lower in the reduced model1, we can
# identify that it is better.


anova(mle_fullCategories_model, mle_model1, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: Completed ~ young + white + hospitalBased + noMedAsst + whiteMarsh + 
##     suburban + OBGYN
## Model 2: Completed ~ young + white + noMedAsst + suburban + OBGYN
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1      1405       1722                       
## 2      1407       1729 -2     -6.9    0.032 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Using the Likelihood Ratio Test, (Ho = there is no difference between
# the models), we can not reject the null hypothesis pr = 6.899; p-value =
# 0.032 and declare that there is not a significant difference between the
# models.  Because the reduced model is more generalized, we will use it
# instead.


# The GLM Model:
exp(cbind(OR = coef(mle_model1), confint(mle_model1)))

## Waiting for profiling to be done...

##                 OR   2.5 % 97.5 %
## (Intercept) 0.1151 0.07638 0.1708
## young       1.8905 1.44433 2.4838
## white       1.3908 1.10140 1.7577
## noMedAsst   1.9122 1.30325 2.8311
## suburban    1.3952 1.03516 1.8892
## OBGYN       1.6453 1.25373 2.1646

# y =
# (0.12+1.83(young)+1.40(white)+2.37(noMedAsst)+1.61(OBGYN))/(1+0.12+1.83(young)+1.40(white)+2.37(noMedAsst)+1.61(OBGYN))
# young = 0 white = 0 noMedAsst =0 OBGYN = 0
# (0.12+1.83*(young)+1.40*(white)+2.37*(noMedAsst)+1.61*(OBGYN))/(1+1.83*(young)+1.40*(white)+2.37*(noMedAsst)+1.61*(OBGYN))
# Ranges from .12 (all values 0) to .89 (all values 1)

# Test for multicolinearity
mean(vif(mle_fullVars_model))  #I don't understand this result.... mean = 1.97

## [1] 1.973

mean(vif(mle_fullCategories_model))  # 1.435

## [1] 1.435

mean(vif(mle_model1))  #1.344

## [1] 1.344

mean(vif(mle_model2))  #1.027

## [1] 1.027