library(faraway)
## Warning: package 'faraway' was built under R version 4.0.3
data(esdcomp)
attach(esdcomp)
head(esdcomp)
##   visits complaints residency gender revenue   hours
## 1   2014          2         Y      F  263.03 1287.25
## 2   3091          3         N      M  334.94 1588.00
## 3    879          1         Y      M  206.42  705.25
## 4   1780          1         N      M  226.32 1005.50
## 5   3646         11         N      M  288.91 1667.25
## 6   2690          1         N      M  275.94 1517.75
tail(esdcomp)
##    visits complaints residency gender revenue   hours
## 39   2701          8         N      M  275.40 1652.75
## 40   2046          1         Y      M  289.56 1029.75
## 41   2548          2         Y      M  305.67 1127.00
## 42   2592          1         N      M  252.35 1547.25
## 43   2741          1         Y      F  276.86 1499.25
## 44   3763         10         Y      M  308.84 1747.50
summary(esdcomp)
##      visits       complaints     residency gender    revenue     
##  Min.   : 879   Min.   : 0.000   N:24      F:12   Min.   :206.4  
##  1st Qu.:2036   1st Qu.: 1.000   Y:20      M:32   1st Qu.:235.9  
##  Median :2384   Median : 2.000                    Median :258.5  
##  Mean   :2386   Mean   : 3.341                    Mean   :260.1  
##  3rd Qu.:2813   3rd Qu.: 5.000                    3rd Qu.:282.3  
##  Max.   :3763   Max.   :11.000                    Max.   :334.9  
##      hours     
##  Min.   : 589  
##  1st Qu.:1207  
##  Median :1512  
##  Mean   :1417  
##  3rd Qu.:1642  
##  Max.   :1917

A

This is a histogram of the number of complaints

 hist(esdcomp$complaints,xlab="number of complaints", main="")

From the histogram, it looks similar to the approximate of a Poisson distributed I’m decied to use Poisson regression rather than Linear Regression because the value associated with a distribution that is noticeably skewed with lots of small values and only few larger ones. As lambda increases the distribution of the responses begins to look more and more like normal distribution.

B

This is a boxplot of the response by gender.

boxplot(esdcomp$complaints~esdcomp$gender, xlab="gender", ylab="number of complaints")

The median for the number of complaints looks the same for 2 different gender. Male doctor tent to have more complaints than female doctor. There is an outlier for number of complaints of female at 10

C

This is a boxplot of the reponse by residency

boxplot(esdcomp$complaints~esdcomp$residency, xlab="residency", ylab="number of complaints")

Non residency doctor tent to have more complaints than those who are in the residency

D

plot(esdcomp$complaints~esdcomp$revenue)

plot(log(esdcomp$complaints)~esdcomp$revenue)

The plot appears that there are no relationship between the log of complaints vs. revenue

E

The complaints histogram looks very poisson which is a right skewed distribution and the response is greater or equal to zero

F

resmod <- glm(complaints ~ residency , family = poisson, data = esdcomp)
coef(resmod)
## (Intercept)  residencyY 
##   1.3758231  -0.4203116
(res.means <- round(exp(coef(resmod)), digits = 4))
## (Intercept)  residencyY 
##      3.9583      0.6568
summary(resmod)
## 
## Call:
## glm(formula = complaints ~ residency, family = poisson, data = esdcomp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8137  -1.1353  -0.3880   0.5028   3.4845  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.3758     0.1026  13.410   <2e-16 ***
## residencyY   -0.4203     0.1725  -2.437   0.0148 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 89.447  on 43  degrees of freedom
## Residual deviance: 83.303  on 42  degrees of freedom
## AIC: 210.08
## 
## Number of Fisher Scoring iterations: 5

Every year that a doctor with residency is associated with a multiplicative change of .6568 in their mean number of complaints.

genmod <- glm(complaints ~ gender , family = poisson, data = esdcomp)
coef(genmod)
## (Intercept)     genderM 
##   0.9490806   0.3387737
(gen.means <- round(exp(coef(genmod)), digits = 4))
## (Intercept)     genderM 
##      2.5833      1.4032

Every year that a male doctor is associated with a multiplicative change of 1.4032 in their mean number of complaints.

summary(genmod)
## 
## Call:
## glm(formula = complaints ~ gender, family = poisson, data = esdcomp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6926  -1.1263  -0.6557   0.6825   3.1098  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9491     0.1796   5.284 1.26e-07 ***
## genderM       0.3388     0.2022   1.676   0.0938 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 89.447  on 43  degrees of freedom
## Residual deviance: 86.457  on 42  degrees of freedom
## AIC: 213.24
## 
## Number of Fisher Scoring iterations: 5
revmod <- glm(complaints ~ revenue , family = poisson, data = esdcomp)
coef(revmod)
## (Intercept)     revenue 
## 0.154157869 0.004011676
(rev.means <- round(exp(coef(revmod)), digits = 4))
## (Intercept)     revenue 
##      1.1667      1.0040
summary(revmod)
## 
## Call:
## glm(formula = complaints ~ revenue, family = poisson, data = esdcomp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5780  -1.2597  -0.5812   0.9607   3.0496  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.154158   0.670437   0.230    0.818
## revenue     0.004012   0.002517   1.594    0.111
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 89.447  on 43  degrees of freedom
## Residual deviance: 86.929  on 42  degrees of freedom
## AIC: 213.71
## 
## Number of Fisher Scoring iterations: 5

resmod has the lowest residual deviande of the three models.

G

AIC(resmod)
## [1] 210.0828
AIC(genmod)
## [1] 213.2371
AIC(revmod)
## [1] 213.7092

The lower AIC , the better model is. Thus resmod have the lowest AIC, residency model is best according to AIC

H

Test for lack-of-fit for the model that you chose using AIC

(resdf <- nrow(esdcomp) - length(coef(resmod)))
## [1] 42
(teststat <- summary(resmod)$deviance)
## [1] 83.30301
pchisq(teststat, df= resdf, lower.tail=FALSE)
## [1] 0.0001534183

It looks like we have lack of fit

I

Construct a 95% confidence interval for the slope and interpret in context

(pci <- exp(confint(resmod, parm=2, level = .95)))
## Waiting for profiling to be done...
##     2.5 %    97.5 % 
## 0.4654089 0.9166971

Every year that a resident doctor is associated with a multiplicative change in their mean number of complaints; we’re 95% confident that this multiplicative change is between .4654 and .9167.

K

Drop in deviance test. We going to use forward-selection and the drop in deviance test to add more predictor to my model. Both of my model will have residency, but one has gender and the other has revenue

resmodA<-glm(complaints ~ residency+gender, data=esdcomp, family=poisson)
resmodB<-glm(complaints ~ residency+revenue, data=esdcomp, family=poisson)
anova(resmod, resmodA, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: complaints ~ residency
## Model 2: complaints ~ residency + gender
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        42     83.303                     
## 2        41     81.447  1   1.8557   0.1731
anova(resmod, resmodB, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: complaints ~ residency
## Model 2: complaints ~ residency + revenue
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1        42     83.303                        
## 2        41     76.427  1   6.8762 0.008735 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

When we adding more information on the residency gender does not have significantly improve the model (Pvalue is .1731) but adding revenue as a predictor does improve the model significant at (.01) , so the best model thus far is the resmodB(residency and revenue)

res3mod<-glm(complaints ~ residency+gender+revenue, data=esdcomp, family=poisson)
anova(resmodB, res3mod, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: complaints ~ residency + revenue
## Model 2: complaints ~ residency + gender + revenue
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        41     76.427                     
## 2        40     75.947  1  0.47986   0.4885

Adding gender into the model does not significantly improve the model . We should not add gender Add a quadratic term in REVENUE to determine whether there is a maximum complaints for the number of residency for doctor

resmodC<-glm(complaints ~ residency * revenue, data=esdcomp, family=poisson)
anova(resmodB, resmodC, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: complaints ~ residency + revenue
## Model 2: complaints ~ residency * revenue
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1        41     76.427                       
## 2        40     76.426  1 0.00098171    0.975
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
esdcomp <- transform(esdcomp, revenuesq = revenue^2)
resmodD <- glm(complaints ~ residency +revenuesq +revenue, data=esdcomp, family=poisson)
anova(resmodB, resmodD, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: complaints ~ residency + revenue
## Model 2: complaints ~ residency + revenuesq + revenue
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        41     76.427                     
## 2        40     75.166  1   1.2606   0.2615
(pci <- exp(confint(resmodB, parm=2, level = .95)))
## Waiting for profiling to be done...
##     2.5 %    97.5 % 
## 0.3877621 0.7955934

Every year that a resident doctor with revenue is associated with a multiplicative change in their mean number of complaints; we’re 95% confident that this multiplicative change is between .3877 and .7956. Based on the drop deviance test, we can see that the current model is the best model

dp<-sum(residuals(resmodD,type="pearson")^2/resmodD$df.res)
dp
## [1] 1.895439

It a decent model .Based on our dispersion paramter, this is not greater than 10, or the cause of concern value of 5.

1.8954 is close to 1 meaning there should be no concern of overdisspersion for this produced model.