# 获取数据
data(Affairs, package="AER")
summary(Affairs)
## affairs gender age yearsmarried children
## Min. : 0.000 female:315 Min. :17.50 Min. : 0.125 no :171
## 1st Qu.: 0.000 male :286 1st Qu.:27.00 1st Qu.: 4.000 yes:430
## Median : 0.000 Median :32.00 Median : 7.000
## Mean : 1.456 Mean :32.49 Mean : 8.178
## 3rd Qu.: 0.000 3rd Qu.:37.00 3rd Qu.:15.000
## Max. :12.000 Max. :57.00 Max. :15.000
## religiousness education occupation rating
## Min. :1.000 Min. : 9.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:14.00 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :16.00 Median :5.000 Median :4.000
## Mean :3.116 Mean :16.17 Mean :4.195 Mean :3.932
## 3rd Qu.:4.000 3rd Qu.:18.00 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :5.000 Max. :20.00 Max. :7.000 Max. :5.000
table(Affairs$affairs)
##
## 0 1 2 3 7 12
## 451 34 17 19 42 38
# 将数值数据统计为布尔值(是否出轨)
Affairs$ynaffair[Affairs$affairs > 0] <- 1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
Affairs$ynaffair <- factor(Affairs$ynaffair,
levels=c(0,1),
labels=c("No","Yes"))
table(Affairs$ynaffair)
##
## No Yes
## 451 150
fit.full=glm(ynaffair~gender+age+yearsmarried+children+religiousness+education+occupation+rating,data = Affairs,family = binomial())
#查看统计结果
summary(fit.full)
##
## Call:
## glm(formula = ynaffair ~ gender + age + yearsmarried + children +
## religiousness + education + occupation + rating, family = binomial(),
## data = Affairs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5713 -0.7499 -0.5690 -0.2539 2.5191
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.37726 0.88776 1.551 0.120807
## gendermale 0.28029 0.23909 1.172 0.241083
## age -0.04426 0.01825 -2.425 0.015301 *
## yearsmarried 0.09477 0.03221 2.942 0.003262 **
## childrenyes 0.39767 0.29151 1.364 0.172508
## religiousness -0.32472 0.08975 -3.618 0.000297 ***
## education 0.02105 0.05051 0.417 0.676851
## occupation 0.03092 0.07178 0.431 0.666630
## rating -0.46845 0.09091 -5.153 2.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 609.51 on 592 degrees of freedom
## AIC: 627.51
##
## Number of Fisher Scoring iterations: 4
根据统计结果,可以发现,gender,age,children,edu,occupation对于方程的贡献较小(),因而考虑去除这几个变量再次进行统计。去除变量之后的统计模型如下:
fit.reduce=glm(ynaffair~age+yearsmarried+religiousness+rating,data=Affairs,family = binomial())
#查看新模型结果
summary(fit.reduce)
##
## Call:
## glm(formula = ynaffair ~ age + yearsmarried + religiousness +
## rating, family = binomial(), data = Affairs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6278 -0.7550 -0.5701 -0.2624 2.3998
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.93083 0.61032 3.164 0.001558 **
## age -0.03527 0.01736 -2.032 0.042127 *
## yearsmarried 0.10062 0.02921 3.445 0.000571 ***
## religiousness -0.32902 0.08945 -3.678 0.000235 ***
## rating -0.46136 0.08884 -5.193 2.06e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 615.36 on 596 degrees of freedom
## AIC: 625.36
##
## Number of Fisher Scoring iterations: 4
利用之前学到的anova函数对两个模型的拟合程度进行比较:
anova(fit.reduce,fit.full,test="Chisq")
## Analysis of Deviance Table
##
## Model 1: ynaffair ~ age + yearsmarried + religiousness + rating
## Model 2: ynaffair ~ gender + age + yearsmarried + children + religiousness +
## education + occupation + rating
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 596 615.36
## 2 592 609.51 4 5.8474 0.2108
比较结果中卡方值小于0.21,可以相信两种模型对于结果的解释程度一样好。 最终我们倾向于使用更加简单的模型(fit.reduce)进行解释。
#查看回归系数
coef(fit.reduce)
## (Intercept) age yearsmarried religiousness rating
## 1.93083017 -0.03527112 0.10062274 -0.32902386 -0.46136144
#指数化
exp(coef(fit.reduce))
## (Intercept) age yearsmarried religiousness rating
## 6.8952321 0.9653437 1.1058594 0.7196258 0.6304248
由于模型本身中的对数计算,这里对结果进行指数化。 简而言之,我们现在可以说: 结婚时间每增加1,出轨优势将乘以1.106(1.1058594)
如果解释中“优势”理解不过确切,下面将转化为概率来进行说明:
# 创建测试数据(唯一变量为rating)
testdata=data.frame(rating=c(1,2,3,4,5),age=mean(Affairs$age),yearsmarried=mean(Affairs$yearsmarried),religiousness=mean(Affairs$religiousness))
#显示对于测试数据集应用模型的概率结果
testdata$prob=predict(fit.reduce,newdata=testdata,type="response")
#console中显示数据集及其结果
testdata
## rating age yearsmarried religiousness prob
## 1 1 32.48752 8.177696 3.116473 0.5302296
## 2 2 32.48752 8.177696 3.116473 0.4157377
## 3 3 32.48752 8.177696 3.116473 0.3096712
## 4 4 32.48752 8.177696 3.116473 0.2204547
## 5 5 32.48752 8.177696 3.116473 0.1513079
从结果中可以看出,当婚姻评分(rating)从1到5时,对应概率的变化为0.53到0.15。
#将唯一变量改为age
testdata=data.frame(rating=mean(Affairs$rating),age=seq(17,57,10),yearsmarried=mean(Affairs$yearsmarried),religiousness=mean(Affairs$religiousness))
testdata$prob=predict(fit.reduce,newdata=testdata,type="response")
testdata
## rating age yearsmarried religiousness prob
## 1 3.93178 17 8.177696 3.116473 0.3350834
## 2 3.93178 27 8.177696 3.116473 0.2615373
## 3 3.93178 37 8.177696 3.116473 0.1992953
## 4 3.93178 47 8.177696 3.116473 0.1488796
## 5 3.93178 57 8.177696 3.116473 0.1094738
从结果中看到,年龄(age)从17到57,对应概率变化为0.34到0.11。
对于二项分布,数据的期望方差分布可以表征为:nπ(1-π)。其中n为观测数,π为Y=1的概率。 若观测样本的响应变量方差高于二项分布的期望方差即为过度离势。 通常是由模型选定有误造成的,应该转为使用类二项分布。 过度离势将造成显著性检验和标准检验误差(后续将在此处添加解释) 对过度离势的评估,采用残差偏差与残差自由度的比值来评估。 下面对以上模型进行过度离势评估。
deviance(fit.reduce)/df.residual(fit.reduce)
## [1] 1.03248
由检验结果接近于1判断没有过度离势。 另一种评估方式则为,产生一个类二项分布的拟合模型,然后利用卡方检验评估二者差异。
fit=glm(ynaffair~age+yearsmarried+religiousness+rating,family = binomial(),data = Affairs)
fit.od=glm(ynaffair~age+yearsmarried+religiousness+rating,family = quasibinomial(),data = Affairs)
pchisq(summary(fit.od)$dispersion * fit$df.residual,fit.od$df.residual,lower=F)
## [1] 0.340122
此处p值较小,证明二者差异不显著,因而不存在过度离势。
# 查看数据
data(breslow.dat, package="robust")
names(breslow.dat)
## [1] "ID" "Y1" "Y2" "Y3" "Y4" "Base" "Age" "Trt"
## [9] "Ysum" "sumY" "Age10" "Base4"
summary(breslow.dat[c(6, 7, 8, 10)])
## Base Age Trt sumY
## Min. : 6.00 Min. :18.00 placebo :28 Min. : 0.00
## 1st Qu.: 12.00 1st Qu.:23.00 progabide:31 1st Qu.: 11.50
## Median : 22.00 Median :28.00 Median : 16.00
## Mean : 31.22 Mean :28.34 Mean : 33.05
## 3rd Qu.: 41.00 3rd Qu.:32.00 3rd Qu.: 36.00
## Max. :151.00 Max. :42.00 Max. :302.00
其中,6-8列为预测需要的自变量(患者信息),sumY为实验结果(服用药物后病发次数). 特殊说明,Trt为治疗方式,代表其中部分患者只是服用安慰剂。
# 显示治疗后发病数
par(mfrow=c(1, 2))
attach(breslow.dat)
hist(sumY, breaks=20, xlab="Seizure Count",
main="Distribution of Seizures")
boxplot(sumY ~ Trt, xlab="Treatment", main="Group Comparisons")
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson())
summary(fit)
##
## Call:
## glm(formula = sumY ~ Base + Age + Trt, family = poisson(), data = breslow.dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0569 -2.0433 -0.9397 0.7929 11.0061
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9488259 0.1356191 14.370 < 2e-16 ***
## Base 0.0226517 0.0005093 44.476 < 2e-16 ***
## Age 0.0227401 0.0040240 5.651 1.59e-08 ***
## Trtprogabide -0.1527009 0.0478051 -3.194 0.0014 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2122.73 on 58 degrees of freedom
## Residual deviance: 559.44 on 55 degrees of freedom
## AIC: 850.71
##
## Number of Fisher Scoring iterations: 5
可以看到模型中各个因子在p<0.05的检验中都是显著的。因为无需对因子做出修改。
#查看因子影响程度
coef(fit)
## (Intercept) Base Age Trtprogabide
## 1.94882593 0.02265174 0.02274013 -0.15270095
exp(coef(fit))
## (Intercept) Base Age Trtprogabide
## 7.0204403 1.0229102 1.0230007 0.8583864
由检验结果我们可以得出如下结论
Age:年龄每增加1,癫痫病发次数乘以1.023
Trtprogabide:药物相对于安慰剂的效果使患病次数预期下降到85.8%
#过度离势评估
deviance(fit)/df.residual(fit)
## [1] 10.1717
library(qcc)
qcc.overdispersion.test(breslow.dat$sumY, type="poisson")
##
## Overdispersion test Obs.Var/Theor.Var Statistic p-value
## poisson data 62.87013 3646.468 0
由评估公式值远大于1,同时qcc中检验结果显著性小于0.05可以证确实过度离势。 ////需要解释gcc包的计算方案 特殊说明泊松拟合中过度离势产生的几个原因: - 遗漏了预测变量 - 事件相关,例如发生39次癫痫的概率和发生第40次癫痫的概率是相关的 - 重复测量数据的内在群聚特性////需要解释
#模型修改
fit.od <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
family=quasipoisson())
summary(fit.od)
##
## Call:
## glm(formula = sumY ~ Base + Age + Trt, family = quasipoisson(),
## data = breslow.dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0569 -2.0433 -0.9397 0.7929 11.0061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.948826 0.465091 4.190 0.000102 ***
## Base 0.022652 0.001747 12.969 < 2e-16 ***
## Age 0.022740 0.013800 1.648 0.105085
## Trtprogabide -0.152701 0.163943 -0.931 0.355702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 11.76075)
##
## Null deviance: 2122.73 on 58 degrees of freedom
## Residual deviance: 559.44 on 55 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
同逻辑回归处理方法,使用quasipoisson进行拟合,并得出结果。