df63 = data.frame(X = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7,
7, 7, 8, 8, 8, 9, 11, 12, 12, 12), Y = c(0.6, 1.6, 0.5, 1.2, 2, 1.3, 2.5,
2.2, 2.4, 1.2, 3.5, 4.1, 5.1, 5.7, 3.4, 9.7, 8.6, 4, 5.5, 10.5, 17.5, 13.4,
4.5, 30.4, 12.4, 13.4, 26.2, 7.4))
df63
## X Y
## 1 1 0.6
## 2 1 1.6
## 3 1 0.5
## 4 1 1.2
## 5 2 2.0
## 6 2 1.3
## 7 2 2.5
## 8 3 2.2
## 9 3 2.4
## 10 3 1.2
## 11 4 3.5
## 12 4 4.1
## 13 4 5.1
## 14 5 5.7
## 15 6 3.4
## 16 6 9.7
## 17 6 8.6
## 18 7 4.0
## 19 7 5.5
## 20 7 10.5
## 21 8 17.5
## 22 8 13.4
## 23 8 4.5
## 24 9 30.4
## 25 11 12.4
## 26 12 13.4
## 27 12 26.2
## 28 12 7.4
attach(df63)
plot(Y ~ X)
lm63.sol <- lm(Y ~ 1 + X, data = df63)
print(lm63.sol)
##
## Call:
## lm(formula = Y ~ 1 + X, data = df63)
##
## Coefficients:
## (Intercept) X
## -1.45 1.56
lines(df63$X, fitted(lm63.sol))
summary(lm63.sol)
##
## Call:
## lm(formula = Y ~ 1 + X, data = df63)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.841 -2.337 -0.021 1.059 17.832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.452 1.835 -0.79 0.44
## X 1.558 0.281 5.55 7.9e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.17 on 26 degrees of freedom
## Multiple R-squared: 0.542, Adjusted R-squared: 0.525
## F-statistic: 30.8 on 1 and 26 DF, p-value: 7.93e-06
回归常数的p值为0.44且不显著,因此认为不通过T检验
F统计量为30.8,p值为7.93e-6,因此认为通过F检验
yn.res = resid(lm63.sol)
yn.rst = rstandard(lm63.sol)
yn.fit = predict(lm63.sol)
plot(yn.res ~ yn.fit)
plot(yn.rst ~ yn.fit)
shapiro.test(yn.res)
##
## Shapiro-Wilk normality test
##
## data: yn.res
## W = 0.8572, p-value = 0.00131
正态检验结果p值0.00131,因此残差不满足正态性假设
lm63.new = update(lm63.sol, sqrt(.) ~ .)
plot(sqrt(Y) ~ X)
print(lm63.new)
##
## Call:
## lm(formula = sqrt(Y) ~ X, data = df63)
##
## Coefficients:
## (Intercept) X
## 0.767 0.291
lines(df63$X, fitted(lm63.new), col = "red")
summary(lm63.new)
##
## Call:
## lm(formula = sqrt(Y) ~ X, data = df63)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5426 -0.4528 -0.0118 0.3493 2.1249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7665 0.2559 3.00 0.006 **
## X 0.2914 0.0391 7.44 6.6e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.721 on 26 degrees of freedom
## Multiple R-squared: 0.681, Adjusted R-squared: 0.668
## F-statistic: 55.4 on 1 and 26 DF, p-value: 6.64e-08
回归常数和X系数的p值均显著,通过T检验。
F统计量为55.4,p值为6.64e-8,通过F检验。
yn.res = resid(lm63.new)
yn.rst = rstandard(lm63.new)
yn.fit = predict(lm63.new)
plot(yn.res ~ yn.fit)
plot(yn.rst ~ yn.fit)
shapiro.test(yn.res)
##
## Shapiro-Wilk normality test
##
## data: yn.res
## W = 0.9626, p-value = 0.4012
正态检验结果p值0.4012,能通过正态性检验,因此模型修正是合理的。
toothpaste <- data.frame(X1 = c(-0.05, 0.25, 0.6, 0, 0.25, 0.2, 0.15, 0.05,
-0.15, 0.15, 0.2, 0.1, 0.4, 0.45, 0.35, 0.3, 0.5, 0.5, 0.4, -0.05, -0.05,
-0.1, 0.2, 0.1, 0.5, 0.6, -0.05, 0, 0.05, 0.55), X2 = c(5.5, 6.75, 7.25,
5.5, 7, 6.5, 6.75, 5.25, 5.25, 6, 6.5, 6.25, 7, 6.9, 6.8, 6.8, 7.1, 7, 6.8,
6.5, 6.25, 6, 6.5, 7, 6.8, 6.8, 6.5, 5.75, 5.8, 6.8), Y = c(7.38, 8.51,
9.52, 7.5, 9.33, 8.28, 8.75, 7.87, 7.1, 8, 7.89, 8.15, 9.1, 8.86, 8.9, 8.87,
9.26, 9, 8.75, 7.95, 7.65, 7.27, 8, 8.5, 8.75, 9.21, 8.27, 7.67, 7.93, 9.26))
source("../R-Book-Demo/ch6/Reg_Diag.R", echo = TRUE, max.deparse.length = 3000)
##
## > Reg_Diag <- function(fm) {
## + n <- nrow(fm$model)
## + df <- fm$df.residual
## + p <- n - df - 1
## + s <- rep(" ", n)
## + res <- residuals(fm)
## + s1 <- s
## + s1[abs(res) == max(abs(res))] <- "*"
## + sta <- rstandard(fm)
## + s2 <- s
## + s2[abs(sta) > 2] <- "*"
## + stu <- rstudent(fm)
## + s3 <- s
## + s3[abs(sta) > 2] <- "*"
## + h <- hatvalues(fm)
## + s4 <- s
## + s4[h > 2 * (p + 1)/n] <- "*"
## + d <- dffits(fm)
## + s5 <- s
## + s5[abs(d) > 2 * sqrt((p + 1)/n)] <- "*"
## + c <- cooks.distance(fm)
## + s6 <- s
## + s6[c == max(c)] <- "*"
## + co <- covratio(fm)
## + abs_co <- abs(co - 1)
## + s7 <- s
## + s7[abs_co == max(abs_co)] <- "*"
## + data.frame(residual = res, s1, standard = sta, s2, student = stu,
## + s3, hat_matrix = h, s4, DFFITS = d, s5, cooks_distance = c,
## + s6, COVRATIO = co, s7)
## + }
toothpaste.lm1.sol = lm(Y ~ X1 + X2, data = toothpaste)
toothpaste.diag = Reg_Diag(toothpaste.lm1.sol)
print(toothpaste.diag)
## residual s1 standard s2 student s3 hat_matrix s4 DFFITS s5
## 1 -0.047232 -0.21248 -0.20868 0.13012 -0.08071
## 2 -0.098070 -0.42152 -0.41501 0.04704 -0.09221
## 3 0.074289 0.33492 0.32935 0.13386 0.12947
## 4 -0.006646 -0.03003 -0.02947 0.13798 -0.01179
## 5 0.581059 * 2.55701 * 2.88237 * 0.09092 0.91153 *
## 6 -0.107785 -0.46031 -0.45349 0.03475 -0.08605
## 7 0.300758 1.31532 1.33419 0.07955 0.39224
## 8 0.424810 2.05724 * 2.19842 * 0.24933 * 1.26699 *
## 9 -0.027532 -0.12804 -0.12569 0.18600 -0.06008
## 10 -0.026630 -0.11546 -0.11333 0.06357 -0.02953
## 11 -0.497785 -2.12587 * -2.28622 * 0.03475 -0.43379
## 12 0.061914 0.26540 0.26078 0.04194 0.05456
## 13 0.112816 0.48968 0.48267 0.06557 0.12786
## 14 -0.150250 -0.65395 -0.64687 0.07069 -0.17841
## 15 0.104927 0.45112 0.44437 0.04761 0.09935
## 16 0.154341 0.66319 0.65616 0.04652 0.14494
## 17 0.057640 0.25360 0.24916 0.09057 0.07863
## 18 -0.146012 -0.64156 -0.63442 0.08813 -0.19723
## 19 -0.124487 -0.53776 -0.53056 0.05659 -0.12995
## 20 -0.040714 -0.18584 -0.18248 0.15506 -0.07817
## 21 -0.199843 -0.88486 -0.88119 0.10203 -0.29703
## 22 -0.359558 -1.59386 -1.64328 0.10408 -0.56011
## 23 -0.387785 -1.65610 -1.71455 0.03475 -0.32532
## 24 -0.010698 -0.04979 -0.04886 0.18729 -0.02346
## 25 -0.283316 -1.25181 -1.26569 0.09823 -0.41775
## 26 0.017856 0.08231 0.08078 0.17144 0.03674
## 27 0.279286 1.27482 1.29043 0.15506 0.55279
## 28 0.022484 0.09865 0.09682 0.08549 0.02960
## 29 0.174895 0.76514 0.75911 0.08017 0.22411
## 30 0.147270 0.66281 0.65578 0.13089 0.25450
## cooks_distance s6 COVRATIO s7
## 1 2.251e-03 1.2810
## 2 2.924e-03 1.1521
## 3 5.779e-03 1.2769
## 4 4.813e-05 1.2990
## 5 2.180e-01 0.5362 *
## 6 2.543e-03 1.1331
## 7 4.984e-02 0.9975
## 8 4.686e-01 * 0.8945
## 9 1.249e-03 1.3733
## 10 3.017e-04 1.1941
## 11 5.424e-02 0.6697
## 12 1.028e-03 1.1598
## 13 5.609e-03 1.1668
## 14 1.084e-02 1.1487
## 15 3.391e-03 1.1495
## 16 7.153e-03 1.1181
## 17 2.135e-03 1.2226
## 18 1.326e-02 1.1728
## 19 5.783e-03 1.1493
## 20 2.113e-03 1.3203
## 21 2.965e-02 1.1417
## 22 9.838e-02 0.9293
## 23 3.291e-02 0.8413
## 24 1.904e-04 1.3776
## 25 5.690e-02 1.0380
## 26 4.672e-04 1.3506
## 27 9.941e-02 1.1002
## 28 3.032e-04 1.2232
## 29 1.701e-02 1.1400
## 30 2.206e-02 1.2267
第5、8号样本的标准化残差、外学生化残差最大,对应变量Y的影响最大。而且DFFITS, Cook, COVRATIO指标超标。
toothpaste.lm2.sol = lm(Y ~ X1 + X2, data = toothpaste, subset = c(-5, -8))
summary(toothpaste.lm1.sol)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = toothpaste)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4978 -0.1203 -0.0087 0.1108 0.5811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.407 0.722 6.10 1.6e-06 ***
## X1 1.588 0.299 5.30 1.3e-05 ***
## X2 0.563 0.119 4.73 6.2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.238 on 27 degrees of freedom
## Multiple R-squared: 0.886, Adjusted R-squared: 0.878
## F-statistic: 105 on 2 and 27 DF, p-value: 1.84e-13
summary(toothpaste.lm2.sol)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = toothpaste, subset = c(-5, -8))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4635 -0.1014 0.0357 0.1101 0.3228
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.112 0.699 5.88 3.9e-06 ***
## X1 1.550 0.263 5.90 3.7e-06 ***
## X2 0.605 0.115 5.28 1.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.197 on 25 degrees of freedom
## Multiple R-squared: 0.921, Adjusted R-squared: 0.915
## F-statistic: 145 on 2 and 25 DF, p-value: 1.69e-14
5、8号样本删除后对新数据集进行回归后得到的相关系数的R平方已经从0.886上升到0.921,可以认为新模型好于PDF版326页的线性回归模型。
cement <- data.frame(X1 = c(7, 1, 11, 11, 7, 11, 3, 1, 2, 21, 1, 11, 10), X2 = c(26,
29, 56, 31, 52, 55, 71, 31, 54, 47, 40, 66, 68), X3 = c(6, 15, 8, 8, 6,
9, 17, 22, 18, 4, 23, 9, 8), X4 = c(60, 52, 20, 47, 33, 22, 6, 44, 22, 26,
34, 12, 12), Y = c(78.5, 74.3, 104.3, 87.6, 95.9, 109.2, 102.7, 72.5, 93.1,
115.9, 83.8, 113.3, 109.4))
cement
## X1 X2 X3 X4 Y
## 1 7 26 6 60 78.5
## 2 1 29 15 52 74.3
## 3 11 56 8 20 104.3
## 4 11 31 8 47 87.6
## 5 7 52 6 33 95.9
## 6 11 55 9 22 109.2
## 7 3 71 17 6 102.7
## 8 1 31 22 44 72.5
## 9 2 54 18 22 93.1
## 10 21 47 4 26 115.9
## 11 1 40 23 34 83.8
## 12 11 66 9 12 113.3
## 13 10 68 8 12 109.4
cement.cor = cor(cement[1:4])
kappa(cement.cor, exact = TRUE)
## [1] 1377
因为kappa条件数为1377>1000,所以可以认为该数据集有严重的多重共线性
eigen(cement.cor)
## $values
## [1] 2.235704 1.576066 0.186606 0.001624
##
## $vectors
## [,1] [,2] [,3] [,4]
## [1,] -0.4760 0.5090 0.6755 0.2411
## [2,] -0.5639 -0.4139 -0.3144 0.6418
## [3,] 0.3941 -0.6050 0.6377 0.2685
## [4,] 0.5479 0.4512 -0.1954 0.6767
由于X3,X4前的系数都比较小,可作为去掉变量的候选
cement.corX4 = cor(cement[, c("X1", "X2", "X3")])
kappa(cement.corX4, exact = TRUE)
## [1] 11.12
得到的kappa条件数是k=11.12 << 1000,多重共线性较弱
cement.corX3 = cor(cement[, c("X1", "X2", "X4")])
kappa(cement.corX3, exact = TRUE)
## [1] 77.25
得到的kappa条件数是k=77.25 < 1000,多重共线性中等
cement.cor2 = cor(cement[, c("X1", "X2")])
kappa(cement.cor2, exact = TRUE)
## [1] 1.593
得到的kappa条件数是k=1.593 << 100,已不具有多重共线性
对比书中例子6.10,step函数自动去掉X3,从多重共线性角度不一定是最优的,但是同时去掉X3,X4从多重共线性分析是很合理的。
X1 = c(1, 1, 1, 1, 0, 0, 0, 0) #是否用抗生素
X2 = c(1, 1, 0, 0, 1, 1, 0, 0) #是否有危险因子
X3 = c(1, 0, 1, 0, 1, 0, 1, 0) #事先有计划
success = c(1, 11, 0, 0, 28, 23, 8, 0) #有感染
fail = c(17, 87, 2, 0, 30, 3, 32, 9) #无感染
infection = data.frame(X1, X2, X3, success, fail)
infection$Ymat = cbind(infection$success, infection$fail)
glm.sol = glm(Ymat ~ X1 + X2 + X3, family = binomial, data = infection)
summary(glm.sol)
##
## Call:
## glm(formula = Ymat ~ X1 + X2 + X3, family = binomial, data = infection)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7 8
## 0.2647 -0.0716 -0.1523 0.0000 -0.7852 1.4962 1.2156 -2.5623
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.821 0.495 -1.66 0.097 .
## X1 -3.254 0.481 -6.76 1.4e-11 ***
## X2 2.030 0.455 4.46 8.2e-06 ***
## X3 -1.072 0.425 -2.52 0.012 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 83.491 on 6 degrees of freedom
## Residual deviance: 10.997 on 3 degrees of freedom
## AIC: 36.18
##
## Number of Fisher Scoring iterations: 5
感染的回归模型是P=exp(-0.82-3.2544X1+2.0299X2-1.072X3)/1+exp(-0.82-3.2544X1+2.0299X2-1.072X3)
根据上述模型,我们认为使用抗生素并有计划,将很大可能无感染。而有危险因子将很大可能有感染。
X1 = c(70, 60, 70, 40, 40, 70, 70, 80, 60, 30, 80, 40, 60, 40, 20, 50, 50, 40,
80, 70, 60, 90, 50, 70, 20, 80, 60, 50, 70, 40, 30, 30, 40, 60, 80, 70,
30, 60, 80, 70) # 生活行为能力
X2 = c(64, 63, 65, 69, 63, 48, 48, 63, 63, 53, 43, 55, 66, 67, 61, 63, 66, 68,
41, 53, 37, 54, 52, 50, 65, 52, 70, 40, 36, 44, 54, 59, 69, 50, 62, 68,
39, 49, 64, 67) # 年龄
X3 = c(5, 9, 11, 10, 58, 9, 11, 4, 14, 4, 12, 2, 25, 23, 19, 4, 16, 12, 12,
8, 13, 12, 8, 7, 21, 28, 13, 13, 22, 36, 9, 87, 5, 22, 4, 15, 4, 11, 10,
18) # 诊断到直入研究时间
X4 = c(rep(1, 7), rep(2, 7), rep(3, 2), rep(0, 4), rep(1, 8), rep(2, 4), rep(3,
3), rep(0, 5)) # 肿瘤类型
X5 = c(rep(1, 21), rep(0, 19)) # 化疗方法
Y = c(1, rep(0, 11), 1, rep(0, 5), 1, 1, 0, 1, 1, 1, 0, 1, rep(0, 12), 1, 1)
lung.df = data.frame(X1, X2, X3, X4, X5, Y)
lung.df
## X1 X2 X3 X4 X5 Y
## 1 70 64 5 1 1 1
## 2 60 63 9 1 1 0
## 3 70 65 11 1 1 0
## 4 40 69 10 1 1 0
## 5 40 63 58 1 1 0
## 6 70 48 9 1 1 0
## 7 70 48 11 1 1 0
## 8 80 63 4 2 1 0
## 9 60 63 14 2 1 0
## 10 30 53 4 2 1 0
## 11 80 43 12 2 1 0
## 12 40 55 2 2 1 0
## 13 60 66 25 2 1 1
## 14 40 67 23 2 1 0
## 15 20 61 19 3 1 0
## 16 50 63 4 3 1 0
## 17 50 66 16 0 1 0
## 18 40 68 12 0 1 0
## 19 80 41 12 0 1 1
## 20 70 53 8 0 1 1
## 21 60 37 13 1 1 0
## 22 90 54 12 1 0 1
## 23 50 52 8 1 0 1
## 24 70 50 7 1 0 1
## 25 20 65 21 1 0 0
## 26 80 52 28 1 0 1
## 27 60 70 13 1 0 0
## 28 50 40 13 1 0 0
## 29 70 36 22 2 0 0
## 30 40 44 36 2 0 0
## 31 30 54 9 2 0 0
## 32 30 59 87 2 0 0
## 33 40 69 5 3 0 0
## 34 60 50 22 3 0 0
## 35 80 62 4 3 0 0
## 36 70 68 15 0 0 0
## 37 30 39 4 0 0 0
## 38 60 49 11 0 0 0
## 39 80 64 10 0 0 1
## 40 70 67 18 0 0 1
lung.glm1 = glm(Y ~ X1 + X2 + X3 + X4 + X5, family = binomial, data = lung.df)
summary(lung.glm1)
##
## Call:
## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5, family = binomial,
## data = lung.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7150 -0.6673 -0.2225 0.0994 2.2394
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.0114 4.4753 -1.57 0.117
## X1 0.0999 0.0430 2.32 0.020 *
## X2 0.0142 0.0470 0.30 0.763
## X3 0.0175 0.0546 0.32 0.749
## X4 -1.0830 0.5872 -1.84 0.065 .
## X5 -0.6131 0.9607 -0.64 0.523
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.987 on 39 degrees of freedom
## Residual deviance: 28.392 on 34 degrees of freedom
## AIC: 40.39
##
## Number of Fisher Scoring iterations: 6
肺癌生存时间的模型是P=exp(-7.0114+0.0999X1+0.01415X2+0.01749X3-1.083X4-0.613X5)/1+exp(-7.0114+0.0999X1+0.01415X2+0.01749X3-1.083X4-0.613X5)
X1~X5对P(Y=1)的综合影响不够显著。X4肿瘤类型是最主要的影响因素,但不够显著。
lung.pre1 = predict(lung.glm1, lung.df[1:5])
p.lung.pre1 = exp(lung.pre1)/(1 + exp(lung.pre1))
p.lung.pre1
## 1 2 3 4 5 6 7
## 0.3278247 0.1595365 0.3545827 0.0277025 0.0571281 0.2943052 0.3016214
## 8 9 10 11 12 13 14
## 0.3029655 0.0655459 0.0025428 0.2736051 0.0068326 0.0814823 0.0116345
## 15 16 17 18 19 20 21
## 0.0004623 0.0072859 0.1957195 0.0791212 0.7615554 0.5650524 0.1234936
## 22 23 24 25 26 27 28
## 0.8670126 0.0978611 0.4333639 0.0080967 0.7552594 0.2932708 0.0908222
## 29 30 31 32 33 34 35
## 0.2163684 0.0193182 0.0051825 0.0214067 0.0054949 0.0402641 0.2112761
## 36 37 38 39 40
## 0.7702151 0.0325750 0.4678423 0.8874450 0.7769251
lung.glm2 = step(lung.glm1)
## Start: AIC=40.39
## Y ~ X1 + X2 + X3 + X4 + X5
##
## Df Deviance AIC
## - X3 1 28.5 38.5
## - X2 1 28.5 38.5
## - X5 1 28.8 38.8
## <none> 28.4 40.4
## - X4 1 32.6 42.6
## - X1 1 38.3 48.3
##
## Step: AIC=38.48
## Y ~ X1 + X2 + X4 + X5
##
## Df Deviance AIC
## - X2 1 28.6 36.6
## - X5 1 29.0 37.0
## <none> 28.5 38.5
## - X4 1 32.7 40.7
## - X1 1 38.5 46.5
##
## Step: AIC=36.56
## Y ~ X1 + X4 + X5
##
## Df Deviance AIC
## - X5 1 29.1 35.1
## <none> 28.6 36.6
## - X4 1 32.9 38.9
## - X1 1 38.5 44.5
##
## Step: AIC=35.07
## Y ~ X1 + X4
##
## Df Deviance AIC
## <none> 29.1 35.1
## - X4 1 33.5 37.5
## - X1 1 39.1 43.1
summary(lung.glm2)
##
## Call:
## glm(formula = Y ~ X1 + X4, family = binomial, data = lung.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.482 -0.662 -0.188 0.123 2.284
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.1375 2.7384 -2.24 0.025 *
## X1 0.0976 0.0408 2.39 0.017 *
## X4 -1.1252 0.6024 -1.87 0.062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.987 on 39 degrees of freedom
## Residual deviance: 29.073 on 37 degrees of freedom
## AIC: 35.07
##
## Number of Fisher Scoring iterations: 6
lung.pre2 = predict(lung.glm2, lung.df[1:5])
p.lung.pre2 = exp(lung.pre2)/(1 + exp(lung.pre2))
p.lung.pre2
## 1 2 3 4 5 6 7
## 0.3937156 0.1966149 0.3937156 0.0335913 0.0335913 0.3937156 0.3937156
## 8 9 10 11 12 13 14
## 0.3586809 0.0735887 0.0042337 0.3586809 0.0111560 0.0735887 0.0111560
## 15 16 17 18 19 20 21
## 0.0005198 0.0096230 0.2212816 0.0967316 0.8414941 0.6667496 0.1966149
## 22 23 24 25 26 27 28
## 0.8205404 0.0844432 0.3937156 0.0049125 0.6327765 0.1966149 0.0844432
## 29 30 31 32 33 34 35
## 0.1740832 0.0111560 0.0042337 0.0042337 0.0036484 0.0251343 0.1536398
## 36 37 38 39 40
## 0.6667496 0.0387931 0.4298786 0.8414941 0.6667496
比较两个模型,从估计病人生存时间角度,使用简化模型更方便,且在仅考虑的X1,X4两个因素更显著。