library(slidify)
create_deck(“Topic04”, git = TRUE)
X1->Y
Y = a + b1*X1
> load("wgcoll.rda")
> plot(wgc$aa ~ wgc$pe) # academic average & parents' education
> cor(wgc$aa, wgc$pe) # correlation coefficient
## [1] 0.7931
> model1 <- lm(wgc$aa ~ wgc$pe)
> summary(model1)
##
## Call:
## lm(formula = wgc$aa ~ wgc$pe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.243 -4.502 0.712 5.757 24.802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.659 7.875 0.21 0.83
## wgc$pe 5.045 0.559 9.02 6.6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 48 degrees of freedom
## Multiple R-squared: 0.629, Adjusted R-squared: 0.621
## F-statistic: 81.4 on 1 and 48 DF, p-value: 6.57e-12
> plot(wgc$aa ~ wgc$pe, cex = 10 * sqrt(cooks.distance(model1)))
> abline(h = mean(wgc$aa), lty = 2) #畫水平線
> abline(v = mean(wgc$pe), lty = 3) #畫垂直線
> abline(model1, lty = 1, col = "red")
attach(wgc)
with(wgc, identify(aa ~ pe, labels = id)) # 手動點出1個極端值(左下角)
abline(lm(aa ~ pe, data = wgc, subset = c(-8)), lty = 2, col = "blue")
labels <- c("all", "-8")
# 在圖上加圖示
legend(locator(1), legend = labels, lty = 1:2, col = c("red", "blue"))
detach(wgc)
> summary(model1)
##
## Call:
## lm(formula = wgc$aa ~ wgc$pe)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.243 -4.502 0.712 5.757 24.802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.659 7.875 0.21 0.83
## wgc$pe 5.045 0.559 9.02 6.6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 48 degrees of freedom
## Multiple R-squared: 0.629, Adjusted R-squared: 0.621
## F-statistic: 81.4 on 1 and 48 DF, p-value: 6.57e-12
> cor <- cor(wgc$aa, wgc$pe)
> cor^2
## [1] 0.6291
> model2 <- lm(aa ~ pe + c, data = wgc) # community type: 0=urban, 1=rural
> summary(model2)
##
## Call:
## lm(formula = aa ~ pe + c, data = wgc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.226 -4.721 0.831 6.424 16.942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.462 6.946 0.79 0.43559
## pe 4.443 0.511 8.69 2.4e-11 ***
## c 11.276 2.829 3.99 0.00023 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.36 on 47 degrees of freedom
## Multiple R-squared: 0.723, Adjusted R-squared: 0.711
## F-statistic: 61.3 on 2 and 47 DF, p-value: 8.06e-14
> plot(fitted(model2), resid(model2))
table(wgc$sm)
##
## 0 1 2
## 13 23 14
sm1 <- ifelse(wgc$sm == 1, 1, 0)
sm2 <- ifelse(wgc$sm == 2, 1, 0) #注意共有三個類別但只編了兩個虛擬變數!
model3 <- lm(aa ~ pe + c + sm1 + sm2, data = wgc)
# 也可以寫成: model3 <- update(model2, .~.+ sm, data=wgc)
> summary(model3)
##
## Call:
## lm(formula = aa ~ pe + c + sm1 + sm2, data = wgc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.09 -5.29 1.86 5.39 19.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.353 7.020 0.90 0.37031
## pe 4.584 0.568 8.07 2.7e-10 ***
## c 11.639 2.825 4.12 0.00016 ***
## sm1 -5.853 3.513 -1.67 0.10264
## sm2 -1.009 4.208 -0.24 0.81162
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.16 on 45 degrees of freedom
## Multiple R-squared: 0.746, Adjusted R-squared: 0.724
## F-statistic: 33.1 on 4 and 45 DF, p-value: 7.07e-13
plot(fitted(model3), resid(model3))
-有人說,其實果家長的教育程度其實是受到其居住地的影響。
-(the effect of X1 dependents on X2)
> model4 <- lm(aa ~ pe * c, data = wgc)
> model4 <- lm(aa ~ pe + c + pe:c, data = wgc)
> summary(model4)
##
## Call:
## lm(formula = aa ~ pe + c + pe:c, data = wgc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.150 -4.545 0.799 6.523 13.633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.526 8.011 -0.07 0.95
## pe 4.898 0.595 8.24 1.3e-10 ***
## c 34.751 16.425 2.12 0.04 *
## pe:c -1.636 1.128 -1.45 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.26 on 46 degrees of freedom
## Multiple R-squared: 0.735, Adjusted R-squared: 0.718
## F-statistic: 42.5 on 3 and 46 DF, p-value: 2.6e-13
AIC(model1)
## [1] 383
AIC(model2)
## [1] 370.5
AIC(model3)
## [1] 370.1
AIC(model4)
## [1] 370.2
install.packages("UsingR")
library(UsingR)
## Loading required package: MASS
data(babies)
subbabies <- subset(babies, gestation < 350 & age < 99 & ht < 99 &
wt1 < 999 & dage < 99 & dht < 99 & dwt < 999)
res.lm <- lm(wt ~ gestation + age + ht + wt1 + dage + dht + dwt,
data = subbabies)
summary(res.lm)
##
## Call:
## lm(formula = wt ~ gestation + age + ht + wt1 + dage + dht + dwt,
## data = subbabies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.69 -10.54 0.27 10.13 55.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.4576 23.3486 -4.52 7.4e-06 ***
## gestation 0.4625 0.0397 11.66 < 2e-16 ***
## age 0.1384 0.1879 0.74 0.462
## ht 1.2161 0.2848 4.27 2.2e-05 ***
## wt1 0.0289 0.0342 0.84 0.399
## dage 0.0590 0.1653 0.36 0.721
## dht -0.0663 0.2703 -0.25 0.806
## dwt 0.0782 0.0330 2.37 0.018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.4 on 692 degrees of freedom
## Multiple R-squared: 0.215, Adjusted R-squared: 0.207
## F-statistic: 27.1 on 7 and 692 DF, p-value: <2e-16
plot(fitted(res.lm), resid(res.lm))
plot(res.lm) # 注意:R需要你按Enter鍵來切換四張圖;也注意一下261這個outlier
babies[260:263, ] # 261st (id=4604)這個個案看似陣痛期特別短
## id pluralty outcome date gestation sex wt parity race age ed ht wt1
## 260 4514 5 1 1569 285 1 130 1 6 23 4 63 128
## 261 4604 5 1 1598 148 1 116 7 7 28 1 66 135
## 262 4639 5 1 1569 256 1 81 4 2 30 1 64 148
## 263 4674 5 1 1355 287 1 124 3 4 27 1 62 105
## drace dage ded dht dwt marital inc smoke time number
## 260 6 27 2 99 999 1 98 1 1 1
## 261 7 36 1 68 155 0 2 0 0 0
## 262 5 31 2 73 190 1 5 1 1 5
## 263 5 31 1 99 999 1 98 1 1 2
subbabies2 <- subset(subbabies, id != 4604)
res.lm2 <- lm(wt ~ gestation + age + ht + wt1 + dage + dht + dwt,
data = subbabies2)
plot(res.lm2)
model5 <- update(model3, . ~ . + g)
summary(model5)
“odd"表「勝算」,例如,80%的好球率可以轉換為80/20=4的勝算,亦即,每一顆球投出是好球的勝算是壞球的四倍(a case is more likely to be in group 1 than in group 0)。
將這個勝算(odds)用取對數(log)的方式轉成介在0與1之間的數值,這個log odds簡稱叫作logits. 這時候自變數與依變數的關係就不再是線性,而是成了S形,而計算迴歸係數的方式也不能再用最小平法OLS(因為變數非線性的關係無法滿足線性迴歸所需要的基本條件,見上一講)。要計算羅吉斯迴歸的係數,我們用的是最大概似法)這個機制來尋找一個係數,使得每個觀察值被正確放進所屬類別(0或1)的機率最大,也就是說,這
一般化的線性模型 generalized linear models (glm)包括了線性模型和二元勝對數模型。可以說線性模型和二元勝對數模型都是一般化的線性模型的獨特型態:線性模型預設解釋變數為常態分佈,即gaussian/normal,而二元勝算對數模型預設解釋變數為界在0與1之間的二元分佈,即binominal distribution。
因此,我們可以用 `glm()` 來取代`lm()`,結果會一樣。
load("wgcoll.rda")
summary(lm(aa ~ pe * c, data = wgc))
##
## Call:
## lm(formula = aa ~ pe * c, data = wgc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.150 -4.545 0.799 6.523 13.633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.526 8.011 -0.07 0.95
## pe 4.898 0.595 8.24 1.3e-10 ***
## c 34.751 16.425 2.12 0.04 *
## pe:c -1.636 1.128 -1.45 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.26 on 46 degrees of freedom
## Multiple R-squared: 0.735, Adjusted R-squared: 0.718
## F-statistic: 42.5 on 3 and 46 DF, p-value: 2.6e-13
summary(glm(aa ~ pe * c, family = gaussian, data = wgc))
##
## Call:
## glm(formula = aa ~ pe * c, family = gaussian, data = wgc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -27.150 -4.545 0.799 6.523 13.633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.526 8.011 -0.07 0.95
## pe 4.898 0.595 8.24 1.3e-10 ***
## c 34.751 16.425 2.12 0.04 *
## pe:c -1.636 1.128 -1.45 0.15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 85.66)
##
## Null deviance: 14863.8 on 49 degrees of freedom
## Residual deviance: 3940.4 on 46 degrees of freedom
## AIC: 370.2
##
## Number of Fisher Scoring iterations: 2
# 也可以寫成 summary(glm(aa~ pe + c + pe:c, family=gaussian, data=wgc))
pass <- ifelse(wgc$aa >= 60, 1, 0)
mod01 <- glm(pass ~ pe + factor(sm) + factor(g), family = binomial,
data = wgc) #注意family
summary(mod01)
##
## Call:
## glm(formula = pass ~ pe + factor(sm) + factor(g), family = binomial,
## data = wgc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3261 -0.3483 0.0001 0.4305 1.6708
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.2884 3.7654 -2.73 0.0063 **
## pe 0.8343 0.3165 2.64 0.0084 **
## factor(sm)1 0.4102 0.9704 0.42 0.6725
## factor(sm)2 17.8166 2494.3276 0.01 0.9943
## factor(g)1 0.0103 0.9711 0.01 0.9916
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 59.295 on 49 degrees of freedom
## Residual deviance: 30.070 on 45 degrees of freedom
## AIC: 40.07
##
## Number of Fisher Scoring iterations: 18
library(MASS)
stepAIC(mod01)
## Start: AIC=40.07
## pass ~ pe + factor(sm) + factor(g)
##
## Df Deviance AIC
## - factor(g) 1 30.1 38.1
## - factor(sm) 2 33.3 39.3
## <none> 30.1 40.1
## - pe 1 43.7 51.7
##
## Step: AIC=38.07
## pass ~ pe + factor(sm)
##
## Df Deviance AIC
## - factor(sm) 2 33.5 37.5
## <none> 30.1 38.1
## - pe 1 43.7 49.7
##
## Step: AIC=37.51
## pass ~ pe
##
## Df Deviance AIC
## <none> 33.5 37.5
## - pe 1 59.3 61.3
##
## Call: glm(formula = pass ~ pe, family = binomial, data = wgc)
##
## Coefficients:
## (Intercept) pe
## -11.87 1.01
##
## Degrees of Freedom: 49 Total (i.e. Null); 48 Residual
## Null Deviance: 59.3
## Residual Deviance: 33.5 AIC: 37.5
mod02 <- update(mod01, . ~ . - factor(sm) - factor(g))
# 也可以寫成 mod02 <- glm(pass ~ pe, family=binomial, data=wgc)
summary(mod02)
##
## Call:
## glm(formula = pass ~ pe, family = binomial, data = wgc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.559 -0.343 0.169 0.452 1.527
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.865 3.831 -3.10 0.0020 **
## pe 1.007 0.313 3.22 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 59.295 on 49 degrees of freedom
## Residual deviance: 33.508 on 48 degrees of freedom
## AIC: 37.51
##
## Number of Fisher Scoring iterations: 6
exp(1.0067) #e的1.0067次方
## [1] 2.737
根據md02的結果,我們知道,家長教育程度是影響學生該次成績及格與否一重要解釋變數。當家長的教育程度的增加一年,學生及格的勝算對數增加了2.74倍,或說及格的勝算增加了174%,即(2.74-1)*100%。