

An Introductory Course in the R Statistical Software


Topic 4: Regression Analysis with R

  1. Basic concepts of regression
  2. Multiple linear regression
  3. Model selection
  4. Nonlinear regression: binary logit analysis

1. Basic Concepts of Linear Regression

The linear model

 X1->Y 
 Y = a + b1*X1 

Assumptions

- the relationship between X and Y is linear
- the errors are independent of one another
- the errors have constant variance (homoscedasticity)
- the errors are normally distributed


Model structure / formula syntax

Y ~ X1 + X2 + X3 …
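
As a minimal sketch of this syntax (y, x1, x2 are made-up names, not variables from the course data): the left-hand side of ~ is the dependent variable, and the right-hand side lists the predictors that lm() turns into Y = a + b1*X1 + b2*X2 + ...

set.seed(1)
x1 <- rnorm(100)
x2 <- rnorm(100)
y <- 2 + 3 * x1 - 1 * x2 + rnorm(100)  # simulate data with known coefficients
fit <- lm(y ~ x1 + x2)                 # same structure as Y ~ X1 + X2 + ...
coef(fit)                              # estimated intercept, b1, b2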


First, look at the basic relationship between the independent and the dependent variable

> load("wgcoll.rda")
> plot(wgc$aa ~ wgc$pe)  # academic average & parents' education

[Figure: scatterplot of aa (academic average) against pe (parents' education)]

> cor(wgc$aa, wgc$pe)  # correlation coefficient
## [1] 0.7931

> model1 <- lm(wgc$aa ~ wgc$pe)
> summary(model1)
## 
## Call:
## lm(formula = wgc$aa ~ wgc$pe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.243  -4.502   0.712   5.757  24.802 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.659      7.875    0.21     0.83    
## wgc$pe         5.045      0.559    9.02  6.6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 10.7 on 48 degrees of freedom
## Multiple R-squared: 0.629,   Adjusted R-squared: 0.621 
## F-statistic: 81.4 on 1 and 48 DF,  p-value: 6.57e-12

Use Cook's distance to find the most influential cases

> plot(wgc$aa ~ wgc$pe, cex = 10 * sqrt(cooks.distance(model1)))
> abline(h = mean(wgc$aa), lty = 2)  # horizontal line at the mean of aa
> abline(v = mean(wgc$pe), lty = 3)  # vertical line at the mean of pe
> abline(model1, lty = 1, col = "red")

[Figure: the scatterplot with point size proportional to Cook's distance, dashed mean lines, and the fitted regression line in red]
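
If you prefer a non-interactive check, you can also ask R directly which case has the largest Cook's distance; the 4/n cutoff below is a common rule of thumb, not part of the course material.

which.max(cooks.distance(model1))              # row number of the most influential case
which(cooks.distance(model1) > 4 / nrow(wgc))  # cases above the 4/n rule of thumb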


Try this part yourself

attach(wgc)
with(wgc, identify(pe, aa, labels = id))  # click once to label the outlier in the lower-left corner
abline(lm(aa ~ pe, data = wgc, subset = c(-8)), lty = 2, col = "blue")
labels <- c("all", "-8")
# add a legend to the plot (click where you want it placed)
legend(locator(1), legend = labels, lty = 1:2, col = c("red", "blue"))
detach(wgc)

The fitted "linear model" produced by the regression

aa = 1.66 + 5.04 * pe
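
A quick sketch of using the fitted equation for prediction (pe = 12 is just an illustrative value):

coef(model1)[1] + coef(model1)[2] * 12  # predicted aa when pe = 12, from the coefficients
# model1 was fit as wgc$aa ~ wgc$pe, so for predict() it is easier to refit with a data argument:
m1 <- lm(aa ~ pe, data = wgc)
predict(m1, newdata = data.frame(pe = 12))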


Interpreting the regression results

The F test and R-squared

> summary(model1)
## 
## Call:
## lm(formula = wgc$aa ~ wgc$pe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.243  -4.502   0.712   5.757  24.802 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.659      7.875    0.21     0.83    
## wgc$pe         5.045      0.559    9.02  6.6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 10.7 on 48 degrees of freedom
## Multiple R-squared: 0.629,   Adjusted R-squared: 0.621 
## F-statistic: 81.4 on 1 and 48 DF,  p-value: 6.57e-12

> cor <- cor(wgc$aa, wgc$pe)
> cor^2
## [1] 0.6291
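
The Multiple R-squared reported by summary() can also be extracted directly; in simple regression it equals the squared correlation computed above.

summary(model1)$r.squared  # should match cor^2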

2. Multiple Linear Regression

Some people argue that where students live affects their grades. What do you think?

> model2 <- lm(aa ~ pe + c, data = wgc)  # community type: 0=urban, 1=rural
> summary(model2)
## 
## Call:
## lm(formula = aa ~ pe + c, data = wgc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.226  -4.721   0.831   6.424  16.942 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.462      6.946    0.79  0.43559    
## pe             4.443      0.511    8.69  2.4e-11 ***
## c             11.276      2.829    3.99  0.00023 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9.36 on 47 degrees of freedom
## Multiple R-squared: 0.723,   Adjusted R-squared: 0.711 
## F-statistic: 61.3 on 2 and 47 DF,  p-value: 8.06e-14

> plot(fitted(model2), resid(model2))

[Figure: residuals vs. fitted values for model2]


Using dummy variables: two dummy variables can represent the three categories of a nominal variable

Some people argue that student motivation affects grades. Do you agree?
table(wgc$sm)
## 
##  0  1  2 
## 13 23 14
sm1 <- ifelse(wgc$sm == 1, 1, 0)
sm2 <- ifelse(wgc$sm == 2, 1, 0)  # note: three categories, but only two dummy variables!
model3 <- lm(aa ~ pe + c + sm1 + sm2, data = wgc)
# equivalently: model3 <- update(model2, . ~ . + factor(sm))

> summary(model3)
## 
## Call:
## lm(formula = aa ~ pe + c + sm1 + sm2, data = wgc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24.09  -5.29   1.86   5.39  19.86 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.353      7.020    0.90  0.37031    
## pe             4.584      0.568    8.07  2.7e-10 ***
## c             11.639      2.825    4.12  0.00016 ***
## sm1           -5.853      3.513   -1.67  0.10264    
## sm2           -1.009      4.208   -0.24  0.81162    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9.16 on 45 degrees of freedom
## Multiple R-squared: 0.746,   Adjusted R-squared: 0.724 
## F-statistic: 33.1 on 4 and 45 DF,  p-value: 7.07e-13

plot(fitted(model3), resid(model3))

[Figure: residuals vs. fitted values for model3]
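
As an alternative sketch, you can let R build the dummy variables for you with factor(); model.matrix() shows the coding R uses (level 0 is the reference category, with one column each for levels 1 and 2). The name model3b is just for illustration.

model3b <- lm(aa ~ pe + c + factor(sm), data = wgc)  # same fit as model3
head(model.matrix(~ factor(sm), data = wgc))         # inspect the dummy coding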

Using interaction terms

- Some people argue that the effect of parents' education actually depends on where the family lives.
- (the effect of X1 depends on X2)


> model4 <- lm(aa ~ pe * c, data = wgc)  # pe * c expands to pe + c + pe:c
> model4 <- lm(aa ~ pe + c + pe:c, data = wgc)  # so these two calls fit the same model
> summary(model4)
## 
## Call:
## lm(formula = aa ~ pe + c + pe:c, data = wgc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.150  -4.545   0.799   6.523  13.633 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.526      8.011   -0.07     0.95    
## pe             4.898      0.595    8.24  1.3e-10 ***
## c             34.751     16.425    2.12     0.04 *  
## pe:c          -1.636      1.128   -1.45     0.15    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9.26 on 46 degrees of freedom
## Multiple R-squared: 0.735,   Adjusted R-squared: 0.718 
## F-statistic: 42.5 on 3 and 46 DF,  p-value: 2.6e-13
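
One way to read the interaction is as two different slopes for pe, one per community type (c: 0 = urban, 1 = rural); a small sketch computed from the coefficients above:

b <- coef(model4)
b["pe"]              # slope of pe when c = 0 (urban): about 4.90
b["pe"] + b["pe:c"]  # slope of pe when c = 1 (rural): about 4.90 - 1.64 = 3.26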

3. Model Selection

Akaike's information criterion (AIC): the smaller, the better.

AIC(model1)
## [1] 383
AIC(model2)
## [1] 370.5
AIC(model3)
## [1] 370.1
AIC(model4)
## [1] 370.2
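
AIC() also accepts several fitted models at once and returns a small comparison table (degrees of freedom and AIC for each model):

AIC(model1, model2, model3, model4)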

A regression model explaining babies' birth weight

install.packages("UsingR")
library(UsingR)
## Loading required package: MASS
data(babies)
# keep cases without unknown codes (99 / 999 mark missing values in this dataset)
subbabies <- subset(babies, gestation < 350 & age < 99 & ht < 99 & 
    wt1 < 999 & dage < 99 & dht < 99 & dwt < 999)
res.lm <- lm(wt ~ gestation + age + ht + wt1 + dage + dht + dwt, 
    data = subbabies)

summary(res.lm)
## 
## Call:
## lm(formula = wt ~ gestation + age + ht + wt1 + dage + dht + dwt, 
##     data = subbabies)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48.69 -10.54   0.27  10.13  55.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.4576    23.3486   -4.52  7.4e-06 ***
## gestation      0.4625     0.0397   11.66  < 2e-16 ***
## age            0.1384     0.1879    0.74    0.462    
## ht             1.2161     0.2848    4.27  2.2e-05 ***
## wt1            0.0289     0.0342    0.84    0.399    
## dage           0.0590     0.1653    0.36    0.721    
## dht           -0.0663     0.2703   -0.25    0.806    
## dwt            0.0782     0.0330    2.37    0.018 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 16.4 on 692 degrees of freedom
## Multiple R-squared: 0.215,   Adjusted R-squared: 0.207 
## F-statistic: 27.1 on 7 and 692 DF,  p-value: <2e-16

plot(fitted(res.lm), resid(res.lm))

[Figure: residuals vs. fitted values for res.lm]


plot(res.lm)  # note: R asks you to press Enter to page through the four diagnostic plots; also note the outlier, case 261

[Figure: the four regression diagnostic plots for res.lm]
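
If you would rather see all four diagnostic plots at once instead of pressing Enter for each, a common approach is to set a 2-by-2 plotting layout first:

op <- par(mfrow = c(2, 2))  # 2 x 2 grid of plots
plot(res.lm)
par(op)                     # restore the previous graphics settings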


babies[260:263, ]  # case 261 (id = 4604) has a suspiciously short gestation (148 days)
##       id pluralty outcome date gestation sex  wt parity race age ed ht wt1
## 260 4514        5       1 1569       285   1 130      1    6  23  4 63 128
## 261 4604        5       1 1598       148   1 116      7    7  28  1 66 135
## 262 4639        5       1 1569       256   1  81      4    2  30  1 64 148
## 263 4674        5       1 1355       287   1 124      3    4  27  1 62 105
##     drace dage ded dht dwt marital inc smoke time number
## 260     6   27   2  99 999       1  98     1    1      1
## 261     7   36   1  68 155       0   2     0    0      0
## 262     5   31   2  73 190       1   5     1    1      5
## 263     5   31   1  99 999       1  98     1    1      2
subbabies2 <- subset(subbabies, id != 4604)
res.lm2 <- lm(wt ~ gestation + age + ht + wt1 + dage + dht + dwt, 
    data = subbabies2)

plot(res.lm2)

[Figure: the four regression diagnostic plots for res.lm2, with case 261 removed]


In-class exercise

Some people argue that gender affects grades. Do you believe that? Use the lm and update commands to build a model that tests this claim.
model5 <- update(model3, . ~ . + g)  # one possible answer: add gender (g) to model3
summary(model5)

4. Nonlinear Regression: Binary Logit Analysis

Basic concepts


What the odds are
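
For concreteness, the odds of an event are p / (1 - p), and the logit is the natural log of the odds. A minimal numeric sketch (p = 0.75 is just an example):

p <- 0.75            # probability of passing, for illustration
odds <- p / (1 - p)  # 3: passing is three times as likely as failing
log(odds)            # the logit (log odds), about 1.10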


lm is just one special case of glm

load("wgcoll.rda")
summary(lm(aa ~ pe * c, data = wgc))
## 
## Call:
## lm(formula = aa ~ pe * c, data = wgc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.150  -4.545   0.799   6.523  13.633 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.526      8.011   -0.07     0.95    
## pe             4.898      0.595    8.24  1.3e-10 ***
## c             34.751     16.425    2.12     0.04 *  
## pe:c          -1.636      1.128   -1.45     0.15    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9.26 on 46 degrees of freedom
## Multiple R-squared: 0.735,   Adjusted R-squared: 0.718 
## F-statistic: 42.5 on 3 and 46 DF,  p-value: 2.6e-13

summary(glm(aa ~ pe * c, family = gaussian, data = wgc))
## 
## Call:
## glm(formula = aa ~ pe * c, family = gaussian, data = wgc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -27.150   -4.545    0.799    6.523   13.633  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.526      8.011   -0.07     0.95    
## pe             4.898      0.595    8.24  1.3e-10 ***
## c             34.751     16.425    2.12     0.04 *  
## pe:c          -1.636      1.128   -1.45     0.15    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for gaussian family taken to be 85.66)
## 
##     Null deviance: 14863.8  on 49  degrees of freedom
## Residual deviance:  3940.4  on 46  degrees of freedom
## AIC: 370.2
## 
## Number of Fisher Scoring iterations: 2
# equivalently: summary(glm(aa ~ pe + c + pe:c, family = gaussian, data = wgc))

The binary logit model

pass <- ifelse(wgc$aa >= 60, 1, 0)  # 1 = passing (aa >= 60), 0 = failing
mod01 <- glm(pass ~ pe + factor(sm) + factor(g), family = binomial, 
    data = wgc)  # note the family argument

summary(mod01)
## 
## Call:
## glm(formula = pass ~ pe + factor(sm) + factor(g), family = binomial, 
##     data = wgc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3261  -0.3483   0.0001   0.4305   1.6708  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -10.2884     3.7654   -2.73   0.0063 **
## pe             0.8343     0.3165    2.64   0.0084 **
## factor(sm)1    0.4102     0.9704    0.42   0.6725   
## factor(sm)2   17.8166  2494.3276    0.01   0.9943   
## factor(g)1     0.0103     0.9711    0.01   0.9916   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 59.295  on 49  degrees of freedom
## Residual deviance: 30.070  on 45  degrees of freedom
## AIC: 40.07
## 
## Number of Fisher Scoring iterations: 18

[Advanced] Using tools to select a model automatically

This method is suitable only for research without a theoretical basis to guide model specification.
library(MASS)
stepAIC(mod01)
## Start:  AIC=40.07
## pass ~ pe + factor(sm) + factor(g)
## 
##              Df Deviance  AIC
## - factor(g)   1     30.1 38.1
## - factor(sm)  2     33.3 39.3
## <none>              30.1 40.1
## - pe          1     43.7 51.7
## 
## Step:  AIC=38.07
## pass ~ pe + factor(sm)
## 
##              Df Deviance  AIC
## - factor(sm)  2     33.5 37.5
## <none>              30.1 38.1
## - pe          1     43.7 49.7
## 
## Step:  AIC=37.51
## pass ~ pe
## 
##        Df Deviance  AIC
## <none>        33.5 37.5
## - pe    1     59.3 61.3
## 
## Call:  glm(formula = pass ~ pe, family = binomial, data = wgc)
## 
## Coefficients:
## (Intercept)           pe  
##      -11.87         1.01  
## 
## Degrees of Freedom: 49 Total (i.e. Null);  48 Residual
## Null Deviance:       59.3 
## Residual Deviance: 33.5  AIC: 37.5

mod02 <- update(mod01, . ~ . - factor(sm) - factor(g))
# equivalently: mod02 <- glm(pass ~ pe, family = binomial, data = wgc)

summary(mod02)
## 
## Call:
## glm(formula = pass ~ pe, family = binomial, data = wgc)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.559  -0.343   0.169   0.452   1.527  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -11.865      3.831   -3.10   0.0020 **
## pe             1.007      0.313    3.22   0.0013 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 59.295  on 49  degrees of freedom
## Residual deviance: 33.508  on 48  degrees of freedom
## AIC: 37.51
## 
## Number of Fisher Scoring iterations: 6

Interpreting the results

exp(1.0067)  # e raised to the power 1.0067 (the coefficient of pe)
## [1] 2.737

According to the results of mod02, parents' education is an important explanatory variable for whether a student passes. For each additional year of parents' education, the log odds of passing increase by about 1.01; equivalently, the odds of passing are multiplied by 2.74, an increase of 174%, i.e. (2.74 - 1) * 100%.
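
The same conversion can be applied to the whole coefficient vector, and predicted probabilities can be obtained with predict(); a sketch (pe = 12 is only an illustrative value):

exp(coef(mod02))  # odds ratios for the intercept and pe
predict(mod02, newdata = data.frame(pe = 12), type = "response")  # predicted probability of passing when pe = 12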