lm() function ; linear model

age <- rep(seq(20, 60, by = 5), 3)
mhr <- 209 - 0.7 * age + rnorm(length(age), sd = 4)
plot(mhr ~ age, main = "Age versus maximum heart rate")
res.mhr <- lm(mhr ~ age)
res.mhr
## 
## Call:
## lm(formula = mhr ~ age)
## 
## Coefficients:
## (Intercept)          age  
##    206.6606      -0.6601
abline(res.mhr)

summary(res.mhr)
## 
## Call:
## lm(formula = mhr ~ age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1566  -2.0898   0.3947   2.1686   7.2804 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 206.66059    2.61739   78.96  < 2e-16 ***
## age          -0.66005    0.06227  -10.60 9.78e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.177 on 25 degrees of freedom
## Multiple R-squared:  0.818,  Adjusted R-squared:  0.8107 
## F-statistic: 112.3 on 1 and 25 DF,  p-value: 9.776e-11
anova(res.mhr)
## Analysis of Variance Table
## 
## Response: mhr
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## age        1 1960.50 1960.50  112.35 9.776e-11 ***
## Residuals 25  436.25   17.45                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(res.mhr)

Making formula with interactions

x <- 1:10
y <- rchisq(10, 3)
z <- 1 + x + y + rnorm(10)
lm(z ~ x + y + I(x * y))
## 
## Call:
## lm(formula = z ~ x + y + I(x * y))
## 
## Coefficients:
## (Intercept)            x            y     I(x * y)  
##      2.6268       0.7208       0.4184       0.1313
lm(z ~ x + y + x:y)
## 
## Call:
## lm(formula = z ~ x + y + x:y)
## 
## Coefficients:
## (Intercept)            x            y          x:y  
##      2.6268       0.7208       0.4184       0.1313
lm(z ~ x * y)
## 
## Call:
## lm(formula = z ~ x * y)
## 
## Coefficients:
## (Intercept)            x            y          x:y  
##      2.6268       0.7208       0.4184       0.1313

Dummy variables

  • 93 Cars on Sale in the USA in 1993
  • missing for Mazda RX-7, which has a rotary engine
library(MASS)
data("Cars93")
names(Cars93)
##  [1] "Manufacturer"       "Model"              "Type"              
##  [4] "Min.Price"          "Price"              "Max.Price"         
##  [7] "MPG.city"           "MPG.highway"        "AirBags"           
## [10] "DriveTrain"         "Cylinders"          "EngineSize"        
## [13] "Horsepower"         "RPM"                "Rev.per.mile"      
## [16] "Man.trans.avail"    "Fuel.tank.capacity" "Passengers"        
## [19] "Length"             "Wheelbase"          "Width"             
## [22] "Turn.circle"        "Rear.seat.room"     "Luggage.room"      
## [25] "Weight"             "Origin"             "Make"
str(Cars93$Cylinders)
##  Factor w/ 6 levels "3","4","5","6",..: 2 4 4 4 2 2 4 4 4 5 ...
lm(MPG.highway ~ Cylinders + Horsepower, data = Cars93)
## 
## Call:
## lm(formula = MPG.highway ~ Cylinders + Horsepower, data = Cars93)
## 
## Coefficients:
##     (Intercept)       Cylinders4       Cylinders5       Cylinders6  
##        45.10713        -10.60858        -16.88485        -15.06570  
##      Cylinders8  Cylindersrotary       Horsepower  
##       -13.79903        -13.25383         -0.02688

Variable selection

  • SAT 점수(언어/수학), 퀴즈 3번, 수학 성작 자료
library(UsingR)
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer
library(MASS)
data("stud.recs")
head(stud.recs)
##       seq.1 seq.2 seq.3 sat.v sat.m letter.grade num.grade
## 1528     80    47    76   440   450          B         3.0
## 14586    74    76    81   410   480          B-        2.7
## 8295     84    85    90   610   590          D         1.0
## 8446     85    88    68   300   520          A         4.0
## 16685    81    75    66   550   560          A         4.0
## 16398    81    71    67   460   400          C         2.0
d <- subset(stud.recs, select = -letter.grade)
fit1 <- lm(num.grade ~ ., data = d)
stepAIC(fit1)
## Start:  AIC=101.22
## num.grade ~ seq.1 + seq.2 + seq.3 + sat.v + sat.m
## 
##         Df Sum of Sq    RSS     AIC
## - seq.2  1    0.0705 254.70  99.254
## - seq.1  1    0.1578 254.78  99.297
## - sat.v  1    1.2766 255.90  99.840
## <none>               254.63 101.220
## - seq.3  1    5.9746 260.60 102.096
## - sat.m  1   10.4959 265.12 104.229
## 
## Step:  AIC=99.25
## num.grade ~ seq.1 + seq.3 + sat.v + sat.m
## 
##         Df Sum of Sq    RSS     AIC
## - seq.1  1    0.2944 254.99  97.398
## - sat.v  1    1.2553 255.95  97.864
## <none>               254.70  99.254
## - seq.3  1    6.0421 260.74 100.162
## - sat.m  1   10.6615 265.36 102.339
## 
## Step:  AIC=97.4
## num.grade ~ seq.3 + sat.v + sat.m
## 
##         Df Sum of Sq    RSS     AIC
## - sat.v  1    1.2497 256.24  96.004
## <none>               254.99  97.398
## - seq.3  1    5.8384 260.83  98.205
## - sat.m  1   10.8807 265.87 100.579
## 
## Step:  AIC=96
## num.grade ~ seq.3 + sat.m
## 
##         Df Sum of Sq    RSS    AIC
## <none>               256.24 96.004
## - seq.3  1    5.5494 261.79 96.661
## - sat.m  1    9.6323 265.87 98.580
## 
## Call:
## lm(formula = num.grade ~ seq.3 + sat.m, data = d)
## 
## Coefficients:
## (Intercept)        seq.3        sat.m  
##    -1.14078      0.01371      0.00479
fit2 <- lm(num.grade ~ 1, data = d)
scope <- list(upper = fit1, lower = fit2)
stepAIC(fit2, direction = "forward", scope = scope)
## Start:  AIC=102.43
## num.grade ~ 1
## 
##         Df Sum of Sq    RSS     AIC
## + sat.m  1   16.9316 261.79  96.661
## + seq.3  1   12.8487 265.87  98.580
## + seq.2  1    5.4308 273.29 101.992
## + seq.1  1    4.8770 273.85 102.243
## <none>               278.72 102.432
## + sat.v  1    0.4485 278.27 104.232
## 
## Step:  AIC=96.66
## num.grade ~ sat.m
## 
##         Df Sum of Sq    RSS    AIC
## + seq.3  1    5.5494 256.24 96.004
## <none>               261.79 96.661
## + sat.v  1    0.9606 260.83 98.205
## + seq.2  1    0.2324 261.56 98.551
## + seq.1  1    0.0821 261.71 98.622
## 
## Step:  AIC=96
## num.grade ~ sat.m + seq.3
## 
##         Df Sum of Sq    RSS    AIC
## <none>               256.24 96.004
## + sat.v  1   1.24967 254.99 97.398
## + seq.1  1   0.28877 255.95 97.864
## + seq.2  1   0.17043 256.07 97.921
## 
## Call:
## lm(formula = num.grade ~ sat.m + seq.3, data = d)
## 
## Coefficients:
## (Intercept)        sat.m        seq.3  
##    -1.14078      0.00479      0.01371

glm() function

  • 흡연, 음주, 식도암 자료
library(UsingR)
library(MASS)
head(esoph)
##   agegp     alcgp    tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day      0        40
## 2 25-34 0-39g/day    10-19      0        10
## 3 25-34 0-39g/day    20-29      0         6
## 4 25-34 0-39g/day      30+      0         5
## 5 25-34     40-79 0-9g/day      0        27
## 6 25-34     40-79    10-19      0         7
str(esoph)
## 'data.frame':    88 obs. of  5 variables:
##  $ agegp    : Ord.factor w/ 6 levels "25-34"<"35-44"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ alcgp    : Ord.factor w/ 4 levels "0-39g/day"<"40-79"<..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ tobgp    : Ord.factor w/ 4 levels "0-9g/day"<"10-19"<..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ ncases   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ncontrols: num  40 10 6 5 27 7 4 7 2 1 ...
res.full <- glm(cbind(ncases, ncontrols) ~ agegp + tobgp * alcgp, data = esoph, family = "binomial")
stepAIC(res.full, trace = 0)
## 
## Call:  glm(formula = cbind(ncases, ncontrols) ~ agegp + tobgp + alcgp, 
##     family = "binomial", data = esoph)
## 
## Coefficients:
## (Intercept)      agegp.L      agegp.Q      agegp.C      agegp^4      agegp^5  
##    -1.19039      3.99663     -1.65741      0.11094      0.07892     -0.26219  
##     tobgp.L      tobgp.Q      tobgp.C      alcgp.L      alcgp.Q      alcgp.C  
##     1.11749      0.34516      0.31692      2.53899      0.09376      0.43930  
## 
## Degrees of Freedom: 87 Total (i.e. Null);  76 Residual
## Null Deviance:       368 
## Residual Deviance: 82.34     AIC: 221.4