lm() function ; linear model
age <- rep(seq(20, 60, by = 5), 3)
mhr <- 209 - 0.7 * age + rnorm(length(age), sd = 4)
plot(mhr ~ age, main = "Age versus maximum heart rate")
res.mhr <- lm(mhr ~ age)
res.mhr
##
## Call:
## lm(formula = mhr ~ age)
##
## Coefficients:
## (Intercept) age
## 206.6606 -0.6601

##
## Call:
## lm(formula = mhr ~ age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1566 -2.0898 0.3947 2.1686 7.2804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 206.66059 2.61739 78.96 < 2e-16 ***
## age -0.66005 0.06227 -10.60 9.78e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.177 on 25 degrees of freedom
## Multiple R-squared: 0.818, Adjusted R-squared: 0.8107
## F-statistic: 112.3 on 1 and 25 DF, p-value: 9.776e-11
## Analysis of Variance Table
##
## Response: mhr
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 1960.50 1960.50 112.35 9.776e-11 ***
## Residuals 25 436.25 17.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(res.mhr)

Dummy variables
- 93 Cars on Sale in the USA in 1993
- missing for Mazda RX-7, which has a rotary engine
library(MASS)
data("Cars93")
names(Cars93)
## [1] "Manufacturer" "Model" "Type"
## [4] "Min.Price" "Price" "Max.Price"
## [7] "MPG.city" "MPG.highway" "AirBags"
## [10] "DriveTrain" "Cylinders" "EngineSize"
## [13] "Horsepower" "RPM" "Rev.per.mile"
## [16] "Man.trans.avail" "Fuel.tank.capacity" "Passengers"
## [19] "Length" "Wheelbase" "Width"
## [22] "Turn.circle" "Rear.seat.room" "Luggage.room"
## [25] "Weight" "Origin" "Make"
## Factor w/ 6 levels "3","4","5","6",..: 2 4 4 4 2 2 4 4 4 5 ...
lm(MPG.highway ~ Cylinders + Horsepower, data = Cars93)
##
## Call:
## lm(formula = MPG.highway ~ Cylinders + Horsepower, data = Cars93)
##
## Coefficients:
## (Intercept) Cylinders4 Cylinders5 Cylinders6
## 45.10713 -10.60858 -16.88485 -15.06570
## Cylinders8 Cylindersrotary Horsepower
## -13.79903 -13.25383 -0.02688
Variable selection
- SAT 점수(언어/수학), 퀴즈 3번, 수학 성작 자료
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
library(MASS)
data("stud.recs")
head(stud.recs)
## seq.1 seq.2 seq.3 sat.v sat.m letter.grade num.grade
## 1528 80 47 76 440 450 B 3.0
## 14586 74 76 81 410 480 B- 2.7
## 8295 84 85 90 610 590 D 1.0
## 8446 85 88 68 300 520 A 4.0
## 16685 81 75 66 550 560 A 4.0
## 16398 81 71 67 460 400 C 2.0
d <- subset(stud.recs, select = -letter.grade)
fit1 <- lm(num.grade ~ ., data = d)
stepAIC(fit1)
## Start: AIC=101.22
## num.grade ~ seq.1 + seq.2 + seq.3 + sat.v + sat.m
##
## Df Sum of Sq RSS AIC
## - seq.2 1 0.0705 254.70 99.254
## - seq.1 1 0.1578 254.78 99.297
## - sat.v 1 1.2766 255.90 99.840
## <none> 254.63 101.220
## - seq.3 1 5.9746 260.60 102.096
## - sat.m 1 10.4959 265.12 104.229
##
## Step: AIC=99.25
## num.grade ~ seq.1 + seq.3 + sat.v + sat.m
##
## Df Sum of Sq RSS AIC
## - seq.1 1 0.2944 254.99 97.398
## - sat.v 1 1.2553 255.95 97.864
## <none> 254.70 99.254
## - seq.3 1 6.0421 260.74 100.162
## - sat.m 1 10.6615 265.36 102.339
##
## Step: AIC=97.4
## num.grade ~ seq.3 + sat.v + sat.m
##
## Df Sum of Sq RSS AIC
## - sat.v 1 1.2497 256.24 96.004
## <none> 254.99 97.398
## - seq.3 1 5.8384 260.83 98.205
## - sat.m 1 10.8807 265.87 100.579
##
## Step: AIC=96
## num.grade ~ seq.3 + sat.m
##
## Df Sum of Sq RSS AIC
## <none> 256.24 96.004
## - seq.3 1 5.5494 261.79 96.661
## - sat.m 1 9.6323 265.87 98.580
##
## Call:
## lm(formula = num.grade ~ seq.3 + sat.m, data = d)
##
## Coefficients:
## (Intercept) seq.3 sat.m
## -1.14078 0.01371 0.00479
fit2 <- lm(num.grade ~ 1, data = d)
scope <- list(upper = fit1, lower = fit2)
stepAIC(fit2, direction = "forward", scope = scope)
## Start: AIC=102.43
## num.grade ~ 1
##
## Df Sum of Sq RSS AIC
## + sat.m 1 16.9316 261.79 96.661
## + seq.3 1 12.8487 265.87 98.580
## + seq.2 1 5.4308 273.29 101.992
## + seq.1 1 4.8770 273.85 102.243
## <none> 278.72 102.432
## + sat.v 1 0.4485 278.27 104.232
##
## Step: AIC=96.66
## num.grade ~ sat.m
##
## Df Sum of Sq RSS AIC
## + seq.3 1 5.5494 256.24 96.004
## <none> 261.79 96.661
## + sat.v 1 0.9606 260.83 98.205
## + seq.2 1 0.2324 261.56 98.551
## + seq.1 1 0.0821 261.71 98.622
##
## Step: AIC=96
## num.grade ~ sat.m + seq.3
##
## Df Sum of Sq RSS AIC
## <none> 256.24 96.004
## + sat.v 1 1.24967 254.99 97.398
## + seq.1 1 0.28877 255.95 97.864
## + seq.2 1 0.17043 256.07 97.921
##
## Call:
## lm(formula = num.grade ~ sat.m + seq.3, data = d)
##
## Coefficients:
## (Intercept) sat.m seq.3
## -1.14078 0.00479 0.01371
glm() function
library(UsingR)
library(MASS)
head(esoph)
## agegp alcgp tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day 0 40
## 2 25-34 0-39g/day 10-19 0 10
## 3 25-34 0-39g/day 20-29 0 6
## 4 25-34 0-39g/day 30+ 0 5
## 5 25-34 40-79 0-9g/day 0 27
## 6 25-34 40-79 10-19 0 7
## 'data.frame': 88 obs. of 5 variables:
## $ agegp : Ord.factor w/ 6 levels "25-34"<"35-44"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ alcgp : Ord.factor w/ 4 levels "0-39g/day"<"40-79"<..: 1 1 1 1 2 2 2 2 3 3 ...
## $ tobgp : Ord.factor w/ 4 levels "0-9g/day"<"10-19"<..: 1 2 3 4 1 2 3 4 1 2 ...
## $ ncases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ncontrols: num 40 10 6 5 27 7 4 7 2 1 ...
res.full <- glm(cbind(ncases, ncontrols) ~ agegp + tobgp * alcgp, data = esoph, family = "binomial")
stepAIC(res.full, trace = 0)
##
## Call: glm(formula = cbind(ncases, ncontrols) ~ agegp + tobgp + alcgp,
## family = "binomial", data = esoph)
##
## Coefficients:
## (Intercept) agegp.L agegp.Q agegp.C agegp^4 agegp^5
## -1.19039 3.99663 -1.65741 0.11094 0.07892 -0.26219
## tobgp.L tobgp.Q tobgp.C alcgp.L alcgp.Q alcgp.C
## 1.11749 0.34516 0.31692 2.53899 0.09376 0.43930
##
## Degrees of Freedom: 87 Total (i.e. Null); 76 Residual
## Null Deviance: 368
## Residual Deviance: 82.34 AIC: 221.4