code 12.1
visit<-c(350,460,350,430,350,380,430,470,450,490,340,300,440,450,300)
fee<-c(5.5,7.5,8,8,6.8,7.5,4.5,6.4,7,5,7.2,7.9,5.9,5,7)
ad<-c(3.3,3.3,3,4.5,3,4,3,3.7,3.5,4,3.5,3.2,4,3.5,2.7)
region<-c("A","A","A","A","A","B","B","B","B","B","C","C","C","C","C")
interaction<-fee*ad
da<-c(1,1,1,1,1,0,0,0,0,0,0,0,0,0,0)
db<-c(0,0,0,0,0,1,1,1,1,1,0,0,0,0,0)
sale<-data.frame(visit, fee, ad, interaction, region, da, db)
sale
## visit fee ad interaction region da db
## 1 350 5.5 3.3 18.15 A 1 0
## 2 460 7.5 3.3 24.75 A 1 0
## 3 350 8.0 3.0 24.00 A 1 0
## 4 430 8.0 4.5 36.00 A 1 0
## 5 350 6.8 3.0 20.40 A 1 0
## 6 380 7.5 4.0 30.00 B 0 1
## 7 430 4.5 3.0 13.50 B 0 1
## 8 470 6.4 3.7 23.68 B 0 1
## 9 450 7.0 3.5 24.50 B 0 1
## 10 490 5.0 4.0 20.00 B 0 1
## 11 340 7.2 3.5 25.20 C 0 0
## 12 300 7.9 3.2 25.28 C 0 0
## 13 440 5.9 4.0 23.60 C 0 0
## 14 450 5.0 3.5 17.50 C 0 0
## 15 300 7.0 2.7 18.90 C 0 0
code 12.2
# code 12.1의 자료 사용
sale
## visit fee ad interaction region da db
## 1 350 5.5 3.3 18.15 A 1 0
## 2 460 7.5 3.3 24.75 A 1 0
## 3 350 8.0 3.0 24.00 A 1 0
## 4 430 8.0 4.5 36.00 A 1 0
## 5 350 6.8 3.0 20.40 A 1 0
## 6 380 7.5 4.0 30.00 B 0 1
## 7 430 4.5 3.0 13.50 B 0 1
## 8 470 6.4 3.7 23.68 B 0 1
## 9 450 7.0 3.5 24.50 B 0 1
## 10 490 5.0 4.0 20.00 B 0 1
## 11 340 7.2 3.5 25.20 C 0 0
## 12 300 7.9 3.2 25.28 C 0 0
## 13 440 5.9 4.0 23.60 C 0 0
## 14 450 5.0 3.5 17.50 C 0 0
## 15 300 7.0 2.7 18.90 C 0 0
cov(sale[,1:2]) #공분산행렬
## visit fee
## visit 4035.23810 -32.990476
## fee -32.99048 1.372667
cov(sale[,1:2])*(nrow(sale)-1) #공분산행렬, 제곱합
## visit fee
## visit 56493.3333 -461.86667
## fee -461.8667 19.21733
summary(sale)
## visit fee ad interaction region
## Min. :300.0 Min. :4.500 Min. :2.70 Min. :13.50 A:5
## 1st Qu.:350.0 1st Qu.:5.700 1st Qu.:3.10 1st Qu.:19.45 B:5
## Median :430.0 Median :7.000 Median :3.50 Median :23.68 C:5
## Mean :399.3 Mean :6.613 Mean :3.48 Mean :23.03
## 3rd Qu.:450.0 3rd Qu.:7.500 3rd Qu.:3.85 3rd Qu.:24.98
## Max. :490.0 Max. :8.000 Max. :4.50 Max. :36.00
## da db
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.3333 Mean :0.3333
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
code 12.3
# code 12.1의 자료 사용
result.12.3<-lm(visit ~ fee, data=sale)
aov(result.12.3)
## Call:
## aov(formula = result.12.3)
##
## Terms:
## fee Residuals
## Sum of Squares 11100.44 45392.90
## Deg. of Freedom 1 13
##
## Residual standard error: 59.09113
## Estimated effects may be unbalanced
summary(aov(result.12.3))
## Df Sum Sq Mean Sq F value Pr(>F)
## fee 1 11100 11100 3.179 0.0979 .
## Residuals 13 45393 3492
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(result.12.3)
##
## Call:
## lm(formula = visit ~ fee, data = sale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.040 -45.040 1.977 55.926 81.977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 558.28 90.44 6.173 3.36e-05 ***
## fee -24.03 13.48 -1.783 0.0979 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.09 on 13 degrees of freedom
## Multiple R-squared: 0.1965, Adjusted R-squared: 0.1347
## F-statistic: 3.179 on 1 and 13 DF, p-value: 0.09794
confint(result.12.3, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 362.89124 753.66326
## fee -53.15468 5.08696
#각 회귀분석 모형 및 결과요약 객체. 필요한 결과물을 꺼내쓰면 된다.
ls(result.12.3)
## [1] "assign" "call" "coefficients" "df.residual"
## [5] "effects" "fitted.values" "model" "qr"
## [9] "rank" "residuals" "terms" "xlevels"
ls(summary(result.12.3))
## [1] "adj.r.squared" "aliased" "call" "coefficients"
## [5] "cov.unscaled" "df" "fstatistic" "r.squared"
## [9] "residuals" "sigma" "terms"
#수정 r제곱
summary(result.12.3)$adj.r.squared
## [1] 0.1346827
#Root MSE
r.mse<-sqrt(sum(result.12.3$residuals^2)/(nrow(sale)-2))
r.mse
## [1] 59.09113
#종속변수 평균
d.mean<-mean(sale$visit)
d.mean
## [1] 399.3333
#Coeff. Var.
c.v<-r.mse/d.mean*100
c.v
## [1] 14.79744
#적합결여검정(Lack-of-fit test)
model.lm<-lm(visit ~ fee, data=sale)
model.factor<-lm(visit ~ as.factor(fee), data=sale)
anova(model.lm, model.factor)
## Analysis of Variance Table
##
## Model 1: visit ~ fee
## Model 2: visit ~ as.factor(fee)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 13 45393
## 2 4 18450 9 26943 0.649 0.7295
code 12.4
# code 12.1의 자료 사용
library(ggplot2)
sale$predict<-predict(result.12.3)
library(grid)
ggplot(sale, aes(x=fee, y=predict)) + geom_point(shape=3) +
stat_smooth(method=lm, se=FALSE) +
geom_hline(yintercept=mean(sale$visit)) +
geom_segment(aes(yend=mean(sale$visit)), xend=fee, color="red", size=1) +
geom_segment(aes(yend=visit), xend=fee, linetype=3, size=1) +
geom_point(data=sale, aes(x=fee, y=visit)) +
annotate("text", parse=TRUE, x=5.9, y=410, label="hat(y[i]) - bar(y)", hjust=-.5) +
annotate("text", parse=TRUE, x=5.9, y=430, label="y[i] - bar(y)", hjust=-.5) +
annotate("segment", x=5.95, xend=5.95, y=mean(sale$visit), yend=416.4775,
arrow=arrow(ends="both", angle=45, length=unit(0.2, "cm")), color="blue") +
annotate("segment", x=5.95, xend=5.95, y=415.4775, yend=440,
arrow=arrow(ends="both", angle=45, length=unit(0.2, "cm")), color="blue")

code 12.5
# code 12.1의 자료 사용
confint(result.12.3, level=0.9) #계수의 신뢰구간
## 5 % 95 %
## (Intercept) 398.11218 718.442322
## fee -47.90526 -0.162456
predict(result.12.3, interval="confidence", level=0.9) #종속변수 평균의 신뢰구간
## fit lwr upr
## 1 426.0910 388.1913 463.9907
## 2 378.0233 343.7005 412.3461
## 3 366.0064 323.2772 408.7355
## 4 366.0064 323.2772 408.7355
## 5 394.8470 367.4624 422.2316
## 6 378.0233 343.7005 412.3461
## 7 450.1249 392.8966 507.3532
## 8 404.4606 376.9652 431.9559
## 9 390.0402 361.4875 418.5929
## 10 438.1080 391.0625 485.1534
## 11 385.2335 354.8002 415.6668
## 12 368.4098 327.5021 409.3175
## 13 416.4775 384.5397 448.4152
## 14 438.1080 391.0625 485.1534
## 15 390.0402 361.4875 418.5929
predict(result.12.3, interval="prediction", level=0.9) #종속변수 관측값의 신뢰구간
## Warning in predict.lm(result.12.3, interval = "prediction", level = 0.9): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 426.0910 314.7929 537.3891
## 2 378.0233 267.8919 488.1548
## 3 366.0064 252.9725 479.0403
## 4 366.0064 252.9725 479.0403
## 5 394.8470 286.6768 503.0172
## 6 378.0233 267.8919 488.1548
## 7 450.1249 330.8523 569.3975
## 8 404.4606 296.2623 512.6589
## 9 390.0402 281.5684 498.5121
## 10 438.1080 323.3728 552.8431
## 11 385.2335 276.2515 494.2154
## 12 368.4098 256.0518 480.7678
## 13 416.4775 307.0659 525.8891
## 14 438.1080 323.3728 552.8431
## 15 390.0402 281.5684 498.5121
ggplot(sale[1:15,], aes(x=fee, y=visit)) + geom_point() + stat_smooth(method=lm, level=0.90)

#좀 더 확실하게 그려보자
sale2<-sale[1:15,1:2]
sale2$mean.clm.l<-predict(result.12.3, interval="confidence", level=0.9)[,2]
sale2$mean.clm.u<-predict(result.12.3, interval="confidence", level=0.9)[,3]
sale2$cli.l<-predict(result.12.3, interval="prediction", level=0.9)[,2]
## Warning in predict.lm(result.12.3, interval = "prediction", level = 0.9): predictions on current data refer to _future_ responses
sale2$cli.u<-predict(result.12.3, interval="prediction", level=0.9)[,3]
## Warning in predict.lm(result.12.3, interval = "prediction", level = 0.9): predictions on current data refer to _future_ responses
ggplot(sale2, aes(x=fee, y=visit)) + geom_point() + stat_smooth(method=lm, level=0.90) +
geom_point(data=sale2, aes(x=fee, y=mean.clm.l), shape=3) +
geom_point(data=sale2, aes(x=fee, y=mean.clm.u), shape=3) +
geom_line(data=sale2, aes(x=fee, y=cli.l), linetype=5) +
geom_line(data=sale2, aes(x=fee, y=cli.u), linetype=5)

# 독립변수가 10으로 주어졌을 때 예측치와 신뢰구간
sale3<-sale[,1:2]
sale3<-rbind(sale3, c(NA, 10))
result.12.5<-lm(visit ~ fee, data=sale3)
predict(result.12.5, newdata=data.frame(fee=sale3$fee))
## 1 2 3 4 5 6 7 8
## 426.0910 378.0233 366.0064 366.0064 394.8470 378.0233 450.1249 404.4606
## 9 10 11 12 13 14 15 16
## 390.0402 438.1080 385.2335 368.4098 416.4775 438.1080 390.0402 317.9387
sale3$mean.clm.l<-predict(result.12.5, newdata=data.frame(fee=sale3$fee), interval="confidence", level=0.9)[,2]
sale3$mean.clm.u<-predict(result.12.5, newdata=data.frame(fee=sale3$fee), interval="confidence", level=0.9)[,3]
sale3$mean.cli.l<-predict(result.12.5, newdata=data.frame(fee=sale3$fee), interval="prediction", level=0.9)[,2]
sale3$mean.cli.u<-predict(result.12.5, newdata=data.frame(fee=sale3$fee), interval="prediction", level=0.9)[,3]
sale3
## visit fee mean.clm.l mean.clm.u mean.cli.l mean.cli.u
## 1 350 5.5 388.1913 463.9907 314.7929 537.3891
## 2 460 7.5 343.7005 412.3461 267.8919 488.1548
## 3 350 8.0 323.2772 408.7355 252.9725 479.0403
## 4 430 8.0 323.2772 408.7355 252.9725 479.0403
## 5 350 6.8 367.4624 422.2316 286.6768 503.0172
## 6 380 7.5 343.7005 412.3461 267.8919 488.1548
## 7 430 4.5 392.8966 507.3532 330.8523 569.3975
## 8 470 6.4 376.9652 431.9559 296.2623 512.6589
## 9 450 7.0 361.4875 418.5929 281.5684 498.5121
## 10 490 5.0 391.0625 485.1534 323.3728 552.8431
## 11 340 7.2 354.8002 415.6668 276.2515 494.2154
## 12 300 7.9 327.5021 409.3175 256.0518 480.7678
## 13 440 5.9 384.5397 448.4152 307.0659 525.8891
## 14 450 5.0 391.0625 485.1534 323.3728 552.8431
## 15 300 7.0 361.4875 418.5929 281.5684 498.5121
## 16 NA 10.0 232.6985 403.1789 182.9692 452.9082