Example 1 (Numeric)

Library

library(tidyverse); library(lattice); library(Hmisc); library(pastecs) ; library(psych) ; library(doBy) library(plyr); library(reshape2); library(scales) library(corrplot); library(corrgram); library(GGally); library(gridExtra); library(coefplot) library(plotly); library(gplots); library(PerformanceAnalytics) library(polycor);library(ggm);library(“VIM”); library(ppcor) library(car); library(leaps); library(effects); library(MASS); library(boot); library(forecast) library(olsrr); library(gvlma); library(lmtest); library(MPV) # 예측도 library(bootstrap); library(QuantPsyc) library(texreg); library(stargazer); library(DT)

library(tidyverse); library(lattice); library(Hmisc);library(pastecs) ; library(psych) ;
library(doBy);library(plyr); library(reshape2); library(scales);library(corrplot);
library(corrgram); library(GGally); library(gridExtra); library(coefplot);library(plotly);
library(gplots); library(PerformanceAnalytics);library(polycor);library(ggm);library("VIM");
library(ppcor);library(car); library(leaps); library(effects); library(MASS);
library(boot);library(forecast);library(olsrr); library(gvlma); library(lmtest); library(MPV);
library(bootstrap); library(QuantPsyc);library(texreg); library(stargazer); library(DT);

regbook 함수
library(devtools) install_github(“regbook/regbook”) library(regbook)

one graphic
devtools::install_github(“wilkelab/cowplot”) # 잘 안되면 R restart 후 재 설치 library(cowplot)

R Interactive Plot 참조
http://rstudio-pubs-static.s3.amazonaws.com/368025_30296c0aa9e7446aac3c94c2ccbd92d9.html

일반절차
1) 예비모형 결정 (anova 등을 통해 변수를 최대한 먼저 줄여 결정)
2) 예비모형의 잔차분석 및 회귀진단 > 선형회귀의 가능성 검토 및 일부 수정
특히 이상치의 경우 중요데이터인지 아닌지 신중히 검토해서 모델에 넣을지 여부를 결정한다.
3) 최적 변수선택
4) 최적모델 잔차분석 최종 수정
5) 예측 및 부트스트랩, 성능평가

Example 1. Ice Cream Data
아이스크림 소비량과 가격, 평균기온, 구매자 월소득의 관계로 예측 모델링
Data Type : All numeric

1. 데이터 import 및 데이터 view

1) 데이터 import
IceCreamData <- read.csv(txt)(“http://~”)
library(DT) https://rpubs.com/cngonz05/icecream
datatable(head(IceCream)) # 순서 및 정렬에 좋다.

IceCream <- read.csv("icecream.csv", stringsAsFactors = F)
head(IceCream)

##   preiod  cons income price temp year
## 1      1 0.386     78 0.270   41    0
## 2      2 0.374     79 0.282   56    0
## 3      3 0.393     81 0.277   63    0
## 4      4 0.425     80 0.280   68    0
## 5      5 0.406     76 0.272   69    0
## 6      6 0.344     78 0.262   65    0

datatable(IceCream)

datatable(head(IceCream))

2) 데이터 view
stat.desc(IceCream, basic = F, norm = T) 정규성 조사까지 진행
describe(IceCream) 왜도와 첨도로 히스토그램을 안해도 알수 있음
stargazer(IceCream, type = “text”) 옵션으로 flit = T 선택 하면 컬럼이 변수가 됨

str(IceCream)

## 'data.frame':    30 obs. of  6 variables:
##  $ preiod: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ cons  : num  0.386 0.374 0.393 0.425 0.406 0.344 0.327 0.288 0.269 0.256 ...
##  $ income: int  78 79 81 80 76 78 82 79 76 79 ...
##  $ price : num  0.27 0.282 0.277 0.28 0.272 0.262 0.275 0.267 0.265 0.277 ...
##  $ temp  : int  41 56 63 68 69 65 61 47 32 24 ...
##  $ year  : int  0 0 0 0 0 0 0 0 0 0 ...

summary(IceCream)

##      preiod           cons            income          price       
##  Min.   : 1.00   Min.   :0.2560   Min.   :76.00   Min.   :0.2600  
##  1st Qu.: 8.25   1st Qu.:0.3113   1st Qu.:79.25   1st Qu.:0.2685  
##  Median :15.50   Median :0.3515   Median :83.50   Median :0.2770  
##  Mean   :15.50   Mean   :0.3594   Mean   :84.60   Mean   :0.2753  
##  3rd Qu.:22.75   3rd Qu.:0.3912   3rd Qu.:89.25   3rd Qu.:0.2815  
##  Max.   :30.00   Max.   :0.5480   Max.   :96.00   Max.   :0.2920  
##       temp            year    
##  Min.   :24.00   Min.   :0.0  
##  1st Qu.:32.25   1st Qu.:0.0  
##  Median :49.50   Median :1.0  
##  Mean   :49.10   Mean   :0.9  
##  3rd Qu.:63.75   3rd Qu.:1.0  
##  Max.   :72.00   Max.   :2.0

stat.desc(IceCream)

##                   preiod         cons       income        price
## nbr.val       30.0000000 30.000000000 3.000000e+01 3.000000e+01
## nbr.null       0.0000000  0.000000000 0.000000e+00 0.000000e+00
## nbr.na         0.0000000  0.000000000 0.000000e+00 0.000000e+00
## min            1.0000000  0.256000000 7.600000e+01 2.600000e-01
## max           30.0000000  0.548000000 9.600000e+01 2.920000e-01
## range         29.0000000  0.292000000 2.000000e+01 3.200000e-02
## sum          465.0000000 10.783000000 2.538000e+03 8.259000e+00
## median        15.5000000  0.351500000 8.350000e+01 2.770000e-01
## mean          15.5000000  0.359433333 8.460000e+01 2.753000e-01
## SE.mean        1.6072751  0.012011650 1.140276e+00 1.523117e-03
## CI.mean.0.95   3.2872467  0.024566582 2.332127e+00 3.115124e-03
## var           77.5000000  0.004328392 3.900690e+01 6.959655e-05
## std.dev        8.8034084  0.065790516 6.245550e+00 8.342455e-03
## coef.var       0.5679618  0.183039550 7.382447e-02 3.030314e-02
##                      temp       year
## nbr.val        30.0000000 30.0000000
## nbr.null        0.0000000 10.0000000
## nbr.na          0.0000000  0.0000000
## min            24.0000000  0.0000000
## max            72.0000000  2.0000000
## range          48.0000000  2.0000000
## sum          1473.0000000 27.0000000
## median         49.5000000  1.0000000
## mean           49.1000000  0.9000000
## SE.mean         2.9982179  0.1385475
## CI.mean.0.95    6.1320440  0.2833614
## var           269.6793103  0.5758621
## std.dev        16.4219156  0.7588558
## coef.var        0.3344586  0.8431731

describe(IceCream)

##        vars  n  mean    sd median trimmed   mad   min   max range  skew
## preiod    1 30 15.50  8.80  15.50   15.50 11.12  1.00 30.00 29.00  0.00
## cons      2 30  0.36  0.07   0.35    0.35  0.06  0.26  0.55  0.29  0.70
## income    3 30 84.60  6.25  83.50   84.21  6.67 76.00 96.00 20.00  0.50
## price     4 30  0.28  0.01   0.28    0.28  0.01  0.26  0.29  0.03 -0.01
## temp      5 30 49.10 16.42  49.50   49.21 23.72 24.00 72.00 48.00 -0.06
## year      6 30  0.90  0.76   1.00    0.88  1.48  0.00  2.00  2.00  0.16
##        kurtosis   se
## preiod    -1.32 1.61
## cons       0.28 0.01
## income    -1.08 1.14
## price     -1.00 0.00
## temp      -1.60 3.00
## year      -1.31 0.14

by(IceCream[,c(2,4,5)], as.factor(IceCream$year), stat.desc, basic = F, norm = T)

## as.factor(IceCream$year): 0
##                      cons         price        temp
## median        0.359000000  2.735000e-01  58.5000000
## mean          0.346800000  2.727000e-01  52.6000000
## SE.mean       0.018910785  2.097883e-03   5.0093246
## CI.mean.0.95  0.042779167  4.745740e-03  11.3318796
## var           0.003576178  4.401111e-05 250.9333333
## std.dev       0.059801152  6.634087e-03  15.8408754
## coef.var      0.172437001  2.432742e-02   0.3011573
## skewness     -0.264635626 -1.788655e-01  -0.5571134
## skew.2SE     -0.192590316 -1.301706e-01  -0.4054430
## kurtosis     -1.640541990 -1.514989e+00  -1.3526953
## kurt.2SE     -0.614781151 -5.677310e-01  -0.5069127
## normtest.W    0.929679756  9.631515e-01   0.8954396
## normtest.p    0.444749610  8.211230e-01   0.1950937
## -------------------------------------------------------- 
## as.factor(IceCream$year): 1
##                      cons         price        temp
## median        0.329000000  2.770000e-01  44.0000000
## mean          0.349538462  2.801538e-01  48.5384615
## SE.mean       0.016361155  1.796885e-03   4.7144287
## CI.mean.0.95  0.035647893  3.915077e-03  10.2718578
## var           0.003479936  4.197436e-05 288.9358974
## std.dev       0.058990981  6.478762e-03  16.9981145
## coef.var      0.168768213  2.312573e-02   0.3501989
## skewness      0.694843098  2.504719e-01   0.1005840
## skew.2SE      0.563688506  2.031942e-01   0.0815983
## kurtosis     -0.854816784 -1.173265e+00  -1.7270276
## kurt.2SE     -0.358903000 -4.926066e-01  -0.7251090
## normtest.W    0.900395836  9.269291e-01   0.9052389
## normtest.p    0.135411591  3.105374e-01   0.1578044
## -------------------------------------------------------- 
## as.factor(IceCream$year): 2
##                      cons         price        temp
## median        0.376000000  2.650000e-01  41.0000000
## mean          0.395857143  2.700000e-01  45.1428571
## SE.mean       0.030637223  3.612149e-03   6.6527065
## CI.mean.0.95  0.074966585  8.838610e-03  16.2785863
## var           0.006570476  9.133333e-05 309.8095238
## std.dev       0.081058474  9.556847e-03  17.6014069
## coef.var      0.204766986  3.539573e-02   0.3899046
## skewness      0.682769989  6.088369e-01   0.2998127
## skew.2SE      0.430104665  3.835312e-01   0.1888643
## kurtosis     -0.943189511 -1.574268e+00  -1.8205821
## kurt.2SE     -0.297076772 -4.958477e-01  -0.5734294
## normtest.W    0.921524010  8.164673e-01   0.9046217
## normtest.p    0.481325694  5.936872e-02   0.3598915

colSums(IceCream)

##   preiod     cons   income    price     temp     year 
##  465.000   10.783 2538.000    8.259 1473.000   27.000

colMeans(IceCream)

##     preiod       cons     income      price       temp       year 
## 15.5000000  0.3594333 84.6000000  0.2753000 49.1000000  0.9000000

여기서 추가적으로 변수 전처리가 있을 수 있다.
전체 일부 변수 scale 로 표준화 또는 여러 범주형 변수 조정등이 있을 수 있다.
범주형 변수와 섞여있는 경우 다시 추가한다.

2. 그래프 분석 (변수 탐색, EDA)

데이터 분포 산점도, 히스토그램, 박스플롯, 인터렉티브 플롯
기본형
hist(x,freq=FALSE)
lines(density(x),na.rm=TRUE)

multi.hist(IceCream[,sapply(IceCream, is.numeric)]) 수치형만 골라낸다.

scatterplotMatrix(IceCream)

pairs(IceCream, panel = panel.smooth)

splom(IceCream)

ggpairs(IceCream)

hist.data.frame(IceCream)

multi.hist(IceCream)

multi.hist(IceCream, bcol="grey", dcol=c("blue","red"), breaks = 15)

lattice, ggplot 방식 melt(data)

ggplot(data = melt(IceCream), aes(x = value)) + 
  geom_histogram(bins = 15) + 
  facet_wrap(~variable, scales = "free")

histogram(~value|variable,
          data=melt(IceCream),  
          breaks=15, scales = "free") # density 추가 가능

densityplot(~value|variable,
            data=melt(IceCream), scales = "free")

boxplot 은 이상치 찾는데 특화된 부분이 있다.

boxplot(value ~ variable, melt(IceCream))

boxplot(value ~ variable, melt(IceCream), locater = T, outline=FALSE) # outlier 제거

y <- boxplot(value ~ variable, melt(IceCream), locater = T)

y$out

## [1] 0.548

bwplot(~value|variable, data=melt(IceCream), scales = "free")

ggplot(data = melt(IceCream), aes(x = variable, y = value)) + # fill = variable 색상화 
  geom_boxplot() + 
  facet_wrap(~variable, scales = "free")

ggplot(data = melt(IceCream), aes(x = variable, y = value)) + 
  geom_boxplot() + 
  facet_wrap(~variable, scales = "free")

인터렉티브 그래프

p1 <- ggplot(data = melt(IceCream), aes(x = variable, y = value)) + 
  geom_boxplot() + 
  facet_wrap(~variable, scales = "free")
ggplotly(p1)

3. 상관 분석

1) 상관분석 전 결측치 확인 결측치 분석은 전문영역으로 추후 link 로 연결한다.

colSums(is.na(IceCream)) # 결측치 행별로 몇개인지 보여줌

## preiod   cons income  price   temp   year 
##      0      0      0      0      0      0

aggr(IceCream, prop=FALSE, numbers=TRUE)  # 현 데이터는 결측치 없어 무의미

matrixplot(IceCream)

sum(is.na(IceCream)) # 결측치 개수

## [1] 0

na.omit(IceCream) # 결측치 행 전체 제거

##    preiod  cons income price temp year
## 1       1 0.386     78 0.270   41    0
## 2       2 0.374     79 0.282   56    0
## 3       3 0.393     81 0.277   63    0
## 4       4 0.425     80 0.280   68    0
## 5       5 0.406     76 0.272   69    0
## 6       6 0.344     78 0.262   65    0
## 7       7 0.327     82 0.275   61    0
## 8       8 0.288     79 0.267   47    0
## 9       9 0.269     76 0.265   32    0
## 10     10 0.256     79 0.277   24    0
## 11     11 0.286     82 0.282   28    1
## 12     12 0.298     85 0.270   26    1
## 13     13 0.329     86 0.272   32    1
## 14     14 0.318     83 0.287   40    1
## 15     15 0.381     84 0.277   55    1
## 16     16 0.381     82 0.287   63    1
## 17     17 0.470     80 0.280   72    1
## 18     18 0.443     78 0.277   72    1
## 19     19 0.386     84 0.277   67    1
## 20     20 0.342     86 0.277   60    1
## 21     21 0.319     85 0.292   44    1
## 22     22 0.307     87 0.287   40    1
## 23     23 0.284     94 0.277   32    1
## 24     24 0.326     92 0.285   27    2
## 25     25 0.309     95 0.282   28    2
## 26     26 0.359     96 0.265   33    2
## 27     27 0.376     94 0.265   41    2
## 28     28 0.416     96 0.265   52    2
## 29     29 0.437     91 0.268   64    2
## 30     30 0.548     90 0.260   71    2

2) 상관분석 그래프 데이터가 많으면 corrplot 이 적절하다.

corrgram(IceCream, order=TRUE, 
         lower.panel=panel.shade,
         upper.panel=panel.conf, # panel.pie, panel.cor
         text.panel=panel.txt)

corrplot(cor(IceCream), method = "circle", 
         tl.col = "red", tl.srt = 30, addshade = "all",
         diag=FALSE, addCoef.col = "black", order="FPC")

corrplot(cor(IceCream), method='shade', shade.col=NA, tl.col='black', tl.srt=45)

cor_5 <- Hmisc::rcorr(as.matrix(IceCream)) # rcorr 가 p value 를 지원한다는 점에 착안
IceCreamCor <- cor_5$r
p_mat <- cor_5$P
corrplot(IceCreamCor, type = "upper", order = "hclust", 
         p.mat = p_mat, sig.level = 0.01)

chart.Correlation

chart.Correlation(IceCream, histogram=TRUE, pch=19)
chart.Correlation(IceCream, histogram=TRUE, pch=19, method = c("pearson"))

ggpairs(IceCream) + theme(axis.text = element_text(size = 2)) 축 글자 사이즈 조정

splom(IceCream)

ggpairs(IceCream)

ggpairs(IceCream, 
        upper = list(continuous = wrap("cor", size=3, col = "coral")),
        diag = list(continuous = wrap("densityDiag", alpha=0.5, fill = "red")),
        lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.5, col = "blue")))

ggplotly(ggpairs(IceCream))

세부적 상관관계 확인

scatterplot(cons ~ temp, data=IceCream,
            spread=FALSE, smoother.args=list(lty=2), 
            pch=19, main = "IceCream Correlation")

scatterplot(income ~ year, data=IceCream,
            spread=FALSE, smoother.args=list(lty=2), 
            pch=19, main = "IceCream Correlation")

ggplot(IceCream, aes(x=temp, y=cons)) + 
  geom_point() + geom_smooth(method="lm") + 
  labs(x="temp", y="cons")

coplot(cons ~ temp | year, IceCream)

3) 상관 분석 및 편상관 분석 명확한 y 와 각각의 변수의 진정한 상관성을 보고자 할때 유용하다.

cor(IceCream, method="pearson")

##             preiod        cons      income       price       temp
## preiod  1.00000000  0.22180556  0.84478861 -0.06831566 -0.1036375
## cons    0.22180556  1.00000000  0.04793538 -0.25959400  0.7756246
## income  0.84478861  0.04793538  1.00000000 -0.10747896 -0.3247093
## price  -0.06831566 -0.25959400 -0.10747896  1.00000000 -0.1082061
## temp   -0.10363755  0.77562456 -0.32470927 -0.10820611  1.0000000
## year    0.93168504  0.26335786  0.87162341 -0.06046052 -0.1734948
##               year
## preiod  0.93168504
## cons    0.26335786
## income  0.87162341
## price  -0.06046052
## temp   -0.17349481
## year    1.00000000

# cor(IceCream, method=c("pearson","kendall")) 여러개 한번에 적용 방법
cor.test(IceCream$cons, IceCream$temp) # 매우 유의함

## 
##  Pearson's product-moment correlation
## 
## data:  IceCream$cons and IceCream$temp
## t = 6.5023, df = 28, p-value = 4.789e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5764290 0.8878098
## sample estimates:
##       cor 
## 0.7756246

corr.test(IceCream, use="complete") # cor.test table

## Call:corr.test(x = IceCream, use = "complete")
## Correlation matrix 
##        preiod  cons income price  temp  year
## preiod   1.00  0.22   0.84 -0.07 -0.10  0.93
## cons     0.22  1.00   0.05 -0.26  0.78  0.26
## income   0.84  0.05   1.00 -0.11 -0.32  0.87
## price   -0.07 -0.26  -0.11  1.00 -0.11 -0.06
## temp    -0.10  0.78  -0.32 -0.11  1.00 -0.17
## year     0.93  0.26   0.87 -0.06 -0.17  1.00
## Sample Size 
## [1] 30
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##        preiod cons income price temp year
## preiod   0.00 1.00   0.00  1.00 1.00    0
## cons     0.24 0.00   1.00  1.00 0.00    1
## income   0.00 0.80   0.00  1.00 0.88    0
## price    0.72 0.17   0.57  0.00 1.00    1
## temp     0.59 0.00   0.08  0.57 0.00    1
## year     0.00 0.16   0.00  0.75 0.36    0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

print(corr.test(IceCream), short=FALSE) p value 신뢰구간까지 출력

ggm::pcor(c(2,3,1,4,5,6), cov(IceCream)) # 2, 3 간의 관계 1,4,5,6 영향력 배제 (가격 vs. 소비량 순수)

## [1] -0.1184616

ggm::pcor(c(3,6,1,2,4,5), cov(IceCream)) # 실제 소득과 판매년도는 관계가 없다.

## [1] 0.3785229

cor(IceCream[, c(3,6)]) 이렇게 계산하면 매우 높게 나타남

ppcor::pcor(IceCream) # pcor 를 전량 구하면 관계 없던 year 와 cons 의 관계도 나타난다.

## $estimate
##            preiod       cons     income      price       temp       year
## preiod  1.0000000 -0.4085064  0.1910814 -0.1152993  0.4680603  0.7719157
## cons   -0.4085064  1.0000000 -0.1184616 -0.3489639  0.8723087  0.6191599
## income  0.1910814 -0.1184616  1.0000000 -0.1996042 -0.1001534  0.3785229
## price  -0.1152993 -0.3489639 -0.1996042  1.0000000  0.2237759  0.2540877
## temp    0.4680603  0.8723087 -0.1001534  0.2237759  1.0000000 -0.5455639
## year    0.7719157  0.6191599  0.3785229  0.2540877 -0.5455639  1.0000000
## 
## $p.value
##              preiod         cons     income      price         temp
## preiod 0.000000e+00 3.827277e-02 0.34975490 0.57488244 1.588995e-02
## cons   3.827277e-02 0.000000e+00 0.56436762 0.08059081 6.367934e-09
## income 3.497549e-01 5.643676e-01 0.00000000 0.32826503 6.264036e-01
## price  5.748824e-01 8.059081e-02 0.32826503 0.00000000 2.717986e-01
## temp   1.588995e-02 6.367934e-09 0.62640360 0.27179864 0.000000e+00
## year   3.867979e-06 7.448834e-04 0.05654459 0.21036632 3.943109e-03
##                year
## preiod 3.867979e-06
## cons   7.448834e-04
## income 5.654459e-02
## price  2.103663e-01
## temp   3.943109e-03
## year   0.000000e+00
## 
## $statistic
##            preiod       cons     income      price       temp      year
## preiod  0.0000000 -2.1925525  0.9536762 -0.5686415  2.5948026  5.948484
## cons   -2.1925525  0.0000000 -0.5844562 -1.8242463  8.7398975  3.862711
## income  0.9536762 -0.5844562  0.0000000 -0.9979386 -0.4931288  2.003449
## price  -0.5686415 -1.8242463 -0.9979386  0.0000000  1.1247978  1.287009
## temp    2.5948026  8.7398975 -0.4931288  1.1247978  0.0000000 -3.189124
## year    5.9484841  3.8627114  2.0034485  1.2870086 -3.1891237  0.000000
## 
## $n
## [1] 30
## 
## $gp
## [1] 4
## 
## $method
## [1] "pearson"

# ppcor::pcor.test(IceCream) # 특정 두 변수만 구할수 있다. (수정필요)

실제 관계 없어보이는 변수가 pcor 로 한꺼번에 구하면 관계 있다고 나타날수도 있기 때문에 주의 detach(package:ppcor) 메모리 제거

스피어만은 비모수적 방법이다. rcorr 는 p value 까지 구해준다는 점에서 cor 와 다르다. 다만 kendall 지원 안함 IceCream_P <- cor(IceCream, method=“pearson”) IceCream_K <- cor(IceCream, method=“kendall”) IceCream_S <- cor(IceCream, method=“spearman”) IceCreamCor <- list(IceCream_P,IceCream_K,IceCream_S)

점이연 상관분석, 이연 상관분석 이연 상관분석 원래 0 ~ 1 사이가 연속형일때의 상관성을 더 명확하게 나타낼수 있다. year와 cons 와의 관계

cor.test(IceCream$cons, IceCream$year)    # 이연 상관분석

## 
##  Pearson's product-moment correlation
## 
## data:  IceCream$cons and IceCream$year
## t = 1.4446, df = 28, p-value = 0.1597
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1070702  0.5695850
## sample estimates:
##       cor 
## 0.2633579

polyserial(IceCream$cons, IceCream$year)  # 점이연 상관분석

## [1] 0.2934057

상관분석의 표준화 (상관계수의 차이 검정) 어떤 상관계수가 더 큰것인가? 독립적인 r 비교 1) year0, 1, 2 의 각 소비량과 온도와의 상관성 크기 비교 및 유의성 분명히 각 년도별로 상관성이 있겠지만 각 년도별로 그 상관성이 차이가 있다고 할수 있는가?

zdifference<-function(r1, r2, n1, n2)
{zd<-(atanh(r1)-atanh(r2))/sqrt(1/(n1-3)+1/(n2-3))
p <-1 - pnorm(abs(zd))
print(paste("Z Difference: ", zd))
print(paste("One-Tailed P-Value: ", p))
}

IceCreamSplit <- split(IceCream, IceCream$year)
IceCream1 <- IceCreamSplit$`0`
IceCream2 <- IceCreamSplit$`1`
IceCream3 <- IceCreamSplit$`2`

cor(IceCream1$cons, IceCream1$temp)

## [1] 0.7766145

cor(IceCream2$cons, IceCream2$temp)

## [1] 0.9079522

zdifference(0.7766145, 0.9079522, 10, 13) # p value 0.16 으로 유의하지 않다. 즉, 차이가 없다.

## [1] "Z Difference:  -0.971896759653198"
## [1] "One-Tailed P-Value:  0.165550955997449"

cons 와 temp, cons 와 price 의 상관성 크기 비교 및 유의성 분명 차이가 있는데 다르다고 할수 있는가?

tdifference<-function(rxy, rxz, rzy, n) 
{   df<-n-3
td<-(rxy-rzy)*sqrt((df*(1 + rxz))/(2*(1-rxy^2-rxz^2-rzy^2+(2*rxy*rxz*rzy))))
p <-pt(td, df)
print(paste("t Difference: ", td))
print(paste("One-Tailed P-Value: ", p))
}

tdifference(0.776, -0.108, -0.260, 30) # 더 크다고 할수없다. 유의한 차이가 있다고 할수없다.

## [1] "t Difference:  5.97407571167203"
## [1] "One-Tailed P-Value:  0.999998867353825"

상관계수간의 검정 일정하게 특정 파라미터간 상관계수가 0.7 에서 작은 변동으로 일정하게 나오는 제품이 있다. 원가절감을 위해 재료를 바꾼후 생산했을때 상관계수가 0.8 이 나타났는데 동일하다고 할수 있는가? 이런 여러가지 경우가 있다.

4. 선형 회귀분석

순서 1. 예비모델 구축 (간이 변수 구축) 2. 통계적 가정 확인 (예비모델 회귀진단, 회귀분석의 방향성) > 선형 다중회귀분석 확정 3. 변수 선택 > 회귀 진단 4. 성능 향상 (이상치 제거, 변수 변환등 튜닝) 5. 부트스트랩, 최적 변수 (rsm, optim)

1) 예비 모델의 구축 간단하게 각 변수별 정규성을 먼저 확인해 볼수도 있으나 전체적으로 Full Model 확인이 좋다. 변수가 굉장히 많을때는 먼저 변수를 일부 F 검정으로 제거하고 중간 Full Model 로 진행하는것도 좋다. 하기비교에서 다항회귀로 빠지게 되면 anova 로 수많은 모델 탐색을 해야한다.

FullModel1 <- lm(cons ~., data = IceCream)
FullModel2 <- lm(cons ~.^2, data = IceCream) # 2차 상호작용 없다.
FullModel3 <- lm(cons ~.^3, data = IceCream) # 3차 상호작용 없다. 

summary(FullModel1)

## 
## Call:
## lm(formula = cons ~ ., data = IceCream)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.053479 -0.015621  0.000901  0.012732  0.065313 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.6206174  0.2723735   2.279 0.031884 *  
## preiod      -0.0039884  0.0018191  -2.193 0.038273 *  
## income      -0.0011911  0.0020380  -0.584 0.564368    
## price       -1.2398785  0.6796662  -1.824 0.080591 .  
## temp         0.0033623  0.0003847   8.740 6.37e-09 ***
## year         0.0862853  0.0223380   3.863 0.000745 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02981 on 24 degrees of freedom
## Multiple R-squared:  0.8301, Adjusted R-squared:  0.7947 
## F-statistic: 23.45 on 5 and 24 DF,  p-value: 1.628e-08

R.Model <- update(FullModel1, .~ . - income) # .~ . 
summary(R.Model)

## 
## Call:
## lm(formula = cons ~ preiod + price + temp + year, data = IceCream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05348 -0.01732  0.00000  0.01150  0.06754 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.5055744  0.1857656   2.722 0.011660 *  
## preiod      -0.0042694  0.0017312  -2.466 0.020866 *  
## price       -1.1723170  0.6608853  -1.774 0.088269 .  
## temp         0.0034565  0.0003447  10.029 3.02e-10 ***
## year         0.0811755  0.0202836   4.002 0.000493 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02942 on 25 degrees of freedom
## Multiple R-squared:  0.8277, Adjusted R-squared:  0.8001 
## F-statistic: 30.01 on 4 and 25 DF,  p-value: 3.235e-09

anova(FullModel1, R.Model)  # 그다지 유의한 차이를 보이지 않았다. FullModel 로 진행한다.

## Analysis of Variance Table
## 
## Model 1: cons ~ preiod + income + price + temp + year
## Model 2: cons ~ preiod + price + temp + year
##   Res.Df      RSS Df   Sum of Sq      F Pr(>F)
## 1     24 0.021330                             
## 2     25 0.021633 -1 -0.00030359 0.3416 0.5644

2) 가정의 적합 확인 잔차 분석을 통한 기본 선형 회귀분석 적용 가능성에 대한 평가 만일 적용이 어렵다면 변수 변환으로 가능한지 확인 이상치에 대한 평가로 정규성을 비롯 선형성 확보가 가능한지 press() 값이 올라가는지 확인

점검 항목 : 선형성, 정규성, 등분산성, 이상치, 다중공선성, 자기 상관성 (독립성), 상호 관계

사전 기본 점검

qqnorm(resid(FullModel1)); qqline(resid(FullModel1))

shapiro.test(resid(FullModel1))                        # p value > 0.05 정규성을 따른다.

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(FullModel1)
## W = 0.97526, p-value = 0.6905

dwtest(FullModel1, data = IceCream)                    # 0 에 가깝다. 자기상관이 있다.

## 
##  Durbin-Watson test
## 
## data:  FullModel1
## DW = 1.3222, p-value = 0.002253
## alternative hypothesis: true autocorrelation is greater than 0

ncvTest(FullModel1)                                    # p value > 0.05 등분산을 따른다.

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 2.183456, Df = 1, p = 0.1395

vif(FullModel1)                                        # sqrt(vif(FullModel1)) > 2 다중공선성

##   preiod   income    price     temp     year 
## 8.368155 5.286561 1.049057 1.302336 9.376213

regbook::press(FullModel1)

## $PRESS
## [1] 0.03493918
## 
## $pred.r.squared
## [1] 0.721652

일단 이 경우에는 다중공선성 문제도 그렇고 시간 계열 변수를 삭제한다.

model1 <- update(FullModel1, .~ . - preiod)
model1 <- update(model1, .~ . - year)
summary(model1)

## 
## Call:
## lm(formula = cons ~ income + price + temp, data = IceCream)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.065302 -0.011873  0.002737  0.015953  0.078986 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1973151  0.2702162   0.730  0.47179    
## income       0.0033078  0.0011714   2.824  0.00899 ** 
## price       -1.0444140  0.8343573  -1.252  0.22180    
## temp         0.0034584  0.0004455   7.762  3.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03683 on 26 degrees of freedom
## Multiple R-squared:  0.719,  Adjusted R-squared:  0.6866 
## F-statistic: 22.17 on 3 and 26 DF,  p-value: 2.451e-07

shapiro.test(resid(model1))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model1)
## W = 0.9444, p-value = 0.1195

dwtest(model1, data = IceCream)

## 
##  Durbin-Watson test
## 
## data:  model1
## DW = 1.0212, p-value = 0.0003024
## alternative hypothesis: true autocorrelation is greater than 0

ncvTest(model1)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 3.643784, Df = 1, p = 0.056279

vif(model1)                                        # sqrt(vif(model1)) > 2

##   income    price     temp 
## 1.144186 1.035673 1.144367

par(mfrow=c(2,2))
plot(model1)

기본 가정 준수하는 것으로 판정한다.

선형성 점검 Fitted vs. residuals

선형성 점검은 단번에 선형회귀분석의 적용 가능성과 조정지점을 판단할수 있는 시각적 확인 방법이다.
잔차가 크게 틀어지는 점이 실제로 크게 맞지 않는 지점이다.
약간의 비선형성은 변수변환도 검토할수 있으나 사실 선형회귀분석에서는 늘 있을수 있는 경우다.
이상치 여부 점검으로 선형성 증가 가능성, 변수 변환 등을 검토해 보고 결정할수 있다.

plot(model1, which=1) # y = 0 은 이상적 적합선이다.

plot(cons ~ fitted(model1), IceCream)            # 모델의 전체적 선형성 점검
abline(a = 0, b = 1, lty=3)

plot(log(cons) ~ fitted(model1), IceCream)       # log 적용 또는 이상치 제거 선택
lines(lowess(log(cons) ~ fitted(model1), IceCream), type = "l", col = "red")

plot(cons ~ fitted(model1), IceCream)

plot(cons ~ fitted(model1), IceCream, log="xy")   # 약간 사안이 더 나아지는 경향은 보인다.

ggplot 으로 그래픽 강화

fortify(model1)

##     cons income price temp       .hat     .sigma      .cooksd   .fitted
## 1  0.386     78 0.270   41 0.12663082 0.03436342 1.536782e-01 0.3151242
## 2  0.374     79 0.282   56 0.08059342 0.03740937 4.624894e-03 0.3577755
## 3  0.393     81 0.277   63 0.06372775 0.03756174 9.053363e-06 0.3938221
## 4  0.425     80 0.280   68 0.09826031 0.03731736 9.200814e-03 0.4046732
## 5  0.406     76 0.272   69 0.12759175 0.03755753 2.326228e-04 0.4032559
## 6  0.344     78 0.262   65 0.17922533 0.03493787 1.913957e-01 0.4064819
## 7  0.327     82 0.275   61 0.05270983 0.03508346 4.615818e-02 0.3923018
## 8  0.288     79 0.267   47 0.11573984 0.03574156 8.047263e-02 0.3423158
## 9  0.269     76 0.265   32 0.27792247 0.03742539 1.818110e-02 0.2826049
## 10 0.256     79 0.277   24 0.18988684 0.03755326 7.190064e-04 0.2523278
## 11 0.286     82 0.282   28 0.12711786 0.03742210 7.044729e-03 0.2708627
## 12 0.298     85 0.270   26 0.13331840 0.03747940 4.399489e-03 0.2864021
## 13 0.329     86 0.272   32 0.08111940 0.03731474 7.533729e-03 0.3083716
## 14 0.318     83 0.287   40 0.10912506 0.03752804 1.444469e-03 0.3104496
## 15 0.381     84 0.277   55 0.03996076 0.03754869 1.935655e-04 0.3760779
## 16 0.381     82 0.287   63 0.13613940 0.03754220 1.086764e-03 0.3866857
## 17 0.470     80 0.280   72 0.11947615 0.03592298 7.529558e-02 0.4185069
## 18 0.443     78 0.277   72 0.11683205 0.03708730 2.160227e-02 0.4150247
## 19 0.386     84 0.277   67 0.08156751 0.03697947 1.777028e-02 0.4175791
## 20 0.342     86 0.277   60 0.06024492 0.03560616 4.226725e-02 0.3999856
## 21 0.319     85 0.292   44 0.17423342 0.03753337 2.099026e-03 0.3256767
## 22 0.307     87 0.287   40 0.11305898 0.03739472 7.369060e-03 0.3236806
## 23 0.284     94 0.277   32 0.12595827 0.03627264 6.321010e-02 0.3296116
## 24 0.326     92 0.285   27 0.16345797 0.03703595 3.533408e-02 0.2973486
## 25 0.309     95 0.282   28 0.17689610 0.03754682 1.138145e-03 0.3138636
## 26 0.359     96 0.265   33 0.19726544 0.03753161 2.594331e-03 0.3522185
## 27 0.376     94 0.265   41 0.15260991 0.03755745 2.917873e-04 0.3732705
## 28 0.416     96 0.265   52 0.20095832 0.03755965 2.157621e-04 0.4179287
## 29 0.437     91 0.268   64 0.14028084 0.03755742 2.660061e-04 0.4397578
## 30 0.548     90 0.260   71 0.23809088 0.03291477 4.715257e-01 0.4690144
##           .resid   .stdresid
## 1   0.0708757723  2.05904267
## 2   0.0162245337  0.45939355
## 3  -0.0008220653 -0.02306597
## 4   0.0203267884  0.58115857
## 5   0.0027440885  0.07976359
## 6  -0.0624818534 -1.87244213
## 7  -0.0653017943 -1.82158765
## 8  -0.0543158086 -1.56820449
## 9  -0.0136049091 -0.43468055
## 10  0.0036722154  0.11076990
## 11  0.0151372850  0.43988247
## 12  0.0115978953  0.33823292
## 13  0.0206283844  0.58425461
## 14  0.0075504377  0.21718520
## 15  0.0049220913  0.13638658
## 16 -0.0056856859 -0.16608389
## 17  0.0514930694  1.48985736
## 18  0.0279753483  0.80820276
## 19 -0.0315790656 -0.89462727
## 20 -0.0579855783 -1.62397420
## 21 -0.0066767322 -0.19948111
## 22 -0.0166806041 -0.48087354
## 23 -0.0456116291 -1.32457323
## 24  0.0286513524  0.85048707
## 25 -0.0048636007 -0.14554492
## 26  0.0067814523  0.20549592
## 27  0.0027295353  0.08050327
## 28 -0.0019287127 -0.05857999
## 29 -0.0027578254 -0.08075231
## 30  0.0789856149  2.45676089

frame1 <- fortify(model1)

min1 <- min(IceCream$cons) - 0.01
max1 <- max(IceCream$cons) + 0.01
min2 <- min(log(IceCream$cons)) - 0.01
max2 <- max(log(IceCream$cons)) + 0.01

p1 <- ggplot(model1, aes(.fitted, cons)) + 
  geom_point(size = 2) + 
  ylim(min1, max1) +
  geom_hline(yintercept = 0, linetype = "dotted", col = "grey") +
  geom_smooth(method = "lm", span = 0.1, colour = I("red"), se = FALSE)

p2 <- ggplot(model1, aes(.fitted, log(cons))) + 
  geom_point(size = 2) + 
  ylim(min2, max2) +
  geom_hline(yintercept = 0, linetype = "dotted", col = "grey") +
  geom_smooth(method = "lm", span = 0.1, colour = I("red"), se = FALSE)

grid.arrange(p1, p2, ncol=2)

ggplot(model1, aes(.fitted, cons)) + 
  geom_point(size = 2) + 
  ylim(min1, max1) +
  geom_hline(yintercept = 0, linetype = "dotted", col = "grey") +
  geom_smooth(colour = I("red"), se = FALSE) # default 가 옵션 span = 1, 1 이하면 굴곡 더 심해짐

ggplot(model1, aes(.fitted, cons)) + 
  geom_point(size = 2) + 
  ylim(min1, max1) +
  geom_hline(yintercept = 0, linetype = "dotted", col = "grey") +
  geom_smooth(method = lm, formula = y ~ splines::ns(x, 2), se = FALSE) # ns 대신 bs

# 잔차
ggplot(model1, aes(.fitted, .resid)) + 
  geom_point(size = 2) + 
  geom_hline(yintercept = 0, linetype = "dotted", col = "grey") +
  geom_smooth(method = lm, formula = y ~ splines::ns(x, 2), se = FALSE)

rfs 구간별로 가장 틀어지는 곳을 찾아낼수 있다.
대부분 적절하다고 보고 있다.
좌측 : 설명되지 않은 부분 거의 0 에 가까울 수록 좋다.
우측 : 설명된 부분 선형성을 띌수록 좋다.

rfs(model1)

각 변수별 선형성 점검

crPlots(model1)

library(regbook)
data("usedcars")  # regbook data
model <- lm(price ~., data = usedcars)
par(mfrow=c(2,2))
res <- rstandard(model)  
vars <- names(usedcars[,-1]) 
for (i in 1:length(vars)){         
  plot(res ~ usedcars[[vars[i]]], type = "p", xlab = vars[i]) # x ~ y 가 벡터형이어야 한다. 
  abline(h=0, lty=3)}

data[1], data[[1]] or data[“var1”], data[["var1]] 특별한 비선형성이 나타나지 않는다.

편회귀 plot 순수한 y 와 각 x 의 관계의 산점도 수평에 가까울수록 관계가 적으므로 변수에 추가할 필요가 없다. 수직인 경우는 다중공선성

avPlots(model1) # 기울기로 변수 중요도 판단, 특이값도 찾을 수 있다.

temp 가 기울기가 제일크며, 또한 잔차 적합성도 가장 좋다. 다른 변수들은 기울기는 있긴 하나 잔차 적합성으로 봤을때 선형으로 표현하기 어렵다. (관계가 적다)

변수 변환 박스 티드웰 변환의 경우 소수점 3 자리인 변수가 많으면 뭔가 계산 오류가 나는 경향 몇몇 변수를 2 자리로 조정한다. price 가 덜 중요한 변수이므로 조정한다.

car::boxTidwell(cons ~ income + round(IceCream$price, 2) + temp, data = IceCream)

##                          MLE of lambda Score Statistic (z) Pr(>|z|)  
## income                         -9.8014             -0.3951  0.69278  
## round(IceCream$price, 2)       -8.9343              0.3097  0.75682  
## temp                            4.5366              1.6484  0.09928 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## iterations =  15

제안된 lambda 값을 각 변수에 적용한다. temp 의 경우 검토해 볼만한 필요가 있으나 전체적으로 선형성의 필요가 없다고 판단된다.

적용예시

newx1 <- (IceCream$temp)^4.5366
newx1^(1/4.5366) # 다시 재 복원

##  [1] 41 56 63 68 69 65 61 47 32 24 28 26 32 40 55 63 72 72 67 60 44 40 32
## [24] 27 28 33 41 52 64 71

최종적으로 선형변환은 검토하지 않는다. 전체적으로 가법 모형일 경우에는 비선형 회귀분석을 해야한다.

3. 정규성 점검

plot(model1, which = 2)

qqPlot(resid(model1))   # 약간 더 자세히 확인

## [1] 30  1

shapiro.test(resid(model1))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model1)
## W = 0.9444, p-value = 0.1195

library(ggpubr)
ggqqplot(resid(model1))

hist(resid(model1), breaks = 20, col = "grey")

ggplot(model1, aes(x = .resid)) + 
  labs(legend.position = "none") + 
  geom_histogram(aes(y=..density..), colour="black", fill="white", binwidth = 0.01) + 
  labs(x = "Studentized Residual", y = "Density") +
  geom_density(col = "red")

binwidth 는 x 축 해상도를 보면서 나눈다.

box cox 변환, power transform

boxcox(model1) # lambda 값을 구해서 적용한다.

(lambda <- BoxCox.lambda(IceCream$cons))

## [1] -0.9999242

summary(powerTransform(IceCream$cons)) # -0.6649 을 제안, -0.5  에 가깝다.

## bcPower Transformation to Normality 
##               Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## IceCream$cons   -0.6649           1      -2.5426       1.2127
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                             LRT df    pval
## LR test, lambda = (0) 0.4929533  1 0.48261
## 
## Likelihood ratio test that no transformation is needed
##                            LRT df     pval
## LR test, lambda = (1) 3.189822  1 0.074098

애매하긴 하지만 정규성 변환도 구지 필요하지 않다고 나타난다.

shiny boxcox model

library(shiny)

## 
## Attaching package: 'shiny'

## The following objects are masked from 'package:DT':
## 
##     dataTableOutput, renderDataTable

library(ggpubr)  
# Define UI for application that draws a histogram  
ui <- fluidPage(  
  
  # Application title  
  titlePanel("boxcox transform"),  
  
  # Sidebar with a slider input for number of bins   
  sidebarLayout(  
    sidebarPanel(  
      sliderInput("Lambda",  
                  "Lambda",  
                  step = 0.01,  
                  min = -2,  
                  max = 2,  
                  value = 0),  
      h5("This is a static text")  
    ),  
    
    # Show a plot of the generated distribution  
    mainPanel(  
      plotOutput("distPlot")  
    )  
  )  
)  

# Define server logic required to draw a histogram  
server <- function(input, output) {  
  
  output$distPlot <- renderPlot({
    # generate bins based on input$bins from ui.R
    newconc <- (IceCream$cons)^input$Lambda
    model1 <- lm(newconc ~ IceCream$income + IceCream$price + IceCream$temp)
    # draw the histogram with the specified number of bins
    ggqqplot(resid(model1))
    })
  
}

# Run the application 
shinyApp(ui = ui, server = server)

## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.

Shiny applications not supported in static R Markdown documents

대안으로는 이상치를 검토한후, 변수변환, 안되면 로버스트 회귀분석, 부트스트랩을 검토한다.

4. 등분산성 검정

plot(model1, which = 3) # 분산은 무작위여야 하며 어떤 트렌드를 보이면 안된다.

ncvTest(model1)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 3.643784, Df = 1, p = 0.056279

spreadLevelPlot(model1) # 등분산성 조사 p > 0.05 등분산성 만족, 거의 수평이면 만족한다.

## 
## Suggested power transformation:  0.35762

대안으로는 wls, glm 등이 있을 수 있다.

5. 다중 공선성

summary(vif(model1)) # 다중공선성 sqrt(Vif) > 2 다중공선성이 있다고 판단한다.

## 
## VIF:
## income  price   temp 
##  1.144  1.036  1.144 
## 
## Variance Proportion:
##   Eigenvalues Cond.Index     income        price       temp
## 1    1.324710   1.000000 0.32960065 2.308820e-06 0.33009978
## 2    1.060398   1.117702 0.05642018 7.871091e-01 0.05531365
## 3    0.614892   1.467780 0.61397917 2.128886e-01 0.61458657

4 다중공선성 검토해야 될수 있음 10 다중공선성 존재

대안으로 벌점회귀가 있다.

6. 자기상관성 검정

lmtest::dwtest(model1)

## 
##  Durbin-Watson test
## 
## data:  model1
## DW = 1.0212, p-value = 0.0003024
## alternative hypothesis: true autocorrelation is greater than 0

dwt(model1) # 더빈 왓슨 테스트

##  lag Autocorrelation D-W Statistic p-value
##    1       0.3297724       1.02117   0.002
##  Alternative hypothesis: rho != 0

대안 자기상관을 유발하는 변수를 추가, 예를 들면 시간등을 추가할수 있다. 변수 변환하여 자기상관을 없앤다. 차분의 방법이 있다. p =1 로 가정한 상기 변수 변환 gls 를 적용한다. lagged regression 이 있다.

예를 들면 하기처럼 관련변수에 구해진 파라미터를 곱하여 변수를 변환한다.

e <- ar(resid(model1)) # 0.3298
y2 <- conc[2:n] - 0.3298* cons[1:(n-1)]
x2 <- temp[2:n] - 0.3298* temp[1:(n-1)]
newmodel1 <- lm(y2 ~ x2)
lmtest::dwtest(newmodel1)

7. 이상치 점검

plot(model1, which=c(4,5,6)) # 해당 그래프에서 이상치를 찾을 수 있다.

지렛값은 마할라노비스의 거리로 판단하며 기존거리들과 굉장히 다른 값이다. 영향치는 높은 지렛점이며 꼭 이상치를 뜻하진 않는다. 영향치 또는 이상치 제거 lm(y ~ x, data[c(-1,-2),]) 1, 2 번이 영향치 또는 이상치 일 경우 영향력이 매우 큰 사례지만 잔차는 작을수 있기 때문에 항상 잔차분석과 함께 영향력을 확인해야한다.

예비모델에서는 간략히 검토한다. (어떤 영향력이 있을 지 모르므로 쉽게 삭제해서는 안됨) 여기서는 주로 정규성 및 등분산성, 선형성 위배의 가정을 만족시키기 위해 적용한다. 사실상 7 번은 넘겨도 되며 하기 8 번에서 만족하지 못하면 7 번 집중한다.

이상치, 지렛값 확인 외 표준화 잔차가 2 이상이면 특이점으로 볼수 있다.

rstudent(model1)        # 반응변수 기준

##           1           2           3           4           5           6 
##  2.20700075  0.45231189 -0.02261828  0.57361070  0.07822421 -1.97399225 
##           7           8           9          10          11          12 
## -1.91241060 -1.61607939 -0.42779662  0.10864446  0.43295432  0.33239675 
##          13          14          15          16          17          18 
##  0.57670708  0.21316106  0.13378591 -0.16294511  1.52758647  0.80265459 
##          19          20          21          22          23          24 
## -0.89107642 -1.67991591 -0.19575718 -0.47364626 -1.34502499  0.84581966 
##          25          26          27          28          29          30 
## -0.14277671  0.20166917  0.07894979 -0.05744620 -0.07919408  2.74919515

hatvalues(model1)       # 설명 변수 기준, 자료의 중심으로 부터 떨어져 있는 거리 (통계학적 의미)

##          1          2          3          4          5          6 
## 0.12663082 0.08059342 0.06372775 0.09826031 0.12759175 0.17922533 
##          7          8          9         10         11         12 
## 0.05270983 0.11573984 0.27792247 0.18988684 0.12711786 0.13331840 
##         13         14         15         16         17         18 
## 0.08111940 0.10912506 0.03996076 0.13613940 0.11947615 0.11683205 
##         19         20         21         22         23         24 
## 0.08156751 0.06024492 0.17423342 0.11305898 0.12595827 0.16345797 
##         25         26         27         28         29         30 
## 0.17689610 0.19726544 0.15260991 0.20095832 0.14028084 0.23809088

outlierTest(model1)

## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferroni p
## 30 2.749195           0.010935      0.32804

leveragePlots(model1)

hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main="Index Plot of Hat Values")
  abline(h=c(2,3)*p/n, col="red", lty=2)
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(model1)    # 강한 지렛점 통상적으로 다른 점, 예측변수쪽의 이상치라고 볼수 있다.

## integer(0)

영향력의 측도 : cook’s D, dfbets, dffits, covratio 개별적으로도 구할수 있다. 모두 plot 지원 가능
cooks.distance(out) # 기준값 : >= 3.67/(n - 변수의 수)
covratio(out) # 기준값 : >= 3p/n (p 변수 개수)
dffits(out) # 기준값 : >= 2 or 2sqrt(p/n)
dfbeta(out) # 기준값 : >= 2 or 2/sqrt(n)

영향치 분석을 종합적으로 진행 influence.measures(model1) # 2 가 넘는 이상치에 대해 * 표시

influence.measures(model1)  # 2 가 넘는 이상치에 대해 * 표시

## Influence measures of
##   lm(formula = cons ~ income + price + temp, data = IceCream) :
## 
##       dfb.1_ dfb.incm dfb.pric  dfb.temp   dffit cov.r   cook.d    hat inf
## 1   0.607928 -0.61745 -0.39393 -4.43e-01  0.8404 0.657 1.54e-01 0.1266    
## 2  -0.032816 -0.05976  0.06484  2.17e-02  0.1339 1.232 4.62e-03 0.0806    
## 3   0.000637  0.00120 -0.00109 -3.16e-03 -0.0059 1.249 9.05e-06 0.0637    
## 4  -0.056337 -0.03162  0.07193  1.18e-01  0.1894 1.231 9.20e-03 0.0983    
## 5   0.011357 -0.01700 -0.00667  1.14e-02  0.0299 1.340 2.33e-04 0.1276    
## 6  -0.711387  0.41432  0.66205 -1.64e-01 -0.9224 0.799 1.91e-01 0.1792    
## 7  -0.011410  0.06825 -0.00505 -2.26e-01 -0.4511 0.715 4.62e-02 0.0527    
## 8  -0.479221  0.36995  0.37619  1.97e-01 -0.5847 0.889 8.05e-02 0.1157    
## 9  -0.217065  0.19095  0.15420  1.69e-01 -0.2654 1.574 1.82e-02 0.2779   *
## 10  0.019812 -0.03326 -0.00419 -4.33e-02  0.0526 1.441 7.19e-04 0.1899    
## 11 -0.002772 -0.06790  0.04652 -1.21e-01  0.1652 1.301 7.04e-03 0.1271    
## 12  0.072033 -0.03591 -0.05736 -1.05e-01  0.1304 1.326 4.40e-03 0.1333    
## 13  0.071709 -0.02238 -0.05985 -1.22e-01  0.1714 1.208 7.53e-03 0.0811    
## 14 -0.039577 -0.01087  0.05433 -1.97e-02  0.0746 1.304 1.44e-03 0.1091    
## 15 -0.006109  0.00151  0.00635  9.66e-03  0.0273 1.215 1.94e-04 0.0400    
## 16  0.044152 -0.00258 -0.04875 -3.16e-02 -0.0647 1.348 1.09e-03 0.1361    
## 17 -0.178226 -0.05842  0.20583  3.96e-01  0.5627 0.930 7.53e-02 0.1195    
## 18 -0.011798 -0.09378  0.04170  1.81e-01  0.2919 1.196 2.16e-02 0.1168    
## 19  0.086833 -0.05620 -0.06375 -2.01e-01 -0.2656 1.124 1.78e-02 0.0816    
## 20  0.170334 -0.16504 -0.11321 -2.65e-01 -0.4253 0.812 4.23e-02 0.0602    
## 21  0.070813 -0.01058 -0.07989 -4.49e-05 -0.0899 1.408 2.10e-03 0.1742    
## 22  0.121612 -0.03977 -0.13069  2.20e-02 -0.1691 1.273 7.37e-03 0.1131    
## 23  0.169635 -0.33724 -0.07535  1.43e-01 -0.5106 1.012 6.32e-02 0.1260    
## 24 -0.213363  0.16433  0.19859 -1.42e-01  0.3739 1.249 3.53e-02 0.1635    
## 25  0.034986 -0.04201 -0.02567  1.88e-02 -0.0662 1.417 1.14e-03 0.1769    
## 26  0.021225  0.05873 -0.04681 -2.41e-02  0.1000 1.448 2.59e-03 0.1973    
## 27  0.008160  0.01975 -0.01740 -2.72e-03  0.0335 1.379 2.92e-04 0.1526    
## 28 -0.001110 -0.02179  0.01112 -7.90e-03 -0.0288 1.463 2.16e-04 0.2010   *
## 29  0.000902 -0.02048  0.00914 -1.92e-02 -0.0320 1.359 2.66e-04 0.1403    
## 30  0.453759  0.65928 -0.88258  8.52e-01  1.5368 0.534 4.72e-01 0.2381   *

car pacakge outlier

infIndexPlot(model1)        # 회귀분석에 영향을 주는 값을 찾는다.

influencePlot(model1, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")

##       StudRes       Hat     CookD
## 1   2.2070007 0.1266308 0.1536782
## 6  -1.9739923 0.1792253 0.1913957
## 9  -0.4277966 0.2779225 0.0181811
## 30  2.7491952 0.2380909 0.4715257

outlierTest 는 하기 진행
이상치 : 2 or -2 이상, 지레점 : 0.2 ~ 0.3 이상, 영향치 : 원의 크기

ls.diag() R 의 base 함수 ls.diag() 함수에 의한 plot (그다지 좋은 plot 은 아님 다만 ls.diag() 소개)

diag <- ls.diag(model1)                               # 회귀 진단
par(mfrow=c(2,2))
plot(diag$std.res, ylim = c(-5,5))           # 표준화 잔차 (+/- 2.5 이상 이상치 의심)
plot(diag$stud.res, ylim = c(-5,5))          # 스튜어트 잔차 (+/- 2.5 이상 이상치 의심)
plot(diag$cooks)                             # cooksD
plot(diag$dfits)                             # 영향치

ls.diag 에서는 dfbeta 값은 없기 때문에 따로 구한다.
계수별 dfbetas 값은 1 이하면 적절하다.
한 사례를 포함시키고 회귀분석 후 그 사례를 뺀 후 회귀분석을 했을 경우 각 계수별 차이

par(mfrow=c(1,3))
plot(dfbetas(model1)[,1], type = "h", ldw = 3, main = "intercept")
plot(dfbetas(model1)[,2], type = "h", ldw = 3, main = "income")
plot(dfbetas(model1)[,3], type = "h", ldw = 3, main = "temp")

pairs(dfbetas(model1))

ceresPlots

ceresPlots(model1) # 확인 필요 cr.Plots 와 거의 동일하다.

각 함수별 sort() 적용가능하다.

sort(dffits(model1), decreasing = TRUE)[1:5] # 영향치 탐색

##        30         1        17        24        18 
## 1.5368283 0.8403753 0.5626986 0.3738840 0.2919363

olsrr 패키지 threshold 가 함께 있어 판별하기 좋다.

ols_plot_cooksd_bar(model1) ; ols_plot_cooksd_chart(model1) ;

ols_plot_dffits(model1) ; ols_plot_resid_stud(model1) ; ols_plot_resid_stand(model1) ;

ols_plot_resid_lev(model1); ols_plot_dfbetas(model1)

8. 상호작용 및 다항 관계 (선형성에 연계되어야 할 부분) 최초 예비 모델에서 상호 작용과 다항관계를 포함시킬지 여부를 결정하는 것이 좋다. anova 를 비롯해 확인한다.

현재 상호 작용은 없지만 있는 경우 하기의 경우를 따른다.

gam 및 tree 로 교호효과 바로 확인하는 방법 교호효과가 강하고 많을 경우 step() 을 적용하면 최적결과가 나오지 않을 수 있다.

예시

ozone.pollution <- read.table("ozone.data.txt",header=T)
attach(ozone.pollution)
library(tree)
model <- tree(ozone~.,data=ozone.pollution)
par(mfrow=c(1,1))
plot(model)
text(model)

temp 와 wind, wind 는 rad, rad 와 temp 는 교호효과가 있다

상호작용 효과 및 시각화 예시

fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)

## 
## Call:
## lm(formula = mpg ~ hp + wt + hp:wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0632 -1.6491 -0.7362  1.4211  4.5513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.80842    3.60516  13.816 5.01e-14 ***
## hp          -0.12010    0.02470  -4.863 4.04e-05 ***
## wt          -8.21662    1.26971  -6.471 5.20e-07 ***
## hp:wt        0.02785    0.00742   3.753 0.000811 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared:  0.8848, Adjusted R-squared:  0.8724 
## F-statistic: 71.66 on 3 and 28 DF,  p-value: 2.981e-13

library(effects) 
# wt 의 평균 3.2 에서 각 표준편차 0.96 을 더하거나 빼준값으로 비교한다. 
plot(effect("hp:wt", fit,, list(wt=c(2.2,3.2,4.2))), multiline=TRUE)  # 틀린듯해도 빈칸 두어야함

effect(“hp:wt”, fit,, list(wt=c(2.2,3.2,4.2)))

해당 그래프에서 교호효과가 있다는 것을 알수있듯이 특정 지점에서 관계가 틀어지기 시작한다. wt 가 늘어날수록 기울기 각도가 줄어들어 hp 와 wt 는 점차 mpg 와 관계가 없어진다.

교호효과에 대해서는 다른 예시에서 다시 확인한다.

다항회귀 update 다항회귀모형에서 update() 함수가 매우 적용하기 좋다.
차수가 높아지면 다중공선성 문제가 생기게 되어 poly 를 적용한다.
poly(var1, degree = 2) : var1 + I(var1^2)

summary(lm(rate ~ conc + I(conc^2) + I(conc^3), enzyme))

## 
## Call:
## lm(formula = rate ~ conc + I(conc^2) + I(conc^3), data = enzyme)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.158  -3.975   1.446   5.971  17.133 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    56.281      9.026   6.235  0.00025 ***
## conc          716.836    118.146   6.067  0.00030 ***
## I(conc^2)   -1159.255    294.093  -3.942  0.00429 ** 
## I(conc^3)     572.142    178.915   3.198  0.01265 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.51 on 8 degrees of freedom
## Multiple R-squared:  0.9594, Adjusted R-squared:  0.9442 
## F-statistic: 63.02 on 3 and 8 DF,  p-value: 6.575e-06

summary(lm(rate ~ poly(conc, 3, raw = T), enzyme)) # 직교 다항식, 직교하여 다중공선성을 없애준다.

## 
## Call:
## lm(formula = rate ~ poly(conc, 3, raw = T), data = enzyme)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.158  -3.975   1.446   5.971  17.133 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                56.281      9.026   6.235  0.00025 ***
## poly(conc, 3, raw = T)1   716.836    118.146   6.067  0.00030 ***
## poly(conc, 3, raw = T)2 -1159.255    294.093  -3.942  0.00429 ** 
## poly(conc, 3, raw = T)3   572.142    178.915   3.198  0.01265 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.51 on 8 degrees of freedom
## Multiple R-squared:  0.9594, Adjusted R-squared:  0.9442 
## F-statistic: 63.02 on 3 and 8 DF,  p-value: 6.575e-06

9. 예비 모형에 대한 가정의 적합 및 선형 회귀모형 예비 모형이라도 어느정도 맞춘 모형이다.

순차제곱합과 편제곱합

anova(model1)

## Analysis of Variance Table
## 
## Response: cons
##           Df   Sum Sq  Mean Sq F value  Pr(>F)    
## income     1 0.000288 0.000288  0.2126 0.64857    
## price      1 0.008221 0.008221  6.0601 0.02078 *  
## temp       1 0.081741 0.081741 60.2519 3.1e-08 ***
## Residuals 26 0.035273 0.001357                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

drop1(model1, test = "F") # 각각의 변수를 하나씩 지웠을때의 증감하는 RSS 크기를 출력한다.

## Single term deletions
## 
## Model:
## cons ~ income + price + temp
##        Df Sum of Sq      RSS     AIC F value   Pr(>F)    
## <none>              0.035273 -194.38                     
## income  1  0.010817 0.046090 -188.35  7.9734 0.008989 ** 
## price   1  0.002126 0.037399 -194.62  1.5669 0.221803    
## temp    1  0.081741 0.117013 -160.40 60.2519  3.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

해당 결과의 RSS 값으로 변수 중요도를 간략히 파악이 가능하다. RSS 가 가장 큰 값이 중요한 값

전체적으로 선형회귀분석 (선형성) 적용에 대한 평가 그래프의 형상등으로 전체적으로 선형성을 비롯한 전체 통계적 가정도 확인한다.

gvmodel <- gvlma(model1)
summary(gvmodel)

## 
## Call:
## lm(formula = cons ~ income + price + temp, data = IceCream)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.065302 -0.011873  0.002737  0.015953  0.078986 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1973151  0.2702162   0.730  0.47179    
## income       0.0033078  0.0011714   2.824  0.00899 ** 
## price       -1.0444140  0.8343573  -1.252  0.22180    
## temp         0.0034584  0.0004455   7.762  3.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03683 on 26 degrees of freedom
## Multiple R-squared:  0.719,  Adjusted R-squared:  0.6866 
## F-statistic: 22.17 on 3 and 26 DF,  p-value: 2.451e-07
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model1) 
## 
##                       Value p-value                   Decision
## Global Stat        5.174915 0.26982    Assumptions acceptable.
## Skewness           0.007362 0.93162    Assumptions acceptable.
## Kurtosis           0.040079 0.84133    Assumptions acceptable.
## Link Function      4.869680 0.02733 Assumptions NOT satisfied!
## Heteroscedasticity 0.257795 0.61164    Assumptions acceptable.

Global Stat : 회귀모형에 대한 전반적인 가정에 대한 검정 결과
Skewness : 정규성 검정
Kurtosis : 정규성 검정
Link function : 선형성 검정

Heteroscedasticity : 등분산성 검정

p value 0.05 이상 = 선형 회귀 모델을 만족한다. 해당 데이터를 토대로 비선형으로 넘어가기위해서는 다시 검토를 한다. 무시해도 되나 참조할 필요는 있다. 약간의 변수 변환 정도는 확인해볼수도 있다.

regbook::press(model1)

## $PRESS
## [1] 0.04784483
## 
## $pred.r.squared
## [1] 0.6188372

결론 : 다중선형모델의 적용 가능성이 있다고 판단하고 최적 변수 탐색에 진입한다.

3) 최적 변수 selection 후 잔차 분석 및 진단 점검, 변수 중요도 사실상 대단히 많은 변수가 있을 경우 (7 ~ 8 개 이상) 적용하면 좋다. 수동적 방법, 단계별 방법, 전부분 집합 방법이 있다. -. 수동적 방법은 FullModel 에서 하나 하나 빼는 방법이다. 변수가 많지 않으면 적합하다. -. 단계별 방법 간단하고 좋지만 최근 지양되고 있다. 최적 변수 모델을 이끌어내지 못할때가 있다. 구지 쓰고자 한다면 후진이 낫다. -. 전부분집합 방법 정확한 방법이지만 변수가 많을 수록 컴퓨팅 자원이 크게 필요하다.

수동적 선택에는 여러가지 방법이 있으며 변수 중요도를 계산하여 적용하는 방법도 있다.

모델 판단 기준 기여율 다중회귀의 적합도 판정의 기본, 단점은 변수를 늘릴수록 좋아진다. AIC 는 기여율을 보완하기 위해 만들어짐 작을수록 좋지만 얼마나 작아야 좋은지 기준이 없다. 그저 두 모델을 비교하여 더 작으면 좋은 것이다. BIC 는 상기 단점을 보완한 모델

1. 수동적 방법

summary(model1)

## 
## Call:
## lm(formula = cons ~ income + price + temp, data = IceCream)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.065302 -0.011873  0.002737  0.015953  0.078986 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1973151  0.2702162   0.730  0.47179    
## income       0.0033078  0.0011714   2.824  0.00899 ** 
## price       -1.0444140  0.8343573  -1.252  0.22180    
## temp         0.0034584  0.0004455   7.762  3.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03683 on 26 degrees of freedom
## Multiple R-squared:  0.719,  Adjusted R-squared:  0.6866 
## F-statistic: 22.17 on 3 and 26 DF,  p-value: 2.451e-07

anova(model1)

## Analysis of Variance Table
## 
## Response: cons
##           Df   Sum Sq  Mean Sq F value  Pr(>F)    
## income     1 0.000288 0.000288  0.2126 0.64857    
## price      1 0.008221 0.008221  6.0601 0.02078 *  
## temp       1 0.081741 0.081741 60.2519 3.1e-08 ***
## Residuals 26 0.035273 0.001357                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

순차제곱합과 편제곱합

anova(model1)

## Analysis of Variance Table
## 
## Response: cons
##           Df   Sum Sq  Mean Sq F value  Pr(>F)    
## income     1 0.000288 0.000288  0.2126 0.64857    
## price      1 0.008221 0.008221  6.0601 0.02078 *  
## temp       1 0.081741 0.081741 60.2519 3.1e-08 ***
## Residuals 26 0.035273 0.001357                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

drop1(model1, test = "F") # 각각의 변수를 하나씩 지웠을때의 증감하는 RSS 크기를 출력한다.

## Single term deletions
## 
## Model:
## cons ~ income + price + temp
##        Df Sum of Sq      RSS     AIC F value   Pr(>F)    
## <none>              0.035273 -194.38                     
## income  1  0.010817 0.046090 -188.35  7.9734 0.008989 ** 
## price   1  0.002126 0.037399 -194.62  1.5669 0.221803    
## temp    1  0.081741 0.117013 -160.40 60.2519  3.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

해당 결과의 RSS 값으로 변수 중요도를 간략히 파악이 가능하다. RSS 가 가장 큰 값이 중요한 값

add1, drop1 add1 과 drop1 은 서로 기능은 같지만 정 반대이다. add1 현재 모형에서 후보 대상의 변수를 하나씩 포함시켰을때의 결과 전향적 drop1 현재 모형에서 각 설명변수를 하나씩 제거시켰을때의 결과 후향적

add1(최소모델, 비교할 FullModel, data) add1 은 한개씩 적합한 값의 결과를 먼저 대략적으로 보여주므로 p value 가 가장 작은 값의 변수를 다음 model0 > model1 로 정한다.

model0 <- lm(price ~ 1, houseprice)
add1(model0, scope = ~ tax + ground + floor + year, test = "F")

## Single term additions
## 
## Model:
## price ~ 1
##        Df Sum of Sq     RSS     AIC  F value    Pr(>F)    
## <none>              1330.58 107.233                       
## tax     1   1114.60  215.98  60.142 129.0183 2.310e-11 ***
## ground  1    701.96  628.62  88.987  27.9170 1.792e-05 ***
## floor   1   1147.92  182.66  55.619 157.1084 2.807e-12 ***
## year    1    128.11 1202.47 106.500   2.6634    0.1152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

model1 <- update(model0, .~. + floor)
add1(model1, scope = ~ tax + ground + floor + year, test = "F")

## Single term additions
## 
## Model:
## price ~ floor
##        Df Sum of Sq     RSS    AIC F value    Pr(>F)    
## <none>              182.663 55.619                      
## tax     1    87.468  95.195 40.023 22.0517 8.997e-05 ***
## ground  1    14.075 168.588 55.454  2.0037   0.16976    
## year    1    28.854 153.809 52.977  4.5023   0.04437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

model2 <- update(model1, .~. + tax)
drop1(model2,test = "F")

## Single term deletions
## 
## Model:
## price ~ floor + tax
##        Df Sum of Sq     RSS    AIC F value    Pr(>F)    
## <none>               95.195 40.023                      
## floor   1   120.782 215.978 60.142  30.451 1.126e-05 ***
## tax     1    87.468 182.663 55.619  22.052 8.997e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

add1(model2, scope = ~ tax + ground + floor + year, test = "F")

## Single term additions
## 
## Model:
## price ~ floor + tax
##        Df Sum of Sq    RSS    AIC F value Pr(>F)
## <none>              95.195 40.023               
## ground  1    1.9335 93.262 41.469  0.4768 0.4968
## year    1    2.8761 92.319 41.194  0.7165 0.4060

2. 단계적 방법 step() 자동선택 (가급적 후진위주로 쓴다.)

step(fit, direction = "backward") # 전진이나 both 쓰고 싶으면 model0 을 지정한다.

## Start:  AIC=52.8
## mpg ~ hp + wt + hp:wt
## 
##         Df Sum of Sq    RSS    AIC
## <none>               129.76 52.799
## - hp:wt  1    65.286 195.05 63.840

## 
## Call:
## lm(formula = mpg ~ hp + wt + hp:wt, data = mtcars)
## 
## Coefficients:
## (Intercept)           hp           wt        hp:wt  
##    49.80842     -0.12010     -8.21662      0.02785

3. best subset

subfit <- regsubsets(cons ~ ., data = IceCream1)

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 1 linear dependencies found

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : nvmax reduced to 4

b.sum <- summary(subfit)
which.min(b.sum$bic)

## [1] 2

plot(b.sum$bic, type = "l", xlab = "# of Features", ylab = "BIC",  # 3 번 선택
     main = "BIC score by Feature Inclusion")

plot(subfit, scale = "bic", main = "Best Subset Features")

만일 머신러닝 같은 test data 가 따로 있었다면 library(forecast) 의 accuracy(model.pred, test) 적용하여 RMSE 평가 가능

leaps <-regsubsets(cons ~ ., data = IceCream1)

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 1 linear dependencies found

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : nvmax reduced to 4

plot(leaps, scale="adjr2")

plot(leaps, scale = "Cp") # cp 가 실제 변수 값에 가까운 값을 택한다.

다른 그래프 하기 점선에 가까운 변수가 좋은 모형 subsets(leaps, statistic=“cp”, main=“Cp Plot for All Subsets Regression”) # 범례 위치 조정
abline(1,1,lty=2,col=“red”)
statistic = “rsq” 도 가능

library(RColorBrewer)
leaps1 <- regsubsets(x=cons ~ ., data = IceCream)
plot(leaps1, scale="adjr2", col=brewer.pal(9, "Pastel1"), main="All Subsets Regression")

best subset 색상조정

method=c(“exhaustive”, “forward”, “backward”, “seqrep”) 기본은 exhaustive 로 전역 탐색이다. 조정하고 싶을 경우 method 로 변경 가능하다.
nbest 는 각 변수 조합에서 상위 3 개까지만 출력하라는 의미

변수 1 개 조합에서는 변수가 5 개라면 상위 3 개까지 출력
변수 2 개 조합에서는 변수가 5 개라면 상위 3 개까지 출력
변수 3 개 조합에서는 변수가 5 개라면 상위 3 개까지 출력 …

변수가 적다면 nbest 는 그다지 의미 없다.

summary(leaps1)

## Subset selection object
## Call: regsubsets.formula(x = cons ~ ., data = IceCream)
## 5 Variables  (and intercept)
##        Forced in Forced out
## preiod     FALSE      FALSE
## income     FALSE      FALSE
## price      FALSE      FALSE
## temp       FALSE      FALSE
## year       FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          preiod income price temp year
## 1  ( 1 ) " "    " "    " "   "*"  " " 
## 2  ( 1 ) " "    " "    " "   "*"  "*" 
## 3  ( 1 ) "*"    " "    " "   "*"  "*" 
## 4  ( 1 ) "*"    " "    "*"   "*"  "*" 
## 5  ( 1 ) "*"    "*"    "*"   "*"  "*"

names(summary(leaps1)) # 적합도 지표

## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

summary(leaps1)$adjr2  # 적합도 지표중 수정된 r

## [1] 0.5873647 0.7474333 0.7835725 0.8000781 0.7946705

which.max(summary(leaps1)$adjr2)  # 제일 좋은 값 9

## [1] 4

which.min(summary(leaps1)$rss)    # rss 기준으로 최적 변수 값 확인

## [1] 5

coef(leaps1, 3)  # 제좋 좋은 값 계수

##  (Intercept)       preiod         temp         year 
##  0.178136609 -0.004227115  0.003528046  0.081766626

plot(summary(leaps1)$cp, xlab = "number of features", ylab = "cp")
points(which.min(summary(leaps1)$cp),summary(leaps1)$cp[which.min(summary(leaps1)$cp)],
       pch=20,col="red")

AIC 는 cp 와 비례하므로 cp 확인

layout(matrix(1:2, ncol = 2))
res.legend <- subsets(leaps1, statistic="adjr2", legend = FALSE, main = "Adjusted R^2")
res.legend <- subsets(leaps1, statistic="cp", legend = FALSE, main = "Mallow Cp")
abline(a = 1, b = 1, lty = 2)

summaryf(leaps1)

##          preiod income price temp year        rss       rsq     adjr2
## 1  ( 1 )                        *      0.05000933 0.6015935 0.5873647
## 2  ( 1 )                        *    * 0.02951661 0.7648517 0.7474333
## 3  ( 1 )      *                 *    * 0.02435636 0.8059615 0.7835725
## 4  ( 1 )      *            *    *    * 0.02163351 0.8276536 0.8000781
## 5  ( 1 )      *      *     *    *    * 0.02132992 0.8300721 0.7946705
##                 cp       bic
## 1  ( 1 ) 30.269502 -20.80608
## 2  ( 1 )  9.211502 -33.22257
## 3  ( 1 )  5.405294 -35.58618
## 4  ( 1 )  4.341589 -35.74147
## 5  ( 1 )  6.000000 -32.76425

regbook::press(leaps1) # 실제 예측값이 출력된다.

##          preiod income price temp year      PRESS pred.r.squared
## 1  ( 1 )                        *      0.05769281      0.5403819
## 2  ( 1 )                        *    * 0.03737904      0.7022145
## 3  ( 1 )      *                 *    * 0.03449085      0.7252237
## 4  ( 1 )      *            *    *    * 0.03363508      0.7320413
## 5  ( 1 )      *      *     *    *    * 0.03493918      0.7216520

model1 <- update(FullModel1, .~ . - preiod)
model1 <- update(model1, .~ . - year)
confint(model1) # 신뢰구간이 좁을수록 더 정확하다. 0 을 포함하지 않으면 계수는 유의하다.

##                     2.5 %      97.5 %
## (Intercept) -0.3581221927 0.752752337
## income       0.0008998752 0.005715646
## price       -2.7594600283 0.670632044
## temp         0.0025425950 0.004374264

car package 는 또 다른 신뢰구간 그래프를 제공한다. 50% 90% 구간에 신뢰구간을 표시한다.

dataEllipse(IceCream$temp, IceCream$cons)

press(model1)

## $PRESS
## [1] 0.04784483
## 
## $pred.r.squared
## [1] 0.6188372

4. olsrr 패키지 변수 선택

out.rgs2 <- ols_step_all_possible(model1) ; out.rgs2 ; plot(out.rgs2)

## # A tibble: 7 x 6
##   Index     N Predictors        `R-Square` `Adj. R-Square` `Mallow's Cp`
##   <int> <int> <chr>                  <dbl>           <dbl>         <dbl>
## 1     1     1 temp                 0.602           0.587           10.9 
## 2     2     1 price                0.0674          0.0341          60.3 
## 3     3     1 income               0.00230        -0.0333          66.3 
## 4     4     2 income temp          0.702           0.680            3.57
## 5     5     2 price temp           0.633           0.606            9.97
## 6     6     2 income price         0.0678         -0.00126         62.3 
## 7     7     3 income price temp    0.719           0.687            4

ols_step_best_subset(model1)

##     Best Subsets Regression     
## --------------------------------
## Model Index    Predictors
## --------------------------------
##      1         temp              
##      2         income temp       
##      3         income price temp 
## --------------------------------
## 
##                                                     Subsets Regression Summary                                                    
## ----------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                            
## Model    R-Square    R-Square    R-Square     C(p)         AIC         SBIC          SBC        MSEP      FPE       HSP      APC  
## ----------------------------------------------------------------------------------------------------------------------------------
##   1        0.6016      0.5874      0.5404    10.8624    -100.7660    -186.7163     -96.5624    0.0019    0.0019    1e-04    0.4553 
##   2        0.7021      0.6800        0.63     3.5669    -107.4833    -192.1056    -101.8785    0.0015    0.0015    1e-04    0.3642 
##   3        0.7190      0.6866      0.6188     4.0000    -107.2389    -191.1918    -100.2329    0.0016    0.0015    1e-04    0.3675 
## ----------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria

5. lasso 최근 다차원 데이터 세트에서는 leaps 적용시 몇 시간을 기다려야 될수도 있다.
lasso 는 그에 대한 좋은 대안일 수 있다. object1 <- lars(x,y, type=“lasso”, trace = T) # lasso 에서 확인

6. 변수 중요성 평가

library(FSelector)

## Warning: package 'FSelector' was built under R version 3.5.3

chi.squared(cons ~., data=IceCream)

##        attr_importance
## preiod        0.000000
## income        0.000000
## price         0.000000
## temp          0.760117
## year          0.000000

cs <- chi.squared(cons ~., data=IceCream)
cutoff.k(cs,1)  # 변수 중 중요 1 개 선택 (변수가 많을때는 3 개 5 개도 선택 가능)

## [1] "temp"

상대적 변수 중요도

relweights <- function(fit,...){
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]
  rxy <- R[2:nvar, 1]
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda ^ 2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta ^ 2)
  rawwgt <- lambdasq %*% beta ^ 2
  import <- (rawwgt / rsquare) * 100
  import <- as.data.frame(import)
  row.names(import) <- names(fit$model[2:nvar])
  names(import) <- "Weights"
  import <- import[order(import),1, drop=FALSE]
  dotchart(import$Weights, labels=row.names(import),
           xlab="% of R-Square", pch=19,
           main="Relative Importance of Predictor Variables",
           sub=paste("Total R-Square=", round(rsquare, digits=3)),
           ...)
  return(import)
}

relweights(model1, col="blue")

##          Weights
## price   5.973038
## income  6.557038
## temp   87.469924

library(coefplot)
coefplot::coefplot(model1)   # coefplot 이 많으므로 주의한다.

multiplot(FullModel1, model1)  # 비교 가능

0 을 중심으로 해당 변수 변화량 만큼 y 가 변한다. y 가 1 단계 변할때의 x 변화량의 차이

최종 모델

model2 <- lm(cons ~ income + temp, data = IceCream)
summary(model2)

## 
## Call:
## lm(formula = cons ~ income + temp, data = IceCream)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.065420 -0.022458  0.004026  0.015987  0.091905 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.113195   0.108280  -1.045  0.30511    
## income       0.003530   0.001170   3.017  0.00551 ** 
## temp         0.003543   0.000445   7.963 1.47e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03722 on 27 degrees of freedom
## Multiple R-squared:  0.7021, Adjusted R-squared:   0.68 
## F-statistic: 31.81 on 2 and 27 DF,  p-value: 7.957e-08

4) 최적 계수 selection (모델 진단 및 평가, 모델의 튜닝, 이상치 및 변수 변환으로 성능 향상) 이상치를 지우는 것은 신중해야한다. 언제나 적은 데이터로 예측을 해야하는 만큼 이상치는 또다른 가능성일수 있다. 최대한 모델에 반영하고 최종적으로 모델 향상시 제거한다.

회귀진단시 더 많은 정확한 패키지 적용 가능

devtools::install_github(“yeukyul/lindia”)

Lindia 패키지

library(lindia)
lindia::gg_diagnose(model1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

gg_resX(model1, scale.factor = 1) #  각 잔차 plot

gg_boxcox(model1)

gg_diagnose(model1, theme = theme_bw())

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

gg_reshist(model1, bins = NULL)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gg_scalelocation(model1)

## `geom_smooth()` using formula 'y ~ x'

library(ggfortify)
autoplot(model1)

olsrr https://cran.r-project.org/web/packages/olsrr/

ols_plot_resid_qq(model1)

ols_plot_resid_hist(model1)

press(model1)

## $PRESS
## [1] 0.04784483
## 
## $pred.r.squared
## [1] 0.6188372

최종 계수해석 및 계수의 검정, 표준화 계수 검정 (THE R BOOK 내용 확인 필요) X1 의 계수가 일반적으로 X2 보다 2.5 배이다. 이 경우 유의성 검정

mod0 <- lm(suneung ~ kor + I(2.5*math + sci), suneung)
mod1 <- lm(suneung ~ kor + math + sci, suneung)
anova(mod0, mod1)  # 유의하지 않다. 특별히 다르다고 할수 없다. mod1 이 타당하다. (순서 주의)

## Analysis of Variance Table
## 
## Model 1: suneung ~ kor + I(2.5 * math + sci)
## Model 2: suneung ~ kor + math + sci
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     22 3136.4                           
## 2     21 3133.4  1    3.0498 0.0204 0.8877

계수 검정 예시 계수 값에 대한 가설 검정 (상수 5, x1 = -5, x2 = 0, x3 = 1.1 로 해도 무관한가?) linearHypothesis(out, c(“(Intercept) = 5”, “x1 = -5”, “x2 = 0”, “x3= 1.1”))

계수 표준화 (계수 해석시 필요)

stdcoef(model1)  # 표준화된 계수 출력 regbook 함수

## (Intercept)      income       price        temp 
##   0.0000000   0.3140085  -0.1324351   0.8632558

summary(lmbeta(model1))

## 
## Call:
## lm(formula = cons ~ income + price + temp, data = IceCream)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.065302 -0.011873  0.002737  0.015953  0.078986 
## 
## Coefficients:
##                   BETA   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0000000  0.1973151  0.2702162   0.730  0.47179    
## income       0.3140085  0.0033078  0.0011714   2.824  0.00899 ** 
## price       -0.1324351 -1.0444140  0.8343573  -1.252  0.22180    
## temp         0.8632558  0.0034584  0.0004455   7.762  3.1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03683 on 26 degrees of freedom
## Multiple R-squared:  0.719,  Adjusted R-squared:  0.6866 
## F-statistic: 22.17 on 3 and 26 DF,  p-value: 2.451e-07

매개효과와 조절효과분석은 추후 데이터에 맞춰 추가한다. 특히 mtcars 특히 상호작용에 대해서 약간 더 해당 내용이 추가되어야 한다.

회귀의 가정은 중요하다. 이 가정이 지켜져야 다른 모집단에도 적용되는 일반화 모델이 성립된다. 단 가정이 성립해도 일반화가 안될수도 있다는 것은 항시 명심해야 한다.

library(QuantPsyc) # 회귀계수 표준화

album2<-read.delim("Album Sales 2.dat", header = TRUE)
album.sales.3<-lm(sales ~ adverts + airplay + attract, data = album2)
summary(album.sales.3)

## 
## Call:
## lm(formula = sales ~ adverts + airplay + attract, data = album2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -121.324  -28.336   -0.451   28.967  144.132 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -26.612958  17.350001  -1.534    0.127    
## adverts       0.084885   0.006923  12.261  < 2e-16 ***
## airplay       3.367425   0.277771  12.123  < 2e-16 ***
## attract      11.086335   2.437849   4.548 9.49e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.09 on 196 degrees of freedom
## Multiple R-squared:  0.6647, Adjusted R-squared:  0.6595 
## F-statistic: 129.5 on 3 and 196 DF,  p-value: < 2.2e-16

lm.beta(album.sales.3) # 표준화된 계수 정확한 계수 해석시 적용한다.

##   adverts   airplay   attract 
## 0.5108462 0.5119881 0.1916834

5) 예측 및 성능 평가 예측 실선은 예측구간, 큰 점선은 신뢰구간, 작은 점선은 예측 (press) 구간이다.

predict(model2, se.fit = T)

## $fit
##         1         2         3         4         5         6         7 
## 0.3074334 0.3641132 0.3959767 0.4101631 0.3995857 0.3924728 0.3924202 
##         8         9        10        11        12        13        14 
## 0.3322235 0.2684834 0.2507274 0.2754911 0.2789950 0.3037850 0.3215410 
##        15        16        17        18        19        20        21 
## 0.3782207 0.3995069 0.4243363 0.4172759 0.4207404 0.4029976 0.3427745 
##        22        23        24        25        26        27        28 
## 0.3356616 0.3320264 0.3072495 0.3213833 0.3426300 0.3639161 0.4099528 
##        29        30 
## 0.4348217 0.4560946 
## 
## $se.fit
##  [1] 0.011698691 0.009244437 0.009232895 0.010791908 0.012959707
##  [6] 0.010971389 0.008544058 0.009692552 0.015969279 0.016166307
## [11] 0.012732487 0.012203013 0.009932446 0.008425721 0.007235948
## [16] 0.009025465 0.011972801 0.012590683 0.010318410 0.008805432
## [21] 0.007130920 0.007941476 0.013064049 0.012748971 0.014428407
## [26] 0.014605831 0.012424462 0.015391631 0.013357799 0.014866808
## 
## $df
## [1] 27
## 
## $residual.scale
## [1] 0.03721736

pred.pre <- predict(model2, interval = "prediction")

## Warning in predict.lm(model2, interval = "prediction"): predictions on current data refer to _future_ responses

pred.con <- predict(model2, interval = "confidence")
matplot(cbind(pred.con, pred.pre[,-1]), lty=c(1,2,2,3,3), type="l", ylab = "predict y")

최종 모델 평가 : 최종 모델의 적합도와 예측도의 차이로 완성도 평가 press (prediction + ess : 예측 잔차 제곱합) press 값은 낮을 수록, 기여율은 높을 수록 좋다.

sum(resid(model2)^2)                                 # ess

## [1] 0.03739857

sum((resid(model2)/(1-hatvalues(model2)))^2)         # prediction ess (press(out))

## [1] 0.04644069

차이가 30 이상 벌어질 때는 이상치가 아직도 남아 있다는 반증으로 이상치를 확인해 본다. resid(out)/(1-hatvalues(out)) 예측치의 이상치가 엄청나게 커 전체 편차는 적은 편인데 이 단 두 관측치의 이상치가 차이를 낸다고 판단하면 어느정도는 적용 가능하다고 판단한다. 즉, 편차를 확인하여 몇몇 이상치 때문에 차이가 나게되는 것인 경우라면 적합하다고 최종 판정한다.

plot(model2$fitted.values, IceCream$cons, xlab = "predicted", ylab = "actual")

구간비교

summary(predict(model2))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2507  0.3214  0.3640  0.3594  0.4021  0.4561

summary(IceCream$cons)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2560  0.3113  0.3515  0.3594  0.3912  0.5480

히스토그램 비교 적합도 고려해볼만하다.

par(mfrow=c(1,2))
hist(predict(model2))
hist(IceCream$cons)

**각 값의 상관성 확인

predict <- predict(model2) 
result <- aov(IceCream$cons ~ predict)  # aov(y ~ predict)
summary(result)

##             Df  Sum Sq Mean Sq F value   Pr(>F)    
## predict      1 0.08812 0.08812   65.98 7.64e-09 ***
## Residuals   28 0.03740 0.00134                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cor(predict, IceCream$cons)

## [1] 0.8378896

MAE <- function(actual, predicted) {  mean(abs(actual - predicted))  } 
MAE(predict, IceCream$cons)  # 실제 차이

## [1] 0.02611499

MAE(0.35, IceCream$cons)     # 실제 평균값과의 차이

## [1] 0.05263333

5. 부트스트랩

이상치를 비롯 로버스트 회귀분석은 제껴두고 애초에 이방법으로 진행해도 무관하다. 변수가 많을 수록 컴퓨팅 퍼포먼스가 많이 들어간다. 리샘플링에는 CV 를 비롯 베이지안같은 많은 방법이 있다. 최종 부스트랩 모델과 다른 모델간의 press() 값 비교 가정이 깨졌을 경우 로버스트 회귀분석 또는 부트스트랩 객체 <- boot(자료, 함수, 반복)

기본적으로 통계적 사전 검증을 거쳐 최적 변수를 선택한 후 부트스트랩을 통해 가장 예측도가 높은 계수를 확정짓는다.

부트스트랩의 기본형 bootreg <- function(formula, data, indices){
d <- data[indices,] # 부트스트랩의 특징 : 데이터 프레임의 한 부분 집합
fit <- lm(formula, data = d)
return(coef(fit))}

bootResult <- boot(statistic = bootreg, formula = sales ~ adverts + airplay + attract, album2,
R = 2000)
boot.ci(bootResult, type = “bca”, index = 1) # 현 변수가 3 개 이므로 상수항 포함 인덱스는 4 개

부트스트랩 방법 1

regdat <- read.table("regdat.txt",header=T)
str(regdat) # 35 obs. of  2 variables

## 'data.frame':    35 obs. of  2 variables:
##  $ explanatory: num  10.18 6.62 7.77 11.06 4.49 ...
##  $ response   : num  19.5 17.1 19.5 21.6 13.2 ...

model <- lm(response  ~ explanatory, regdat)
b.boot <- numeric(10000) # for 문 메모리 확보를 위해 10000 개 들어갈 공간 사전 확보
for (i in 1:10000){
  indices <- sample(1:35, replace=T) # 35 개 샘플에서 무작위 순서로 수차례 배치
  xv <- regdat$explanatory[indices]
  yv <- regdat$response[indices]
  model <- lm(yv~xv)
  b.boot[i] <- coef(model)[2]}
hist(b.boot,main="",col="green")  # b.boot 에 값이 쌓이게 된다.

reg.boot <- function(regdat, index){
  xv <- regdat$explanatory[index]
  yv <- regdat$response[index]
  model <- lm(yv~xv)
  coef(model)}

reg.model <- boot(regdat,reg.boot,R=10000) 
boot.ci(reg.model,index=2)

## Warning in boot.ci(reg.model, index = 2): bootstrap variances needed for
## studentized intervals

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = reg.model, index = 2)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.874,  1.250 )   ( 0.906,  1.279 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.822,  1.195 )   ( 0.833,  1.201 )  
## Calculations and Intervals on Original Scale

부스트스랩 방법 2 원 회귀모형에서 추정한 y 에 대해 임의로 잔차를 할당하는 방법

model <- lm(regdat$response~regdat$explanatory)
fit <- fitted(model)
res <- resid(model)

residual.boot <- function(res, index){
  y <- fit+res[index]                      # y 원래 설명변수를 회귀식에 입력한 값
  model <- lm(y~regdat$explanatory)
  coef(model) }

res.model <- boot(res,residual.boot,R=10000)  
boot.ci(res.model,index=2)

## Warning in boot.ci(res.model, index = 2): bootstrap variances needed for
## studentized intervals

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = res.model, index = 2)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.878,  1.223 )   ( 0.877,  1.223 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.878,  1.224 )   ( 0.880,  1.228 )  
## Calculations and Intervals on Original Scale

회귀분석의 잭나이프 데이터를 번갈아가면서 1 개씩 생략한 다음 해당 모수를 추정하는 방법이다.
부트스트랩과는 달리 비복원 추출이다.
잭나이프는 부트스트랩보다 오래된 기법이며 계산량 부담이 컸던 예전에 주로 썼던 방식으로
현재 컴퓨팅 파워가 올라간 지금 잭나이프는 특정한 상황에서 적용되며 부트스트랩을 더 적용한다.

잭나이프는 기본적으로 이상치가 잦은 데이터에 강점이 있다.
잭나이프 계산방법의 핵심은 다음과 같다.
데이터 1, 2, 3, 4, 5
(1+2+3+4+5)/5 = 3
((1+2+3+4)/4 + (1+2+3+5)/4 + (1+2+4+5)/4 + (1+3+4+5)/4 + (2+3+4+5)/4)/5 = 3

jack.reg <- numeric(35)  # 35 개의 0 벡터 가지게 된다.  

for (i in 1:35) {                           # 매번 1 개씩 삭제해가면서 n-1 개 데이터로 회귀분석
  model <- lm(regdat$response[-i]~regdat$explanatory[-i], regdat)
  jack.reg[i] <- coef(model)[2] }
hist(jack.reg,main="",col="pink") # 왼쪽 편향된 값을 발견할수 있다. 부트스트랩으로는 찾지 못한다.

해당 왼쪽으로 편향된값을 찾는다. 거의 대부분 쿡 디스턴스에서 벗어난 거리이다.

model <- lm(response~explanatory, regdat)
which(influence.measures(model)$infmat[,5] == max(influence.measures(model)$infmat[,5]))

## 22 
## 22

# influence.measures(model)$infmat[,5] 쿡 디스턴스
plot(regdat$explanatory,regdat$response,pch=21,col="green",bg="red")
abline(model,col="blue")
abline(lm(regdat$response[-22]~regdat$explanatory[-22]),col="red") # 쿡디스턴스를 지우고 그리면 기울기 달라짐

부트스트랩 이후의 잭나이프 적용

jack.after.boot(reg.model,index=2) # 관심있는 계수는 a + bx 의 b 이므로 2 를 준다.

부트스트랩에서는 나타나지 않은 특정 관측치들을 추출해준다.
현 데이터에서는 상당히 편향이 나타나는 그래프라는 것을 알수있으며 22 번은 확실히 이상치이다.
또한 34, 30 은 확실한 지렛점이다.

현 상황에서 잭나이프는 부트스트랩 이후 이상치 탐색에 있어서 나름대로의 위치를 가진다.

리샘플링이라는 기법 기본적으로 나올수있는 모든 확률을 총동원해 확률내 모든 경우를 조사하는 방법론이다. 몬테카를로, 부트스트랩, 더블 부트스트랩, 잭나이프, 교차검증, 순열테스트, 무작위테스트 등이 있다.

부트스트랩

rsq <- function(formula, data, indices) {
  d <- data[indices,]
  fit <- lm(formula, data=d)
  return(summary(fit)$r.square) # 원하는 통계량
} 

library(boot)
set.seed(1234)
results <- boot(data=mtcars, statistic=rsq, R=1000, formula=mpg~wt+disp)
print(results)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = mtcars, statistic = rsq, R = 1000, formula = mpg ~ 
##     wt + disp)
## 
## 
## Bootstrap Statistics :
##      original    bias    std. error
## t1* 0.7809306 0.0133367  0.05068926

plot(results)

boot.ci(results, type=c("perc", "bca"))

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = results, type = c("perc", "bca"))
## 
## Intervals : 
## Level     Percentile            BCa          
## 95%   ( 0.6838,  0.8833 )   ( 0.6344,  0.8549 )  
## Calculations and Intervals on Original Scale
## Some BCa intervals may be unstable

선형 회귀분석 mpg ~ wt + disp r-square 값을 반환하는 함수가 필요하다.

library(boot)
rsq <- function(formula, data, indices) { # 인덱스 파라미터가 포함되어야 한다.
  d <- data[indices,]             # 데이터의 인덱스
  fit <- lm(formula, data = d)    # 통계식 
  return(summary(fit)$r.square)   # 통계식 결과에서 얻고자 하는 리턴값 (주로 기여율이다.)
}


results <- boot(data = mtcars, statistic = rsq, R = 1000, formula = mpg~wt+disp)
print(results)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = mtcars, statistic = rsq, R = 1000, formula = mpg ~ 
##     wt + disp)
## 
## 
## Bootstrap Statistics :
##      original     bias    std. error
## t1* 0.7809306 0.01049571  0.04852903

회귀 계수와 같이 여러 값을 구해야하는경우

bs <- function(formula, data, indices) {
  d <- data[indices,]             # 데이터의 인덱스
  fit <- lm(formula, data = d)    # 통계식 
  return(coef(fit))   # 통계식 결과에서 얻고자 하는 리턴값 
}

results <- boot(data = mtcars, statistic = bs, R = 1000, formula = mpg~wt+disp)
print(results)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = mtcars, statistic = bs, R = 1000, formula = mpg ~ 
##     wt + disp)
## 
## 
## Bootstrap Statistics :
##        original        bias    std. error
## t1* 34.96055404  2.228099e-01  2.45893284
## t2* -3.35082533 -9.681666e-02  1.10954175
## t3* -0.01772474  7.147134e-05  0.00824112

최종 여러모델의 정확도 및 테스트 수행

최적화

추가데이터를 가지고 X 의 범위를 조정할수 있을때 최적화 데이터를 구한다. RSM 과 optim (optim 은 추후 업데이트)

RSM 1차항 FO 2차항 PQ 이원교호작용 TWI 그러나 보통 SO 로 모든 항을 자동 생성한다.

library(rsm)

## 
## Attaching package: 'rsm'

## The following object is masked from 'package:data.table':
## 
##     cube

## The following object is masked from 'package:igraph':
## 
##     star

yield.rsm <- rsm(yield ~ SO(time, temp), data = yield)
summary(yield.rsm) # 모델링 결과와 함께 정상값도 출력된다.

## 
## Call:
## rsm(formula = yield ~ SO(time, temp), data = yield)
## 
##                Estimate  Std. Error t value Pr(>|t|)  
## (Intercept) -3.2486e+02  1.3609e+02 -2.3871  0.06261 .
## time         6.9010e+00  2.4647e+00  2.8000  0.03799 *
## temp         3.1399e+00  1.0656e+00  2.9466  0.03201 *
## time:temp   -3.0075e-02  9.4237e-03 -3.1914  0.02423 *
## time^2       1.7134e-02  2.9597e-02  0.5789  0.58776  
## temp^2      -5.9682e-03  2.1178e-03 -2.8181  0.03719 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Multiple R-squared:  0.9121, Adjusted R-squared:  0.8243 
## F-statistic: 10.38 on 5 and 5 DF,  p-value: 0.01128
## 
## Analysis of Variance Table
## 
## Response: yield
##                 Df  Sum Sq Mean Sq F value   Pr(>F)
## FO(time, temp)   2 160.856  80.428 15.8022 0.006896
## TWI(time, temp)  1  51.840  51.840 10.1853 0.024226
## PQ(time, temp)   2  51.522  25.761  5.0614 0.062856
## Residuals        5  25.448   5.090                 
## Lack of fit      3  23.922   7.974 10.4461 0.088623
## Pure error       2   1.527   0.763                 
## 
## Stationary point of response surface:
##       time       temp 
##   9.182004 239.921637 
## 
## Eigenanalysis:
## eigen() decomposition
## $values
## [1]  0.02454460 -0.01337914
## 
## $vectors
##            [,1]       [,2]
## time -0.8969851 -0.4420607
## temp  0.4420607 -0.8969851

predict 함수로 지정된 값에서의 적합값도 구할수 있다. time 의 2 차식은 유의하지 않기때문에 time 에 대해서는 평평할 것으로 예상된다.

정상점 조건 함수로도 출력

xs(yield.rsm)

##       time       temp 
##   9.182004 239.921637

특정 값일 경우 출력

nw <- data.frame(time = xs(yield.rsm)[1], temp = xs(yield.rsm)[2])
predict(yield.rsm, newdata = nw)

##     time 
## 83.49256

정상점의 안장값 확인 두 값의 부호가 다를경우 안장값이다. 즉, 지역 최적값일 확률이 높다. 상기 값으로는 단위가 다르므로 어느쪽 경사가 더 큰지 말하기 어렵다. 경사도를 알기 위해 표준화 한다.

데이터 상에서 최대 최소값은 4, 20, 220, 280 으로 주어진다. 각 평균은 12, 250 이다. 표준화 변환 공식은 (Xi - (Xmin + Xmax)/2)/(Xmax - Xmin)/2

yieldcoded <- coded.data(yield, x1 ~ (time-12)/8, x2 ~ (temp-250)/30)
yieldcoded.rsm <- rsm(yield ~ SO(x1, x2), data = yieldcoded)
summary(yieldcoded.rsm)

## 
## Call:
## rsm(formula = yield ~ SO(x1, x2), data = yieldcoded)
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  82.1683     1.3025 63.0842 1.895e-08 ***
## x1           -1.6523     1.1237 -1.4704  0.201405    
## x2           -6.1515     1.1337 -5.4261  0.002881 ** 
## x1:x2        -7.2180     2.2617 -3.1914  0.024226 *  
## x1^2          1.0966     1.8942  0.5789  0.587764    
## x2^2         -5.3714     1.9060 -2.8181  0.037192 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Multiple R-squared:  0.9121, Adjusted R-squared:  0.8243 
## F-statistic: 10.38 on 5 and 5 DF,  p-value: 0.01128
## 
## Analysis of Variance Table
## 
## Response: yield
##             Df  Sum Sq Mean Sq F value   Pr(>F)
## FO(x1, x2)   2 160.856  80.428 15.8022 0.006896
## TWI(x1, x2)  1  51.840  51.840 10.1853 0.024226
## PQ(x1, x2)   2  51.522  25.761  5.0614 0.062856
## Residuals    5  25.448   5.090                 
## Lack of fit  3  23.922   7.974 10.4461 0.088623
## Pure error   2   1.527   0.763                 
## 
## Stationary point of response surface:
##         x1         x2 
## -0.3522495 -0.3359454 
## 
## Stationary point in original units:
##       time       temp 
##   9.182004 239.921637 
## 
## Eigenanalysis:
## eigen() decomposition
## $values
## [1]  2.708577 -6.983377
## 
## $vectors
##          [,1]      [,2]
## x1 -0.9130575 0.4078309
## x2  0.4078309 0.9130575

감소하는 방향 경사가 더 가파른 것을 알수 있다. 순서가 time 과 temp 이므로 정상점에서 time 을 변화시키면 효율증가, 온도를 변화시키면 감소한다.

contour(yield.rsm, ~time + temp, image = T)

persp(yield.rsm, ~time + temp, theta = 120, col = "lightcyan") # theta 바라보는 각도 조절

optim 제약조건을 걸지 못하는 최적화라서 거의 최대값을 구해버린다. optim 의 box constrained 조건과 optimzation to specific value 에 대해 따로 확인

m = lm(data=yield, yield ~ time + temp + I(time^2) + I(temp^2) + time:temp)

fxn = function(z){
  z = as.data.frame(t(z))
  colnames(z) = colnames(yield)[-1]
  predict(m, newdata=z)
}

optim(c(0,0), fxn, control=list(fnscale=-1)) #maximizes fxn

## $par
## [1] 2.684172e+55 4.996335e+54
## 
## $value
## [1] 8.162026e+108
## 
## $counts
## function gradient 
##      501       NA 
## 
## $convergence
## [1] 1
## 
## $message
## NULL

Linear_Regression

2020 5 21