Regression(forward, backward, both)

회귀분석

패키지 불러오기

library(tidyverse)
library(ggplot2)

파일 불러오기

# Load the raw production data from CSV; the first line of the file holds the
# column names. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, which would silently change how the header is read.
autoparts <- read.csv("C:/Users/user/Desktop/JBTP/autoparts.csv", header = TRUE)
# Report the number of rows and columns that were read.
dim(autoparts)

## [1] 34139    17

NA 검토

# Show every row that contains at least one missing value (none expected).
autoparts[rowSums(is.na(autoparts)) > 0, ]

##  [1] prod_date         prod_no           prod_name        
##  [4] degree            mold              prod             
##  [7] s_no              fix_time          a_speed          
## [10] b_speed           separation        s_separation     
## [13] rate_terms        mpa               load_time        
## [16] highpressure_time c_thickness      
## <0 rows> (or 0-length row.names)

# List the column names of the loaded data frame.
colnames(autoparts)

##  [1] "prod_date"         "prod_no"           "prod_name"        
##  [4] "degree"            "mold"              "prod"             
##  [7] "s_no"              "fix_time"          "a_speed"          
## [10] "b_speed"           "separation"        "s_separation"     
## [13] "rate_terms"        "mpa"               "load_time"        
## [16] "highpressure_time" "c_thickness"

분석 해당 데이터 선택

# Keep only the rows for product number "90784-76001" and drop the first
# seven columns (prod_date through s_no), leaving the process measurements
# and the c_thickness column.
autoparts1 <- autoparts[autoparts$prod_no == "90784-76001", -seq_len(7)]
# Rows and columns remaining after the selection.
dim(autoparts1)

## [1] 21779    10

# Print min, quartiles, median, mean and max for every remaining column.
# NOTE(review): the maxima of highpressure_time (65534) and c_thickness
# (6553.40) are far from their quartiles and look like sentinel/error
# readings rather than real measurements; confirm with the data owner.
summary(autoparts1)

##     fix_time         a_speed          b_speed        separation   
##  Min.   :  1.00   Min.   :0.4570   Min.   :1.240   Min.   :141.6  
##  1st Qu.: 81.00   1st Qu.:0.5980   1st Qu.:1.597   1st Qu.:185.9  
##  Median : 82.10   Median :0.6090   Median :1.640   Median :190.7  
##  Mean   : 83.14   Mean   :0.6189   Mean   :1.644   Mean   :214.5  
##  3rd Qu.: 85.40   3rd Qu.:0.6520   3rd Qu.:1.676   3rd Qu.:248.7  
##  Max.   :148.60   Max.   :0.8080   Max.   :2.528   Max.   :294.5  
##   s_separation     rate_terms         mpa          load_time    
##  Min.   :623.3   Min.   :76.00   Min.   :24.80   Min.   : 0.00  
##  1st Qu.:651.6   1st Qu.:81.00   1st Qu.:75.30   1st Qu.:18.10  
##  Median :710.3   Median :85.00   Median :76.60   Median :19.20  
##  Mean   :685.9   Mean   :84.53   Mean   :74.21   Mean   :18.68  
##  3rd Qu.:713.6   3rd Qu.:87.00   3rd Qu.:78.10   3rd Qu.:19.20  
##  Max.   :747.3   Max.   :97.00   Max.   :82.10   Max.   :22.30  
##  highpressure_time   c_thickness     
##  Min.   :   37.00   Min.   :   0.30  
##  1st Qu.:   60.00   1st Qu.:  21.80  
##  Median :   67.00   Median :  23.80  
##  Mean   :   96.36   Mean   :  27.44  
##  3rd Qu.:   72.00   3rd Qu.:  25.40  
##  Max.   :65534.00   Max.   :6553.40

데이터 탐색

# Explore the response variable (c_thickness), as opposed to the predictors
boxplot(autoparts1[["c_thickness"]])

# Remove response outliers: keep rows whose c_thickness is below 1000
autoparts2 <- autoparts1[autoparts1[["c_thickness"]] < 1000, ]

# Re-inspect the response after the outliers are removed
ggplot(autoparts2, aes(x = 1, y = c_thickness)) +
  geom_boxplot()

# hist() builds its title from the argument expression, so it stays as written
hist(autoparts2$c_thickness, breaks = 50)

ggplot(autoparts2, aes(x = c_thickness)) +
  geom_histogram(color = "black", fill = "white")

***

분석 실시

# Fit a linear regression of c_thickness on every other column of autoparts2
# (the `.` on the right-hand side expands to all remaining columns), then
# print the coefficient table and the overall fit statistics.
# NOTE(review): only c_thickness was filtered for outliers; predictors such
# as highpressure_time still appear to contain extreme values (max 65534 in
# the summary), which may distort their coefficients. Confirm before use.
m <- lm(c_thickness ~ ., data=autoparts2)
summary(m)

## 
## Call:
## lm(formula = c_thickness ~ ., data = autoparts2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.8428  -0.6105  -0.0214   0.5606  29.6508 
## 
## Coefficients:
##                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)        7.146e+02  3.367e+00  212.225  < 2e-16 ***
## fix_time           6.010e-02  5.331e-03   11.273  < 2e-16 ***
## a_speed           -1.738e+01  4.223e-01  -41.152  < 2e-16 ***
## b_speed            1.952e+00  1.516e-01   12.876  < 2e-16 ***
## separation        -7.592e-01  3.635e-03 -208.873  < 2e-16 ***
## s_separation      -7.468e-01  3.673e-03 -203.317  < 2e-16 ***
## rate_terms         1.133e-02  3.597e-03    3.151  0.00163 ** 
## mpa               -1.520e-01  1.458e-03 -104.253  < 2e-16 ***
## load_time         -1.523e-01  8.381e-03  -18.171  < 2e-16 ***
## highpressure_time -2.174e-05  8.738e-06   -2.488  0.01284 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.796 on 21757 degrees of freedom
## Multiple R-squared:  0.7841, Adjusted R-squared:  0.784 
## F-statistic:  8782 on 9 and 21757 DF,  p-value: < 2.2e-16

# Plot 1 (Residuals vs Fitted): residuals against fitted values; ideally the
#   points scatter around a flat line with slope 0.
# Plot 2 (Normal Q-Q): checks whether the residuals follow a normal
#   distribution; the points should fall on a straight line.
# Plot 3 (Scale-Location): square root of the absolute standardized residuals
#   against fitted values; a flat line with slope 0 is ideal.
# Plot 4 (Residuals vs Leverage): highlights influential observations;
#   ideally points cluster at the left-centre and stay inside the Cook's
#   distance contours.
plot(m)

***

최적 모형 찾기

모형의 좋고 나쁨은 AIC(Akaike Information Criterion)로 판정한다. 같은 데이터에 적합한 모형들끼리 비교할 때 AIC가 낮을수록 좋은 모형이다.

# Ordinary multiple regression on the built-in swiss data (full model).
data(swiss)
m1 <- lm(Fertility ~ ., data = swiss)

# Forward selection
# Start from the intercept-only model and add one predictor at a time for as
# long as the AIC keeps decreasing. step() can only add terms that lie inside
# `scope`, so the search has to begin at the null model with the full model
# as the upper bound. Calling step(m1, direction = "forward") on the full
# model has nothing left to add and simply returns m1 unchanged.
# NOTE: the output recorded after this chunk came from the earlier call on
# the full model; re-run the chunk to refresh it.
m0 <- lm(Fertility ~ 1, data = swiss)
step(
  m0,
  scope = list(lower = formula(m0), upper = formula(m1)),
  direction = "forward"
)

## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality

## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Catholic + Infant.Mortality, data = swiss)
## 
## Coefficients:
##      (Intercept)       Agriculture       Examination         Education  
##          66.9152           -0.1721           -0.2580           -0.8709  
##         Catholic  Infant.Mortality  
##           0.1041            1.0770

# Backward elimination
# Begin with the model containing every predictor and remove one term at a
# time for as long as doing so lowers the AIC.
step(object = m1, direction = "backward")

## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43

## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Coefficients:
##      (Intercept)       Agriculture         Education          Catholic  
##          62.1013           -0.1546           -0.9803            0.1247  
## Infant.Mortality  
##           1.0784

# Stepwise selection
# Alternate between dropping and re-adding terms: at each step every
# single-term removal and every single-term addition is compared by AIC.
step(object = m1, direction = "both")

## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## + Examination       1     53.03 2105.0 190.69
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43

## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Coefficients:
##      (Intercept)       Agriculture         Education          Catholic  
##          62.1013           -0.1546           -0.9803            0.1247  
## Infant.Mortality  
##           1.0784

Regression(forward, backward, both)

updragon

2018년 8월 2일

회귀분석

패키지 불러오기

파일 불러오기

NA 검토

분석 해당 데이터 선택

데이터 탐색

분석 실시

최적 모형 찾기

좋은 모형 여부는 AIC로 판정. AIC가 낮을수록 좋은 모형임