회귀분석


패키지 불러오기
library(tidyverse)
library(ggplot2)

파일 불러오기
# Load the auto-parts production data; the first CSV line holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
# NOTE(review): absolute, machine-specific path - the script only runs where
# this exact file exists.
autoparts <- read.csv("C:/Users/user/Desktop/JBTP/autoparts.csv", header = TRUE)
# Report the number of rows and columns that were read.
dim(autoparts)
## [1] 34139    17

NA 검토
# Print every row that contains at least one missing value.
has_missing <- !complete.cases(autoparts)
autoparts[has_missing, ]
##  [1] prod_date         prod_no           prod_name        
##  [4] degree            mold              prod             
##  [7] s_no              fix_time          a_speed          
## [10] b_speed           separation        s_separation     
## [13] rate_terms        mpa               load_time        
## [16] highpressure_time c_thickness      
## <0 rows> (or 0-length row.names)
# List the column names of the loaded data frame.
colnames(autoparts)
##  [1] "prod_date"         "prod_no"           "prod_name"        
##  [4] "degree"            "mold"              "prod"             
##  [7] "s_no"              "fix_time"          "a_speed"          
## [10] "b_speed"           "separation"        "s_separation"     
## [13] "rate_terms"        "mpa"               "load_time"        
## [16] "highpressure_time" "c_thickness"

분석 대상 데이터 선택
# Keep only the rows for product number "90784-76001" and drop the first
# seven columns, leaving the columns from position 8 onward.
is_target <- autoparts$prod_no == "90784-76001"
autoparts1 <- autoparts[is_target, -seq_len(7)]
# Report the dimensions of the reduced data frame.
dim(autoparts1)
## [1] 21779    10
# Per-column summary statistics of the selected data.
summary(object = autoparts1)
##     fix_time         a_speed          b_speed        separation   
##  Min.   :  1.00   Min.   :0.4570   Min.   :1.240   Min.   :141.6  
##  1st Qu.: 81.00   1st Qu.:0.5980   1st Qu.:1.597   1st Qu.:185.9  
##  Median : 82.10   Median :0.6090   Median :1.640   Median :190.7  
##  Mean   : 83.14   Mean   :0.6189   Mean   :1.644   Mean   :214.5  
##  3rd Qu.: 85.40   3rd Qu.:0.6520   3rd Qu.:1.676   3rd Qu.:248.7  
##  Max.   :148.60   Max.   :0.8080   Max.   :2.528   Max.   :294.5  
##   s_separation     rate_terms         mpa          load_time    
##  Min.   :623.3   Min.   :76.00   Min.   :24.80   Min.   : 0.00  
##  1st Qu.:651.6   1st Qu.:81.00   1st Qu.:75.30   1st Qu.:18.10  
##  Median :710.3   Median :85.00   Median :76.60   Median :19.20  
##  Mean   :685.9   Mean   :84.53   Mean   :74.21   Mean   :18.68  
##  3rd Qu.:713.6   3rd Qu.:87.00   3rd Qu.:78.10   3rd Qu.:19.20  
##  Max.   :747.3   Max.   :97.00   Max.   :82.10   Max.   :22.30  
##  highpressure_time   c_thickness     
##  Min.   :   37.00   Min.   :   0.30  
##  1st Qu.:   60.00   1st Qu.:  21.80  
##  Median :   67.00   Median :  23.80  
##  Mean   :   96.36   Mean   :  27.44  
##  3rd Qu.:   72.00   3rd Qu.:  25.40  
##  Max.   :65534.00   Max.   :6553.40

데이터 탐색
# Explore the dependent variable (cf. the explanatory variables).
boxplot(x = autoparts1$c_thickness)

# Remove outliers of the dependent variable: keep rows with c_thickness < 1000.
is_plausible <- autoparts1$c_thickness < 1000
autoparts2 <- autoparts1[is_plausible, ]

# Re-inspect the dependent variable after outlier removal: box plot first.
ggplot(data = autoparts2, mapping = aes(x = 1, y = c_thickness)) +
  geom_boxplot()

# Base-graphics histogram, asking for about 50 breaks.
hist(x = autoparts2$c_thickness, breaks = 50)

# ggplot2 histogram with black bar outlines and white fill.
ggplot(data = autoparts2, mapping = aes(x = c_thickness)) +
  geom_histogram(color = "black", fill = "white")

***

분석 실시
# Fit a linear model of c_thickness on every other column of autoparts2.
# NOTE(review): the summary of autoparts1 shows highpressure_time with a
# maximum of 65534, and only c_thickness was filtered before this fit -
# confirm whether that value is a genuine measurement before trusting the
# highpressure_time coefficient.
m <- lm(formula = c_thickness ~ ., data = autoparts2)
# Coefficient table, residual summary and fit statistics.
summary(object = m)
## 
## Call:
## lm(formula = c_thickness ~ ., data = autoparts2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.8428  -0.6105  -0.0214   0.5606  29.6508 
## 
## Coefficients:
##                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)        7.146e+02  3.367e+00  212.225  < 2e-16 ***
## fix_time           6.010e-02  5.331e-03   11.273  < 2e-16 ***
## a_speed           -1.738e+01  4.223e-01  -41.152  < 2e-16 ***
## b_speed            1.952e+00  1.516e-01   12.876  < 2e-16 ***
## separation        -7.592e-01  3.635e-03 -208.873  < 2e-16 ***
## s_separation      -7.468e-01  3.673e-03 -203.317  < 2e-16 ***
## rate_terms         1.133e-02  3.597e-03    3.151  0.00163 ** 
## mpa               -1.520e-01  1.458e-03 -104.253  < 2e-16 ***
## load_time         -1.523e-01  8.381e-03  -18.171  < 2e-16 ***
## highpressure_time -2.174e-05  8.738e-06   -2.488  0.01284 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.796 on 21757 degrees of freedom
## Multiple R-squared:  0.7841, Adjusted R-squared:  0.784 
## F-statistic:  8782 on 9 and 21757 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted model; the four default panels of plot() on an lm object are:
# 1) Residuals vs Fitted: residuals plotted against the fitted values; ideally the points scatter around a flat (slope 0) line.
# 2) Normal Q-Q: checks whether the residuals follow a normal distribution; the points should lie on a straight line.
# 3) Scale-Location: square root of the absolute standardized residuals against the fitted values; a flat (slope 0) trend is ideal.
# 4) Residuals vs Leverage: highlights outliers that strongly influence the model; ideally points cluster at the left-centre and stay inside the Cook's distance contours.
plot(m)

***

최적 모형 찾기
모형의 좋고 나쁨은 AIC로 판정한다. AIC가 낮을수록 좋은 모형이다.
# Ordinary regression: model Fertility on all other columns of the built-in
# swiss data set. This full model is the starting point for the selection
# procedures.
data("swiss")
m1 <- lm(formula = Fertility ~ ., data = swiss)

# Forward selection
# Start from the intercept-only model and add one variable at a time for as
# long as AIC improves. step() can only add terms named in `scope`; the
# original call, step(m1, direction = "forward"), started from the full model
# with no scope, so nothing could be added and the full model was returned
# unchanged.
# NOTE: the console output recorded below this call was produced by the
# original full-model call; re-run the document to refresh it.
m0 <- lm(formula = Fertility ~ 1, data = swiss)
full_scope <- list(
  lower = ~ 1,
  upper = ~ Agriculture + Examination + Education + Catholic + Infant.Mortality
)
step(m0, scope = full_scope, direction = "forward")
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Catholic + Infant.Mortality, data = swiss)
## 
## Coefficients:
##      (Intercept)       Agriculture       Examination         Education  
##          66.9152           -0.1721           -0.2580           -0.8709  
##         Catholic  Infant.Mortality  
##           0.1041            1.0770

# Backward elimination
# Begin with the model containing every variable and remove one variable at a
# time, keeping the better model at each step.
step(object = m1, direction = "backward")
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Coefficients:
##      (Intercept)       Agriculture         Education          Catholic  
##          62.1013           -0.1546           -0.9803            0.1247  
## Infant.Mortality  
##           1.0784

# Stepwise selection
# Alternates between adding and removing variables.
step(object = m1, direction = "both")
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## + Examination       1     53.03 2105.0 190.69
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Coefficients:
##      (Intercept)       Agriculture         Education          Catholic  
##          62.1013           -0.1546           -0.9803            0.1247  
## Infant.Mortality  
##           1.0784