library(tidyverse)
library(ggplot2)
# Load the auto-parts production records from CSV; the first row of the file
# supplies the column names.
# NOTE(review): this is an absolute, machine-specific path; adjust it when the
# script is run elsewhere.
autoparts_path <- "C:/Users/user/Desktop/JBTP/autoparts.csv"
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
autoparts <- read.csv(autoparts_path, header = TRUE)
# Number of rows and columns that were read.
dim(autoparts)
## [1] 34139 17
# List every record that contains at least one missing value.
rows_with_na <- which(!complete.cases(autoparts))
autoparts[rows_with_na, ]
## [1] prod_date prod_no prod_name
## [4] degree mold prod
## [7] s_no fix_time a_speed
## [10] b_speed separation s_separation
## [13] rate_terms mpa load_time
## [16] highpressure_time c_thickness
## <0 rows> (or 0-length row.names)
# Print the column names of the loaded data frame.
colnames(autoparts)
## [1] "prod_date" "prod_no" "prod_name"
## [4] "degree" "mold" "prod"
## [7] "s_no" "fix_time" "a_speed"
## [10] "b_speed" "separation" "s_separation"
## [13] "rate_terms" "mpa" "load_time"
## [16] "highpressure_time" "c_thickness"
# Keep only the records for product number 90784-76001 and discard the first
# seven columns (prod_date through s_no), leaving the process measurements
# and c_thickness.
is_target_product <- autoparts$prod_no == "90784-76001"
autoparts1 <- autoparts[is_target_product, -seq_len(7)]
dim(autoparts1)
## [1] 21779 10
# Per-column descriptive statistics of the retained variables.
summary(object = autoparts1)
## fix_time a_speed b_speed separation
## Min. : 1.00 Min. :0.4570 Min. :1.240 Min. :141.6
## 1st Qu.: 81.00 1st Qu.:0.5980 1st Qu.:1.597 1st Qu.:185.9
## Median : 82.10 Median :0.6090 Median :1.640 Median :190.7
## Mean : 83.14 Mean :0.6189 Mean :1.644 Mean :214.5
## 3rd Qu.: 85.40 3rd Qu.:0.6520 3rd Qu.:1.676 3rd Qu.:248.7
## Max. :148.60 Max. :0.8080 Max. :2.528 Max. :294.5
## s_separation rate_terms mpa load_time
## Min. :623.3 Min. :76.00 Min. :24.80 Min. : 0.00
## 1st Qu.:651.6 1st Qu.:81.00 1st Qu.:75.30 1st Qu.:18.10
## Median :710.3 Median :85.00 Median :76.60 Median :19.20
## Mean :685.9 Mean :84.53 Mean :74.21 Mean :18.68
## 3rd Qu.:713.6 3rd Qu.:87.00 3rd Qu.:78.10 3rd Qu.:19.20
## Max. :747.3 Max. :97.00 Max. :82.10 Max. :22.30
## highpressure_time c_thickness
## Min. : 37.00 Min. : 0.30
## 1st Qu.: 60.00 1st Qu.: 21.80
## Median : 67.00 Median : 23.80
## Mean : 96.36 Mean : 27.44
## 3rd Qu.: 72.00 3rd Qu.: 25.40
## Max. :65534.00 Max. :6553.40
# Explore the dependent variable (cf. the explanatory variables).
boxplot(autoparts1$c_thickness)
# Remove outliers of the dependent variable: keep rows with c_thickness < 1000.
keep_row <- autoparts1$c_thickness < 1000
autoparts2 <- autoparts1[keep_row, ]
# Re-examine the distribution after the outliers have been removed.
ggplot() +
  geom_boxplot(data = autoparts2, mapping = aes(x = 1, y = c_thickness))
hist(autoparts2$c_thickness, breaks = 50)
ggplot() +
  geom_histogram(
    data = autoparts2,
    mapping = aes(x = c_thickness),
    color = "black",
    fill = "white"
  )
# Multiple linear regression on autoparts2 ----
# Fit a multiple linear regression of c_thickness on every other column of
# autoparts2, then print the coefficient table and the fit statistics.
m <- lm(formula = c_thickness ~ ., data = autoparts2)
summary(object = m)
##
## Call:
## lm(formula = c_thickness ~ ., data = autoparts2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.8428 -0.6105 -0.0214 0.5606 29.6508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.146e+02 3.367e+00 212.225 < 2e-16 ***
## fix_time 6.010e-02 5.331e-03 11.273 < 2e-16 ***
## a_speed -1.738e+01 4.223e-01 -41.152 < 2e-16 ***
## b_speed 1.952e+00 1.516e-01 12.876 < 2e-16 ***
## separation -7.592e-01 3.635e-03 -208.873 < 2e-16 ***
## s_separation -7.468e-01 3.673e-03 -203.317 < 2e-16 ***
## rate_terms 1.133e-02 3.597e-03 3.151 0.00163 **
## mpa -1.520e-01 1.458e-03 -104.253 < 2e-16 ***
## load_time -1.523e-01 8.381e-03 -18.171 < 2e-16 ***
## highpressure_time -2.174e-05 8.738e-06 -2.488 0.01284 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.796 on 21757 degrees of freedom
## Multiple R-squared: 0.7841, Adjusted R-squared: 0.784
## F-statistic: 8782 on 9 and 21757 DF, p-value: < 2.2e-16
# Diagnostic plots for the fitted model (the four default plot.lm panels):
# 1. Residuals vs Fitted: residuals against the predicted values; ideally the
#    trend is a flat line with slope 0.
# 2. Normal Q-Q: checks whether the residuals follow a normal distribution;
#    the points should fall on a straight line.
# 3. Scale-Location: shows the standardized residuals; a flat trend with
#    slope 0 is ideal.
# 4. Residuals vs Leverage: shows outliers that strongly influence the model;
#    ideally the points cluster at the middle left and stay inside the Cook's
#    distance contours.
plot(m, which = c(1, 2, 3, 5))
# Variable selection with step() on the swiss data ----
# Ordinary regression: the full model with every predictor of the built-in
# swiss data. It also serves as the upper bound of the forward search.
data(swiss)
m1 <- lm(Fertility ~ ., data = swiss)
# Forward selection: start from the intercept-only model and insert one
# variable at a time, keeping the addition that gives the better model (AIC).
# step() can only add terms that lie between the starting model and
# scope$upper; starting from m1 itself with no scope (the previous call) left
# nothing to add, so it returned m1 unchanged without performing any step.
# NOTE: the transcript recorded below this block came from that previous
# call; re-run the script to refresh it.
m0 <- lm(Fertility ~ 1, data = swiss)
m_forward <- step(
  m0,
  scope = list(lower = formula(m0), upper = formula(m1)),
  direction = "forward"
)
# Print the selected model, as the bare step() call did.
m_forward
## Start: AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic +
## Infant.Mortality
##
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education +
## Catholic + Infant.Mortality, data = swiss)
##
## Coefficients:
## (Intercept) Agriculture Examination Education
## 66.9152 -0.1721 -0.2580 -0.8709
## Catholic Infant.Mortality
## 0.1041 1.0770
# Backward elimination: begin with the model built from all variables and
# remove one variable at a time while that yields a better model.
step(object = m1, direction = "backward")
## Start: AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic +
## Infant.Mortality
##
## Df Sum of Sq RSS AIC
## - Examination 1 53.03 2158.1 189.86
## <none> 2105.0 190.69
## - Agriculture 1 307.72 2412.8 195.10
## - Infant.Mortality 1 408.75 2513.8 197.03
## - Catholic 1 447.71 2552.8 197.75
## - Education 1 1162.56 3267.6 209.36
##
## Step: AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
##
## Df Sum of Sq RSS AIC
## <none> 2158.1 189.86
## - Agriculture 1 264.18 2422.2 193.29
## - Infant.Mortality 1 409.81 2567.9 196.03
## - Catholic 1 956.57 3114.6 205.10
## - Education 1 2249.97 4408.0 221.43
##
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic +
## Infant.Mortality, data = swiss)
##
## Coefficients:
## (Intercept) Agriculture Education Catholic
## 62.1013 -0.1546 -0.9803 0.1247
## Infant.Mortality
## 1.0784
# Stepwise selection: repeatedly add and remove variables.
step(object = m1, direction = "both")
## Start: AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic +
## Infant.Mortality
##
## Df Sum of Sq RSS AIC
## - Examination 1 53.03 2158.1 189.86
## <none> 2105.0 190.69
## - Agriculture 1 307.72 2412.8 195.10
## - Infant.Mortality 1 408.75 2513.8 197.03
## - Catholic 1 447.71 2552.8 197.75
## - Education 1 1162.56 3267.6 209.36
##
## Step: AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
##
## Df Sum of Sq RSS AIC
## <none> 2158.1 189.86
## + Examination 1 53.03 2105.0 190.69
## - Agriculture 1 264.18 2422.2 193.29
## - Infant.Mortality 1 409.81 2567.9 196.03
## - Catholic 1 956.57 3114.6 205.10
## - Education 1 2249.97 4408.0 221.43
##
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic +
## Infant.Mortality, data = swiss)
##
## Coefficients:
## (Intercept) Agriculture Education Catholic
## 62.1013 -0.1546 -0.9803 0.1247
## Infant.Mortality
## 1.0784