1.1 SVM
1.2 Hyperplane
1.3 Data Analysis
1.3.1 데이터 불러오기
## [1] "C:/Users/Administrator/Desktop/R Analysis"
## 'data.frame': 34139 obs. of 11 variables:
## $ prod_no : chr "90784-76001" "90784-76001" "90784-76001" "90784-76001" ...
## $ fix_time : num 85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
## $ a_speed : num 0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
## $ b_speed : num 1.72 1.71 1.72 1.72 1.7 ...
## $ separation : num 242 245 243 242 242 ...
## $ s_separation : num 658 657 658 657 657 ...
## $ rate_terms : int 95 95 95 95 95 95 95 95 95 95 ...
## $ mpa : num 78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
## $ load_time : num 18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
## $ highpressure_time: int 58 58 82 74 56 78 55 57 50 60 ...
## $ c_thickness : num 24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
#--------------------------------------------------------------
# Check for missing values: count the NA entries in every column
#----------------------------------------------------------------
autopart %>%
  is.na() %>%
  colSums()
## prod_no fix_time a_speed b_speed
## 0 0 0 0
## separation s_separation rate_terms mpa
## 0 0 0 0
## load_time highpressure_time c_thickness
## 0 0 0
#--------------------------------------------------------------
# Boxplot - extreme outliers are present, so only rows with values
# below 1000 are kept further down
#----------------------------------------------------------------
boxplot(autopart[, 2:11])
#--------------------------------------------------------------
# Keep only the rows for product 90784-76001
#----------------------------------------------------------------
unique(autopart$prod_no)
## [1] "90784-76001" "45231-3B660" "45231-3B641" "45231-3B610" "45231-P3B750"
## [6] "45231-3B400"
# All three conditions must hold for a row to be kept.
df_auto <- autopart %>%
  filter(
    prod_no == "90784-76001",
    c_thickness < 1000,
    highpressure_time < 1000
  )
boxplot(df_auto[, 2:11])
#--------------------------------------------------------------
# Define Y: derive a new variable from c_thickness
# (1 when c_thickness is below 20 or above 32, otherwise 0)
#----------------------------------------------------------------
df_auto$target <- with(
  df_auto,
  ifelse(c_thickness < 20 | c_thickness > 32, 1, 0)
)
table(df_auto$target)
##
## 0 1
## 18921 2836
#--------------------------------------------------------------
# Data clean-up: drop prod_no, convert target to a factor, and drop
# c_thickness because the target was derived from it
#----------------------------------------------------------------
df_auto <- df_auto %>%
  select(-prod_no, -c_thickness) %>%
  mutate(target = as.factor(target))

# 1.3.2 Splitting the data
# Fixed seed so the 70/30 split is reproducible.
set.seed(2200)
# Sorted row indices of the 70% training sample.
flag <- sort(sample(nrow(df_auto), nrow(df_auto) * 0.7))
train <- df_auto[flag, ]
test <- df_auto[-flag, ]
# Repeated cross-validation (5 repeats) used to evaluate the model.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Linear-kernel SVM on all predictors; predictors are centered and scaled.
svm_fit <- train(
  target ~ .,
  data = train,
  method = "svmLinear",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Accuracy"
)
svm_fit
## Support Vector Machines with Linear Kernel
##
## 15229 samples
## 9 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 13706, 13706, 13706, 13706, 13706, 13706, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9034079 0.4581467
##
## Tuning parameter 'C' was held constant at a value of 1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5638 510
## 1 72 308
##
## Accuracy : 0.9108
## 95% CI : (0.9037, 0.9177)
## No Information Rate : 0.8747
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4722
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9874
## Specificity : 0.3765
## Pos Pred Value : 0.9170
## Neg Pred Value : 0.8105
## Prevalence : 0.8747
## Detection Rate : 0.8637
## Detection Prevalence : 0.9418
## Balanced Accuracy : 0.6820
##
## 'Positive' Class : 0
##
# install.packages("Epi")
library(Epi)
# ROC curve with AUC for the SVM predictions on the test set.
# NOTE(review): pred_svm_fit is not created in the visible code; presumably
# it holds predict(svm_fit, newdata = test) - confirm.
ROC(
  test = pred_svm_fit,
  stat = test$target,
  plot = "ROC",
  AUC = TRUE,
  main = "SVM"
)

# 1.3.3 Predicting new data
# Single hypothetical observation; target is unknown, hence NA.
new.data <- data.frame(
  fix_time = 87,
  a_speed = 0.609,
  b_speed = 1.715,
  separation = 242.7,
  s_separation = 657.5,
  rate_terms = 95,
  mpa = 78,
  load_time = 18.1,
  highpressure_time = 82,
  target = NA
)
predict(svm_fit, newdata = new.data)
## [1] 0
## Levels: 0 1
2.1 Support Vector Regression Model
c_thickness 연속형 변수의 예측 모델
## prod_no fix_time a_speed b_speed separation s_separation rate_terms mpa
## 1 90784-76001 85.5 0.611 1.715 242.0 657.6 95 78.2
## 2 90784-76001 86.2 0.606 1.708 244.7 657.1 95 77.9
## 3 90784-76001 86.0 0.609 1.715 242.7 657.5 95 78.0
## 4 90784-76001 86.1 0.610 1.718 241.9 657.3 95 78.2
## 5 90784-76001 86.1 0.603 1.704 242.5 657.3 95 77.9
## 6 90784-76001 86.3 0.606 1.707 244.5 656.9 95 77.9
## load_time highpressure_time c_thickness
## 1 18.1 58 24.7
## 2 18.2 58 22.5
## 3 18.1 82 24.1
## 4 18.1 74 25.1
## 5 18.2 56 24.5
## 6 18.0 78 22.9
##
## Call:
## svm(formula = c_thickness ~ ., data = train, gamma = 2, cost = 16)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 16
## gamma: 2
## epsilon: 0.1
##
##
## Number of Support Vectors: 2895
## RMSE Rsquared MAE
## 1.2528912 0.9030114 0.4062829
#----------------------------------------------------------------
# Modelling - multiple linear regression
#-------------------------------------------------------------
# Regress c_thickness on every other column of train.
lm_fit <- lm(c_thickness ~ ., data = train)
summary(lm_fit)
##
## Call:
## lm(formula = c_thickness ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.7653 -0.6136 -0.0209 0.5711 29.3171
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.061e+02 4.012e+00 175.974 <2e-16 ***
## fix_time 7.028e-02 6.215e-03 11.308 <2e-16 ***
## a_speed -1.695e+01 4.949e-01 -34.260 <2e-16 ***
## b_speed 1.995e+00 1.793e-01 11.132 <2e-16 ***
## separation -7.509e-01 4.334e-03 -173.254 <2e-16 ***
## s_separation -7.380e-01 4.381e-03 -168.449 <2e-16 ***
## rate_terms 9.898e-03 4.230e-03 2.340 0.0193 *
## mpa -1.523e-01 1.723e-03 -88.365 <2e-16 ***
## load_time -1.721e-01 9.576e-03 -17.973 <2e-16 ***
## highpressure_time -1.267e-05 1.028e-05 -1.233 0.2177
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.772 on 15226 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.7823
## F-statistic: 6084 on 9 and 15226 DF, p-value: < 2.2e-16
## RMSE Rsquared MAE
## 1.8528011 0.7874622 0.9664274
| — | RMSE | Rsquared | MAE |
|---|---|---|---|
| SVM | 1.1226402 | 0.9163979 | 0.3835772 |
| linear | 1.8068063 | 0.7824639 | 0.9443125 |
# Two panels side by side: predictions against the observed c_thickness.
par(mfrow = c(1, 2))
plot(
  x = test$c_thickness,
  pred_svm_reg_fit,
  main = "SVM"
)
plot(
  x = test$c_thickness,
  pred_lm,
  main = "Linear"
)
#----------------------------------------------------------------
# Selecting features
#-------------------------------------------------------------
# Same product and target rule as before (1 when c_thickness < 20 or > 32).
# NOTE(review): only c_thickness is capped at 1000 here; highpressure_time
# is not filtered - confirm this is intended.
df_logit <- autopart %>%
  filter(prod_no == "90784-76001") %>%
  mutate(
    target = as.factor(ifelse(c_thickness < 20 | c_thickness > 32, 1, 0))
  ) %>%
  filter(c_thickness < 1000) %>%
  select(-prod_no, -c_thickness)
# Class proportions of the target.
prop.table(table(df_logit$target))
##
## 0 1
## 0.8694354 0.1305646
#----------------------------------------------------------------
# Train/Test split (70% of the rows go to train, the rest to test)
#-------------------------------------------------------------
set.seed(2222)
flag <- sort(sample(nrow(df_logit), nrow(df_logit) * 0.7))
train <- df_logit[flag, ]
test <- df_logit[-flag, ]
#----------------------------------------------------------------
# Modelling - logistic regression of target on all other columns
#------------------------------------------------------------
logit_reg <- glm(target ~ ., data = train, family = binomial(logit))
summary(logit_reg)
##
## Call:
## glm(formula = target ~ ., family = binomial(logit), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3977 -0.3739 -0.2169 -0.1211 5.2106
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.545e+02 1.207e+01 -37.661 < 2e-16 ***
## fix_time -3.382e-02 1.096e-02 -3.085 0.002034 **
## a_speed 1.852e+01 1.127e+00 16.437 < 2e-16 ***
## b_speed -1.972e+00 4.753e-01 -4.148 3.36e-05 ***
## separation 5.364e-01 1.354e-02 39.628 < 2e-16 ***
## s_separation 5.022e-01 1.326e-02 37.871 < 2e-16 ***
## rate_terms -3.209e-02 8.381e-03 -3.829 0.000129 ***
## mpa -1.406e-01 4.032e-03 -34.867 < 2e-16 ***
## load_time -5.116e-03 1.875e-02 -0.273 0.784955
## highpressure_time 2.018e-04 1.864e-05 10.825 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11816.9 on 15235 degrees of freedom
## Residual deviance: 7001.4 on 15226 degrees of freedom
## AIC: 7021.4
##
## Number of Fisher Scoring iterations: 6
#----------------------------------------------------------------
# Classification cutoff: fitted probability of 0.5 or more counts as 1
#------------------------------------------------------------
logit_table <- ifelse(logit_reg$fitted.values >= 0.5, 1, 0)
# Cross-tabulate observed against predicted classes on the training data.
table(real = train$target, predict = logit_table)
## predict
## real 0 1
## 0 13015 229
## 1 1092 900
#----------------------------------------------------------------
# Prediction on the test set
#------------------------------------------------------------
# type = "response" returns predicted probabilities.
pred_logit_reg <- predict(logit_reg, newdata = test, type = "response")
ROC(
  test = pred_logit_reg,
  stat = test$target,
  plot = "ROC"
)
## 'data.frame': 8143 obs. of 7 variables:
## $ date : chr "2015-02-04 17:51:00" "2015-02-04 17:51:59" "2015-02-04 17:53:00" "2015-02-04 17:54:00" ...
## $ Temperature : num 23.2 23.1 23.1 23.1 23.1 ...
## $ Humidity : num 27.3 27.3 27.2 27.2 27.2 ...
## $ Light : num 426 430 426 426 426 ...
## $ CO2 : num 721 714 714 708 704 ...
## $ HumidityRatio: num 0.00479 0.00478 0.00478 0.00477 0.00476 ...
## $ Occupancy : int 1 1 1 1 1 1 1 1 1 1 ...
# Use every column of occu except date as the training data.
train <- occu %>%
  select(-date)
#----------------------------------------------------------------
# Modelling - regression
#------------------------------------------------------------
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# NOTE(review): Occupancy is still an integer column here and the metric is
# RMSE, so this presumably fits a regression rather than a logistic model
# despite the name logit2 - confirm the intent.
logit2 <- train(
  Occupancy ~ .,
  data = train,
  method = "glm",
  trControl = ctrl,
  metric = "RMSE"
)
#----------------------------------------------------------------
# Test data
#------------------------------------------------------------
test <- read.csv("occupancy_test.csv")
# Predictions of the glm model for the test rows, then the ROC curve
# against the observed Occupancy.
pred_test <- predict(logit2, test)
ROC(test = pred_test, stat = test$Occupancy, plot = "ROC")

# SVM classifier: test-set predictions, accuracy, and ROC/AUC
train <- read.csv("occupancy_train.csv")
test <- read.csv("occupancy_test.csv")
#----------------------------------------------------------------
# Factor conversion and feature selection
#------------------------------------------------------------
# The outcome must be a factor for classification.
train$Occupancy <- as.factor(train$Occupancy)
test$Occupancy <- as.factor(test$Occupancy)
# Drop the first column (presumably the date column - confirm).
train <- train[, -1]
test <- test[, -1]
#----------------------------------------------------------------
# SVM modelling
#------------------------------------------------------------
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Linear-kernel SVM on centered and scaled predictors.
svm_fit <- train(
  Occupancy ~ .,
  data = train,
  method = "svmLinear",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Accuracy"
)
svm_fit
## Support Vector Machines with Linear Kernel
##
## 8143 samples
## 5 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 7328, 7329, 7328, 7330, 7329, 7329, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9861476 0.9592325
##
## Tuning parameter 'C' was held constant at a value of 1
#----------------------------------------------------------------
# Predict on the test set and build the confusion matrix
#------------------------------------------------------------
pred_svm <- predict(svm_fit, newdata = test)
confusionMatrix(data = pred_svm, reference = test$Occupancy)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1639 3
## 1 54 969
##
## Accuracy : 0.9786
## 95% CI : (0.9724, 0.9838)
## No Information Rate : 0.6353
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9544
##
## Mcnemar's Test P-Value : 3.528e-11
##
## Sensitivity : 0.9681
## Specificity : 0.9969
## Pos Pred Value : 0.9982
## Neg Pred Value : 0.9472
## Prevalence : 0.6353
## Detection Rate : 0.6150
## Detection Prevalence : 0.6161
## Balanced Accuracy : 0.9825
##
## 'Positive' Class : 0
##
#----------------------------------------------------------------
# ROC
#------------------------------------------------------------
# NOTE(review): pred_svm holds predicted class labels rather than scores,
# so the curve presumably has a single cut point - confirm this is intended.
ROC(test = pred_svm, stat = test$Occupancy, plot = "ROC")

# Result
L1 (Lasso) + L2 (Ridge) 제약식을 사용하는 logistic 회귀 사용
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Regularized logistic regression (caret method "regLogistic") on centered
# and scaled predictors, selected by accuracy.
reg_fit <- train(
  Occupancy ~ .,
  data = train,
  method = "regLogistic",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Accuracy"
)
reg_fit
## Regularized Logistic Regression
##
## 8143 samples
## 5 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 7329, 7329, 7328, 7329, 7329, 7328, ...
## Resampling results across tuning parameters:
##
## cost loss epsilon Accuracy Kappa
## 0.5 L1 0.001 0.9860248 0.9588660
## 0.5 L1 0.010 0.9860985 0.9590744
## 0.5 L1 0.100 0.9875724 0.9634957
## 0.5 L2_dual 0.001 0.9860248 0.9588660
## 0.5 L2_dual 0.010 0.9860248 0.9588660
## 0.5 L2_dual 0.100 0.9860248 0.9588660
## 0.5 L2_primal 0.001 0.9860248 0.9588660
## 0.5 L2_primal 0.010 0.9860002 0.9587943
## 0.5 L2_primal 0.100 0.9861230 0.9591648
## 1.0 L1 0.001 0.9860248 0.9588660
## 1.0 L1 0.010 0.9860739 0.9590050
## 1.0 L1 0.100 0.9874985 0.9632924
## 1.0 L2_dual 0.001 0.9860248 0.9588660
## 1.0 L2_dual 0.010 0.9860248 0.9588660
## 1.0 L2_dual 0.100 0.9860248 0.9588660
## 1.0 L2_primal 0.001 0.9860248 0.9588660
## 1.0 L2_primal 0.010 0.9860002 0.9587943
## 1.0 L2_primal 0.100 0.9861230 0.9591652
## 2.0 L1 0.001 0.9860248 0.9588660
## 2.0 L1 0.010 0.9860739 0.9590050
## 2.0 L1 0.100 0.9874002 0.9629941
## 2.0 L2_dual 0.001 0.9860002 0.9587943
## 2.0 L2_dual 0.010 0.9860002 0.9587943
## 2.0 L2_dual 0.100 0.9860002 0.9587943
## 2.0 L2_primal 0.001 0.9860002 0.9587943
## 2.0 L2_primal 0.010 0.9860248 0.9588660
## 2.0 L2_primal 0.100 0.9861230 0.9591652
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 0.5, loss = L1 and epsilon
## = 0.1.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1638 3
## 1 55 969
##
## Accuracy : 0.9782
## 95% CI : (0.972, 0.9834)
## No Information Rate : 0.6353
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9536
##
## Mcnemar's Test P-Value : 2.133e-11
##
## Sensitivity : 0.9675
## Specificity : 0.9969
## Pos Pred Value : 0.9982
## Neg Pred Value : 0.9463
## Prevalence : 0.6353
## Detection Rate : 0.6146
## Detection Prevalence : 0.6158
## Balanced Accuracy : 0.9822
##
## 'Positive' Class : 0
##
iiiㅁ{r}