POSCO AI Solution Challenge

Data ready

training_dataset.csv 를 분석을 위해 R session 으로 읽어들임

# Read the preprocessed training set and convert it to a tibble.
# `tbl_df()` is deprecated since dplyr 1.0.0; `as_tibble()` is the supported
# replacement and yields the same tibble classes.
# `data_path` is defined outside this chunk and is expected to end with a
# path separator, because it is concatenated with paste0().
d <- paste0(data_path, "preprocessed/training_dataset.csv") %>%
  fread() %>%
  as_tibble()

Minor data preprocessing

분석의 편의를 위해 소소한 전처리를 아래와 같이 수행

date 와 time 을 결합한 후 날짜시각 클래스로 관리하기 위하여 dt 필드로 변형
dt 로 날짜시각 클래스 파싱이 불가한 케이스의 제거¹
요일 및 월, 일, 시각 변수를 파생시킴 : wday, month, day, hour
계절변수를 파생시켜 season 필드 추가
swell 을 numeric 에서 명목화

# Derive date-time and calendar features, then drop the raw date/time columns.
#   dt     : `date` and `time` pasted together and parsed with ymd_h();
#            rows that fail to parse become NA and are removed by the filter
#   month, wday, day, hour : calendar components of `dt`
#   swell  : converted to a factor
#   season : months 3-5 spring, 6-9 summer, 10-11 autumn, 12/1/2 winter
d <- d %>%
  mutate(
    dt = paste(date, "_", time) %>% ymd_h(),
    month = lubridate::month(dt),
    # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
    wday = lubridate::wday(dt, label = TRUE),
    day = lubridate::day(dt),
    hour = lubridate::hour(dt),
    swell = as.factor(swell),
    # Reuse the `month` column created above instead of recomputing it
    # from `dt` for every branch.
    season = case_when(
      month %in% c(3, 4, 5) ~ "spring",
      month %in% c(6, 7, 8, 9) ~ "summer",
      month %in% c(10, 11) ~ "autumn",
      month %in% c(12, 1, 2) ~ "winter"
    ) %>%
      factor(levels = c("spring", "summer", "autumn", "winter"))
  ) %>%
  filter(!is.na(dt)) %>%
  select(-date, -time) %>%
  # Move dt to the first column.
  select(dt, everything())

EDA

Summary

필드별 값 현황 확인

# Per-column overview (ranges, quartiles, factor counts, NA counts).
d %>% summary()

##        dt                      swell      wind_speed     win_direction  
##  Min.   :2014-01-04 15:00:00   0:4882   Min.   : 0.000   Min.   :  0.0  
##  1st Qu.:2014-10-21 01:45:00   1:2578   1st Qu.: 4.400   1st Qu.: 44.0  
##  Median :2015-10-11 22:30:00            Median : 7.000   Median :131.0  
##  Mean   :2015-10-25 02:01:06            Mean   : 7.291   Mean   :155.3  
##  3rd Qu.:2016-10-08 10:15:00            3rd Qu.: 9.800   3rd Qu.:276.0  
##  Max.   :2017-12-28 06:00:00            Max.   :27.300   Max.   :360.0  
##                                         NA's   :461      NA's   :401    
##       GUST          air_pressure       humidity      temperature   
##  Min.   :  0.000   Min.   : 985.2   Min.   :22.00   Min.   :-5.70  
##  1st Qu.:  5.800   1st Qu.:1008.5   1st Qu.:62.00   1st Qu.: 7.90  
##  Median :  9.100   Median :1015.0   Median :77.00   Median :15.20  
##  Mean   :  9.406   Mean   :1015.0   Mean   :74.14   Mean   :14.27  
##  3rd Qu.: 12.500   3rd Qu.:1021.9   3rd Qu.:87.00   3rd Qu.:20.80  
##  Max.   :110.000   Max.   :1035.1   Max.   :99.00   Max.   :28.80  
##  NA's   :456       NA's   :136      NA's   :178     NA's   :149    
##  water_temperature max_wave_height  mean_wave_height avg_wave_height
##  Min.   :10.00     Min.   : 0.200   Min.   :0.100    Min.   :0.100  
##  1st Qu.:13.40     1st Qu.: 1.500   1st Qu.:0.900    1st Qu.:0.700  
##  Median :18.20     Median : 2.600   Median :1.700    Median :1.200  
##  Mean   :17.97     Mean   : 2.833   Mean   :1.793    Mean   :1.272  
##  3rd Qu.:22.30     3rd Qu.: 3.800   3rd Qu.:2.400    3rd Qu.:1.700  
##  Max.   :29.10     Max.   :11.700   Max.   :7.700    Max.   :5.500  
##  NA's   :81        NA's   :76       NA's   :86       NA's   :89     
##  wave_accurance   wave_direction      month         wday     
##  Min.   : 2.000   Min.   :  0.0   Min.   : 1.000   Sun: 956  
##  1st Qu.: 5.300   1st Qu.: 79.0   1st Qu.: 3.000   Mon:1153  
##  Median : 7.100   Median :182.0   Median : 7.000   Tue:1117  
##  Mean   : 7.054   Mean   :176.5   Mean   : 6.519   Wed:1190  
##  3rd Qu.: 9.100   3rd Qu.:258.0   3rd Qu.:10.000   Thu:1083  
##  Max.   :12.800   Max.   :360.0   Max.   :12.000   Fri: 962  
##  NA's   :130      NA's   :75                       Sat: 999  
##       day             hour          season    
##  Min.   : 1.00   Min.   : 0.00   spring:1454  
##  1st Qu.: 8.00   1st Qu.: 5.00   summer:2549  
##  Median :14.00   Median :11.00   autumn:1432  
##  Mean   :14.97   Mean   :11.05   winter:2025  
##  3rd Qu.:22.00   3rd Qu.:17.00                
##  Max.   :31.00   Max.   :23.00                
##

타겟변수인 swell 을 제외한 설명변수 중
연속형 변수들의 시계열 도표를 그려봄

# Long-format data for the time-series panels: drop the derived calendar
# columns, then stack every remaining column except dt and swell into
# (class, value) pairs.
pd1_1 <- gather(
  select(d, -c(month, wday, day, hour, season)),
  key = "class", value = "value", -dt, -swell
)

# One step-line panel per variable, coloured by swell, with free axis scales.
p1_1 <- ggplot(pd1_1, aes(x = dt, y = value, colour = swell)) +
  geom_step(stat = "identity") +
  facet_wrap(~ class, ncol = 2, scales = "free")

p1_1 + labs(title = "Time series plot by swell, class")

마찬가지로 시간별 분포는 어떠한지를 확인하기 위해 Boxplot 을 그려봄

# Add the hour of day as a factor so it can serve as the boxplot x axis.
# hour() is namespace-qualified, matching the lubridate:: calls used when the
# calendar features were derived: data.table (loaded for fread) also exports
# an `hour` function, so the unqualified name depends on package load order.
pd1_2 <- pd1_1 %>%
  mutate(hour = lubridate::hour(dt) %>% as.factor())

# Hourly distribution of every variable, split by swell, one panel per variable.
p1_2 <- pd1_2 %>%
  ggplot(aes(hour, value, color = swell)) +
  geom_boxplot(outlier.alpha = 0.1) +
  facet_wrap(~ class, ncol = 2, scales = "free")

p1_2 +
  ggtitle("Boxplot group by hour by swell, class") +
  labs(x = "hour")

마찬가지로 계절별 분포는 어떠한지 확인하기 위해 Boxplot 을 그려봄

# Long-format data for the seasonal boxplots: drop dt and the other calendar
# columns, then stack every remaining column except swell and season into
# (class, value) pairs.
pd1_3 <- gather(
  select(d, -c(month, wday, day, hour, dt)),
  key = "class", value = "value", -swell, -season
)

# Seasonal distribution of every variable, split by swell, one panel each.
p1_3 <- ggplot(pd1_3, aes(x = season, y = value, colour = swell)) +
  geom_boxplot(outlier.alpha = 0.1) +
  facet_wrap(~ class, ncol = 2, scales = "free")

p1_3 + labs(title = "Boxplot group by season by swell, class")

Boxplot 을 통한 swell 별 그리고 시간별, 계절별 패턴을 확인해 보면 패턴차이가 어느정도 존재하고, 이 패턴차를 이용하여 머신이 swell 을 구분하는 좋은 재료로 사용할 여지가 있어보임

독립변수간 상관관계를 확인하기 위해 상관행렬도를 그려봄

# Pairwise-complete correlation matrix of the remaining numeric predictors
# (target, direction columns and calendar features are excluded).
# NOTE: this reuses the name `pd1_3`, overwriting the long-format data that
# was built for the seasonal boxplots.
pd1_3 <- cor(
  select(
    d,
    -c(dt, swell, win_direction, wave_direction, month, wday, day, hour, season)
  ),
  use = "pairwise.complete.obs"
)

# Draw the matrix as squares, then overlay the coefficients on the lower
# triangle of the same plot (add = TRUE).
corrplot(pd1_3, method = "square", order = "AOE")
corrplot(
  pd1_3,
  method = "number", type = "lower", order = "AOE",
  add = TRUE, diag = FALSE, tl.pos = "n", cl.pos = "n"
)

상관행렬도를 볼 때 상식적이지만 최대파고높이와 평균파고높이간에는 강력한 양의 상관관계가 있으며 두 변수는 거의 동일한 패턴이라고 볼 수 있다.
마찬가지로 풍속과 GUST 간에도 상당히 높은 양의 상관관계가 있는데 이런 동일패턴의 독립변수가 있는것이 얼추 확인 가능하고 이들은 머신러닝 학습시 중복된 정보로 치부될 가능성이 높아보임 ²

Simple machine learning

간단한 머신러닝을 통해 학습 및 예측을 수행해 봄

Partition

학습 및 테스트 셋 분류

# Fix the RNG state so the random 70/30 partition (and the cross-validation
# that follows) is reproducible across runs; without it every knit produces
# different train/test sets and different scores.
set.seed(20180725)

# Modelling data: drop the timestamp and keep complete cases only.
fullTrainset <- d %>%
  select(-dt) %>%
  na.omit()

# 70/30 split on swell.
# FALSE instead of F: `F` is an ordinary variable that can be reassigned.
index <- createDataPartition(fullTrainset$swell, p = 0.7, list = FALSE)

# NOTE: the data frame `train` shares its name with caret's train() function;
# calls to train(...) still resolve to the function because R skips
# non-function objects when looking up a call.
train <- fullTrainset[index, ]
test <- fullTrainset[-index, ]

Training

학습모델은 mlMethods 를 통해 여러가지 알고리즘 모형을 적용하여 다양한 모델을 만들어 보고 상호 비교를 수행
하이퍼파라미터 최적 선정을 위한 validation 정책은 “5fold Cross Validation” 정책을 모든 알고리즘에 대해 통일하여 적용시킴

# Shared resampling policy: 5-fold cross-validation for every algorithm.
fitControl <- trainControl(method = "cv", number = 5, allowParallel = TRUE)

# caret method identifiers of the algorithms to compare.
mlMethods <- c("simpls", "glm", "qda", "rocc", "knn", "rpart", "wsrf")

# Fit one model per method on the full predictor set; the result is a list
# in the same order as mlMethods.
models <- lapply(
  mlMethods,
  function(method_name) {
    train(swell ~ ., data = train, method = method_name, trControl = fitControl)
  }
)

Evaluate

#' Score a fitted model on a hold-out set and print its confusion matrix.
#'
#' Prints a one-line header (method name, elapsed training time, total score)
#' followed by the confusionMatrix() report, with "1" as the positive class.
#'
#' The total score is computed from the 2x2 confusion table (rows =
#' prediction, columns = reference) as
#'   [1, 1] + 2 * [2, 2] - [2, 1] - 2 * [1, 2]
#' NOTE(review): this assumes the first factor level is the negative class
#' and the second is "1" -- confirm against the level order of the target.
#'
#' @param model   A fitted model exposing `$method` and `$times$everything`
#'                and usable with predict().
#' @param testset Data frame holding the predictors and the target column.
#' @param class   Name of the target column, as a single string.
#' @param ylim    Unused; kept only for backward compatibility. Its default
#'                is never evaluated.
#' @return The confusion matrix object, invisibly.
evaluate <- function(model, testset, class, ylim = c(1, testset %>% pull(class) %>% max)){
  stopifnot(
    is.character(class),
    length(class) == 1L,
    class %in% names(testset)
  )

  # `[[` looks the column up by the string held in `class`; pull(testset, class)
  # would pick a column literally named "class" first if one existed.
  real <- testset[[class]]
  pred <- predict(model, newdata = testset) %>% unlist()

  cm <- confusionMatrix(pred, real, positive = "1")

  # Extract the count table once instead of once per cell.
  counts <- as.matrix(cm)
  totalScore <- counts[1, 1] + counts[2, 2] * 2 - counts[2, 1] - counts[1, 2] * 2

  paste0("<< Method = ", model$method,
         " / Process time = ", model$times$everything[3] %>% round(3),
         " / Total score = ", totalScore, " >>\n") %>%
    cat()
  print(cm)

  invisible(cm)
}

혼동행렬 및 테스트셋 정답지를 이용한 점수계산(Total score)³ 결과

# Evaluate every full model on the hold-out set.
# seq_along() yields an empty sequence for empty input, whereas
# seq(length(x)) would yield c(1, 0) and index out of range.
for (i in seq_along(mlMethods)) {
  evaluate(models[[i]], test, "swell")
}

## << Method = simpls / Process time = 1.143 / Total score = 1611 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1199  204
##          1  158  489
##                                           
##                Accuracy : 0.8234          
##                  95% CI : (0.8062, 0.8397)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5989          
##  Mcnemar's Test P-Value : 0.01802         
##                                           
##             Sensitivity : 0.7056          
##             Specificity : 0.8836          
##          Pos Pred Value : 0.7558          
##          Neg Pred Value : 0.8546          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2385          
##    Detection Prevalence : 0.3156          
##       Balanced Accuracy : 0.7946          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = glm / Process time = 0.982 / Total score = 1991 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1239  129
##          1  118  564
##                                           
##                Accuracy : 0.8795          
##                  95% CI : (0.8646, 0.8933)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7297          
##  Mcnemar's Test P-Value : 0.5246          
##                                           
##             Sensitivity : 0.8139          
##             Specificity : 0.9130          
##          Pos Pred Value : 0.8270          
##          Neg Pred Value : 0.9057          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2751          
##    Detection Prevalence : 0.3327          
##       Balanced Accuracy : 0.8634          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = qda / Process time = 0.54 / Total score = 1951 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1209  124
##          1  148  569
##                                           
##                Accuracy : 0.8673          
##                  95% CI : (0.8519, 0.8817)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.706           
##  Mcnemar's Test P-Value : 0.1631          
##                                           
##             Sensitivity : 0.8211          
##             Specificity : 0.8909          
##          Pos Pred Value : 0.7936          
##          Neg Pred Value : 0.9070          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2776          
##    Detection Prevalence : 0.3498          
##       Balanced Accuracy : 0.8560          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = rocc / Process time = 1.796 / Total score = 1717 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1196  176
##          1  161  517
##                                           
##                Accuracy : 0.8356          
##                  95% CI : (0.8188, 0.8514)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6307          
##  Mcnemar's Test P-Value : 0.4457          
##                                           
##             Sensitivity : 0.7460          
##             Specificity : 0.8814          
##          Pos Pred Value : 0.7625          
##          Neg Pred Value : 0.8717          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2522          
##    Detection Prevalence : 0.3307          
##       Balanced Accuracy : 0.8137          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = knn / Process time = 1.683 / Total score = 1969 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1232  131
##          1  125  562
##                                         
##                Accuracy : 0.8751        
##                  95% CI : (0.86, 0.8891)
##     No Information Rate : 0.662         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.7204        
##  Mcnemar's Test P-Value : 0.7547        
##                                         
##             Sensitivity : 0.8110        
##             Specificity : 0.9079        
##          Pos Pred Value : 0.8180        
##          Neg Pred Value : 0.9039        
##              Prevalence : 0.3380        
##          Detection Rate : 0.2741        
##    Detection Prevalence : 0.3351        
##       Balanced Accuracy : 0.8594        
##                                         
##        'Positive' Class : 1             
##                                         
## << Method = rpart / Process time = 0.904 / Total score = 1947 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1207  124
##          1  150  569
##                                           
##                Accuracy : 0.8663          
##                  95% CI : (0.8508, 0.8808)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7041          
##  Mcnemar's Test P-Value : 0.131           
##                                           
##             Sensitivity : 0.8211          
##             Specificity : 0.8895          
##          Pos Pred Value : 0.7914          
##          Neg Pred Value : 0.9068          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2776          
##    Detection Prevalence : 0.3507          
##       Balanced Accuracy : 0.8553          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = wsrf / Process time = 44.04 / Total score = 2451 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1325   57
##          1   32  636
##                                          
##                Accuracy : 0.9566         
##                  95% CI : (0.9468, 0.965)
##     No Information Rate : 0.662          
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.9021         
##  Mcnemar's Test P-Value : 0.01096        
##                                          
##             Sensitivity : 0.9177         
##             Specificity : 0.9764         
##          Pos Pred Value : 0.9521         
##          Neg Pred Value : 0.9588         
##              Prevalence : 0.3380         
##          Detection Rate : 0.3102         
##    Detection Prevalence : 0.3259         
##       Balanced Accuracy : 0.9471         
##                                          
##        'Positive' Class : 1              
##

Case study

subset model : 시간적 설명변수를 모두 제거한 후 학습한 모델의 케이스

# Subset models: same algorithms, with every time-derived predictor
# (month, wday, day, hour, season) removed from the formula.
sub_models <- mlMethods %>%
  lapply(function(x) {
    train(
      swell ~ . - month - wday - day - hour - season,
      data = train, method = x, trControl = fitControl
    )
  })

# seq_along() yields an empty sequence for empty input, whereas
# seq(length(x)) would yield c(1, 0) and index out of range.
for (i in seq_along(mlMethods)) {
  evaluate(sub_models[[i]], test, "swell")
}

## << Method = simpls / Process time = 0.604 / Total score = 1599 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1199  207
##          1  158  486
##                                           
##                Accuracy : 0.822           
##                  95% CI : (0.8047, 0.8383)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5952          
##  Mcnemar's Test P-Value : 0.01199         
##                                           
##             Sensitivity : 0.7013          
##             Specificity : 0.8836          
##          Pos Pred Value : 0.7547          
##          Neg Pred Value : 0.8528          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2371          
##    Detection Prevalence : 0.3141          
##       Balanced Accuracy : 0.7924          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = glm / Process time = 0.605 / Total score = 1977 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1240  133
##          1  117  560
##                                           
##                Accuracy : 0.878           
##                  95% CI : (0.8631, 0.8919)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.726           
##  Mcnemar's Test P-Value : 0.3428          
##                                           
##             Sensitivity : 0.8081          
##             Specificity : 0.9138          
##          Pos Pred Value : 0.8272          
##          Neg Pred Value : 0.9031          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2732          
##    Detection Prevalence : 0.3302          
##       Balanced Accuracy : 0.8609          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = qda / Process time = 0.489 / Total score = 1971 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1211  120
##          1  146  573
##                                           
##                Accuracy : 0.8702          
##                  95% CI : (0.8549, 0.8845)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7127          
##  Mcnemar's Test P-Value : 0.1253          
##                                           
##             Sensitivity : 0.8268          
##             Specificity : 0.8924          
##          Pos Pred Value : 0.7969          
##          Neg Pred Value : 0.9098          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2795          
##    Detection Prevalence : 0.3507          
##       Balanced Accuracy : 0.8596          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = rocc / Process time = 1.218 / Total score = 1601 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1246  230
##          1  111  463
##                                           
##                Accuracy : 0.8337          
##                  95% CI : (0.8168, 0.8495)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.612           
##  Mcnemar's Test P-Value : 1.658e-10       
##                                           
##             Sensitivity : 0.6681          
##             Specificity : 0.9182          
##          Pos Pred Value : 0.8066          
##          Neg Pred Value : 0.8442          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2259          
##    Detection Prevalence : 0.2800          
##       Balanced Accuracy : 0.7932          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = knn / Process time = 1.042 / Total score = 1919 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1233  144
##          1  124  549
##                                           
##                Accuracy : 0.8693          
##                  95% CI : (0.8539, 0.8836)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7058          
##  Mcnemar's Test P-Value : 0.2458          
##                                           
##             Sensitivity : 0.7922          
##             Specificity : 0.9086          
##          Pos Pred Value : 0.8158          
##          Neg Pred Value : 0.8954          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2678          
##    Detection Prevalence : 0.3283          
##       Balanced Accuracy : 0.8504          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = rpart / Process time = 0.696 / Total score = 1947 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1207  124
##          1  150  569
##                                           
##                Accuracy : 0.8663          
##                  95% CI : (0.8508, 0.8808)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7041          
##  Mcnemar's Test P-Value : 0.131           
##                                           
##             Sensitivity : 0.8211          
##             Specificity : 0.8895          
##          Pos Pred Value : 0.7914          
##          Neg Pred Value : 0.9068          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2776          
##    Detection Prevalence : 0.3507          
##       Balanced Accuracy : 0.8553          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = wsrf / Process time = 28.352 / Total score = 2403 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1317   65
##          1   40  628
##                                           
##                Accuracy : 0.9488          
##                  95% CI : (0.9383, 0.9579)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8845          
##  Mcnemar's Test P-Value : 0.01917         
##                                           
##             Sensitivity : 0.9062          
##             Specificity : 0.9705          
##          Pos Pred Value : 0.9401          
##          Neg Pred Value : 0.9530          
##              Prevalence : 0.3380          
##          Detection Rate : 0.3063          
##    Detection Prevalence : 0.3259          
##       Balanced Accuracy : 0.9384          
##                                           
##        'Positive' Class : 1               
##

Subset model 과 Full model 과 비교해 볼 때
Total score 가 미세하지만 상대적으로 떨어지는 경향이 보이므로, 파생시킬 수 있는 시간적 설명변수를 최대한 고려해 보는게 유리할 것으로 판단됨

Advanced analysis

분산팽창지수(VIF) 체크

편의상 설명변수 중 연속형 변수간 다중공선성을 분산팽창지수를 통해 확인

# Data for the VIF check: continuous predictors only, with swell as a numeric
# response so lm() can be fitted.
# as.numeric() on a factor returns the internal level codes (1, 2, ...), not
# the labels; converting through as.character() recovers the original 0/1
# values. VIF depends only on the predictors, so the reported values do not
# change.
forVif <- fullTrainset %>%
  select(-month, -day, -wday, -hour, -season) %>%
  mutate(swell = as.numeric(as.character(swell)))

# Step 0: VIF of every continuous predictor.
lm(swell ~ ., data = forVif) %>% vif()

##        wind_speed     win_direction              GUST      air_pressure 
##         17.353791          1.264370         19.544251          2.751693 
##          humidity       temperature water_temperature   max_wave_height 
##          2.116257         10.024319          5.898366         24.514069 
##  mean_wave_height   avg_wave_height    wave_accurance    wave_direction 
##        458.705780        453.460356          2.470627          1.041055

# Step 1: VIF after dropping mean_wave_height.
vif(lm(swell ~ . - mean_wave_height, data = forVif))

##        wind_speed     win_direction              GUST      air_pressure 
##         17.343611          1.264325         19.526862          2.751671 
##          humidity       temperature water_temperature   max_wave_height 
##          2.116257         10.021081          5.897988         24.059268 
##   avg_wave_height    wave_accurance    wave_direction 
##         24.750127          2.469571          1.040679

# Step 2: VIF after also dropping avg_wave_height.
vif(lm(swell ~ . - mean_wave_height - avg_wave_height, data = forVif))

##        wind_speed     win_direction              GUST      air_pressure 
##         17.263232          1.258364         19.523719          2.747348 
##          humidity       temperature water_temperature   max_wave_height 
##          2.116230         10.015590          5.897917          3.605369 
##    wave_accurance    wave_direction 
##          2.375875          1.040643

# Step 3: VIF after also dropping GUST.
vif(lm(swell ~ . - mean_wave_height - avg_wave_height - GUST, data = forVif))

##        wind_speed     win_direction      air_pressure          humidity 
##          2.044936          1.256764          2.692634          2.095751 
##       temperature water_temperature   max_wave_height    wave_accurance 
##          9.466656          5.610249          3.407452          2.347677 
##    wave_direction 
##          1.030944

3번의 스텝을 통해 분산팽창지수가 10 이하인 변수만을 선별할 경우 mean_wave_height, avg_wave_height, GUST 변수가 탈락됨
따라서 본 3개의 변수를 제외시킨 Subset model 로 예측성능을 확인해 보면 아래와 같음

# Subset models after the VIF screening: same algorithms, with
# mean_wave_height, avg_wave_height and GUST removed from the formula.
# NOTE: this overwrites the `sub_models` list fitted for the time-omitted case.
sub_models <- mlMethods %>%
  lapply(function(x) {
    train(
      swell ~ . - mean_wave_height - avg_wave_height - GUST,
      data = train, method = x, trControl = fitControl
    )
  })

# seq_along() yields an empty sequence for empty input, whereas
# seq(length(x)) would yield c(1, 0) and index out of range.
for (i in seq_along(mlMethods)) {
  evaluate(sub_models[[i]], test, "swell")
}

## << Method = simpls / Process time = 0.645 / Total score = 1609 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1198  204
##          1  159  489
##                                           
##                Accuracy : 0.8229          
##                  95% CI : (0.8057, 0.8392)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.598           
##  Mcnemar's Test P-Value : 0.02092         
##                                           
##             Sensitivity : 0.7056          
##             Specificity : 0.8828          
##          Pos Pred Value : 0.7546          
##          Neg Pred Value : 0.8545          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2385          
##    Detection Prevalence : 0.3161          
##       Balanced Accuracy : 0.7942          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = glm / Process time = 0.69 / Total score = 1999 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1239  127
##          1  118  566
##                                           
##                Accuracy : 0.8805          
##                  95% CI : (0.8657, 0.8942)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7321          
##  Mcnemar's Test P-Value : 0.6093          
##                                           
##             Sensitivity : 0.8167          
##             Specificity : 0.9130          
##          Pos Pred Value : 0.8275          
##          Neg Pred Value : 0.9070          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2761          
##    Detection Prevalence : 0.3337          
##       Balanced Accuracy : 0.8649          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = qda / Process time = 0.557 / Total score = 1939 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1209  127
##          1  148  566
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8503, 0.8803)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7025          
##  Mcnemar's Test P-Value : 0.2278          
##                                           
##             Sensitivity : 0.8167          
##             Specificity : 0.8909          
##          Pos Pred Value : 0.7927          
##          Neg Pred Value : 0.9049          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2761          
##    Detection Prevalence : 0.3483          
##       Balanced Accuracy : 0.8538          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = rocc / Process time = 1.372 / Total score = 1717 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1196  176
##          1  161  517
##                                           
##                Accuracy : 0.8356          
##                  95% CI : (0.8188, 0.8514)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6307          
##  Mcnemar's Test P-Value : 0.4457          
##                                           
##             Sensitivity : 0.7460          
##             Specificity : 0.8814          
##          Pos Pred Value : 0.7625          
##          Neg Pred Value : 0.8717          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2522          
##    Detection Prevalence : 0.3307          
##       Balanced Accuracy : 0.8137          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = knn / Process time = 1.445 / Total score = 1945 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1234  138
##          1  123  555
##                                           
##                Accuracy : 0.8727          
##                  95% CI : (0.8575, 0.8868)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.714           
##  Mcnemar's Test P-Value : 0.3862          
##                                           
##             Sensitivity : 0.8009          
##             Specificity : 0.9094          
##          Pos Pred Value : 0.8186          
##          Neg Pred Value : 0.8994          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2707          
##    Detection Prevalence : 0.3307          
##       Balanced Accuracy : 0.8551          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = rpart / Process time = 0.88 / Total score = 1947 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1207  124
##          1  150  569
##                                           
##                Accuracy : 0.8663          
##                  95% CI : (0.8508, 0.8808)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7041          
##  Mcnemar's Test P-Value : 0.131           
##                                           
##             Sensitivity : 0.8211          
##             Specificity : 0.8895          
##          Pos Pred Value : 0.7914          
##          Neg Pred Value : 0.9068          
##              Prevalence : 0.3380          
##          Detection Rate : 0.2776          
##    Detection Prevalence : 0.3507          
##       Balanced Accuracy : 0.8553          
##                                           
##        'Positive' Class : 1               
##                                           
## << Method = wsrf / Process time = 37.078 / Total score = 2445 >>
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1326   59
##          1   31  634
##                                           
##                Accuracy : 0.9561          
##                  95% CI : (0.9463, 0.9646)
##     No Information Rate : 0.662           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9009          
##  Mcnemar's Test P-Value : 0.004427        
##                                           
##             Sensitivity : 0.9149          
##             Specificity : 0.9772          
##          Pos Pred Value : 0.9534          
##          Neg Pred Value : 0.9574          
##              Prevalence : 0.3380          
##          Detection Rate : 0.3093          
##    Detection Prevalence : 0.3244          
##       Balanced Accuracy : 0.9460          
##                                           
##        'Positive' Class : 1               
##

Fullset 에 비해 큰 차이가 없음
이 말은 역으로 생각하면 mean_wave_height, avg_wave_height, GUST 설명변수를 포함하든 그렇지 않든 예측성능에 큰 영향력이 없다는 것으로 볼 수 있으며,
따라서 모형의 복잡도를 낮추는 측면에서 패턴이 비슷한 3개의 설명변수를 학습셋에 고려하지 않는 편이 유리할 수 있음

Models Evaluation table

# Summary table of total scores per algorithm for the three model variants.
# NOTE(review): the scores are hard-coded and differ from the "Total score"
# values printed in the evaluation output earlier in this report (e.g. simpls
# full model 1585 here vs 1611 printed) -- presumably taken from a different
# random partition; confirm before relying on them.
evaluationRes <- data.frame(
  Method = c("simpls", "glm", "qda", "rocc", "knn", "rpart", "wsrf"),
  Method_Fullname = c(
    "Partial Least Squares",
    "Generalized Linear Model",
    "Quadratic Discriminant Analysis",
    "ROC-Based Classifier",
    "k-Nearest Neighbors",
    "CART",
    "Weighted Subspace Random Forest"
  ),
  FullModel = c(1585, 1949, 1991, 1589, 1983, 1879, 2419),
  SubsetModel_timeOmit = c(1575, 1963, 2061, 1589, 1947, 1879, 2351),
  SubsetModel_afterVIF = c(1581, 1947, 1975, 1589, 1973, 1879, 2441)
)

# Interactive table; the FullModel column gets a proportional colour bar.
datatable(evaluationRes, options = list(pageLength = 30)) %>%
  formatStyle(
    "FullModel",
    background = styleColorBar(evaluationRes$FullModel, "steelblue")
  )

Reference

우리나라 사계절 개시일과 지속기간

time 이 25~30 인 경우가 존재하는데 이는 체크가 필요함↩
이 때문에 분산팽창지수 검정을 추가로 시도해 볼 것임↩
우리과제의 평가정책에 의한 점수↩

POSCO AI Solution Challenge - EDA

Ntels BI Song (hjsong@ntels.com)

2018-07-25