1 온,습도,조도,CO2농도에 따른 객실의 사용유무 판별


# packages
library("tidyverse") 
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("caret")
## 필요한 패키지를 로딩중입니다: lattice
## 
## 다음의 패키지를 부착합니다: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library("e1071")

1.1 데이터 EDA 수행 후, 분석가 입장에서 의미있는 탐색

rdata <- read.csv("problem1.csv")

str(rdata) # dim 17910 7
## 'data.frame':    17910 obs. of  7 variables:
##  $ date         : chr  "2015-02-02 14:19:59" "2015-02-02 14:22:00" "2015-02-02 14:23:00" "2015-02-02 14:23:59" ...
##  $ Temperature  : num  23.7 23.7 23.8 23.8 23.8 ...
##  $ Humidity     : num  26.3 26.1 26.2 26.3 26.3 ...
##  $ Light        : num  578 494 489 569 509 ...
##  $ CO2          : num  760 775 779 790 797 ...
##  $ HumidityRatio: num  0.00477 0.00474 0.00477 0.00478 0.00478 ...
##  $ Occupancy    : int  1 1 1 1 1 1 1 1 1 1 ...
summary(rdata) # NA 21
##      date            Temperature       Humidity         Light        
##  Length:17910       Min.   :19.00   Min.   :16.75   Min.   : -99.00  
##  Class :character   1st Qu.:20.10   1st Qu.:24.39   1st Qu.:   0.00  
##  Mode  :character   Median :20.60   Median :27.20   Median :   0.00  
##                     Mean   :20.75   Mean   :27.59   Mean   :  78.16  
##                     3rd Qu.:21.20   3rd Qu.:31.29   3rd Qu.:  22.00  
##                     Max.   :24.41   Max.   :39.50   Max.   :1581.00  
##                                                                      
##       CO2         HumidityRatio        Occupancy     
##  Min.   : 412.8   Min.   :0.002674   Min.   :0.0000  
##  1st Qu.: 453.0   1st Qu.:0.003702   1st Qu.:0.0000  
##  Median : 532.7   Median :0.004222   Median :0.0000  
##  Mean   : 647.7   Mean   :0.004175   Mean   :0.1173  
##  3rd Qu.: 722.0   3rd Qu.:0.004790   3rd Qu.:0.0000  
##  Max.   :2076.5   Max.   :0.006461   Max.   :1.0000  
##  NA's   :21
# Recode the 0/1 occupancy flag as a labelled factor:
# "0" -> "비어있음" (empty), "1" -> "사용중" (occupied).
rdata[["Occupancy"]] <- factor(
  rdata[["Occupancy"]],
  levels = c("0", "1"),
  labels = c("비어있음", "사용중")
)

1.2 결측치를 대체하는 방식 선택하고 근거제시, 대체 수행

summary(rdata$CO2) # 정규분포가 아니어서 평균값, 최빈값으로 대체하기가 곤란
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   412.8   453.0   532.7   647.7   722.0  2076.5      21
boxplot(rdata$CO2)

# Share of rows with at least one missing value, i.e. exactly the rows that
# na.omit() removes below. 21 of 17910 rows is roughly 0.12% of the data,
# small enough that the incomplete rows are dropped rather than imputed.
na.ratio <- mean(!complete.cases(rdata))
na.ratio
## [1] 0.001172529
rdata <- na.omit(rdata)
sum(is.na(rdata))
## [1] 0
summary(rdata)
##      date            Temperature       Humidity         Light        
##  Length:17889       Min.   :19.00   Min.   :16.75   Min.   : -99.00  
##  Class :character   1st Qu.:20.10   1st Qu.:24.39   1st Qu.:   0.00  
##  Mode  :character   Median :20.60   Median :27.20   Median :   0.00  
##                     Mean   :20.75   Mean   :27.59   Mean   :  78.22  
##                     3rd Qu.:21.20   3rd Qu.:31.29   3rd Qu.:  22.00  
##                     Max.   :24.41   Max.   :39.50   Max.   :1581.00  
##       CO2         HumidityRatio         Occupancy    
##  Min.   : 412.8   Min.   :0.002674   비어있음:15790  
##  1st Qu.: 453.0   1st Qu.:0.003702   사용중  : 2099  
##  Median : 532.7   Median :0.004222                   
##  Mean   : 647.7   Mean   :0.004174                   
##  3rd Qu.: 722.0   3rd Qu.:0.004791                   
##  Max.   :2076.5   Max.   :0.006461

1.3 추가적으로 데이터의 질 및 품질관리를 향상시킬만한 내용 작성

# Min-max scaling: the numeric variables are on different scales, so each one
# is rescaled to the [0, 1] interval.
#
# Arguments:
#   x      numeric vector to rescale.
#   na.rm  if TRUE (default), missing values are ignored when computing the
#          range and remain NA in the result; if FALSE, any NA makes the whole
#          result NA (the previous behaviour).
# Returns:
#   A numeric vector of the same length as x. Empty or all-NA input is returned
#   unchanged (as numeric). A constant input is mapped to zeros instead of the
#   NaN that a 0 / 0 division would produce.
normalize <- function(x, na.rm = TRUE) {
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # span is NA only when na.rm = FALSE and x contains NA; fall through so the
  # arithmetic below propagates NA as before.
  if (!is.na(span) && span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - rng[1]) / span
}

# Rescale every numeric sensor column to [0, 1] with normalize(), then print
# the summary to check that each column now runs from 0 to 1.
# NOTE(review): the earlier summary shows Light with a minimum of -99, which
# looks like a sensor error code rather than a real reading - confirm, and
# consider cleaning it before scaling since it defines the lower end of the range.
scaled_cols <- c("Temperature", "Humidity", "Light", "CO2", "HumidityRatio")
for (col in scaled_cols) {
  rdata[[col]] <- normalize(rdata[[col]])
}
summary(rdata)
##      date            Temperature        Humidity          Light        
##  Length:17889       Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  Class :character   1st Qu.:0.2034   1st Qu.:0.3360   1st Qu.:0.05893  
##  Mode  :character   Median :0.2958   Median :0.4595   Median :0.05893  
##                     Mean   :0.3234   Mean   :0.4765   Mean   :0.10549  
##                     3rd Qu.:0.4068   3rd Qu.:0.6392   3rd Qu.:0.07202  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##       CO2          HumidityRatio       Occupancy    
##  Min.   :0.00000   Min.   :0.0000   비어있음:15790  
##  1st Qu.:0.02419   1st Qu.:0.2714   사용중  : 2099  
##  Median :0.07208   Median :0.4086                   
##  Mean   :0.14122   Mean   :0.3962                   
##  3rd Qu.:0.18588   3rd Qu.:0.5590                   
##  Max.   :1.00000   Max.   :1.0000

1.4 데이터에 불균형이 있는지 확인, 불균형 판단 근거 작성

summary(rdata$Occupancy)
## 비어있음   사용중 
##    15790     2099
class(rdata$Occupancy)
## [1] "factor"
table(rdata$Occupancy)
## 
## 비어있음   사용중 
##    15790     2099
prop.table(table(rdata$Occupancy)) # 데이터 불균형    비어있음15790 사용중2099
## 
##  비어있음    사용중 
## 0.8826653 0.1173347

1.5 오버샘플링 방법들 중 2개 선택하고 장단점 등 선정 이유 제시

# 방법 1. 업 샘플링(caret::upSample) / 다운 샘플링(caret::downSample): 원데이터에서 그대로 샘플링
#         - 업 샘플링: 소수 클래스를 복원추출로 중복 선택하여 다수 클래스 수에 맞춤
#         - 다운 샘플링: 다수 클래스를 소수 클래스 수에 맞춰 줄임
# 방법 2. SMOTE(DMwR::SMOTE): 최근접 이웃을 이용해 원데이터를 약간씩 이동시켜 새로운 값을 생성
# 여기서는 방법 1(업 샘플링)을 선택함. 선정 이유: 구현이 간단하기 때문.
# 참고: 실기 후기를 남기신 분의 글에 따르면, 업 샘플링은 데이터를 중복 선택하여 모으므로 과적합이 될 수 있고,
# 원데이터 수가 많은 경우에는 다운 샘플링(즉, 원데이터에서만 샘플링하는 것)이 과적합을 줄이는 방법이라
# 그분은 다운 샘플링 방법을 선정하였다고 함.

1.6 오버샘플링 수행 및 결과, 잘 되었다는 것을 판단해라

table(rdata$Occupancy)
## 
## 비어있음   사용중 
##    15790     2099
set.seed(2201)
# Up-sample the minority class so both classes have the same number of rows.
# upSample() names the outcome column "Class" by default; yname keeps the
# original name "Occupancy", so no copy-and-drop step is needed afterwards.
ovrdata <- upSample(rdata[, -7], rdata$Occupancy, yname = "Occupancy")
# Both classes should now make up exactly half of the data.
prop.table(table(ovrdata$Occupancy))
## 
## 비어있음   사용중 
##      0.5      0.5
summary(ovrdata)
##      date            Temperature        Humidity          Light        
##  Length:31580       Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  Class :character   1st Qu.:0.2672   1st Qu.:0.3557   1st Qu.:0.05893  
##  Mode  :character   Median :0.4068   Median :0.4726   Median :0.29548  
##                     Mean   :0.4216   Mean   :0.4853   Mean   :0.20874  
##                     3rd Qu.:0.5593   3rd Qu.:0.6352   3rd Qu.:0.32857  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##       CO2          HumidityRatio       Occupancy    
##  Min.   :0.00000   Min.   :0.0000   비어있음:15790  
##  1st Qu.:0.05755   1st Qu.:0.2955   사용중  :15790  
##  Median :0.17157   Median :0.4675                   
##  Mean   :0.22447   Mean   :0.4431                   
##  3rd Qu.:0.33959   3rd Qu.:0.5883                   
##  Max.   :1.00000   Max.   :1.0000

1.7 속도측면, 정확도측면 모델 1개씩 선택, 선택 이유도 기술

# 일반적으로 속도가 빠른 알고리즘이 있습니다. Tree기반 앙상블보다는 선형 계열이 빠릅니다. 
# 즉 Logistic Regression이 Random Forest 보다 빠릅니다. 
# 같은 Tree기반 앙상블이더라도 Random Forest가 Gradient Boosting 보다 더 빠릅니다. 
# 또한 XGboost보다는 LightGBM이 더 빠르고 메모리도 더 적게 사용합니다. 
# 머신러닝의 예측 정확도를 중요시한다면 학습 속도는 느리더라도 성능이 더 높은 알고리즘을 선택해야 합니다.

1.8 위에서 오버샘플링 한 데이터 2개, 오버샘플링 하기 전 데이터 1개에 대해 모델 2개를 적용하고 성능 보여주기(1)

# 원 데이터 사용 glm, svm 모델 적용  
summary(rdata)
##      date            Temperature        Humidity          Light        
##  Length:17889       Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  Class :character   1st Qu.:0.2034   1st Qu.:0.3360   1st Qu.:0.05893  
##  Mode  :character   Median :0.2958   Median :0.4595   Median :0.05893  
##                     Mean   :0.3234   Mean   :0.4765   Mean   :0.10549  
##                     3rd Qu.:0.4068   3rd Qu.:0.6392   3rd Qu.:0.07202  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##       CO2          HumidityRatio       Occupancy    
##  Min.   :0.00000   Min.   :0.0000   비어있음:15790  
##  1st Qu.:0.02419   1st Qu.:0.2714   사용중  : 2099  
##  Median :0.07208   Median :0.4086                   
##  Mean   :0.14122   Mean   :0.3962                   
##  3rd Qu.:0.18588   3rd Qu.:0.5590                   
##  Max.   :1.00000   Max.   :1.0000
# Reproducible 70/30 hold-out split of the original (imbalanced) data.
# The fractional sample size is truncated by sample().
set.seed(2201)
idx <- sample(seq_len(nrow(rdata)), nrow(rdata) * 0.7, replace = FALSE)
rtrain <- rdata[idx, ]
rtest <- rdata[-idx, ]
dim(rtrain)
## [1] 12522     7
dim(rtest)
## [1] 5367    7
str(rtrain)
## 'data.frame':    12522 obs. of  7 variables:
##  $ date         : chr  "2015-02-03 04:01:00" "2015-02-08 20:06:00" "2015-02-03 12:40:59" "2015-02-15 14:44:00" ...
##  $ Temperature  : num  0.257 0.0721 0.6924 0.5344 0.3698 ...
##  $ Humidity     : num  0.253 0.477 0.433 0.613 0.413 ...
##  $ Light        : num  0.0589 0.0589 0.454 0.086 0.0589 ...
##  $ CO2          : num  0.0137 0.017 0.3618 0.1614 0.1178 ...
##  $ HumidityRatio: num  0.172 0.307 0.496 0.612 0.355 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 1 1 2 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:21] 1284 1305 2137 2857 3240 6684 6912 7457 8003 8089 ...
##   ..- attr(*, "names")= chr [1:21] "1284" "1305" "2137" "2857" ...
# Drop the date column (column 1) before fitting with `Occupancy ~ .`.
# It is a character timestamp column; presumably the formula expands it into
# one dummy variable per distinct value, which is why the fit stalls - TODO confirm.
rtrain <- rtrain[,-1] # without this step the fit takes extremely long / freezes
str(rtrain)
## 'data.frame':    12522 obs. of  6 variables:
##  $ Temperature  : num  0.257 0.0721 0.6924 0.5344 0.3698 ...
##  $ Humidity     : num  0.253 0.477 0.433 0.613 0.413 ...
##  $ Light        : num  0.0589 0.0589 0.454 0.086 0.0589 ...
##  $ CO2          : num  0.0137 0.017 0.3618 0.1614 0.1178 ...
##  $ HumidityRatio: num  0.172 0.307 0.496 0.612 0.355 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 1 1 2 1 1 1 1 1 1 1 ...
# glm
rtrain.glm <- glm(Occupancy ~ ., rtrain, family = "binomial")
summary(rtrain.glm)
## 
## Call:
## glm(formula = Occupancy ~ ., family = "binomial", data = rtrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.8342  -0.0568  -0.0391  -0.0315   4.4794  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -0.745      1.675  -0.445    0.656    
## Temperature    -22.060      3.606  -6.117 9.51e-10 ***
## Humidity       -44.534      8.957  -4.972 6.62e-07 ***
## Light           33.589      1.026  32.747  < 2e-16 ***
## CO2              5.491      0.520  10.559  < 2e-16 ***
## HumidityRatio   48.696      9.430   5.164 2.42e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9118.7  on 12521  degrees of freedom
## Residual deviance: 1325.7  on 12516  degrees of freedom
## AIC: 1337.7
## 
## Number of Fisher Scoring iterations: 9
str(rtest)
## 'data.frame':    5367 obs. of  7 variables:
##  $ date         : chr  "2015-02-02 14:19:59" "2015-02-02 14:29:00" "2015-02-02 14:31:00" "2015-02-02 14:49:00" ...
##  $ Temperature  : num  0.872 0.877 0.869 0.851 0.832 ...
##  $ Humidity     : num  0.419 0.426 0.433 0.474 0.495 ...
##  $ Light        : num  0.403 0.346 0.342 0.369 0.324 ...
##  $ CO2          : num  0.209 0.242 0.252 0.34 0.385 ...
##  $ HumidityRatio: num  0.554 0.564 0.568 0.604 0.619 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:21] 1284 1305 2137 2857 3240 6684 6912 7457 8003 8089 ...
##   ..- attr(*, "names")= chr [1:21] "1284" "1305" "2137" "2857" ...
rtest <- rtest[,-1]
rtrain.pred <- predict(rtrain.glm, rtest[,-6], type = "response")
head(rtrain.pred)
##         1         7         9        18        24        25 
## 0.9527954 0.7859811 0.7870195 0.9561021 0.8820501 0.8696988
head(rtrain$Occupancy)
## [1] 비어있음 비어있음 사용중   비어있음 비어있음 비어있음
## Levels: 비어있음 사용중
rtrain.pred <- ifelse(rtrain.pred > 0.5, "사용중", "비어있음")
head(rtrain.pred)
##        1        7        9       18       24       25 
## "사용중" "사용중" "사용중" "사용중" "사용중" "사용중"
# Build the factor with the reference levels so both classes are always
# present and in the same order, even if the model predicts a single class;
# confusionMatrix() needs the prediction and reference levels to match.
rtrain.pred <- factor(rtrain.pred, levels = levels(rtest$Occupancy))

confusionMatrix(rtrain.pred, rtest[,6], positive = "사용중") # Accuracy : 0.9879
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 비어있음 사용중
##   비어있음     4693      5
##   사용중         60    609
##                                           
##                Accuracy : 0.9879          
##                  95% CI : (0.9846, 0.9906)
##     No Information Rate : 0.8856          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9425          
##                                           
##  Mcnemar's Test P-Value : 2.115e-11       
##                                           
##             Sensitivity : 0.9919          
##             Specificity : 0.9874          
##          Pos Pred Value : 0.9103          
##          Neg Pred Value : 0.9989          
##              Prevalence : 0.1144          
##          Detection Rate : 0.1135          
##    Detection Prevalence : 0.1247          
##       Balanced Accuracy : 0.9896          
##                                           
##        'Positive' Class : 사용중          
## 
# svm
library(e1071)
rtrain.svm <- svm(Occupancy ~ ., rtrain)
summary(rtrain.svm)
## 
## Call:
## svm(formula = Occupancy ~ ., data = rtrain)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  431
## 
##  ( 212 219 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  비어있음 사용중
str(rtest)
## 'data.frame':    5367 obs. of  6 variables:
##  $ Temperature  : num  0.872 0.877 0.869 0.851 0.832 ...
##  $ Humidity     : num  0.419 0.426 0.433 0.474 0.495 ...
##  $ Light        : num  0.403 0.346 0.342 0.369 0.324 ...
##  $ CO2          : num  0.209 0.242 0.252 0.34 0.385 ...
##  $ HumidityRatio: num  0.554 0.564 0.568 0.604 0.619 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 2 2 2 2 2 2 2 2 2 2 ...
# Predict class labels on the held-out rows (column 6 is the Occupancy outcome).
# predict() for an e1071 svm classifier returns a factor of class labels; it has
# no `type` argument, so the previous `type = "response"` was silently ignored.
rtrain.pred <- predict(rtrain.svm, rtest[, -6])
head(rtrain.pred)
##      1      7      9     18     24     25 
## 사용중 사용중 사용중 사용중 사용중 사용중 
## Levels: 비어있음 사용중
head(rtrain$Occupancy)
## [1] 비어있음 비어있음 사용중   비어있음 비어있음 비어있음
## Levels: 비어있음 사용중
rtrain.pred <- as.factor(rtrain.pred)

# Evaluate the SVM predictions on the held-out rows, treating "사용중" as the positive class.
confusionMatrix(rtrain.pred, rtest[,6], positive = "사용중") # Accuracy : 0.9881
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 비어있음 사용중
##   비어있음     4693      4
##   사용중         60    610
##                                           
##                Accuracy : 0.9881          
##                  95% CI : (0.9848, 0.9908)
##     No Information Rate : 0.8856          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9434          
##                                           
##  Mcnemar's Test P-Value : 6.199e-12       
##                                           
##             Sensitivity : 0.9935          
##             Specificity : 0.9874          
##          Pos Pred Value : 0.9104          
##          Neg Pred Value : 0.9991          
##              Prevalence : 0.1144          
##          Detection Rate : 0.1137          
##    Detection Prevalence : 0.1248          
##       Balanced Accuracy : 0.9904          
##                                           
##        'Positive' Class : 사용중          
## 

1.9 위에서 오버샘플링 한 데이터 2개, 오버샘플링 하기 전 데이터 1개에 대해 모델 2개를 적용하고 성능 보여주기(2)

# 오버 샘플링 데이터 사용 glm, svm 모델 적용  
summary(ovrdata)
##      date            Temperature        Humidity          Light        
##  Length:31580       Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  Class :character   1st Qu.:0.2672   1st Qu.:0.3557   1st Qu.:0.05893  
##  Mode  :character   Median :0.4068   Median :0.4726   Median :0.29548  
##                     Mean   :0.4216   Mean   :0.4853   Mean   :0.20874  
##                     3rd Qu.:0.5593   3rd Qu.:0.6352   3rd Qu.:0.32857  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##       CO2          HumidityRatio       Occupancy    
##  Min.   :0.00000   Min.   :0.0000   비어있음:15790  
##  1st Qu.:0.05755   1st Qu.:0.2955   사용중  :15790  
##  Median :0.17157   Median :0.4675                   
##  Mean   :0.22447   Mean   :0.4431                   
##  3rd Qu.:0.33959   3rd Qu.:0.5883                   
##  Max.   :1.00000   Max.   :1.0000
# glm
# Reproducible 70/30 hold-out split of the up-sampled data.
# NOTE(review): ovrdata was up-sampled before this split, so copies of the same
# original row can land in both ovtrain and ovtest. The test metrics below are
# therefore likely optimistic; splitting first and up-sampling only the
# training part would avoid this.
set.seed(2201)
idx <- sample(seq_len(nrow(ovrdata)), nrow(ovrdata) * 0.7, replace = FALSE)
ovtrain <- ovrdata[idx, ]
ovtest <- ovrdata[-idx, ]
dim(ovtrain)
## [1] 22106     7
# [1] 22106     7
dim(ovtest)
## [1] 9474    7
# [1] 9474    7

str(ovtrain)
## 'data.frame':    22106 obs. of  7 variables:
##  $ date         : chr  "2015-02-05 10:28:59" "2015-02-03 14:15:00" "2015-02-03 05:39:59" "2015-02-05 15:30:59" ...
##  $ Temperature  : num  0.555 0.793 0.239 0.647 0.534 ...
##  $ Humidity     : num  0.419 0.44 0.262 0.477 0.621 ...
##  $ Light        : num  0.3381 0.382 0.0589 0.3327 0.3229 ...
##  $ CO2          : num  0.3674 0.3325 0.0132 0.4041 0.1159 ...
##  $ HumidityRatio: num  0.429 0.544 0.175 0.523 0.619 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 2 2 1 2 2 2 1 2 2 1 ...
# Drop the character date column (column 1) so `Occupancy ~ .` only uses the
# numeric sensor variables.
ovtrain <- ovtrain[,-1] # without this step the fit freezes
str(ovtrain)
## 'data.frame':    22106 obs. of  6 variables:
##  $ Temperature  : num  0.555 0.793 0.239 0.647 0.534 ...
##  $ Humidity     : num  0.419 0.44 0.262 0.477 0.621 ...
##  $ Light        : num  0.3381 0.382 0.0589 0.3327 0.3229 ...
##  $ CO2          : num  0.3674 0.3325 0.0132 0.4041 0.1159 ...
##  $ HumidityRatio: num  0.429 0.544 0.175 0.523 0.619 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 2 2 1 2 2 2 1 2 2 1 ...
ov.glm <- glm(Occupancy ~ ., ovtrain, family = "binomial")
summary(ov.glm)
## 
## Call:
## glm(formula = Occupancy ~ ., family = "binomial", data = ovtrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.4525  -0.0597  -0.0284   0.1444   4.3005  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.3352     1.1572   0.290    0.772    
## Temperature   -22.9443     2.4473  -9.375  < 2e-16 ***
## Humidity      -48.2275     6.2426  -7.726 1.11e-14 ***
## Light          37.4500     0.7017  53.367  < 2e-16 ***
## CO2             6.2794     0.3748  16.756  < 2e-16 ***
## HumidityRatio  52.2809     6.5662   7.962 1.69e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 30645  on 22105  degrees of freedom
## Residual deviance:  2662  on 22100  degrees of freedom
## AIC: 2674
## 
## Number of Fisher Scoring iterations: 9
str(ovtest)
## 'data.frame':    9474 obs. of  7 variables:
##  $ date         : chr  "2015-02-02 17:34:00" "2015-02-02 17:38:59" "2015-02-02 17:39:59" "2015-02-02 17:43:00" ...
##  $ Temperature  : num  0.666 0.655 0.652 0.647 0.647 ...
##  $ Humidity     : num  0.366 0.369 0.367 0.363 0.363 ...
##  $ Light        : num  0.314 0.31 0.312 0.308 0.308 ...
##  $ CO2          : num  0.262 0.26 0.253 0.251 0.247 ...
##  $ HumidityRatio: num  0.416 0.415 0.412 0.406 0.406 ...
##  $ Occupancy    : Factor w/ 2 levels "비어있음","사용중": 1 1 1 1 1 1 1 1 1 1 ...
ovtest <- ovtest[,-1]
ov.pred <- predict(ov.glm, ovtest[,-6], type = "response")
head(ov.pred)
##         1         6         7        10        11        13 
## 0.9294165 0.9223336 0.9259515 0.9126928 0.9109939 0.9056589
ov.pred <- ifelse(ov.pred > 0.5, "사용중", "비어있음")
head(ov.pred)
##        1        6        7       10       11       13 
## "사용중" "사용중" "사용중" "사용중" "사용중" "사용중"
# Evaluate the glm predictions on the held-out rows. The predicted labels are
# converted with the reference levels so both classes are always present and
# in the same order, even if only one class is predicted.
confusionMatrix(
  factor(ov.pred, levels = levels(ovtest$Occupancy)),
  ovtest[, 6],
  positive = "사용중"
) # Accuracy : 0.9884
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 비어있음 사용중
##   비어있음     4637     30
##   사용중         80   4727
##                                          
##                Accuracy : 0.9884         
##                  95% CI : (0.986, 0.9904)
##     No Information Rate : 0.5021         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9768         
##                                          
##  Mcnemar's Test P-Value : 2.983e-06      
##                                          
##             Sensitivity : 0.9937         
##             Specificity : 0.9830         
##          Pos Pred Value : 0.9834         
##          Neg Pred Value : 0.9936         
##              Prevalence : 0.5021         
##          Detection Rate : 0.4989         
##    Detection Prevalence : 0.5074         
##       Balanced Accuracy : 0.9884         
##                                          
##        'Positive' Class : 사용중         
## 
#svm 
ov.svm <- svm(Occupancy ~ ., ovtrain)
summary(ov.svm)  
## 
## Call:
## svm(formula = Occupancy ~ ., data = ovtrain)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  654
## 
##  ( 335 319 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  비어있음 사용중
ov.pred <- predict(ov.svm, ovtest[, -6])

head(ov.pred)
##      1      6      7     10     11     13 
## 사용중 사용중 사용중 사용중 사용중 사용중 
## Levels: 비어있음 사용중
confusionMatrix(ov.pred, ovtest[,6], positive = "사용중") # Accuracy : 0.9898
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 비어있음 사용중
##   비어있음     4648     28
##   사용중         69   4729
##                                           
##                Accuracy : 0.9898          
##                  95% CI : (0.9875, 0.9917)
##     No Information Rate : 0.5021          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9795          
##                                           
##  Mcnemar's Test P-Value : 4.878e-05       
##                                           
##             Sensitivity : 0.9941          
##             Specificity : 0.9854          
##          Pos Pred Value : 0.9856          
##          Neg Pred Value : 0.9940          
##              Prevalence : 0.5021          
##          Detection Rate : 0.4992          
##    Detection Prevalence : 0.5064          
##       Balanced Accuracy : 0.9897          
##                                           
##        'Positive' Class : 사용중          
## 

1.10 위 예측결과 사용해서 오버샘플링이 미친 영향에 대해 작성하라

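# 모델 예측 정확도 비교
#   원데이터 사용:          glm 0.9879, svm 0.9881
#   오버 샘플링 데이터 사용: glm 0.9884, svm 0.9898
# 수치상으로는 over sampling 이 더 나은 예측 결과를 보인다.
# 다만 차이가 0.001 내외로 매우 작고 95% 신뢰구간이 서로 겹친다.
# 또한 두 결과는 서로 다른 테스트셋으로 평가되었으며, 오버 샘플링 데이터는 분할 전에 업 샘플링을 했기 때문에
# 복제된 행이 학습/테스트 양쪽에 들어갈 수 있어 성능이 과대평가되었을 가능성이 있다.
# 따라서 오버 샘플링의 효과를 단정하기는 어렵고, Accuracy 외에 Sensitivity, Specificity, Pos Pred Value 도 함께 비교해야 한다.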