온,습도,조도,CO2농도에 따른 객실의 사용유무 판별
# packages
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## 필요한 패키지를 로딩중입니다: lattice
##
## 다음의 패키지를 부착합니다: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
데이터 EDA 수행 후, 분석가 입장에서 의미있는 탐색
rdata <- read.csv("problem1.csv")
str(rdata) # dim 17910 7
## 'data.frame': 17910 obs. of 7 variables:
## $ date : chr "2015-02-02 14:19:59" "2015-02-02 14:22:00" "2015-02-02 14:23:00" "2015-02-02 14:23:59" ...
## $ Temperature : num 23.7 23.7 23.8 23.8 23.8 ...
## $ Humidity : num 26.3 26.1 26.2 26.3 26.3 ...
## $ Light : num 578 494 489 569 509 ...
## $ CO2 : num 760 775 779 790 797 ...
## $ HumidityRatio: num 0.00477 0.00474 0.00477 0.00478 0.00478 ...
## $ Occupancy : int 1 1 1 1 1 1 1 1 1 1 ...
## date Temperature Humidity Light
## Length:17910 Min. :19.00 Min. :16.75 Min. : -99.00
## Class :character 1st Qu.:20.10 1st Qu.:24.39 1st Qu.: 0.00
## Mode :character Median :20.60 Median :27.20 Median : 0.00
## Mean :20.75 Mean :27.59 Mean : 78.16
## 3rd Qu.:21.20 3rd Qu.:31.29 3rd Qu.: 22.00
## Max. :24.41 Max. :39.50 Max. :1581.00
##
## CO2 HumidityRatio Occupancy
## Min. : 412.8 Min. :0.002674 Min. :0.0000
## 1st Qu.: 453.0 1st Qu.:0.003702 1st Qu.:0.0000
## Median : 532.7 Median :0.004222 Median :0.0000
## Mean : 647.7 Mean :0.004175 Mean :0.1173
## 3rd Qu.: 722.0 3rd Qu.:0.004790 3rd Qu.:0.0000
## Max. :2076.5 Max. :0.006461 Max. :1.0000
## NA's :21
rdata$Occupancy <- factor(rdata$Occupancy, levels=c("0","1"), labels=c("비어있음", "사용중"))
결측치를 대체하는 방식 선택하고 근거제시, 대체 수행
summary(rdata$CO2) # 정규분포가 아니어서 평균값, 최빈값으로 대체하기가 곤란
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 412.8 453.0 532.7 647.7 722.0 2076.5 21

na.ratio <- sum(is.na(rdata))/nrow(rdata) # 결측치 0.1% 이하로 매우 적어 결측치 제거
na.ratio
## [1] 0.001172529
rdata <- na.omit(rdata)
sum(is.na(rdata))
## [1] 0
## date Temperature Humidity Light
## Length:17889 Min. :19.00 Min. :16.75 Min. : -99.00
## Class :character 1st Qu.:20.10 1st Qu.:24.39 1st Qu.: 0.00
## Mode :character Median :20.60 Median :27.20 Median : 0.00
## Mean :20.75 Mean :27.59 Mean : 78.22
## 3rd Qu.:21.20 3rd Qu.:31.29 3rd Qu.: 22.00
## Max. :24.41 Max. :39.50 Max. :1581.00
## CO2 HumidityRatio Occupancy
## Min. : 412.8 Min. :0.002674 비어있음:15790
## 1st Qu.: 453.0 1st Qu.:0.003702 사용중 : 2099
## Median : 532.7 Median :0.004222
## Mean : 647.7 Mean :0.004174
## 3rd Qu.: 722.0 3rd Qu.:0.004791
## Max. :2076.5 Max. :0.006461
추가적으로 데이터의 질 및 품질관리를 향상시킬만한 내용 작성
# number 변수의 scale이 다르므로 min-max 표준화
normalize <- function(x){
return((x-min(x))/(max(x)-min(x)))
}
rdata$Temperature <- normalize(rdata$Temperature)
rdata$Humidity <- normalize(rdata$Humidity)
rdata$Light <- normalize(rdata$Light)
rdata$CO2 <- normalize(rdata$CO2)
rdata$HumidityRatio <- normalize(rdata$HumidityRatio)
summary(rdata)
## date Temperature Humidity Light
## Length:17889 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.2034 1st Qu.:0.3360 1st Qu.:0.05893
## Mode :character Median :0.2958 Median :0.4595 Median :0.05893
## Mean :0.3234 Mean :0.4765 Mean :0.10549
## 3rd Qu.:0.4068 3rd Qu.:0.6392 3rd Qu.:0.07202
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## CO2 HumidityRatio Occupancy
## Min. :0.00000 Min. :0.0000 비어있음:15790
## 1st Qu.:0.02419 1st Qu.:0.2714 사용중 : 2099
## Median :0.07208 Median :0.4086
## Mean :0.14122 Mean :0.3962
## 3rd Qu.:0.18588 3rd Qu.:0.5590
## Max. :1.00000 Max. :1.0000
데이터에 불균형이 있는지 확인, 불균형 판단 근거 작성
## 비어있음 사용중
## 15790 2099
## [1] "factor"
##
## 비어있음 사용중
## 15790 2099
prop.table(table(rdata$Occupancy)) # 데이터 불균형 비어있음15790 사용중2099
##
## 비어있음 사용중
## 0.8826653 0.1173347
오버샘플링 방법들 중 2개 선택하고 장단점 등 선정 이유 제시
# 방법 1. 업 샘플링, 다운 샘플링, caret::upSample 원데이터 샘플링
# 방법 2. SMOTE DMwR::SMOTE, 최근접 이용 원데이터 약간씩 이동시켜 새로운 값 생성
# 방법 1 선택, 선정 이유? ... (그냥 간단하니..)
# 실기 후기 남기신 분 글을 보니 업 샘플링은 데이터를 중복 선택하여 모으니 과적합이 될 수 있으니
# 원데이터수가 많은 경우 다운 샘플링 즉, 원데이터에서만 샘플링 하는 것이 데이터 과적합을 줄이는 방법이라..
# 다운 샘플링 방법을 선정하였다 함.
오버샘플링 수행 및 결과, 잘 되었다는 것을 판단해라
##
## 비어있음 사용중
## 15790 2099
set.seed(2201)
ovrdata <- upSample(rdata[, -c(7)], rdata$Occupancy) # 업/다운 샘플링 방법으로 샘플링
summary(ovrdata) # Occupancy 가 Class로 변수명 자동 바뀜(이걸 안바뀌게?!!!)
## date Temperature Humidity Light
## Length:31580 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.2672 1st Qu.:0.3557 1st Qu.:0.05893
## Mode :character Median :0.4068 Median :0.4726 Median :0.29548
## Mean :0.4216 Mean :0.4853 Mean :0.20874
## 3rd Qu.:0.5593 3rd Qu.:0.6352 3rd Qu.:0.32857
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## CO2 HumidityRatio Class
## Min. :0.00000 Min. :0.0000 비어있음:15790
## 1st Qu.:0.05755 1st Qu.:0.2955 사용중 :15790
## Median :0.17157 Median :0.4675
## Mean :0.22447 Mean :0.4431
## 3rd Qu.:0.33959 3rd Qu.:0.5883
## Max. :1.00000 Max. :1.0000
prop.table(table(ovrdata$Class))
##
## 비어있음 사용중
## 0.5 0.5
# 비어있음 사용중
# 0.5 0.5
ovrdata$Occupancy <- ovrdata$Class
summary(ovrdata)
## date Temperature Humidity Light
## Length:31580 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.2672 1st Qu.:0.3557 1st Qu.:0.05893
## Mode :character Median :0.4068 Median :0.4726 Median :0.29548
## Mean :0.4216 Mean :0.4853 Mean :0.20874
## 3rd Qu.:0.5593 3rd Qu.:0.6352 3rd Qu.:0.32857
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## CO2 HumidityRatio Class Occupancy
## Min. :0.00000 Min. :0.0000 비어있음:15790 비어있음:15790
## 1st Qu.:0.05755 1st Qu.:0.2955 사용중 :15790 사용중 :15790
## Median :0.17157 Median :0.4675
## Mean :0.22447 Mean :0.4431
## 3rd Qu.:0.33959 3rd Qu.:0.5883
## Max. :1.00000 Max. :1.0000
ovrdata <- ovrdata[, -7]
summary(ovrdata)
## date Temperature Humidity Light
## Length:31580 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.2672 1st Qu.:0.3557 1st Qu.:0.05893
## Mode :character Median :0.4068 Median :0.4726 Median :0.29548
## Mean :0.4216 Mean :0.4853 Mean :0.20874
## 3rd Qu.:0.5593 3rd Qu.:0.6352 3rd Qu.:0.32857
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## CO2 HumidityRatio Occupancy
## Min. :0.00000 Min. :0.0000 비어있음:15790
## 1st Qu.:0.05755 1st Qu.:0.2955 사용중 :15790
## Median :0.17157 Median :0.4675
## Mean :0.22447 Mean :0.4431
## 3rd Qu.:0.33959 3rd Qu.:0.5883
## Max. :1.00000 Max. :1.0000
속도측면, 정확도측면 모델 1개씩 선택, 선택 이유도 기술
# 일반적으로 속도가 빠른 알고리즘이 있습니다. Tree기반 앙상블보다는 선형 계열이 빠릅니다.
# 즉 Logistic Regression이 Random Forest 보다 빠릅니다.
# 같은 Tree기반 앙상블이더라도 Random Forest가 Gradient Boosting 보다 더 빠릅니다.
# 또한 XGboost보다는 LightGBM이 더 빠르고 메모리도 더 적게 사용합니다.
# 머신러닝의 예측 정확도를 중요시 한다면 학습 속도는 느리지만 보다 성능이 높은 알고리즘을 선택해야
위에서 오버샘플링 한 데이터 2개, 오버샘플링 하기 전 데이터 1개에 대해 모델 2개를 적용하고 성능 보여주기(1)
# 원 데이터 사용 glm, svm 모델 적용
summary(rdata)
## date Temperature Humidity Light
## Length:17889 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.2034 1st Qu.:0.3360 1st Qu.:0.05893
## Mode :character Median :0.2958 Median :0.4595 Median :0.05893
## Mean :0.3234 Mean :0.4765 Mean :0.10549
## 3rd Qu.:0.4068 3rd Qu.:0.6392 3rd Qu.:0.07202
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## CO2 HumidityRatio Occupancy
## Min. :0.00000 Min. :0.0000 비어있음:15790
## 1st Qu.:0.02419 1st Qu.:0.2714 사용중 : 2099
## Median :0.07208 Median :0.4086
## Mean :0.14122 Mean :0.3962
## 3rd Qu.:0.18588 3rd Qu.:0.5590
## Max. :1.00000 Max. :1.0000
set.seed(2201)
idx <- sample(1:nrow(rdata), nrow(rdata)*0.7, replace = F)
rtrain <- rdata[idx,]
rtest <- rdata[-idx,]
dim(rtrain)
## [1] 12522 7
## [1] 5367 7
## 'data.frame': 12522 obs. of 7 variables:
## $ date : chr "2015-02-03 04:01:00" "2015-02-08 20:06:00" "2015-02-03 12:40:59" "2015-02-15 14:44:00" ...
## $ Temperature : num 0.257 0.0721 0.6924 0.5344 0.3698 ...
## $ Humidity : num 0.253 0.477 0.433 0.613 0.413 ...
## $ Light : num 0.0589 0.0589 0.454 0.086 0.0589 ...
## $ CO2 : num 0.0137 0.017 0.3618 0.1614 0.1178 ...
## $ HumidityRatio: num 0.172 0.307 0.496 0.612 0.355 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 1 1 2 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:21] 1284 1305 2137 2857 3240 6684 6912 7457 8003 8089 ...
## ..- attr(*, "names")= chr [1:21] "1284" "1305" "2137" "2857" ...
rtrain <- rtrain[,-1] #안할 시 엄청나게 시간 걸림, freeze
str(rtrain)
## 'data.frame': 12522 obs. of 6 variables:
## $ Temperature : num 0.257 0.0721 0.6924 0.5344 0.3698 ...
## $ Humidity : num 0.253 0.477 0.433 0.613 0.413 ...
## $ Light : num 0.0589 0.0589 0.454 0.086 0.0589 ...
## $ CO2 : num 0.0137 0.017 0.3618 0.1614 0.1178 ...
## $ HumidityRatio: num 0.172 0.307 0.496 0.612 0.355 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 1 1 2 1 1 1 1 1 1 1 ...
# glm
rtrain.glm <- glm(Occupancy ~ ., rtrain, family = "binomial")
summary(rtrain.glm)
##
## Call:
## glm(formula = Occupancy ~ ., family = "binomial", data = rtrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.8342 -0.0568 -0.0391 -0.0315 4.4794
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.745 1.675 -0.445 0.656
## Temperature -22.060 3.606 -6.117 9.51e-10 ***
## Humidity -44.534 8.957 -4.972 6.62e-07 ***
## Light 33.589 1.026 32.747 < 2e-16 ***
## CO2 5.491 0.520 10.559 < 2e-16 ***
## HumidityRatio 48.696 9.430 5.164 2.42e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9118.7 on 12521 degrees of freedom
## Residual deviance: 1325.7 on 12516 degrees of freedom
## AIC: 1337.7
##
## Number of Fisher Scoring iterations: 9
## 'data.frame': 5367 obs. of 7 variables:
## $ date : chr "2015-02-02 14:19:59" "2015-02-02 14:29:00" "2015-02-02 14:31:00" "2015-02-02 14:49:00" ...
## $ Temperature : num 0.872 0.877 0.869 0.851 0.832 ...
## $ Humidity : num 0.419 0.426 0.433 0.474 0.495 ...
## $ Light : num 0.403 0.346 0.342 0.369 0.324 ...
## $ CO2 : num 0.209 0.242 0.252 0.34 0.385 ...
## $ HumidityRatio: num 0.554 0.564 0.568 0.604 0.619 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:21] 1284 1305 2137 2857 3240 6684 6912 7457 8003 8089 ...
## ..- attr(*, "names")= chr [1:21] "1284" "1305" "2137" "2857" ...
rtest <- rtest[,-1]
rtrain.pred <- predict(rtrain.glm, rtest[,-6], type = "response")
head(rtrain.pred)
## 1 7 9 18 24 25
## 0.9527954 0.7859811 0.7870195 0.9561021 0.8820501 0.8696988
## [1] 비어있음 비어있음 사용중 비어있음 비어있음 비어있음
## Levels: 비어있음 사용중
rtrain.pred <- ifelse(rtrain.pred > 0.5, "사용중", "비어있음")
head(rtrain.pred)
## 1 7 9 18 24 25
## "사용중" "사용중" "사용중" "사용중" "사용중" "사용중"
rtrain.pred <- as.factor(rtrain.pred)
confusionMatrix(rtrain.pred, rtest[,6], positive = "사용중") # Accuracy : 0.9879
## Confusion Matrix and Statistics
##
## Reference
## Prediction 비어있음 사용중
## 비어있음 4693 5
## 사용중 60 609
##
## Accuracy : 0.9879
## 95% CI : (0.9846, 0.9906)
## No Information Rate : 0.8856
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9425
##
## Mcnemar's Test P-Value : 2.115e-11
##
## Sensitivity : 0.9919
## Specificity : 0.9874
## Pos Pred Value : 0.9103
## Neg Pred Value : 0.9989
## Prevalence : 0.1144
## Detection Rate : 0.1135
## Detection Prevalence : 0.1247
## Balanced Accuracy : 0.9896
##
## 'Positive' Class : 사용중
##
# svm
library(e1071)
rtrain.svm <- svm(Occupancy ~ ., rtrain)
summary(rtrain.svm)
##
## Call:
## svm(formula = Occupancy ~ ., data = rtrain)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 431
##
## ( 212 219 )
##
##
## Number of Classes: 2
##
## Levels:
## 비어있음 사용중
## 'data.frame': 5367 obs. of 6 variables:
## $ Temperature : num 0.872 0.877 0.869 0.851 0.832 ...
## $ Humidity : num 0.419 0.426 0.433 0.474 0.495 ...
## $ Light : num 0.403 0.346 0.342 0.369 0.324 ...
## $ CO2 : num 0.209 0.242 0.252 0.34 0.385 ...
## $ HumidityRatio: num 0.554 0.564 0.568 0.604 0.619 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 2 2 2 2 2 2 2 2 2 2 ...
rtrain.pred <- predict(rtrain.svm, rtest[,-6], type = "response")
head(rtrain.pred)
## 1 7 9 18 24 25
## 사용중 사용중 사용중 사용중 사용중 사용중
## Levels: 비어있음 사용중
## [1] 비어있음 비어있음 사용중 비어있음 비어있음 비어있음
## Levels: 비어있음 사용중
rtrain.pred <- as.factor(rtrain.pred)
confusionMatrix(rtrain.pred, rtest[,6], positive = "사용중") # Accuracy : 0.988
## Confusion Matrix and Statistics
##
## Reference
## Prediction 비어있음 사용중
## 비어있음 4693 4
## 사용중 60 610
##
## Accuracy : 0.9881
## 95% CI : (0.9848, 0.9908)
## No Information Rate : 0.8856
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9434
##
## Mcnemar's Test P-Value : 6.199e-12
##
## Sensitivity : 0.9935
## Specificity : 0.9874
## Pos Pred Value : 0.9104
## Neg Pred Value : 0.9991
## Prevalence : 0.1144
## Detection Rate : 0.1137
## Detection Prevalence : 0.1248
## Balanced Accuracy : 0.9904
##
## 'Positive' Class : 사용중
##
위에서 오버샘플링 한 데이터 2개, 오버샘플링 하기 전 데이터 1개에 대해 모델 2개를 적용하고 성능 보여주기(2)
# 오버 샘플링 데이터 사용 glm, svm 모델 적용
summary(ovrdata)
## date Temperature Humidity Light
## Length:31580 Min. :0.0000 Min. :0.0000 Min. :0.00000
## Class :character 1st Qu.:0.2672 1st Qu.:0.3557 1st Qu.:0.05893
## Mode :character Median :0.4068 Median :0.4726 Median :0.29548
## Mean :0.4216 Mean :0.4853 Mean :0.20874
## 3rd Qu.:0.5593 3rd Qu.:0.6352 3rd Qu.:0.32857
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## CO2 HumidityRatio Occupancy
## Min. :0.00000 Min. :0.0000 비어있음:15790
## 1st Qu.:0.05755 1st Qu.:0.2955 사용중 :15790
## Median :0.17157 Median :0.4675
## Mean :0.22447 Mean :0.4431
## 3rd Qu.:0.33959 3rd Qu.:0.5883
## Max. :1.00000 Max. :1.0000
#glm
set.seed(2201)
idx <- sample(1:nrow(ovrdata), nrow(ovrdata)*0.7, replace = F)
ovtrain <- ovrdata[idx,]
ovtest <- ovrdata[-idx,]
dim(ovtrain)
## [1] 22106 7
# [1] 22106 7
dim(ovtest)
## [1] 9474 7
# [1] 9474 7
str(ovtrain)
## 'data.frame': 22106 obs. of 7 variables:
## $ date : chr "2015-02-05 10:28:59" "2015-02-03 14:15:00" "2015-02-03 05:39:59" "2015-02-05 15:30:59" ...
## $ Temperature : num 0.555 0.793 0.239 0.647 0.534 ...
## $ Humidity : num 0.419 0.44 0.262 0.477 0.621 ...
## $ Light : num 0.3381 0.382 0.0589 0.3327 0.3229 ...
## $ CO2 : num 0.3674 0.3325 0.0132 0.4041 0.1159 ...
## $ HumidityRatio: num 0.429 0.544 0.175 0.523 0.619 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 2 2 1 2 2 2 1 2 2 1 ...
ovtrain <- ovtrain[,-1] #안할 시 freeze
str(ovtrain)
## 'data.frame': 22106 obs. of 6 variables:
## $ Temperature : num 0.555 0.793 0.239 0.647 0.534 ...
## $ Humidity : num 0.419 0.44 0.262 0.477 0.621 ...
## $ Light : num 0.3381 0.382 0.0589 0.3327 0.3229 ...
## $ CO2 : num 0.3674 0.3325 0.0132 0.4041 0.1159 ...
## $ HumidityRatio: num 0.429 0.544 0.175 0.523 0.619 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 2 2 1 2 2 2 1 2 2 1 ...
ov.glm <- glm(Occupancy ~ ., ovtrain, family = "binomial")
summary(ov.glm)
##
## Call:
## glm(formula = Occupancy ~ ., family = "binomial", data = ovtrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.4525 -0.0597 -0.0284 0.1444 4.3005
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3352 1.1572 0.290 0.772
## Temperature -22.9443 2.4473 -9.375 < 2e-16 ***
## Humidity -48.2275 6.2426 -7.726 1.11e-14 ***
## Light 37.4500 0.7017 53.367 < 2e-16 ***
## CO2 6.2794 0.3748 16.756 < 2e-16 ***
## HumidityRatio 52.2809 6.5662 7.962 1.69e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 30645 on 22105 degrees of freedom
## Residual deviance: 2662 on 22100 degrees of freedom
## AIC: 2674
##
## Number of Fisher Scoring iterations: 9
## 'data.frame': 9474 obs. of 7 variables:
## $ date : chr "2015-02-02 17:34:00" "2015-02-02 17:38:59" "2015-02-02 17:39:59" "2015-02-02 17:43:00" ...
## $ Temperature : num 0.666 0.655 0.652 0.647 0.647 ...
## $ Humidity : num 0.366 0.369 0.367 0.363 0.363 ...
## $ Light : num 0.314 0.31 0.312 0.308 0.308 ...
## $ CO2 : num 0.262 0.26 0.253 0.251 0.247 ...
## $ HumidityRatio: num 0.416 0.415 0.412 0.406 0.406 ...
## $ Occupancy : Factor w/ 2 levels "비어있음","사용중": 1 1 1 1 1 1 1 1 1 1 ...
ovtest <- ovtest[,-1]
ov.pred <- predict(ov.glm, ovtest[,-6], type = "response")
head(ov.pred)
## 1 6 7 10 11 13
## 0.9294165 0.9223336 0.9259515 0.9126928 0.9109939 0.9056589
ov.pred <- ifelse(ov.pred > 0.5, "사용중", "비어있음")
head(ov.pred)
## 1 6 7 10 11 13
## "사용중" "사용중" "사용중" "사용중" "사용중" "사용중"
confusionMatrix(as.factor(ov.pred), ovtest[,6], positive = "사용중") # Accuracy : 0.9884
## Confusion Matrix and Statistics
##
## Reference
## Prediction 비어있음 사용중
## 비어있음 4637 30
## 사용중 80 4727
##
## Accuracy : 0.9884
## 95% CI : (0.986, 0.9904)
## No Information Rate : 0.5021
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9768
##
## Mcnemar's Test P-Value : 2.983e-06
##
## Sensitivity : 0.9937
## Specificity : 0.9830
## Pos Pred Value : 0.9834
## Neg Pred Value : 0.9936
## Prevalence : 0.5021
## Detection Rate : 0.4989
## Detection Prevalence : 0.5074
## Balanced Accuracy : 0.9884
##
## 'Positive' Class : 사용중
##
#svm
ov.svm <- svm(Occupancy ~ ., ovtrain)
summary(ov.svm)
##
## Call:
## svm(formula = Occupancy ~ ., data = ovtrain)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 654
##
## ( 335 319 )
##
##
## Number of Classes: 2
##
## Levels:
## 비어있음 사용중
ov.pred <- predict(ov.svm, ovtest[, -6])
head(ov.pred)
## 1 6 7 10 11 13
## 사용중 사용중 사용중 사용중 사용중 사용중
## Levels: 비어있음 사용중
confusionMatrix(ov.pred, ovtest[,6], positive = "사용중") # Accuracy : 0.9898
## Confusion Matrix and Statistics
##
## Reference
## Prediction 비어있음 사용중
## 비어있음 4648 28
## 사용중 69 4729
##
## Accuracy : 0.9898
## 95% CI : (0.9875, 0.9917)
## No Information Rate : 0.5021
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9795
##
## Mcnemar's Test P-Value : 4.878e-05
##
## Sensitivity : 0.9941
## Specificity : 0.9854
## Pos Pred Value : 0.9856
## Neg Pred Value : 0.9940
## Prevalence : 0.5021
## Detection Rate : 0.4992
## Detection Prevalence : 0.5064
## Balanced Accuracy : 0.9897
##
## 'Positive' Class : 사용중
##
위 예측결과 사용해서 오버샘플링이 미친 영향에 대해 작성하라
# 모델 예측 정확도 비교 원데이터 사용 glm 0.9879 svm 0.9881
# 오버 샘플링 데이터 사용 glm 0.9884 svm 0.9898
# over sampling 이 더 나은 예측 결과