setwd("~/db")
cen <- read.csv("census2000.csv",header=T,sep=",", fileEncoding = "CP949", encoding = "UTF-8")
# 결측값 처리 : 평균을 넣거나 리그레션 값에 상응하는 Expactation Maximization Algorithm
# conditianal expactation
temp <- table(complete.cases(cen))
cen <- cen[complete.cases(cen),]
sum(is.na(cen))
## [1] 0
head(cen)
## ID_1 ID_2 SiDoName GuName DongName DongCode Pop Gagu Income SESI
## 1 1 1 부산 부산 북구 구포1동 2108051 19452 6318 2365269 4
## 2 1 1 부산 부산 북구 구포2동 2108052 32989 9984 2464451 6
## 3 1 1 부산 부산 북구 구포3동 2108053 32635 9447 2831379 7
## 4 1 1 부산 부산 북구 금곡동 2108054 42470 13184 2187229 4
## 5 1 1 부산 부산 북구 화명동 2108055 53275 15685 2115389 4
## 6 1 1 부산 부산 북구 덕천1동 2108056 22627 7047 2433146 5
## ClusterCode AvgGaguNo AptRatio MultiHouseRatio SmeHouseRatio HouseOwner
## 1 9 3.3 0.0 11.4 10.2 32.0
## 2 5 3.4 32.7 2.5 7.0 39.1
## 3 11 3.6 26.9 34.2 3.0 56.0
## 4 12 3.4 94.2 0.0 0.0 41.4
## 5 12 3.4 61.9 0.0 2.5 45.8
## 6 4 3.5 59.3 6.2 4.4 53.8
## HouseFee NoRoom Area14 Area69 RoomDensity
## 1 170.8 2.5 12.1 4.5 1.5
## 2 222.7 3.0 5.4 2.0 1.4
## 3 199.0 3.4 3.7 5.1 1.2
## 4 277.7 3.0 65.3 0.0 1.2
## 5 161.6 2.7 54.9 1.4 1.4
## 6 204.6 3.4 24.0 0.4 1.2
table(cen$SiDoName)
##
## 강원 경기 경남 경북 광주 대구 대전 부산 서울 울산 인천 전남 전북 제주 충남
## 154 459 215 216 84 139 76 221 522 55 125 180 160 39 147
## 충북
## 111
레이블 정하기
cen$label <- 3
cen$label<-ifelse(cen$ID_1== 1, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 4, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 5, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 7, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 11, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 16, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 15, 1, cen$label)
시도별 레이블확인
temp <- sqldf("select sidoname, ID_1, label, count(*) cnt from cen group by id_1")
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
temp
## SiDoName ID_1 label cnt
## 1 부산 1 2 221
## 2 충북 2 3 111
## 3 충남 3 3 147
## 4 대구 4 2 139
## 5 대전 5 2 76
## 6 강원 6 3 154
## 7 광주 7 2 84
## 8 경기 8 3 459
## 9 경북 9 3 216
## 10 경남 10 3 215
## 11 인천 11 2 125
## 12 제주 12 3 39
## 13 전북 13 3 160
## 14 전남 14 3 180
## 15 서울 15 1 522
## 16 울산 16 2 55
temp2 <- sqldf("select * from cen where label= 2 order by pop")
temp3 <- sqldf("select * from cen where label= 3 order by pop")
hist(temp2$AptRatio)
hist(temp3$AptRatio)
hist(temp2$Pop)
hist(temp3$Pop)
# cen$label<-ifelse(cen$label ==3 & cen$Pop >= 10000, 2, cen$label)
right <- function(a){
a <- as.character(a)
n <- nchar(a)
return(substr(a,n,n))
}
cen$label2 <- right(cen$DongName)
cen$label<-ifelse(cen$label ==3 & cen$label2== "동" & cen$Pop >= 3000, 2, cen$label)
table(cen$label)
##
## 1 2 3
## 522 1568 813
set.seed(123)
library(sqldf)
censort <- sqldf("select * from cen order by label, Dongcode")
cenZ <- as.data.frame(scale(censort[,7:21]))
# samp <- append(sample(c(1:522),300), sample(c(523:1224),400))
# samp <- append(samp, sample(c(1223:2903),800))
samp <- sample(c(1:2903),2903*0.7)
train <- cenZ[samp,]
test <- cenZ[-samp,]
train_label <- censort[samp,22]
test_label <- censort[-samp,22]
library(class)
test_pred <- knn(train = train, test = test,
cl = train_label, k=7)
library(gmodels)
CrossTable(x = test_label, y = test_pred, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 871
##
##
## | test_pred
## test_label | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 121 | 37 | 0 | 158 |
## | 0.766 | 0.234 | 0.000 | 0.181 |
## | 0.846 | 0.076 | 0.000 | |
## | 0.139 | 0.042 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|
## 2 | 22 | 418 | 35 | 475 |
## | 0.046 | 0.880 | 0.074 | 0.545 |
## | 0.154 | 0.855 | 0.146 | |
## | 0.025 | 0.480 | 0.040 | |
## -------------|-----------|-----------|-----------|-----------|
## 3 | 0 | 34 | 204 | 238 |
## | 0.000 | 0.143 | 0.857 | 0.273 |
## | 0.000 | 0.070 | 0.854 | |
## | 0.000 | 0.039 | 0.234 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 143 | 489 | 239 | 871 |
## | 0.164 | 0.561 | 0.274 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
table(test_label, y = test_pred)
## y
## test_label 1 2 3
## 1 121 37 0
## 2 22 418 35
## 3 0 34 204
a<- as.data.frame(table(test_label, y = test_pred))
(a[1,3]+a[5,3]+a[9,3])/nrow(test)
## [1] 0.8530425
library(class)
test_pred <- knn(train = train, test = test,
cl = train_label, k=5)
library(gmodels)
CrossTable(x = test_label, y = test_pred, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 871
##
##
## | test_pred
## test_label | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 124 | 34 | 0 | 158 |
## | 0.785 | 0.215 | 0.000 | 0.181 |
## | 0.849 | 0.070 | 0.000 | |
## | 0.142 | 0.039 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|
## 2 | 22 | 415 | 38 | 475 |
## | 0.046 | 0.874 | 0.080 | 0.545 |
## | 0.151 | 0.852 | 0.160 | |
## | 0.025 | 0.476 | 0.044 | |
## -------------|-----------|-----------|-----------|-----------|
## 3 | 0 | 38 | 200 | 238 |
## | 0.000 | 0.160 | 0.840 | 0.273 |
## | 0.000 | 0.078 | 0.840 | |
## | 0.000 | 0.044 | 0.230 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 146 | 487 | 238 | 871 |
## | 0.168 | 0.559 | 0.273 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
a<- as.data.frame(table(test_label, y = test_pred))
(a[1,3]+a[5,3]+a[9,3])/nrow(test)
## [1] 0.8484501