setwd("~/db")
cen <- read.csv("census2000.csv",header=T,sep=",", fileEncoding = "CP949", encoding = "UTF-8")

# 결측값 처리 : 평균을 넣거나 리그레션 값에 상응하는 Expactation Maximization Algorithm
# conditianal expactation 
temp <- table(complete.cases(cen))
cen <- cen[complete.cases(cen),]
sum(is.na(cen))
## [1] 0
head(cen)
##   ID_1 ID_2 SiDoName    GuName DongName DongCode   Pop  Gagu  Income SESI
## 1    1    1     부산 부산 북구  구포1동  2108051 19452  6318 2365269    4
## 2    1    1     부산 부산 북구  구포2동  2108052 32989  9984 2464451    6
## 3    1    1     부산 부산 북구  구포3동  2108053 32635  9447 2831379    7
## 4    1    1     부산 부산 북구   금곡동  2108054 42470 13184 2187229    4
## 5    1    1     부산 부산 북구   화명동  2108055 53275 15685 2115389    4
## 6    1    1     부산 부산 북구  덕천1동  2108056 22627  7047 2433146    5
##   ClusterCode AvgGaguNo AptRatio MultiHouseRatio SmeHouseRatio HouseOwner
## 1           9       3.3      0.0            11.4          10.2       32.0
## 2           5       3.4     32.7             2.5           7.0       39.1
## 3          11       3.6     26.9            34.2           3.0       56.0
## 4          12       3.4     94.2             0.0           0.0       41.4
## 5          12       3.4     61.9             0.0           2.5       45.8
## 6           4       3.5     59.3             6.2           4.4       53.8
##   HouseFee NoRoom Area14 Area69 RoomDensity
## 1    170.8    2.5   12.1    4.5         1.5
## 2    222.7    3.0    5.4    2.0         1.4
## 3    199.0    3.4    3.7    5.1         1.2
## 4    277.7    3.0   65.3    0.0         1.2
## 5    161.6    2.7   54.9    1.4         1.4
## 6    204.6    3.4   24.0    0.4         1.2
table(cen$SiDoName)
## 
## 강원 경기 경남 경북 광주 대구 대전 부산 서울 울산 인천 전남 전북 제주 충남 
##  154  459  215  216   84  139   76  221  522   55  125  180  160   39  147 
## 충북 
##  111

레이블 정하기

cen$label <- 3
cen$label<-ifelse(cen$ID_1== 1, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 4, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 5, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 7, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 11, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 16, 2, cen$label)
cen$label<-ifelse(cen$ID_1== 15, 1, cen$label)

시도별 레이블확인

temp <- sqldf("select sidoname, ID_1, label, count(*) cnt from cen group by id_1")
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
temp
##    SiDoName ID_1 label cnt
## 1      부산    1     2 221
## 2      충북    2     3 111
## 3      충남    3     3 147
## 4      대구    4     2 139
## 5      대전    5     2  76
## 6      강원    6     3 154
## 7      광주    7     2  84
## 8      경기    8     3 459
## 9      경북    9     3 216
## 10     경남   10     3 215
## 11     인천   11     2 125
## 12     제주   12     3  39
## 13     전북   13     3 160
## 14     전남   14     3 180
## 15     서울   15     1 522
## 16     울산   16     2  55
temp2 <- sqldf("select * from cen where label= 2 order by pop")
temp3 <- sqldf("select * from cen where label= 3 order by pop")
hist(temp2$AptRatio)

hist(temp3$AptRatio)

hist(temp2$Pop)

hist(temp3$Pop)

# cen$label<-ifelse(cen$label ==3 & cen$Pop >= 10000, 2, cen$label)
right <- function(a){
  a <- as.character(a)
  n <- nchar(a)
  return(substr(a,n,n))
}
cen$label2 <- right(cen$DongName)
cen$label<-ifelse(cen$label ==3 & cen$label2== "동" & cen$Pop >= 3000, 2, cen$label)

table(cen$label)
## 
##    1    2    3 
##  522 1568  813
set.seed(123)
library(sqldf)
censort <- sqldf("select * from cen order by label, Dongcode")
cenZ <- as.data.frame(scale(censort[,7:21]))

# samp <- append(sample(c(1:522),300), sample(c(523:1224),400))
# samp <- append(samp, sample(c(1223:2903),800))

samp <- sample(c(1:2903),2903*0.7)
train <- cenZ[samp,]
test <- cenZ[-samp,]

train_label <- censort[samp,22]
test_label <- censort[-samp,22]
library(class)
test_pred <- knn(train = train, test = test,
                 cl = train_label, k=7)

library(gmodels)
CrossTable(x = test_label, y = test_pred, prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  871 
## 
##  
##              | test_pred 
##   test_label |         1 |         2 |         3 | Row Total | 
## -------------|-----------|-----------|-----------|-----------|
##            1 |       121 |        37 |         0 |       158 | 
##              |     0.766 |     0.234 |     0.000 |     0.181 | 
##              |     0.846 |     0.076 |     0.000 |           | 
##              |     0.139 |     0.042 |     0.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            2 |        22 |       418 |        35 |       475 | 
##              |     0.046 |     0.880 |     0.074 |     0.545 | 
##              |     0.154 |     0.855 |     0.146 |           | 
##              |     0.025 |     0.480 |     0.040 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            3 |         0 |        34 |       204 |       238 | 
##              |     0.000 |     0.143 |     0.857 |     0.273 | 
##              |     0.000 |     0.070 |     0.854 |           | 
##              |     0.000 |     0.039 |     0.234 |           | 
## -------------|-----------|-----------|-----------|-----------|
## Column Total |       143 |       489 |       239 |       871 | 
##              |     0.164 |     0.561 |     0.274 |           | 
## -------------|-----------|-----------|-----------|-----------|
## 
## 
table(test_label, y = test_pred)
##           y
## test_label   1   2   3
##          1 121  37   0
##          2  22 418  35
##          3   0  34 204
a<- as.data.frame(table(test_label, y = test_pred))
(a[1,3]+a[5,3]+a[9,3])/nrow(test)
## [1] 0.8530425
library(class)
test_pred <- knn(train = train, test = test,
                 cl = train_label, k=5)

library(gmodels)
CrossTable(x = test_label, y = test_pred, prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  871 
## 
##  
##              | test_pred 
##   test_label |         1 |         2 |         3 | Row Total | 
## -------------|-----------|-----------|-----------|-----------|
##            1 |       124 |        34 |         0 |       158 | 
##              |     0.785 |     0.215 |     0.000 |     0.181 | 
##              |     0.849 |     0.070 |     0.000 |           | 
##              |     0.142 |     0.039 |     0.000 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            2 |        22 |       415 |        38 |       475 | 
##              |     0.046 |     0.874 |     0.080 |     0.545 | 
##              |     0.151 |     0.852 |     0.160 |           | 
##              |     0.025 |     0.476 |     0.044 |           | 
## -------------|-----------|-----------|-----------|-----------|
##            3 |         0 |        38 |       200 |       238 | 
##              |     0.000 |     0.160 |     0.840 |     0.273 | 
##              |     0.000 |     0.078 |     0.840 |           | 
##              |     0.000 |     0.044 |     0.230 |           | 
## -------------|-----------|-----------|-----------|-----------|
## Column Total |       146 |       487 |       238 |       871 | 
##              |     0.168 |     0.559 |     0.273 |           | 
## -------------|-----------|-----------|-----------|-----------|
## 
## 
a<- as.data.frame(table(test_label, y = test_pred))
(a[1,3]+a[5,3]+a[9,3])/nrow(test)
## [1] 0.8484501