使用隨機森林演算法建立房屋建坪單價預測模型

載入使用套件與設定工作目錄

讀取房屋資料

setwd("/Users/hadoop/CxlHouseAVM")
clear.house.data <- fread("HouseDataSample.csv", sep=',', header = TRUE, stringsAsFactors = TRUE)

檢視資料摘要

##   TOTAL_FLOOR     TRANS_FLOOR     BUILD_AREA.ADJ     HOUSE_YEAR   
##  Min.   : 3.00   Min.   : 2.000   Min.   : 10.00   Min.   : 1.00  
##  1st Qu.: 7.00   1st Qu.: 3.000   1st Qu.: 17.70   1st Qu.: 7.00  
##  Median :12.00   Median : 6.000   Median : 30.43   Median :23.00  
##  Mean   :11.18   Mean   : 6.478   Mean   : 31.66   Mean   :20.29  
##  3rd Qu.:14.00   3rd Qu.: 9.000   3rd Qu.: 39.74   3rd Qu.:31.00  
##  Max.   :29.00   Max.   :28.000   Max.   :141.97   Max.   :45.00  
##  BUILD_U_PRICE_ADJ_INDEX
##  Min.   : 32.11         
##  1st Qu.: 63.07         
##  Median : 73.68         
##  Mean   : 76.55         
##  3rd Qu.: 88.69         
##  Max.   :151.86

使用caret套件，檢視特徵變異性，無變異性特徵可以過濾掉

nsv <- nearZeroVar(clear.house.data, saveMetrics= TRUE)
print(nsv)

##                         freqRatio percentUnique zeroVar   nzv
## TOTAL_FLOOR              1.048913           2.1   FALSE FALSE
## TRANS_FLOOR              1.031008           2.2   FALSE FALSE
## BUILD_AREA.ADJ           3.000000          74.2   FALSE FALSE
## HOUSE_YEAR               1.137931           4.5   FALSE FALSE
## BUILD_U_PRICE_ADJ_INDEX  1.000000          99.8   FALSE FALSE

將資料拆分成訓練資料與測試資料

## [1] "資料筆數:1000；70% 訓練集筆數:900；30% 測試集筆數:100"

透過訓練資料集建立隨機森林模型，設定每個決策數規則節點至少有50個資料，簡化決策樹方便畫圖

model <- randomForest(BUILD_U_PRICE_ADJ_INDEX ~ ., 
                      data=trainData, ntree=random.ntree, nodesize = 50, importance = TRUE)

測試模型結果

test.result <-  evaluateModel(model, testData   , 'Test House Price Regression by Random Forest' )

##      MAPE       R2 HitRate_10 HitRate_20
## 1 0.16302 0.385412       0.36       0.72

head(test.result$data)

##    TOTAL_FLOOR TRANS_FLOOR BUILD_AREA.ADJ HOUSE_YEAR
## 1:          10           9          30.94          4
## 2:           7           6          34.31         32
## 3:           7           3          37.43         30
## 4:           6           4          14.07         23
## 5:           6           5          14.68         40
## 6:           7           4          14.88         21
##    BUILD_U_PRICE_ADJ_INDEX testPrice    residual        aep       V1
## 1:               106.73135  86.44992  20.2814266 0.19002314 85.27424
## 2:                67.31327  71.10996  -3.7966904 0.05640330 72.64566
## 3:                70.35297  69.35767   0.9952973 0.01414720 69.91232
## 4:                56.67431  73.78296 -17.1086531 0.30187669 69.91232
## 5:                69.31176  73.78296  -4.4712032 0.06450858 69.91232
## 6:                64.64966  73.78296  -9.1333047 0.14127383 69.91232
##          V2       V3
## 1: 98.05806 76.01747
## 2: 69.56958 71.11463
## 3: 69.56958 68.59111
## 4: 69.56958 81.86699
## 5: 69.56958 81.86699
## 6: 69.56958 81.86699

繪製變數的重要性

plotImportance(model)

繪製每個樹狀圖 1. How to actually plot a sample tree from randomForest::getTree()? 2. Best way to present a random forest in a publication?

allTree.rules <- lapply(1:random.ntree, function(x) {
        tree <- getTree(model,x,labelVar=TRUE)

        tree$prediction <- round(tree$prediction, 2)
        d <- to.dendrogram(tree)

        plot(d,center=TRUE,leaflab='perpendicular',edgePar=list(t.cex=0.7,p.col=NA,p.lty=0))
        
        colnames(tree)<-sapply(colnames(tree),collapse)
        rules<-getConds(tree)
        print(paste("摘要決策樹[",x,"]前三個規則",sep='') )
        print(head(rules,3))
        
})

## [1] "摘要決策樹[1]前三個規則"
## [[1]]
## [1] "HOUSE_YEAR < 22.5  &  TOTAL_FLOOR < 5.5  &  TRANS_FLOOR < 5.5  =>  72.81"
## 
## [[2]]
## [1] "BUILD_AREA.ADJ > 59.795  &  TOTAL_FLOOR > 5.5  &  TRANS_FLOOR < 5.5  =>  105.08"
## 
## [[3]]
## [1] "HOUSE_YEAR < 5.5  &  TOTAL_FLOOR > 17.5  &  TRANS_FLOOR > 5.5  =>  110.92"

## [1] "摘要決策樹[2]前三個規則"
## [[1]]
## [1] "HOUSE_YEAR < 1.5  &  HOUSE_YEAR < 16.5  =>  75.69"
## 
## [[2]]
## [1] "BUILD_AREA.ADJ > 65.69  &  TRANS_FLOOR > 6.5  &  HOUSE_YEAR > 16.5  =>  100.77"
## 
## [[3]]
## [1] "TRANS_FLOOR < 5.5  &  HOUSE_YEAR < 9.5  &  HOUSE_YEAR > 1.5  &  HOUSE_YEAR < 16.5  =>  83.16"

## [1] "摘要決策樹[3]前三個規則"
## [[1]]
## [1] "TRANS_FLOOR > 3.5  &  TRANS_FLOOR < 5.5  &  BUILD_AREA.ADJ < 20.845  =>  81.87"
## 
## [[2]]
## [1] "TOTAL_FLOOR > 16.5  &  TRANS_FLOOR > 5.5  &  BUILD_AREA.ADJ < 20.845  =>  110.49"
## 
## [[3]]
## [1] "HOUSE_YEAR < 22.5  &  BUILD_AREA.ADJ > 59.03  &  BUILD_AREA.ADJ > 20.845  =>  92.01"

繪製特徵對價格的影響

par.names <- c("HOUSE_YEAR", "TOTAL_FLOOR","TRANS_FLOOR","BUILD_AREA.ADJ" )

par.names[which(!par.names %in% names(trainData))]

## character(0)

par(mfrow=c(3, 3))
print(par.names)

## [1] "HOUSE_YEAR"     "TOTAL_FLOOR"    "TRANS_FLOOR"    "BUILD_AREA.ADJ"

for (f.name in par.names) {
   partialPlot(model, trainData, x.var = eval(f.name), main=f.name, xlab=f.name)
}
par(mfrow=c(1, 1))

附錄使用函式

plotImportance <-function(model) {
  library(caret)
  (VI_F=importance(model))
  library(dplyr)
  class(VI_F)


  varImp(model)
  par(mfrow=c(1, 2), xpd=NA)
  varImpPlot(model,type=1)
  varImpPlot(model,type=2)
  par(mfrow=c(1, 1))
}


evaluateModel <- function(model, testData, title) {
        test.data <- predict(model, testData, predict.all= TRUE)
        
        testPrice <- test.data$aggregate
        tree.data <-  test.data$individual
        names(tree.data) <- c('Tree_1','Tree_2','Tree_3')
        
        head(tree.data)
        
        
        head(testPrice)
        head(testPrice)
        #library(miscTools)
        sum(is.na(testPrice))
        rSquared <- function (y, resid)
        {
                yy <- y - matrix(mean(y), nrow = nrow(array(y)))
                r2 <- 1 - (t(resid) %*% resid)/(t(yy) %*% yy)
                return(r2)
        }
        r2 <- rSquared(testData$BUILD_U_PRICE_ADJ_INDEX, testData$BUILD_U_PRICE_ADJ_INDEX -testPrice)
        r2 <- r2[1,1]
        testResult <- cbind.data.frame(testData, testPrice)
        testResult$residual <- testResult$BUILD_U_PRICE_ADJ_INDEX - testResult$testPrice
        testResult$aep <- abs(testResult$residual /testResult$BUILD_U_PRICE_ADJ_INDEX)
        
        (mse <- mean((testResult$residual)^2) )
        
        
        #install.packages("RevoUtilsMath")
        
        mape <- round(mean(testResult$aep, na.rm = TRUE), 5)
        
        hit_10 <- round(sum(testResult$aep <= 0.1, na.rm = TRUE) /dim(testResult)[1],3)
        
        hit_20 <- round(sum(testResult$aep <= 0.2, na.rm = TRUE) /dim(testResult)[1],3)
        
        data_count <- dim(testResult)[1]
        
        library(ggplot2)
        result <- data.frame(MAPE =mape, R2 = r2, HitRate_10 = hit_10, HitRate_20 = hit_20 )
        print(result)
        
        p <- ggplot(aes(x=BUILD_U_PRICE_ADJ_INDEX, y=testPrice),
                    data=testResult)
        p <- p + geom_point() +
                geom_abline(color="red") +
                theme(title= element_text(vjust=0.6, size=8)) +
                ggtitle(paste(title, "\nR^2=", r2,"; MAPE=",mape,";Hit Rate(10%)=", hit_10,"; Hit Rate(20%)=", hit_20 ,  sep=""))
        class(hit_20)
        testResult <- cbind(testResult, tree.data)
        
        output <- list()
        output$data <- testResult
        output$result <- result
        output$resultPlot <- p
        return (output)
}



to.dendrogram <- function(dfrep,rownum=1,height.increment=0.5){
        
        if(dfrep[rownum,'status'] == -1){
                rval <- list()
                
                attr(rval,"members") <- 1
                attr(rval,"height") <- 0.0
                attr(rval,"label") <- dfrep[rownum,'prediction']
                attr(rval,"leaf") <- TRUE
                
        }else{##note the change "to.dendrogram" and not "to.dendogram"
                left <- to.dendrogram(dfrep,dfrep[rownum,'left daughter'],height.increment)
                right <- to.dendrogram(dfrep,dfrep[rownum,'right daughter'],height.increment)
                rval <- list(left,right)
                
                attr(rval,"members") <- attr(left,"members") + attr(right,"members")
                attr(rval,"height") <- max(attr(left,"height"),attr(right,"height")) + height.increment
                attr(rval,"leaf") <- FALSE
                attr(rval,"edgetext") <- dfrep[rownum,'split var']
        }
        
        class(rval) <- "dendrogram"
        
        return(rval)
}


to.dendrogram <- function(dfrep,rownum=1,height.increment=0.1){
        
        if(dfrep[rownum,'status'] == -1){
                rval <- list()
                
                attr(rval,"members") <- 1
                attr(rval,"height") <- 0.0
                attr(rval,"label") <- round(dfrep[rownum,'prediction'], 2)
                attr(rval,"leaf") <- TRUE
                
        }else{##note the change "to.dendrogram" and not "to.dendogram"
                left <- to.dendrogram(dfrep,dfrep[rownum,'left daughter'],height.increment)
                right <- to.dendrogram(dfrep,dfrep[rownum,'right daughter'],height.increment)
                rval <- list(left,right)
                
                attr(rval,"members") <- attr(left,"members") + attr(right,"members")
                attr(rval,"height") <- max(attr(left,"height"),attr(right,"height")) + height.increment
                attr(rval,"leaf") <- FALSE
                attr(rval,"edgetext") <- paste(dfrep[rownum,'split var'],'\n(', dfrep[rownum,'split point'],')',sep='')
        }
        
        class(rval) <- "dendrogram"
        
        return(rval)
}



#**************************
#return the rules of a tree
#**************************
getConds<-function(tree){
        #store all conditions into a list
        conds<-list()
        #start by the terminal nodes and find previous conditions
        id.leafs<-which(tree$status==-1)
        j<-0
        for(i in id.leafs){
                j<-j+1
                prevConds<-prevCond(tree,i)
                conds[[j]]<-prevConds$cond
                while(prevConds$id>1){
                        prevConds<-prevCond(tree,prevConds$id)
                        conds[[j]]<-paste(conds[[j]]," & ",prevConds$cond)
                        if(prevConds$id==1){
                                conds[[j]]<-paste(conds[[j]]," => ",tree$prediction[i])
                                break()
                        }
                }
                
        }
        
        return(conds)
}

#**************************
#find the previous conditions in the tree
#**************************
prevCond<-function(tree,i){
        if(i %in% tree$right_daughter){
                id<-which(tree$right_daughter==i)
                cond<-paste(tree$split_var[id],">",tree$split_point[id])
        }
        if(i %in% tree$left_daughter){
                id<-which(tree$left_daughter==i)
                cond<-paste(tree$split_var[id],"<",tree$split_point[id])
        }
        
        return(list(cond=cond,id=id))
}

#remove spaces in a word
collapse<-function(x){
        x<-sub(" ","_",x)
        
        return(x)
}

使用隨機森林演算法建立房屋建坪單價預測模型

AllenTsai

2016年8月27日