載入使用套件與設定工作目錄
讀取房屋資料
setwd("/Users/hadoop/CxlHouseAVM")
clear.house.data <- fread("HouseDataSample.csv", sep=',', header = TRUE, stringsAsFactors = TRUE)
檢視資料摘要
## TOTAL_FLOOR TRANS_FLOOR BUILD_AREA.ADJ HOUSE_YEAR
## Min. : 3.00 Min. : 2.000 Min. : 10.00 Min. : 1.00
## 1st Qu.: 7.00 1st Qu.: 3.000 1st Qu.: 17.70 1st Qu.: 7.00
## Median :12.00 Median : 6.000 Median : 30.43 Median :23.00
## Mean :11.18 Mean : 6.478 Mean : 31.66 Mean :20.29
## 3rd Qu.:14.00 3rd Qu.: 9.000 3rd Qu.: 39.74 3rd Qu.:31.00
## Max. :29.00 Max. :28.000 Max. :141.97 Max. :45.00
## BUILD_U_PRICE_ADJ_INDEX
## Min. : 32.11
## 1st Qu.: 63.07
## Median : 73.68
## Mean : 76.55
## 3rd Qu.: 88.69
## Max. :151.86
使用caret套件,檢視特徵變異性,無變異性特徵可以過濾掉
nsv <- nearZeroVar(clear.house.data, saveMetrics= TRUE)
print(nsv)
## freqRatio percentUnique zeroVar nzv
## TOTAL_FLOOR 1.048913 2.1 FALSE FALSE
## TRANS_FLOOR 1.031008 2.2 FALSE FALSE
## BUILD_AREA.ADJ 3.000000 74.2 FALSE FALSE
## HOUSE_YEAR 1.137931 4.5 FALSE FALSE
## BUILD_U_PRICE_ADJ_INDEX 1.000000 99.8 FALSE FALSE
將資料拆分成訓練資料與測試資料
## [1] "資料筆數:1000;70% 訓練集筆數:900;30% 測試集筆數:100"
透過訓練資料集建立隨機森林模型,設定每個決策數規則節點至少有50個資料,簡化決策樹方便畫圖
model <- randomForest(BUILD_U_PRICE_ADJ_INDEX ~ .,
data=trainData, ntree=random.ntree, nodesize = 50, importance = TRUE)
測試模型結果
test.result <- evaluateModel(model, testData , 'Test House Price Regression by Random Forest' )
## MAPE R2 HitRate_10 HitRate_20
## 1 0.16302 0.385412 0.36 0.72
head(test.result$data)
## TOTAL_FLOOR TRANS_FLOOR BUILD_AREA.ADJ HOUSE_YEAR
## 1: 10 9 30.94 4
## 2: 7 6 34.31 32
## 3: 7 3 37.43 30
## 4: 6 4 14.07 23
## 5: 6 5 14.68 40
## 6: 7 4 14.88 21
## BUILD_U_PRICE_ADJ_INDEX testPrice residual aep V1
## 1: 106.73135 86.44992 20.2814266 0.19002314 85.27424
## 2: 67.31327 71.10996 -3.7966904 0.05640330 72.64566
## 3: 70.35297 69.35767 0.9952973 0.01414720 69.91232
## 4: 56.67431 73.78296 -17.1086531 0.30187669 69.91232
## 5: 69.31176 73.78296 -4.4712032 0.06450858 69.91232
## 6: 64.64966 73.78296 -9.1333047 0.14127383 69.91232
## V2 V3
## 1: 98.05806 76.01747
## 2: 69.56958 71.11463
## 3: 69.56958 68.59111
## 4: 69.56958 81.86699
## 5: 69.56958 81.86699
## 6: 69.56958 81.86699
繪製變數的重要性
plotImportance(model)
繪製每個樹狀圖 1. How to actually plot a sample tree from randomForest::getTree()? 2. Best way to present a random forest in a publication?
allTree.rules <- lapply(1:random.ntree, function(x) {
tree <- getTree(model,x,labelVar=TRUE)
tree$prediction <- round(tree$prediction, 2)
d <- to.dendrogram(tree)
plot(d,center=TRUE,leaflab='perpendicular',edgePar=list(t.cex=0.7,p.col=NA,p.lty=0))
colnames(tree)<-sapply(colnames(tree),collapse)
rules<-getConds(tree)
print(paste("摘要決策樹[",x,"]前三個規則",sep='') )
print(head(rules,3))
})
## [1] "摘要決策樹[1]前三個規則"
## [[1]]
## [1] "HOUSE_YEAR < 22.5 & TOTAL_FLOOR < 5.5 & TRANS_FLOOR < 5.5 => 72.81"
##
## [[2]]
## [1] "BUILD_AREA.ADJ > 59.795 & TOTAL_FLOOR > 5.5 & TRANS_FLOOR < 5.5 => 105.08"
##
## [[3]]
## [1] "HOUSE_YEAR < 5.5 & TOTAL_FLOOR > 17.5 & TRANS_FLOOR > 5.5 => 110.92"
## [1] "摘要決策樹[2]前三個規則"
## [[1]]
## [1] "HOUSE_YEAR < 1.5 & HOUSE_YEAR < 16.5 => 75.69"
##
## [[2]]
## [1] "BUILD_AREA.ADJ > 65.69 & TRANS_FLOOR > 6.5 & HOUSE_YEAR > 16.5 => 100.77"
##
## [[3]]
## [1] "TRANS_FLOOR < 5.5 & HOUSE_YEAR < 9.5 & HOUSE_YEAR > 1.5 & HOUSE_YEAR < 16.5 => 83.16"
## [1] "摘要決策樹[3]前三個規則"
## [[1]]
## [1] "TRANS_FLOOR > 3.5 & TRANS_FLOOR < 5.5 & BUILD_AREA.ADJ < 20.845 => 81.87"
##
## [[2]]
## [1] "TOTAL_FLOOR > 16.5 & TRANS_FLOOR > 5.5 & BUILD_AREA.ADJ < 20.845 => 110.49"
##
## [[3]]
## [1] "HOUSE_YEAR < 22.5 & BUILD_AREA.ADJ > 59.03 & BUILD_AREA.ADJ > 20.845 => 92.01"
繪製特徵對價格的影響
par.names <- c("HOUSE_YEAR", "TOTAL_FLOOR","TRANS_FLOOR","BUILD_AREA.ADJ" )
par.names[which(!par.names %in% names(trainData))]
## character(0)
par(mfrow=c(3, 3))
print(par.names)
## [1] "HOUSE_YEAR" "TOTAL_FLOOR" "TRANS_FLOOR" "BUILD_AREA.ADJ"
for (f.name in par.names) {
partialPlot(model, trainData, x.var = eval(f.name), main=f.name, xlab=f.name)
}
par(mfrow=c(1, 1))
附錄使用函式
plotImportance <-function(model) {
library(caret)
(VI_F=importance(model))
library(dplyr)
class(VI_F)
varImp(model)
par(mfrow=c(1, 2), xpd=NA)
varImpPlot(model,type=1)
varImpPlot(model,type=2)
par(mfrow=c(1, 1))
}
evaluateModel <- function(model, testData, title) {
test.data <- predict(model, testData, predict.all= TRUE)
testPrice <- test.data$aggregate
tree.data <- test.data$individual
names(tree.data) <- c('Tree_1','Tree_2','Tree_3')
head(tree.data)
head(testPrice)
head(testPrice)
#library(miscTools)
sum(is.na(testPrice))
rSquared <- function (y, resid)
{
yy <- y - matrix(mean(y), nrow = nrow(array(y)))
r2 <- 1 - (t(resid) %*% resid)/(t(yy) %*% yy)
return(r2)
}
r2 <- rSquared(testData$BUILD_U_PRICE_ADJ_INDEX, testData$BUILD_U_PRICE_ADJ_INDEX -testPrice)
r2 <- r2[1,1]
testResult <- cbind.data.frame(testData, testPrice)
testResult$residual <- testResult$BUILD_U_PRICE_ADJ_INDEX - testResult$testPrice
testResult$aep <- abs(testResult$residual /testResult$BUILD_U_PRICE_ADJ_INDEX)
(mse <- mean((testResult$residual)^2) )
#install.packages("RevoUtilsMath")
mape <- round(mean(testResult$aep, na.rm = TRUE), 5)
hit_10 <- round(sum(testResult$aep <= 0.1, na.rm = TRUE) /dim(testResult)[1],3)
hit_20 <- round(sum(testResult$aep <= 0.2, na.rm = TRUE) /dim(testResult)[1],3)
data_count <- dim(testResult)[1]
library(ggplot2)
result <- data.frame(MAPE =mape, R2 = r2, HitRate_10 = hit_10, HitRate_20 = hit_20 )
print(result)
p <- ggplot(aes(x=BUILD_U_PRICE_ADJ_INDEX, y=testPrice),
data=testResult)
p <- p + geom_point() +
geom_abline(color="red") +
theme(title= element_text(vjust=0.6, size=8)) +
ggtitle(paste(title, "\nR^2=", r2,"; MAPE=",mape,";Hit Rate(10%)=", hit_10,"; Hit Rate(20%)=", hit_20 , sep=""))
class(hit_20)
testResult <- cbind(testResult, tree.data)
output <- list()
output$data <- testResult
output$result <- result
output$resultPlot <- p
return (output)
}
to.dendrogram <- function(dfrep,rownum=1,height.increment=0.5){
if(dfrep[rownum,'status'] == -1){
rval <- list()
attr(rval,"members") <- 1
attr(rval,"height") <- 0.0
attr(rval,"label") <- dfrep[rownum,'prediction']
attr(rval,"leaf") <- TRUE
}else{##note the change "to.dendrogram" and not "to.dendogram"
left <- to.dendrogram(dfrep,dfrep[rownum,'left daughter'],height.increment)
right <- to.dendrogram(dfrep,dfrep[rownum,'right daughter'],height.increment)
rval <- list(left,right)
attr(rval,"members") <- attr(left,"members") + attr(right,"members")
attr(rval,"height") <- max(attr(left,"height"),attr(right,"height")) + height.increment
attr(rval,"leaf") <- FALSE
attr(rval,"edgetext") <- dfrep[rownum,'split var']
}
class(rval) <- "dendrogram"
return(rval)
}
to.dendrogram <- function(dfrep,rownum=1,height.increment=0.1){
if(dfrep[rownum,'status'] == -1){
rval <- list()
attr(rval,"members") <- 1
attr(rval,"height") <- 0.0
attr(rval,"label") <- round(dfrep[rownum,'prediction'], 2)
attr(rval,"leaf") <- TRUE
}else{##note the change "to.dendrogram" and not "to.dendogram"
left <- to.dendrogram(dfrep,dfrep[rownum,'left daughter'],height.increment)
right <- to.dendrogram(dfrep,dfrep[rownum,'right daughter'],height.increment)
rval <- list(left,right)
attr(rval,"members") <- attr(left,"members") + attr(right,"members")
attr(rval,"height") <- max(attr(left,"height"),attr(right,"height")) + height.increment
attr(rval,"leaf") <- FALSE
attr(rval,"edgetext") <- paste(dfrep[rownum,'split var'],'\n(', dfrep[rownum,'split point'],')',sep='')
}
class(rval) <- "dendrogram"
return(rval)
}
#**************************
#return the rules of a tree
#**************************
getConds<-function(tree){
#store all conditions into a list
conds<-list()
#start by the terminal nodes and find previous conditions
id.leafs<-which(tree$status==-1)
j<-0
for(i in id.leafs){
j<-j+1
prevConds<-prevCond(tree,i)
conds[[j]]<-prevConds$cond
while(prevConds$id>1){
prevConds<-prevCond(tree,prevConds$id)
conds[[j]]<-paste(conds[[j]]," & ",prevConds$cond)
if(prevConds$id==1){
conds[[j]]<-paste(conds[[j]]," => ",tree$prediction[i])
break()
}
}
}
return(conds)
}
#**************************
#find the previous conditions in the tree
#**************************
prevCond<-function(tree,i){
if(i %in% tree$right_daughter){
id<-which(tree$right_daughter==i)
cond<-paste(tree$split_var[id],">",tree$split_point[id])
}
if(i %in% tree$left_daughter){
id<-which(tree$left_daughter==i)
cond<-paste(tree$split_var[id],"<",tree$split_point[id])
}
return(list(cond=cond,id=id))
}
#remove spaces in a word
collapse<-function(x){
x<-sub(" ","_",x)
return(x)
}