# 請使用 hw 資料夾下的 lvr_prices_big5.csv、house_danger.csv 檔案完成下列練習。
# Restore the objects saved in the course RData file; the str() call below
# expects that file to provide `lvr_prices`.
load(file = "~/lecture/riii/hw/lvr_prices.RData")

# Print the structure (column names, types, first values) of the data frame.
str(lvr_prices)
## 'data.frame': 102054 obs. of 24 variables:
## $ area : Factor w/ 12 levels "北投區","大安區",..: 2 12 3 3 5 10 7 7 6 2 ...
## $ trading_target : Factor w/ 5 levels "車位","房地(土地+建物)",..: 2 2 5 2 2 5 5 2 2 2 ...
## $ address : Factor w/ 17557 levels "安康段1~30地號",..: 4168 15798 1934 5769 7243 694 110 10129 7894 3918 ...
## $ land_sqmeter : num 19.39 8.46 5.5 3.88 32.41 ...
## $ city_land_type : Factor w/ 6 levels "","工","農","其他",..: 6 5 4 5 6 4 4 6 6 5 ...
## $ trading_ymd : Factor w/ 2312 levels "","1973-08-29",..: 912 931 940 923 923 936 936 930 933 930 ...
## $ trading_num : Factor w/ 420 levels "土地0建物0車位0",..: 130 306 87 337 105 303 87 105 105 105 ...
## $ floor : Factor w/ 515 levels "","八層","八層,電梯樓梯間",..: 381 155 1 166 225 1 1 197 413 76 ...
## $ total_floor : Factor w/ 37 levels "","八層","二層",..: 29 26 1 33 36 1 1 16 36 34 ...
## $ building_type : Factor w/ 12 levels "辦公商業大樓",..: 12 1 9 12 6 9 9 7 6 6 ...
## $ main_purpose : Factor w/ 13 levels "","工商用","工業用",..: 4 9 1 9 12 1 1 4 12 12 ...
## $ built_with : Factor w/ 18 levels "","壁式預鑄鋼筋混凝土造",..: 6 6 1 6 6 1 1 6 6 6 ...
## $ finish_ymd : Factor w/ 8710 levels "","1911-05-06",..: 4245 3467 1 2172 3125 1 1 5519 2821 1265 ...
## $ building_sqmeter: num 101 93.4 0 36.7 104.1 ...
## $ room : int 3 0 0 1 3 0 0 3 2 3 ...
## $ living_room : int 2 0 0 1 1 0 0 2 1 2 ...
## $ bath : int 1 0 0 1 1 0 0 2 1 2 ...
## $ compartment : Factor w/ 2 levels "無","有": 2 2 2 2 2 2 2 2 2 2 ...
## $ management : Factor w/ 2 levels "無","有": 2 2 1 2 1 1 1 2 1 1 ...
## $ total_price : num 18680000 20300000 132096 4200000 14000000 ...
## $ parking_type : Factor w/ 8 levels "","坡道機械",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ parking_sqmeter : num 0 0 0 0 0 0 0 0 0 0 ...
## $ parking_price : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ID : Factor w/ 102054 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
# Build the modelling table from lvr_prices: keep rows whose city_land_type is
# "住" and that have a positive total price, a positive building area and a
# non-empty completion date. which() discards rows where the condition is NA;
# indexing with the raw logical vector would turn each of them into an all-NA
# row.
keep_rows <- which(
  lvr_prices$city_land_type == "住" &
    lvr_prices$total_price > 0 &
    lvr_prices$building_sqmeter > 0 &
    lvr_prices$finish_ymd != ""
)
house <- lvr_prices[keep_rows, ]

# Unit price (total price per square metre of building), rounded to a whole
# number.
house$price_per_sqmeter <- round(house$total_price / house$building_sqmeter)

# Outlier filter: keep rows whose unit price is within 3 standard deviations
# of the mean. scale() returns a one-column matrix, so flatten it before using
# it as a row index.
price_z <- as.vector(scale(house$price_per_sqmeter))
house <- house[abs(price_z) <= 3, ]

# Building age in whole 365-day years, measured on the day the script runs
# (as.integer() truncates the fractional part).
house$finish_ymd <- as.Date(house$finish_ymd)
house$building_age <- as.integer((Sys.Date() - house$finish_ymd) / 365)

# Attach the columns of house_danger.csv by ID (this is where the `danger`
# label used by the model comes from). all.x = TRUE keeps houses that have no
# match; their merged columns are NA.
house_danger <- read.csv("~/lecture/riii/hw/house_danger.csv")
house <- merge(house, house_danger, by = "ID", all.x = TRUE)

# Random split with roughly 80% of rows for training and 20% for testing.
# The seed is fixed so the same rows land in each set on every run; the value
# itself is arbitrary. Output recorded elsewhere in this file came from an
# unseeded run, so re-running will not reproduce those exact numbers.
set.seed(123)
ind <- sample(1:2, size = nrow(house), replace = TRUE, prob = c(0.8, 0.2))
trainset <- house[ind == 1, ]
testset <- house[ind == 2, ]
library(rpart)

# Fit a decision tree on the training rows that models `danger` from the
# district (area), building age, building type, floor area and unit price.
house.rp <- rpart(
  formula = danger ~ area + building_age + building_type +
    building_sqmeter + price_per_sqmeter,
  data = trainset
)

# Draw the tree skeleton, then add the split and leaf labels on top of it.
plot(house.rp, uniform = TRUE, branch = 0.6, margin = 0.1)
text(house.rp, cex = 0.7)

# Show the complexity-parameter table of the fitted tree.
house.rp$cptable
## CP nsplit rel error xerror xstd
## 1 0.20128248 0 1.0000000 1.0000000 0.011866128
## 2 0.09776815 2 0.5974350 0.5976016 0.009504612
## 3 0.06645570 4 0.4018987 0.4017322 0.007921799
## 4 0.02681546 5 0.3354430 0.3352765 0.007276507
## 5 0.01000000 8 0.2538308 0.2544970 0.006381228
# Take the CP value from the cptable row with the smallest cross-validated
# error (column "xerror").
house.cp <- local({
  cp_table <- house.rp$cptable
  best_row <- which.min(cp_table[, "xerror"])
  cp_table[best_row, "CP"]
})

# Prune the tree back to that CP, then predict a class label for every test
# row.
house.rp <- prune(house.rp, cp = house.cp)
predictions <- predict(house.rp, newdata = testset, type = "class")
library('caret')
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
library('e1071')
# Cross-tabulate predicted labels (rows) against the actual `danger` labels
# (columns) and summarise the classifier's performance.
# positive = "YES" makes the class-specific statistics (sensitivity,
# specificity, predictive values, ...) describe the YES class. Without it
# caret uses the first level, which is NO for this table.
# NOTE: output recorded in this file that reports "'Positive' Class : NO" was
# produced before this argument was added.
confusionMatrix(table(predictions, testset$danger), positive = "YES")
## Confusion Matrix and Statistics
##
##
## predictions NO YES
## NO 7688 4
## YES 378 1432
##
## Accuracy : 0.9598
## 95% CI : (0.9557, 0.9637)
## No Information Rate : 0.8489
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8585
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9531
## Specificity : 0.9972
## Pos Pred Value : 0.9995
## Neg Pred Value : 0.7912
## Prevalence : 0.8489
## Detection Rate : 0.8091
## Detection Prevalence : 0.8095
## Balanced Accuracy : 0.9752
##
## 'Positive' Class : NO
##
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Class-membership probabilities for the test rows; the "YES" column is used
# as the ranking score.
predictions <- predict(house.rp, newdata = testset, type = "prob")
pred.to.roc <- predictions[, "YES"]

# Pair the scores with the true labels for ROCR.
pred.rocr <- prediction(predictions = pred.to.roc, labels = testset$danger)

# ROC curve: true positive rate plotted against false positive rate.
perf.tpr.rocr <- performance(pred.rocr, measure = "tpr", x.measure = "fpr")
plot(perf.tpr.rocr)

# Area under the ROC curve; the value is stored in the y.values slot.
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
print(perf.rocr@y.values)
## [[1]]
## [1] 0.9751252