R Final Homework

#(1) 請讀取final資料夾下的lvr_prices檔案 [5分]
load('~/lecture/riii/final/lvr_prices.RData')

#(2) 請問可使用哪個函式觀看檔案的資料結構？[5分]
str(lvr_prices)

## 'data.frame':    102054 obs. of  24 variables:
##  $ area            : Factor w/ 12 levels "北投區","大安區",..: 2 12 3 3 5 10 7 7 6 2 ...
##  $ trading_target  : Factor w/ 5 levels "車位","房地(土地+建物)",..: 2 2 5 2 2 5 5 2 2 2 ...
##  $ address         : Factor w/ 17557 levels "安康段1~30地號",..: 4168 15798 1934 5769 7243 694 110 10129 7894 3918 ...
##  $ land_sqmeter    : num  19.39 8.46 5.5 3.88 32.41 ...
##  $ city_land_type  : Factor w/ 6 levels "","工","農","其他",..: 6 5 4 5 6 4 4 6 6 5 ...
##  $ trading_ymd     : Factor w/ 2312 levels "","1973-08-29",..: 912 931 940 923 923 936 936 930 933 930 ...
##  $ trading_num     : Factor w/ 420 levels "土地0建物0車位0",..: 130 306 87 337 105 303 87 105 105 105 ...
##  $ floor           : Factor w/ 515 levels "","八層","八層，電梯樓梯間",..: 381 155 1 166 225 1 1 197 413 76 ...
##  $ total_floor     : Factor w/ 37 levels "","八層","二層",..: 29 26 1 33 36 1 1 16 36 34 ...
##  $ building_type   : Factor w/ 12 levels "辦公商業大樓",..: 12 1 9 12 6 9 9 7 6 6 ...
##  $ main_purpose    : Factor w/ 13 levels "","工商用","工業用",..: 4 9 1 9 12 1 1 4 12 12 ...
##  $ built_with      : Factor w/ 18 levels "","壁式預鑄鋼筋混凝土造",..: 6 6 1 6 6 1 1 6 6 6 ...
##  $ finish_ymd      : Factor w/ 8710 levels "","1911-05-06",..: 4245 3467 1 2172 3125 1 1 5519 2821 1265 ...
##  $ building_sqmeter: num  101 93.4 0 36.7 104.1 ...
##  $ room            : int  3 0 0 1 3 0 0 3 2 3 ...
##  $ living_room     : int  2 0 0 1 1 0 0 2 1 2 ...
##  $ bath            : int  1 0 0 1 1 0 0 2 1 2 ...
##  $ compartment     : Factor w/ 2 levels "無","有": 2 2 2 2 2 2 2 2 2 2 ...
##  $ management      : Factor w/ 2 levels "無","有": 2 2 1 2 1 1 1 2 1 1 ...
##  $ total_price     : num  18680000 20300000 132096 4200000 14000000 ...
##  $ parking_type    : Factor w/ 8 levels "","坡道機械",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ parking_sqmeter : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ parking_price   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ID              : Factor w/ 102054 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...

#(3) 請問可使用哪個函式觀看資料前10筆資料？[5分]
head(lvr_prices,10)

#(4) 請篩選出
#  1. city_land_type為住宅用
#  2. total_price > 0
#  3. building_sqmeter > 0
#  4. finish_ymd 非空值
#  的房屋資料,並存入house變數中。[8分]
house = lvr_prices[lvr_prices$city_land_type == '住' & lvr_prices$total_price > 0 & lvr_prices$building_sqmeter > 0 & lvr_prices$finish_ymd != '',]

#(5) 請使用house資料，利用房屋價格(total_price)及房屋平方米數(building_sqmeter)兩欄位，
#    產生一新欄位為每平方米價格(price_per_sqmeter)，並將其四捨五入到整數位。[5分]
house$price_per_sqmeter = round(house$total_price / house$building_sqmeter)

#(6) 請使用house資料，利用scale() 將每平方米價格(price_per_sqmeter)欄位資料標準化
#    ，並剔除掉outlier資料(z-score > 3)。[5分]
house = house[abs(scale(house$price_per_sqmeter)) <= 3,]

#(7) 請問在house資料中各行政區(area)的資料筆數為何? 可否畫出其長條圖? [5分]
table(house$area)

## 
## 北投區 大安區 大同區 南港區 內湖區 士林區 松山區 萬華區 文山區 信義區 
##   6057   3881    983   2903   7793   5274   2812   2440   6293   3085 
## 中山區 中正區 
##   4438   2377

barplot(table(house$area),family="Songti SC")

#(8) 請使用house資料，計算各行政區每平方米價格(price_per_sqmeter)欄位資料的平均數，中位數及標準差 [8分]
tapply(house$price_per_sqmeter,house$area,function(e){c(mean(e),median(e),sd(e))})

## $北投區
## [1] 133586.5 130108.0  55828.0
## 
## $大安區
## [1] 247147.8 248055.0 111525.0
## 
## $大同區
## [1] 138409.27 140310.00  55705.87
## 
## $南港區
## [1] 168882.97 170298.00  55608.19
## 
## $內湖區
## [1] 153234.58 153264.00  54979.36
## 
## $士林區
## [1] 165804.02 158733.00  82885.72
## 
## $松山區
## [1] 194179.66 202705.00  75834.34
## 
## $萬華區
## [1] 104257.01 101056.00  55204.07
## 
## $文山區
## [1] 125663.78 126366.00  39369.61
## 
## $信義區
## [1] 195851.33 187912.00  96537.79
## 
## $中山區
## [1] 191188.54 188337.00  81773.21
## 
## $中正區
## [1] 205840.27 205756.00  84486.38

#(9) 請使用house資料,利用ggplot2的facet_wrap函數繪製各行政區房屋每平方米價格(price_per_sqmeter)的直方圖 [8分]
library('ggplot2')
g = ggplot(house,aes(x=price_per_sqmeter))
g+geom_histogram()+facet_wrap(~area) + theme(text=element_text(size=16,  family="Songti SC"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#(10) 試利用房屋完工日期(finish_ymd)產生一新變數為屋齡(building_age)加入house資料中。
#hint1: 取得當前日期的函數為 Sys.Date()
#hint2: 一年請以365天計算，四捨五入至整數位
#hint3: 將運算完的資料轉為整數型態(integer) [8分]
house$finish_ymd = as.Date(house$finish_ymd)
house$building_age = as.integer( (Sys.Date() - house$finish_ymd ) / 365)

#(11) 請讀取final資料夾下的house_danger檔案，
#     並將house資料集和house_danger資料集以left outer join方式join起來，
#     存回house變數中 [5分]
load('~/lecture/riii/final/house_danger.RData')
house = merge(house,house_danger,by = 'ID', all.x = T)

#(12) 請將house資料以8:2的比例分為訓練集和測試集，
#     將訓練集資料存在trainset變數中，將測試集資料存在testset變數中。 [5分]
set.seed(1206)
ind<-sample(1:2, size=nrow(house), replace=T, prob=c(0.8, 0.2))
trainset=house[ind==1,]
testset=house[ind==2,]

#(13) 利用rpart套件建立一預測房屋是否為危樓(danger)的決策樹模型，
#     請利用行政區(area), 屋齡(building_age), 房屋總平方米數(building_sqmeter),
#     房屋類型(building_type)及每平方米價格(price_per_sqmeter)
#     5個變數作為解釋變數放入模型當中建模，並將模型存在house.rp變數中。 [5分]
library('rpart')
house.rp<-rpart(danger ~ area+building_age+building_type+building_sqmeter+price_per_sqmeter, data=trainset)

#(14) 請利用plot()和text()畫出house.rp模型的決策樹 [5分]
plot(house.rp, uniform=TRUE,branch = 0.6, margin=0.1)
text(house.rp, cex=0.7)

#(15) 請問此決策數是否需要進行剪枝(prune)？如需剪枝請將修剪後的模型存回house.rp中。 [5分]
house.rp$cptable

##           CP nsplit rel error    xerror        xstd
## 1 0.20133830      0 1.0000000 1.0000000 0.011977156
## 2 0.08326275      2 0.5973234 0.5973234 0.009588005
## 3 0.07284432      4 0.4307979 0.4309673 0.008257593
## 4 0.02498729      5 0.3579536 0.3582924 0.007573967
## 5 0.01000000      8 0.2803659 0.2808741 0.006747899

house.cp = house.rp$cptable[which.min(house.rp$cptable[,"xerror"]), "CP"]
house.rp=prune(house.rp, cp=house.cp)

#(16) 請將測試集資料(testset)放入模型中進行驗證，請問此模型的accuracy, precision, recall等績效分別為何？ [5分]
predictions =predict(house.rp, testset,type = 'class')
library('caret')

## Loading required package: lattice

library('e1071')
confusionMatrix(table(predictions, testset$danger))

## Confusion Matrix and Statistics
## 
##            
## predictions   NO  YES
##         NO  7855   18
##         YES  413 1519
##                                         
##                Accuracy : 0.956         
##                  95% CI : (0.9518, 0.96)
##     No Information Rate : 0.8432        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8495        
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.9500        
##             Specificity : 0.9883        
##          Pos Pred Value : 0.9977        
##          Neg Pred Value : 0.7862        
##              Prevalence : 0.8432        
##          Detection Rate : 0.8011        
##    Detection Prevalence : 0.8030        
##       Balanced Accuracy : 0.9692        
##                                         
##        'Positive' Class : NO            
##

#(17) 請繪製出此模型的ROC曲線，並計算其AUC [8分]
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions <-predict(house.rp, testset, type="prob")
pred.to.roc<-predictions[, 'YES']
pred.rocr<-prediction(pred.to.roc, testset$danger)
perf.tpr.rocr<-performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr)

perf.rocr<-performance(pred.rocr, measure ="auc", x.measure="cutoff")
print(perf.rocr@y.values)

## [[1]]
## [1] 0.9740802