# R Basics ----

# Scalar arithmetic: bind two numbers, then let their sum auto-print.
b <- 2
a <- 3
a + b
## [1] 5
# Bring the built-in Anscombe quartet into the workspace.
data("anscombe")
# View(anscombe)  # interactive viewer, left disabled


# Simple linear regression of y1 on x1 (first Anscombe data set); printing the
# object shows the call and the estimated coefficients.
fit <- lm(formula = y1 ~ x1, data = anscombe)
fit
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
# Scatter plot of the observations with the fitted line drawn on top in red.
plot(y1 ~ x1, data = anscombe)
abline(reg = fit, col = "red")

# Predicted y1 for a single new observation with x1 = 13.
predict(fit, newdata = data.frame(x1 = 13))
##        1 
## 9.501273

# 資料匯入 (Data import) ----

library(readr)
# First import: let readr guess every column type (Sex is guessed as integer).
customer <- read_csv(file = "customer.csv")
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   Visit.Time = col_integer(),
##   Average.Expense = col_double(),
##   Sex = col_integer(),
##   Age = col_integer()
## )
# Second import of the same file, this time declaring Sex as a two-level
# factor up front; the remaining columns are still guessed.
customer <- read_csv(
  file = "customer.csv",
  col_types = cols(Sex = col_factor(levels = c("0", "1")))
)

#View(customer)
# Check the class of the imported object, then list every column with its type.
class(customer)
## [1] "tbl_df"     "tbl"        "data.frame"
str(customer)
## Classes 'tbl_df', 'tbl' and 'data.frame':    60 obs. of  5 variables:
##  $ ID             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Visit.Time     : int  3 5 16 5 16 3 12 14 6 3 ...
##  $ Average.Expense: num  5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
##  $ Sex            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age            : int  10 27 32 30 23 15 33 27 16 11 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 5
##   .. ..$ ID             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Visit.Time     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Average.Expense: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Sex            :List of 3
##   .. .. ..$ levels    : chr  "0" "1"
##   .. .. ..$ ordered   : logi FALSE
##   .. .. ..$ include_na: logi FALSE
##   .. .. ..- attr(*, "class")= chr  "collector_factor" "collector"
##   .. ..$ Age            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Per-column summary: quartiles for the numeric columns, level counts for Sex.
summary(customer)
##        ID          Visit.Time   Average.Expense Sex         Age       
##  Min.   : 1.00   Min.   : 1.0   Min.   : 4.50   0:19   Min.   : 8.00  
##  1st Qu.:15.75   1st Qu.: 5.0   1st Qu.:10.82   1:41   1st Qu.:15.00  
##  Median :30.50   Median : 7.5   Median :16.00          Median :20.50  
##  Mean   :30.50   Mean   : 8.4   Mean   :17.06          Mean   :21.43  
##  3rd Qu.:45.25   3rd Qu.:12.0   3rd Qu.:24.90          3rd Qu.:27.00  
##  Max.   :60.00   Max.   :18.0   Max.   :33.70          Max.   :47.00
# Coerce Sex to a factor and summarise again. The column was already imported
# as a factor, so the summary is unchanged.
customer[["Sex"]] <- as.factor(customer[["Sex"]])
summary(customer)
##        ID          Visit.Time   Average.Expense Sex         Age       
##  Min.   : 1.00   Min.   : 1.0   Min.   : 4.50   0:19   Min.   : 8.00  
##  1st Qu.:15.75   1st Qu.: 5.0   1st Qu.:10.82   1:41   1st Qu.:15.00  
##  Median :30.50   Median : 7.5   Median :16.00          Median :20.50  
##  Mean   :30.50   Mean   : 8.4   Mean   :17.06          Mean   :21.43  
##  3rd Qu.:45.25   3rd Qu.:12.0   3rd Qu.:24.90          3rd Qu.:27.00  
##  Max.   :60.00   Max.   :18.0   Max.   :33.70          Max.   :47.00
# Two equivalent ways of opening the help page for write.csv.
help(write.csv)
?write.csv
# method 1: every argument named.
# row.names = FALSE keeps the automatic row numbers out of the file; with the
# default they are written as an unnamed first column, which comes back as a
# spurious extra column when the file is read again.
write.csv(x = customer, file = "customer2.csv", row.names = FALSE)

# method 2: data passed positionally (rewrites the same file).
write.csv(customer, file = "customer2.csv", row.names = FALSE)

# get working directory (relative output paths are resolved against it)
getwd()
## [1] "/home/david"
# Tab-separated export of the same table, again without the row-number column.
write.table(x = customer, file = "customer.tab", sep = "\t", row.names = FALSE)

# Open the help page for save().
?save
# Serialise the customer table to an .RData file ...
save(list = "customer", file = "customer.RData")

# ... drop it from the workspace ...
rm(list = "customer")
#customer

# ... and restore it from disk under its original name.
load(file = "customer.RData")


# Load the built-in iris data set and show its first rows.
data("iris")
head(x = iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
library(rpart)
# Grow a classification tree that predicts Species from all other columns,
# then print its splits.
fit <- rpart(formula = Species ~ ., data = iris)
fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
# Round-trip the fitted model through an .RData file: save it, remove it from
# the workspace, and load it back under the same name.
save(list = "fit", file = "model.RData")
rm(list = "fit")
load(file = "model.RData")

# Score the iris rows with the reloaded model.
predicted <- predict(fit, newdata = iris)

# 匯入 Excel 資料 (Importing Excel data) ----

# Fetch the sample workbook. An .xlsx file is a binary (zip) archive, so it
# has to be written with mode = "wb"; download.file's default text mode can
# corrupt it on Windows and make read_excel() fail.
download.file(
  "https://raw.githubusercontent.com/ywchiu/rcathaybk/master/data/FinancialReport.xlsx",
  destfile = "FinancialReport.xlsx",
  mode = "wb"
)

library(readxl)
# Read the first sheet of the workbook into a tibble.
FinancialReport <- read_excel("FinancialReport.xlsx")
#View(FinancialReport)
# Check the class of the result and list every column with its type.
class(FinancialReport)
## [1] "tbl_df"     "tbl"        "data.frame"
str(FinancialReport)
## Classes 'tbl_df', 'tbl' and 'data.frame':    17 obs. of  14 variables:
##  $ 年度    : num  2015 2014 2013 2012 2011 ...
##  $ 股本    : num  2593 2593 2593 2592 2592 ...
##  $ 財報評分: num  94 91 89 94 94 96 92 94 94 96 ...
##  $ 收盤    : num  143 141 105.5 97 75.8 ...
##  $ 平均    : num  140 123 104 84.1 72.1 62 55.5 56.4 65.5 61.3 ...
##  $ 漲跌    : num  2 35.5 8.5 21.2 4.8 6.5 20.1 -17.6 -5.5 5 ...
##  $ 漲跌__1 : num  1.4 33.6 8.8 28 6.8 10.1 45.3 -28.4 -8.1 8 ...
##  $ 營業收入: num  8435 7628 5970 5062 4271 ...
##  $ 營業毛利: num  4104 3777 2809 2436 1941 ...
##  $ 營業利益: num  3200 2959 2094 1811 1416 ...
##  $ 業外損益: num  304 62.1 60.6 4.97 35.9 111 35 70.4 99.2 61 ...
##  $ 稅後淨利: num  3066 2639 1881 1662 1342 ...
##  $ ROA     : num  19.4 19.1 17 19.2 18 24.7 15.5 17.8 19 23 ...
##  $ EPS     : num  11.82 10.18 7.26 6.41 5.18 ...
# Print min / quartiles / mean / max for every column of the report.
summary(FinancialReport)
##       年度           股本         財報評分          收盤       
##  Min.   :1999   Min.   : 767   Min.   :59.00   Min.   : 42.60  
##  1st Qu.:2003   1st Qu.:2027   1st Qu.:89.00   1st Qu.: 62.50  
##  Median :2007   Median :2583   Median :92.00   Median : 71.00  
##  Mean   :2007   Mean   :2249   Mean   :88.24   Mean   : 83.75  
##  3rd Qu.:2011   3rd Qu.:2592   3rd Qu.:94.00   3rd Qu.: 97.00  
##  Max.   :2015   Max.   :2643   Max.   :96.00   Max.   :167.00  
##       平均             漲跌            漲跌__1          營業收入   
##  Min.   : 52.40   Min.   :-88.500   Min.   :-53.00   Min.   : 731  
##  1st Qu.: 56.40   1st Qu.: -5.500   1st Qu.: -8.10   1st Qu.:2030  
##  Median : 67.40   Median :  6.500   Median :  8.80   Median :3174  
##  Mean   : 82.29   Mean   :  4.235   Mean   : 11.77   Mean   :3576  
##  3rd Qu.:104.00   3rd Qu.: 20.100   3rd Qu.: 28.00   3rd Qu.:4271  
##  Max.   :147.00   Max.   : 96.000   Max.   :135.00   Max.   :8435  
##     營業毛利       營業利益       業外損益         稅後淨利   
##  Min.   : 315   Min.   : 128   Min.   :-43.70   Min.   : 145  
##  1st Qu.: 765   1st Qu.: 613   1st Qu.:  4.97   1st Qu.: 651  
##  Median :1417   Median :1044   Median : 35.00   Median : 999  
##  Mean   :1639   Mean   :1238   Mean   : 50.67   Mean   :1179  
##  3rd Qu.:2071   3rd Qu.:1592   3rd Qu.: 62.10   3rd Qu.:1616  
##  Max.   :4104   Max.   :3200   Max.   :304.00   Max.   :3066  
##       ROA             EPS        
##  Min.   : 3.93   Min.   : 0.830  
##  1st Qu.:15.50   1st Qu.: 3.450  
##  Median :18.40   Median : 4.140  
##  Mean   :17.15   Mean   : 4.969  
##  3rd Qu.:19.40   3rd Qu.: 6.240  
##  Max.   :24.70   Max.   :11.820

# 讀取 JSON (Reading JSON) ----

# Download the sample JSON file next to the script.
download.file(
  url = "https://raw.githubusercontent.com/ywchiu/rcathaybk/master/data/usd.json",
  destfile = "usd.json"
)

library(jsonlite)
# Parse the file; the result is a plain list with two named elements.
btc <- fromJSON(txt = "usd.json")
class(btc)
## [1] "list"
names(btc)
## [1] "stats"         "total_volumes"
# The "stats" element is a two-column matrix.
class(btc[["stats"]])
## [1] "matrix"
dim(btc[["stats"]])
## [1] 1812    2
# Convert the matrix to a data frame; the columns start out as V1 / V2 and
# are then given descriptive names.
df <- as.data.frame(btc[["stats"]])

colnames(df)
## [1] "V1" "V2"
colnames(df) <- c("datetime", "btc")

# Line chart of the btc column against the datetime column.
plot(btc ~ datetime, data = df, type = "l", col = "blue")

# 載入 XML (Loading XML) ----

# Download the sample XML file next to the script.
download.file(
  url = "https://raw.githubusercontent.com/ywchiu/rcathaybk/master/data/iso_company.xml",
  destfile = "iso_company.xml"
)

library(XML)
# Flatten the XML records into a data frame and show the first rows.
iso <- xmlToDataFrame(doc = "iso_company.xml")
head(x = iso)
##   廠商代號                       廠商名稱
## 1  1S2I002             衛生福利部玉里醫院
## 2  1XDI001 亞洲水泥股份有限公司花蓮製造廠
## 3  2FBF001         九股山食品股份有限公司
## 4  2S4Y002 台灣電力股份有限公司協和發電廠
## 5  2XHY001       杏輝藥品工業股份有限公司
## 6  3A0Y017           威技電器股份有限公司
##                                 廠址             電話        傳真
## 1            花蓮縣玉里鎮中華路448號  03-8886141#2115  03-8980461
## 2      花蓮縣新城鄉新城村新興路125號 (03)8612101轉204 (03)8612108
## 3         宜蘭縣頭城鎮復興路37巷29號       03-9779988  03-9778081
## 4                   基隆市文化路80號  02-24248111-310 02-24260450
## 5             宜蘭縣冬山鄉中山路84號 03-9581101轉1245  03-9583309
## 6 新北市中和區新民街112號8樓之8、之9      02-22265381 02-22265539
##                登錄日期 登錄範圍 證書有效期限 版本
## 1 2009-11-26 00:00:00.0            2018/09/30   05
## 2 2009-12-11 00:00:00.0            2018/09/30   03
## 3 2006-09-18 00:00:00.0            2018/09/17   06
## 4 1997-10-27 00:00:00.0            2018/09/22   08
## 5 1997-10-30 00:00:00.0            2018/09/22   10
## 6 2000-03-23 00:00:00.0            2018/07/05   08

# 處理實價登錄資料 (Processing real-estate actual-price registration data) ----

library(readr)
# total_price contains values above .Machine$integer.max (e.g. 6700000000).
# When readr guesses the column as integer those rows fail to parse and are
# silently stored as NA, so the column is read as double instead. All other
# columns are still guessed.
# NOTE: the transcript output recorded below was captured before this change
# and therefore still shows the integer column and its 32 parsing failures.
lvr_prices <- read_csv(
  "/tmp/lvr_prices.csv",
  col_types = cols(total_price = col_double())
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row col         expected   actual     file                  expected   <int> <chr>       <chr>      <chr>      <chr>                 actual 1  1282 total_price an integer 6700000000 '/tmp/lvr_prices.csv' file 2  2243 total_price an integer 3882685600 '/tmp/lvr_prices.csv' row 3  2244 total_price an integer 3373314400 '/tmp/lvr_prices.csv' col 4  4629 total_price an integer 3050000000 '/tmp/lvr_prices.csv' expected 5  5890 total_price an integer 3133800000 '/tmp/lvr_prices.csv'
## ... ................. ... ............................................................... ........ ............................................................... ...... ............................................................... .... ............................................................... ... ............................................................... ... ............................................................... ........ ...............................................................
## See problems(...) for more details.
# Check the class of the imported object, then list every column with its
# type (str also prints the "problems" and "spec" attributes left by readr).
class(lvr_prices)
## [1] "tbl_df"     "tbl"        "data.frame"
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame':    102054 obs. of  29 variables:
##  $ X1                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ area              : chr  "大安區" "中正區" "大同區" "大同區" ...
##  $ trading_target    : chr  "房地(土地+建物)" "房地(土地+建物)" "土地" "房地(土地+建物)" ...
##  $ address           : chr  "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
##  $ land_sqmeter      : num  19.39 8.46 5.5 3.88 32.41 ...
##  $ city_land_type    : chr  "住" "商" "其他" "商" ...
##  $ non_city_land_type: chr  NA NA NA NA ...
##  $ non_city_code     : chr  NA NA NA NA ...
##  $ trading_ymd       : Date, format: "2012-06-29" "2012-07-18" ...
##  $ trading_num       : chr  "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
##  $ floor             : chr  "五層" "九層" NA "六層" ...
##  $ total_floor       : chr  "十七層" "十二層" NA "十一層" ...
##  $ building_type     : chr  "住宅大樓(11層含以上有電梯)" "辦公商業大樓" "其他" "住宅大樓(11層含以上有電梯)" ...
##  $ main_purpose      : chr  "國民住宅" "商業用" NA "商業用" ...
##  $ built_with        : chr  "鋼筋混凝土造" "鋼筋混凝土造" NA "鋼筋混凝土造" ...
##  $ finish_ymd        : Date, format: "1985-05-22" "1982-04-08" ...
##  $ building_sqmeter  : num  101 93.4 0 36.7 104.1 ...
##  $ room              : int  3 0 0 1 3 0 0 3 2 3 ...
##  $ living_room       : int  2 0 0 1 1 0 0 2 1 2 ...
##  $ bath              : int  1 0 0 1 1 0 0 2 1 2 ...
##  $ compartment       : chr  "有" "有" "有" "有" ...
##  $ management        : chr  "有" "有" "無" "有" ...
##  $ total_price       : int  18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
##  $ price_per_sqmeter : num  184999 217307 24017 114317 134473 ...
##  $ parking_type      : chr  NA NA NA NA ...
##  $ parking_sqmeter   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ parking_price     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ comments          : chr  NA NA NA NA ...
##  $ numbers           : chr  "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of  5 variables:
##   ..$ row     : int  1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
##   ..$ col     : chr  "total_price" "total_price" "total_price" "total_price" ...
##   ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
##   ..$ actual  : chr  "6700000000" "3882685600" "3373314400" "3050000000" ...
##   ..$ file    : chr  "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 29
##   .. ..$ X1                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ area              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_target    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ address           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ land_sqmeter      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ city_land_type    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_land_type: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_code     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_ymd       :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ trading_num       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ floor             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_floor       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ building_type     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ main_purpose      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ built_with        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ finish_ymd        :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ building_sqmeter  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ room              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ living_room       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bath              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ compartment       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ management        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_price       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ price_per_sqmeter : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_type      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ parking_sqmeter   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_price     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ comments          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ numbers           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Per-column summary; character columns only report length, class and mode.
summary(lvr_prices)
##        X1           area           trading_target       address         
##  Min.   :   0   Length:102054      Length:102054      Length:102054     
##  1st Qu.:1649   Class :character   Class :character   Class :character  
##  Median :3363   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3539                                                           
##  3rd Qu.:5188                                                           
##  Max.   :9683                                                           
##                                                                         
##   land_sqmeter      city_land_type     non_city_land_type
##  Min.   :    0.00   Length:102054      Length:102054     
##  1st Qu.:    9.33   Class :character   Class :character  
##  Median :   22.13   Mode  :character   Mode  :character  
##  Mean   :   54.32                                        
##  3rd Qu.:   35.73                                        
##  Max.   :46193.00                                        
##                                                          
##  non_city_code       trading_ymd         trading_num       
##  Length:102054      Min.   :1973-08-29   Length:102054     
##  Class :character   1st Qu.:2013-04-17   Class :character  
##  Mode  :character   Median :2014-01-02   Mode  :character  
##                     Mean   :2014-02-11                     
##                     3rd Qu.:2014-12-29                     
##                     Max.   :2016-05-16                     
##                     NA's   :20                             
##     floor           total_floor        building_type     
##  Length:102054      Length:102054      Length:102054     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  main_purpose        built_with          finish_ymd        
##  Length:102054      Length:102054      Min.   :1911-05-06  
##  Class :character   Class :character   1st Qu.:1983-03-30  
##  Mode  :character   Mode  :character   Median :1997-01-20  
##                                        Mean   :1996-04-21  
##                                        3rd Qu.:2010-03-01  
##                                        Max.   :2016-03-11  
##                                        NA's   :21992       
##  building_sqmeter        room          living_room          bath        
##  Min.   :    0.00   Min.   :  0.000   Min.   : 0.000   Min.   :  0.000  
##  1st Qu.:   43.45   1st Qu.:  0.000   1st Qu.: 0.000   1st Qu.:  1.000  
##  Median :   90.79   Median :  2.000   Median : 1.000   Median :  1.000  
##  Mean   :  124.51   Mean   :  1.938   Mean   : 1.208   Mean   :  1.248  
##  3rd Qu.:  144.22   3rd Qu.:  3.000   3rd Qu.: 2.000   3rd Qu.:  2.000  
##  Max.   :69125.53   Max.   :168.000   Max.   :80.000   Max.   :174.000  
##                                                                         
##  compartment         management         total_price       
##  Length:102054      Length:102054      Min.   :0.000e+00  
##  Class :character   Class :character   1st Qu.:7.500e+06  
##  Mode  :character   Mode  :character   Median :1.450e+07  
##                                        Mean   :2.399e+07  
##                                        3rd Qu.:2.615e+07  
##                                        Max.   :2.029e+09  
##                                        NA's   :32         
##  price_per_sqmeter  parking_type       parking_sqmeter   
##  Min.   :       0   Length:102054      Min.   :0.00e+00  
##  1st Qu.:  120832   Class :character   1st Qu.:0.00e+00  
##  Median :  166859   Mode  :character   Median :0.00e+00  
##  Mean   :  184978                      Mean   :2.54e+01  
##  3rd Qu.:  222266                      3rd Qu.:0.00e+00  
##  Max.   :62685714                      Max.   :1.45e+06  
##  NA's   :5199                                            
##  parking_price         comments           numbers         
##  Min.   :        0   Length:102054      Length:102054     
##  1st Qu.:        0   Class :character   Class :character  
##  Median :        0   Mode  :character   Mode  :character  
##  Mean   :   488509                                        
##  3rd Qu.:        0                                        
##  Max.   :240000000                                        
## 
# Show the first rows (6 by default).
head(lvr_prices)
## # A tibble: 6 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <chr>  <chr>          <chr>                   <dbl> <chr>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…        19.4  住            
## 2     1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121…         8.46 商            
## 3     2 大同區 土地           橋北段二小段601~630地號…         5.50 其他          
## 4     3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~…         3.88 商            
## 5     4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷…        32.4  住            
## 6     5 信義區 土地           福德段一小段661~690地號…         9.37 其他          
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# Open the help page for head() to see its arguments.
?head

# Same preview, but asking explicitly for the first 10 rows.
head(lvr_prices, n = 10)
## # A tibble: 10 x 29
##       X1 area   trading_target address         land_sqmeter city_land_type
##    <int> <chr>  <chr>          <chr>                  <dbl> <chr>         
##  1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷…        19.4  住            
##  2     1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段12…         8.46 商            
##  3     2 大同區 土地           橋北段二小段601~630地…         5.50 其他          
##  4     3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61…         3.88 商            
##  5     4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90…        32.4  住            
##  6     5 信義區 土地           福德段一小段661~690地…         9.37 其他          
##  7     6 松山區 土地           寶清段一小段31~60地號…         1.02 其他          
##  8     7 松山區 房地(土地+建物)… 臺北市松山區三民路68巷1~…        35.5  住            
##  9     8 士林區 房地(土地+建物)… 臺北市士林區承德路四段10巷…        31.2  住            
## 10     9 大安區 房地(土地+建物)… 臺北市大安區敦化南路一段27…        48.0  商            
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# Show the last rows (6 by default).
tail(lvr_prices)
## # A tibble: 6 x 29
##      X1 area   trading_target  address         land_sqmeter city_land_type
##   <int> <chr>  <chr>           <chr>                  <dbl> <chr>         
## 1   767 萬華區 房地(土地+建物) 臺北市萬華區中華路二段364…         7.27 住            
## 2   768 中正區 房地(土地+建物) 臺北市中正區中華路一段1~3…         4.85 商            
## 3   769 中正區 房地(土地+建物)+車位… 臺北市中正區中華路一段1~3…         5.09 商            
## 4   770 中正區 房地(土地+建物) 臺北市中正區中華路一段1~3…         5.15 商            
## 5   771 文山區 車位            臺北市文山區羅斯福路六段15…         3.42 商            
## 6   772 中正區 土地            福和段一小段751~780地…         8.00 其他          
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# Last 10 rows of the table.
tail(lvr_prices, n = 10)
## # A tibble: 10 x 29
##       X1 area   trading_target  address        land_sqmeter city_land_type
##    <int> <chr>  <chr>           <chr>                 <dbl> <chr>         
##  1   763 文山區 房地(土地+建物)+車位… 臺北市文山區興隆路一段24…        10.5  住            
##  2   764 萬華區 房地(土地+建物) 臺北市萬華區中華路二段50…        12.4  住            
##  3   765 萬華區 房地(土地+建物) 臺北市萬華區西園路二段26…        24.0  住            
##  4   766 中正區 房地(土地+建物)+車位… 臺北市中正區濟南路一段1~…        47.2  住            
##  5   767 萬華區 房地(土地+建物) 臺北市萬華區中華路二段36…         7.27 住            
##  6   768 中正區 房地(土地+建物) 臺北市中正區中華路一段1~…         4.85 商            
##  7   769 中正區 房地(土地+建物)+車位… 臺北市中正區中華路一段1~…         5.09 商            
##  8   770 中正區 房地(土地+建物) 臺北市中正區中華路一段1~…         5.15 商            
##  9   771 文山區 車位            臺北市文山區羅斯福路六段1…         3.42 商            
## 10   772 中正區 土地            福和段一小段751~780…         8.00 其他          
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# The area column is plain text at this point.
class(lvr_prices[["area"]])
## [1] "character"
head(lvr_prices[["area"]])
## [1] "大安區" "中正區" "大同區" "大同區" "內湖區" "信義區"
# as.*() functions convert between types, e.g. a digit string to a number.
as.integer("30")
## [1] 30
as.numeric("30")
## [1] 30
# Convert the categorical text columns to factors so that summary() reports
# per-level counts for them. Each column is listed exactly once; the previous
# version converted city_land_type twice, which was a redundant no-op.
# NOTE(review): if that second entry was meant to be a different column (for
# example management, which stays character), add it to this vector; confirm
# intent before doing so.
factor_cols <- c(
  "area", "trading_target", "city_land_type",
  "built_with", "main_purpose", "compartment", "building_type"
)

lvr_prices[factor_cols] <- lapply(lvr_prices[factor_cols], as.factor)

# Re-inspect the structure to confirm which columns are now factors.
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame':    102054 obs. of  29 variables:
##  $ X1                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ area              : Factor w/ 12 levels "中山區","中正區",..: 9 2 8 8 4 3 11 11 7 9 ...
##  $ trading_target    : Factor w/ 5 levels "土地","建物",..: 3 3 1 3 3 1 1 3 3 3 ...
##  $ address           : chr  "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
##  $ land_sqmeter      : num  19.39 8.46 5.5 3.88 32.41 ...
##  $ city_land_type    : Factor w/ 5 levels "住","其他","商",..: 1 3 2 3 1 2 2 1 1 3 ...
##  $ non_city_land_type: chr  NA NA NA NA ...
##  $ non_city_code     : chr  NA NA NA NA ...
##  $ trading_ymd       : Date, format: "2012-06-29" "2012-07-18" ...
##  $ trading_num       : chr  "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
##  $ floor             : chr  "五層" "九層" NA "六層" ...
##  $ total_floor       : chr  "十七層" "十二層" NA "十一層" ...
##  $ building_type     : Factor w/ 12 levels "住宅大樓(11層含以上有電梯)",..: 1 10 4 1 3 4 4 9 3 3 ...
##  $ main_purpose      : Factor w/ 12 levels "住商用","住家用",..: 7 6 NA 6 2 NA NA 7 2 2 ...
##  $ built_with        : Factor w/ 17 levels "加強磚造","土木造",..: 12 12 NA 12 12 NA NA 12 12 12 ...
##  $ finish_ymd        : Date, format: "1985-05-22" "1982-04-08" ...
##  $ building_sqmeter  : num  101 93.4 0 36.7 104.1 ...
##  $ room              : int  3 0 0 1 3 0 0 3 2 3 ...
##  $ living_room       : int  2 0 0 1 1 0 0 2 1 2 ...
##  $ bath              : int  1 0 0 1 1 0 0 2 1 2 ...
##  $ compartment       : Factor w/ 2 levels "有","無": 1 1 1 1 1 1 1 1 1 1 ...
##  $ management        : chr  "有" "有" "無" "有" ...
##  $ total_price       : int  18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
##  $ price_per_sqmeter : num  184999 217307 24017 114317 134473 ...
##  $ parking_type      : chr  NA NA NA NA ...
##  $ parking_sqmeter   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ parking_price     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ comments          : chr  NA NA NA NA ...
##  $ numbers           : chr  "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of  5 variables:
##   ..$ row     : int  1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
##   ..$ col     : chr  "total_price" "total_price" "total_price" "total_price" ...
##   ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
##   ..$ actual  : chr  "6700000000" "3882685600" "3373314400" "3050000000" ...
##   ..$ file    : chr  "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 29
##   .. ..$ X1                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ area              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_target    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ address           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ land_sqmeter      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ city_land_type    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_land_type: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_code     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_ymd       :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ trading_num       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ floor             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_floor       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ building_type     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ main_purpose      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ built_with        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ finish_ymd        :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ building_sqmeter  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ room              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ living_room       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bath              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ compartment       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ management        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_price       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ price_per_sqmeter : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_type      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ parking_sqmeter   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_price     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ comments          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ numbers           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Column-by-column overview of lvr_prices: quartiles and mean for numeric and
# date columns, per-level counts for factors, length/class/mode for character
# columns. NA counts are listed for the columns that contain missing values.
summary(lvr_prices)
##        X1            area                    trading_target 
##  Min.   :   0   中山區 :15020   土地                :11130  
##  1st Qu.:1649   內湖區 :12312   建物                :  669  
##  Median :3363   文山區 :10572   房地(土地+建物)     :56207  
##  Mean   :3539   北投區 :10278   房地(土地+建物)+車位:28941  
##  3rd Qu.:5188   大安區 : 9383   車位                : 5107  
##  Max.   :9683   士林區 : 8878                               
##                 (Other):35611                               
##    address           land_sqmeter      city_land_type non_city_land_type
##  Length:102054      Min.   :    0.00   住  :59083     Length:102054     
##  Class :character   1st Qu.:    9.33   其他:11896     Class :character  
##  Mode  :character   Median :   22.13   商  :27436     Mode  :character  
##                     Mean   :   54.32   工  : 2269                       
##                     3rd Qu.:   35.73   農  :  435                       
##                     Max.   :46193.00   NA's:  935                       
##                                                                         
##  non_city_code       trading_ymd         trading_num       
##  Length:102054      Min.   :1973-08-29   Length:102054     
##  Class :character   1st Qu.:2013-04-17   Class :character  
##  Mode  :character   Median :2014-01-02   Mode  :character  
##                     Mean   :2014-02-11                     
##                     3rd Qu.:2014-12-29                     
##                     Max.   :2016-05-16                     
##                     NA's   :20                             
##     floor           total_floor                           building_type  
##  Length:102054      Length:102054      住宅大樓(11層含以上有電梯):28070  
##  Class :character   Class :character   公寓(5樓含以下無電梯)     :21493  
##  Mode  :character   Mode  :character   華廈(10層含以下有電梯)    :18043  
##                                        其他                      :16665  
##                                        套房(1房1廳1衛)           :10119  
##                                        辦公商業大樓              : 2781  
##                                        (Other)                   : 4883  
##          main_purpose              built_with      finish_ymd        
##  住家用        :62535   鋼筋混凝土造    :80483   Min.   :1911-05-06  
##  商業用        : 9350   加強磚造        : 4262   1st Qu.:1983-03-30  
##  見其他登記事項: 7602   鋼骨鋼筋混凝土造: 2259   Median :1997-01-20  
##  見使用執照    : 2877   見其他登記事項  : 2027   Mean   :1996-04-21  
##  國民住宅      : 2093   磚造            :  418   3rd Qu.:2010-03-01  
##  (Other)       : 2758   (Other)         :  579   Max.   :2016-03-11  
##  NA's          :14839   NA's            :12026   NA's   :21992       
##  building_sqmeter        room          living_room          bath        
##  Min.   :    0.00   Min.   :  0.000   Min.   : 0.000   Min.   :  0.000  
##  1st Qu.:   43.45   1st Qu.:  0.000   1st Qu.: 0.000   1st Qu.:  1.000  
##  Median :   90.79   Median :  2.000   Median : 1.000   Median :  1.000  
##  Mean   :  124.51   Mean   :  1.938   Mean   : 1.208   Mean   :  1.248  
##  3rd Qu.:  144.22   3rd Qu.:  3.000   3rd Qu.: 2.000   3rd Qu.:  2.000  
##  Max.   :69125.53   Max.   :168.000   Max.   :80.000   Max.   :174.000  
##                                                                         
##  compartment  management         total_price        price_per_sqmeter 
##  有:93155    Length:102054      Min.   :0.000e+00   Min.   :       0  
##  無: 8899    Class :character   1st Qu.:7.500e+06   1st Qu.:  120832  
##              Mode  :character   Median :1.450e+07   Median :  166859  
##                                 Mean   :2.399e+07   Mean   :  184978  
##                                 3rd Qu.:2.615e+07   3rd Qu.:  222266  
##                                 Max.   :2.029e+09   Max.   :62685714  
##                                 NA's   :32          NA's   :5199      
##  parking_type       parking_sqmeter    parking_price      
##  Length:102054      Min.   :0.00e+00   Min.   :        0  
##  Class :character   1st Qu.:0.00e+00   1st Qu.:        0  
##  Mode  :character   Median :0.00e+00   Median :        0  
##                     Mean   :2.54e+01   Mean   :   488509  
##                     3rd Qu.:0.00e+00   3rd Qu.:        0  
##                     Max.   :1.45e+06   Max.   :240000000  
##                                                           
##    comments           numbers         
##  Length:102054      Length:102054     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

 資料篩選

# Row 1 only; leaving the column index empty keeps every column.
lvr_prices[1, ]
## # A tibble: 1 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <fct>  <fct>          <chr>                   <dbl> <fct>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…         19.4 住            
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <fct>,
## #   main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <fct>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# Rows 1, 3 and 5, picked with an integer vector; all columns are kept.
lvr_prices[c(1, 3, 5), ]
## # A tibble: 3 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <fct>  <fct>          <chr>                   <dbl> <fct>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…        19.4  住            
## 2     2 大同區 土地           橋北段二小段601~630地號…         5.50 其他          
## 3     4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷…        32.4  住            
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <fct>,
## #   main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <fct>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# The colon operator builds the integer sequence 1, 2, ..., 5.
1:5
## [1] 1 2 3 4 5
# Used as the row index, that sequence selects the first five rows.
lvr_prices[1:5, ]
## # A tibble: 5 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <fct>  <fct>          <chr>                   <dbl> <fct>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…        19.4  住            
## 2     1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121…         8.46 商            
## 3     2 大同區 土地           橋北段二小段601~630地號…         5.50 其他          
## 4     3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~…         3.88 商            
## 5     4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷…        32.4  住            
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <fct>,
## #   main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <fct>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# First three rows, columns 1 through 4 given as an explicit position vector.
lvr_prices[1:3, c(1, 2, 3, 4)]
## # A tibble: 3 x 4
##      X1 area   trading_target  address                              
##   <int> <fct>  <fct>           <chr>                                
## 1     0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2     1 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號    
## 3     2 大同區 土地            橋北段二小段601~630地號
# First three rows, columns 2, 3 and 4 (the X1 index column is left out).
lvr_prices[1:3, c(2, 3, 4)]
## # A tibble: 3 x 3
##   area   trading_target  address                              
##   <fct>  <fct>           <chr>                                
## 1 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號    
## 3 大同區 土地            橋北段二小段601~630地號
# Same selection as c(2, 3, 4), written as the range 2:4.
lvr_prices[1:3, 2:4]
## # A tibble: 3 x 3
##   area   trading_target  address                              
##   <fct>  <fct>           <chr>                                
## 1 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號    
## 3 大同區 土地            橋北段二小段601~630地號
# Columns can also be selected by name; the result follows the order given here.
lvr_prices[1:3, c("area", "address", "total_price")]
## # A tibble: 3 x 3
##   area   address                               total_price
##   <fct>  <chr>                                       <int>
## 1 大安區 臺北市大安區和平東路三段1巷72弄1~30號    18680000
## 2 中正區 臺北市中正區忠孝東路二段121~150號        20300000
## 3 大同區 橋北段二小段601~630地號                    132096
# An empty row index keeps all rows; head() then shows the first six.
head(lvr_prices[, c(1, 2)])
## # A tibble: 6 x 2
##      X1 area  
##   <int> <fct> 
## 1     0 大安區
## 2     1 中正區
## 3     2 大同區
## 4     3 大同區
## 5     4 內湖區
## 6     5 信義區
# Extract a single column as a vector (here a factor) instead of a data frame.
head(lvr_prices[["area"]])
## [1] 大安區 中正區 大同區 大同區 內湖區 信義區
## 12 Levels: 中山區 中正區 信義區 內湖區 北投區 南港區 士林區 ... 萬華區
# A vector can be indexed by position or by a logical mask of the same length;
# only the positions marked TRUE are returned.
a <- c(50, 60, 70)
a[1]
## [1] 50
a[c(TRUE, FALSE, FALSE)]
## [1] 50
# The same idea on the data frame: `daan` is a logical vector with one entry
# per row, TRUE where the area is 大安區, and it is used as the row index.
daan <- lvr_prices$area == "大安區"
head(lvr_prices[daan, ])
## # A tibble: 6 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <fct>  <fct>          <chr>                   <dbl> <fct>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…      19.4    住            
## 2     9 大安區 房地(土地+建物)… 臺北市大安區敦化南路一段270…      48.0    商            
## 3    10 大安區 車位           臺北市大安區永康街7巷1~30…       0.100  商            
## 4    11 大安區 車位           臺北市大安區永康街7巷1~30…       0.0500 商            
## 5    12 大安區 車位           臺北市大安區永康街7巷1~30…       0.0500 商            
## 6    29 大安區 土地           通化段三小段781~810地號…      28.0    其他          
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <fct>,
## #   main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <fct>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>
# Combine the logical row mask with a by-name column selection.
head(lvr_prices[daan, c("area", "total_price")])
## # A tibble: 6 x 2
##   area   total_price
##   <fct>        <int>
## 1 大安區    18680000
## 2 大安區    28000000
## 3 大安區     1600000
## 4 大安區      850000
## 5 大安區      850000
## 6 大安區      750000

缺失值

# total_price contains NA values, and NA propagates through mean(), so the
# plain call returns NA.
mean(lvr_prices$total_price)
## [1] NA
?mean
# na.rm = TRUE drops the missing values before aggregating.
mean(lvr_prices$total_price, na.rm = TRUE)
## [1] 23993655
median(lvr_prices$total_price, na.rm = TRUE)
## [1] 14500000
# Share of total_price values that are missing: the mean of a logical vector
# is the proportion of TRUE entries.
mean(is.na(lvr_prices$total_price))
## [1] 0.0003135595
# method 1 : using for loop
# Square every element of `a`, printing one result per iteration.
a <- c(1, 2, 3, 4, 5)
for (value in a) {
  print(value^2)
}
## [1] 1
## [1] 4
## [1] 9
## [1] 16
## [1] 25
# method 2 : using sapply (loop function)
# sapply() calls the function once per element and collects the results
# into a single vector.
sapply(a, function(value) value^2)
## [1]  1  4  9 16 25
# method 1 : using for loop
# Print each column name followed by the share of its values that are NA.
#
# The column is extracted with `[[col]]` so that it is a plain vector.
# lvr_prices is a tibble, and `lvr_prices[, col]` stays a one-column tibble
# whose length() is 1, so the former `sum(...) / length(...)` printed the NA
# *count* (e.g. 935) instead of a ratio. The output recorded below was
# produced by that earlier form and therefore lists counts.
for (col in names(lvr_prices)) {
  print(col)
  values <- lvr_prices[[col]]
  nstat <- sum(is.na(values)) / length(values)
  print(nstat)
}
## [1] "X1"
## [1] 0
## [1] "area"
## [1] 0
## [1] "trading_target"
## [1] 0
## [1] "address"
## [1] 0
## [1] "land_sqmeter"
## [1] 0
## [1] "city_land_type"
## [1] 935
## [1] "non_city_land_type"
## [1] 102037
## [1] "non_city_code"
## [1] 102054
## [1] "trading_ymd"
## [1] 20
## [1] "trading_num"
## [1] 0
## [1] "floor"
## [1] 12237
## [1] "total_floor"
## [1] 12257
## [1] "building_type"
## [1] 0
## [1] "main_purpose"
## [1] 14839
## [1] "built_with"
## [1] 12026
## [1] "finish_ymd"
## [1] 21992
## [1] "building_sqmeter"
## [1] 0
## [1] "room"
## [1] 0
## [1] "living_room"
## [1] 0
## [1] "bath"
## [1] 0
## [1] "compartment"
## [1] 0
## [1] "management"
## [1] 0
## [1] "total_price"
## [1] 32
## [1] "price_per_sqmeter"
## [1] 5199
## [1] "parking_type"
## [1] 67889
## [1] "parking_sqmeter"
## [1] 0
## [1] "parking_price"
## [1] 0
## [1] "comments"
## [1] 68120
## [1] "numbers"
## [1] 0
# method 2 : using sapply (loop function)
# Share of NA values per column, returned as a named numeric vector.
#
# Two fixes compared with the earlier one-liner:
#   * `[[col]]` extracts the column as a vector. On a tibble,
#     `lvr_prices[, col]` is a one-column tibble whose length() is 1, so the
#     division had no effect and NA counts were reported.
#   * the division used to sit inside sum(); the ratio is now taken over the
#     whole column via mean() of the logical NA mask.
# The output recorded below was produced by the earlier form (raw counts).
sapply(names(lvr_prices), function(col) {
  mean(is.na(lvr_prices[[col]]))
})
##                 X1               area     trading_target 
##                  0                  0                  0 
##            address       land_sqmeter     city_land_type 
##                  0                  0                935 
## non_city_land_type      non_city_code        trading_ymd 
##             102037             102054                 20 
##        trading_num              floor        total_floor 
##                  0              12237              12257 
##      building_type       main_purpose         built_with 
##                  0              14839              12026 
##         finish_ymd   building_sqmeter               room 
##              21992                  0                  0 
##        living_room               bath        compartment 
##                  0                  0                  0 
##         management        total_price  price_per_sqmeter 
##                  0                 32               5199 
##       parking_type    parking_sqmeter      parking_price 
##              67889                  0                  0 
##           comments            numbers 
##              68120                  0
# Remove the four columns with the most missing values in the NA tally
# recorded above. Assigning NULL to a column deletes it from the data frame.
for (sparse_col in c("non_city_land_type", "non_city_code",
                     "comments", "parking_type")) {
  lvr_prices[[sparse_col]] <- NULL
}

# Inspect the structure again; 25 of the 29 columns remain.
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame':    102054 obs. of  25 variables:
##  $ X1               : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ area             : Factor w/ 12 levels "中山區","中正區",..: 9 2 8 8 4 3 11 11 7 9 ...
##  $ trading_target   : Factor w/ 5 levels "土地","建物",..: 3 3 1 3 3 1 1 3 3 3 ...
##  $ address          : chr  "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
##  $ land_sqmeter     : num  19.39 8.46 5.5 3.88 32.41 ...
##  $ city_land_type   : Factor w/ 5 levels "住","其他","商",..: 1 3 2 3 1 2 2 1 1 3 ...
##  $ trading_ymd      : Date, format: "2012-06-29" "2012-07-18" ...
##  $ trading_num      : chr  "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
##  $ floor            : chr  "五層" "九層" NA "六層" ...
##  $ total_floor      : chr  "十七層" "十二層" NA "十一層" ...
##  $ building_type    : Factor w/ 12 levels "住宅大樓(11層含以上有電梯)",..: 1 10 4 1 3 4 4 9 3 3 ...
##  $ main_purpose     : Factor w/ 12 levels "住商用","住家用",..: 7 6 NA 6 2 NA NA 7 2 2 ...
##  $ built_with       : Factor w/ 17 levels "加強磚造","土木造",..: 12 12 NA 12 12 NA NA 12 12 12 ...
##  $ finish_ymd       : Date, format: "1985-05-22" "1982-04-08" ...
##  $ building_sqmeter : num  101 93.4 0 36.7 104.1 ...
##  $ room             : int  3 0 0 1 3 0 0 3 2 3 ...
##  $ living_room      : int  2 0 0 1 1 0 0 2 1 2 ...
##  $ bath             : int  1 0 0 1 1 0 0 2 1 2 ...
##  $ compartment      : Factor w/ 2 levels "有","無": 1 1 1 1 1 1 1 1 1 1 ...
##  $ management       : chr  "有" "有" "無" "有" ...
##  $ total_price      : int  18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
##  $ price_per_sqmeter: num  184999 217307 24017 114317 134473 ...
##  $ parking_sqmeter  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ parking_price    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ numbers          : chr  "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of  5 variables:
##   ..$ row     : int  1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
##   ..$ col     : chr  "total_price" "total_price" "total_price" "total_price" ...
##   ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
##   ..$ actual  : chr  "6700000000" "3882685600" "3373314400" "3050000000" ...
##   ..$ file    : chr  "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 29
##   .. ..$ X1                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ area              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_target    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ address           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ land_sqmeter      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ city_land_type    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_land_type: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_code     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_ymd       :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ trading_num       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ floor             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_floor       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ building_type     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ main_purpose      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ built_with        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ finish_ymd        :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ building_sqmeter  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ room              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ living_room       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bath              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ compartment       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ management        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_price       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ price_per_sqmeter : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_type      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ parking_sqmeter   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_price     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ comments          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ numbers           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Keep only the rows whose total_price is known.
# The "problems" attribute shown in the str() output lists 32 total_price
# parse failures ("expected an integer", actual values such as 6700000000),
# which matches the 32 NAs in this column. Dropping them therefore discards
# records whose price did not fit the integer column type.
# NOTE(review): consider reading total_price as a double at import time so
# these high-value transactions are not lost.
has_price <- !is.na(lvr_prices$total_price)
lvr_prices <- lvr_prices[has_price, ]

文字處理


# Look at the first rows, then at the raw `floor` strings.
head(lvr_prices)


head(lvr_prices$floor)


#lvr_prices$floor
?gsub

# gsub() replaces every match of a pattern; here it removes the "層" suffix
# from each floor label.
floor_tmp <- gsub("層", "", lvr_prices$floor)

# `==` is vectorised: the single number is compared with every element of
# `phones`, giving one TRUE/FALSE per element.
phones <- c("0912345678", "0923456780", "0912344556")

"0912345678" == phones

# A literal pattern matches when it occurs anywhere in the string.
s <- "6"
grepl("6", s)
grepl("5", s)

# [] : a character class matches any single character listed inside it
grepl("[0123456789]", s)

# \\d = [0123456789]
grepl("\\d", s)


# Character classes are case-sensitive: this one lists lowercase letters only.
s <- "w"
grepl("[abcdefghijklmnopqrstuvwxyz]", s)

s <- "W"
grepl("[abcdefghijklmnopqrstuvwxyz]", s)
grepl("[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]", s)

# - : a range inside a class, [abcdefghijklmnopqrstuvwxyz] => [a-z]
s <- "y"
grepl("[a-z]", s)
grepl("[a-zA-Z]", s)

s <- "7"
grepl("[a-zA-Z]", s)
grepl("[a-zA-Z0-9]", s)

# \\w => [a-zA-Z0-9_] (word characters: letters, digits and the underscore)
grepl("\\w", s)

# {n} : the preceding item must repeat exactly n times. Without ^ and $ the
# pattern may match anywhere, so longer runs also contain a match.
s <- "apple"
grepl("\\w{5}", s)

# {m,n} : at least m and at most n repetitions; {m,} leaves the upper bound
# open. The interval is written without blanks: whitespace inside the braces
# (e.g. "{5, }") is not part of the interval syntax, and regex engines either
# reject it or treat the braces as literal text.
s <- "bannana"
grepl("\\w{5,}", s)


s <- "bannana"
grepl("\\w{1,}", s)

# + => {1,}
s <- "bannana"
grepl("\\w+", s)


s <- "bannana"
grepl("\\w{0,}", s)

# * => {0,}
grepl("\\w*", s)

# Four candidates: plain digits, two dashed layouts, and one that is too long.
phones <- c("0912345678", "0912-345-678", "0912-345678", "0958034580580435")

# "09" followed by eight digits. Unanchored, so the over-long number also
# matches, and the dashed layouts do not.
grepl("09\\d{8}", phones)
# Allow an optional dash after the 4th and 7th digit.
grepl("09\\d{2}-{0,1}\\d{3}-{0,1}\\d{3}", phones)

# ? => {0,1}
grepl("09\\d{2}-?\\d{3}-?\\d{3}", phones)


# ^ : match the beginning of the string, $ : match the end.
# Anchoring both ends rejects strings with extra characters around the number.
grepl("^09\\d{2}-?\\d{3}-?\\d{3}$", phones)


library(stringr)
# Capture the three counts embedded in strings such as "土地1建物2車位0".
# str_match_all() returns a list with one character matrix per input string
# (full match first, then one column per capture group).
trading_data <- str_match_all(
  lvr_prices$trading_num,
  "土地(\\d+)建物(\\d+)車位(\\d+)"
)

# do.call() calls a function with its arguments taken from a list, so
# rbind can stack any number of data frames held in one list.
a <- data.frame(a = c(1, 2), b = c(3, 4))
a
b <- data.frame(a = c(5, 6), b = c(7, 8))
b

do.call(rbind, list(a, b))


#do.call('rbind',trading_data)


# Split the first few addresses into city/county, district and road/street.
str_match_all(head(lvr_prices$address), "(.+[市縣])(.+區)(.+[路街])")

資料排序

# Six largest total prices: sort in descending order, then take the head.
head(sort(lvr_prices$total_price, decreasing = TRUE))
## [1] 2028880000 2008800000 1930000000 1869781219 1850000000 1715668356
# sort() returns the values in order, while order() returns the positions
# that would put the vector in order.
a <- c(70, 50, 60, 80, 55)
sort(a)
## [1] 50 55 60 70 80
order(a)
## [1] 2 5 3 1 4
# Indexing the rows with order() therefore sorts the whole data frame by price.
by_price_desc <- order(lvr_prices$total_price, decreasing = TRUE)
head(lvr_prices[by_price_desc, ])
## # A tibble: 6 x 25
##      X1 area   trading_target  address         land_sqmeter city_land_type
##   <int> <fct>  <fct>           <chr>                  <dbl> <fct>         
## 1  7679 內湖區 房地(土地+建物)+車位… 臺北市內湖區新湖一路271~…        3948. 工            
## 2  1397 松山區 房地(土地+建物) 臺北市松山區敦化北路121~…        1304. 住            
## 3  4373 北投區 房地(土地+建物)+車位… 臺北市北投區石牌路一段181…        1127. 商            
## 4  2389 大安區 房地(土地+建物) 臺北市大安區羅斯福路三段28…        2376. 住            
## 5  5912 中山區 房地(土地+建物)+車位… 臺北市中山區建國北路一段13…         709. 住            
## 6  3786 北投區 土地            新洲美段31~60地號…        4526. 其他          
## # ... with 19 more variables: trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <fct>,
## #   main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <fct>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_sqmeter <dbl>, parking_price <int>,
## #   numbers <chr>
# List the distinct categories of the building_type factor; one of these
# labels is used verbatim as a filter value below.
levels(lvr_prices$building_type)
##  [1] "住宅大樓(11層含以上有電梯)" "倉庫"                      
##  [3] "公寓(5樓含以下無電梯)"      "其他"                      
##  [5] "套房(1房1廳1衛)"            "工廠"                      
##  [7] "店面(店鋪)"                 "廠辦"                      
##  [9] "華廈(10層含以下有電梯)"     "辦公商業大樓"              
## [11] "農舍"                       "透天厝"
# Keep only the rows whose building type is the apartment category, then show
# the six most expensive of them.
is_apartment <- lvr_prices$building_type == '公寓(5樓含以下無電梯)'
apartment <- lvr_prices[is_apartment, ]


apartment_rank <- order(apartment$total_price, decreasing = TRUE)
head(apartment[apartment_rank, ])
## # A tibble: 6 x 25
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <fct>  <fct>          <chr>                   <dbl> <fct>         
## 1  3015 中山區 房地(土地+建物)… 臺北市中山區中山北路二段121…         128. 商            
## 2  1713 松山區 房地(土地+建物)… 臺北市松山區南京東路三段301…         147. 商            
## 3  7282 萬華區 房地(土地+建物)… 臺北市萬華區華西街26巷1~3…         378. 商            
## 4  6783 中山區 房地(土地+建物)… 臺北市中山區中山北路二段121…         136. 商            
## 5  1522 中正區 房地(土地+建物)… 臺北市中正區汀州路二段151~…         317. 住            
## 6   659 中正區 房地(土地+建物)… 臺北市中正區延平南路151~1…         268. 商            
## # ... with 19 more variables: trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <fct>,
## #   main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <fct>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_sqmeter <dbl>, parking_price <int>,
## #   numbers <chr>
# All 大安區 transactions. This reuses the name `daan`, which earlier held a
# logical row mask.
daan <- lvr_prices[lvr_prices$area == "大安區", ]
# total_price is an integer column. Converting it to double first lets sum()
# return the total (about 2.8e11), which exceeds what a 32-bit integer can
# hold.
daan_price <- as.numeric(daan$total_price)
sum(daan_price, na.rm = TRUE)
## [1] 2.79477e+11
mean(daan_price, na.rm = TRUE)
## [1] 29798170
median(daan_price, na.rm = TRUE)
## [1] 2e+07
# Three most expensive 中山區 transactions: filter by area keeping only the
# address and price columns, sort by price in descending order, then take the
# first three rows. (This chunk previously appeared twice in a row with
# identical code and output; a single copy is kept.)
zhongshan <- lvr_prices[lvr_prices$area == '中山區', c('address', 'total_price')]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx, ]
res[1:3, ]
## # A tibble: 3 x 2
##   address                             total_price
##   <chr>                                     <int>
## 1 臺北市中山區建國北路一段138巷1~30號  1850000000
## 2 臺北市中山區南京東路三段1~30號       1400000000
## 3 中山段二小段31~60地號                1084948034
# Return the highest-priced transactions of one district.
#
# Reads the global data frame `lvr_prices` (columns `area`, `address`,
# `total_price`).
#
# Arguments:
#   area  District name to match against `lvr_prices$area`.
#   n     Maximum number of rows to return (default 3).
#
# Returns a data frame with the columns `address` and `total_price`, sorted by
# `total_price` in descending order. It has at most `n` rows: when the
# district has fewer matching records only those are returned, instead of
# being padded with all-NA rows as `res[1:3, ]` did.
getTopThree <- function(area, n = 3L) {
  # which() keeps only definite matches, so rows with a missing area are
  # skipped rather than turned into NA rows.
  in_area <- which(lvr_prices$area == area)
  house <- lvr_prices[in_area, c("address", "total_price")]
  ranked <- house[order(house$total_price, decreasing = TRUE), ]
  head(ranked, n)
}

# Example: the highest-priced 大安區 records, using the helper defined above.
getTopThree('大安區')
## # A tibble: 3 x 2
##   address                                total_price
##   <chr>                                        <int>
## 1 臺北市大安區羅斯福路三段283巷4弄1~30號  1869781219
## 2 臺北市大安區忠孝東路四段241~270號        971340000
## 3 學府段三小段31~60地號                    966660000
# Mean price of the 中山區 rows, ignoring missing values.
mean(lvr_prices$total_price[lvr_prices$area == "中山區"], na.rm = TRUE)
## [1] 26708805
# Make sure `area` is a factor so that levels() lists every district once.
lvr_prices$area <- as.factor(lvr_prices$area)

# method 1: for loop
# Print "<district> <median price>" for each district in level order.
for (area in levels(lvr_prices$area)) {
  area_prices <- lvr_prices$total_price[lvr_prices$area == area]
  print(paste(area, median(area_prices, na.rm = TRUE)))
}
## [1] "中山區 12800000"
## [1] "中正區 16180000"
## [1] "信義區 15800000"
## [1] "內湖區 16500000"
## [1] "北投區 13000000"
## [1] "南港區 16685000"
## [1] "士林區 14350000"
## [1] "大同區 11770000"
## [1] "大安區 20000000"
## [1] "文山區 13300000"
## [1] "松山區 17800000"
## [1] "萬華區 9542595"
# method 2 : tapply
# tapply() splits total_price by district and applies median() to each group;
# na.rm = TRUE is passed through to median().
price_per_sec <- tapply(
  lvr_prices$total_price,
  lvr_prices$area,
  median,
  na.rm = TRUE
)

# Districts ranked from the highest to the lowest median price.
sort(price_per_sec, decreasing = TRUE)
##   大安區   松山區   南港區   內湖區   中正區   信義區   士林區   文山區 
## 20000000 17800000 16685000 16500000 16180000 15800000 14350000 13300000 
##   北投區   中山區   大同區   萬華區 
## 13000000 12800000 11770000  9542595
# Median price per district as bars, highest first.
barplot(sort(price_per_sec, decreasing = TRUE), col = "blue")

# Distribution of log(total_price) per district.
# Rows with total_price == 0 are excluded through `subset`: log(0) is -Inf,
# which boxplot cannot draw and reports as "Outlier (-Inf) ... is not drawn".
# The warnings recorded below come from the earlier, unfiltered call.
boxplot(
  log(total_price) ~ area,
  data = lvr_prices,
  subset = total_price > 0,
  main = "房價箱型圖",
  xlab = "區域",
  ylab = "價格(log)"
)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 7 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 8 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 9 is not drawn