# R Basic ----
# Bind two numeric scalars and print their total.
b <- 2
a <- 3
sum(a, b)
## [1] 5
# Fit a simple linear regression on the first Anscombe data set, draw the
# fitted line over the scatter plot, and predict y1 for a new x1 value.
data("anscombe")
# View(anscombe)
fit <- lm(formula = y1 ~ x1, data = anscombe)
print(fit)
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001
# Scatter plot of the raw points with the regression line overlaid in red.
plot(y1 ~ x1, data = anscombe)
abline(reg = fit, col = "red")

# Predicted y1 for a single new observation with x1 = 13.
predict(fit, newdata = data.frame(x1 = 13))
## 1
## 9.501273
# 資料匯入 (Data import) ----
# Import customer.csv with readr, letting it guess every column type.
library(readr)
customer <- read_csv(file = "customer.csv")
## Parsed with column specification:
## cols(
## ID = col_integer(),
## Visit.Time = col_integer(),
## Average.Expense = col_double(),
## Sex = col_integer(),
## Age = col_integer()
## )
# Read the file again, this time declaring Sex as a factor with levels
# "0" and "1"; the remaining columns are still guessed by readr.
library(readr)
customer <- read_csv(
  file = "customer.csv",
  col_types = cols(Sex = col_factor(levels = c("0", "1")))
)
# View(customer)
class(customer)
## [1] "tbl_df" "tbl" "data.frame"
# Display the structure of customer: column types, leading values, and the
# readr "spec" attribute attached to the result.
str(customer)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60 obs. of 5 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Visit.Time : int 3 5 16 5 16 3 12 14 6 3 ...
## $ Average.Expense: num 5.7 14.5 33.5 15.9 24.9 12 28.5 18.8 23.8 5.3 ...
## $ Sex : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 10 27 32 30 23 15 33 27 16 11 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 5
## .. ..$ ID : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Visit.Time : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Average.Expense: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Sex :List of 3
## .. .. ..$ levels : chr "0" "1"
## .. .. ..$ ordered : logi FALSE
## .. .. ..$ include_na: logi FALSE
## .. .. ..- attr(*, "class")= chr "collector_factor" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Per-column summary; Sex is tabulated by level because it is a factor.
summary(customer)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.00 Min. : 1.0 Min. : 4.50 0:19 Min. : 8.00
## 1st Qu.:15.75 1st Qu.: 5.0 1st Qu.:10.82 1:41 1st Qu.:15.00
## Median :30.50 Median : 7.5 Median :16.00 Median :20.50
## Mean :30.50 Mean : 8.4 Mean :17.06 Mean :21.43
## 3rd Qu.:45.25 3rd Qu.:12.0 3rd Qu.:24.90 3rd Qu.:27.00
## Max. :60.00 Max. :18.0 Max. :33.70 Max. :47.00
# Coerce Sex to a factor. The column was already read as a factor through
# col_factor(), so this call changes nothing and the summary is unchanged.
customer$Sex <- as.factor(customer$Sex)
summary(customer)
## ID Visit.Time Average.Expense Sex Age
## Min. : 1.00 Min. : 1.0 Min. : 4.50 0:19 Min. : 8.00
## 1st Qu.:15.75 1st Qu.: 5.0 1st Qu.:10.82 1:41 1st Qu.:15.00
## Median :30.50 Median : 7.5 Median :16.00 Median :20.50
## Mean :30.50 Mean : 8.4 Mean :17.06 Mean :21.43
## 3rd Qu.:45.25 3rd Qu.:12.0 3rd Qu.:24.90 3rd Qu.:27.00
## Max. :60.00 Max. :18.0 Max. :33.70 Max. :47.00
# Open the documentation for write.csv (two equivalent ways).
help("write.csv")
?write.csv
# method 1: name the data argument explicitly
write.csv(x = customer, file = "customer2.csv")
# method 2: pass the data positionally (overwrites the file from method 1)
write.csv(customer, file = "customer2.csv")
# Print the working directory, against which the relative file name resolves.
getwd()
## [1] "/home/david"
# Export customer as a tab-separated text file.
write.table(x = customer, file = "customer.tab", sep = "\t")
?save
# Round-trip the object through an .RData file: save it, drop it from the
# workspace, then restore it under its original name.
save(customer, file = "customer.RData")
rm(customer)
# customer
load(file = "customer.RData")
# Load the built-in iris data set and preview its first six rows.
data("iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Grow a classification tree that predicts Species from every other iris
# column, then print the fitted tree.
library(rpart)
fit <- rpart(formula = Species ~ ., data = iris)
print(fit)
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
# Persist the fitted tree, remove it from the workspace, and reload it from
# disk under its original name.
save(fit, file = "model.RData")
rm(fit)
load(file = "model.RData")
# Score the iris rows with the reloaded model.
predicted <- predict(object = fit, newdata = iris)
# 匯入Excel 資料 (Importing Excel data) ----
# Download the Excel workbook and import it with readxl.
# An .xlsx file is binary, so the transfer must use mode = "wb": on Windows
# the default text mode rewrites line endings and corrupts the download.
download.file(
  "https://raw.githubusercontent.com/ywchiu/rcathaybk/master/data/FinancialReport.xlsx",
  destfile = "FinancialReport.xlsx",
  mode = "wb"
)
library(readxl)
FinancialReport <- read_excel("FinancialReport.xlsx")
# View(FinancialReport)
class(FinancialReport)
## [1] "tbl_df" "tbl" "data.frame"
# Inspect the column names and types of the imported financial report.
str(FinancialReport)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 14 variables:
## $ 年度 : num 2015 2014 2013 2012 2011 ...
## $ 股本 : num 2593 2593 2593 2592 2592 ...
## $ 財報評分: num 94 91 89 94 94 96 92 94 94 96 ...
## $ 收盤 : num 143 141 105.5 97 75.8 ...
## $ 平均 : num 140 123 104 84.1 72.1 62 55.5 56.4 65.5 61.3 ...
## $ 漲跌 : num 2 35.5 8.5 21.2 4.8 6.5 20.1 -17.6 -5.5 5 ...
## $ 漲跌__1 : num 1.4 33.6 8.8 28 6.8 10.1 45.3 -28.4 -8.1 8 ...
## $ 營業收入: num 8435 7628 5970 5062 4271 ...
## $ 營業毛利: num 4104 3777 2809 2436 1941 ...
## $ 營業利益: num 3200 2959 2094 1811 1416 ...
## $ 業外損益: num 304 62.1 60.6 4.97 35.9 111 35 70.4 99.2 61 ...
## $ 稅後淨利: num 3066 2639 1881 1662 1342 ...
## $ ROA : num 19.4 19.1 17 19.2 18 24.7 15.5 17.8 19 23 ...
## $ EPS : num 11.82 10.18 7.26 6.41 5.18 ...
# Quartiles, mean and range for each column of the financial report.
summary(FinancialReport)
## 年度 股本 財報評分 收盤
## Min. :1999 Min. : 767 Min. :59.00 Min. : 42.60
## 1st Qu.:2003 1st Qu.:2027 1st Qu.:89.00 1st Qu.: 62.50
## Median :2007 Median :2583 Median :92.00 Median : 71.00
## Mean :2007 Mean :2249 Mean :88.24 Mean : 83.75
## 3rd Qu.:2011 3rd Qu.:2592 3rd Qu.:94.00 3rd Qu.: 97.00
## Max. :2015 Max. :2643 Max. :96.00 Max. :167.00
## 平均 漲跌 漲跌__1 營業收入
## Min. : 52.40 Min. :-88.500 Min. :-53.00 Min. : 731
## 1st Qu.: 56.40 1st Qu.: -5.500 1st Qu.: -8.10 1st Qu.:2030
## Median : 67.40 Median : 6.500 Median : 8.80 Median :3174
## Mean : 82.29 Mean : 4.235 Mean : 11.77 Mean :3576
## 3rd Qu.:104.00 3rd Qu.: 20.100 3rd Qu.: 28.00 3rd Qu.:4271
## Max. :147.00 Max. : 96.000 Max. :135.00 Max. :8435
## 營業毛利 營業利益 業外損益 稅後淨利
## Min. : 315 Min. : 128 Min. :-43.70 Min. : 145
## 1st Qu.: 765 1st Qu.: 613 1st Qu.: 4.97 1st Qu.: 651
## Median :1417 Median :1044 Median : 35.00 Median : 999
## Mean :1639 Mean :1238 Mean : 50.67 Mean :1179
## 3rd Qu.:2071 3rd Qu.:1592 3rd Qu.: 62.10 3rd Qu.:1616
## Max. :4104 Max. :3200 Max. :304.00 Max. :3066
## ROA EPS
## Min. : 3.93 Min. : 0.830
## 1st Qu.:15.50 1st Qu.: 3.450
## Median :18.40 Median : 4.140
## Mean :17.15 Mean : 4.969
## 3rd Qu.:19.40 3rd Qu.: 6.240
## Max. :24.70 Max. :11.820
# 讀取JSON (Reading JSON) ----
# Fetch the JSON file and parse it into R objects with jsonlite.
download.file(
  url = "https://raw.githubusercontent.com/ywchiu/rcathaybk/master/data/usd.json",
  destfile = "usd.json"
)
library(jsonlite)
btc <- fromJSON(txt = "usd.json")
class(btc)
## [1] "list"
# Explore the parsed JSON list and its "stats" element.
names(btc)
## [1] "stats" "total_volumes"
class(btc[["stats"]])
## [1] "matrix"
dim(btc[["stats"]])
## [1] 1812 2
# Convert the two-column matrix to a data frame and name its columns.
df <- as.data.frame(btc[["stats"]])
names(df)
## [1] "V1" "V2"
names(df) <- c("datetime", "btc")
# In the formula, btc and datetime are columns of df (not the btc list).
# NOTE(review): datetime is presumably a numeric timestamp - confirm its unit.
plot(btc ~ datetime, data = df, type = "l", col = "blue")

# 載入XML (Loading XML) ----
# Download the XML file and flatten its records into a data frame.
download.file(
  url = "https://raw.githubusercontent.com/ywchiu/rcathaybk/master/data/iso_company.xml",
  destfile = "iso_company.xml"
)
library(XML)
iso <- xmlToDataFrame(doc = "iso_company.xml")
head(iso, n = 6L)
## 廠商代號 廠商名稱
## 1 1S2I002 衛生福利部玉里醫院
## 2 1XDI001 亞洲水泥股份有限公司花蓮製造廠
## 3 2FBF001 九股山食品股份有限公司
## 4 2S4Y002 台灣電力股份有限公司協和發電廠
## 5 2XHY001 杏輝藥品工業股份有限公司
## 6 3A0Y017 威技電器股份有限公司
## 廠址 電話 傳真
## 1 花蓮縣玉里鎮中華路448號 03-8886141#2115 03-8980461
## 2 花蓮縣新城鄉新城村新興路125號 (03)8612101轉204 (03)8612108
## 3 宜蘭縣頭城鎮復興路37巷29號 03-9779988 03-9778081
## 4 基隆市文化路80號 02-24248111-310 02-24260450
## 5 宜蘭縣冬山鄉中山路84號 03-9581101轉1245 03-9583309
## 6 新北市中和區新民街112號8樓之8、之9 02-22265381 02-22265539
## 登錄日期 登錄範圍 證書有效期限 版本
## 1 2009-11-26 00:00:00.0 2018/09/30 05
## 2 2009-12-11 00:00:00.0 2018/09/30 03
## 3 2006-09-18 00:00:00.0 2018/09/17 06
## 4 1997-10-27 00:00:00.0 2018/09/22 08
## 5 1997-10-30 00:00:00.0 2018/09/22 10
## 6 2000-03-23 00:00:00.0 2018/07/05 08
# 處理實價登錄資料 (Processing real-price registration data) ----
# Import the real-price registration data.
# total_price is declared as double: some prices (e.g. 6700000000) exceed R's
# 32-bit integer range, so the guessed integer type turned them into NA.
# The transcript that follows was recorded with the guessed types, which is
# why it reports 32 parsing failures and total_price as <int>.
library(readr)
lvr_prices <- read_csv(
  "/tmp/lvr_prices.csv",
  col_types = cols(total_price = col_double())
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1282 total_price an integer 6700000000 '/tmp/lvr_prices.csv' file 2 2243 total_price an integer 3882685600 '/tmp/lvr_prices.csv' row 3 2244 total_price an integer 3373314400 '/tmp/lvr_prices.csv' col 4 4629 total_price an integer 3050000000 '/tmp/lvr_prices.csv' expected 5 5890 total_price an integer 3133800000 '/tmp/lvr_prices.csv'
## ... ................. ... ............................................................... ........ ............................................................... ...... ............................................................... .... ............................................................... ... ............................................................... ... ............................................................... ........ ...............................................................
## See problems(...) for more details.
# Check the class of the imported object (a tibble, which is also a data.frame).
class(lvr_prices)
## [1] "tbl_df" "tbl" "data.frame"
# Show every column with its type and first values, plus the "problems" and
# "spec" attributes that read_csv attached.
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame': 102054 obs. of 29 variables:
## $ X1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ area : chr "大安區" "中正區" "大同區" "大同區" ...
## $ trading_target : chr "房地(土地+建物)" "房地(土地+建物)" "土地" "房地(土地+建物)" ...
## $ address : chr "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
## $ land_sqmeter : num 19.39 8.46 5.5 3.88 32.41 ...
## $ city_land_type : chr "住" "商" "其他" "商" ...
## $ non_city_land_type: chr NA NA NA NA ...
## $ non_city_code : chr NA NA NA NA ...
## $ trading_ymd : Date, format: "2012-06-29" "2012-07-18" ...
## $ trading_num : chr "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
## $ floor : chr "五層" "九層" NA "六層" ...
## $ total_floor : chr "十七層" "十二層" NA "十一層" ...
## $ building_type : chr "住宅大樓(11層含以上有電梯)" "辦公商業大樓" "其他" "住宅大樓(11層含以上有電梯)" ...
## $ main_purpose : chr "國民住宅" "商業用" NA "商業用" ...
## $ built_with : chr "鋼筋混凝土造" "鋼筋混凝土造" NA "鋼筋混凝土造" ...
## $ finish_ymd : Date, format: "1985-05-22" "1982-04-08" ...
## $ building_sqmeter : num 101 93.4 0 36.7 104.1 ...
## $ room : int 3 0 0 1 3 0 0 3 2 3 ...
## $ living_room : int 2 0 0 1 1 0 0 2 1 2 ...
## $ bath : int 1 0 0 1 1 0 0 2 1 2 ...
## $ compartment : chr "有" "有" "有" "有" ...
## $ management : chr "有" "有" "無" "有" ...
## $ total_price : int 18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
## $ price_per_sqmeter : num 184999 217307 24017 114317 134473 ...
## $ parking_type : chr NA NA NA NA ...
## $ parking_sqmeter : num 0 0 0 0 0 0 0 0 0 0 ...
## $ parking_price : int 0 0 0 0 0 0 0 0 0 0 ...
## $ comments : chr NA NA NA NA ...
## $ numbers : chr "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of 5 variables:
## ..$ row : int 1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
## ..$ col : chr "total_price" "total_price" "total_price" "total_price" ...
## ..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
## ..$ actual : chr "6700000000" "3882685600" "3373314400" "3050000000" ...
## ..$ file : chr "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 29
## .. ..$ X1 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_target : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ land_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ city_land_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_land_type: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_code : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ trading_num : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ building_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ main_purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ built_with : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ finish_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ building_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ living_room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bath : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ compartment : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ management : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ price_per_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ parking_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ comments : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ numbers : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Per-column summary; character columns only report length/class/mode here.
summary(lvr_prices)
## X1 area trading_target address
## Min. : 0 Length:102054 Length:102054 Length:102054
## 1st Qu.:1649 Class :character Class :character Class :character
## Median :3363 Mode :character Mode :character Mode :character
## Mean :3539
## 3rd Qu.:5188
## Max. :9683
##
## land_sqmeter city_land_type non_city_land_type
## Min. : 0.00 Length:102054 Length:102054
## 1st Qu.: 9.33 Class :character Class :character
## Median : 22.13 Mode :character Mode :character
## Mean : 54.32
## 3rd Qu.: 35.73
## Max. :46193.00
##
## non_city_code trading_ymd trading_num
## Length:102054 Min. :1973-08-29 Length:102054
## Class :character 1st Qu.:2013-04-17 Class :character
## Mode :character Median :2014-01-02 Mode :character
## Mean :2014-02-11
## 3rd Qu.:2014-12-29
## Max. :2016-05-16
## NA's :20
## floor total_floor building_type
## Length:102054 Length:102054 Length:102054
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## main_purpose built_with finish_ymd
## Length:102054 Length:102054 Min. :1911-05-06
## Class :character Class :character 1st Qu.:1983-03-30
## Mode :character Mode :character Median :1997-01-20
## Mean :1996-04-21
## 3rd Qu.:2010-03-01
## Max. :2016-03-11
## NA's :21992
## building_sqmeter room living_room bath
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 43.45 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 90.79 Median : 2.000 Median : 1.000 Median : 1.000
## Mean : 124.51 Mean : 1.938 Mean : 1.208 Mean : 1.248
## 3rd Qu.: 144.22 3rd Qu.: 3.000 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :69125.53 Max. :168.000 Max. :80.000 Max. :174.000
##
## compartment management total_price
## Length:102054 Length:102054 Min. :0.000e+00
## Class :character Class :character 1st Qu.:7.500e+06
## Mode :character Mode :character Median :1.450e+07
## Mean :2.399e+07
## 3rd Qu.:2.615e+07
## Max. :2.029e+09
## NA's :32
## price_per_sqmeter parking_type parking_sqmeter
## Min. : 0 Length:102054 Min. :0.00e+00
## 1st Qu.: 120832 Class :character 1st Qu.:0.00e+00
## Median : 166859 Mode :character Median :0.00e+00
## Mean : 184978 Mean :2.54e+01
## 3rd Qu.: 222266 3rd Qu.:0.00e+00
## Max. :62685714 Max. :1.45e+06
## NA's :5199
## parking_price comments numbers
## Min. : 0 Length:102054 Length:102054
## 1st Qu.: 0 Class :character Class :character
## Median : 0 Mode :character Mode :character
## Mean : 488509
## 3rd Qu.: 0
## Max. :240000000
##
# Preview the first six rows.
head(lvr_prices, n = 6L)
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## 2 1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121… 8.46 商
## 3 2 大同區 土地 橋北段二小段601~630地號… 5.50 其他
## 4 3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~… 3.88 商
## 5 4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷… 32.4 住
## 6 5 信義區 土地 福德段一小段661~690地號… 9.37 其他
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Look up head()'s documentation, then request ten rows instead of six.
?head
head(lvr_prices, n = 10)
## # A tibble: 10 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷… 19.4 住
## 2 1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段12… 8.46 商
## 3 2 大同區 土地 橋北段二小段601~630地… 5.50 其他
## 4 3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61… 3.88 商
## 5 4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90… 32.4 住
## 6 5 信義區 土地 福德段一小段661~690地… 9.37 其他
## 7 6 松山區 土地 寶清段一小段31~60地號… 1.02 其他
## 8 7 松山區 房地(土地+建物)… 臺北市松山區三民路68巷1~… 35.5 住
## 9 8 士林區 房地(土地+建物)… 臺北市士林區承德路四段10巷… 31.2 住
## 10 9 大安區 房地(土地+建物)… 臺北市大安區敦化南路一段27… 48.0 商
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Preview the last six rows.
tail(lvr_prices, n = 6L)
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 767 萬華區 房地(土地+建物) 臺北市萬華區中華路二段364… 7.27 住
## 2 768 中正區 房地(土地+建物) 臺北市中正區中華路一段1~3… 4.85 商
## 3 769 中正區 房地(土地+建物)+車位… 臺北市中正區中華路一段1~3… 5.09 商
## 4 770 中正區 房地(土地+建物) 臺北市中正區中華路一段1~3… 5.15 商
## 5 771 文山區 車位 臺北市文山區羅斯福路六段15… 3.42 商
## 6 772 中正區 土地 福和段一小段751~780地… 8.00 其他
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Preview the last ten rows.
tail(lvr_prices, n = 10)
## # A tibble: 10 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 763 文山區 房地(土地+建物)+車位… 臺北市文山區興隆路一段24… 10.5 住
## 2 764 萬華區 房地(土地+建物) 臺北市萬華區中華路二段50… 12.4 住
## 3 765 萬華區 房地(土地+建物) 臺北市萬華區西園路二段26… 24.0 住
## 4 766 中正區 房地(土地+建物)+車位… 臺北市中正區濟南路一段1~… 47.2 住
## 5 767 萬華區 房地(土地+建物) 臺北市萬華區中華路二段36… 7.27 住
## 6 768 中正區 房地(土地+建物) 臺北市中正區中華路一段1~… 4.85 商
## 7 769 中正區 房地(土地+建物)+車位… 臺北市中正區中華路一段1~… 5.09 商
## 8 770 中正區 房地(土地+建物) 臺北市中正區中華路一段1~… 5.15 商
## 9 771 文山區 車位 臺北市文山區羅斯福路六段1… 3.42 商
## 10 772 中正區 土地 福和段一小段751~780… 8.00 其他
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Before conversion the area column is a plain character vector.
class(lvr_prices[["area"]])
## [1] "character"
head(lvr_prices[["area"]])
## [1] "大安區" "中正區" "大同區" "大同區" "內湖區" "信義區"
# Character strings that hold numbers can be coerced to numeric types.
as.integer("30")
## [1] 30
as.numeric("30")
## [1] 30
# Convert the categorical text columns to factors so that summary() tabulates
# their levels. Each column is converted exactly once (city_land_type was
# previously converted twice by a duplicated statement).
factor_cols <- c(
  "area", "trading_target", "city_land_type", "built_with",
  "main_purpose", "compartment", "building_type"
)
lvr_prices[factor_cols] <- lapply(lvr_prices[factor_cols], as.factor)
# NOTE(review): management holds the same two values as compartment but is
# left as character - confirm whether it should be converted as well.
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame': 102054 obs. of 29 variables:
## $ X1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ area : Factor w/ 12 levels "中山區","中正區",..: 9 2 8 8 4 3 11 11 7 9 ...
## $ trading_target : Factor w/ 5 levels "土地","建物",..: 3 3 1 3 3 1 1 3 3 3 ...
## $ address : chr "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
## $ land_sqmeter : num 19.39 8.46 5.5 3.88 32.41 ...
## $ city_land_type : Factor w/ 5 levels "住","其他","商",..: 1 3 2 3 1 2 2 1 1 3 ...
## $ non_city_land_type: chr NA NA NA NA ...
## $ non_city_code : chr NA NA NA NA ...
## $ trading_ymd : Date, format: "2012-06-29" "2012-07-18" ...
## $ trading_num : chr "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
## $ floor : chr "五層" "九層" NA "六層" ...
## $ total_floor : chr "十七層" "十二層" NA "十一層" ...
## $ building_type : Factor w/ 12 levels "住宅大樓(11層含以上有電梯)",..: 1 10 4 1 3 4 4 9 3 3 ...
## $ main_purpose : Factor w/ 12 levels "住商用","住家用",..: 7 6 NA 6 2 NA NA 7 2 2 ...
## $ built_with : Factor w/ 17 levels "加強磚造","土木造",..: 12 12 NA 12 12 NA NA 12 12 12 ...
## $ finish_ymd : Date, format: "1985-05-22" "1982-04-08" ...
## $ building_sqmeter : num 101 93.4 0 36.7 104.1 ...
## $ room : int 3 0 0 1 3 0 0 3 2 3 ...
## $ living_room : int 2 0 0 1 1 0 0 2 1 2 ...
## $ bath : int 1 0 0 1 1 0 0 2 1 2 ...
## $ compartment : Factor w/ 2 levels "有","無": 1 1 1 1 1 1 1 1 1 1 ...
## $ management : chr "有" "有" "無" "有" ...
## $ total_price : int 18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
## $ price_per_sqmeter : num 184999 217307 24017 114317 134473 ...
## $ parking_type : chr NA NA NA NA ...
## $ parking_sqmeter : num 0 0 0 0 0 0 0 0 0 0 ...
## $ parking_price : int 0 0 0 0 0 0 0 0 0 0 ...
## $ comments : chr NA NA NA NA ...
## $ numbers : chr "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of 5 variables:
## ..$ row : int 1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
## ..$ col : chr "total_price" "total_price" "total_price" "total_price" ...
## ..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
## ..$ actual : chr "6700000000" "3882685600" "3373314400" "3050000000" ...
## ..$ file : chr "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 29
## .. ..$ X1 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_target : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ land_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ city_land_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_land_type: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_code : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ trading_num : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ building_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ main_purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ built_with : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ finish_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ building_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ living_room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bath : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ compartment : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ management : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ price_per_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ parking_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ comments : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ numbers : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Summarise again: the columns converted to factors now show level counts.
summary(lvr_prices)
## X1 area trading_target
## Min. : 0 中山區 :15020 土地 :11130
## 1st Qu.:1649 內湖區 :12312 建物 : 669
## Median :3363 文山區 :10572 房地(土地+建物) :56207
## Mean :3539 北投區 :10278 房地(土地+建物)+車位:28941
## 3rd Qu.:5188 大安區 : 9383 車位 : 5107
## Max. :9683 士林區 : 8878
## (Other):35611
## address land_sqmeter city_land_type non_city_land_type
## Length:102054 Min. : 0.00 住 :59083 Length:102054
## Class :character 1st Qu.: 9.33 其他:11896 Class :character
## Mode :character Median : 22.13 商 :27436 Mode :character
## Mean : 54.32 工 : 2269
## 3rd Qu.: 35.73 農 : 435
## Max. :46193.00 NA's: 935
##
## non_city_code trading_ymd trading_num
## Length:102054 Min. :1973-08-29 Length:102054
## Class :character 1st Qu.:2013-04-17 Class :character
## Mode :character Median :2014-01-02 Mode :character
## Mean :2014-02-11
## 3rd Qu.:2014-12-29
## Max. :2016-05-16
## NA's :20
## floor total_floor building_type
## Length:102054 Length:102054 住宅大樓(11層含以上有電梯):28070
## Class :character Class :character 公寓(5樓含以下無電梯) :21493
## Mode :character Mode :character 華廈(10層含以下有電梯) :18043
## 其他 :16665
## 套房(1房1廳1衛) :10119
## 辦公商業大樓 : 2781
## (Other) : 4883
## main_purpose built_with finish_ymd
## 住家用 :62535 鋼筋混凝土造 :80483 Min. :1911-05-06
## 商業用 : 9350 加強磚造 : 4262 1st Qu.:1983-03-30
## 見其他登記事項: 7602 鋼骨鋼筋混凝土造: 2259 Median :1997-01-20
## 見使用執照 : 2877 見其他登記事項 : 2027 Mean :1996-04-21
## 國民住宅 : 2093 磚造 : 418 3rd Qu.:2010-03-01
## (Other) : 2758 (Other) : 579 Max. :2016-03-11
## NA's :14839 NA's :12026 NA's :21992
## building_sqmeter room living_room bath
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 43.45 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 90.79 Median : 2.000 Median : 1.000 Median : 1.000
## Mean : 124.51 Mean : 1.938 Mean : 1.208 Mean : 1.248
## 3rd Qu.: 144.22 3rd Qu.: 3.000 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :69125.53 Max. :168.000 Max. :80.000 Max. :174.000
##
## compartment management total_price price_per_sqmeter
## 有:93155 Length:102054 Min. :0.000e+00 Min. : 0
## 無: 8899 Class :character 1st Qu.:7.500e+06 1st Qu.: 120832
## Mode :character Median :1.450e+07 Median : 166859
## Mean :2.399e+07 Mean : 184978
## 3rd Qu.:2.615e+07 3rd Qu.: 222266
## Max. :2.029e+09 Max. :62685714
## NA's :32 NA's :5199
## parking_type parking_sqmeter parking_price
## Length:102054 Min. :0.00e+00 Min. : 0
## Class :character 1st Qu.:0.00e+00 1st Qu.: 0
## Mode :character Median :0.00e+00 Median : 0
## Mean :2.54e+01 Mean : 488509
## 3rd Qu.:0.00e+00 3rd Qu.: 0
## Max. :1.45e+06 Max. :240000000
##
## comments numbers
## Length:102054 Length:102054
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# 資料篩選 (Data filtering) ----
# Select the first row with all columns.
lvr_prices[1, ]
## # A tibble: 1 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <fct> <fct> <chr> <dbl> <fct>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <fct>,
## # main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <fct>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Select rows 1, 3 and 5 by passing a vector of positions.
lvr_prices[c(1, 3, 5), ]
## # A tibble: 3 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <fct> <fct> <chr> <dbl> <fct>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## 2 2 大同區 土地 橋北段二小段601~630地號… 5.50 其他
## 3 4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷… 32.4 住
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <fct>,
## # main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <fct>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# The colon operator builds a sequence of consecutive integers ...
1:5
## [1] 1 2 3 4 5
# ... which can be used directly as a row index.
lvr_prices[1:5, ]
## # A tibble: 5 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <fct> <fct> <chr> <dbl> <fct>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## 2 1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121… 8.46 商
## 3 2 大同區 土地 橋北段二小段601~630地號… 5.50 其他
## 4 3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~… 3.88 商
## 5 4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷… 32.4 住
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <fct>,
## # main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <fct>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Rows 1 to 3, columns 1 to 4 selected by position.
lvr_prices[seq_len(3), c(1, 2, 3, 4)]
## # A tibble: 3 x 4
## X1 area trading_target address
## <int> <fct> <fct> <chr>
## 1 0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 1 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號
## 3 2 大同區 土地 橋北段二小段601~630地號
# Rows 1 to 3, columns 2 to 4 given as an explicit position vector.
lvr_prices[seq_len(3), c(2, 3, 4)]
## # A tibble: 3 x 3
## area trading_target address
## <fct> <fct> <chr>
## 1 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號
## 3 大同區 土地 橋北段二小段601~630地號
# Rows 1 to 3, columns 2 to 4 given as a sequence.
lvr_prices[seq_len(3), seq(2, 4)]
## # A tibble: 3 x 3
## area trading_target address
## <fct> <fct> <chr>
## 1 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號
## 3 大同區 土地 橋北段二小段601~630地號
# Rows 1 to 3, columns selected by name instead of position.
lvr_prices[seq_len(3), c("area", "address", "total_price")]
## # A tibble: 3 x 3
## area address total_price
## <fct> <chr> <int>
## 1 大安區 臺北市大安區和平東路三段1巷72弄1~30號 18680000
## 2 中正區 臺北市中正區忠孝東路二段121~150號 20300000
## 3 大同區 橋北段二小段601~630地號 132096
# First six rows (head's default) of the first two columns.
head(lvr_prices[, 1:2], n = 6L)
## # A tibble: 6 x 2
## X1 area
## <int> <fct>
## 1 0 大安區
## 2 1 中正區
## 3 2 大同區
## 4 3 大同區
## 5 4 內湖區
## 6 5 信義區
# Extract the area column as a vector and show its first six values.
head(lvr_prices[["area"]])
## [1] 大安區 中正區 大同區 大同區 內湖區 信義區
## 12 Levels: 中山區 中正區 信義區 內湖區 北投區 南港區 士林區 ... 萬華區
# Index a plain vector by position and by logical mask; both pick element 1.
a <- c(50, 60, 70)
a[1L]
## [1] 50
a[c(TRUE, FALSE, FALSE)]
## [1] 50
# Logical mask over rows: TRUE where the transaction's area is 大安區.
daan <- lvr_prices$area == '大安區'
# Use the mask as a row index and show the first six matching rows.
daan_rows <- lvr_prices[daan, ]
head(daan_rows)
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <fct> <fct> <chr> <dbl> <fct>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## 2 9 大安區 房地(土地+建物)… 臺北市大安區敦化南路一段270… 48.0 商
## 3 10 大安區 車位 臺北市大安區永康街7巷1~30… 0.100 商
## 4 11 大安區 車位 臺北市大安區永康街7巷1~30… 0.0500 商
## 5 12 大安區 車位 臺北市大安區永康街7巷1~30… 0.0500 商
## 6 29 大安區 土地 通化段三小段781~810地號… 28.0 其他
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <fct>,
## # main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <fct>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
# Combine the logical row mask with a selection of columns by name.
daan_prices <- lvr_prices[daan, c('area', 'total_price')]
head(daan_prices)
## # A tibble: 6 x 2
## area total_price
## <fct> <int>
## 1 大安區 18680000
## 2 大安區 28000000
## 3 大安區 1600000
## 4 大安區 850000
## 5 大安區 850000
## 6 大安區 750000
# 缺失值 (Missing values) ----
# A single NA in the vector makes the plain mean NA.
mean(lvr_prices$total_price)
## [1] NA
# Open the help page of mean() to look up the na.rm argument.
help("mean")
# Remove missing values before averaging.
mean(lvr_prices$total_price, na.rm = TRUE)
## [1] 23993655
median(lvr_prices$total_price, na.rm = TRUE)
## [1] 14500000
# Share of missing prices: the mean of a logical vector is its proportion
# of TRUE values, i.e. sum(is.na(x)) / length(x).
mean(is.na(lvr_prices$total_price))
## [1] 0.0003135595
## [1] 0.0003135595
# Method 1: explicit for loop, printing the square of each element.
a <- c(1, 2, 3, 4, 5)
for (idx in seq_along(a)) {
  print(a[idx]^2)
}
## [1] 1
## [1] 4
## [1] 9
## [1] 16
## [1] 25
# Method 2: sapply (a looping function) applies the function to every
# element and collects the results into one vector.
sapply(a, function(value) value^2)
## [1] 1 4 9 16 25
# Method 1: for loop — print the share of missing values (0 to 1) per column.
#
# lvr_prices is a tibble, so lvr_prices[, col] stays a one-column tibble whose
# length() is 1 (its number of columns). Dividing by that printed the raw NA
# count instead of a ratio. [[col]] extracts the column as a vector, and the
# mean of the logical is.na() vector equals sum(is.na(x)) / length(x).
#
# NOTE: the output recorded below was produced before this fix and therefore
# shows NA counts rather than ratios.
for (col in names(lvr_prices)) {
  print(col)
  nstat <- mean(is.na(lvr_prices[[col]]))
  print(nstat)
}
## [1] "X1"
## [1] 0
## [1] "area"
## [1] 0
## [1] "trading_target"
## [1] 0
## [1] "address"
## [1] 0
## [1] "land_sqmeter"
## [1] 0
## [1] "city_land_type"
## [1] 935
## [1] "non_city_land_type"
## [1] 102037
## [1] "non_city_code"
## [1] 102054
## [1] "trading_ymd"
## [1] 20
## [1] "trading_num"
## [1] 0
## [1] "floor"
## [1] 12237
## [1] "total_floor"
## [1] 12257
## [1] "building_type"
## [1] 0
## [1] "main_purpose"
## [1] 14839
## [1] "built_with"
## [1] 12026
## [1] "finish_ymd"
## [1] 21992
## [1] "building_sqmeter"
## [1] 0
## [1] "room"
## [1] 0
## [1] "living_room"
## [1] 0
## [1] "bath"
## [1] 0
## [1] "compartment"
## [1] 0
## [1] "management"
## [1] 0
## [1] "total_price"
## [1] 32
## [1] "price_per_sqmeter"
## [1] 5199
## [1] "parking_type"
## [1] 67889
## [1] "parking_sqmeter"
## [1] 0
## [1] "parking_price"
## [1] 0
## [1] "comments"
## [1] 68120
## [1] "numbers"
## [1] 0
# Method 2: sapply (a looping function) — share of missing values (0 to 1)
# per column, returned as a numeric vector named after the columns.
#
# Iterating over the data frame itself hands each column to the function as a
# vector. The previous lvr_prices[, col] kept a one-column tibble, whose
# length() is 1, so the result was the NA count rather than the ratio.
#
# NOTE: the output recorded below was produced before this fix and therefore
# shows NA counts rather than ratios.
sapply(lvr_prices, function(column) mean(is.na(column)))
## X1 area trading_target
## 0 0 0
## address land_sqmeter city_land_type
## 0 0 935
## non_city_land_type non_city_code trading_ymd
## 102037 102054 20
## trading_num floor total_floor
## 0 12237 12257
## building_type main_purpose built_with
## 0 14839 12026
## finish_ymd building_sqmeter room
## 21992 0 0
## living_room bath compartment
## 0 0 0
## management total_price price_per_sqmeter
## 0 32 5199
## parking_type parking_sqmeter parking_price
## 67889 0 0
## comments numbers
## 68120 0
# Remove the four columns with the highest NA counts in the output above,
# then inspect the structure of what is left.
for (drop_col in c("non_city_land_type", "non_city_code", "comments", "parking_type")) {
  # Assigning NULL to a column deletes it.
  lvr_prices[[drop_col]] <- NULL
}
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame': 102054 obs. of 25 variables:
## $ X1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ area : Factor w/ 12 levels "中山區","中正區",..: 9 2 8 8 4 3 11 11 7 9 ...
## $ trading_target : Factor w/ 5 levels "土地","建物",..: 3 3 1 3 3 1 1 3 3 3 ...
## $ address : chr "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
## $ land_sqmeter : num 19.39 8.46 5.5 3.88 32.41 ...
## $ city_land_type : Factor w/ 5 levels "住","其他","商",..: 1 3 2 3 1 2 2 1 1 3 ...
## $ trading_ymd : Date, format: "2012-06-29" "2012-07-18" ...
## $ trading_num : chr "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
## $ floor : chr "五層" "九層" NA "六層" ...
## $ total_floor : chr "十七層" "十二層" NA "十一層" ...
## $ building_type : Factor w/ 12 levels "住宅大樓(11層含以上有電梯)",..: 1 10 4 1 3 4 4 9 3 3 ...
## $ main_purpose : Factor w/ 12 levels "住商用","住家用",..: 7 6 NA 6 2 NA NA 7 2 2 ...
## $ built_with : Factor w/ 17 levels "加強磚造","土木造",..: 12 12 NA 12 12 NA NA 12 12 12 ...
## $ finish_ymd : Date, format: "1985-05-22" "1982-04-08" ...
## $ building_sqmeter : num 101 93.4 0 36.7 104.1 ...
## $ room : int 3 0 0 1 3 0 0 3 2 3 ...
## $ living_room : int 2 0 0 1 1 0 0 2 1 2 ...
## $ bath : int 1 0 0 1 1 0 0 2 1 2 ...
## $ compartment : Factor w/ 2 levels "有","無": 1 1 1 1 1 1 1 1 1 1 ...
## $ management : chr "有" "有" "無" "有" ...
## $ total_price : int 18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
## $ price_per_sqmeter: num 184999 217307 24017 114317 134473 ...
## $ parking_sqmeter : num 0 0 0 0 0 0 0 0 0 0 ...
## $ parking_price : int 0 0 0 0 0 0 0 0 0 0 ...
## $ numbers : chr "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of 5 variables:
## ..$ row : int 1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
## ..$ col : chr "total_price" "total_price" "total_price" "total_price" ...
## ..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
## ..$ actual : chr "6700000000" "3882685600" "3373314400" "3050000000" ...
## ..$ file : chr "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 29
## .. ..$ X1 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_target : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ land_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ city_land_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_land_type: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_code : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ trading_num : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ building_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ main_purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ built_with : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ finish_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ building_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ living_room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bath : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ compartment : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ management : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ price_per_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ parking_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ comments : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ numbers : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Keep only the rows whose total_price is present.
has_price <- !is.na(lvr_prices$total_price)
lvr_prices <- lvr_prices[has_price, ]
# 文字處理 (Text processing) ----
# Peek at the data and at the floor column before cleaning it.
head(lvr_prices, n = 6L)
head(lvr_prices[["floor"]], n = 6L)
# lvr_prices$floor
help("gsub")
# gsub() replaces every match of a pattern; here each '層' is removed from
# the floor labels.
floor_tmp <- gsub(pattern = '層', replacement = '', x = lvr_prices$floor)
# Regular-expression walkthrough with grepl(), which returns TRUE for each
# string in which the pattern is found (anywhere, unless anchored).

# Exact comparison: TRUE only where an element equals the given string.
phones <- c('0912345678', '0923456780', '0912344556')
c('0912345678') == phones

# A literal pattern matches itself.
s <- '6'
grepl('6', s)
grepl('5', s)
# [...] : character class, matches any one of the listed characters
grepl('[0123456789]', s)
# \\d : shorthand for a digit, same as [0123456789] here
grepl('\\d', s)

# Character classes are case sensitive.
s <- 'w'
grepl('[abcdefghijklmnopqrstuvwxyz]', s)
s <- 'W'
grepl('[abcdefghijklmnopqrstuvwxyz]', s)
grepl('[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]', s)
# - : range inside a class, [abcdefghijklmnopqrstuvwxyz] => [a-z]
s <- 'y'
grepl('[a-z]', s)
grepl('[a-zA-Z]', s)
s <- '7'
grepl('[a-zA-Z]', s)
grepl('[a-zA-Z0-9]', s)
# \\w : "word" character, i.e. [a-zA-Z0-9_] (note that it includes underscore)
grepl('\\w', s)

# {n} : the preceding item repeated exactly n times
s <- 'apple'
grepl('\\w{5}', s)
# {m,n} : at least m and at most n repetitions; {m,} : at least m.
# Write the bound without spaces: whitespace inside the braces is not part of
# the POSIX syntax and is handled differently by different regex engines.
s <- 'bannana'
grepl('\\w{5,}', s)
grepl('\\w{1,}', s)
# + => {1,}
grepl('\\w+', s)
grepl('\\w{0,}', s)
# * => {0,}
grepl('\\w*', s)

# Mobile numbers with optional dashes.
phones <- c('0912345678', '0912-345-678', '0912-345678', '0958034580580435')
# Unanchored: also TRUE for the over-long last entry, which contains a match.
grepl('09\\d{8}', phones)
grepl('09\\d{2}-{0,1}\\d{3}-{0,1}\\d{3}', phones)
# ? => {0,1}
grepl('09\\d{2}-?\\d{3}-?\\d{3}', phones)
# ^ matches the beginning and $ the end, so the whole string must fit.
grepl('^09\\d{2}-?\\d{3}-?\\d{3}$', phones)
library(stringr)
# Capture the three numbers embedded in trading_num (after 土地, 建物 and 車位).
trading_pattern <- "土地(\\d+)建物(\\d+)車位(\\d+)"
trading_data <- str_match_all(lvr_prices$trading_num, trading_pattern)
# Small example of do.call(): every element of the list is passed to rbind()
# as one argument, which stacks the two data frames row-wise.
a <- data.frame(a = c(1, 2), b = c(3, 4))
a
b <- data.frame(a = c(5, 6), b = c(7, 8))
b
do.call(rbind, list(a, b))
# do.call('rbind', trading_data)
# Split the first few addresses into city/county, district and road/street.
address_pattern <- "(.+[市縣])(.+區)(.+[路街])"
str_match_all(head(lvr_prices$address), address_pattern)
# 資料排序 (Sorting) ----
# Six highest total prices.
prices_desc <- sort(lvr_prices$total_price, decreasing = TRUE)
head(prices_desc)
## [1] 2028880000 2008800000 1930000000 1869781219 1850000000 1715668356
# sort() returns the ordered values; order() returns the positions that
# would put the vector in order.
a <- c(70, 50, 60, 80, 55)
sort(a)
## [1] 50 55 60 70 80
order(a)
## [1] 2 5 3 1 4
# Use those positions as a row index: whole rows by descending total_price.
by_price_desc <- order(lvr_prices$total_price, decreasing = TRUE)
head(lvr_prices[by_price_desc, ])
## # A tibble: 6 x 25
## X1 area trading_target address land_sqmeter city_land_type
## <int> <fct> <fct> <chr> <dbl> <fct>
## 1 7679 內湖區 房地(土地+建物)+車位… 臺北市內湖區新湖一路271~… 3948. 工
## 2 1397 松山區 房地(土地+建物) 臺北市松山區敦化北路121~… 1304. 住
## 3 4373 北投區 房地(土地+建物)+車位… 臺北市北投區石牌路一段181… 1127. 商
## 4 2389 大安區 房地(土地+建物) 臺北市大安區羅斯福路三段28… 2376. 住
## 5 5912 中山區 房地(土地+建物)+車位… 臺北市中山區建國北路一段13… 709. 住
## 6 3786 北投區 土地 新洲美段31~60地號… 4526. 其他
## # ... with 19 more variables: trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <fct>,
## # main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <fct>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_sqmeter <dbl>, parking_price <int>,
## # numbers <chr>
# List the distinct categories of the building_type factor.
levels(lvr_prices[["building_type"]])
## [1] "住宅大樓(11層含以上有電梯)" "倉庫"
## [3] "公寓(5樓含以下無電梯)" "其他"
## [5] "套房(1房1廳1衛)" "工廠"
## [7] "店面(店鋪)" "廠辦"
## [9] "華廈(10層含以下有電梯)" "辦公商業大樓"
## [11] "農舍" "透天厝"
# Rows whose building type is '公寓(5樓含以下無電梯)', most expensive first.
is_apartment <- lvr_prices$building_type == '公寓(5樓含以下無電梯)'
apartment <- lvr_prices[is_apartment, ]
apartment_rank <- order(apartment$total_price, decreasing = TRUE)
head(apartment[apartment_rank, ])
## # A tibble: 6 x 25
## X1 area trading_target address land_sqmeter city_land_type
## <int> <fct> <fct> <chr> <dbl> <fct>
## 1 3015 中山區 房地(土地+建物)… 臺北市中山區中山北路二段121… 128. 商
## 2 1713 松山區 房地(土地+建物)… 臺北市松山區南京東路三段301… 147. 商
## 3 7282 萬華區 房地(土地+建物)… 臺北市萬華區華西街26巷1~3… 378. 商
## 4 6783 中山區 房地(土地+建物)… 臺北市中山區中山北路二段121… 136. 商
## 5 1522 中正區 房地(土地+建物)… 臺北市中正區汀州路二段151~… 317. 住
## 6 659 中正區 房地(土地+建物)… 臺北市中正區延平南路151~1… 268. 商
## # ... with 19 more variables: trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <fct>,
## # main_purpose <fct>, built_with <fct>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <fct>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_sqmeter <dbl>, parking_price <int>,
## # numbers <chr>
# Transactions in 大安區. total_price is an integer column; it is converted to
# double once and reused, since the sum below is larger than an integer can hold.
daan <- lvr_prices[lvr_prices$area == '大安區', ]
daan_price <- as.numeric(daan$total_price)
sum(daan_price, na.rm = TRUE)
## [1] 2.79477e+11
mean(daan_price, na.rm = TRUE)
## [1] 29798170
median(daan_price, na.rm = TRUE)
## [1] 2e+07
## [1] 2e+07
# Three most expensive transactions in 中山區 (address and price only).
in_zhongshan <- lvr_prices$area == '中山區'
zhongshan <- lvr_prices[in_zhongshan, c('address', 'total_price')]
# Row positions from the highest to the lowest total_price.
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx, ]
res[seq_len(3), ]
## # A tibble: 3 x 2
## address total_price
## <chr> <int>
## 1 臺北市中山區建國北路一段138巷1~30號 1850000000
## 2 臺北市中山區南京東路三段1~30號 1400000000
## 3 中山段二小段31~60地號 1084948034
# Select 中山區 rows with the address and total_price columns, order them by
# descending total_price, and show the first three.
keep_cols <- c('address', 'total_price')
zhongshan <- lvr_prices[lvr_prices$area == '中山區', keep_cols]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx, ]
res[c(1, 2, 3), ]
## # A tibble: 3 x 2
## address total_price
## <chr> <int>
## 1 臺北市中山區建國北路一段138巷1~30號 1850000000
## 2 臺北市中山區南京東路三段1~30號 1400000000
## 3 中山段二小段31~60地號 1084948034
# Return the three most expensive transactions of one district.
#
# Args:
#   area: district name to match against the `area` column.
#   data: data frame with `area`, `address` and `total_price` columns.
#         Defaults to the global lvr_prices, so existing one-argument calls
#         behave as before.
#
# Returns:
#   The `address` and `total_price` columns of at most three rows, ordered by
#   descending total_price. A district with fewer than three transactions
#   yields only the rows that exist (indexing with 1:3 asked for rows past
#   the end of the result).
getTopThree <- function(area, data = lvr_prices) {
  # which() keeps only TRUE positions, so records with a missing area are
  # skipped instead of being selected as NA rows.
  in_area <- which(data$area == area)
  house <- data[in_area, c('address', 'total_price')]
  # order() places missing prices last, after every real price.
  idx <- order(house$total_price, decreasing = TRUE)
  head(house[idx, ], 3)
}
# Top three transactions by total price in 大安區.
getTopThree(area = '大安區')
## # A tibble: 3 x 2
## address total_price
## <chr> <int>
## 1 臺北市大安區羅斯福路三段283巷4弄1~30號 1869781219
## 2 臺北市大安區忠孝東路四段241~270號 971340000
## 3 學府段三小段31~60地號 966660000
# Average total price of the transactions in 中山區, ignoring missing prices.
zhongshan_price <- lvr_prices$total_price[lvr_prices$area == '中山區']
mean(zhongshan_price, na.rm = TRUE)
## [1] 26708805
# Make sure area is a factor so that levels() lists every district.
lvr_prices$area <- as.factor(lvr_prices$area)
# Method 1: for loop — print each district with its median total price.
for (district in levels(lvr_prices$area)) {
  district_price <- lvr_prices$total_price[lvr_prices$area == district]
  print(paste(district, median(district_price, na.rm = TRUE)))
}
## [1] "中山區 12800000"
## [1] "中正區 16180000"
## [1] "信義區 15800000"
## [1] "內湖區 16500000"
## [1] "北投區 13000000"
## [1] "南港區 16685000"
## [1] "士林區 14350000"
## [1] "大同區 11770000"
## [1] "大安區 20000000"
## [1] "文山區 13300000"
## [1] "松山區 17800000"
## [1] "萬華區 9542595"
# Method 2: tapply — median total price per district in a single call; the
# extra na.rm argument is passed on to median().
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, median, na.rm = TRUE)
# Sort once, from the most to the least expensive district, and reuse it.
sorted_median <- sort(price_per_sec, decreasing = TRUE)
sorted_median
## 大安區 松山區 南港區 內湖區 中正區 信義區 士林區 文山區
## 20000000 17800000 16685000 16500000 16180000 15800000 14350000 13300000
## 北投區 中山區 大同區 萬華區
## 13000000 12800000 11770000 9542595
# Bar chart of the sorted medians.
barplot(sorted_median, col = 'blue')

# Box plot of log(total_price) for every district.
# Records with total_price == 0 give log(0) = -Inf; boxplot() could not draw
# those points and raised one "Outlier (-Inf) ... is not drawn" warning per
# affected district. The subset argument leaves the zero-price records out,
# so only finite values are summarised and plotted.
boxplot(log(total_price) ~ area, data = lvr_prices, subset = total_price > 0,
        main = "房價箱型圖", xlab = "區域", ylab = "價格(log)")
