# Basic assignment and arithmetic. `<-` is used consistently for assignment
# (the original mixed `<-` and `=`).
a <- 3
b <- 2
a + b
## [1] 5
data()
data(iris)
View(iris)
class(iris)
## [1] "data.frame"
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris, 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
help(head)
## starting httpd help server ... done
?head
iris[ 1 , ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
iris[ c(1,2,3) , ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
iris[ 1:3 , ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
iris[ 1:3 , 1 ]
## [1] 5.1 4.9 4.7
iris[ 1:3 , c(1,2) ]
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
iris[ 1:3 , 1:2 ]
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
iris[ 1:3 ,'Sepal.Length']
## [1] 5.1 4.9 4.7
iris[ 1:3 ,c('Sepal.Length', 'Sepal.Width')]
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
head(iris[, 'Sepal.Length'])
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris$Sepal.Length)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris$Species == 'setosa')
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
iris[ iris$Species == 'setosa' , ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
# Logical mask over the rows of iris: TRUE where the species is 'setosa'
# and the sepal length is at least 5. The first matching rows are previewed.
filter.cond <- with(iris, (Species == 'setosa') & (Sepal.Length >= 5))
head(iris[filter.cond, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 15 5.8 4.0 1.2 0.2 setosa
heights <- c(180, 172, 168, 183, 178)
sort(heights)
## [1] 168 172 178 180 183
sort(heights, decreasing = TRUE)
## [1] 183 180 178 172 168
order(heights)
## [1] 3 2 5 1 4
order(heights, decreasing = TRUE)
## [1] 4 1 5 2 3
head(sort(iris$Sepal.Length, decreasing = TRUE))
## [1] 7.9 7.7 7.7 7.7 7.7 7.6
rank <- order(iris$Sepal.Length, decreasing = TRUE)
head(iris[ rank , 'Species' ])
## [1] virginica virginica virginica virginica virginica virginica
## Levels: setosa versicolor virginica
data("anscombe")
View(anscombe)
plot(y1 ~ x1, data = anscombe)
plot(y2 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
plot(y4 ~ x1, data = anscombe)
tb <- table(iris$Species)
pie(tb)
barplot(tb)
hist(iris$Sepal.Length)
boxplot(iris$Sepal.Length)
boxplot(iris$Petal.Length~iris$Species)
plot(iris$Petal.Length, iris$Petal.Width)
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')
getwd()
## [1] "D:/OS DATA/Desktop"
library(readr)
# Read the price data from the working directory (the same relative location
# the download.file() hint above saves to) rather than a machine-specific
# absolute path.
# total_price is forced to double: some prices (e.g. 6700000000) do not fit
# in a 32-bit integer, so when readr guesses "integer" for this column those
# rows fail to parse and silently become NA.
lvr_prices <- read_csv(
  "lvr_prices.csv",
  col_types = cols(total_price = col_double())
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 1282 total_price an integer 6700000000 file 2 2243 total_price an integer 3882685600 row 3 2244 total_price an integer 3373314400 col 4 4629 total_price an integer 3050000000 expected 5 5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
#View(lvr_prices)
class(lvr_prices)
## [1] "tbl_df" "tbl" "data.frame"
str(lvr_prices)
## Classes 'tbl_df', 'tbl' and 'data.frame': 102054 obs. of 29 variables:
## $ X1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ area : chr "大安區" "中正區" "大同區" "大同區" ...
## $ trading_target : chr "房地(土地+建物)" "房地(土地+建物)" "土地" "房地(土地+建物)" ...
## $ address : chr "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
## $ land_sqmeter : num 19.39 8.46 5.5 3.88 32.41 ...
## $ city_land_type : chr "住" "商" "其他" "商" ...
## $ non_city_land_type: chr NA NA NA NA ...
## $ non_city_code : chr NA NA NA NA ...
## $ trading_ymd : Date, format: "2012-06-29" "2012-07-18" ...
## $ trading_num : chr "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
## $ floor : chr "五層" "九層" NA "六層" ...
## $ total_floor : chr "十七層" "十二層" NA "十一層" ...
## $ building_type : chr "住宅大樓(11層含以上有電梯)" "辦公商業大樓" "其他" "住宅大樓(11層含以上有電梯)" ...
## $ main_purpose : chr "國民住宅" "商業用" NA "商業用" ...
## $ built_with : chr "鋼筋混凝土造" "鋼筋混凝土造" NA "鋼筋混凝土造" ...
## $ finish_ymd : Date, format: "1985-05-22" "1982-04-08" ...
## $ building_sqmeter : num 101 93.4 0 36.7 104.1 ...
## $ room : int 3 0 0 1 3 0 0 3 2 3 ...
## $ living_room : int 2 0 0 1 1 0 0 2 1 2 ...
## $ bath : int 1 0 0 1 1 0 0 2 1 2 ...
## $ compartment : chr "有" "有" "有" "有" ...
## $ management : chr "有" "有" "無" "有" ...
## $ total_price : int 18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
## $ price_per_sqmeter : num 184999 217307 24017 114317 134473 ...
## $ parking_type : chr NA NA NA NA ...
## $ parking_sqmeter : num 0 0 0 0 0 0 0 0 0 0 ...
## $ parking_price : int 0 0 0 0 0 0 0 0 0 0 ...
## $ comments : chr NA NA NA NA ...
## $ numbers : chr "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of 5 variables:
## ..$ row : int 1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
## ..$ col : chr "total_price" "total_price" "total_price" "total_price" ...
## ..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
## ..$ actual : chr "6700000000" "3882685600" "3373314400" "3050000000" ...
## ..$ file : chr "'D:/OS DATA/Desktop/lvr_prices.csv'" "'D:/OS DATA/Desktop/lvr_prices.csv'" "'D:/OS DATA/Desktop/lvr_prices.csv'" "'D:/OS DATA/Desktop/lvr_prices.csv'" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 29
## .. ..$ X1 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_target : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ land_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ city_land_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_land_type: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_code : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ trading_num : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ building_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ main_purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ built_with : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ finish_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ building_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ living_room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bath : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ compartment : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ management : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ price_per_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ parking_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ comments : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ numbers : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
summary(lvr_prices)
## X1 area trading_target address
## Min. : 0 Length:102054 Length:102054 Length:102054
## 1st Qu.:1649 Class :character Class :character Class :character
## Median :3363 Mode :character Mode :character Mode :character
## Mean :3539
## 3rd Qu.:5188
## Max. :9683
##
## land_sqmeter city_land_type non_city_land_type
## Min. : 0.00 Length:102054 Length:102054
## 1st Qu.: 9.33 Class :character Class :character
## Median : 22.13 Mode :character Mode :character
## Mean : 54.32
## 3rd Qu.: 35.73
## Max. :46193.00
##
## non_city_code trading_ymd trading_num
## Length:102054 Min. :1973-08-29 Length:102054
## Class :character 1st Qu.:2013-04-17 Class :character
## Mode :character Median :2014-01-02 Mode :character
## Mean :2014-02-11
## 3rd Qu.:2014-12-29
## Max. :2016-05-16
## NA's :20
## floor total_floor building_type
## Length:102054 Length:102054 Length:102054
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## main_purpose built_with finish_ymd
## Length:102054 Length:102054 Min. :1911-05-06
## Class :character Class :character 1st Qu.:1983-03-30
## Mode :character Mode :character Median :1997-01-20
## Mean :1996-04-21
## 3rd Qu.:2010-03-01
## Max. :2016-03-11
## NA's :21992
## building_sqmeter room living_room bath
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 43.45 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 90.79 Median : 2.000 Median : 1.000 Median : 1.000
## Mean : 124.51 Mean : 1.938 Mean : 1.208 Mean : 1.248
## 3rd Qu.: 144.22 3rd Qu.: 3.000 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :69125.53 Max. :168.000 Max. :80.000 Max. :174.000
##
## compartment management total_price
## Length:102054 Length:102054 Min. :0.000e+00
## Class :character Class :character 1st Qu.:7.500e+06
## Mode :character Mode :character Median :1.450e+07
## Mean :2.399e+07
## 3rd Qu.:2.615e+07
## Max. :2.029e+09
## NA's :32
## price_per_sqmeter parking_type parking_sqmeter
## Min. : 0 Length:102054 Min. :0.00e+00
## 1st Qu.: 120832 Class :character 1st Qu.:0.00e+00
## Median : 166859 Mode :character Median :0.00e+00
## Mean : 184978 Mean :2.54e+01
## 3rd Qu.: 222266 3rd Qu.:0.00e+00
## Max. :62685714 Max. :1.45e+06
## NA's :5199
## parking_price comments numbers
## Min. : 0 Length:102054 Length:102054
## 1st Qu.: 0 Class :character Class :character
## Median : 0 Mode :character Mode :character
## Mean : 488509
## 3rd Qu.: 0
## Max. :240000000
##
head(lvr_prices)
## # A tibble: 6 x 29
## X1 area trading_target address
## <int> <chr> <chr> <chr>
## 1 0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 1 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號
## 3 2 大同區 土地 橋北段二小段601~630地號
## 4 3 大同區 房地(土地+建物) 臺北市大同區重慶北路一段61~90號
## 5 4 內湖區 房地(土地+建物) 臺北市內湖區民權東路六段90巷6弄1~30號
## 6 5 信義區 土地 福德段一小段661~690地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## # non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## # trading_num <chr>, floor <chr>, total_floor <chr>,
## # building_type <chr>, main_purpose <chr>, built_with <chr>,
## # finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## # living_room <int>, bath <int>, compartment <chr>, management <chr>,
## # total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## # parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## # numbers <chr>
tail(lvr_prices)
## # A tibble: 6 x 29
## X1 area trading_target address
## <int> <chr> <chr> <chr>
## 1 767 萬華區 房地(土地+建物) 臺北市萬華區中華路二段364巷24弄1~30號
## 2 768 中正區 房地(土地+建物) 臺北市中正區中華路一段1~30號
## 3 769 中正區 房地(土地+建物)+車位 臺北市中正區中華路一段1~30號
## 4 770 中正區 房地(土地+建物) 臺北市中正區中華路一段1~30號
## 5 771 文山區 車位 臺北市文山區羅斯福路六段159巷1弄1~30號
## 6 772 中正區 土地 福和段一小段751~780地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## # non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## # trading_num <chr>, floor <chr>, total_floor <chr>,
## # building_type <chr>, main_purpose <chr>, built_with <chr>,
## # finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## # living_room <int>, bath <int>, compartment <chr>, management <chr>,
## # total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## # parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## # numbers <chr>
daan <- lvr_prices[lvr_prices$area =='大安區' , ]
sum(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 2.79477e+11
?sum
mean(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 29798170
median(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 2e+07
a <- c(2,3,4,5,6)
mean(a)
## [1] 4
median(a)
## [1] 4
a <- c(2,3,4,5,600)
mean(a)
## [1] 122.8
median(a)
## [1] 4
# All rows whose area is '大安區'.
daan <- lvr_prices[lvr_prices$area =='大安區' , ]
# Narrow further by trading target and city land type.
filter.cond <- (lvr_prices$area =='大安區') &
  (lvr_prices$trading_target=='房地(土地+建物)') &
  (lvr_prices$city_land_type == '住')
daan1 <- lvr_prices[filter.cond,]
mean(daan1$total_price, na.rm=TRUE)
## [1] 26559742
# Keep the mean in a variable so the conversion below uses the computed
# value rather than a number copied by hand from printed output.
daan_mean_price_per_sqmeter <- mean(daan1$price_per_sqmeter, na.rm=TRUE)
daan_mean_price_per_sqmeter
## [1] 254814.4
# NOTE(review): 0.3025 is presumably the square-metre -> ping conversion
# factor (price per ping = price per m^2 / 0.3025) -- confirm.
sqmeter_to_ping <- 0.3025
daan_mean_price_per_sqmeter / sqmeter_to_ping
## [1] 842361.7
zhongshan <- lvr_prices[lvr_prices$area == '中山區', c('total_price', 'address')]
head(zhongshan)
## # A tibble: 6 x 2
## total_price address
## <int> <chr>
## 1 5960000 臺北市中山區合江街31~60號
## 2 20200000 臺北市中山區中山北路二段183巷1~30號
## 3 4050000 臺北市中山區吉林路361~390號
## 4 1900000 長安段三小段271~300地號
## 5 14800000 臺北市中山區林森北路485巷1~30號
## 6 10200000 臺北市中山區建國北路三段93巷5弄1~30號
res <- zhongshan[order(zhongshan$total_price, decreasing = TRUE), ]
res[ 1:3 , ]
## # A tibble: 3 x 2
## total_price address
## <int> <chr>
## 1 1850000000 臺北市中山區建國北路一段138巷1~30號
## 2 1400000000 臺北市中山區南京東路三段1~30號
## 3 1084948034 中山段二小段31~60地號
head(res, 3)
## # A tibble: 3 x 2
## total_price address
## <int> <chr>
## 1 1850000000 臺北市中山區建國北路一段138巷1~30號
## 2 1400000000 臺北市中山區南京東路三段1~30號
## 3 1084948034 中山段二小段31~60地號
#' Return the highest-priced transactions for one area.
#'
#' @param area Value matched against the `area` column.
#' @param n Maximum number of rows to return (default 3).
#' @param data Data frame with `area`, `total_price` and `address` columns.
#'   Defaults to the global `lvr_prices`, which is what the original
#'   implementation always read.
#' @return The rows of `data` for `area` (columns `total_price` and
#'   `address`), sorted by `total_price` in decreasing order and truncated to
#'   at most `n` rows. When the area has fewer than `n` rows, only those rows
#'   are returned; the result is not padded with NA rows.
getTopThree <- function(area, n = 3, data = lvr_prices) {
  # which() drops rows whose area is NA instead of turning them into NA rows.
  in_area <- which(data$area == area)
  area_prices <- data[in_area, c('total_price', 'address')]
  ranked <- area_prices[order(area_prices$total_price, decreasing = TRUE), ]
  head(ranked, n)
}
getTopThree('大安區')
## # A tibble: 3 x 2
## total_price address
## <int> <chr>
## 1 1869781219 臺北市大安區羅斯福路三段283巷4弄1~30號
## 2 971340000 臺北市大安區忠孝東路四段241~270號
## 3 966660000 學府段三小段31~60地號
price_per_sec <- tapply(lvr_prices$total_price , lvr_prices$area , function(e) mean(e, na.rm = TRUE) )
barplot(sort(price_per_sec, decreasing = TRUE) , main = '各區平均價', xlab = '區域', ylab = '價格', col= "blue")
# Worked example of the statistics behind a box plot.
a <- c(1,20,30,40,50,60,700)
median(a)
## [1] 40
quantile(a, 0.25)
## 25%
## 25
quantile(a, 0.75)
## 75%
## 55
IQR(a)
## [1] 30
# Whisker limits are measured from the quartiles, not from the median:
# values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR count as outliers.
# The limits are clamped to the observed range of the data.
lower_limit <- max(min(a), quantile(a, 0.25) - 1.5 * IQR(a))
lower_limit
## [1] 1
upper_limit <- min(max(a), quantile(a, 0.75) + 1.5 * IQR(a))
upper_limit
## [1] 100
boxplot(a)
# Convert the prices to double before plotting: with an integer column the
# quartile computation (x[floor(d)] + x[ceiling(d)]) overflows and yields NA.
boxplot(as.numeric(lvr_prices$total_price) ~ lvr_prices$area )
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
boxplot(log(lvr_prices$total_price) ~ lvr_prices$area , main= "房價箱型圖", xlab = "區域", ylab = "價格(log)" )
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
help(package= 'dplyr')
# R Style Select
head(lvr_prices[, c('total_price')])
## # A tibble: 6 x 1
## total_price
## <int>
## 1 18680000
## 2 20300000
## 3 132096
## 4 4200000
## 5 14000000
## 6 255000
# dplyr Style Select
head(select(lvr_prices, total_price))
## # A tibble: 6 x 1
## total_price
## <int>
## 1 18680000
## 2 20300000
## 3 132096
## 4 4200000
## 5 14000000
## 6 255000
# R Style filter
head(lvr_prices[lvr_prices$area == '中山區', ])
## # A tibble: 6 x 29
## X1 area trading_target address
## <int> <chr> <chr> <chr>
## 1 13 中山區 房地(土地+建物) 臺北市中山區合江街31~60號
## 2 14 中山區 房地(土地+建物) 臺北市中山區中山北路二段183巷1~30號
## 3 16 中山區 房地(土地+建物) 臺北市中山區吉林路361~390號
## 4 17 中山區 土地 長安段三小段271~300地號
## 5 24 中山區 房地(土地+建物) 臺北市中山區林森北路485巷1~30號
## 6 39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## # non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## # trading_num <chr>, floor <chr>, total_floor <chr>,
## # building_type <chr>, main_purpose <chr>, built_with <chr>,
## # finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## # living_room <int>, bath <int>, compartment <chr>, management <chr>,
## # total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## # parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## # numbers <chr>
# dplyr Style filter
head(filter(lvr_prices, area == '中山區'))
## # A tibble: 6 x 29
## X1 area trading_target address
## <int> <chr> <chr> <chr>
## 1 13 中山區 房地(土地+建物) 臺北市中山區合江街31~60號
## 2 14 中山區 房地(土地+建物) 臺北市中山區中山北路二段183巷1~30號
## 3 16 中山區 房地(土地+建物) 臺北市中山區吉林路361~390號
## 4 17 中山區 土地 長安段三小段271~300地號
## 5 24 中山區 房地(土地+建物) 臺北市中山區林森北路485巷1~30號
## 6 39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## # non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## # trading_num <chr>, floor <chr>, total_floor <chr>,
## # building_type <chr>, main_purpose <chr>, built_with <chr>,
## # finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## # living_room <int>, bath <int>, compartment <chr>, management <chr>,
## # total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## # parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## # numbers <chr>
# Iris R Style Manipulation
sum(tail(head(iris), 3)$Sepal.Length)
## [1] 15
# magrittr Style Manipulation
iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 15
lvr_prices %>%
filter(area == '中山區') %>%
select(total_price, address) %>%
head()
## # A tibble: 6 x 2
## total_price address
## <int> <chr>
## 1 5960000 臺北市中山區合江街31~60號
## 2 20200000 臺北市中山區中山北路二段183巷1~30號
## 3 4050000 臺北市中山區吉林路361~390號
## 4 1900000 長安段三小段271~300地號
## 5 14800000 臺北市中山區林森北路485巷1~30號
## 6 10200000 臺北市中山區建國北路三段93巷5弄1~30號
lvr_prices %>%
filter(area == '中山區') %>%
select(total_price, address) %>%
arrange(total_price) %>%
head()
## # A tibble: 6 x 2
## total_price address
## <int> <chr>
## 1 0 中山段一小段691~720地號
## 2 0 中山段一小段691~720地號
## 3 10860 榮星段四小段211~240地號
## 4 16000 中山段四小段211~240地號
## 5 18060 榮星段四小段211~240地號
## 6 21244 榮星段二小段361~390地號
lvr_prices %>%
filter(area == '中山區') %>%
select(total_price, address) %>%
arrange(desc(total_price)) %>%
head()
## # A tibble: 6 x 2
## total_price address
## <int> <chr>
## 1 1850000000 臺北市中山區建國北路一段138巷1~30號
## 2 1400000000 臺北市中山區南京東路三段1~30號
## 3 1084948034 中山段二小段31~60地號
## 4 1011136500 中山段三小段301~330地號
## 5 952875000 金泰段61~90地號
## 6 903865500 中山段一小段361~390地號
# Bucket every transaction into its month, represented by the first day of
# that month.
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))
# Total transaction value per month and area, from 2012 onwards.
# - The cut-off is an explicit Date rather than a string that relies on
#   implicit coercion.
# - total_price is summed as double so large monthly totals cannot overflow.
# - ungroup() drops the grouping summarise() leaves behind, so later steps
#   operate on a plain table.
lvr_stat <- lvr_prices %>%
  filter(trading_ym >= as.Date('2012-01-01')) %>%
  group_by(trading_ym, area) %>%
  summarise(overall_price = sum(as.numeric(total_price))) %>%
  ungroup()
lvr_stat$area <- as.factor(lvr_stat$area)
# Draw one monthly price line chart per area on a 3 x 4 grid.
# par() returns the previous settings; they are restored afterwards so that
# later plots are not squeezed into a single cell of the grid.
old_par <- par(mfrow = c(3, 4))
for (area_name in levels(lvr_stat$area)) {
  area_stat <- lvr_stat[lvr_stat$area == area_name, ]
  plot(area_stat$trading_ym, area_stat$overall_price, type = 'l', main = area_name)
}
par(old_par)
## pivot table
#lvr_stat
#install.packages('tidyr')
library(tidyr)
price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill = 0 )
price_pivot
## # A tibble: 12 x 54
## area `2012-01-01` `2012-02-01` `2012-03-01` `2012-04-01` `2012-05-01`
## * <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 士林區 661140000 231680000 359891504 205481036 2539010528
## 2 大同區 0 180150000 525210000 87110000 74806000
## 3 大安區 139136600 123870000 64470991 127736080 49052000
## 4 中山區 17601653 140250000 176022439 3156689238 2374390095
## 5 中正區 258200000 112690000 660420000 935400000 550190000
## 6 內湖區 349930000 216810000 299907515 944724354 1444681765
## 7 文山區 166887497 147810000 553478681 739757581 192890000
## 8 北投區 43850000 68000 82490000 494942526 899718610
## 9 松山區 0 0 405003695 554400 405400000
## 10 信義區 40800000 177020000 1269564574 2517094802 1241890000
## 11 南港區 53560259 145250000 430330000 453100000 734600000
## 12 萬華區 17430000 14800000 7800000 514268500 311962000
## # ... with 48 more variables: `2012-06-01` <dbl>, `2012-07-01` <dbl>,
## # `2012-08-01` <dbl>, `2012-09-01` <dbl>, `2012-10-01` <dbl>,
## # `2012-11-01` <dbl>, `2012-12-01` <dbl>, `2013-01-01` <dbl>,
## # `2013-02-01` <dbl>, `2013-03-01` <dbl>, `2013-04-01` <dbl>,
## # `2013-05-01` <dbl>, `2013-06-01` <dbl>, `2013-07-01` <dbl>,
## # `2013-08-01` <dbl>, `2013-09-01` <dbl>, `2013-10-01` <dbl>,
## # `2013-11-01` <dbl>, `2013-12-01` <dbl>, `2014-01-01` <dbl>,
## # `2014-02-01` <dbl>, `2014-03-01` <dbl>, `2014-04-01` <dbl>,
## # `2014-05-01` <dbl>, `2014-06-01` <dbl>, `2014-07-01` <dbl>,
## # `2014-08-01` <dbl>, `2014-09-01` <dbl>, `2014-10-01` <dbl>,
## # `2014-11-01` <dbl>, `2014-12-01` <dbl>, `2015-01-01` <dbl>,
## # `2015-02-01` <dbl>, `2015-03-01` <dbl>, `2015-04-01` <dbl>,
## # `2015-05-01` <dbl>, `2015-06-01` <dbl>, `2015-07-01` <dbl>,
## # `2015-08-01` <dbl>, `2015-09-01` <dbl>, `2015-10-01` <dbl>,
## # `2015-11-01` <dbl>, `2015-12-01` <dbl>, `2016-01-01` <dbl>,
## # `2016-02-01` <dbl>, `2016-03-01` <dbl>, `2016-04-01` <dbl>,
## # `2016-05-01` <dbl>
write.csv(price_pivot, 'taipei_house_price.csv')
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', 'Training50.csv')
#install.packages('rpart')
library(rpart)
trainset <- read.csv('Training50.csv')
class(trainset)
## [1] "data.frame"
View(trainset)
trainset$X <- NULL
model <- rpart(Creditability ~ . , data = trainset, method = 'class')
plot(model, margin = 0.1)
text(model)
download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', 'Test50.csv')
testset <- read.csv('Test50.csv')
testset$X <- NULL
head(predict(model, testset, type='class'))
## 1 2 3 4 5 6
## 1 1 0 1 1 1
## Levels: 0 1
predicted <- predict(model, testset, type='class')
sum(predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.71
table(predicted, testset$Creditability)
##
## predicted 0 1
## 0 64 52
## 1 93 291
# Classify with a custom cut-off: predict class 0 when the first probability
# column exceeds 0.2, otherwise class 1.
predicted <- predict(model, testset)
res <- ifelse(predicted[,1] > 0.2, 0, 1)
# Fix the levels on both sides so the table is always 2 x 2, even if one
# class is never predicted, and address cells by name instead of by
# position. Rows are predictions, columns are actual labels.
tb <- table(
  predicted = factor(res, levels = c(0, 1)),
  actual = factor(testset$Creditability, levels = c(0, 1))
)
# Class 0 is treated as the positive class (same cells as tb[1] .. tb[4]).
TP <- tb["0", "0"]
FN <- tb["1", "0"]
FP <- tb["0", "1"]
TN <- tb["1", "1"]
TPR <- TP / (TP + FN)
FPR <- FP / (FP + TN)
# Hand-built ROC curve for the tree model, treating class 0 (column 1 of the
# probability matrix) as the positive class.
# Note: this variable shares its name with ROCR's prediction() function;
# calls such as prediction(...) still resolve to the function because R
# skips non-function objects when looking up a function call.
prediction <- predict(model, testset, type = "prob")
actual <- factor(testset$Creditability, levels = c(0, 1))
# Sweep the cut-off from strict to lenient so the points run from (0, 0)
# towards (1, 1) and are connected in order by plot(type = 'b').
cutoffs <- rev(seq(0, 1, 0.01))
roc_points <- vapply(cutoffs, function(cutoff) {
  # Fixed levels keep the table 2 x 2 even when only one class is predicted.
  res <- factor(ifelse(prediction[,1] >= cutoff, 0, 1), levels = c(0, 1))
  # Rows are predictions, columns are actual labels. The earlier version
  # passed the arguments the other way round, which swapped FN and FP.
  tb <- table(res, actual)
  TP <- tb["0", "0"]
  FN <- tb["1", "0"]
  FP <- tb["0", "1"]
  TN <- tb["1", "1"]
  c(FPR = FP / (FP + TN), TPR = TP / (TP + FN))
}, c(FPR = 0, TPR = 0))
# Anchor the curve at (0, 0) and (1, 1).
roc_x <- c(0, roc_points["FPR", ], 1)
roc_y <- c(0, roc_points["TPR", ], 1)
plot(roc_x, roc_y, type='b')
# install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve and AUC for the tree model using ROCR. Column 2 of the
# probability matrix is used as the score.
predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
# TRUE instead of T: T is an ordinary variable that can be reassigned.
plot(perf.tpr.rocr, colorize=TRUE,main=paste("AUC:",(perf.rocr@y.values)))
## Using RandomForest
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
# Convert the response to a factor in both sets before fitting, so that
# randomForest() fits a classification forest.
trainset$Creditability <- as.factor(trainset$Creditability)
testset$Creditability <- as.factor(testset$Creditability)
# TRUE instead of T: T is an ordinary variable that can be reassigned.
forest <- randomForest(Creditability ~., data = trainset, ntree=200, importance=TRUE, proximity=TRUE)
forest.predicted <- predict(forest, testset)
# Share of test rows whose predicted class equals the actual class.
sum(forest.predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.74
# Confusion matrix of the test-set predictions. forest$predicted holds the
# forest's out-of-bag predictions for the training rows, so it must not be
# cross-tabulated against the test labels; forest.predicted is used instead.
table(forest.predicted, testset$Creditability)
## NOTE: the table below was recorded from the earlier call that used
## forest$predicted. Its diagonal gives (19 + 282) / 500 = 0.602, which does
## not match the 0.74 accuracy above. Re-run to refresh it.
##
## 0 1
## 0 19 61
## 1 138 282
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)
## Clustering
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')
customers <- read.csv('customers.csv')
class(customers)
## [1] "data.frame"
str(customers)
## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual_Income : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending_Score: int 39 81 6 77 40 76 6 94 3 72 ...
head(customers)
## CustomerID Genre Age Annual_Income Spending_Score
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
customers <- customers[, c('Annual_Income', 'Spending_Score')]
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36 2
set.seed(123)
kc <- kmeans(customers, centers = 5)
kc
## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
##
## Cluster means:
## Annual_Income Spending_Score
## 1 27.40000 49.48000
## 2 79.00000 16.59259
## 3 55.90541 49.93243
## 4 86.53846 82.12821
## 5 109.70000 22.00000
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
##
## Within cluster sum of squares by cluster:
## [1] 48174.480 4062.519 7375.000 13444.051 2458.100
## (between_SS / total_SS = 72.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
#kc$cluster
plot(customers$Annual_Income, customers$Spending_Score, col = kc$cluster)
points(kc$centers[,1],kc$centers[,2] , col="yellow")
#install.packages('cluster')
library(cluster)
kcs <- silhouette(kc$cluster, dist(customers))
plot(kcs)
nk <- 2:10
sapply(nk, function(e) e ^ 2)
## [1] 4 9 16 25 36 49 64 81 100
library(fpc)
# Average silhouette width of a k-means clustering for each k in 2..10.
nk <- 2:10
set.seed(123)
# The pairwise distances do not depend on k, so compute them once.
customer_dist <- dist(customers)
# vapply() guarantees one numeric value per k.
SW <- vapply(nk, function(k) {
  cluster.stats(customer_dist, kmeans(customers, centers=k)$cluster)$avg.silwidth
}, numeric(1))
plot(nk, SW, type="l", xlab="number of clusters", ylab="average silhouette width")