Demo20170925

R Basic

# Both `<-` and `=` assign a value at the top level; the last line prints the sum.
a <- 3
b = 2
a+b

## [1] 5

Data Frame

data()
data(iris)
View(iris)

class(iris)

## [1] "data.frame"

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

head(iris, 10)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa

tail(iris)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica

help(head)

## starting httpd help server ... done

?head

iris[ 1         ,    ]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa

iris[ c(1,2,3)  ,    ]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa

iris[ 1:3       ,    ]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa

iris[ 1:3       ,   1          ]

## [1] 5.1 4.9 4.7

iris[ 1:3       ,   c(1,2)     ]

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2

iris[ 1:3       ,   1:2        ]

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2

iris[ 1:3       ,'Sepal.Length']

## [1] 5.1 4.9 4.7

iris[ 1:3   ,c('Sepal.Length', 'Sepal.Width')]

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2

head(iris[, 'Sepal.Length'])

## [1] 5.1 4.9 4.7 4.6 5.0 5.4

head(iris$Sepal.Length)

## [1] 5.1 4.9 4.7 4.6 5.0 5.4

head(iris$Species == 'setosa')

## [1] TRUE TRUE TRUE TRUE TRUE TRUE

iris[ iris$Species == 'setosa'  ,    ]

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 13          4.8         3.0          1.4         0.1  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
## 17          5.4         3.9          1.3         0.4  setosa
## 18          5.1         3.5          1.4         0.3  setosa
## 19          5.7         3.8          1.7         0.3  setosa
## 20          5.1         3.8          1.5         0.3  setosa
## 21          5.4         3.4          1.7         0.2  setosa
## 22          5.1         3.7          1.5         0.4  setosa
## 23          4.6         3.6          1.0         0.2  setosa
## 24          5.1         3.3          1.7         0.5  setosa
## 25          4.8         3.4          1.9         0.2  setosa
## 26          5.0         3.0          1.6         0.2  setosa
## 27          5.0         3.4          1.6         0.4  setosa
## 28          5.2         3.5          1.5         0.2  setosa
## 29          5.2         3.4          1.4         0.2  setosa
## 30          4.7         3.2          1.6         0.2  setosa
## 31          4.8         3.1          1.6         0.2  setosa
## 32          5.4         3.4          1.5         0.4  setosa
## 33          5.2         4.1          1.5         0.1  setosa
## 34          5.5         4.2          1.4         0.2  setosa
## 35          4.9         3.1          1.5         0.2  setosa
## 36          5.0         3.2          1.2         0.2  setosa
## 37          5.5         3.5          1.3         0.2  setosa
## 38          4.9         3.6          1.4         0.1  setosa
## 39          4.4         3.0          1.3         0.2  setosa
## 40          5.1         3.4          1.5         0.2  setosa
## 41          5.0         3.5          1.3         0.3  setosa
## 42          4.5         2.3          1.3         0.3  setosa
## 43          4.4         3.2          1.3         0.2  setosa
## 44          5.0         3.5          1.6         0.6  setosa
## 45          5.1         3.8          1.9         0.4  setosa
## 46          4.8         3.0          1.4         0.3  setosa
## 47          5.1         3.8          1.6         0.2  setosa
## 48          4.6         3.2          1.4         0.2  setosa
## 49          5.3         3.7          1.5         0.2  setosa
## 50          5.0         3.3          1.4         0.2  setosa

# Logical mask over the rows of iris: TRUE where the flower is a setosa
# and its sepal length is at least 5.
filter.cond <- with(iris, Species == 'setosa' & Sepal.Length >= 5)

head(iris[filter.cond,    ])

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 15          5.8         4.0          1.2         0.2  setosa

heights <- c(180, 172, 168, 183, 178)
sort(heights)

## [1] 168 172 178 180 183

sort(heights, decreasing = TRUE)

## [1] 183 180 178 172 168

order(heights)

## [1] 3 2 5 1 4

order(heights, decreasing = TRUE)

## [1] 4 1 5 2 3

head(sort(iris$Sepal.Length, decreasing = TRUE))

## [1] 7.9 7.7 7.7 7.7 7.7 7.6

rank <- order(iris$Sepal.Length, decreasing = TRUE)

head(iris[ rank , 'Species' ])

## [1] virginica virginica virginica virginica virginica virginica
## Levels: setosa versicolor virginica

data("anscombe")
View(anscombe)


plot(y1 ~ x1, data = anscombe)

plot(y2 ~ x1, data = anscombe)

plot(y3 ~ x1, data = anscombe)

plot(y4 ~ x1, data = anscombe)

tb <- table(iris$Species)
pie(tb)

barplot(tb)

hist(iris$Sepal.Length)

boxplot(iris$Sepal.Length)

boxplot(iris$Petal.Length~iris$Species)

plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)

使用R 探索資料

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')

getwd()

## [1] "D:/OS DATA/Desktop"

library(readr)
# NOTE(review): this absolute path only resolves on the machine that produced
# this run; it points into the working directory reported by getwd() above.
# NOTE(review): total_price is guessed as an integer column, so values that do
# not fit in an integer (e.g. 6700000000) fail to parse and are stored as NA --
# see the parsing warnings printed below.
lvr_prices <- read_csv("D:/OS DATA/Desktop/lvr_prices.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )

## See spec(...) for full column specifications.

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.

#View(lvr_prices)

class(lvr_prices)

## [1] "tbl_df"     "tbl"        "data.frame"

str(lvr_prices)

## Classes 'tbl_df', 'tbl' and 'data.frame':    102054 obs. of  29 variables:
##  $ X1                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ area              : chr  "大安區" "中正區" "大同區" "大同區" ...
##  $ trading_target    : chr  "房地(土地+建物)" "房地(土地+建物)" "土地" "房地(土地+建物)" ...
##  $ address           : chr  "臺北市大安區和平東路三段1巷72弄1~30號" "臺北市中正區忠孝東路二段121~150號" "橋北段二小段601~630地號" "臺北市大同區重慶北路一段61~90號" ...
##  $ land_sqmeter      : num  19.39 8.46 5.5 3.88 32.41 ...
##  $ city_land_type    : chr  "住" "商" "其他" "商" ...
##  $ non_city_land_type: chr  NA NA NA NA ...
##  $ non_city_code     : chr  NA NA NA NA ...
##  $ trading_ymd       : Date, format: "2012-06-29" "2012-07-18" ...
##  $ trading_num       : chr  "土地1建物2車位0" "土地3建物1車位0" "土地1建物0車位0" "土地4建物1車位0" ...
##  $ floor             : chr  "五層" "九層" NA "六層" ...
##  $ total_floor       : chr  "十七層" "十二層" NA "十一層" ...
##  $ building_type     : chr  "住宅大樓(11層含以上有電梯)" "辦公商業大樓" "其他" "住宅大樓(11層含以上有電梯)" ...
##  $ main_purpose      : chr  "國民住宅" "商業用" NA "商業用" ...
##  $ built_with        : chr  "鋼筋混凝土造" "鋼筋混凝土造" NA "鋼筋混凝土造" ...
##  $ finish_ymd        : Date, format: "1985-05-22" "1982-04-08" ...
##  $ building_sqmeter  : num  101 93.4 0 36.7 104.1 ...
##  $ room              : int  3 0 0 1 3 0 0 3 2 3 ...
##  $ living_room       : int  2 0 0 1 1 0 0 2 1 2 ...
##  $ bath              : int  1 0 0 1 1 0 0 2 1 2 ...
##  $ compartment       : chr  "有" "有" "有" "有" ...
##  $ management        : chr  "有" "有" "無" "有" ...
##  $ total_price       : int  18680000 20300000 132096 4200000 14000000 255000 50000 25800000 19000000 28000000 ...
##  $ price_per_sqmeter : num  184999 217307 24017 114317 134473 ...
##  $ parking_type      : chr  NA NA NA NA ...
##  $ parking_sqmeter   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ parking_price     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ comments          : chr  NA NA NA NA ...
##  $ numbers           : chr  "RPQNMLSJQHHFFFA08CA" "RPQOMLKLQHHFFBA17CA" "RPUNMLLMQHHFFBA67CA" "RPOPMLRKJHIFFBA07CA" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of  5 variables:
##   ..$ row     : int  1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
##   ..$ col     : chr  "total_price" "total_price" "total_price" "total_price" ...
##   ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
##   ..$ actual  : chr  "6700000000" "3882685600" "3373314400" "3050000000" ...
##   ..$ file    : chr  "'D:/OS DATA/Desktop/lvr_prices.csv'" "'D:/OS DATA/Desktop/lvr_prices.csv'" "'D:/OS DATA/Desktop/lvr_prices.csv'" "'D:/OS DATA/Desktop/lvr_prices.csv'" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 29
##   .. ..$ X1                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ area              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_target    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ address           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ land_sqmeter      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ city_land_type    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_land_type: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_code     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_ymd       :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ trading_num       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ floor             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_floor       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ building_type     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ main_purpose      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ built_with        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ finish_ymd        :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ building_sqmeter  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ room              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ living_room       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bath              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ compartment       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ management        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_price       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ price_per_sqmeter : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_type      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ parking_sqmeter   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_price     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ comments          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ numbers           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

summary(lvr_prices)

##        X1           area           trading_target       address         
##  Min.   :   0   Length:102054      Length:102054      Length:102054     
##  1st Qu.:1649   Class :character   Class :character   Class :character  
##  Median :3363   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3539                                                           
##  3rd Qu.:5188                                                           
##  Max.   :9683                                                           
##                                                                         
##   land_sqmeter      city_land_type     non_city_land_type
##  Min.   :    0.00   Length:102054      Length:102054     
##  1st Qu.:    9.33   Class :character   Class :character  
##  Median :   22.13   Mode  :character   Mode  :character  
##  Mean   :   54.32                                        
##  3rd Qu.:   35.73                                        
##  Max.   :46193.00                                        
##                                                          
##  non_city_code       trading_ymd         trading_num       
##  Length:102054      Min.   :1973-08-29   Length:102054     
##  Class :character   1st Qu.:2013-04-17   Class :character  
##  Mode  :character   Median :2014-01-02   Mode  :character  
##                     Mean   :2014-02-11                     
##                     3rd Qu.:2014-12-29                     
##                     Max.   :2016-05-16                     
##                     NA's   :20                             
##     floor           total_floor        building_type     
##  Length:102054      Length:102054      Length:102054     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  main_purpose        built_with          finish_ymd        
##  Length:102054      Length:102054      Min.   :1911-05-06  
##  Class :character   Class :character   1st Qu.:1983-03-30  
##  Mode  :character   Mode  :character   Median :1997-01-20  
##                                        Mean   :1996-04-21  
##                                        3rd Qu.:2010-03-01  
##                                        Max.   :2016-03-11  
##                                        NA's   :21992       
##  building_sqmeter        room          living_room          bath        
##  Min.   :    0.00   Min.   :  0.000   Min.   : 0.000   Min.   :  0.000  
##  1st Qu.:   43.45   1st Qu.:  0.000   1st Qu.: 0.000   1st Qu.:  1.000  
##  Median :   90.79   Median :  2.000   Median : 1.000   Median :  1.000  
##  Mean   :  124.51   Mean   :  1.938   Mean   : 1.208   Mean   :  1.248  
##  3rd Qu.:  144.22   3rd Qu.:  3.000   3rd Qu.: 2.000   3rd Qu.:  2.000  
##  Max.   :69125.53   Max.   :168.000   Max.   :80.000   Max.   :174.000  
##                                                                         
##  compartment         management         total_price       
##  Length:102054      Length:102054      Min.   :0.000e+00  
##  Class :character   Class :character   1st Qu.:7.500e+06  
##  Mode  :character   Mode  :character   Median :1.450e+07  
##                                        Mean   :2.399e+07  
##                                        3rd Qu.:2.615e+07  
##                                        Max.   :2.029e+09  
##                                        NA's   :32         
##  price_per_sqmeter  parking_type       parking_sqmeter   
##  Min.   :       0   Length:102054      Min.   :0.00e+00  
##  1st Qu.:  120832   Class :character   1st Qu.:0.00e+00  
##  Median :  166859   Mode  :character   Median :0.00e+00  
##  Mean   :  184978                      Mean   :2.54e+01  
##  3rd Qu.:  222266                      3rd Qu.:0.00e+00  
##  Max.   :62685714                      Max.   :1.45e+06  
##  NA's   :5199                                            
##  parking_price         comments           numbers         
##  Min.   :        0   Length:102054      Length:102054     
##  1st Qu.:        0   Class :character   Class :character  
##  Median :        0   Mode  :character   Mode  :character  
##  Mean   :   488509                                        
##  3rd Qu.:        0                                        
##  Max.   :240000000                                        
##

head(lvr_prices)

## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1     0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2     1 中正區 房地(土地+建物)     臺北市中正區忠孝東路二段121~150號
## 3     2 大同區            土地               橋北段二小段601~630地號
## 4     3 大同區 房地(土地+建物)       臺北市大同區重慶北路一段61~90號
## 5     4 內湖區 房地(土地+建物) 臺北市內湖區民權東路六段90巷6弄1~30號
## 6     5 信義區            土地               福德段一小段661~690地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>

tail(lvr_prices)

## # A tibble: 6 x 29
##      X1   area       trading_target                                address
##   <int>  <chr>                <chr>                                  <chr>
## 1   767 萬華區      房地(土地+建物)  臺北市萬華區中華路二段364巷24弄1~30號
## 2   768 中正區      房地(土地+建物)           臺北市中正區中華路一段1~30號
## 3   769 中正區 房地(土地+建物)+車位           臺北市中正區中華路一段1~30號
## 4   770 中正區      房地(土地+建物)           臺北市中正區中華路一段1~30號
## 5   771 文山區                 車位 臺北市文山區羅斯福路六段159巷1弄1~30號
## 6   772 中正區                 土地                福和段一小段751~780地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>

daan <- lvr_prices[lvr_prices$area =='大安區' ,  ]
sum(as.numeric(daan$total_price), na.rm = TRUE)

## [1] 2.79477e+11

?sum


mean(as.numeric(daan$total_price), na.rm = TRUE)

## [1] 29798170

median(as.numeric(daan$total_price), na.rm = TRUE)

## [1] 2e+07

a <- c(2,3,4,5,6)
mean(a)

## [1] 4

median(a)

## [1] 4

a <- c(2,3,4,5,600)
mean(a)

## [1] 122.8

median(a)

## [1] 4

daan <- lvr_prices[lvr_prices$area =='大安區' ,  ]

filter.cond <- (lvr_prices$area =='大安區') & (lvr_prices$trading_target=='房地(土地+建物)') & (lvr_prices$city_land_type == '住')

daan1 <- lvr_prices[filter.cond,]
mean(daan1$total_price, na.rm=TRUE)

## [1] 26559742

mean(daan1$price_per_sqmeter, na.rm=TRUE)

## [1] 254814.4

# Rescales the mean price per square metre printed above by 0.3025.
# Presumably this converts it to a price per ping (1 m^2 = 0.3025 ping) -- TODO confirm.
254814.4 / 0.3025

## [1] 842361.7

zhongshan <- lvr_prices[lvr_prices$area == '中山區', c('total_price', 'address')]
head(zhongshan)

## # A tibble: 6 x 2
##   total_price                               address
##         <int>                                 <chr>
## 1     5960000             臺北市中山區合江街31~60號
## 2    20200000   臺北市中山區中山北路二段183巷1~30號
## 3     4050000           臺北市中山區吉林路361~390號
## 4     1900000               長安段三小段271~300地號
## 5    14800000       臺北市中山區林森北路485巷1~30號
## 6    10200000 臺北市中山區建國北路三段93巷5弄1~30號

res <- zhongshan[order(zhongshan$total_price, decreasing = TRUE), ]
res[ 1:3  ,   ]

## # A tibble: 3 x 2
##   total_price                             address
##         <int>                               <chr>
## 1  1850000000 臺北市中山區建國北路一段138巷1~30號
## 2  1400000000      臺北市中山區南京東路三段1~30號
## 3  1084948034               中山段二小段31~60地號

head(res, 3)

## # A tibble: 3 x 2
##   total_price                             address
##         <int>                               <chr>
## 1  1850000000 臺北市中山區建國北路一段138巷1~30號
## 2  1400000000      臺北市中山區南京東路三段1~30號
## 3  1084948034               中山段二小段31~60地號

# Return the most expensive transactions recorded for one district.
#
# Args:
#   area: value matched against lvr_prices$area (e.g. '大安區').
#
# Returns:
#   The total_price and address columns of the matching rows of the global
#   lvr_prices table, ordered by total_price from highest to lowest and
#   limited to at most three rows. Rows whose total_price is NA sort last.
getTopThree <- function(area){
  # which() discards NA comparisons, so rows with a missing area are left out
  # instead of coming back as all-NA rows.
  in_area <- which(lvr_prices$area == area)
  district <- lvr_prices[in_area, c('total_price', 'address')]

  ranked <- district[order(district$total_price, decreasing = TRUE), ]
  # head() stops at the rows that exist; indexing with 1:3 would pad a shorter
  # result with NA rows.
  head(ranked, 3)
}

getTopThree('大安區')

## # A tibble: 3 x 2
##   total_price                                address
##         <int>                                  <chr>
## 1  1869781219 臺北市大安區羅斯福路三段283巷4弄1~30號
## 2   971340000      臺北市大安區忠孝東路四段241~270號
## 3   966660000                  學府段三小段31~60地號

price_per_sec <- tapply(lvr_prices$total_price  ,  lvr_prices$area , function(e) mean(e, na.rm = TRUE)  )

barplot(sort(price_per_sec, decreasing = TRUE) , main = '各區平均價', xlab = '區域', ylab = '價格', col= "blue")

a <-  c(1,20,30,40,50,60,700)
median(a)

## [1] 40

quantile(a, 0.25)

## 25% 
##  25

quantile(a, 0.75)

## 75% 
##  55

IQR(a)

## [1] 30

# Boxplot fences are measured from the quartiles (Q1 - 1.5 * IQR and
# Q3 + 1.5 * IQR), not from the median; each fence is clamped to the range of
# the data.
max(min(a), quantile(a, 0.25) - 1.5 * IQR(a))

## [1] 1

min(max(a), quantile(a, 0.75) + 1.5 * IQR(a))

## [1] 100

boxplot(a)

# total_price is stored as an integer column, which is why the statistics
# computed for this plot raise the integer-overflow warnings shown below.
# NOTE(review): wrapping the column in as.numeric() should avoid them -- confirm.
boxplot(lvr_prices$total_price ~ lvr_prices$area  )

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

# Same per-district boxplot on a log scale. Records with a total_price of 0
# become log(0) = -Inf, which is what the "Outlier (-Inf)" warnings below report.
boxplot(log(lvr_prices$total_price) ~ lvr_prices$area , main= "房價箱型圖", xlab = "區域", ylab = "價格(log)" )

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn

DPLYR

#install.packages('dplyr')
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

help(package= 'dplyr')

# R Style Select
head(lvr_prices[, c('total_price')])

## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000

# dplyr Style Select
head(select(lvr_prices, total_price))

## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000

# R Style filter
head(lvr_prices[lvr_prices$area == '中山區', ])

## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>

# dplyr Style filter
head(filter(lvr_prices, area == '中山區'))

## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>

# Iris R Style Manipulation
sum(tail(head(iris), 3)$Sepal.Length)

## [1] 15

# magrittr Style Manipulation
iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()

## [1] 15

lvr_prices %>%
  filter(area == '中山區') %>%
  select(total_price, address) %>%
  head()

## # A tibble: 6 x 2
##   total_price                               address
##         <int>                                 <chr>
## 1     5960000             臺北市中山區合江街31~60號
## 2    20200000   臺北市中山區中山北路二段183巷1~30號
## 3     4050000           臺北市中山區吉林路361~390號
## 4     1900000               長安段三小段271~300地號
## 5    14800000       臺北市中山區林森北路485巷1~30號
## 6    10200000 臺北市中山區建國北路三段93巷5弄1~30號

lvr_prices %>%
  filter(area == '中山區') %>%
  select(total_price, address) %>%
  arrange(total_price) %>%
  head()

## # A tibble: 6 x 2
##   total_price                 address
##         <int>                   <chr>
## 1           0 中山段一小段691~720地號
## 2           0 中山段一小段691~720地號
## 3       10860 榮星段四小段211~240地號
## 4       16000 中山段四小段211~240地號
## 5       18060 榮星段四小段211~240地號
## 6       21244 榮星段二小段361~390地號

lvr_prices %>%
  filter(area == '中山區') %>%
  select(total_price, address) %>%
  arrange(desc(total_price)) %>%
  head()

## # A tibble: 6 x 2
##   total_price                             address
##         <int>                               <chr>
## 1  1850000000 臺北市中山區建國北路一段138巷1~30號
## 2  1400000000      臺北市中山區南京東路三段1~30號
## 3  1084948034               中山段二小段31~60地號
## 4  1011136500             中山段三小段301~330地號
## 5   952875000                     金泰段61~90地號
## 6   903865500             中山段一小段361~390地號

# Truncate every trading date to the first day of its month.
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))

# Total traded amount per month and district, from January 2012 onwards.
# as.numeric() makes the sum run in double precision (total_price is an
# integer column).
# NOTE(review): sum() is called without na.rm, so any group that contains an
# NA total_price yields NA -- confirm that this is intended.
lvr_stat <- lvr_prices %>% 
  filter(trading_ym >= '2012-01-01') %>%
  select(trading_ym, total_price, area) %>%
  group_by(trading_ym, area) %>%
  summarise(overall_price = sum(as.numeric(total_price)))

lvr_stat$area <- as.factor(lvr_stat$area)

# Draw one line chart of the monthly totals per district on a 3 x 4 grid.
# par() returns the settings it replaces; they are restored after the loop so
# that later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(3, 4))
for (a in levels(lvr_stat$area)){
  # Rows of the summary table that belong to the current district.
  res <- lvr_stat[lvr_stat$area == a,]
  plot(res$trading_ym, res$overall_price,  type = 'l', main = a)  
}
par(old_par)

## pivot table

# lvr_stat
# install.packages('tidyr')
library(tidyr)

# Reshape to wide form: one column per trading_ym value holding
# overall_price; combinations with no value are filled with 0.
price_pivot <- spread(
  lvr_stat,
  key = trading_ym,
  value = overall_price,
  fill = 0
)
price_pivot

## # A tibble: 12 x 54
##      area `2012-01-01` `2012-02-01` `2012-03-01` `2012-04-01` `2012-05-01`
##  * <fctr>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 士林區    661140000    231680000    359891504    205481036   2539010528
##  2 大同區            0    180150000    525210000     87110000     74806000
##  3 大安區    139136600    123870000     64470991    127736080     49052000
##  4 中山區     17601653    140250000    176022439   3156689238   2374390095
##  5 中正區    258200000    112690000    660420000    935400000    550190000
##  6 內湖區    349930000    216810000    299907515    944724354   1444681765
##  7 文山區    166887497    147810000    553478681    739757581    192890000
##  8 北投區     43850000        68000     82490000    494942526    899718610
##  9 松山區            0            0    405003695       554400    405400000
## 10 信義區     40800000    177020000   1269564574   2517094802   1241890000
## 11 南港區     53560259    145250000    430330000    453100000    734600000
## 12 萬華區     17430000     14800000      7800000    514268500    311962000
## # ... with 48 more variables: `2012-06-01` <dbl>, `2012-07-01` <dbl>,
## #   `2012-08-01` <dbl>, `2012-09-01` <dbl>, `2012-10-01` <dbl>,
## #   `2012-11-01` <dbl>, `2012-12-01` <dbl>, `2013-01-01` <dbl>,
## #   `2013-02-01` <dbl>, `2013-03-01` <dbl>, `2013-04-01` <dbl>,
## #   `2013-05-01` <dbl>, `2013-06-01` <dbl>, `2013-07-01` <dbl>,
## #   `2013-08-01` <dbl>, `2013-09-01` <dbl>, `2013-10-01` <dbl>,
## #   `2013-11-01` <dbl>, `2013-12-01` <dbl>, `2014-01-01` <dbl>,
## #   `2014-02-01` <dbl>, `2014-03-01` <dbl>, `2014-04-01` <dbl>,
## #   `2014-05-01` <dbl>, `2014-06-01` <dbl>, `2014-07-01` <dbl>,
## #   `2014-08-01` <dbl>, `2014-09-01` <dbl>, `2014-10-01` <dbl>,
## #   `2014-11-01` <dbl>, `2014-12-01` <dbl>, `2015-01-01` <dbl>,
## #   `2015-02-01` <dbl>, `2015-03-01` <dbl>, `2015-04-01` <dbl>,
## #   `2015-05-01` <dbl>, `2015-06-01` <dbl>, `2015-07-01` <dbl>,
## #   `2015-08-01` <dbl>, `2015-09-01` <dbl>, `2015-10-01` <dbl>,
## #   `2015-11-01` <dbl>, `2015-12-01` <dbl>, `2016-01-01` <dbl>,
## #   `2016-02-01` <dbl>, `2016-03-01` <dbl>, `2016-04-01` <dbl>,
## #   `2016-05-01` <dbl>

# Save the wide table as CSV in the working directory.
write.csv(x = price_pivot, file = 'taipei_house_price.csv')

Classification

# One-off fetch of the training data; uncomment if Training50.csv is missing.
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', 'Training50.csv')

# install.packages('rpart')
library(rpart)

# Read the training set from the working directory and confirm its class.
trainset <- read.csv(file = 'Training50.csv')
class(trainset)

## [1] "data.frame"

# Inspect the data interactively, then drop the X column so it is not used
# as a predictor (presumably a saved row index -- confirm against the CSV).
View(trainset)
trainset[["X"]] <- NULL

# Classification tree for Creditability using every remaining column.
model <- rpart(Creditability ~ ., data = trainset, method = 'class')

# Draw the tree with some margin, then add the split labels.
plot(model, margin = 0.1)
text(model)

# Fetch the held-out test data into the working directory.
download.file(
  url = 'https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv',
  destfile = 'Test50.csv'
)

# Same preparation as the training set: read the file and drop column X.
testset <- read.csv(file = 'Test50.csv')
testset[["X"]] <- NULL

# First few predicted class labels for the test rows.
head(predict(model, testset, type = 'class'))

## 1 2 3 4 5 6 
## 1 1 0 1 1 1 
## Levels: 0 1

# Predicted class label for every test row.
predicted <- predict(model, testset, type = 'class')

# Accuracy: the proportion of test rows whose prediction equals the label.
mean(predicted == testset$Creditability)

## [1] 0.71

# Confusion matrix: rows are the predicted classes, columns are the labels
# recorded in the test set.
table(predicted, testset$Creditability)

##          
## predicted   0   1
##         0  64  52
##         1  93 291

ROC Curve

# Single-cutoff example that treats class 0 as the positive class: a row is
# called 0 when the first column of the prediction matrix exceeds 0.2.
predicted <- predict(model, testset)
res <- ifelse(predicted[, 1] > 0.2, 0, 1)

# Rows of the table are predictions and columns are recorded labels, so the
# cells in storage order are (0,0), (1,0), (0,1), (1,1).
tb <- table(res, testset$Creditability)
TP <- tb[1]
FN <- tb[2]
FP <- tb[3]
TN <- tb[4]

# True-positive and false-positive rates at this cutoff.
TPR <- TP / sum(TP, FN)
FPR <- FP / sum(FP, TN)


# Hand-rolled ROC curve for the tree, with class 0 as the positive class
# (the same convention as the single-cutoff example): a row is called 0 when
# its predicted probability in column 1 reaches the cutoff.
prediction <- predict(model, testset, type = "prob")

# Fixed levels keep every confusion table 2 x 2, even at cutoffs where all
# rows fall into the same predicted class.
actual <- factor(testset$Creditability, levels = c(0, 1))

# Walk the cutoffs from strict to lenient so the points run from the
# bottom-left of the plot to the top-right and type = 'b' joins them in order.
cutoffs <- rev(seq(0, 1, 0.01))
roc_x <- numeric(length(cutoffs))
roc_y <- numeric(length(cutoffs))
for (j in seq_along(cutoffs)) {
  res <- factor(ifelse(prediction[, 1] >= cutoffs[j], 0, 1), levels = c(0, 1))

  # Rows are predictions, columns are recorded labels. Cells are looked up
  # by name so the four counts cannot be mixed up by the table orientation.
  tb <- table(res, actual)
  TP <- tb["0", "0"]
  FN <- tb["1", "0"]
  FP <- tb["0", "1"]
  TN <- tb["1", "1"]
  TPR <- TP / (TP + FN)
  FPR <- FP / (FP + TN)
  roc_x[j] <- FPR
  roc_y[j] <- TPR
}

# Anchor the curve at (0, 0) and (1, 1).
roc_x <- c(0, roc_x, 1)
roc_y <- c(0, roc_y, 1)
plot(roc_x, roc_y, type='b')

# install.packages('ROCR')
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

# ROC curve and AUC via ROCR, scoring each test row by column 2 of the
# predicted probability matrix.
predictions <- predict(model, testset, type = "prob")
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability))

# Area under the curve; the value is read from the y.values slot below.
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")

# True-positive rate against false-positive rate, i.e. the curve to plot.
perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")

# TRUE rather than T: T is an ordinary variable that can be reassigned.
plot(
  perf.tpr.rocr,
  colorize = TRUE,
  main = paste("AUC:", (perf.rocr@y.values))
)

## Using RandomForest

#install.packages('randomForest')
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Convert the label to a factor in both sets so randomForest treats the
# task as classification (see the randomForest documentation).
trainset$Creditability <- as.factor(trainset$Creditability)
testset$Creditability <- as.factor(testset$Creditability)

# 200-tree forest on every predictor, also keeping variable importance and
# the proximity matrix. TRUE is spelled out because T can be reassigned.
forest <- randomForest(
  Creditability ~ .,
  data = trainset,
  ntree = 200,
  importance = TRUE,
  proximity = TRUE
)

# Predicted class for every test row.
forest.predicted <- predict(forest, testset)

# Accuracy: the share of test rows whose prediction equals the label.
sum(forest.predicted == testset$Creditability) / length(testset$Creditability)

## [1] 0.74

# Confusion matrix on the test set: rows are the forest's predictions for
# testset, columns are the recorded labels. forest$predicted holds the
# out-of-bag predictions for the *training* rows, so it must not be paired
# with test labels; the test-set predictions in forest.predicted are used.
# NOTE(review): the printed table that follows in this document was produced
# with forest$predicted and will change once this line is re-run.
table(forest.predicted, testset$Creditability)

##    
##       0   1
##   0  19  61
##   1 138 282

# Compare the two models on the test set with ROCR. For each model the rows
# are scored by column 2 of the predicted probability matrix, then the AUC
# object and the TPR-vs-FPR curve are derived from those scores.

# Model 1: the rpart tree.
predictions1 <- predict(model, testset, type = "prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, measure = "tpr", x.measure = "fpr")

# Model 2: the random forest.
predictions2 <- predict(forest, testset, type = "prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, measure = "tpr", x.measure = "fpr")

# Tree curve first (colour 1), then the legend, then the forest curve
# (colour 2) overlaid on the same axes.
plot(perf.tpr.rocr1, main = 'ROC Curve', col = 1)
legend(x = 0.7, y = 0.2, legend = c('rpart', 'randomforest'), fill = 1:2)
plot(perf.tpr.rocr2, col = 2, add = TRUE)

## Clustering

# One-off fetch of the customer data; uncomment if customers.csv is missing.
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')

# Read the customer table from the working directory and confirm its class.
customers <- read.csv(file = 'customers.csv')
class(customers)

## [1] "data.frame"

# Show the structure of customers: its columns and their types.
str(customers)

## 'data.frame':    200 obs. of  5 variables:
##  $ CustomerID    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Genre         : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ Age           : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual_Income : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending_Score: int  39 81 6 77 40 76 6 94 3 72 ...

# Show the first six rows.
head(customers, n = 6)

##   CustomerID  Genre Age Annual_Income Spending_Score
## 1          1   Male  19            15             39
## 2          2   Male  21            15             81
## 3          3 Female  20            16              6
## 4          4 Female  23            16             77
## 5          5 Female  31            17             40
## 6          6 Female  22            17             76

# Keep only the two columns used for clustering.
customers <- customers[c('Annual_Income', 'Spending_Score')]

# Seed demo: with the seed fixed, this draw of 6 values from 1..42 is the
# same on every run.
set.seed(seed = 123)
sample.int(n = 42, size = 6)

## [1] 13 33 17 35 36  2

# Fix the seed before k-means so the fit is repeatable, then partition the
# customers into five clusters and print the fit.
set.seed(seed = 123)
kc <- kmeans(x = customers, centers = 5)
kc

## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      27.40000       49.48000
## 2      79.00000       16.59259
## 3      55.90541       49.93243
## 4      86.53846       82.12821
## 5     109.70000       22.00000
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## 
## Within cluster sum of squares by cluster:
## [1] 48174.480  4062.519  7375.000 13444.051  2458.100
##  (between_SS / total_SS =  72.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

# kc$cluster
# Scatter plot of the customers coloured by cluster number, with the
# cluster centres overlaid in yellow.
plot(customers$Annual_Income, customers$Spending_Score, col = kc$cluster)
points(
  kc$centers[, "Annual_Income"],
  kc$centers[, "Spending_Score"],
  col = "yellow"
)

# install.packages('cluster')
library(cluster)

# Silhouette information for the fitted clustering, computed from the
# pairwise distances between customers, then plotted.
kcs <- silhouette(kc$cluster, dist(customers))
plot(kcs)



# Warm-up for the next step: apply a function to each candidate k
# (here simply squaring it) and collect the results in a vector.
nk <- 2:10
vapply(nk, function(e) e ^ 2, numeric(1))

## [1]   4   9  16  25  36  49  64  81 100

library(fpc)

# Average silhouette width for each candidate number of clusters (2 to 10),
# used to compare the choices of k.
nk <- 2:10
set.seed(123)

# The distance matrix does not depend on k, so compute it once.
customer_dist <- dist(customers)

# vapply() guarantees one numeric value per k.
SW <- vapply(nk, function(k) {
  cluster.stats(customer_dist, kmeans(customers, centers = k)$cluster)$avg.silwidth
}, numeric(1))

plot(
  nk, SW,
  type = "l",
  xlab = "number of clusters",
  ylab = "average silhouette width"
)

Demo20170925

David Chiu

2017年9月25日

R Basic

Data Frame

使用R 探索資料

DPLYR

Classification

ROC Curve