R Basic

data()
data(iris)
View(iris)
class(iris)
## [1] "data.frame"
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(iris, 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
# head(iris, -10)

?head
## starting httpd help server ... done
help(head)


head(iris[  1         ,     ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
head(iris[  c(1,2,3)  ,     ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
head(iris[  1:3       ,     ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
head(iris[  1         ,    1 ])
## [1] 5.1
head(iris[  1:3       ,    1 ])
## [1] 5.1 4.9 4.7
head(iris[  1:3       ,"Sepal.Length" ])
## [1] 5.1 4.9 4.7
head(iris[            ,   1:2])
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9
head(iris$Sepal.Length)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
iris[ 1:5 ,  c("Sepal.Length","Sepal.Width") ]
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
head(iris$Species=='setosa')
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
head(iris[ iris$Species=='setosa' ,   ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(iris[ iris$Species=='setosa' & iris$Sepal.Length >= 5,   ])
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 15          5.8         4.0          1.2         0.2  setosa
head(sort(iris$Sepal.Length))
## [1] 4.3 4.4 4.4 4.4 4.5 4.6
?sort

head(sort(iris$Sepal.Length, decreasing = TRUE))
## [1] 7.9 7.7 7.7 7.7 7.7 7.6
a <- c(2,1,5,3,4)
sort(a)
## [1] 1 2 3 4 5
order(a)
## [1] 2 1 4 5 3
head(iris[order(iris$Sepal.Length, decreasing = TRUE), c('Sepal.Length', 'Species')  ])
##     Sepal.Length   Species
## 132          7.9 virginica
## 118          7.7 virginica
## 119          7.7 virginica
## 123          7.7 virginica
## 136          7.7 virginica
## 106          7.6 virginica
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
pie(table(iris$Species))

barplot(table(iris$Species))

hist(iris$Sepal.Length)

boxplot(iris$Sepal.Length)

boxplot(Petal.Width ~ Species, data=iris)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)

## 資料探索

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')
getwd()
## [1] "D:/OS DATA/Desktop"
library(readr)
lvr_prices <- read_csv("D:/OS DATA/Desktop/lvr_prices.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 5 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5695        <NA> 29 columns  2 columns actual # ... with 1 more variables: file <chr>
View(lvr_prices)
daan <- lvr_prices[lvr_prices$area =='大安區' ,    ]

daan_total <- daan[(daan$trading_target == '房地(土地+建物)') & (daan$city_land_type == '住'), 'total_price']

summary(daan_total)
##   total_price       
##  Min.   :  2200000  
##  1st Qu.: 13180000  
##  Median : 21740000  
##  Mean   : 24473338  
##  3rd Qu.: 29450000  
##  Max.   :156300000  
##  NA's   :1
daan_price_per_sqmeter <- daan[(daan$trading_target == '房地(土地+建物)') & (daan$city_land_type == '住'), 'price_per_sqmeter']

summary(daan_price_per_sqmeter)
##  price_per_sqmeter
##  Min.   : 38010   
##  1st Qu.:200038   
##  Median :240063   
##  Mean   :241945   
##  3rd Qu.:273166   
##  Max.   :699830   
##  NA's   :1
241945 /0.3025
## [1] 799818.2
sum(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 13985391301
mean(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 27315217
median(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 18500000
zhongshan <- lvr_prices[lvr_prices$area == '中山區', c('address', 'total_price')]
res <- zhongshan[order(zhongshan$total_price, decreasing = TRUE),     ]
res[1:3,   ]
## # A tibble: 3 x 2
##                             address total_price
##                               <chr>       <int>
## 1   臺北市中山區南京東路三段61~90號   188888888
## 2           正義段一小段211~240地號   180720000
## 3 臺北市中山區建國北路一段151~180號   180000000
head(res)
## # A tibble: 6 x 2
##                                 address total_price
##                                   <chr>       <int>
## 1       臺北市中山區南京東路三段61~90號   188888888
## 2               正義段一小段211~240地號   180720000
## 3     臺北市中山區建國北路一段151~180號   180000000
## 4       臺北市中山區民權東路三段61~90號   170000000
## 5              臺北市中山區吉林路1~30號   157280000
## 6 臺北市中山區成功里樂群二路30巷61~90號   152600000
tail(res)
## # A tibble: 6 x 2
##                             address total_price
##                               <chr>       <int>
## 1           德惠段三小段151~180地號      218000
## 2             北安段三小段31~60地號      200000
## 3           榮星段二小段271~300地號      100000
## 4           正義段四小段151~180地號       70000
## 5         吉林段三小段1021~1050地號       30000
## 6 臺北市中山區建國北路二段121~150號          NA
getTopThree <- function(area){
  zhongshan <- lvr_prices[lvr_prices$area == area, c('address', 'total_price')]
  res <- zhongshan[order(zhongshan$total_price, decreasing = TRUE),     ]
  return(res[1:3,   ])  
}


getTopThree('大安區')
## # A tibble: 3 x 2
##                             address total_price
##                               <chr>       <int>
## 1             學府段三小段31~60地號   966660000
## 2 臺北市大安區忠孝東路四段271~300號   360270000
## 3   臺北市大安區敦化南路二段31~60號   321880000
getTopThree('萬華區')
## # A tibble: 3 x 2
##                         address total_price
##                           <chr>       <int>
## 1       福星段四小段151~180地號   307000000
## 2 臺北市萬華區漢口街二段31~60號   172200000
## 3     臺北市萬華區成都路31~60號   170000000
house_mean <- tapply(lvr_prices$total_price, lvr_prices$area, function(e) mean(e, na.rm=TRUE))

sort(house_mean, decreasing = TRUE)
##   中正區   信義區   內湖區   大安區   大同區   中山區   北投區   松山區 
## 28836445 28573246 27530523 27315217 20104882 20026617 19579429 19534135 
##   南港區   士林區   文山區   萬華區 
## 19107307 18453654 14809385 13710726
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, function(e) mean(e, na.rm=TRUE))

price_per_sec <- price_per_sec[! is.na(price_per_sec)]

barplot(sort(price_per_sec, decreasing = TRUE), main= "各區平均價", xlab = "區域", ylab = "價格", col="blue")

a <- c(1,2,3,4,5,6,700)
median(a)
## [1] 4
quantile(a, 0.25)
## 25% 
## 2.5
quantile(a, 0.75)
## 75% 
## 5.5
boxplot(a)

a <- c(1,2,3,4,5,6,7)
median(a)
## [1] 4
quantile(a, 0.25)
## 25% 
## 2.5
quantile(a, 0.75)
## 75% 
## 5.5
IQR(a)
## [1] 3
boxplot(a)

mean(a)
## [1] 4
boxplot(log(total_price) ~ area, data = lvr_prices, main= "房價箱型圖", xlab = "區域", ylab = "價格(log)")

DPLYR

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
help(package = "dplyr")

# R Style filter
head(lvr_prices[lvr_prices$area == '中山區'   ,    ])
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
# dplyr Style filter
head(filter(lvr_prices, area == '中山區'))
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
# R Style Select
head(lvr_prices[  , 'total_price' ])
## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000
# dplyr Style Select
head(select(lvr_prices, total_price))
## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000
# R Style
sum(tail(head(iris), 3)[ , 'Sepal.Length'])
## [1] 15
# Magrittr
iris %>% head() %>% tail(3) %>% .[, 'Sepal.Length'] %>% sum()
## [1] 15
# magrittr + dplyr
lvr_prices %>% filter(area == '中山區') %>% select(total_price) %>% head()
## # A tibble: 6 x 1
##   total_price
##         <int>
## 1     5960000
## 2    20200000
## 3     4050000
## 4     1900000
## 5    14800000
## 6    10200000
lvr_prices %>% filter(area == '中山區') %>% select(address, total_price) %>% arrange(total_price) %>% head()
## # A tibble: 6 x 2
##                     address total_price
##                       <chr>       <int>
## 1 吉林段三小段1021~1050地號       30000
## 2   正義段四小段151~180地號       70000
## 3   榮星段二小段271~300地號      100000
## 4     北安段三小段31~60地號      200000
## 5   德惠段三小段151~180地號      218000
## 6   德惠段三小段151~180地號      267716
lvr_prices %>% filter(area == '中山區') %>% select(address, total_price) %>% arrange(desc(total_price)) %>% head()
## # A tibble: 6 x 2
##                                 address total_price
##                                   <chr>       <int>
## 1       臺北市中山區南京東路三段61~90號   188888888
## 2               正義段一小段211~240地號   180720000
## 3     臺北市中山區建國北路一段151~180號   180000000
## 4       臺北市中山區民權東路三段61~90號   170000000
## 5              臺北市中山區吉林路1~30號   157280000
## 6 臺北市中山區成功里樂群二路30巷61~90號   152600000
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))
#lvr_prices$trading_ym

## SQL
## ========================================
## SELECT trading_ym, area, SUM(total_price) 
## FROM lvr_prices
## GROUP BY trading_ym, area

lvr_stat <- lvr_prices %>% 
  select(trading_ym, area, total_price) %>%
  group_by(trading_ym, area) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm=TRUE))

lvr_stat2 <- lvr_stat[lvr_stat$area== '北投區', ]


plot(lvr_stat2$trading_ym, lvr_stat2$total_price, type = 'line')
## Warning: Unknown or uninitialised column: 'total_price'.
## Warning in plot.xy(xy, type, ...): 繪圖類型 'line' 被截短成第一個字元

lvr_stat <- lvr_prices %>% 
    select(trading_ym, area, total_price) %>% 
    filter(trading_ym >= '2012-01-01') %>% 
    group_by(trading_ym, area) %>% 
summarise(overall_price = sum(as.numeric(total_price), na.rm=TRUE))

lvr_stat$area <- as.factor(lvr_stat$area)

par(mfrow=c(3,4))
for (a in levels(lvr_stat$area)){
  plot(overall_price ~ trading_ym
       ,lvr_stat[lvr_stat$area == a,]
       , type='o
       ', main = a)
}
## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元
## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

## Warning in plot.xy(xy, type, ...): 繪圖類型 'o
##        ' 被截短成第一個字元

PIVOT Table

# install.packages('tidyr')
library(tidyr)

price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill=0)

View(price_pivot)


write.csv(price_pivot, 'taipei_house_price.csv')

MELT

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
?melt
melt(price_pivot)
## Using area as id variables
##       area   variable      value
## 1   士林區 2012-01-01          0
## 2   大同區 2012-01-01          0
## 3   大安區 2012-01-01          0
## 4   中山區 2012-01-01          0
## 5   中正區 2012-01-01   39380000
## 6   內湖區 2012-01-01  191520000
## 7   文山區 2012-01-01          0
## 8   北投區 2012-01-01          0
## 9   松山區 2012-01-01          0
## 10  信義區 2012-01-01   40800000
## 11  南港區 2012-01-01          0
## 12  萬華區 2012-01-01          0
## 13  士林區 2012-02-01   21500000
## 14  大同區 2012-02-01          0
## 15  大安區 2012-02-01          0
## 16  中山區 2012-02-01  140250000
## 17  中正區 2012-02-01          0
## 18  內湖區 2012-02-01          0
## 19  文山區 2012-02-01          0
## 20  北投區 2012-02-01          0
## 21  松山區 2012-02-01          0
## 22  信義區 2012-02-01          0
## 23  南港區 2012-02-01   12300000
## 24  萬華區 2012-02-01          0
## 25  士林區 2012-03-01          0
## 26  大同區 2012-03-01          0
## 27  大安區 2012-03-01          0
## 28  中山區 2012-03-01          0
## 29  中正區 2012-03-01   40230000
## 30  內湖區 2012-03-01          0
## 31  文山區 2012-03-01   38620000
## 32  北投區 2012-03-01   64310000
## 33  松山區 2012-03-01   22800000
## 34  信義區 2012-03-01          0
## 35  南港區 2012-03-01   56600000
## 36  萬華區 2012-03-01          0
## 37  士林區 2012-04-01     579520
## 38  大同區 2012-04-01    8150000
## 39  大安區 2012-04-01    4886080
## 40  中山區 2012-04-01  191490000
## 41  中正區 2012-04-01  191730000
## 42  內湖區 2012-04-01   41030000
## 43  文山區 2012-04-01  113200000
## 44  北投區 2012-04-01   43970000
## 45  松山區 2012-04-01          0
## 46  信義區 2012-04-01  107610000
## 47  南港區 2012-04-01          0
## 48  萬華區 2012-04-01    4838500
## 49  士林區 2012-05-01  101420528
## 50  大同區 2012-05-01          0
## 51  大安區 2012-05-01   49052000
## 52  中山區 2012-05-01   35480000
## 53  中正區 2012-05-01   98930000
## 54  內湖區 2012-05-01    8800000
## 55  文山區 2012-05-01   10500000
## 56  北投區 2012-05-01  219868610
## 57  松山區 2012-05-01   46500000
## 58  信義區 2012-05-01  138070000
## 59  南港區 2012-05-01          0
## 60  萬華區 2012-05-01          0
## 61  士林區 2012-06-01  279545000
## 62  大同區 2012-06-01   91160800
## 63  大安區 2012-06-01  354136500
## 64  中山區 2012-06-01  267420000
## 65  中正區 2012-06-01  120000000
## 66  內湖區 2012-06-01  443459000
## 67  文山區 2012-06-01  229331800
## 68  北投區 2012-06-01   86900000
## 69  松山區 2012-06-01  270720000
## 70  信義區 2012-06-01  259820000
## 71  南港區 2012-06-01  149576914
## 72  萬華區 2012-06-01   47610000
## 73  士林區 2012-07-01 1325998130
## 74  大同區 2012-07-01  302473961
## 75  大安區 2012-07-01 2846892056
## 76  中山區 2012-07-01 2393478800
## 77  中正區 2012-07-01 1001514513
## 78  內湖區 2012-07-01 2210944966
## 79  文山區 2012-07-01 1208333372
## 80  北投區 2012-07-01 1992462984
## 81  松山區 2012-07-01  957017195
## 82  信義區 2012-07-01 2349217000
## 83  南港區 2012-07-01 1264313316
## 84  萬華區 2012-07-01  748839421
## 85  士林區 2012-08-01 2171758033
## 86  大同區 2012-08-01 2011512625
## 87  大安區 2012-08-01 4529857352
## 88  中山區 2012-08-01 4297669667
## 89  中正區 2012-08-01 4153153873
## 90  內湖區 2012-08-01 4803000969
## 91  文山區 2012-08-01 2561088358
## 92  北投區 2012-08-01 4314574989
## 93  松山區 2012-08-01 2186172414
## 94  信義區 2012-08-01 2525115998
## 95  南港區 2012-08-01 2314709691
## 96  萬華區 2012-08-01 1547935331
## 97  士林區 2012-09-01 2188610141
## 98  大同區 2012-09-01  957572920
## 99  大安區 2012-09-01 3914666964
## 100 中山區 2012-09-01 5597370292
## 101 中正區 2012-09-01 3220295172
## 102 內湖區 2012-09-01 5839368328
## 103 文山區 2012-09-01 1894795550
## 104 北投區 2012-09-01 3081013230
## 105 松山區 2012-09-01 1559866125
## 106 信義區 2012-09-01 4010489522
## 107 南港區 2012-09-01 3792856731
## 108 萬華區 2012-09-01 1198024054
## 109 士林區 2012-10-01  781134315
## 110 大同區 2012-10-01 1886308672
## 111 大安區 2012-10-01 2226942094
## 112 中山區 2012-10-01 2643790879
## 113 中正區 2012-10-01  741582125
## 114 內湖區 2012-10-01 1568754833
## 115 文山區 2012-10-01 1493564050
## 116 北投區 2012-10-01  884927511
## 117 松山區 2012-10-01  875767218
## 118 信義區 2012-10-01 1520031963
## 119 南港區 2012-10-01 1250733669
## 120 萬華區 2012-10-01  657943625
## 121 士林區 2012-11-01          0
## 122 大同區 2012-11-01          0
## 123 大安區 2012-11-01    1630329
## 124 中山區 2012-11-01          0
## 125 中正區 2012-11-01          0
## 126 內湖區 2012-11-01          0
## 127 文山區 2012-11-01          0
## 128 北投區 2012-11-01   11300000
## 129 松山區 2012-11-01          0
## 130 信義區 2012-11-01          0
## 131 南港區 2012-11-01          0
## 132 萬華區 2012-11-01          0

Multiple CSV Combind

dfall <- data.frame()
for(a in list.files('test/')){
  fname <- paste0('test/', a)
  df <- read.csv(fname)
  dfall <- rbind(dfall, df)
}
str(dfall)
for (a in levels(lvr_stat$area))

分類問題

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', 'Training50.csv')

trainset <- read.csv('Training50.csv')
View(trainset)

names(trainset)
##  [1] "X"                                
##  [2] "Creditability"                    
##  [3] "Account.Balance"                  
##  [4] "Duration.of.Credit..month."       
##  [5] "Payment.Status.of.Previous.Credit"
##  [6] "Purpose"                          
##  [7] "Credit.Amount"                    
##  [8] "Value.Savings.Stocks"             
##  [9] "Length.of.current.employment"     
## [10] "Instalment.per.cent"              
## [11] "Sex...Marital.Status"             
## [12] "Guarantors"                       
## [13] "Duration.in.Current.address"      
## [14] "Most.valuable.available.asset"    
## [15] "Age..years."                      
## [16] "Concurrent.Credits"               
## [17] "Type.of.apartment"                
## [18] "No.of.Credits.at.this.Bank"       
## [19] "Occupation"                       
## [20] "No.of.dependents"                 
## [21] "Telephone"                        
## [22] "Foreign.Worker"
trainset$X <- NULL
View(trainset)

#install.packages("rpart")
library(rpart)
trainset$Creditability = as.factor(trainset$Creditability)

model <- rpart(Creditability   ~  ., data=trainset, method = 'class' )

plot(model, margin = 0.1)
text(model)

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', 'Test50.csv')

testset <- read.csv('Test50.csv')
testset$X <- NULL
testset$Creditability = as.factor(testset$Creditability)


predicted <- predict(model, testset, type = "class")
predicted
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   1   0   1   1   1   0   1   0   1   1   1   1   0   1   1   1   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   0   1   1   1   1   1   1   1   0   1   1   1   1   1   1   1   1   0 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   1   1   1   1   1   1   1   1   0   1   1   1   0   1   1   0   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   1   0   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   0 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   1   1   1   1   1   1   1   1   0   1   1   1   1   1   1   0   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   1   1   1   1   1   1   1   1   1   1   0   1   1   1   0   1   0   1 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   0   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   1   1   0   1   1   1   1   1   0   1   1   1   1   1   1   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   0   0   1   0   1   0   1   0   1   1   1   1   1   1   1 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   1   1   1   1   1   1   1   1   1   1   1   0   1   0   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   1   0   0   1   1   0   1   1   1   1   1   0   1   1   1   1   1   1 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##   1   1   1   1   1   1   1   1   0   1   1   1   1   1   1   0   1   1 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 
##   1   1   1   0   0   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   1   1   1   1   1   1   1   1   1   0   1   1   1   1   1   1   1   1 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 
##   1   1   1   1   1   1   1   1   1   1   0   1   1   1   1   1   0   0 
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 
##   1   0   1   0   0   1   1   1   1   0   0   1   0   1   1   1   1   1 
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 
##   0   0   0   0   0   1   1   1   1   0   1   1   1   1   1   1   1   1 
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 
##   1   1   1   1   1   1   0   1   1   0   1   1   1   1   1   1   1   1 
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   1   1   1   1   1   1   0   0   1   1   1   0   1   1   1   1   1   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 
##   0   1   0   1   1   1   1   1   1   1   1   1   0   0   1   0   1   1 
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 
##   0   0   0   0   1   1   1   0   1   1   0   0   1   0   1   1   0   1 
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 
##   1   1   1   1   0   1   0   0   1   1   1   1   1   0   1   0   1   1 
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 
##   1   1   1   0   0   0   1   0   1   0   0   1   0   1   1   1   1   0 
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
##   1   0   1   0   0   0   1   1   0   0   0   0   1   1   1   1   0   0 
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 
##   1   0   1   1   1   0   0   0   0   1   0   0   0   1   0   1   1   1 
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
##   1   1   1   1   0   1   1   0   1   1   0   0   0   0   0   1   0   1 
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
##   1   1   1   1   1   0   0   0   0   0   0   1   1   1 
## Levels: 0 1
sum(predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.71
table(predicted, testset$Creditability)
##          
## predicted   0   1
##         0  64  52
##         1  93 291
head(predict(model, testset, type = 'class'))
## 1 2 3 4 5 6 
## 1 1 0 1 1 1 
## Levels: 0 1

ROC Curve

# install.packages('caret')
#library(caret)
res <- as.factor(ifelse(predict(model, testset)[,1] >= 0.2, 0, 1))
tb <- table(testset$Creditability, res)
x <- tb[3] / (tb[4] + tb[3])
y <- tb[1] / (tb[1] + tb[2])


prediction <- predict(model, testset, type = "prob")

roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0,1,0.01)){
  res <- as.factor(ifelse(prediction[,1] >= i, 0, 1))

  tb <- table(testset$Creditability, res)
  if (ncol(tb) == 2){
    x <- tb[3] / (tb[4] + tb[3])
    y <- tb[1] / (tb[1] + tb[2])
    roc_x <- c(roc_x, x)
    roc_y <- c(roc_y, y)
  }
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x, roc_y, type='b')

#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

RandomForest

#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
forest <- randomForest(Creditability ~., data = trainset, ntree=200, importance=T, proximity=T) 

forest.predicted <- predict(forest, testset, type = "class")
table(forest.predicted, testset$Creditability)
##                 
## forest.predicted   0   1
##                0  50  28
##                1 107 315
sum(forest.predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.73
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2] 
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability)) 
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2] 
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability)) 
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")


plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Clustering

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')
customers <- read.csv('customers.csv')
head(customers)
##   CustomerID  Genre Age Annual_Income Spending_Score
## 1          1   Male  19            15             39
## 2          2   Male  21            15             81
## 3          3 Female  20            16              6
## 4          4 Female  23            16             77
## 5          5 Female  31            17             40
## 6          6 Female  22            17             76
customers <- customers[, c('Annual_Income', 'Spending_Score')]

head(customers)
##   Annual_Income Spending_Score
## 1            15             39
## 2            15             81
## 3            16              6
## 4            16             77
## 5            17             40
## 6            17             76
set.seed(123)
kc <- kmeans(customers, centers = 5)
kc
## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      27.40000       49.48000
## 2      79.00000       16.59259
## 3      55.90541       49.93243
## 4      86.53846       82.12821
## 5     109.70000       22.00000
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## 
## Within cluster sum of squares by cluster:
## [1] 48174.480  4062.519  7375.000 13444.051  2458.100
##  (between_SS / total_SS =  72.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(customers$Annual_Income, customers$Spending_Score, col=kc$cluster)
points(kc$centers[,1], kc$centers[,2], col='yellow')

library(cluster)
kcs = silhouette(kc$cluster,dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = kc$cluster, dist = dist(customers)) :
##  Cluster sizes and average silhouette widths:
##         50         27         74         39         10 
## 0.04103987 0.47253618 0.63162169 0.48851396 0.35704613 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4401  0.2089  0.5292  0.4209  0.6421  0.7556
plot(kcs)



library(fpc)

nk <- 2:10
set.seed(123)
SW <- sapply(nk, function(k) {
  cluster.stats(dist(customers), kmeans(customers, centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")