Hello World

Data Frame

data()
data(iris)
View(iris)
class(iris)
## [1] "data.frame"
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(iris, 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
help(head)
## starting httpd help server ... done
?head

iris[   1       ,   ]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
iris[ c(1,2,3)  ,   ]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
iris[      1:3  ,   ]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
iris[   1 ,             1 ]
## [1] 5.1
iris[   1 ,'Sepal.Length' ]
## [1] 5.1
iris[   1   ,   c(1,2)]
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
iris[ 1 ,   c('Sepal.Length','Sepal.Width')]
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
head(iris$Sepal.Length)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris[iris$Species == 'setosa',     ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
idx <- (iris$Species == 'setosa') & (iris$Sepal.Length > 5)

head(iris[ idx ,   ])
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 15          5.8         4.0          1.2         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
## 17          5.4         3.9          1.3         0.4  setosa
idx2 <- (iris$Species == 'setosa') | (iris$Sepal.Length > 5)

head(iris[ idx2 ,   ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
heights <- c(172, 180, 168 , 183, 155)
sort(heights)
## [1] 155 168 172 180 183
sort(heights, decreasing = TRUE)
## [1] 183 180 172 168 155
order(heights)
## [1] 5 3 1 2 4
heights[order(heights)]
## [1] 155 168 172 180 183
order(heights, decreasing = TRUE)
## [1] 4 2 1 3 5
heights[order(heights, decreasing = TRUE)]
## [1] 183 180 172 168 155
head(sort(iris$Sepal.Length, decreasing = TRUE))
## [1] 7.9 7.7 7.7 7.7 7.7 7.6
head(iris[order(iris$Sepal.Length, decreasing = TRUE),   ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 132          7.9         3.8          6.4         2.0 virginica
## 118          7.7         3.8          6.7         2.2 virginica
## 119          7.7         2.6          6.9         2.3 virginica
## 123          7.7         2.8          6.7         2.0 virginica
## 136          7.7         3.0          6.1         2.3 virginica
## 106          7.6         3.0          6.6         2.1 virginica

R Chart

tb <- table(iris$Species)
pie(tb)

barplot(tb, col="blue")

hist(iris$Sepal.Length)

boxplot(iris$Sepal.Length)

boxplot(iris$Petal.Width ~ iris$Species)

plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)

## 使用R 探索資料

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')
library(readr)
lvr_prices <- read_csv("D:/OS DATA/Desktop/lvr_prices.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
#View(lvr_prices)
getwd()
## [1] "D:/OS DATA/Desktop"
daan <- lvr_prices[lvr_prices$area == '大安區',]

#?sum
sum(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 2.79477e+11
mean(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 29798170
median(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 2e+07
temp <- c(34,31,21,24,26,28,27)

# mean = sum / count
sum(temp) / length(temp)
## [1] 27.28571
mean(temp)
## [1] 27.28571
temp <- c(34,31,21,24,26,28,27, 999)
mean(temp)
## [1] 148.75
sort(temp)
## [1]  21  24  26  27  28  31  34 999
median(temp)
## [1] 27.5
daan <- lvr_prices[ (lvr_prices$area == '大安區') & (lvr_prices$trading_target == '房地(土地+建物)') & (lvr_prices$city_land_type == '住')    ,  ]
summary(as.numeric(daan$total_price), na.rm = TRUE)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## 1.000e+04 1.353e+07 2.168e+07 2.656e+07 3.200e+07 1.870e+09         3
summary(as.numeric(daan$price_per_sqmeter), na.rm = TRUE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0  197403  249182  254814  293166 2080354       4
254814/0.3025
## [1] 842360.3
zhongshan <- lvr_prices[lvr_prices$area == '中山區', c('total_price', 'address')]

head(zhongshan)
## # A tibble: 6 x 2
##   total_price                               address
##         <int>                                 <chr>
## 1     5960000             臺北市中山區合江街31~60號
## 2    20200000   臺北市中山區中山北路二段183巷1~30號
## 3     4050000           臺北市中山區吉林路361~390號
## 4     1900000               長安段三小段271~300地號
## 5    14800000       臺北市中山區林森北路485巷1~30號
## 6    10200000 臺北市中山區建國北路三段93巷5弄1~30號
head(sort(zhongshan$total_price))
## [1]     0     0 10860 16000 18060 21244
head(sort(zhongshan$total_price, decreasing = TRUE))
## [1] 1850000000 1400000000 1084948034 1011136500  952875000  903865500
res <- zhongshan[order(zhongshan$total_price, decreasing = TRUE), ]
head(res,3 )
## # A tibble: 3 x 2
##   total_price                             address
##         <int>                               <chr>
## 1  1850000000 臺北市中山區建國北路一段138巷1~30號
## 2  1400000000      臺北市中山區南京東路三段1~30號
## 3  1084948034               中山段二小段31~60地號
res[1:3,]
## # A tibble: 3 x 2
##   total_price                             address
##         <int>                               <chr>
## 1  1850000000 臺北市中山區建國北路一段138巷1~30號
## 2  1400000000      臺北市中山區南京東路三段1~30號
## 3  1084948034               中山段二小段31~60地號
addNumber <- function(a, b){
  return(a+b)
}

addNumber(3,5)
## [1] 8
getTopThree <- function(area){
  zhongshan <- lvr_prices[lvr_prices$area == area, c('total_price', 'address')]
  res <- zhongshan[order(zhongshan$total_price, decreasing = TRUE), ]
  return(res[1:3,]  )
}

getTopThree('信義區')
## # A tibble: 3 x 2
##   total_price                              address
##         <int>                                <chr>
## 1  1000000000      臺北市信義區基隆路一段151~180號
## 2   921800000              雅祥段三小段721~750地號
## 3   868000000 臺北市信義區忠孝東路五段236巷61~90號
tapply(lvr_prices$total_price, lvr_prices$area , function(e) mean(e, na.rm=TRUE) )
##   士林區   大同區   大安區   中山區   中正區   內湖區   文山區   北投區 
## 24139903 18063872 29798170 26708805 30154011 27905514 16953869 20626410 
##   松山區   信義區   南港區   萬華區 
## 25652125 24725051 25235793 13642289
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area , function(e) mean(e, na.rm=TRUE) )

barplot(price_per_sec)

barplot(sort(price_per_sec, decreasing = TRUE), main = '各區平均價格', xlab = '區域', ylab = '價格', col= "blue")

a <-  c(1,20,30,40,50,60,70,80,999)
median(a)
## [1] 50
quantile(a, 0.25)
## 25% 
##  30
quantile(a, 0.75)
## 75% 
##  70
IQR(a)
## [1] 40
boxplot(a)

a <-  c(1,20,30,40,50,60,70,80,90)
median(a)
## [1] 50
quantile(a, 0.25)
## 25% 
##  30
quantile(a, 0.75)
## 75% 
##  70
IQR(a)
## [1] 40
boxplot(a)

boxplot(total_price ~ area, data = lvr_prices)
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

boxplot(log(total_price) ~ area, data = lvr_prices, main= "房價箱型圖", xlab = "區域", ylab = "價格(log)")
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn

## DPLYR

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
help(package='dplyr')

# R Style Filter
head(lvr_prices[ lvr_prices$area == '中山區'   ,   ])
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
# dplyr Style Filter
head(filter(lvr_prices, area == '中山區'))
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
# R Style Select
head(lvr_prices[ , c('total_price')])
## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000
# dplyr Style Select
head(select(lvr_prices, total_price))
## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000
# R Style Data Manipulation
sum(tail(head(iris), 3)$Sepal.Length)
## [1] 15
# magrittr Style Data Manipulation
iris %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 15
lvr_prices %>% 
  filter(area == '中山區') %>%
  select(total_price) %>%
  head()
## # A tibble: 6 x 1
##   total_price
##         <int>
## 1     5960000
## 2    20200000
## 3     4050000
## 4     1900000
## 5    14800000
## 6    10200000
lvr_prices %>% 
  filter(area == '中山區') %>%
  select(total_price, address) %>%
  arrange(total_price) %>%
  head()
## # A tibble: 6 x 2
##   total_price                 address
##         <int>                   <chr>
## 1           0 中山段一小段691~720地號
## 2           0 中山段一小段691~720地號
## 3       10860 榮星段四小段211~240地號
## 4       16000 中山段四小段211~240地號
## 5       18060 榮星段四小段211~240地號
## 6       21244 榮星段二小段361~390地號
lvr_prices %>% 
  filter(area == '中山區') %>%
  select(total_price, address) %>%
  arrange(desc(total_price)) %>%
  head()
## # A tibble: 6 x 2
##   total_price                             address
##         <int>                               <chr>
## 1  1850000000 臺北市中山區建國北路一段138巷1~30號
## 2  1400000000      臺北市中山區南京東路三段1~30號
## 3  1084948034               中山段二小段31~60地號
## 4  1011136500             中山段三小段301~330地號
## 5   952875000                     金泰段61~90地號
## 6   903865500             中山段一小段361~390地號
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))


lvr_stat <- lvr_prices %>%
  select(area, trading_ym, total_price) %>%
  filter(trading_ym >= '2012-01-01') %>%
  group_by(area, trading_ym) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm=TRUE))

lvr_stat$area <- as.factor(lvr_stat$area)


par(mfrow=c(3,4))
for (a in levels(lvr_stat$area)){
  plot(overall_price ~ trading_ym, data = lvr_stat[lvr_stat$area == a,] , type = 'l', main = a)
}

head(lvr_stat)
## # A tibble: 6 x 3
## # Groups:   area [1]
##     area trading_ym overall_price
##   <fctr>     <date>         <dbl>
## 1 士林區 2012-01-01     661140000
## 2 士林區 2012-02-01     231680000
## 3 士林區 2012-03-01     359891504
## 4 士林區 2012-04-01     205481036
## 5 士林區 2012-05-01    2539010528
## 6 士林區 2012-06-01     514470692

Pivot Table

#install.packages('tidyr')
library(tidyr)

price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill=0)
#price_pivot

write.csv(price_pivot, 'taipei_house_price.csv')

分類問題

download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', 'Training50.csv')

#install.packages('rpart')
library(rpart)
trainset <- read.csv('Training50.csv')
#View(trainset)

trainset$X <- NULL

trainset$Creditability = as.factor(trainset$Creditability)

head(trainset)
##   Creditability Account.Balance Duration.of.Credit..month.
## 1             1               3                          6
## 2             0               1                         15
## 3             0               1                         42
## 4             0               3                         36
## 5             1               3                         24
## 6             1               1                         15
##   Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1                                 2       3          2108
## 2                                 1       4           950
## 3                                 2       3          7174
## 4                                 3       4          7980
## 5                                 3       2          2028
## 6                                 2       4          2511
##   Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1                    1                            3                   2
## 2                    1                            4                   4
## 3                    4                            3                   4
## 4                    4                            1                   4
## 5                    1                            3                   2
## 6                    1                            1                   1
##   Sex...Marital.Status Guarantors Duration.in.Current.address
## 1                    3          1                           2
## 2                    2          1                           3
## 3                    1          1                           3
## 4                    2          1                           4
## 5                    2          1                           2
## 6                    1          1                           4
##   Most.valuable.available.asset Age..years. Concurrent.Credits
## 1                             1          29                  2
## 2                             3          33                  2
## 3                             3          30                  2
## 4                             3          27                  2
## 5                             2          30                  2
## 6                             3          23                  2
##   Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1                 1                          1          1                1
## 2                 1                          2          1                2
## 3                 2                          1          1                1
## 4                 1                          2          1                1
## 5                 2                          2          1                1
## 6                 1                          1          1                1
##   Telephone Foreign.Worker
## 1         1              1
## 2         1              1
## 3         2              1
## 4         2              1
## 5         1              1
## 6         1              1
model <- rpart(Creditability ~ ., data = trainset ,method = 'class')

summary(model)
## Call:
## rpart(formula = Creditability ~ ., data = trainset, method = "class")
##   n= 500 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.05827506      0 1.0000000 1.0000000 0.07066121
## 2 0.04895105      3 0.8251748 0.9440559 0.06942123
## 3 0.02097902      5 0.7272727 0.8951049 0.06824258
## 4 0.01048951      9 0.6433566 0.8741259 0.06770950
## 5 0.01000000     13 0.6013986 0.9020979 0.06841649
## 
## Variable importance
##                   Account.Balance        Duration.of.Credit..month. 
##                                21                                15 
##                           Purpose              Value.Savings.Stocks 
##                                10                                 9 
##                     Credit.Amount Payment.Status.of.Previous.Credit 
##                                 9                                 7 
##       Duration.in.Current.address                       Age..years. 
##                                 5                                 5 
##                Concurrent.Credits        No.of.Credits.at.this.Bank 
##                                 4                                 4 
##                 Type.of.apartment      Length.of.current.employment 
##                                 4                                 3 
##               Instalment.per.cent     Most.valuable.available.asset 
##                                 2                                 2 
##              Sex...Marital.Status 
##                                 1 
## 
## Node number 1: 500 observations,    complexity param=0.05827506
##   predicted class=1  expected loss=0.286  P(node) =1
##     class counts:   143   357
##    probabilities: 0.286 0.714 
##   left son=2 (261 obs) right son=3 (239 obs)
##   Primary splits:
##       Account.Balance                   < 2.5     to the left,  improve=20.037280, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=12.134650, (0 missing)
##       Duration.of.Credit..month.        < 34.5    to the right, improve= 8.335074, (0 missing)
##       Credit.Amount                     < 4180.5  to the right, improve= 8.222546, (0 missing)
##       Value.Savings.Stocks              < 1.5     to the left,  improve= 5.038364, (0 missing)
##   Surrogate splits:
##       Value.Savings.Stocks              < 2.5     to the left,  agree=0.608, adj=0.180, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5     to the left,  agree=0.580, adj=0.121, (0 split)
##       Age..years.                       < 30.5    to the left,  agree=0.558, adj=0.075, (0 split)
##       Purpose                           < 3.5     to the right, agree=0.556, adj=0.071, (0 split)
##       No.of.Credits.at.this.Bank        < 1.5     to the left,  agree=0.554, adj=0.067, (0 split)
## 
## Node number 2: 261 observations,    complexity param=0.05827506
##   predicted class=1  expected loss=0.4214559  P(node) =0.522
##     class counts:   110   151
##    probabilities: 0.421 0.579 
##   left son=4 (165 obs) right son=5 (96 obs)
##   Primary splits:
##       Duration.of.Credit..month.        < 13      to the right, improve=8.928178, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=7.065977, (0 missing)
##       Credit.Amount                     < 3757.5  to the right, improve=5.988777, (0 missing)
##       Most.valuable.available.asset     < 3.5     to the right, improve=5.407471, (0 missing)
##       Value.Savings.Stocks              < 1.5     to the left,  improve=2.605487, (0 missing)
##   Surrogate splits:
##       Credit.Amount                 < 1706.5  to the right, agree=0.785, adj=0.417, (0 split)
##       Most.valuable.available.asset < 1.5     to the right, agree=0.705, adj=0.198, (0 split)
##       Age..years.                   < 65.5    to the left,  agree=0.655, adj=0.062, (0 split)
##       Duration.in.Current.address   < 1.5     to the right, agree=0.648, adj=0.042, (0 split)
## 
## Node number 3: 239 observations,    complexity param=0.01048951
##   predicted class=1  expected loss=0.1380753  P(node) =0.478
##     class counts:    33   206
##    probabilities: 0.138 0.862 
##   left son=6 (72 obs) right son=7 (167 obs)
##   Primary splits:
##       Purpose                      < 3.5     to the right, improve=2.581640, (0 missing)
##       Length.of.current.employment < 1.5     to the left,  improve=1.717847, (0 missing)
##       Credit.Amount                < 7839.5  to the right, improve=1.427347, (0 missing)
##       No.of.dependents             < 1.5     to the right, improve=1.388353, (0 missing)
##       Concurrent.Credits           < 1.5     to the left,  improve=1.025677, (0 missing)
##   Surrogate splits:
##       Duration.of.Credit..month. < 54      to the right, agree=0.707, adj=0.028, (0 split)
##       Credit.Amount              < 11867   to the right, agree=0.707, adj=0.028, (0 split)
##       Age..years.                < 65.5    to the right, agree=0.707, adj=0.028, (0 split)
##       Foreign.Worker             < 1.5     to the right, agree=0.707, adj=0.028, (0 split)
## 
## Node number 4: 165 observations,    complexity param=0.05827506
##   predicted class=0  expected loss=0.4787879  P(node) =0.33
##     class counts:    86    79
##    probabilities: 0.521 0.479 
##   left son=8 (111 obs) right son=9 (54 obs)
##   Primary splits:
##       Value.Savings.Stocks              < 1.5     to the left,  improve=5.666830, (0 missing)
##       Duration.of.Credit..month.        < 31.5    to the right, improve=3.502839, (0 missing)
##       Instalment.per.cent               < 2.5     to the right, improve=3.151515, (0 missing)
##       Duration.in.Current.address       < 1.5     to the right, improve=3.098733, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=2.710674, (0 missing)
## 
## Node number 5: 96 observations,    complexity param=0.02097902
##   predicted class=1  expected loss=0.25  P(node) =0.192
##     class counts:    24    72
##    probabilities: 0.250 0.750 
##   left son=10 (7 obs) right son=11 (89 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=3.255217, (0 missing)
##       Most.valuable.available.asset     < 2.5     to the right, improve=2.840580, (0 missing)
##       Credit.Amount                     < 4327.5  to the right, improve=1.560193, (0 missing)
##       Guarantors                        < 1.5     to the left,  improve=1.552941, (0 missing)
##       Age..years.                       < 59.5    to the right, improve=1.090909, (0 missing)
## 
## Node number 6: 72 observations,    complexity param=0.01048951
##   predicted class=1  expected loss=0.25  P(node) =0.144
##     class counts:    18    54
##    probabilities: 0.250 0.750 
##   left son=12 (11 obs) right son=13 (61 obs)
##   Primary splits:
##       Concurrent.Credits          < 1.5     to the left,  improve=3.8763040, (0 missing)
##       Credit.Amount               < 7077    to the right, improve=3.3428570, (0 missing)
##       Duration.of.Credit..month.  < 9.5     to the right, improve=1.1250000, (0 missing)
##       Duration.in.Current.address < 3.5     to the left,  improve=0.5846030, (0 missing)
##       Type.of.apartment           < 1.5     to the left,  improve=0.4945055, (0 missing)
## 
## Node number 7: 167 observations
##   predicted class=1  expected loss=0.08982036  P(node) =0.334
##     class counts:    15   152
##    probabilities: 0.090 0.910 
## 
## Node number 8: 111 observations,    complexity param=0.04895105
##   predicted class=0  expected loss=0.3873874  P(node) =0.222
##     class counts:    68    43
##    probabilities: 0.613 0.387 
##   left son=16 (45 obs) right son=17 (66 obs)
##   Primary splits:
##       Purpose                           < 3.5     to the right, improve=5.314988, (0 missing)
##       Duration.in.Current.address       < 1.5     to the right, improve=4.293790, (0 missing)
##       Duration.of.Credit..month.        < 33      to the right, improve=3.737602, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=3.279667, (0 missing)
##       Instalment.per.cent               < 2.5     to the right, improve=2.317008, (0 missing)
##   Surrogate splits:
##       Credit.Amount                     < 10672.5 to the right, agree=0.649, adj=0.133, (0 split)
##       Duration.of.Credit..month.        < 17      to the left,  agree=0.622, adj=0.067, (0 split)
##       No.of.Credits.at.this.Bank        < 1.5     to the right, agree=0.622, adj=0.067, (0 split)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  agree=0.613, adj=0.044, (0 split)
##       Age..years.                       < 51.5    to the right, agree=0.613, adj=0.044, (0 split)
## 
## Node number 9: 54 observations,    complexity param=0.02097902
##   predicted class=1  expected loss=0.3333333  P(node) =0.108
##     class counts:    18    36
##    probabilities: 0.333 0.667 
##   left son=18 (32 obs) right son=19 (22 obs)
##   Primary splits:
##       Length.of.current.employment      < 2.5     to the left,  improve=2.880682, (0 missing)
##       Sex...Marital.Status              < 1.5     to the left,  improve=2.142857, (0 missing)
##       Duration.of.Credit..month.        < 46.5    to the left,  improve=1.787234, (0 missing)
##       Payment.Status.of.Previous.Credit < 2.5     to the left,  improve=1.704545, (0 missing)
##       Purpose                           < 3.5     to the left,  improve=1.704545, (0 missing)
##   Surrogate splits:
##       Credit.Amount              < 2325.5  to the right, agree=0.685, adj=0.227, (0 split)
##       Account.Balance            < 1.5     to the right, agree=0.667, adj=0.182, (0 split)
##       Age..years.                < 29.5    to the left,  agree=0.667, adj=0.182, (0 split)
##       No.of.Credits.at.this.Bank < 1.5     to the left,  agree=0.611, adj=0.045, (0 split)
##       Telephone                  < 1.5     to the left,  agree=0.611, adj=0.045, (0 split)
## 
## Node number 10: 7 observations
##   predicted class=0  expected loss=0.2857143  P(node) =0.014
##     class counts:     5     2
##    probabilities: 0.714 0.286 
## 
## Node number 11: 89 observations
##   predicted class=1  expected loss=0.2134831  P(node) =0.178
##     class counts:    19    70
##    probabilities: 0.213 0.787 
## 
## Node number 12: 11 observations
##   predicted class=0  expected loss=0.3636364  P(node) =0.022
##     class counts:     7     4
##    probabilities: 0.636 0.364 
## 
## Node number 13: 61 observations
##   predicted class=1  expected loss=0.1803279  P(node) =0.122
##     class counts:    11    50
##    probabilities: 0.180 0.820 
## 
## Node number 16: 45 observations,    complexity param=0.02097902
##   predicted class=0  expected loss=0.2  P(node) =0.09
##     class counts:    36     9
##    probabilities: 0.800 0.200 
##   left son=32 (38 obs) right son=33 (7 obs)
##   Primary splits:
##       Duration.in.Current.address   < 1.5     to the right, improve=4.3849620, (0 missing)
##       Most.valuable.available.asset < 1.5     to the right, improve=1.3444440, (0 missing)
##       Instalment.per.cent           < 1.5     to the right, improve=0.8661654, (0 missing)
##       Duration.of.Credit..month.    < 47.5    to the right, improve=0.7783784, (0 missing)
##       Age..years.                   < 39.5    to the left,  improve=0.5818182, (0 missing)
## 
## Node number 17: 66 observations,    complexity param=0.04895105
##   predicted class=1  expected loss=0.4848485  P(node) =0.132
##     class counts:    32    34
##    probabilities: 0.485 0.515 
##   left son=34 (26 obs) right son=35 (40 obs)
##   Primary splits:
##       Duration.of.Credit..month.        < 33      to the right, improve=5.188928, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=2.771421, (0 missing)
##       Guarantors                        < 1.5     to the left,  improve=2.357628, (0 missing)
##       Instalment.per.cent               < 2.5     to the right, improve=2.187258, (0 missing)
##       No.of.Credits.at.this.Bank        < 1.5     to the left,  improve=2.122475, (0 missing)
##   Surrogate splits:
##       Credit.Amount    < 4762    to the right, agree=0.727, adj=0.308, (0 split)
##       Age..years.      < 49      to the right, agree=0.636, adj=0.077, (0 split)
##       No.of.dependents < 1.5     to the right, agree=0.636, adj=0.077, (0 split)
## 
## Node number 18: 32 observations,    complexity param=0.02097902
##   predicted class=1  expected loss=0.46875  P(node) =0.064
##     class counts:    15    17
##    probabilities: 0.469 0.531 
##   left son=36 (10 obs) right son=37 (22 obs)
##   Primary splits:
##       Type.of.apartment          < 1.5     to the left,  improve=3.192045, (0 missing)
##       Credit.Amount              < 4316    to the left,  improve=2.760011, (0 missing)
##       No.of.Credits.at.this.Bank < 1.5     to the right, improve=2.703214, (0 missing)
##       Duration.of.Credit..month. < 40.5    to the left,  improve=2.520833, (0 missing)
##       Value.Savings.Stocks       < 3.5     to the left,  improve=2.101136, (0 missing)
##   Surrogate splits:
##       Sex...Marital.Status          < 1.5     to the left,  agree=0.750, adj=0.2, (0 split)
##       Age..years.                   < 26.5    to the left,  agree=0.750, adj=0.2, (0 split)
##       Credit.Amount                 < 2145.5  to the left,  agree=0.719, adj=0.1, (0 split)
##       Most.valuable.available.asset < 1.5     to the left,  agree=0.719, adj=0.1, (0 split)
## 
## Node number 19: 22 observations
##   predicted class=1  expected loss=0.1363636  P(node) =0.044
##     class counts:     3    19
##    probabilities: 0.136 0.864 
## 
## Node number 32: 38 observations
##   predicted class=0  expected loss=0.1052632  P(node) =0.076
##     class counts:    34     4
##    probabilities: 0.895 0.105 
## 
## Node number 33: 7 observations
##   predicted class=1  expected loss=0.2857143  P(node) =0.014
##     class counts:     2     5
##    probabilities: 0.286 0.714 
## 
## Node number 34: 26 observations
##   predicted class=0  expected loss=0.2692308  P(node) =0.052
##     class counts:    19     7
##    probabilities: 0.731 0.269 
## 
## Node number 35: 40 observations,    complexity param=0.01048951
##   predicted class=1  expected loss=0.325  P(node) =0.08
##     class counts:    13    27
##    probabilities: 0.325 0.675 
##   left son=70 (28 obs) right son=71 (12 obs)
##   Primary splits:
##       No.of.Credits.at.this.Bank    < 1.5     to the left,  improve=2.0023810, (0 missing)
##       Instalment.per.cent           < 2.5     to the right, improve=1.6409090, (0 missing)
##       Credit.Amount                 < 2480.5  to the left,  improve=1.6001250, (0 missing)
##       Length.of.current.employment  < 3.5     to the left,  improve=1.1283480, (0 missing)
##       Most.valuable.available.asset < 1.5     to the left,  improve=0.8166667, (0 missing)
##   Surrogate splits:
##       Credit.Amount                     < 5239.5  to the left,  agree=0.825, adj=0.417, (0 split)
##       Payment.Status.of.Previous.Credit < 2.5     to the left,  agree=0.800, adj=0.333, (0 split)
##       Duration.of.Credit..month.        < 16.5    to the right, agree=0.750, adj=0.167, (0 split)
##       Purpose                           < 1.5     to the right, agree=0.750, adj=0.167, (0 split)
##       Age..years.                       < 47.5    to the left,  agree=0.750, adj=0.167, (0 split)
## 
## Node number 36: 10 observations
##   predicted class=0  expected loss=0.2  P(node) =0.02
##     class counts:     8     2
##    probabilities: 0.800 0.200 
## 
## Node number 37: 22 observations
##   predicted class=1  expected loss=0.3181818  P(node) =0.044
##     class counts:     7    15
##    probabilities: 0.318 0.682 
## 
## Node number 70: 28 observations,    complexity param=0.01048951
##   predicted class=1  expected loss=0.4285714  P(node) =0.056
##     class counts:    12    16
##    probabilities: 0.429 0.571 
##   left son=140 (17 obs) right son=141 (11 obs)
##   Primary splits:
##       Instalment.per.cent         < 2.5     to the right, improve=2.2062640, (0 missing)
##       Duration.in.Current.address < 2.5     to the right, improve=1.6253970, (0 missing)
##       Age..years.                 < 33.5    to the right, improve=1.5645530, (0 missing)
##       Credit.Amount               < 2480.5  to the left,  improve=1.1428570, (0 missing)
##       Telephone                   < 1.5     to the right, improve=0.8642857, (0 missing)
##   Surrogate splits:
##       Credit.Amount              < 2480.5  to the left,  agree=0.821, adj=0.545, (0 split)
##       Duration.of.Credit..month. < 27      to the left,  agree=0.679, adj=0.182, (0 split)
##       Age..years.                < 23      to the right, agree=0.679, adj=0.182, (0 split)
##       Type.of.apartment          < 1.5     to the right, agree=0.679, adj=0.182, (0 split)
##       Purpose                    < 1.5     to the right, agree=0.643, adj=0.091, (0 split)
## 
## Node number 71: 12 observations
##   predicted class=1  expected loss=0.08333333  P(node) =0.024
##     class counts:     1    11
##    probabilities: 0.083 0.917 
## 
## Node number 140: 17 observations
##   predicted class=0  expected loss=0.4117647  P(node) =0.034
##     class counts:    10     7
##    probabilities: 0.588 0.412 
## 
## Node number 141: 11 observations
##   predicted class=1  expected loss=0.1818182  P(node) =0.022
##     class counts:     2     9
##    probabilities: 0.182 0.818
plot(model, margin = 0.1)
text(model)

download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', 'Test50.csv')

testset <- read.csv('Test50.csv')
testset$X <- NULL
testset$Creditability = as.factor(testset$Creditability)
#predict(model, testset)
predicted <- predict(model, testset, type = "class")

sum(predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.71
table(predicted, testset$Creditability)
##          
## predicted   0   1
##         0  64  52
##         1  93 291

ROC Curve

predicted <- predict(model, testset)
res <- as.factor(ifelse(predicted[,1] > 0.1, 0, 1 ))
tb <- table(testset$Creditability,res)

roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0,1,0.01)){
  res <- as.factor(ifelse(predicted[,1] >= i, 0, 1))

  tb <- table(testset$Creditability, res)
  if (ncol(tb) == 2){
    TP <- tb[1]
    FN <- tb[2]
    FP <- tb[3]
    TN <- tb[4]
    FPR <- FP / (FP + TN)
    TPR <- TP / (TP + FN)
    x <- FPR
    y <- TPR
    roc_x <- c(roc_x, x)
    roc_y <- c(roc_y, y)
  }
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x, roc_y, type='b')

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

## RandomForest

#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
forest <- randomForest(Creditability ~., data = trainset, ntree=200, importance=T, proximity=T)

forest.predicted <- predict(forest, testset)
sum(forest.predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.728
table(forest.predicted, testset$Creditability)
##                 
## forest.predicted   0   1
##                0  49  28
##                1 108 315
#install.packages('e1071')
library(e1071)
svm.model <- svm(Creditability~., data = trainset)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'Occupation' constant. Cannot scale data.
svm.model
## 
## Call:
## svm(formula = Creditability ~ ., data = trainset)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.05 
## 
## Number of Support Vectors:  500
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2] 
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability)) 
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2] 
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability)) 
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")


plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

資料分群

download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')
customers <- read.csv('customers.csv')
head(customers)
##   CustomerID  Genre Age Annual_Income Spending_Score
## 1          1   Male  19            15             39
## 2          2   Male  21            15             81
## 3          3 Female  20            16              6
## 4          4 Female  23            16             77
## 5          5 Female  31            17             40
## 6          6 Female  22            17             76
customers <- customers[    , c("Annual_Income", "Spending_Score")  ]

head(customers)
##   Annual_Income Spending_Score
## 1            15             39
## 2            15             81
## 3            16              6
## 4            16             77
## 5            17             40
## 6            17             76
set.seed(123)
sample.int(42, 6)
## [1] 13 33 17 35 36  2
set.seed(123)
kc <- kmeans(customers, 5)
kc
## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      27.40000       49.48000
## 2      79.00000       16.59259
## 3      55.90541       49.93243
## 4      86.53846       82.12821
## 5     109.70000       22.00000
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## 
## Within cluster sum of squares by cluster:
## [1] 48174.480  4062.519  7375.000 13444.051  2458.100
##  (between_SS / total_SS =  72.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(customers$Annual_Income, customers$Spending_Score, col = kc$cluster)
points(kc$centers[,1], kc$centers[,2], col='orange')

kc$centers
##   Annual_Income Spending_Score
## 1      27.40000       49.48000
## 2      79.00000       16.59259
## 3      55.90541       49.93243
## 4      86.53846       82.12821
## 5     109.70000       22.00000
# install.packages('cluster')
library(cluster)
kcs <- silhouette(kc$cluster, dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = kc$cluster, dist = dist(customers)) :
##  Cluster sizes and average silhouette widths:
##         50         27         74         39         10 
## 0.04103987 0.47253618 0.63162169 0.48851396 0.35704613 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4401  0.2089  0.5292  0.4209  0.6421  0.7556
plot(kcs)

#install.packages('fpc')
library(fpc)

nk <- 2:10
sapply(nk, function(e) e ^ 2)
## [1]   4   9  16  25  36  49  64  81 100
set.seed(123)

SW <- sapply(nk, function(k) {
  cluster.stats(dist(customers), kmeans(customers, centers=k)$cluster)$avg.silwidth
})
#SW
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")