Demo20180418

library(readr)
lvr_prices <- read_csv("/tmp/lvr_prices.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )

## See spec(...) for full column specifications.

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row col         expected   actual     file                  expected   <int> <chr>       <chr>      <chr>      <chr>                 actual 1  1282 total_price an integer 6700000000 '/tmp/lvr_prices.csv' file 2  2243 total_price an integer 3882685600 '/tmp/lvr_prices.csv' row 3  2244 total_price an integer 3373314400 '/tmp/lvr_prices.csv' col 4  4629 total_price an integer 3050000000 '/tmp/lvr_prices.csv' expected 5  5890 total_price an integer 3133800000 '/tmp/lvr_prices.csv'
## ... ................. ... ............................................................... ........ ............................................................... ...... ............................................................... .... ............................................................... ... ............................................................... ... ............................................................... ........ ...............................................................
## See problems(...) for more details.

head(lvr_prices)

## # A tibble: 6 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <chr>  <chr>          <chr>                   <dbl> <chr>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…        19.4  住            
## 2     1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121…         8.46 商            
## 3     2 大同區 土地           橋北段二小段601~630地號…         5.50 其他          
## 4     3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~…         3.88 商            
## 5     4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷…        32.4  住            
## 6     5 信義區 土地           福德段一小段661~690地號…         9.37 其他          
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>

grades  <- c(50,60,80,90,40,70)
classes <- c(1,1,2,2,1,2)
?tapply()

tapply(grades, classes, mean)

##  1  2 
## 50 80

price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, function(e) mean(e, na.rm=TRUE))

barplot(sort(price_per_sec, decreasing = TRUE), cex.names =0.5, cex.axis = 0.5, main = '區域房價', ylab = '房價', xlab = '區域', col='#65BAA3' )

temp <- sample.int(30, 100, replace=TRUE)
mean(temp)

## [1] 16.26

temp2<- c(temp,99,99,99)
mean(temp2)

## [1] 18.6699

grades <- c(50,40,60,70,80,90, 10000)
sort(grades)

## [1]    40    50    60    70    80    90 10000

median(grades)

## [1] 70

mean(grades)

## [1] 1484.286

boxplot(log(total_price) ~ area, data = lvr_prices, main= "房價箱型圖", xlab = "區域", ylab = "價格(log)", cex.axis = 0.6, cex.name = 0.6)

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 7 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 8 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 9 is not drawn

Dplyr

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

head(lvr_prices[ lvr_prices$area == '中山區'  ,   ])

## # A tibble: 6 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <chr>  <chr>          <chr>                   <dbl> <chr>         
## 1    13 中山區 房地(土地+建物)… 臺北市中山區合江街31~60號…        10.0  住            
## 2    14 中山區 房地(土地+建物)… 臺北市中山區中山北路二段183…        30.5  商            
## 3    16 中山區 房地(土地+建物)… 臺北市中山區吉林路361~39…         6.31 住            
## 4    17 中山區 土地           長安段三小段271~300地號…         4.20 其他          
## 5    24 中山區 房地(土地+建物)… 臺北市中山區林森北路485巷1…        17.6  商            
## 6    39 中山區 房地(土地+建物)… 臺北市中山區建國北路三段93巷…        27.9  住            
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>

head(lvr_prices[ lvr_prices$area == '中山區'  , c('area', 'total_price')   ])

## # A tibble: 6 x 2
##   area   total_price
##   <chr>        <int>
## 1 中山區     5960000
## 2 中山區    20200000
## 3 中山區     4050000
## 4 中山區     1900000
## 5 中山區    14800000
## 6 中山區    10200000

head(filter(lvr_prices, area == '中山區'))

## # A tibble: 6 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <chr>  <chr>          <chr>                   <dbl> <chr>         
## 1    13 中山區 房地(土地+建物)… 臺北市中山區合江街31~60號…        10.0  住            
## 2    14 中山區 房地(土地+建物)… 臺北市中山區中山北路二段183…        30.5  商            
## 3    16 中山區 房地(土地+建物)… 臺北市中山區吉林路361~39…         6.31 住            
## 4    17 中山區 土地           長安段三小段271~300地號…         4.20 其他          
## 5    24 中山區 房地(土地+建物)… 臺北市中山區林森北路485巷1…        17.6  商            
## 6    39 中山區 房地(土地+建物)… 臺北市中山區建國北路三段93巷…        27.9  住            
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>

head(select(lvr_prices, 'area', 'total_price'))

## # A tibble: 6 x 2
##   area   total_price
##   <chr>        <int>
## 1 大安區    18680000
## 2 中正區    20300000
## 3 大同區      132096
## 4 大同區     4200000
## 5 內湖區    14000000
## 6 信義區      255000

# SELECT area, total_price FROM lvr_prices WHERE area = '中山區' LIMIT 6

lvr_prices %>%
  filter(area == '中山區') %>%
  select('area', 'total_price') %>%
  head()

## # A tibble: 6 x 2
##   area   total_price
##   <chr>        <int>
## 1 中山區     5960000
## 2 中山區    20200000
## 3 中山區     4050000
## 4 中山區     1900000
## 5 中山區    14800000
## 6 中山區    10200000

# SELECT area, total_price FROM lvr_prices WHERE area = '中山區' ORDER BY total_price LIMIT 6
lvr_prices %>%
  filter(area == '中山區') %>%
  select('area', 'total_price') %>%
  arrange(total_price) %>%
  head()

## # A tibble: 6 x 2
##   area   total_price
##   <chr>        <int>
## 1 中山區           0
## 2 中山區           0
## 3 中山區       10860
## 4 中山區       16000
## 5 中山區       18060
## 6 中山區       21244

# SELECT area, total_price FROM lvr_prices WHERE area = '中山區' ORDER BY total_price DESC LIMIT 6
lvr_prices %>%
  filter(area == '中山區') %>%
  select('area', 'total_price') %>%
  arrange(desc(total_price)) %>%
  head()

## # A tibble: 6 x 2
##   area   total_price
##   <chr>        <int>
## 1 中山區  1850000000
## 2 中山區  1400000000
## 3 中山區  1084948034
## 4 中山區  1011136500
## 5 中山區   952875000
## 6 中山區   903865500

lvr_prices %>% head()

## # A tibble: 6 x 29
##      X1 area   trading_target address          land_sqmeter city_land_type
##   <int> <chr>  <chr>          <chr>                   <dbl> <chr>         
## 1     0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7…        19.4  住            
## 2     1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121…         8.46 商            
## 3     2 大同區 土地           橋北段二小段601~630地號…         5.50 其他          
## 4     3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~…         3.88 商            
## 5     4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷…        32.4  住            
## 6     5 信義區 土地           福德段一小段661~690地號…         9.37 其他          
## # ... with 23 more variables: non_city_land_type <chr>,
## #   non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## #   floor <chr>, total_floor <chr>, building_type <chr>,
## #   main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## #   building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## #   compartment <chr>, management <chr>, total_price <int>,
## #   price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## #   parking_price <int>, comments <chr>, numbers <chr>

lvr_prices$trading_ym <-  as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01' ))

## SELECT trading_ym, area, sum(total_price) FROM lvr_prices 
## GROUP BY trading_ym, area

lvr_stat <- lvr_prices %>%
  select(trading_ym, area, total_price) %>%
  filter(trading_ym >= '2012-01-01') %>%
  group_by(trading_ym, area) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE))

str(lvr_stat)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  631 obs. of  3 variables:
##  $ trading_ym   : Date, format: "2012-01-01" "2012-01-01" ...
##  $ area         : chr  "中山區" "中正區" "信義區" "內湖區" ...
##  $ overall_price: num  1.76e+07 2.58e+08 4.08e+07 3.50e+08 4.38e+07 ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of  5 variables:
##   ..$ row     : int  1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
##   ..$ col     : chr  "total_price" "total_price" "total_price" "total_price" ...
##   ..$ expected: chr  "an integer" "an integer" "an integer" "an integer" ...
##   ..$ actual  : chr  "6700000000" "3882685600" "3373314400" "3050000000" ...
##   ..$ file    : chr  "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 29
##   .. ..$ X1                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ area              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_target    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ address           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ land_sqmeter      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ city_land_type    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_land_type: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ non_city_code     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ trading_ymd       :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ trading_num       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ floor             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_floor       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ building_type     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ main_purpose      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ built_with        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ finish_ymd        :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ building_sqmeter  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ room              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ living_room       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ bath              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ compartment       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ management        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ total_price       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ price_per_sqmeter : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_type      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ parking_sqmeter   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ parking_price     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ comments          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ numbers           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
##  - attr(*, "vars")= chr "trading_ym"
##  - attr(*, "drop")= logi TRUE

lvr_stat$area <- as.factor(lvr_stat$area)

par(mfrow = c(3,4))
for( a in levels(lvr_stat$area)){
  plot(overall_price ~ trading_ym, data = lvr_stat[lvr_stat$area == a, ], type='l', main = a )
}

library(tidyr)
price_pivot <- spread(lvr_stat,trading_ym, overall_price, fill = 0)
#price_pivot
write.csv(price_pivot, 'taipei_house_price.csv')

R 與資料庫

data(iris)
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

library(RJDBC)

## Loading required package: DBI

## Loading required package: rJava

jar.loc <- '/tmp/mysql-connector-java-5.1.46-bin.jar'

drv <- JDBC("com.mysql.jdbc.Driver",
           jar.loc,
           identifier.quote="`")

conn <- dbConnect(drv, "jdbc:mysql://localhost/mydb?characterEncoding=utf8", "cathaybk", "largitdata")

dbWriteTable(conn, 'iris_david', iris)

## [1] TRUE

dbListTables(conn)

## [1] "iris_david"        "lvr_prices_david2" "lvr_prices_david3"

irisdata <- dbGetQuery(conn,"select * from iris_david where Species = 'setosa' limit 3;")
irisdata

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa

if(dbExistsTable(conn,'iris_david')){
     dbRemoveTable(conn, 'iris_david')
}

## logical(0)

#dbWriteTable(conn, "lvr_prices_david2", lvr_prices[,c('area', 'address')])
#dbListTables(conn)

#lvrdata <- dbGetQuery(conn,"select * from lvr_prices_david2 limit 3")
#lvrdata


dbWriteTable(conn, 'lvr_prices_david3', lvr_prices[lvr_prices$area == '中山區', c('area', 'address')])

## [1] TRUE

dbWriteTable(conn, 'lvr_prices_david3', lvr_prices[lvr_prices$area == '大安區', c('area', 'address')], append=TRUE,row.names=FALSE,overwrite=FALSE)

## [1] TRUE

lvrdata <- dbGetQuery(conn,"select distinct(area) from lvr_prices_david3")
lvrdata

##     area
## 1 中山區
## 2 大安區

dbDisconnect(conn)

## [1] TRUE

資料分類

data(iris)
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)

plot(fit, margin=0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col= iris$Species)
abline(v = 2.45, col='blue')
abline(h = 1.75, col='orange')

predicted <- predict(fit, iris[, 1:4], type= 'class')
sum(predicted == iris$Species) / length(iris$Species)

## [1] 0.96

a <- c(1,2,1,1,2)
table(a)

## a
## 1 2 
## 3 2

b <- c(1,2,1,2,1)
table(a,b)

##    b
## a   1 2
##   1 2 1
##   2 1 1

table(iris$Species, predicted)

##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45

a   <- c(15,23,44,6,77,22,33,44)
idx <- c(1 ,1 ,1 ,1, 1,2 , 2, 2)
idx == 1

## [1]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE

a[idx == 1]

## [1] 15 23 44  6 77

a[idx == 2]

## [1] 22 33 44

set.seed(123)
sample.int(42, 6)

## [1] 13 33 17 35 36  2

# Holdout 驗證 - 利用70 % 的資料建立模型, 30% 當成是驗證用的資料
set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7, 0.3))

trainset <- iris[idx == 1, ]
testset  <- iris[idx == 2, ]

dim(trainset)

## [1] 106   5

dim(testset)

## [1] 44  5

fit2 <- rpart(Species ~ ., data = trainset)
plot(fit2, margin = 0.1)
text(fit2)

predicted <- predict(fit2, testset[,1:4], type= 'class')
table(testset$Species, predicted)

##             predicted
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         4
##   virginica       0          1        14

sum(testset$Species == predicted) / length(testset$Species)

## [1] 0.8863636

# K-fold Cross Validation - 10-fold 每次利用 k-1 份的資料建立模型, 拿 第k 份資料測試我們的模型

set.seed(123)
idx <- sample.int(10, nrow(iris), replace=TRUE)

models     <- c()
accuracies <- c()
for(i in 1:10){
  trainset <- iris[idx != i,  ]
  testset  <- iris[idx == i,  ]
  fit <- rpart(Species ~., data = trainset)
  models <- c(models, fit)
  predicted <- predict(fit, testset, type= 'class')
  acc <- sum(predicted == testset$Species) / length(testset$Species)
  accuracies <- c(accuracies, acc)
}


accuracies

##  [1] 1.0000000 1.0000000 1.0000000 0.9333333 0.9500000 0.9090909 0.8823529
##  [8] 0.7333333 1.0000000 0.8571429

邏輯式迴歸

data(anscombe)

fit <- glm(y1 ~ x1, data = anscombe, family = 'gaussian')
fit

## 
## Call:  glm(formula = y1 ~ x1, family = "gaussian", data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001  
## 
## Degrees of Freedom: 10 Total (i.e. Null);  9 Residual
## Null Deviance:       41.27 
## Residual Deviance: 13.76     AIC: 39.68

plot(y1 ~ x1, data = anscombe)
abline(fit, col= 'red')

dataset <- iris[iris$Species != 'setosa', ]
dim(dataset)

## [1] 100   5

dataset$Species <- factor(dataset$Species)
dataset$Species

##   [1] versicolor versicolor versicolor versicolor versicolor versicolor
##   [7] versicolor versicolor versicolor versicolor versicolor versicolor
##  [13] versicolor versicolor versicolor versicolor versicolor versicolor
##  [19] versicolor versicolor versicolor versicolor versicolor versicolor
##  [25] versicolor versicolor versicolor versicolor versicolor versicolor
##  [31] versicolor versicolor versicolor versicolor versicolor versicolor
##  [37] versicolor versicolor versicolor versicolor versicolor versicolor
##  [43] versicolor versicolor versicolor versicolor versicolor versicolor
##  [49] versicolor versicolor virginica  virginica  virginica  virginica 
##  [55] virginica  virginica  virginica  virginica  virginica  virginica 
##  [61] virginica  virginica  virginica  virginica  virginica  virginica 
##  [67] virginica  virginica  virginica  virginica  virginica  virginica 
##  [73] virginica  virginica  virginica  virginica  virginica  virginica 
##  [79] virginica  virginica  virginica  virginica  virginica  virginica 
##  [85] virginica  virginica  virginica  virginica  virginica  virginica 
##  [91] virginica  virginica  virginica  virginica  virginica  virginica 
##  [97] virginica  virginica  virginica  virginica 
## Levels: versicolor virginica

fit <- glm(Species ~ ., data = dataset, family = 'binomial' )
summary(fit)

## 
## Call:
## glm(formula = Species ~ ., family = "binomial", data = dataset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01105  -0.00541  -0.00001   0.00677   1.78065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -42.638     25.707  -1.659   0.0972 .
## Sepal.Length   -2.465      2.394  -1.030   0.3032  
## Sepal.Width    -6.681      4.480  -1.491   0.1359  
## Petal.Length    9.429      4.737   1.991   0.0465 *
## Petal.Width    18.286      9.743   1.877   0.0605 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.629  on 99  degrees of freedom
## Residual deviance:  11.899  on 95  degrees of freedom
## AIC: 21.899
## 
## Number of Fisher Scoring iterations: 10

predicted <- predict(fit, dataset)
pred <- factor(ifelse(predicted < 0 , 'versicolor', 'virginica'))
table(dataset$Species, pred)

##             pred
##              versicolor virginica
##   versicolor         49         1
##   virginica           1        49

SVM

library(e1071)
fit <- svm(Species ~., data = iris)
predicted <- predict(fit, iris[,1:4])
table(iris$Species, predicted)

##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

信用風險評估

# download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', destfile = 'Training50.csv')
# download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', destfile = 'Test50.csv')

library(readr)
Training50 <- read_csv("/tmp/Training50.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_integer()
## )

## See spec(...) for full column specifications.

head(Training50)

## # A tibble: 6 x 22
##      X1 Creditability Account.Balance Duration.of.Credi… Payment.Status.o…
##   <int>         <int>           <int>              <int>             <int>
## 1   497             1               3                  6                 2
## 2   756             0               1                 15                 1
## 3   580             0               1                 42                 2
## 4   833             0               3                 36                 3
## 5   602             1               3                 24                 3
## 6   734             1               1                 15                 2
## # ... with 17 more variables: Purpose <int>, Credit.Amount <int>,
## #   Value.Savings.Stocks <int>, Length.of.current.employment <int>,
## #   Instalment.per.cent <int>, Sex...Marital.Status <int>,
## #   Guarantors <int>, Duration.in.Current.address <int>,
## #   Most.valuable.available.asset <int>, Age..years. <int>,
## #   Concurrent.Credits <int>, Type.of.apartment <int>,
## #   No.of.Credits.at.this.Bank <int>, Occupation <int>,
## #   No.of.dependents <int>, Telephone <int>, Foreign.Worker <int>

Training50$X1 <- NULL
Training50$Creditability <- as.factor(Training50$Creditability)

head(Training50)

## # A tibble: 6 x 21
##   Creditability Account.Balance Duration.of.Cred… Payment.Status.… Purpose
##   <fct>                   <int>             <int>            <int>   <int>
## 1 1                           3                 6                2       3
## 2 0                           1                15                1       4
## 3 0                           1                42                2       3
## 4 0                           3                36                3       4
## 5 1                           3                24                3       2
## 6 1                           1                15                2       4
## # ... with 16 more variables: Credit.Amount <int>,
## #   Value.Savings.Stocks <int>, Length.of.current.employment <int>,
## #   Instalment.per.cent <int>, Sex...Marital.Status <int>,
## #   Guarantors <int>, Duration.in.Current.address <int>,
## #   Most.valuable.available.asset <int>, Age..years. <int>,
## #   Concurrent.Credits <int>, Type.of.apartment <int>,
## #   No.of.Credits.at.this.Bank <int>, Occupation <int>,
## #   No.of.dependents <int>, Telephone <int>, Foreign.Worker <int>

model <- rpart(Creditability ~ ., data = Training50 )

plot(model, margin = 0.1)
text(model, pretty = 0,cex = 0.6)

library(readr)
Test50 <- read_csv("/tmp/Test50.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_integer()
## )
## See spec(...) for full column specifications.

Test50$Creditability <- as.factor(Test50$Creditability)
head(Test50)

## # A tibble: 6 x 22
##      X1 Creditability Account.Balance Duration.of.Credi… Payment.Status.o…
##   <int> <fct>                   <int>              <int>             <int>
## 1     2 1                           1                  9                 3
## 2     4 1                           1                 12                 3
## 3    10 1                           2                 24                 2
## 4    11 1                           1                 11                 3
## 5    13 1                           1                  6                 3
## 6    16 1                           1                  6                 2
## # ... with 17 more variables: Purpose <int>, Credit.Amount <int>,
## #   Value.Savings.Stocks <int>, Length.of.current.employment <int>,
## #   Instalment.per.cent <int>, Sex...Marital.Status <int>,
## #   Guarantors <int>, Duration.in.Current.address <int>,
## #   Most.valuable.available.asset <int>, Age..years. <int>,
## #   Concurrent.Credits <int>, Type.of.apartment <int>,
## #   No.of.Credits.at.this.Bank <int>, Occupation <int>,
## #   No.of.dependents <int>, Telephone <int>, Foreign.Worker <int>

predicted <- predict(model, Test50, type = 'class')
sum(predicted == Test50$Creditability) / length(Test50$Creditability)

## [1] 0.71

table(Test50$Creditability, predicted)

##    predicted
##       0   1
##   0  64  93
##   1  52 291

predicted <- predict(model, Test50)
#predicted[,2]
#res <- ifelse(predicted[,2] > 0.5, 1, 0)
#res <- factor(res)

#res <- ifelse(predicted[,2] > 0.5, 1, 0)
#res <- factor(res)
#tb <- table(Test50$Creditability, res)
#tb
#tp <- tb[1,1]
#fn <- tb[2,1]
#fp <- tb[1,2]
#tn <- tb[2,2]
#tpr <- tp / (tp+fn)
#fpr <- fp / (fp+tn)
#tpr
#fpr

fpr_ary <- c(0)
tpr_ary <- c(0)
for (thresh in seq(0,1,0.1)){
  res <- as.factor(ifelse(predicted[,1]>= thresh, 0, 1 ))
  tb  <- table(Test50$Creditability, res)
  if (ncol(tb) == 2){
    tp  <- tb[1,1]
    fp  <- tb[1,2]
    fn  <- tb[2,1]
    tn  <- tb[2,2]
    FPR <- fp / (fp + tn)
    TPR <- tp / (tp + fn)
    fpr_ary <- c(fpr_ary, FPR)
    tpr_ary <- c(tpr_ary, TPR)
  }
}
fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)
plot(fpr_ary, tpr_ary, type= 'b')

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions <- predict(model, Test50, type="prob")
pred.to.roc <- predictions[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(Test50$Creditability)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

forest <- randomForest(Creditability ~., data = Training50, ntree = 200, importance=T, proximity=T)

forest.predicted <- predict(forest, Test50, type = 'class')
sum(Test50$Creditability == forest.predicted) / length(Test50$Creditability)

## [1] 0.728

tb <- table(Test50$Creditability, forest.predicted)

# 決策樹
predictions1 <- predict(model, Test50, type="prob")
pred.to.roc1 <- predictions1[, 2] 
pred.rocr1 <- prediction(pred.to.roc1, as.factor(Test50$Creditability)) 
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

# 隨機森林
predictions2 <- predict(forest, Test50, type="prob")
pred.to.roc2 <- predictions2[, 2] 
pred.rocr2 <- prediction(pred.to.roc2, as.factor(Test50$Creditability)) 
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")



plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

Demo20180418

David Chiu

2018/4/18

Dplyr

R 與資料庫

資料分類

邏輯式迴歸

SVM

信用風險評估