library(readr)
lvr_prices <- read_csv("/tmp/lvr_prices.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1282 total_price an integer 6700000000 '/tmp/lvr_prices.csv' file 2 2243 total_price an integer 3882685600 '/tmp/lvr_prices.csv' row 3 2244 total_price an integer 3373314400 '/tmp/lvr_prices.csv' col 4 4629 total_price an integer 3050000000 '/tmp/lvr_prices.csv' expected 5 5890 total_price an integer 3133800000 '/tmp/lvr_prices.csv'
## ... ................. ... ............................................................... ........ ............................................................... ...... ............................................................... .... ............................................................... ... ............................................................... ... ............................................................... ........ ...............................................................
## See problems(...) for more details.
head(lvr_prices)
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## 2 1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121… 8.46 商
## 3 2 大同區 土地 橋北段二小段601~630地號… 5.50 其他
## 4 3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~… 3.88 商
## 5 4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷… 32.4 住
## 6 5 信義區 土地 福德段一小段661~690地號… 9.37 其他
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
grades <- c(50,60,80,90,40,70)
classes <- c(1,1,2,2,1,2)
?tapply()
tapply(grades, classes, mean)
## 1 2
## 50 80
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, function(e) mean(e, na.rm=TRUE))
barplot(sort(price_per_sec, decreasing = TRUE), cex.names =0.5, cex.axis = 0.5, main = '區域房價', ylab = '房價', xlab = '區域', col='#65BAA3' )

temp <- sample.int(30, 100, replace=TRUE)
mean(temp)
## [1] 16.26
temp2<- c(temp,99,99,99)
mean(temp2)
## [1] 18.6699
grades <- c(50,40,60,70,80,90, 10000)
sort(grades)
## [1] 40 50 60 70 80 90 10000
median(grades)
## [1] 70
mean(grades)
## [1] 1484.286
boxplot(log(total_price) ~ area, data = lvr_prices, main= "房價箱型圖", xlab = "區域", ylab = "價格(log)", cex.axis = 0.6, cex.name = 0.6)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 7 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 8 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 9 is not drawn

Dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
head(lvr_prices[ lvr_prices$area == '中山區' , ])
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 13 中山區 房地(土地+建物)… 臺北市中山區合江街31~60號… 10.0 住
## 2 14 中山區 房地(土地+建物)… 臺北市中山區中山北路二段183… 30.5 商
## 3 16 中山區 房地(土地+建物)… 臺北市中山區吉林路361~39… 6.31 住
## 4 17 中山區 土地 長安段三小段271~300地號… 4.20 其他
## 5 24 中山區 房地(土地+建物)… 臺北市中山區林森北路485巷1… 17.6 商
## 6 39 中山區 房地(土地+建物)… 臺北市中山區建國北路三段93巷… 27.9 住
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
head(lvr_prices[ lvr_prices$area == '中山區' , c('area', 'total_price') ])
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 5960000
## 2 中山區 20200000
## 3 中山區 4050000
## 4 中山區 1900000
## 5 中山區 14800000
## 6 中山區 10200000
head(filter(lvr_prices, area == '中山區'))
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 13 中山區 房地(土地+建物)… 臺北市中山區合江街31~60號… 10.0 住
## 2 14 中山區 房地(土地+建物)… 臺北市中山區中山北路二段183… 30.5 商
## 3 16 中山區 房地(土地+建物)… 臺北市中山區吉林路361~39… 6.31 住
## 4 17 中山區 土地 長安段三小段271~300地號… 4.20 其他
## 5 24 中山區 房地(土地+建物)… 臺北市中山區林森北路485巷1… 17.6 商
## 6 39 中山區 房地(土地+建物)… 臺北市中山區建國北路三段93巷… 27.9 住
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
head(select(lvr_prices, 'area', 'total_price'))
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 大安區 18680000
## 2 中正區 20300000
## 3 大同區 132096
## 4 大同區 4200000
## 5 內湖區 14000000
## 6 信義區 255000
# SELECT area, total_price FROM lvr_prices WHERE area = '中山區' LIMIT 6
lvr_prices %>%
filter(area == '中山區') %>%
select('area', 'total_price') %>%
head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 5960000
## 2 中山區 20200000
## 3 中山區 4050000
## 4 中山區 1900000
## 5 中山區 14800000
## 6 中山區 10200000
# SELECT area, total_price FROM lvr_prices WHERE area = '中山區' ORDER BY total_price LIMIT 6
lvr_prices %>%
filter(area == '中山區') %>%
select('area', 'total_price') %>%
arrange(total_price) %>%
head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 0
## 2 中山區 0
## 3 中山區 10860
## 4 中山區 16000
## 5 中山區 18060
## 6 中山區 21244
# SELECT area, total_price FROM lvr_prices WHERE area = '中山區' ORDER BY total_price DESC LIMIT 6
lvr_prices %>%
filter(area == '中山區') %>%
select('area', 'total_price') %>%
arrange(desc(total_price)) %>%
head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 1850000000
## 2 中山區 1400000000
## 3 中山區 1084948034
## 4 中山區 1011136500
## 5 中山區 952875000
## 6 中山區 903865500
lvr_prices %>% head()
## # A tibble: 6 x 29
## X1 area trading_target address land_sqmeter city_land_type
## <int> <chr> <chr> <chr> <dbl> <chr>
## 1 0 大安區 房地(土地+建物)… 臺北市大安區和平東路三段1巷7… 19.4 住
## 2 1 中正區 房地(土地+建物)… 臺北市中正區忠孝東路二段121… 8.46 商
## 3 2 大同區 土地 橋北段二小段601~630地號… 5.50 其他
## 4 3 大同區 房地(土地+建物)… 臺北市大同區重慶北路一段61~… 3.88 商
## 5 4 內湖區 房地(土地+建物)… 臺北市內湖區民權東路六段90巷… 32.4 住
## 6 5 信義區 土地 福德段一小段661~690地號… 9.37 其他
## # ... with 23 more variables: non_city_land_type <chr>,
## # non_city_code <chr>, trading_ymd <date>, trading_num <chr>,
## # floor <chr>, total_floor <chr>, building_type <chr>,
## # main_purpose <chr>, built_with <chr>, finish_ymd <date>,
## # building_sqmeter <dbl>, room <int>, living_room <int>, bath <int>,
## # compartment <chr>, management <chr>, total_price <int>,
## # price_per_sqmeter <dbl>, parking_type <chr>, parking_sqmeter <dbl>,
## # parking_price <int>, comments <chr>, numbers <chr>
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01' ))
## SELECT trading_ym, area, sum(total_price) FROM lvr_prices
## GROUP BY trading_ym, area
lvr_stat <- lvr_prices %>%
select(trading_ym, area, total_price) %>%
filter(trading_ym >= '2012-01-01') %>%
group_by(trading_ym, area) %>%
summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE))
str(lvr_stat)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 631 obs. of 3 variables:
## $ trading_ym : Date, format: "2012-01-01" "2012-01-01" ...
## $ area : chr "中山區" "中正區" "信義區" "內湖區" ...
## $ overall_price: num 1.76e+07 2.58e+08 4.08e+07 3.50e+08 4.38e+07 ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 32 obs. of 5 variables:
## ..$ row : int 1282 2243 2244 4629 5890 7153 7522 9777 10596 10714 ...
## ..$ col : chr "total_price" "total_price" "total_price" "total_price" ...
## ..$ expected: chr "an integer" "an integer" "an integer" "an integer" ...
## ..$ actual : chr "6700000000" "3882685600" "3373314400" "3050000000" ...
## ..$ file : chr "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" "'/tmp/lvr_prices.csv'" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 29
## .. ..$ X1 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ area : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_target : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ land_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ city_land_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_land_type: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ non_city_code : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ trading_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ trading_num : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_floor : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ building_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ main_purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ built_with : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ finish_ymd :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ building_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ living_room : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bath : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ compartment : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ management : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ total_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ price_per_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ parking_sqmeter : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ parking_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ comments : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ numbers : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
## - attr(*, "vars")= chr "trading_ym"
## - attr(*, "drop")= logi TRUE
lvr_stat$area <- as.factor(lvr_stat$area)
par(mfrow = c(3,4))
for( a in levels(lvr_stat$area)){
plot(overall_price ~ trading_ym, data = lvr_stat[lvr_stat$area == a, ], type='l', main = a )
}

library(tidyr)
price_pivot <- spread(lvr_stat,trading_ym, overall_price, fill = 0)
#price_pivot
write.csv(price_pivot, 'taipei_house_price.csv')
R 與資料庫
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
library(RJDBC)
## Loading required package: DBI
## Loading required package: rJava
jar.loc <- '/tmp/mysql-connector-java-5.1.46-bin.jar'
drv <- JDBC("com.mysql.jdbc.Driver",
jar.loc,
identifier.quote="`")
conn <- dbConnect(drv, "jdbc:mysql://localhost/mydb?characterEncoding=utf8", "cathaybk", "largitdata")
dbWriteTable(conn, 'iris_david', iris)
## [1] TRUE
dbListTables(conn)
## [1] "iris_david" "lvr_prices_david2" "lvr_prices_david3"
irisdata <- dbGetQuery(conn,"select * from iris_david where Species = 'setosa' limit 3;")
irisdata
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
if(dbExistsTable(conn,'iris_david')){
dbRemoveTable(conn, 'iris_david')
}
## logical(0)
#dbWriteTable(conn, "lvr_prices_david2", lvr_prices[,c('area', 'address')])
#dbListTables(conn)
#lvrdata <- dbGetQuery(conn,"select * from lvr_prices_david2 limit 3")
#lvrdata
dbWriteTable(conn, 'lvr_prices_david3', lvr_prices[lvr_prices$area == '中山區', c('area', 'address')])
## [1] TRUE
dbWriteTable(conn, 'lvr_prices_david3', lvr_prices[lvr_prices$area == '大安區', c('area', 'address')], append=TRUE,row.names=FALSE,overwrite=FALSE)
## [1] TRUE
lvrdata <- dbGetQuery(conn,"select distinct(area) from lvr_prices_david3")
lvrdata
## area
## 1 中山區
## 2 大安區
dbDisconnect(conn)
## [1] TRUE
資料分類
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
plot(fit, margin=0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col= iris$Species)
abline(v = 2.45, col='blue')
abline(h = 1.75, col='orange')

predicted <- predict(fit, iris[, 1:4], type= 'class')
sum(predicted == iris$Species) / length(iris$Species)
## [1] 0.96
a <- c(1,2,1,1,2)
table(a)
## a
## 1 2
## 3 2
b <- c(1,2,1,2,1)
table(a,b)
## b
## a 1 2
## 1 2 1
## 2 1 1
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
a <- c(15,23,44,6,77,22,33,44)
idx <- c(1 ,1 ,1 ,1, 1,2 , 2, 2)
idx == 1
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
a[idx == 1]
## [1] 15 23 44 6 77
a[idx == 2]
## [1] 22 33 44
set.seed(123)
sample.int(42, 6)
## [1] 13 33 17 35 36 2
# Holdout 驗證 - 利用70 % 的資料建立模型, 30% 當成是驗證用的資料
set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7, 0.3))
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
fit2 <- rpart(Species ~ ., data = trainset)
plot(fit2, margin = 0.1)
text(fit2)

predicted <- predict(fit2, testset[,1:4], type= 'class')
table(testset$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 4
## virginica 0 1 14
sum(testset$Species == predicted) / length(testset$Species)
## [1] 0.8863636
# K-fold Cross Validation - 10-fold 每次利用 k-1 份的資料建立模型, 拿 第k 份資料測試我們的模型
set.seed(123)
idx <- sample.int(10, nrow(iris), replace=TRUE)
models <- c()
accuracies <- c()
for(i in 1:10){
trainset <- iris[idx != i, ]
testset <- iris[idx == i, ]
fit <- rpart(Species ~., data = trainset)
models <- c(models, fit)
predicted <- predict(fit, testset, type= 'class')
acc <- sum(predicted == testset$Species) / length(testset$Species)
accuracies <- c(accuracies, acc)
}
accuracies
## [1] 1.0000000 1.0000000 1.0000000 0.9333333 0.9500000 0.9090909 0.8823529
## [8] 0.7333333 1.0000000 0.8571429
邏輯式迴歸
data(anscombe)
fit <- glm(y1 ~ x1, data = anscombe, family = 'gaussian')
fit
##
## Call: glm(formula = y1 ~ x1, family = "gaussian", data = anscombe)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001
##
## Degrees of Freedom: 10 Total (i.e. Null); 9 Residual
## Null Deviance: 41.27
## Residual Deviance: 13.76 AIC: 39.68
plot(y1 ~ x1, data = anscombe)
abline(fit, col= 'red')

dataset <- iris[iris$Species != 'setosa', ]
dim(dataset)
## [1] 100 5
dataset$Species <- factor(dataset$Species)
dataset$Species
## [1] versicolor versicolor versicolor versicolor versicolor versicolor
## [7] versicolor versicolor versicolor versicolor versicolor versicolor
## [13] versicolor versicolor versicolor versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] versicolor versicolor versicolor versicolor versicolor versicolor
## [37] versicolor versicolor versicolor versicolor versicolor versicolor
## [43] versicolor versicolor versicolor versicolor versicolor versicolor
## [49] versicolor versicolor virginica virginica virginica virginica
## [55] virginica virginica virginica virginica virginica virginica
## [61] virginica virginica virginica virginica virginica virginica
## [67] virginica virginica virginica virginica virginica virginica
## [73] virginica virginica virginica virginica virginica virginica
## [79] virginica virginica virginica virginica virginica virginica
## [85] virginica virginica virginica virginica virginica virginica
## [91] virginica virginica virginica virginica virginica virginica
## [97] virginica virginica virginica virginica
## Levels: versicolor virginica
fit <- glm(Species ~ ., data = dataset, family = 'binomial' )
summary(fit)
##
## Call:
## glm(formula = Species ~ ., family = "binomial", data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01105 -0.00541 -0.00001 0.00677 1.78065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.638 25.707 -1.659 0.0972 .
## Sepal.Length -2.465 2.394 -1.030 0.3032
## Sepal.Width -6.681 4.480 -1.491 0.1359
## Petal.Length 9.429 4.737 1.991 0.0465 *
## Petal.Width 18.286 9.743 1.877 0.0605 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.629 on 99 degrees of freedom
## Residual deviance: 11.899 on 95 degrees of freedom
## AIC: 21.899
##
## Number of Fisher Scoring iterations: 10
predicted <- predict(fit, dataset)
pred <- factor(ifelse(predicted < 0 , 'versicolor', 'virginica'))
table(dataset$Species, pred)
## pred
## versicolor virginica
## versicolor 49 1
## virginica 1 49
SVM
library(e1071)
fit <- svm(Species ~., data = iris)
predicted <- predict(fit, iris[,1:4])
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
信用風險評估
# download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', destfile = 'Training50.csv')
# download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', destfile = 'Test50.csv')
library(readr)
Training50 <- read_csv("/tmp/Training50.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
head(Training50)
## # A tibble: 6 x 22
## X1 Creditability Account.Balance Duration.of.Credi… Payment.Status.o…
## <int> <int> <int> <int> <int>
## 1 497 1 3 6 2
## 2 756 0 1 15 1
## 3 580 0 1 42 2
## 4 833 0 3 36 3
## 5 602 1 3 24 3
## 6 734 1 1 15 2
## # ... with 17 more variables: Purpose <int>, Credit.Amount <int>,
## # Value.Savings.Stocks <int>, Length.of.current.employment <int>,
## # Instalment.per.cent <int>, Sex...Marital.Status <int>,
## # Guarantors <int>, Duration.in.Current.address <int>,
## # Most.valuable.available.asset <int>, Age..years. <int>,
## # Concurrent.Credits <int>, Type.of.apartment <int>,
## # No.of.Credits.at.this.Bank <int>, Occupation <int>,
## # No.of.dependents <int>, Telephone <int>, Foreign.Worker <int>
Training50$X1 <- NULL
Training50$Creditability <- as.factor(Training50$Creditability)
head(Training50)
## # A tibble: 6 x 21
## Creditability Account.Balance Duration.of.Cred… Payment.Status.… Purpose
## <fct> <int> <int> <int> <int>
## 1 1 3 6 2 3
## 2 0 1 15 1 4
## 3 0 1 42 2 3
## 4 0 3 36 3 4
## 5 1 3 24 3 2
## 6 1 1 15 2 4
## # ... with 16 more variables: Credit.Amount <int>,
## # Value.Savings.Stocks <int>, Length.of.current.employment <int>,
## # Instalment.per.cent <int>, Sex...Marital.Status <int>,
## # Guarantors <int>, Duration.in.Current.address <int>,
## # Most.valuable.available.asset <int>, Age..years. <int>,
## # Concurrent.Credits <int>, Type.of.apartment <int>,
## # No.of.Credits.at.this.Bank <int>, Occupation <int>,
## # No.of.dependents <int>, Telephone <int>, Foreign.Worker <int>
model <- rpart(Creditability ~ ., data = Training50 )
plot(model, margin = 0.1)
text(model, pretty = 0,cex = 0.6)

library(readr)
Test50 <- read_csv("/tmp/Test50.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
Test50$Creditability <- as.factor(Test50$Creditability)
head(Test50)
## # A tibble: 6 x 22
## X1 Creditability Account.Balance Duration.of.Credi… Payment.Status.o…
## <int> <fct> <int> <int> <int>
## 1 2 1 1 9 3
## 2 4 1 1 12 3
## 3 10 1 2 24 2
## 4 11 1 1 11 3
## 5 13 1 1 6 3
## 6 16 1 1 6 2
## # ... with 17 more variables: Purpose <int>, Credit.Amount <int>,
## # Value.Savings.Stocks <int>, Length.of.current.employment <int>,
## # Instalment.per.cent <int>, Sex...Marital.Status <int>,
## # Guarantors <int>, Duration.in.Current.address <int>,
## # Most.valuable.available.asset <int>, Age..years. <int>,
## # Concurrent.Credits <int>, Type.of.apartment <int>,
## # No.of.Credits.at.this.Bank <int>, Occupation <int>,
## # No.of.dependents <int>, Telephone <int>, Foreign.Worker <int>
predicted <- predict(model, Test50, type = 'class')
sum(predicted == Test50$Creditability) / length(Test50$Creditability)
## [1] 0.71
table(Test50$Creditability, predicted)
## predicted
## 0 1
## 0 64 93
## 1 52 291
predicted <- predict(model, Test50)
#predicted[,2]
#res <- ifelse(predicted[,2] > 0.5, 1, 0)
#res <- factor(res)
#res <- ifelse(predicted[,2] > 0.5, 1, 0)
#res <- factor(res)
#tb <- table(Test50$Creditability, res)
#tb
#tp <- tb[1,1]
#fn <- tb[2,1]
#fp <- tb[1,2]
#tn <- tb[2,2]
#tpr <- tp / (tp+fn)
#fpr <- fp / (fp+tn)
#tpr
#fpr
fpr_ary <- c(0)
tpr_ary <- c(0)
for (thresh in seq(0,1,0.1)){
res <- as.factor(ifelse(predicted[,1]>= thresh, 0, 1 ))
tb <- table(Test50$Creditability, res)
if (ncol(tb) == 2){
tp <- tb[1,1]
fp <- tb[1,2]
fn <- tb[2,1]
tn <- tb[2,2]
FPR <- fp / (fp + tn)
TPR <- tp / (tp + fn)
fpr_ary <- c(fpr_ary, FPR)
tpr_ary <- c(tpr_ary, TPR)
}
}
fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)
plot(fpr_ary, tpr_ary, type= 'b')

library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(model, Test50, type="prob")
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(Test50$Creditability))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
forest <- randomForest(Creditability ~., data = Training50, ntree = 200, importance=T, proximity=T)
forest.predicted <- predict(forest, Test50, type = 'class')
sum(Test50$Creditability == forest.predicted) / length(Test50$Creditability)
## [1] 0.728
tb <- table(Test50$Creditability, forest.predicted)
# 決策樹
predictions1 <- predict(model, Test50, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(Test50$Creditability))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
# 隨機森林
predictions2 <- predict(forest, Test50, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(Test50$Creditability))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)
