點我前往預告片
前言: 如何應用有限資料創造大數據的力量 以小雜貨店為例,帶入情境介紹預測性模型的基本觀念,以及如何應用。並以雜貨店老闆的提問,來帶入我們要預測的目標為何。 之後我們降利用此資料來預測顧客下一期行為 Y: 預測顧客會不會來買以及會買多少錢 購買金額 (amount) 基本資料檢視、資料視覺化,可以幫助我們快速了解這筆資料。 購買與否 (Buy) X: 如何重新彙整顧客資料以及產品銷售資訊
Chapter 1: 資料彙整流程
Zrm(list=ls(all=T))
Warning messages:
1: running command '"C:/Program Files/RStudio/bin/pandoc/pandoc" +RTS -K512m -RTS "C:/Football data/predict_model.utf8.md" --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output pandoc9bc3e004a37.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template "C:\Users\Albert\Documents\R\win-library\3.4\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable "theme:bootstrap" --include-in-header "C:\Users\Albert\AppData\Local\Temp\RtmpUzgFi3\rmarkdown-str9bc4b2d6534.html" --mathjax --variable "mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --variable code_folding=show --variable "source_embed=predict model.Rmd" --include-after-body "C:\Users\Albert\AppData\Local\Temp\RtmpUzgFi3\file9bc17006259.html" --variable code_menu=1 --variable kable-scroll=1' had status 67
2: running command '"C:/Program Files/RStudio/bin/pandoc/pandoc" +RTS -K512m -RTS "C:/Football data/predict_model.utf8.md" --to html4 --from markdown+autolink_bare_uris+ascii_identifiers+tex_math_single_backslash --output pandoc9bc5ec1b8e.html --smart --email-obfuscation none --self-contained --standalone --section-divs --template "C:\Users\Albert\Documents\R\win-library\3.4\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable "theme:bootstrap" --include-in-header "C:\Users\Albert\AppData\Local\Temp\RtmpUzgFi3\rmarkdown-str9bc50225edd.html" --mathjax --variable "mathjax-url:https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --variable code_folding=show --variable "source_embed=predict model.Rmd" --include-after-body "C:\Users\Albert\AppData\Local\Temp\RtmpUzgFi3\file9bc38771baa.html" --variable code_menu=1 --variable kable-scroll=1' had status 67
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)
library(caTools)
do.call-rbind-lapply ComboZ = do.call(rbind, lapply(
dir('data/TaFengDataSet','.*csv$',full.names=T),
read.csv, header=F)
) %>%
setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
nrow(Z)
[1] 817741
Z$date = as.Date(as.character(Z$date))
summary(Z)
date cust age area
Min. :2000-11-01 Min. : 1069 D :181213 E :312501
1st Qu.:2000-11-28 1st Qu.: 969222 E :151023 F :245213
Median :2001-01-01 Median : 1587722 C :140805 G : 72092
Mean :2000-12-30 Mean : 1406620 F : 99719 C : 71640
3rd Qu.:2001-01-30 3rd Qu.: 1854930 B : 66432 H : 40666
Max. :2001-02-28 Max. :20002000 G : 53719 D : 38674
(Other):124830 (Other): 36955
cat prod qty cost
Min. :100101 Min. :2.001e+07 Min. : 1.000 Min. : 0.0
1st Qu.:110106 1st Qu.:4.710e+12 1st Qu.: 1.000 1st Qu.: 35.0
Median :130106 Median :4.710e+12 Median : 1.000 Median : 62.0
Mean :284951 Mean :4.462e+12 Mean : 1.382 Mean : 112.1
3rd Qu.:520314 3rd Qu.:4.713e+12 3rd Qu.: 1.000 3rd Qu.: 112.0
Max. :780510 Max. :9.790e+12 Max. :1200.000 Max. :432000.0
price
Min. : 1.0
1st Qu.: 42.0
Median : 76.0
Mean : 131.9
3rd Qu.: 132.0
Max. :444000.0
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
qty cost price
99% 6 858.0 1014.00
99.9% 14 2722.0 3135.82
99.95% 24 3799.3 3999.00
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z)
[1] 817182
Z$tid = group_indices(Z, date, cust)
sapply(Z[,c("cust","cat","prod","tid")], n_distinct)
cust cat prod tid
32256 2007 23789 119422
summary(Z)
date cust age area
Min. :2000-11-01 Min. : 1069 D :181089 E :312358
1st Qu.:2000-11-28 1st Qu.: 968775 E :150947 F :245079
Median :2001-01-01 Median : 1587685 C :140721 G : 71905
Mean :2000-12-30 Mean : 1406500 F : 99641 C : 71600
3rd Qu.:2001-01-30 3rd Qu.: 1854701 B : 66353 H : 40647
Max. :2001-02-28 Max. :20002000 G : 53689 D : 38654
(Other):124742 (Other): 36939
cat prod qty cost
Min. :100101 Min. :2.001e+07 Min. : 1.000 Min. : 0.0
1st Qu.:110106 1st Qu.:4.710e+12 1st Qu.: 1.000 1st Qu.: 35.0
Median :130106 Median :4.710e+12 Median : 1.000 Median : 62.0
Mean :284784 Mean :4.462e+12 Mean : 1.358 Mean : 106.2
3rd Qu.:520311 3rd Qu.:4.713e+12 3rd Qu.: 1.000 3rd Qu.: 112.0
Max. :780510 Max. :9.790e+12 Max. :24.000 Max. :3798.0
price tid
Min. : 1.0 Min. : 1
1st Qu.: 42.0 1st Qu.: 28783
Median : 76.0 Median : 59391
Mean : 125.5 Mean : 58845
3rd Qu.: 132.0 3rd Qu.: 87391
Max. :4000.0 Max. :119422
XX = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 119422
summary(X)
tid date cust age
Min. : 1 Min. :2000-11-01 Min. : 1069 D :23775
1st Qu.: 29856 1st Qu.:2000-11-29 1st Qu.: 927093 C :19661
Median : 59712 Median :2001-01-01 Median : 1615661 E :19596
Mean : 59712 Mean :2000-12-31 Mean : 1402548 F :13992
3rd Qu.: 89567 3rd Qu.:2001-02-02 3rd Qu.: 1894493 B :10515
Max. :119422 Max. :2001-02-28 Max. :20002000 G : 8493
(Other):23390
area items pieces total
E :50532 Min. : 1.000 Min. : 1.000 Min. : 5
F :33826 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 227
G : 9498 Median : 5.000 Median : 6.000 Median : 510
C : 8527 Mean : 6.843 Mean : 9.294 Mean : 859
H : 7502 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1082
D : 5108 Max. :112.000 Max. :339.000 Max. :30171
(Other): 4429
gross
Min. :-1645.0
1st Qu.: 21.0
Median : 68.0
Mean : 132.3
3rd Qu.: 169.0
Max. : 8069.0
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
items pieces total gross
99.9% 54 81.0000 9009.579 1824.737
99.95% 62 94.2895 10611.579 2179.817
99.99% 82 133.0000 16044.401 3226.548
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
par(cex=0.8)
hist(X$date, "weeks", freq=T, border='lightgray', col='darkcyan',
las=2, main="No. Transaction per Week")
Ad0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area), # area code
) %>% data.frame # 33241
summary(A)
cust r s f
Min. : 1069 Min. : 1.00 Min. : 1.00 Min. : 1.000
1st Qu.: 1088519 1st Qu.: 9.00 1st Qu.: 56.00 1st Qu.: 1.000
Median : 1663402 Median : 26.00 Median : 92.00 Median : 2.000
Mean : 1473585 Mean : 37.45 Mean : 80.78 Mean : 3.701
3rd Qu.: 1958089 3rd Qu.: 60.00 3rd Qu.:110.00 3rd Qu.: 4.000
Max. :20002000 Max. :120.00 Max. :120.00 Max. :85.000
m rev raw age area
Min. : 8.0 Min. : 8 Min. : -784.0 D :6580 E :10800
1st Qu.: 365.0 1st Qu.: 707 1st Qu.: 75.0 C :5915 F : 8539
Median : 705.7 Median : 1750 Median : 241.0 E :5080 G : 3695
Mean : 993.1 Mean : 3152 Mean : 484.6 F :3719 C : 3683
3rd Qu.: 1291.0 3rd Qu.: 3968 3rd Qu.: 612.0 B :3193 D : 2166
Max. :12636.0 Max. :127686 Max. :20273.0 G :2183 H : 1453
(Other):5571 (Other): 1905
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")
range(X$date)
[1] "2000-11-01" "2001-02-28"
使用一月底(含2001-01-31)以前的資料,建立模型來預測每一位顧客:
Chapter 2: 資料準備流程
rm(list=ls(all=TRUE))
load("data/tf0.rdata")
Remove data after the demarcation date
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01) # 618212
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
summary(X)
tid date cust age
Min. : 1 Min. :2000-11-01 Min. : 1069 D :17541
1st Qu.:22098 1st Qu.:2000-11-23 1st Qu.: 923910 C :14624
Median :44194 Median :2000-12-12 Median : 1607000 E :14578
Mean :44194 Mean :2000-12-15 Mean : 1395768 F :10354
3rd Qu.:66291 3rd Qu.:2001-01-12 3rd Qu.: 1888874 B : 7817
Max. :88387 Max. :2001-01-31 Max. :20002000 G : 6308
(Other):17165
area items pieces total
E :37496 Min. : 1.000 Min. : 1.000 Min. : 5.0
F :25412 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 230.0
G : 6787 Median : 5.000 Median : 6.000 Median : 522.0
C : 6329 Mean : 6.994 Mean : 9.453 Mean : 888.7
H : 5524 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1120.0
D : 3655 Max. :112.000 Max. :339.000 Max. :30171.0
(Other): 3184
gross
Min. :-1645.0
1st Qu.: 23.0
Median : 72.0
Mean : 138.3
3rd Qu.: 174.0
Max. : 8069.0
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
items pieces total gross
99.9% 56.0000 84.0000 9378.684 1883.228
99.95% 64.0000 98.0000 11261.751 2317.087
99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
A: 整理並在最後用來跑預測性模型之資料
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area), # area code
) %>% data.frame # 28584
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16899
A$amountSimply a Left Joint
A = merge(A, feb, by="cust", all.x=T)
A$buyA$buy = !is.na(A$amount)
summary(A)
cust r s f
Min. : 1069 Min. : 1.00 Min. : 1.00 Min. : 1.000
1st Qu.: 1060898 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
Median : 1654100 Median :21.00 Median :68.00 Median : 2.000
Mean : 1461070 Mean :32.12 Mean :61.27 Mean : 3.089
3rd Qu.: 1945003 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
Max. :20002000 Max. :92.00 Max. :92.00 Max. :60.000
m rev raw age area
Min. : 8.0 Min. : 8 Min. : -742.0 D :5832 E :9907
1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 C :5238 F :7798
Median : 709.5 Median : 1566 Median : 218.0 E :4514 C :3169
Mean : 1012.4 Mean : 2711 Mean : 420.8 F :3308 G :3052
3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0 B :2802 D :1778
Max. :10634.0 Max. :99597 Max. :15565.0 G :1940 H :1295
(Other):4950 (Other):1585
amount buy
Min. : 8 Mode :logical
1st Qu.: 454 FALSE:15342
Median : 993 TRUE :13242
Mean : 1499
3rd Qu.: 1955
Max. :28089
NA's :15342
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')
tapply(A$buy, A$area, mean) %>% barplot
abline(h = mean(A$buy), col='red')
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
[1] 28584 20008 8576
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
set.seed(2018); spl2 = sample.split(A2$amount, SplitRatio=0.7)
c(nrow(A2), sum(spl2), sum(!spl2))
[1] 13242 9601 3641
save(Z, X, A, spl, spl2, file="data/tf2.rdata")
Chapter3: 迴歸模型
Fig-1: The First Model
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
TR = subset(A, spl)
TS = subset(A, !spl)
glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
summary(glm1)
Call:
glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
11)])
Deviance Residuals:
Min 1Q Median 3Q Max
-3.7931 -0.8733 -0.6991 1.0384 1.8735
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
m -3.028e-05 2.777e-05 -1.090 0.27559
rev 4.086e-05 1.940e-05 2.106 0.03521 *
raw -2.306e-04 8.561e-05 -2.693 0.00708 **
ageB -4.194e-02 8.666e-02 -0.484 0.62838
ageC 1.772e-02 7.992e-02 0.222 0.82456
ageD 7.705e-02 7.921e-02 0.973 0.33074
ageE 8.699e-02 8.132e-02 1.070 0.28476
ageF 1.928e-02 8.457e-02 0.228 0.81962
ageG 1.745e-02 9.323e-02 0.187 0.85155
ageH 1.752e-01 1.094e-01 1.602 0.10926
ageI 6.177e-02 1.175e-01 0.526 0.59904
ageJ 2.652e-01 1.047e-01 2.533 0.01131 *
ageK -1.419e-01 1.498e-01 -0.947 0.34347
areaB -4.105e-02 1.321e-01 -0.311 0.75603
areaC -2.075e-01 1.045e-01 -1.986 0.04703 *
areaD 3.801e-02 1.111e-01 0.342 0.73214
areaE 2.599e-01 9.682e-02 2.684 0.00727 **
areaF 1.817e-01 9.753e-02 1.863 0.06243 .
areaG -4.677e-02 1.045e-01 -0.448 0.65435
areaH -1.695e-01 1.232e-01 -1.375 0.16912
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27629 on 20007 degrees of freedom
Residual deviance: 23295 on 19984 degrees of freedom
AIC: 23343
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998
[1] 0.6999767
colAUC(pred, TS$buy) # 0.7556
[,1]
FALSE vs. TRUE 0.7556038
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
Call:
lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
Residuals:
Min 1Q Median 3Q Max
-2.04733 -0.23360 0.04677 0.28484 1.67880
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.164e+00 4.891e-02 23.797 < 2e-16 ***
r 4.027e-04 3.094e-04 1.301 0.193127
s -3.277e-05 3.132e-04 -0.105 0.916668
f 2.630e-02 1.704e-03 15.441 < 2e-16 ***
m 5.136e-01 3.669e-02 14.000 < 2e-16 ***
rev 5.205e-02 3.552e-02 1.465 0.142934
ageB 2.510e-02 2.480e-02 1.012 0.311352
ageC 8.571e-02 2.288e-02 3.746 0.000181 ***
ageD 1.015e-01 2.256e-02 4.498 6.93e-06 ***
ageE 9.441e-02 2.303e-02 4.100 4.16e-05 ***
ageF 6.508e-02 2.399e-02 2.713 0.006684 **
ageG 4.403e-02 2.626e-02 1.677 0.093550 .
ageH 4.840e-02 3.089e-02 1.567 0.117186
ageI 1.610e-02 3.252e-02 0.495 0.620547
ageJ -3.803e-02 2.819e-02 -1.349 0.177398
ageK 7.642e-02 3.955e-02 1.932 0.053373 .
areaB 4.065e-02 4.130e-02 0.984 0.324960
areaC -8.146e-03 3.348e-02 -0.243 0.807754
areaD -2.134e-02 3.537e-02 -0.603 0.546389
areaE -2.820e-02 3.056e-02 -0.923 0.356161
areaF -6.067e-03 3.078e-02 -0.197 0.843755
areaG -1.427e-03 3.318e-02 -0.043 0.965695
areaH -2.296e-02 3.761e-02 -0.610 0.541547
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4282 on 9578 degrees of freedom
Multiple R-squared: 0.2972, Adjusted R-squared: 0.2956
F-statistic: 184.1 on 22 and 9578 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
[1] 0.2972137 0.2337665
Chapter4: 決策樹
Fig-1: Feature Engineering
Fig-2: Feature Engr. & Data Spliting Process
Sys.setlocale("LC_ALL","C")
[1] "C"
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
[1] 20008 9601
X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
0 1 2 3 4 5 6
18011 12615 11288 9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584 7
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 1 1 0 0 0 0 0
1113 2 1 0 0 0 0 1
1359 0 1 0 0 0 0 0
1823 0 1 0 1 1 0 0
2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1113 0.5000000 0.2500000 0.0000000 0.0000000 0.0000000 0.0000000 0.2500000
1359 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
1823 0.0000000 0.3333333 0.0000000 0.3333333 0.3333333 0.0000000 0.0000000
2189 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.0000000 0.5000000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
head(A)
TR = subset(A, spl)
TS = subset(A, !spl)
library(rpart)
library(rpart.plot)
rpart1 = rpart(buy ~ ., TR[,c(2:16,18)], method="class")
pred = predict(rpart1, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.70662
[1] 0.7066231
colAUC(pred, TS$buy) # 0.6984
[,1]
FALSE vs. TRUE 0.6983998
rpart.plot(rpart1,cex=0.6)
Bad 'data' field in model 'call'.
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart2 = rpart(buy ~ ., TR[,c(2:16,18)], method="class",cp=0.001)
pred = predict(rpart2, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3878 725
TRUE 1812 2161
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.70417
[1] 0.7041744
colAUC(pred, TS$buy) # 0.7169
[,1]
FALSE vs. TRUE 0.7168957
rpart.plot(rpart2,cex=0.6)
Bad 'data' field in model 'call'.
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
rpart3 = rpart(amount ~ ., TR2[,c(2:17)], cp=0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart3, TS2) - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.2172772
Chapter5: 交叉驗證與參數調校
點我看影片
Fig-1: Supervised Learning Process
Fig-2: CV, Model Sel. & Parameter Tuning
Sys.setlocale("LC_ALL","C")
[1] "C"
library(caret)
Loading required package: lattice
library(doParallel)
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$buy = factor(ifelse(A$buy, "yes", "no")) # comply to the rule of caret
TR = A[spl, c(2:9,11)]
TS = A[!spl, c(2:9,11)]
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
ctrl = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final", classProbs=TRUE,
summaryFunction=twoClassSummary)
rpart(), Classification Treectrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
buy ~ ., data=TR, method="rpart",
trControl=ctrl, metric="ROC",
tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
Time difference of 1.159329 mins
plot(cv.rpart)
cv.rpart$results
rpart1 = rpart(buy ~ ., TR, method="class", cp=0.0005)
predict(rpart1, TS, type="prob")[,2] %>%
colAUC(TS$buy)
[,1]
no vs. yes 0.7401771
glm(), General Linear Model(邏輯式回歸)ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
buy ~ ., data=TR, method="glm",
trControl=ctrl, metric="ROC")
Sys.time() - t0
Time difference of 20.15436 secs
cv.glm$results
glm(), Final Modelglm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)
[,1]
no vs. yes 0.7556038
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = A2[ spl2, c(2:10)]
TS2 = A2[!spl2, c(2:10)]
ctrl2 = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final")
rpart() Regression Treectrl$repeats = 2
set.seed(2)
cv.rpart2 = train(
amount ~ ., data=TR2, method="rpart",
trControl=ctrl2, metric="Rsquared",
tuneGrid = expand.grid(cp = seq(0.0008,0.0024,0.0001) ) )
plot(cv.rpart2)
cv.rpart2$results
rpart(), Regression Tree Final Modelrpart2 = rpart(amount ~ ., data=TR2, cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) - TS2$amount)^2)
(r2.ts.rpart2 = 1 - (SSE/SST))
[1] 0.2174555
lm(), Linear Modelctrl$repeats = 2
set.seed(2)
cv.lm2 = train(
amount ~ ., data=TR2, method="lm",
trControl=ctrl2, metric="Rsquared",
tuneGrid = expand.grid( intercept = seq(0,5,0.5) )
)
plot(cv.lm2)
cv.lm2$results
lm() Final Modellm2 = lm(amount ~ ., TR2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm2, TS2) - TS2$amount)^2)
(r2.ts.lm2 = 1 - (SSE/SST))
[1] 0.2381007
stopCluster(clust)