pacman::p_load(dplyr,rpart,rpart.plot,rattle,tidyverse,ROCit,pROC,plotROC,caTools,glmnet,ggplot2,caret,showtext,caret,data.table,GGally,ROCR,maptree,nnet,gbm,DT,randomForest,e1071)
load("kycClean0520.rdata")
load("bc.rdata") #rf.fit500 & svm.fit
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") ## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
colSums(is.na(newkyc))## cid 風險屬性 婚姻
## 0 0 0
## 子女人數 教育程度 個人年收入
## 0 0 0
## 家庭年收入 每月可投資金額 投資經驗
## 0 0 0
## 預計投資期限 偏好的投資商品配息方式 年化報酬率可承受區間
## 0 0 0
## 風險偏好 虧10%會如何應對 month
## 0 0 0
## year s1 s2
## 0 0 0
## s3 s4 e1
## 0 0 0
## e2 e3 e4
## 0 0 0
## e5 e6 e7
## 0 0 0
## p1 p2 p3
## 0 0 0
## p4 AGE GENDER
## 0 0 0
## 戶籍地 產業 職稱
## 0 0 0
## Buy_something
## 0
newkyc=mutate_if(newkyc,is.character,as.factor)
str(newkyc)## 'data.frame': 37166 obs. of 37 variables:
## $ cid : Factor w/ 37166 levels "90","95","125",..: 372 878 924 3592 5006 5370 5435 7283 10081 10390 ...
## $ 風險屬性 : Factor w/ 3 levels "2","3","4": 3 3 1 1 2 2 3 2 1 2 ...
## $ 婚姻 : Factor w/ 2 levels "A","B": 2 2 1 2 2 2 2 2 2 1 ...
## $ 子女人數 : Factor w/ 4 levels "A","B","C","D": 2 2 4 2 2 2 2 2 2 1 ...
## $ 教育程度 : Factor w/ 4 levels "A","B","C","D": 1 1 1 2 1 1 1 3 1 1 ...
## $ 個人年收入 : Factor w/ 4 levels "A","B","C","D": 4 4 4 4 4 4 4 4 4 4 ...
## $ 家庭年收入 : Factor w/ 4 levels "A","B","C","D": 4 4 4 4 4 4 4 4 4 4 ...
## $ 每月可投資金額 : Factor w/ 4 levels "A","B","C","D": 3 3 3 3 3 3 3 3 3 3 ...
## $ 投資經驗 : Factor w/ 4 levels "A","B","C","D": 4 4 4 4 4 4 4 4 4 4 ...
## $ 預計投資期限 : Factor w/ 4 levels "未滿6個月","6個月(含)以上 - 未滿1年",..: 4 3 3 2 4 3 3 2 4 2 ...
## $ 偏好的投資商品配息方式: Factor w/ 4 levels "每半年/年配息",..: 3 4 4 4 4 4 2 4 2 3 ...
## $ 年化報酬率可承受區間 : Factor w/ 4 levels "-5%~+5%","-10%~+10%",..: 2 2 1 3 3 2 2 4 2 3 ...
## $ 風險偏好 : Factor w/ 4 levels "偏好穩定收益",..: 1 3 1 1 4 1 1 4 1 4 ...
## $ 虧10%會如何應對 : Factor w/ 4 levels "把握機會趁機加碼入市",..: 4 4 4 4 4 4 4 1 3 4 ...
## $ month : Factor w/ 12 levels "1","2","3","4",..: 8 12 1 3 3 1 11 6 2 4 ...
## $ year : Factor w/ 3 levels "2019","2020",..: 2 2 1 1 1 2 2 1 1 1 ...
## $ s1 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ s2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ s3 : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 2 1 1 ...
## $ s4 : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 1 1 2 2 ...
## $ e1 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ e2 : Factor w/ 3 levels "0","1","2": 2 2 1 1 1 1 2 1 1 1 ...
## $ e3 : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ e4 : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 1 2 ...
## $ e5 : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 2 1 1 2 1 ...
## $ e6 : Factor w/ 3 levels "0","1","2": 1 1 2 1 1 2 1 1 1 1 ...
## $ e7 : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ p1 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ p2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ p3 : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 1 1 1 ...
## $ p4 : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ AGE : Factor w/ 6 levels "18-24","25-34",..: 3 2 3 2 2 3 3 2 2 4 ...
## $ GENDER : Factor w/ 2 levels "F","M": 1 2 2 2 2 2 2 2 1 2 ...
## $ 戶籍地 : Factor w/ 22 levels "CHY","CWH","CYI",..: 19 9 19 22 19 20 9 16 9 19 ...
## $ 產業 : Factor w/ 25 levels "amb","art","bui",..: 10 8 22 22 13 22 20 20 22 20 ...
## $ 職稱 : Factor w/ 15 levels "0","A","B","C",..: 3 11 4 12 12 2 4 4 2 11 ...
## $ Buy_something : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 2 2 1 1 ...
summary(newkyc)## cid 風險屬性 婚姻 子女人數 教育程度 個人年收入 家庭年收入
## 90 : 1 2: 4593 A:11640 A: 4218 A:24431 A:14528 A: 2820
## 95 : 1 3:19516 B:25526 B:26559 B: 7451 B:15306 B: 4136
## 125 : 1 4:13057 C: 1149 C: 4984 C: 2907 C:14973
## 135 : 1 D: 5240 D: 300 D: 4425 D:15237
## 145 : 1
## 150 : 1
## (Other):37160
## 每月可投資金額 投資經驗 預計投資期限
## A:10691 A:10938 未滿6個月 : 5216
## B:15566 B: 3502 6個月(含)以上 - 未滿1年: 8425
## C: 7565 C: 8607 1年(含)以上 - 未滿3年 : 9659
## D: 3344 D:14119 3年(含)以上 :13866
##
##
##
## 偏好的投資商品配息方式 年化報酬率可承受區間 風險偏好
## 每半年/年配息 : 3290 -5%~+5% : 8468 偏好穩定收益:15563
## 每季配息 : 1979 -10%~+10%:14002 儘量避免損失: 1918
## 每月配息 :11432 -20%~+20%: 7770 追求潛力報酬: 6934
## 無特殊固定配息需求:20465 -30%~+30%: 6926 追求積極獲利:12751
##
##
##
## 虧10%會如何應對 month year s1 s2
## 把握機會趁機加碼入市:11158 1 : 4501 2019:20043 0:26357 0:30662
## 賣出一部份 : 3929 2 : 4226 2020:15455 1:10809 1: 6504
## 認賠全部賣掉 : 1233 3 : 3707 2021: 1668
## 再觀察一陣子 :20846 8 : 3166
## 5 : 3119
## 4 : 2835
## (Other):15612
## s3 s4 e1 e2 e3 e4 e5
## 0:33515 0: 4999 0:13601 0:27503 0:33823 0:12931 0:16689
## 1: 3651 1:32167 1:23565 1: 6186 1: 2909 1:20758 1:19350
## 2: 3477 2: 434 2: 3477 2: 1127
##
##
##
##
## e6 e7 p1 p2 p3 p4 AGE
## 0:28552 0:29777 0: 2103 0:31073 0:27885 0:33625 18-24 : 5748
## 1: 8180 1: 6262 1:35063 1: 6093 1: 9281 1: 3541 25-34 :15251
## 2: 434 2: 1127 35-44 :10117
## 45-54 : 4569
## 55-64 : 1233
## 65(含)以上: 248
##
## GENDER 戶籍地 產業 職稱 Buy_something
## F:17403 TPH :8112 o : 6156 B :17314 0:27402
## M:19763 TPE :6478 tech : 4711 K : 5485 1: 9764
## TYC :3921 gen : 4653 N : 3540
## KHH :3826 maf : 4610 A : 2989
## TXG :3758 fin : 3628 M : 2130
## TNN :2251 ret : 2541 J : 1522
## (Other):8820 (Other):10867 (Other): 4186
set.seed(222)
indexes<-sample(nrow(newkyc),0.7*nrow(newkyc),replace = F)
train<-newkyc[indexes,]
test<-newkyc[-indexes,]
traindata = train[,-1]
testdata = test[,-1]
dim(train)## [1] 26016 37
dim(test)## [1] 11150 37
colnames(traindata)## [1] "風險屬性" "婚姻" "子女人數"
## [4] "教育程度" "個人年收入" "家庭年收入"
## [7] "每月可投資金額" "投資經驗" "預計投資期限"
## [10] "偏好的投資商品配息方式" "年化報酬率可承受區間" "風險偏好"
## [13] "虧10%會如何應對" "month" "year"
## [16] "s1" "s2" "s3"
## [19] "s4" "e1" "e2"
## [22] "e3" "e4" "e5"
## [25] "e6" "e7" "p1"
## [28] "p2" "p3" "p4"
## [31] "AGE" "GENDER" "戶籍地"
## [34] "產業" "職稱" "Buy_something"
colnames(train)## [1] "cid" "風險屬性" "婚姻"
## [4] "子女人數" "教育程度" "個人年收入"
## [7] "家庭年收入" "每月可投資金額" "投資經驗"
## [10] "預計投資期限" "偏好的投資商品配息方式" "年化報酬率可承受區間"
## [13] "風險偏好" "虧10%會如何應對" "month"
## [16] "year" "s1" "s2"
## [19] "s3" "s4" "e1"
## [22] "e2" "e3" "e4"
## [25] "e5" "e6" "e7"
## [28] "p1" "p2" "p3"
## [31] "p4" "AGE" "GENDER"
## [34] "戶籍地" "產業" "職稱"
## [37] "Buy_something"
glm.fit<-glm(Buy_something~.-month -year , data = traindata,family = "binomial")
summary(glm.fit)##
## Call:
## glm(formula = Buy_something ~ . - month - year, family = "binomial",
## data = traindata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0107 -0.7297 -0.4236 0.7390 2.9976
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.599179 554.209583 -0.003 0.997698
## 風險屬性3 0.544148 0.063967 8.507 < 2e-16
## 風險屬性4 1.688420 0.096002 17.587 < 2e-16
## 婚姻B 0.010901 0.054269 0.201 0.840798
## 子女人數B 0.049586 0.067554 0.734 0.462930
## 子女人數C 0.076409 0.099589 0.767 0.442940
## 子女人數D 0.118465 0.060229 1.967 0.049194
## 教育程度B 0.126833 0.041303 3.071 0.002135
## 教育程度C -0.259043 0.058648 -4.417 1.00e-05
## 教育程度D -1.038755 0.316456 -3.282 0.001029
## 個人年收入B -0.031634 0.046803 -0.676 0.499107
## 個人年收入C 0.085995 0.074971 1.147 0.251364
## 個人年收入D 0.118615 0.053120 2.233 0.025553
## 家庭年收入B 0.136816 0.071942 1.902 0.057202
## 家庭年收入C 0.028143 0.077832 0.362 0.717658
## 家庭年收入D -0.011529 0.070276 -0.164 0.869692
## 每月可投資金額B -0.136875 0.045248 -3.025 0.002486
## 每月可投資金額C 0.236187 0.045444 5.197 2.02e-07
## 每月可投資金額D 0.186249 0.068070 2.736 0.006217
## 投資經驗B 0.084641 0.057407 1.474 0.140374
## 投資經驗C 0.168237 0.050630 3.323 0.000891
## 投資經驗D -0.227706 0.062531 -3.641 0.000271
## 預計投資期限6個月(含)以上 - 未滿1年 0.252427 0.073559 3.432 0.000600
## 預計投資期限1年(含)以上 - 未滿3年 0.494295 0.072141 6.852 7.30e-12
## 預計投資期限3年(含)以上 0.539480 0.071608 7.534 4.93e-14
## 偏好的投資商品配息方式每季配息 0.219235 0.092705 2.365 0.018037
## 偏好的投資商品配息方式每月配息 0.539349 0.065362 8.252 < 2e-16
## 偏好的投資商品配息方式無特殊固定配息需求 0.501732 0.062119 8.077 6.64e-16
## 年化報酬率可承受區間-10%~+10% 0.598373 0.059786 10.009 < 2e-16
## 年化報酬率可承受區間-20%~+20% 0.749719 0.067836 11.052 < 2e-16
## 年化報酬率可承受區間-30%~+30% 0.881932 0.074526 11.834 < 2e-16
## 風險偏好儘量避免損失 -0.525202 0.118732 -4.423 9.72e-06
## 風險偏好追求潛力報酬 0.164556 0.049185 3.346 0.000821
## 風險偏好追求積極獲利 0.283505 0.042566 6.660 2.73e-11
## `虧10%會如何應對`賣出一部份 -0.339385 0.064096 -5.295 1.19e-07
## `虧10%會如何應對`認賠全部賣掉 -0.198198 0.101259 -1.957 0.050308
## `虧10%會如何應對`再觀察一陣子 -0.159200 0.037885 -4.202 2.64e-05
## s11 0.206970 0.038420 5.387 7.16e-08
## s21 -0.105047 0.044319 -2.370 0.017776
## s31 -0.079474 0.060115 -1.322 0.186157
## s41 -0.084484 0.053903 -1.567 0.117033
## e11 0.035290 0.036047 0.979 0.327571
## e21 -0.003344 0.081533 -0.041 0.967285
## e22 0.322008 0.077413 4.160 3.19e-05
## e31 -0.080954 0.076200 -1.062 0.288057
## e32 0.144401 0.145907 0.990 0.322330
## e41 0.274171 0.059589 4.601 4.20e-06
## e42 NA NA NA NA
## e51 0.152439 0.049160 3.101 0.001929
## e52 0.011948 0.099416 0.120 0.904339
## e61 0.065653 0.051642 1.271 0.203623
## e62 NA NA NA NA
## e71 0.040478 0.068939 0.587 0.557094
## e72 NA NA NA NA
## p11 0.211092 0.080263 2.630 0.008539
## p21 0.032885 0.050860 0.647 0.517905
## p31 -0.034595 0.040807 -0.848 0.396559
## p41 -0.046969 0.055419 -0.848 0.396697
## AGE25-34 0.328503 0.067175 4.890 1.01e-06
## AGE35-44 0.536864 0.076502 7.018 2.26e-12
## AGE45-54 0.727926 0.086821 8.384 < 2e-16
## AGE55-64 0.915682 0.116324 7.872 3.50e-15
## AGE65(含)以上 0.741594 0.216111 3.432 0.000600
## GENDERM -0.355084 0.035622 -9.968 < 2e-16
## 戶籍地CWH 0.369312 0.184137 2.006 0.044895
## 戶籍地CYI 0.336365 0.242867 1.385 0.166059
## 戶籍地HSC 0.577938 0.202775 2.850 0.004370
## 戶籍地HSH 0.465389 0.197749 2.353 0.018601
## 戶籍地HWA 0.584090 0.228297 2.558 0.010513
## 戶籍地ILN 0.383208 0.206112 1.859 0.062995
## 戶籍地IUH 0.549092 0.197949 2.774 0.005539
## 戶籍地KHH 0.427767 0.172629 2.478 0.013213
## 戶籍地KLU 0.334586 0.203307 1.646 0.099821
## 戶籍地KMN -0.102887 0.258743 -0.398 0.690893
## 戶籍地LNN 0.085099 0.606417 0.140 0.888398
## 戶籍地MAL 0.197663 0.210186 0.940 0.347003
## 戶籍地NTO 0.432057 0.219216 1.971 0.048733
## 戶籍地NULL 0.574660 0.265866 2.161 0.030660
## 戶籍地PEH 0.130954 0.342425 0.382 0.702141
## 戶籍地TNN 0.367072 0.177651 2.066 0.038804
## 戶籍地TPE 0.369683 0.170032 2.174 0.029690
## 戶籍地TPH 0.380635 0.169115 2.251 0.024402
## 戶籍地TXG 0.522758 0.172339 3.033 0.002419
## 戶籍地TYC 0.414619 0.172620 2.402 0.016309
## 戶籍地YLH 0.557298 0.204521 2.725 0.006432
## 產業art -15.551655 535.411549 -0.029 0.976828
## 產業bui -15.169608 535.411221 -0.028 0.977397
## 產業edu -14.796181 535.411214 -0.028 0.977953
## 產業ele -15.268264 535.411301 -0.029 0.977250
## 產業fin -15.188847 535.411212 -0.028 0.977368
## 產業fir -15.263124 535.411275 -0.029 0.977258
## 產業fire -14.658164 535.411215 -0.027 0.978159
## 產業gam -14.712795 535.411341 -0.027 0.978077
## 產業gen -15.126490 535.411212 -0.028 0.977461
## 產業gold -14.992377 535.412657 -0.028 0.977661
## 產業gov -14.858656 535.411219 -0.028 0.977860
## 產業heal -14.809017 535.411212 -0.028 0.977934
## 產業law -14.883922 535.411243 -0.028 0.977822
## 產業maf -14.849008 535.411210 -0.028 0.977874
## 產業o -14.473583 535.411246 -0.027 0.978434
## 產業pnsp -26.238706 570.530685 -0.046 0.963318
## 產業pol -14.632133 535.411459 -0.027 0.978198
## 產業rel -14.940079 535.411247 -0.028 0.977739
## 產業ret -15.043881 535.411214 -0.028 0.977584
## 產業rles -15.701103 535.411267 -0.029 0.976605
## 產業tech -14.985414 535.411210 -0.028 0.977671
## 產業tepay -14.205150 535.411487 -0.027 0.978834
## 產業trsp -14.822181 535.411219 -0.028 0.977914
## 產業vac -15.097698 535.411214 -0.028 0.977504
## 職稱A 11.713890 143.118941 0.082 0.934768
## 職稱B 11.845320 143.118932 0.083 0.934038
## 職稱C 11.791272 143.118958 0.082 0.934338
## 職稱D 11.271844 143.119931 0.079 0.937225
## 職稱E 10.636368 143.120558 0.074 0.940758
## 職稱F 11.755172 143.120147 0.082 0.934539
## 職稱G 11.343564 143.119048 0.079 0.936826
## 職稱H 11.738533 143.119031 0.082 0.934631
## 職稱I 11.089110 143.119102 0.077 0.938240
## 職稱J 12.125219 143.118948 0.085 0.932483
## 職稱K 11.814005 143.118937 0.083 0.934212
## 職稱L 11.324555 143.119055 0.079 0.936932
## 職稱M 11.825703 143.118946 0.083 0.934147
## 職稱N 11.437260 143.119036 0.080 0.936305
##
## (Intercept)
## 風險屬性3 ***
## 風險屬性4 ***
## 婚姻B
## 子女人數B
## 子女人數C
## 子女人數D *
## 教育程度B **
## 教育程度C ***
## 教育程度D **
## 個人年收入B
## 個人年收入C
## 個人年收入D *
## 家庭年收入B .
## 家庭年收入C
## 家庭年收入D
## 每月可投資金額B **
## 每月可投資金額C ***
## 每月可投資金額D **
## 投資經驗B
## 投資經驗C ***
## 投資經驗D ***
## 預計投資期限6個月(含)以上 - 未滿1年 ***
## 預計投資期限1年(含)以上 - 未滿3年 ***
## 預計投資期限3年(含)以上 ***
## 偏好的投資商品配息方式每季配息 *
## 偏好的投資商品配息方式每月配息 ***
## 偏好的投資商品配息方式無特殊固定配息需求 ***
## 年化報酬率可承受區間-10%~+10% ***
## 年化報酬率可承受區間-20%~+20% ***
## 年化報酬率可承受區間-30%~+30% ***
## 風險偏好儘量避免損失 ***
## 風險偏好追求潛力報酬 ***
## 風險偏好追求積極獲利 ***
## `虧10%會如何應對`賣出一部份 ***
## `虧10%會如何應對`認賠全部賣掉 .
## `虧10%會如何應對`再觀察一陣子 ***
## s11 ***
## s21 *
## s31
## s41
## e11
## e21
## e22 ***
## e31
## e32
## e41 ***
## e42
## e51 **
## e52
## e61
## e62
## e71
## e72
## p11 **
## p21
## p31
## p41
## AGE25-34 ***
## AGE35-44 ***
## AGE45-54 ***
## AGE55-64 ***
## AGE65(含)以上 ***
## GENDERM ***
## 戶籍地CWH *
## 戶籍地CYI
## 戶籍地HSC **
## 戶籍地HSH *
## 戶籍地HWA *
## 戶籍地ILN .
## 戶籍地IUH **
## 戶籍地KHH *
## 戶籍地KLU .
## 戶籍地KMN
## 戶籍地LNN
## 戶籍地MAL
## 戶籍地NTO *
## 戶籍地NULL *
## 戶籍地PEH
## 戶籍地TNN *
## 戶籍地TPE *
## 戶籍地TPH *
## 戶籍地TXG **
## 戶籍地TYC *
## 戶籍地YLH **
## 產業art
## 產業bui
## 產業edu
## 產業ele
## 產業fin
## 產業fir
## 產業fire
## 產業gam
## 產業gen
## 產業gold
## 產業gov
## 產業heal
## 產業law
## 產業maf
## 產業o
## 產業pnsp
## 產業pol
## 產業rel
## 產業ret
## 產業rles
## 產業tech
## 產業tepay
## 產業trsp
## 產業vac
## 職稱A
## 職稱B
## 職稱C
## 職稱D
## 職稱E
## 職稱F
## 職稱G
## 職稱H
## 職稱I
## 職稱J
## 職稱K
## 職稱L
## 職稱M
## 職稱N
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 30018 on 26015 degrees of freedom
## Residual deviance: 23904 on 25896 degrees of freedom
## AIC: 24144
##
## Number of Fisher Scoring iterations: 12
glm.pred<-predict(glm.fit,traindata,type = "response")## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
glm.pred2<-predict(glm.fit,testdata,type = "response")## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
ROCit_obj <- rocit(score=glm.pred2,class=test$Buy_something)
plot(ROCit_obj)summary(ROCit_obj)#AUC:0.8008##
## Method used: empirical
## Number of positive(s): 2903
## Number of negative(s): 8247
## Area under curve: 0.8005
test
#glm.prob = rep(0, nrow(traindata))
glm.prob2 = rep(0, nrow(testdata))
glm.prob2[glm.pred2 >0.5] = 1
glmtable<- table(pred=glm.prob2,true=testdata$Buy_something)
glmtable## true
## pred 0 1
## 0 7554 1726
## 1 693 1177
glmacc <- sum(diag(glmtable)/sum(glmtable))
glmacc #0.7827277## [1] 0.7830493
sensitivity = sum(testdata$Buy_something == 1 & glm.prob2 == 1)/sum(testdata$Buy_something == 1)
specificity = sum(testdata$Buy_something == 0 & glm.prob2 == 0)/sum(testdata$Buy_something == 0)
precision= sum(testdata$Buy_something == 1 & glm.prob2 == 1)/sum(glm.prob2 == 1)
recall = sensitivity
F1_score = 2*(precision*recall)/(precision+recall)
cbind(sensitivity, specificity, precision,roc = sensitivity + specificity,F1_score)## sensitivity specificity precision roc F1_score
## [1,] 0.4054426 0.9159694 0.6294118 1.321412 0.4931909
train$glm = glm.pred
test$glm = glm.pred2
all = rbind(train,test)
all_model = allgbmImp = varImp(glm.fit,scale = TRUE)
gbmImp %>% arrange(desc(Overall)) %>% top_n(30)## Selecting by Overall
## Overall
## 風險屬性4 17.587361
## 年化報酬率可承受區間-30%~+30% 11.833841
## 年化報酬率可承受區間-20%~+20% 11.051965
## 年化報酬率可承受區間-10%~+10% 10.008522
## GENDERM 9.968046
## 風險屬性3 8.506768
## AGE45-54 8.384225
## 偏好的投資商品配息方式每月配息 8.251740
## 偏好的投資商品配息方式無特殊固定配息需求 8.076976
## AGE55-64 7.871803
## 預計投資期限3年(含)以上 7.533840
## AGE35-44 7.017596
## 預計投資期限1年(含)以上 - 未滿3年 6.851744
## 風險偏好追求積極獲利 6.660326
## s11 5.387091
## `虧10%會如何應對`賣出一部份 5.294982
## 每月可投資金額C 5.197284
## AGE25-34 4.890286
## e41 4.601047
## 風險偏好儘量避免損失 4.423406
## 教育程度C 4.416911
## `虧10%會如何應對`再觀察一陣子 4.202179
## e22 4.159587
## 投資經驗D 3.641495
## 預計投資期限6個月(含)以上 - 未滿1年 3.431632
## AGE65(含)以上 3.431544
## 風險偏好追求潛力報酬 3.345626
## 投資經驗C 3.322853
## 教育程度D 3.282468
## e51 3.100879
traindata_n = traindata[,-c(3,14,15,32,33)]
tree.fit<-rpart(Buy_something ~.,
data = traindata_n,
method = 'class',
na.action = na.rpart,
control = rpart.control(cp = 0.001,xval = 10,maxdepth = 30))
#p = fancyRpartPlot(tree.fit,cex=1)
plotcp(tree.fit)printcp(tree.fit)##
## Classification tree:
## rpart(formula = Buy_something ~ ., data = traindata_n, na.action = na.rpart,
## method = "class", control = rpart.control(cp = 0.001, xval = 10,
## maxdepth = 30))
##
## Variables actually used in tree construction:
## [1] 產業 風險偏好 風險屬性
## [4] 個人年收入 教育程度 每月可投資金額
## [7] 年化報酬率可承受區間 偏好的投資商品配息方式 投資經驗
## [10] 預計投資期限 職稱 AGE
## [13] e2 e5 p2
## [16] s1 s2
##
## Root node error: 6861/26016 = 0.26372
##
## n= 26016
##
## CP nsplit rel error xerror xstd
## 1 0.0523976 0 1.00000 1.00000 0.0103592
## 2 0.0246320 2 0.89520 0.89520 0.0099837
## 3 0.0049555 3 0.87057 0.87422 0.0099016
## 4 0.0040082 6 0.85279 0.86299 0.0098568
## 5 0.0039353 8 0.84477 0.85848 0.0098385
## 6 0.0018948 9 0.84084 0.85119 0.0098088
## 7 0.0017490 12 0.83384 0.84988 0.0098034
## 8 0.0013118 13 0.83209 0.85498 0.0098243
## 9 0.0012389 14 0.83078 0.85716 0.0098332
## 10 0.0011660 16 0.82830 0.85789 0.0098361
## 11 0.0010786 19 0.82379 0.85410 0.0098207
## 12 0.0010203 26 0.81548 0.85454 0.0098225
## 13 0.0010000 35 0.80542 0.85410 0.0098207
tree.pred<-predict(tree.fit,traindata,type = 'prob')
tree.pred2<-predict(tree.fit,testdata,type = 'prob')
ROCit_obj <- rocit(score=tree.pred2[,2],class= test$Buy_something == "1")
plot(ROCit_obj)summary(ROCit_obj)##
## Method used: empirical
## Number of positive(s): 2903
## Number of negative(s): 8247
## Area under curve: 0.7366
#AUC:0.7344treeImp = varImp(tree.fit,scale = FALSE)
treeImp %>% arrange(desc(Overall)) %>% top_n(30) ## Selecting by Overall
## Overall
## 投資經驗 1207.678879
## 年化報酬率可承受區間 1194.690014
## e2 1125.613561
## 風險屬性 882.849818
## e7 875.816494
## 每月可投資金額 531.612906
## 預計投資期限 405.221210
## 風險偏好 274.677818
## e5 129.658397
## 偏好的投資商品配息方式 96.843008
## 產業 87.373182
## 職稱 80.294640
## AGE 79.251641
## 個人年收入 53.439674
## 虧10%會如何應對 51.437219
## 教育程度 43.948286
## s1 21.935500
## e3 15.851236
## e6 15.851236
## 家庭年收入 15.656193
## s3 6.578107
## p2 5.723993
## s2 4.200117
## p1 3.201058
## e1 3.135837
## p3 2.790018
## s4 1.199019
## 婚姻 0.000000
## `虧10%會如何應對` 0.000000
## e4 0.000000
## p4 0.000000
traindata %>% colnames## [1] "風險屬性" "婚姻" "子女人數"
## [4] "教育程度" "個人年收入" "家庭年收入"
## [7] "每月可投資金額" "投資經驗" "預計投資期限"
## [10] "偏好的投資商品配息方式" "年化報酬率可承受區間" "風險偏好"
## [13] "虧10%會如何應對" "month" "year"
## [16] "s1" "s2" "s3"
## [19] "s4" "e1" "e2"
## [22] "e3" "e4" "e5"
## [25] "e6" "e7" "p1"
## [28] "p2" "p3" "p4"
## [31] "AGE" "GENDER" "戶籍地"
## [34] "產業" "職稱" "Buy_something"
traindata_n = traindata[,-c(2,3,13:15)]# 婚姻,子女人數,虧10%會如何應對,month,year
testdata_n = testdata[,-c(2,3,13:15)]# 婚姻,子女人數,虧10%會如何應對,month,year
#rf.fit<-randomForest(data=traindata_n,Buy_something~.,ntree=1000,importance = TRUE) #importance加了跑很久
#rf.fit500<-randomForest(data=traindata_n,Buy_something~.,ntree=500,importance = TRUE) #importance加了跑很久
plot(rf.fit500)rf.pred<-predict(rf.fit500,traindata_n,type = "response")
rf.pred2<-predict(rf.fit500,testdata_n,type = "response")train
rftable<- table(pred=rf.pred,true=traindata_n$Buy_something)
rftable## true
## pred 0 1
## 0 19155 49
## 1 0 6812
rfacc <- sum(diag(rftable)/sum(rftable))
rfacc #0.7849016## [1] 0.9981165
sensitivity = sum(traindata_n$Buy_something == 1 & rf.pred == 1)/sum(traindata_n$Buy_something == 1)
specificity = sum(traindata_n$Buy_something == 0 & rf.pred == 0)/sum(traindata_n$Buy_something == 0)
precision= sum(traindata_n$Buy_something == 1 & rf.pred == 1)/sum(rf.pred == 1)
recall = sensitivity
F1_score = 2*(precision*recall)/(precision+recall)
cbind(sensitivity, specificity, precision,roc = sensitivity + specificity,F1_score)## sensitivity specificity precision roc F1_score
## [1,] 0.9928582 1 1 1.992858 0.9964163
test
rftable<- table(pred=rf.pred2,true=testdata_n$Buy_something)
rftable## true
## pred 0 1
## 0 7278 1492
## 1 969 1411
rfacc <- sum(diag(rftable)/sum(rftable))
rfacc #0.7849016## [1] 0.7792825
sensitivity = sum(testdata_n$Buy_something == 1 & rf.pred2 == 1)/sum(testdata_n$Buy_something == 1)
specificity = sum(testdata_n$Buy_something == 0 & rf.pred2 == 0)/sum(testdata_n$Buy_something == 0)
precision= sum(testdata_n$Buy_something == 1 & rf.pred2 == 1)/sum(rf.pred2 == 1)
recall = sensitivity
F1_score = 2*(precision*recall)/(precision+recall)
cbind(sensitivity, specificity, precision,roc = sensitivity + specificity,F1_score)## sensitivity specificity precision roc F1_score
## [1,] 0.4860489 0.8825027 0.5928571 1.368552 0.5341662
rf.pred<-predict(rf.fit500,traindata_n,type = "prob")
rf.pred2<-predict(rf.fit500,testdata_n,type = "prob")train$rf = rf.pred[,2]
test$rf = rf.pred2[,2]
all = rbind(train,test)
all_model$rf = all$rf # make dataframe from importance() output
feat_imp_df <- importance(rf.fit500) %>%
data.frame() %>%
mutate(feature = row.names(.)) %>%
filter(MeanDecreaseGini>250)
# plot dataframe
ggplot(feat_imp_df, aes(x = reorder(feature, MeanDecreaseGini),
y = MeanDecreaseGini)) +
geom_bar(stat='identity') +
coord_flip() +
theme(text=element_text(family="STKaiti")) +
labs(
x = "Feature",
y = "Importance",
title = "Feature Importance:Random Forest"
)svm.fit<-svm(Buy_something ~.,data=traindata_n,kernel='radial',gamma=0.01,cost=10, probability = TRUE)
summary(svm.fit)##
## Call:
## svm(formula = Buy_something ~ ., data = traindata_n, kernel = "radial",
## gamma = 0.01, cost = 10, probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 13045
##
## ( 6870 6175 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svm.pred<-predict(svm.fit,traindata, probability = TRUE)
svm.pred2<-predict(svm.fit,testdata_n,type = "response")train
svmtable<- table(pred=svm.pred,true=traindata_n$Buy_something)
svmtable## true
## pred 0 1
## 0 18040 4373
## 1 1115 2488
svmacc <- sum(diag(svmtable)/sum(svmtable)) #0.7797069
svmacc ## [1] 0.7890529
sensitivity = sum(traindata_n$Buy_something == 1 & svm.pred == 1)/sum(traindata_n$Buy_something == 1)
specificity = sum(traindata_n$Buy_something == 0 & svm.pred == 0)/sum(traindata_n$Buy_something == 0)
precision= sum(traindata_n$Buy_something == 1 & svm.pred == 1)/sum(svm.pred == 1)
recall = sensitivity
F1_score = 2*(precision*recall)/(precision+recall)
cbind(sensitivity, specificity, precision,roc = sensitivity + specificity,F1_score)## sensitivity specificity precision roc F1_score
## [1,] 0.3626294 0.9417907 0.6905357 1.30442 0.4755352
test
svmtable<- table(pred=svm.pred2,true=testdata_n$Buy_something)
svmtable## true
## pred 0 1
## 0 7632 1833
## 1 615 1070
svmfacc <- sum(diag(svmtable)/sum(svmtable))
svmacc ## [1] 0.7890529
sensitivity = sum(testdata_n$Buy_something == 1 & svm.pred2 == 1)/sum(testdata_n$Buy_something == 1)
specificity = sum(testdata_n$Buy_something == 0 & svm.pred2 == 0)/sum(testdata_n$Buy_something == 0)
precision= sum(testdata_n$Buy_something == 1 & svm.pred2 == 1)/sum(svm.pred2 == 1)
recall = sensitivity
F1_score = 2*(precision*recall)/(precision+recall)
cbind(sensitivity, specificity, precision,roc = sensitivity + specificity,F1_score)## sensitivity specificity precision roc F1_score
## [1,] 0.3685842 0.9254274 0.6350148 1.294012 0.4664342