Standardized Binary Split Regression Tree (SBSRT)

Poisson.Deviance <- function(pred, obs){200*(mean(pred)-mean(obs)+mean(log((obs/pred)^obs)))}

第一题

choice of learning and test sample

load("C:\\Users\\user\\Desktop\\freMTPL2freq.rda")
set.seed(100)
ll <- sample(c(1:nrow(freMTPL2freq)), round(0.9*nrow(freMTPL2freq)), replace = FALSE) 
learn <- freMTPL2freq[ll,]
test <- freMTPL2freq[-ll,]
n_l = nrow(learn)
n_t = nrow(test)

library(rpart)
## Warning: 程序包'rpart'是用R版本4.4.2 来建造的
library(rpart.plot)
## Warning: 程序包'rpart.plot'是用R版本4.4.2 来建造的
tree1 <- rpart(cbind(Exposure,ClaimNb) ~ Area + VehPower + VehAge 
               + DrivAge + BonusMalus + VehBrand + VehGas + Density + Region, 
               data = learn, method="poisson",
               control=rpart.control(xval=1, minbucket=1500, cp=0.00005))

tree1
## n= 610212 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##    1) root 610212 202513.2000 0.10107650  
##      2) VehAge>=0.5 558264 170893.2000 0.09060582  
##        4) BonusMalus< 57.5 370099 102548.1000 0.07104234  
##          8) VehAge>=12.5 72041  17242.9300 0.05194728  
##           16) DrivAge< 41.5 19955   3354.0600 0.03469194  
##             32) Density< 472.5 13088   1977.0870 0.02911974  
##               64) Region=R11,R22,R23,R24,R25,R41,R43,R52,R54,R72,R73,R74,R82,R83,R91,R94 11088   1566.5430 0.02641443 *
##               65) Region=R21,R26,R31,R42,R53,R93 2000    399.7157 0.04565388 *
##             33) Density>=472.5 6867   1355.2250 0.04651561  
##               66) VehAge>=16.5 1790    235.0763 0.02828077 *
##               67) VehAge< 16.5 5077   1108.4710 0.05320576 *
##           17) DrivAge>=41.5 52086  13787.4700 0.05788736  
##             34) VehAge>=17.5 11564   2489.6710 0.04255365  
##               68) Region=R21,R23,R26,R31,R41,R43,R54,R72,R73,R83,R91,R94 2184    301.5672 0.02684206 *
##               69) Region=R11,R22,R24,R25,R42,R52,R53,R74,R82,R93 9380   2176.6410 0.04593593 *
##             35) VehAge< 17.5 40522  11252.6900 0.06240305  
##               70) Region=R22,R23,R41,R42,R43,R52,R54,R72,R73,R74,R83,R94 8495   1912.1520 0.04682113  
##                140) VehBrand=B1,B12,B13,B14,B2,B4 6444   1333.4640 0.04133264 *
##                141) VehBrand=B10,B11,B3,B5,B6 2051    567.8954 0.06471031 *
##               71) Region=R11,R21,R24,R25,R26,R31,R53,R82,R91,R93 32027   9311.1420 0.06644108  
##                142) Density< 401 19415   5279.9410 0.05907223  
##                  284) DrivAge>=49.5 12606   3352.5100 0.05492159  
##                    568) Region=R25,R31,R91,R93 1642    262.5201 0.03323221 *
##                    569) Region=R11,R21,R24,R26,R53,R82 10964   3078.1230 0.05767125 *
##                  285) DrivAge< 49.5 6809   1919.2080 0.06784897  
##                    570) Density< 42.5 2133    520.5560 0.05040382 *
##                    571) Density>=42.5 4676   1388.0250 0.07660286 *
##                143) Density>=401 12612   4001.2470 0.07927352  
##                  286) VehBrand=B11,B12,B13,B14,B6 1770    343.5664 0.04489425 *
##                  287) VehBrand=B1,B10,B2,B3,B4,B5 10842   3635.8430 0.08470103  
##                    574) VehAge>=15.5 2873    762.5905 0.06505433 *
##                    575) VehAge< 15.5 7969   2861.0690 0.09193148  
##                     1150) Density>=3319 2342    640.3034 0.07026589 *
##                     1151) Density< 3319 5627   2209.7840 0.10069150 *
##          9) VehAge< 12.5 298058  84975.6700 0.07613833  
##           18) DrivAge< 39.5 66195  15115.7700 0.05802313  
##             36) Density< 1313.5 49216  10648.7500 0.05214276  
##               72) Region=R24,R25,R26,R52,R54,R74,R83,R94 22929   4833.0650 0.04577503  
##                144) VehBrand=B1,B10,B11,B13,B2,B3 17572   3551.1080 0.04267205 *
##                145) VehBrand=B12,B14,B4,B5,B6 5357   1271.3130 0.05696684 *
##               73) Region=R11,R21,R22,R23,R31,R41,R42,R43,R53,R72,R73,R82,R91,R93 26287   5791.5120 0.05906762  
##                146) VehPower< 9.5 23399   4977.3270 0.05650441 *
##                147) VehPower>=9.5 2888    801.6129 0.08252970 *
##             37) Density>=1313.5 16979   4398.1560 0.07759025  
##               74) VehPower< 5.5 5907   1256.4380 0.05839881  
##                148) Region=R23,R25,R26,R41,R52,R54,R73,R82,R83,R91 2104    274.3783 0.03185520 *
##                149) Region=R11,R21,R22,R24,R31,R42,R43,R53,R72,R74,R93,R94 3803    957.0218 0.07403195 *
##               75) VehPower>=5.5 11072   3116.9120 0.08869021  
##                150) VehBrand=B1,B12,B13,B14,B2,B3,B6 9307   2471.3790 0.08128688  
##                  300) VehPower< 8.5 7363   1811.7480 0.07399171 *
##                  301) VehPower>=8.5 1944    648.2979 0.11096580 *
##                151) VehBrand=B10,B11,B4,B5 1765    629.8030 0.12638830 *
##           19) DrivAge>=39.5 231863  69643.0500 0.08098350  
##             38) BonusMalus< 51.5 216252  64134.4000 0.07877503  
##               76) Region=R11,R21,R22,R23,R26,R31,R41,R42,R52,R54,R72,R73,R74,R83,R93 105948  26465.1300 0.06960676  
##                152) Density< 1989.5 74868  18225.8100 0.06524732  
##                  304) DrivAge>=56.5 26785   6028.5020 0.05488491  
##                    608) Region=R11,R21,R22,R26,R31,R41,R42,R72,R73,R74,R83 13470   2731.6240 0.04790029 *
##                    609) Region=R23,R52,R54,R93 13315   3282.6240 0.06201679 *
##                  305) DrivAge< 56.5 48083  12154.5700 0.07165684  
##                    610) Region=R31,R41,R73,R74 10681   2385.2310 0.05903584 *
##                    611) Region=R11,R21,R22,R23,R26,R42,R52,R54,R72,R83,R93 37402   9751.8060 0.07539822  
##                     1222) DrivAge< 44.5 10722   2522.4130 0.06638516  
##                       2444) VehBrand=B1,B11,B12,B14,B2,B6 8447   1787.7710 0.05860208 *
##                       2445) VehBrand=B10,B13,B3,B4,B5 2275    718.5544 0.09287882 *
##                     1223) DrivAge>=44.5 26680   7220.6700 0.07901836  
##                       2446) DrivAge>=45.5 24359   6459.0060 0.07619470  
##                         4892) VehBrand=B12,B3,B4,B6 12793   2867.7440 0.06664958  
##                           9784) VehPower< 9.5 10636   2237.4820 0.06140731 *
##                           9785) VehPower>=9.5 2157    618.4707 0.09422875 *
##                         4893) VehBrand=B1,B10,B11,B13,B14,B2,B5 11566   3577.7260 0.08452261  
##                           9786) Region=R11,R23,R52,R72 5962   1700.1500 0.07390865 *
##                           9787) Region=R21,R22,R26,R42,R54,R83,R93 5604   1867.1530 0.09657541 *
##                       2447) DrivAge< 45.5 2321    747.4779 0.11049280 *
##                153) Density>=1989.5 31080   8200.1240 0.08100467  
##                  306) VehPower< 5.5 11045   2568.3810 0.06808386 *
##                  307) VehPower>=5.5 20035   5612.8700 0.08832735  
##                    614) Region=R11,R21,R22,R23,R26,R42,R52,R54,R73,R83 12133   3064.6610 0.07835707 *
##                    615) Region=R31,R41,R72,R93 7902   2531.8720 0.10291930 *
##               77) Region=R24,R25,R43,R53,R82,R91,R94 110304  37561.3200 0.08579789  
##                154) Density< 228.5 58065  19009.8400 0.07904320  
##                  308) VehBrand=B10,B11,B12,B13,B14,B3,B4,B6 19142   5067.4320 0.06765141 *
##                  309) VehBrand=B1,B2,B5 38923  13915.0700 0.08359497  
##                    618) VehAge>=7.5 16114   5510.7540 0.07713346 *
##                    619) VehAge< 7.5 22809   8393.9210 0.08821321 *
##                155) Density>=228.5 52239  18504.5000 0.09389478  
##                  310) VehBrand=B10,B11,B12,B14,B3,B6 19721   5727.1760 0.08337724  
##                    620) VehPower< 5.5 6137   1566.7870 0.06862881 *
##                    621) VehPower>=5.5 13584   4147.0800 0.08994695 *
##                  311) VehBrand=B1,B13,B2,B4,B5 32518  12757.6800 0.09899298  
##                    622) Density>=1304.5 10947   3924.6080 0.08990584 *
##                    623) Density< 1304.5 21571   8823.5190 0.10349860  
##                     1246) Region=R24,R43,R53,R82 18439   7291.7580 0.10005980  
##                       2492) DrivAge< 44.5 2683    814.2055 0.07659123 *
##                       2493) DrivAge>=44.5 15756   6465.0580 0.10373640  
##                         4986) VehAge>=7.5 6306   2450.6350 0.09200427 *
##                         4987) VehAge< 7.5 9450   4004.1140 0.11146930 *
##                     1247) Region=R25,R91,R94 3132   1519.5420 0.12925320 *
##             39) BonusMalus>=51.5 15611   5393.3940 0.11508240  
##               78) VehBrand=B10,B12,B14,B6 5515   1513.8280 0.09284452 *
##               79) VehBrand=B1,B11,B13,B2,B3,B4,B5 10096   3863.1170 0.12470450  
##                158) VehAge>=9.5 2525    840.4064 0.09708720 *
##                159) VehAge< 9.5 7571   3009.7310 0.13405600 *
##        5) BonusMalus>=57.5 188165  65202.0000 0.14243220  
##         10) BonusMalus< 95.5 165358  52997.0800 0.12484240  
##           20) DrivAge< 38.5 110769  31603.5600 0.10380880  
##             40) BonusMalus< 76.5 64966  17061.6200 0.08968541  
##               80) VehAge>=11.5 15481   3474.8790 0.06784939  
##                160) DrivAge< 31.5 8044   1548.4800 0.05291392  
##                  320) Region=R11,R21,R22,R23,R24,R25,R41,R42,R43,R52,R53,R54,R72,R73,R83,R91,R94 5742   1022.2310 0.04573780 *
##                  321) Region=R26,R31,R74,R82,R93 2302    515.8830 0.07432589 *
##                161) DrivAge>=31.5 7437   1899.7220 0.08425223  
##                  322) VehGas=Regular 4350   1019.5110 0.07097273 *
##                  323) VehGas=Diesel 3087    869.2496 0.10428000 *
##               81) VehAge< 11.5 49485  13531.5000 0.09685020  
##                162) DrivAge< 32.5 32503   8460.2690 0.08724457  
##                  324) VehPower< 5.5 12743   2910.9910 0.07071987  
##                    648) Region=R22,R24,R25,R26,R42,R43,R53,R54,R72,R73,R82,R83,R94 7191   1547.5760 0.05899664  
##                     1296) VehBrand=B1,B10,B11,B12,B13,B14,B2,B6 5048    898.9420 0.04590341  
##                       2592) BonusMalus< 65.5 1726    227.6364 0.02934943 *
##                       2593) BonusMalus>=65.5 3322    661.0961 0.05581604 *
##                     1297) VehBrand=B3,B4,B5 2143    625.5721 0.08852834 *
##                    649) Region=R11,R21,R23,R31,R41,R52,R74,R91,R93 5552   1344.6160 0.08992865 *
##                  325) VehPower>=5.5 19760   5516.4720 0.09863767  
##                    650) Region=R24,R25,R42,R43,R54,R83 5443   1370.3410 0.07664061 *
##                    651) Region=R11,R21,R22,R23,R26,R31,R41,R52,R53,R72,R73,R74,R82,R91,R93,R94 14317   4125.1660 0.10883050  
##                     1302) VehPower< 8.5 11760   3218.2300 0.10187000 *
##                     1303) VehPower>=8.5 2557    894.9984 0.14190710 *
##                163) DrivAge>=32.5 16982   5030.7460 0.11556440  
##                  326) VehBrand=B12,B2,B6 8465   2307.6030 0.09979518  
##                    652) Region=R21,R23,R25,R26,R31,R43,R53,R72,R83 1861    345.7515 0.06407078 *
##                    653) Region=R11,R22,R24,R41,R42,R52,R54,R73,R74,R82,R91,R93,R94 6604   1947.9480 0.10921220 *
##                  327) VehBrand=B1,B10,B11,B13,B14,B3,B4,B5 8517   2707.8920 0.13012990  
##                    654) BonusMalus>=63.5 5702   1677.5910 0.11532150  
##                     1308) BonusMalus< 66.5 1551    347.5432 0.08082072 *
##                     1309) BonusMalus>=66.5 4151   1319.1610 0.12855510 *
##                    655) BonusMalus< 63.5 2815   1018.8670 0.15594420 *
##             41) BonusMalus>=76.5 45803  14399.1900 0.12568500  
##               82) VehAge>=12.5 10812   2904.6420 0.09594250 *
##               83) VehAge< 12.5 34991  11448.0700 0.13531570  
##                166) BonusMalus< 90.5 26750   8401.0270 0.12438590  
##                  332) VehPower< 4.5 5104   1357.9540 0.09142491 *
##                  333) VehPower>=4.5 21646   7017.1810 0.13247430  
##                    666) Region=R22,R23,R24,R25,R31,R41,R43,R52,R54,R73,R93 12129   3618.0500 0.11536780 *
##                    667) Region=R11,R21,R26,R42,R53,R72,R74,R82,R83,R91,R94 9517   3373.0050 0.15517130 *
##                167) BonusMalus>=90.5 8241   3006.3750 0.17192520  
##                  334) VehBrand=B10,B12,B14 1749    434.3224 0.10286260 *
##                  335) VehBrand=B1,B11,B13,B2,B3,B4,B5,B6 6492   2546.0760 0.18903850  
##                    670) VehAge>=9.5 2320    716.1324 0.13698320 *
##                    671) VehAge< 9.5 4172   1808.6120 0.21646810 *
##           21) DrivAge>=38.5 54589  20922.9000 0.16302950  
##             42) BonusMalus>=63.5 38382  13646.4300 0.14435880  
##               84) BonusMalus< 64.5 4399   1200.1970 0.08801653 *
##               85) BonusMalus>=64.5 33983  12385.5900 0.15178820  
##                170) Region=R11,R26,R42,R72,R73,R91,R93 14718   4373.2270 0.12599140  
##                  340) Region=R91 2179    459.3754 0.07582583 *
##                  341) Region=R11,R26,R42,R72,R73,R93 12539   3889.5710 0.13448370  
##                    682) VehBrand=B12,B13 4452   1207.5850 0.10578680 *
##                    683) VehBrand=B1,B10,B11,B14,B2,B3,B4,B5,B6 8087   2664.4250 0.14921400 *
##                171) Region=R21,R22,R23,R24,R25,R31,R41,R43,R52,R53,R54,R74,R82,R83,R94 19265   7966.7170 0.16800700  
##                  342) VehAge>=10.5 8036   2999.3710 0.13901220 *
##                  343) VehAge< 10.5 11229   4931.5830 0.18846460  
##                    686) VehBrand=B12 2316    771.8453 0.13166220 *
##                    687) VehBrand=B1,B10,B11,B13,B14,B2,B3,B4,B5,B6 8913   4137.4240 0.20013210  
##                     1374) DrivAge< 42.5 2159    795.6395 0.14723640 *
##                     1375) DrivAge>=42.5 6754   3321.4790 0.21503280  
##                       2750) VehGas=Regular 2895   1380.8090 0.18499890 *
##                       2751) VehGas=Diesel 3859   1927.8010 0.23927630 *
##             43) BonusMalus< 63.5 16207   7159.4520 0.20232160  
##               86) BonusMalus< 61.5 10176   3815.1150 0.14617820  
##                172) BonusMalus>=58.5 6346   2013.9200 0.10859960 *
##                173) BonusMalus< 58.5 3830   1726.0600 0.20124040 *
##               87) BonusMalus>=61.5 6031   3129.4600 0.29480540  
##                174) Region=R11,R22,R23,R42,R43,R72,R73,R83,R91,R93,R94 2426    922.3735 0.18894920 *
##                175) Region=R21,R24,R25,R26,R31,R41,R52,R53,R54,R74,R82 3605   2137.4200 0.34974680 *
##         11) BonusMalus>=95.5 22807  10917.3600 0.29983630  
##           22) VehBrand=B12,B4 4937   1884.8830 0.18777630  
##             44) VehAge< 1.5 1608    443.6318 0.10847520 *
##             45) VehAge>=1.5 3329   1406.3820 0.22732720 *
##           23) VehBrand=B1,B10,B11,B13,B14,B2,B3,B5,B6 17870   8921.1320 0.33146650  
##             46) BonusMalus< 100.5 12859   5822.5180 0.28359990  
##               92) DrivAge>=24.5 7487   2951.8330 0.23091300  
##                184) Region=R22,R23,R26,R31,R41,R43,R72,R83,R91,R94 1558    418.5323 0.13751210 *
##                185) Region=R11,R21,R24,R25,R42,R52,R53,R54,R73,R74,R82,R93 5929   2506.0490 0.25409250  
##                  370) Density< 853.5 2704   1019.0840 0.21141790 *
##                  371) Density>=853.5 3225   1473.9180 0.29120140 *
##               93) DrivAge< 24.5 5372   2807.1730 0.36763920  
##                186) Area=A,C 2145    944.1089 0.26679580 *
##                187) Area=B,D,E,F 3227   1833.2270 0.43304140 *
##             47) BonusMalus>=100.5 5011   3024.1450 0.41118840  
##               94) BonusMalus< 120.5 3404   1992.1600 0.37295420 *
##               95) BonusMalus>=120.5 1607   1013.0520 0.49445540 *
##      3) VehAge< 0.5 51948  26933.7400 0.31504200  
##        6) VehBrand=B1,B10,B11,B13,B14,B2,B3,B4,B5,B6 18572   5244.5510 0.12628530  
##         12) BonusMalus< 61.5 14651   4003.1770 0.11307060  
##           24) Region=R26,R31,R41,R42,R54,R72,R73,R74,R82,R91,R93 7184   1561.2800 0.08583189 *
##           25) Region=R11,R21,R22,R23,R24,R25,R43,R52,R53,R83,R94 7467   2408.4170 0.13922660  
##             50) VehPower>=6.5 3364    839.6734 0.10574280 *
##             51) VehPower< 6.5 4103   1551.0830 0.16587490 *
##         13) BonusMalus>=61.5 3921   1206.8590 0.18215570 *
##        7) VehBrand=B12 33376  20269.5400 0.45580580  
##         14) VehGas=Diesel 16691   6309.3660 0.23662770  
##           28) VehPower< 4.5 2888    419.4473 0.07331867 *
##           29) VehPower>=4.5 13803   5762.8930 0.26973000  
##             58) VehPower>=7.5 7070   2229.2200 0.19525490  
##              116) VehPower< 9.5 3962    608.3534 0.08496435 *
##              117) VehPower>=9.5 3108   1480.8970 0.32727070 *
##             59) VehPower< 7.5 6733   3458.3520 0.34580160  
##              118) Region=R11,R21,R23,R24,R31,R53,R72,R73,R83,R94 3616   1598.8090 0.27410270 *
##              119) Region=R22,R25,R26,R41,R42,R43,R52,R54,R74,R82,R91,R93 3117   1829.8880 0.42715720 *
##         15) VehGas=Regular 16685  13042.6300 0.66668610  
##           30) VehPower>=9.5 2427   1196.5010 0.30341220 *
##           31) VehPower< 9.5 14258  11670.6500 0.72735490  
##             62) VehPower< 4.5 6073   3997.1800 0.54995700  
##              124) DrivAge>=56.5 1660    785.1871 0.31200350 *
##              125) DrivAge< 56.5 4413   3146.9050 0.63719760 *
##             63) VehPower>=4.5 8185   7559.7260 0.84731730  
##              126) VehPower>=6.5 3513   2506.3070 0.56742610 *
##              127) VehPower< 6.5 4672   4896.6810 1.05032900  
##                254) DrivAge>=41.5 3030   3124.8750 0.94912390 *
##                255) DrivAge< 41.5 1642   1751.2660 1.21310500 *
rpart.plot(tree1)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

average_loss <- cbind(tree1$cptable[,2], tree1$cptable[,3], tree1$cptable[,3]* tree1$frame$dev[1] / n_l)

plot(x=average_loss[,1], y=average_loss[,3]*100, type='l', col="blue", ylim=c(30.5,33.5), 
     xlab="number of splits", ylab="average in-sample loss (in 10^(-2))", 
     main="decrease of in-sample loss")
points(x=average_loss[,1], y=average_loss[,3]*100, pch=19, col="blue")

learn$fit <- predict(tree1)*learn$Exposure
test$fit <- predict(tree1, newdata=test)*test$Exposure

c(Poisson.Deviance(learn$fit, learn$ClaimNb),Poisson.Deviance(test$fit, test$ClaimNb))
## [1] 30.50441 30.25269

Cost-complexity pruning (cross-validation)

K <- 10

tree2 <- rpart(cbind(Exposure,ClaimNb) ~ Area + VehPower + VehAge + DrivAge +
                 BonusMalus + VehBrand + VehGas + Density + Region, 
               data = learn, method="poisson",
               control=rpart.control(xval=K, minbucket=1500, cp=0.00005))

printcp(tree2)
## 
## Rates regression tree:
## rpart(formula = cbind(Exposure, ClaimNb) ~ Area + VehPower + 
##     VehAge + DrivAge + BonusMalus + VehBrand + VehGas + Density + 
##     Region, data = learn, method = "poisson", control = rpart.control(xval = K, 
##     minbucket = 1500, cp = 5e-05))
## 
## Variables actually used in tree construction:
## [1] Area       BonusMalus Density    DrivAge    Region     VehAge     VehBrand  
## [8] VehGas     VehPower  
## 
## Root node error: 202513/610212 = 0.33187
## 
## n= 610212 
## 
##            CP nsplit rel error  xerror      xstd
## 1  2.3141e-02      0   1.00000 1.00000 0.0046651
## 2  1.5520e-02      1   0.97686 0.97687 0.0045185
## 3  7.0101e-03      2   0.96134 0.96136 0.0044763
## 4  6.3579e-03      3   0.95433 0.95438 0.0044470
## 5  4.5308e-03      4   0.94797 0.94803 0.0044267
## 6  2.3239e-03      5   0.94344 0.94351 0.0044187
## 7  1.6271e-03      6   0.94112 0.94130 0.0044237
## 8  1.0708e-03      7   0.93949 0.93970 0.0044240
## 9  8.6648e-04      8   0.93842 0.93863 0.0044176
## 10 8.1944e-04      9   0.93755 0.93788 0.0044196
## 11 7.0488e-04     11   0.93591 0.93679 0.0044192
## 12 6.6782e-04     12   0.93521 0.93616 0.0044178
## 13 6.2725e-04     14   0.93387 0.93494 0.0044165
## 14 5.6915e-04     15   0.93325 0.93378 0.0044167
## 15 5.4983e-04     16   0.93268 0.93317 0.0044163
## 16 5.3303e-04     17   0.93213 0.93283 0.0044167
## 17 5.3155e-04     18   0.93159 0.93242 0.0044165
## 18 5.0070e-04     20   0.93053 0.93164 0.0044086
## 19 3.7101e-04     21   0.93003 0.93086 0.0044032
## 20 3.6772e-04     22   0.92966 0.93039 0.0044051
## 21 3.4401e-04     23   0.92929 0.93029 0.0044059
## 22 3.4003e-04     24   0.92895 0.93007 0.0044061
## 23 3.2140e-04     25   0.92861 0.92977 0.0044060
## 24 3.1363e-04     26   0.92829 0.92973 0.0044062
## 25 2.9945e-04     27   0.92797 0.92955 0.0044058
## 26 2.7277e-04     28   0.92767 0.92915 0.0044053
## 27 2.3199e-04     29   0.92740 0.92878 0.0044059
## 28 2.2953e-04     30   0.92717 0.92850 0.0044057
## 29 2.2541e-04     31   0.92694 0.92839 0.0044058
## 30 2.2274e-04     32   0.92671 0.92837 0.0044041
## 31 2.0229e-04     33   0.92649 0.92815 0.0044038
## 32 2.0082e-04     35   0.92609 0.92794 0.0044015
## 33 1.9989e-04     36   0.92588 0.92793 0.0044015
## 34 1.7659e-04     37   0.92568 0.92797 0.0044028
## 35 1.7218e-04     38   0.92551 0.92787 0.0043997
## 36 1.7043e-04     39   0.92534 0.92775 0.0043995
## 37 1.6532e-04     40   0.92517 0.92778 0.0044001
## 38 1.6199e-04     41   0.92500 0.92778 0.0044007
## 39 1.4733e-04     42   0.92484 0.92765 0.0044007
## 40 1.4653e-04     43   0.92469 0.92745 0.0044012
## 41 1.4643e-04     45   0.92440 0.92744 0.0044011
## 42 1.3500e-04     46   0.92425 0.92746 0.0044112
## 43 1.3457e-04     47   0.92412 0.92736 0.0044112
## 44 1.3173e-04     48   0.92398 0.92736 0.0044113
## 45 1.2843e-04     49   0.92385 0.92731 0.0044116
## 46 1.2827e-04     51   0.92359 0.92726 0.0044109
## 47 1.2307e-04     52   0.92346 0.92719 0.0044103
## 48 1.1990e-04     54   0.92322 0.92711 0.0044108
## 49 1.1939e-04     55   0.92310 0.92705 0.0044110
## 50 1.1018e-04     56   0.92298 0.92692 0.0044113
## 51 1.0783e-04     57   0.92287 0.92672 0.0044108
## 52 1.0739e-04     58   0.92276 0.92671 0.0044107
## 53 1.0534e-04     59   0.92265 0.92668 0.0044099
## 54 1.0352e-04     60   0.92255 0.92668 0.0044101
## 55 1.0335e-04     61   0.92245 0.92665 0.0044101
## 56 1.0142e-04     63   0.92224 0.92665 0.0044104
## 57 1.0027e-04     64   0.92214 0.92665 0.0044107
## 58 9.7016e-05     65   0.92204 0.92661 0.0044106
## 59 9.3493e-05     66   0.92194 0.92665 0.0044113
## 60 9.3193e-05     67   0.92185 0.92659 0.0044111
## 61 8.7209e-05     68   0.92175 0.92662 0.0044117
## 62 8.6715e-05     69   0.92167 0.92648 0.0044108
## 63 8.6587e-05     70   0.92158 0.92648 0.0044108
## 64 8.1225e-05     71   0.92149 0.92650 0.0044113
## 65 8.0671e-05     72   0.92141 0.92659 0.0044122
## 66 7.7671e-05     73   0.92133 0.92668 0.0044135
## 67 7.5310e-05     74   0.92125 0.92673 0.0044146
## 68 7.0382e-05     75   0.92118 0.92682 0.0044145
## 69 6.8652e-05     76   0.92111 0.92686 0.0044127
## 70 6.5720e-05     77   0.92104 0.92685 0.0044115
## 71 6.4420e-05     78   0.92097 0.92686 0.0044119
## 72 6.4188e-05     79   0.92091 0.92687 0.0044120
## 73 6.4096e-05     83   0.92065 0.92687 0.0044120
## 74 6.3549e-05     84   0.92059 0.92679 0.0044080
## 75 6.2081e-05     85   0.92052 0.92694 0.0044097
## 76 6.0162e-05     86   0.92046 0.92716 0.0044120
## 77 5.8946e-05     87   0.92040 0.92724 0.0044127
## 78 5.8227e-05     88   0.92034 0.92735 0.0044134
## 79 5.7661e-05     89   0.92028 0.92737 0.0044138
## 80 5.6603e-05     90   0.92022 0.92741 0.0044138
## 81 5.6457e-05     91   0.92017 0.92742 0.0044131
## 82 5.6399e-05     92   0.92011 0.92740 0.0044130
## 83 5.5965e-05     95   0.91994 0.92745 0.0044133
## 84 5.4224e-05     96   0.91989 0.92756 0.0044143
## 85 5.4126e-05     97   0.91983 0.92760 0.0044144
## 86 5.3759e-05     98   0.91978 0.92761 0.0044147
## 87 5.3468e-05     99   0.91972 0.92763 0.0044149
## 88 5.3291e-05    100   0.91967 0.92760 0.0044147
## 89 5.2560e-05    101   0.91962 0.92761 0.0044148
## 90 5.1468e-05    102   0.91956 0.92773 0.0044153
## 91 5.1313e-05    103   0.91951 0.92777 0.0044156
## 92 5.1187e-05    104   0.91946 0.92778 0.0044156
## 93 5.0902e-05    105   0.91941 0.92777 0.0044155
## 94 5.0559e-05    106   0.91936 0.92776 0.0044155
## 95 5.0414e-05    109   0.91921 0.92776 0.0044155
## 96 5.0000e-05    110   0.91916 0.92783 0.0044160
library(plotrix)
library(Hmisc)
## Warning: 程序包'Hmisc'是用R版本4.4.2 来建造的
## 
## 载入程序包:'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
set.seed(100)
xgroup <- rep(1:K, length = nrow(learn))
xfit <- xpred.rpart(tree2, xgroup)
(n_subtrees <- dim(tree2$cptable)[1])
## [1] 96
std3 <- numeric(n_subtrees)
err3 <- numeric(n_subtrees)
err_group <- numeric(K)
for (i in 1:n_subtrees){
  for (k in 1:K){
    ind_group <- which(xgroup ==k)  
    err_group[k] <- 2*sum(learn[ind_group,"Exposure"]*xfit[ind_group,i]-
                            learn[ind_group,"ClaimNb"]+log((learn[ind_group,"ClaimNb"]/
                                                              (learn[ind_group,"Exposure"]*xfit[ind_group,i]))
                                                           ^learn[ind_group,"ClaimNb"]))
  }
  err3[i] <- mean(err_group)             
  std3[i] <- sd(err_group)
}
average_loss <- cbind(tree2$cptable[,2], tree2$cptable[,3], tree2$cptable[,3]* tree2$frame$dev[1] / n_l)
y1 <- err3*K/n_l
s1 <- K*std3/n_l
x1 <- log10(tree2$cptable[,1])

xmain <- "cross-validation error plot"
xlabel <- "cost-complexity parameter (log-scale)"
ylabel <- "CV error (in 10^(-2))"

errbar(x=x1, y=y1*100, yplus=(y1+s1)*100, yminus=(y1-s1)*100, xlim=rev(range(x1)), 
       col="blue", main=xmain, ylab=ylabel, xlab=xlabel)
lines(x=x1, y=y1*100, col="blue")
abline(h=c(min(y1+s1)*100), lty=1, col="orange")
abline(h=c(min(y1)*100), lty=1, col="magenta")
legend(x="topright", col=c("blue", "orange", "magenta"), lty=c(1,1,1), 
       lwd=c(1,1,1), pch=c(19,-1,-1), legend=c("tree2", "1-SD rule", "min.CV rule"))

1-SD rule: cp 0.003 (4-5 splits)

min.CV rule: minimal cp (33 splits)

learn$fit <- predict(tree2)*learn$Exposure
test$fit <- predict(tree2, newdata=test)*test$Exposure

c(Poisson.Deviance(learn$fit, learn$ClaimNb),Poisson.Deviance(test$fit, test$ClaimNb))
## [1] 30.50441 30.25269

Model RT3

tree3 <- prune(tree2, cp=0.0001)

rpart.plot(tree3)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

learn$fit <- predict(tree3)*learn$Exposure
test$fit <- predict(tree3, newdata=test)*test$Exposure

c(Poisson.Deviance(learn$fit, learn$ClaimNb),Poisson.Deviance(test$fit, test$ClaimNb))
## [1] 30.59995 30.24319

第二题

Possion boosting machine

choice of learning and test sample

set.seed(100)
ll <- sample(c(1:nrow(freMTPL2freq)), round(0.9*nrow(freMTPL2freq)), replace = FALSE) 
learn <- freMTPL2freq[ll,]
test <- freMTPL2freq[-ll,]
n_l = nrow(learn)
n_t = nrow(test)

Poisson boosting machine

library(rpart)
J0 <- 3      #depth of tree
M0 <- 50      #iterations
learn$fit0 <- learn$Exposure
test$fit0  <- test$Exposure
for (m in 1:M0){
  PBM.1 <- rpart(cbind(fit0,ClaimNb) ~ Area + VehPower + VehAge + DrivAge 
                 + BonusMalus + VehBrand + VehGas + Density + Region, 
                 data=learn, method="poisson",
                 control=rpart.control(maxdepth=J0, maxsurrogate=0, xval=1, 
                                       minbucket=10000, cp=0.00005))     
  learn$fit0 <- learn$fit0 * predict(PBM.1)
  learn[,paste("PBM_",m, sep="")] <-  learn$fit0
  test$fit0 <- test$fit0 * predict(PBM.1, newdata=test)
  test[,paste("PBM_",m, sep="")] <-  test$fit0
}

in-sample loss and out-of-sample loss

losses <- array(NA, c(2,M0))
for (m in 1:M0){
  losses[1,m] <- 200*(sum(learn[,paste("PBM_",m, sep="")])-sum(learn$ClaimNb)+
                        sum(log((learn$ClaimNb/learn[,paste("PBM_",m, sep="")])^
                                  (learn$ClaimNb))))/n_l
  losses[2,m] <- 200*(sum(test[,paste("PBM_",m, sep="")])-sum(test$ClaimNb)+
                        sum(log((test$ClaimNb/test[,paste("PBM_",m, sep="")])^
                                  (test$ClaimNb))))/n_t
}

losses[,M0] 
## [1] 30.56398 30.18851
plot(x=c(0:M0), y=c(32.93518, losses[1,]), type='l', col="red", ylim=c(30,33.5),
     xlab="number of iterations", ylab="average in-sample loss (in 10^(-2))", 
     main=paste("decrease of in-sample loss (depth=", J0,")", sep=""))
points(x=c(0:M0), y=c(32.93518, losses[1,]), pch=19, col="red")

plot(x=c(1:M0), y=losses[2,], type='l', lwd=2, col="red", ylim=c(30,31.5), 
     xlab="number of iterations", ylab="average out-of-sample loss (in 10^(-2))", 
     main="decrease of out-of-sample loss")