1. 데이터 불러오기: 내장 데이터인 dataCar 사용

library(insuranceData)
# Load the dataCar data set that ships with the insuranceData package
data("dataCar", package = "insuranceData")

2. 필요 라이브러리

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(doBy)
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)

3. 데이터 탐색

# Structure overview of the raw data.
# NOTE: attach(dataCar) was removed. It placed a snapshot copy of the columns
# on the search path (never detached), which goes stale as soon as dataCar is
# modified and can mask or be masked by other objects. Each table() below is
# scoped with with(dataCar, ...) instead; the bare column names keep the same
# row/column labels in the printed tables.
glimpse(dataCar)
## Observations: 67,856
## Variables: 11
## $ veh_value <dbl> 1.06, 1.03, 3.26, 4.14, 0.72, 2.01, 1.60, 1.47, 0.52...
## $ exposure  <dbl> 0.3039014, 0.6488706, 0.5694730, 0.3175907, 0.648870...
## $ clm       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1...
## $ numclaims <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1...
## $ claimcst0 <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.00...
## $ veh_body  <fct> HBACK, HBACK, UTE, STNWG, HBACK, HDTOP, PANVN, HBACK...
## $ veh_age   <int> 3, 2, 2, 2, 4, 3, 3, 2, 4, 4, 2, 3, 2, 1, 3, 2, 3, 3...
## $ gender    <fct> F, F, F, F, F, M, M, M, F, F, M, M, F, M, M, M, F, M...
## $ area      <fct> C, A, E, D, C, C, A, B, A, B, A, C, C, A, B, C, F, C...
## $ agecat    <int> 2, 4, 2, 2, 2, 4, 4, 6, 3, 4, 2, 4, 4, 5, 6, 4, 4, 4...
## $ X_OBSTAT_ <fct> 01101    0    0    0, 01101    0    0    0, 01101   ...

# Cross-tabulations of each categorical variable against the claim indicator
with(dataCar, table(veh_body, clm))
##         clm
## veh_body     0     1
##    BUS      39     9
##    CONVT    78     3
##    COUPE   712    68
##    HBACK 17651  1264
##    HDTOP  1449   130
##    MCARA   113    14
##    MIBUS   674    43
##    PANVN   690    62
##    RDSTR    25     2
##    SEDAN 20757  1476
##    STNWG 15088  1173
##    TRUCK  1630   120
##    UTE    4326   260
with(dataCar, table(veh_age, clm))
##        clm
## veh_age     0     1
##       1 11432   825
##       2 15328  1259
##       3 18702  1362
##       4 17770  1178
with(dataCar, table(agecat, clm))
##       clm
## agecat     0     1
##      1  5246   496
##      2 11943   932
##      3 14654  1113
##      4 15085  1104
##      5 10122   614
##      6  6182   365
with(dataCar, table(area, clm))
##     clm
## area     0     1
##    A 15227  1085
##    B 12376   965
##    C 19128  1412
##    D  7677   496
##    E  5526   386
##    F  3298   280
with(dataCar, table(gender, clm))
##       clm
## gender     0     1
##      F 35955  2648
##      M 27277  1976

# Cross-tabulations of each categorical variable against the number of claims
with(dataCar, table(veh_body, numclaims))
##         numclaims
## veh_body     0     1     2     3     4
##    BUS      39     8     1     0     0
##    CONVT    78     3     0     0     0
##    COUPE   712    61     7     0     0
##    HBACK 17651  1202    58     4     0
##    HDTOP  1449   124     6     0     0
##    MCARA   113    13     1     0     0
##    MIBUS   674    41     2     0     0
##    PANVN   690    57     4     1     0
##    RDSTR    25     1     1     0     0
##    SEDAN 20757  1361   108     7     0
##    STNWG 15088  1105    63     3     2
##    TRUCK  1630   112     6     2     0
##    UTE    4326   245    14     1     0
with(dataCar, table(veh_age, numclaims))
##        numclaims
## veh_age     0     1     2     3     4
##       1 11432   775    49     1     0
##       2 15328  1169    85     5     0
##       3 18702  1285    72     3     2
##       4 17770  1104    65     9     0
with(dataCar, table(agecat, numclaims))
##       numclaims
## agecat     0     1     2     3     4
##      1  5246   468    27     1     0
##      2 11943   869    58     5     0
##      3 14654  1044    63     5     1
##      4 15085  1027    73     4     0
##      5 10122   583    29     1     1
##      6  6182   342    21     2     0
with(dataCar, table(area, numclaims))
##     numclaims
## area     0     1     2     3     4
##    A 15227   996    82     7     0
##    B 12376   916    43     5     1
##    C 19128  1332    79     1     0
##    D  7677   469    26     1     0
##    E  5526   363    20     2     1
##    F  3298   257    21     2     0
with(dataCar, table(gender, numclaims))
##       numclaims
## gender     0     1     2     3     4
##      F 35955  2477   160     9     2
##      M 27277  1856   111     9     0

4. 데이터 시각화

# Bar charts of claim occurrence. In every chart the bar height is
# count / sum(count) (a proportion rather than a raw count) and the fill
# colour separates clm = 0 from clm = 1.

# Claim occurrence by vehicle body type
ggplot(dataCar, aes(x = veh_body, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Same chart restricted to HBACK / SEDAN / STNWG, one panel per agecat
ggplot(
  subset(dataCar, veh_body %in% c("HBACK", "SEDAN", "STNWG")),
  aes(x = veh_body, fill = factor(clm))
) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  facet_wrap(~agecat) +
  theme_bw()

# Claim occurrence by driver age category
ggplot(dataCar, aes(x = agecat, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Claim occurrence by gender
ggplot(dataCar, aes(x = gender, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Claim occurrence by gender x age category.
# The combined key (e.g. "F2", "M6") is stored as a new column on dataCar.
dataCar$gender_agecat <- paste0(dataCar$gender, dataCar$agecat)
ggplot(dataCar, aes(x = gender_agecat, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Claim amounts for the five most frequent body types, claim records only
# (clm == 1, so records without a claim are excluded).
# top5_veh holds the five largest body-type counts, named by body type.
top5_veh <- local({
  body_counts <- table(dataCar$veh_body)
  body_counts[order(-body_counts)][1:5]
})
ggplot(
  data = subset(dataCar, clm == 1 & veh_body %in% names(top5_veh)),
  aes(x = veh_body, y = claimcst0, color = veh_body)
) +
  geom_boxplot() +
  theme_bw()

5. 모델 데이터셋 확정

# Build the GLM data set: claim records (clm == 1) of the five body types in
# top5_veh. Note that top5_veh ranks body types by number of records, not by
# claim amount.
dim(subset(dataCar,clm==1))
## [1] 4624   12
# Drop X_OBSTAT_ by name rather than by position (it was column 11), so the
# selection does not silently remove a different column if the column order
# of dataCar ever changes.
dataCar_final <- subset(
  dataCar,
  clm == 1 & veh_body %in% names(top5_veh),
  select = -X_OBSTAT_
)

# Inspect the column types, then turn the integer-coded categories
# (veh_age, agecat) into factors.
str(dataCar_final)
## 'data.frame':    4293 obs. of  11 variables:
##  $ veh_value    : num  1.66 1.51 0.76 1.89 4.06 1.39 2.66 0.5 1.16 2.15 ...
##  $ exposure     : num  0.485 0.994 0.539 0.654 0.851 ...
##  $ clm          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ numclaims    : int  1 1 1 2 1 1 1 1 2 1 ...
##  $ claimcst0    : num  670 807 402 1812 5434 ...
##  $ veh_body     : Factor w/ 13 levels "BUS","CONVT",..: 10 10 4 11 11 4 11 4 11 10 ...
##  $ veh_age      : int  3 3 3 3 2 3 1 4 4 3 ...
##  $ gender       : Factor w/ 2 levels "F","M": 2 1 2 2 2 1 1 1 1 1 ...
##  $ area         : Factor w/ 6 levels "A","B","C","D",..: 2 6 3 6 6 1 6 1 2 1 ...
##  $ agecat       : int  6 4 4 2 3 4 5 5 2 5 ...
##  $ gender_agecat: chr  "M6" "F4" "M4" "M2" ...
dataCar_final[c("veh_age", "agecat")] <- lapply(
  dataCar_final[c("veh_age", "agecat")],
  as.factor
)

# Frequency of each level of the categorical predictors
summary(dataCar_final[, c("veh_body", "veh_age", "gender", "area", "agecat")])
##     veh_body    veh_age  gender   area     agecat  
##  SEDAN  :1476   1: 795   F:2497   A: 994   1: 456  
##  HBACK  :1264   2:1201   M:1796   B: 886   2: 861  
##  STNWG  :1173   3:1259            C:1329   3:1021  
##  UTE    : 260   4:1038            D: 461   4:1021  
##  TRUCK  : 120                     E: 367   5: 582  
##  BUS    :   0                     F: 256   6: 352  
##  (Other):   0
# Reset the reference (baseline) level of each factor; every chosen level is
# the most frequent one (or tied for it) in the summary above.
dataCar_final[["veh_body"]] <- relevel(dataCar_final[["veh_body"]], ref = "SEDAN")
dataCar_final[["veh_age"]] <- relevel(dataCar_final[["veh_age"]], ref = "3")
dataCar_final[["area"]] <- relevel(dataCar_final[["area"]], ref = "C")
dataCar_final[["agecat"]] <- relevel(dataCar_final[["agecat"]], ref = "3")

6. GLM 모형 생성

# Fit the generalized linear model: Gamma family with log link, response
# claimcst0, all other selected columns as predictors.
names(dataCar_final)
##  [1] "veh_value"     "exposure"      "clm"           "numclaims"    
##  [5] "claimcst0"     "veh_body"      "veh_age"       "gender"       
##  [9] "area"          "agecat"        "gender_agecat"
# clm and gender_agecat are left out of the column selection, so `~ .` does
# not pick them up as predictors.
fit <- glm(
  claimcst0 ~ .,
  data = subset(
    dataCar_final,
    select = c(
      veh_value, exposure, numclaims, claimcst0,
      veh_body, veh_age, gender, area, agecat
    )
  ),
  family = Gamma(link = "log")
)
summary(fit)
## 
## Call:
## glm(formula = claimcst0 ~ ., family = Gamma(link = "log"), data = subset(dataCar_final, 
##     select = c(veh_value, exposure, numclaims, claimcst0, veh_body, 
##         veh_age, gender, area, agecat)))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0041  -1.3423  -0.7833   0.1017   6.0774  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.30843    0.14278  51.185  < 2e-16 ***
## veh_value      0.03380    0.03397   0.995  0.31985    
## exposure      -0.83705    0.09502  -8.809  < 2e-16 ***
## numclaims      0.51883    0.09147   5.672  1.5e-08 ***
## veh_bodyHBACK  0.13101    0.06430   2.037  0.04166 *  
## veh_bodySTNWG -0.01403    0.07733  -0.181  0.85603    
## veh_bodyTRUCK  0.26872    0.15849   1.696  0.09005 .  
## veh_bodyUTE    0.10061    0.11385   0.884  0.37693    
## veh_age1      -0.07025    0.08288  -0.848  0.39667    
## veh_age2      -0.02959    0.06884  -0.430  0.66732    
## veh_age4       0.08312    0.07231   1.149  0.25043    
## genderM        0.13491    0.05223   2.583  0.00983 ** 
## areaA         -0.05874    0.06783  -0.866  0.38654    
## areaB         -0.08402    0.07011  -1.198  0.23082    
## areaD         -0.09870    0.08821  -1.119  0.26321    
## areaE          0.02958    0.09649   0.307  0.75922    
## areaF          0.24347    0.11435   2.129  0.03330 *  
## agecat1        0.24649    0.09170   2.688  0.00722 ** 
## agecat2        0.12443    0.07480   1.663  0.09629 .  
## agecat4        0.05073    0.07189   0.706  0.48042    
## agecat5       -0.09018    0.08428  -1.070  0.28469    
## agecat6        0.02684    0.10086   0.266  0.79014    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 2.600675)
## 
##     Null deviance: 6799.2  on 4292  degrees of freedom
## Residual deviance: 6369.8  on 4271  degrees of freedom
## AIC: 73357
## 
## Number of Fisher Scoring iterations: 7

7. 기준 요인 변수별 영향도 비교

# Components stored in the fitted glm object
ls(fit)
##  [1] "aic"               "boundary"          "call"             
##  [4] "coefficients"      "contrasts"         "control"          
##  [7] "converged"         "data"              "deviance"         
## [10] "df.null"           "df.residual"       "effects"          
## [13] "family"            "fitted.values"     "formula"          
## [16] "iter"              "linear.predictors" "method"           
## [19] "model"             "null.deviance"     "offset"           
## [22] "prior.weights"     "qr"                "R"                
## [25] "rank"              "residuals"         "terms"            
## [28] "weights"           "xlevels"           "y"
# Exponentiated coefficients: with the log link, each value is the
# multiplicative effect on the expected claim cost relative to the reference
# level (or per unit, for numeric predictors).
data.frame(exp.fit.coefficients. = exp(coef(fit)))
##               exp.fit.coefficients.
## (Intercept)            1492.8325741
## veh_value                 1.0343750
## exposure                  0.4329840
## numclaims                 1.6800655
## veh_bodyHBACK             1.1399844
## veh_bodySTNWG             0.9860664
## veh_bodyTRUCK             1.3082833
## veh_bodyUTE               1.1058407
## veh_age1                  0.9321602
## veh_age2                  0.9708419
## veh_age4                  1.0866741
## genderM                   1.1444302
## areaA                     0.9429511
## areaB                     0.9194130
## areaD                     0.9060118
## areaE                     1.0300179
## areaF                     1.2756623
## agecat1                   1.2795203
## agecat2                   1.1325045
## agecat4                   1.0520407
## agecat5                   0.9137680
## agecat6                   1.0272076