library(insuranceData)
data('dataCar')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(doBy)
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)
# Exploratory overview of `dataCar`.
# The columns are reached through with(dataCar, ...) instead of attach():
# an attached data frame is a copy placed on the search path, so it can be
# masked by other objects and does not reflect later changes to `dataCar`
# (such as the `gender_agecat` column added further down).
glimpse(dataCar)
## Observations: 67,856
## Variables: 11
## $ veh_value <dbl> 1.06, 1.03, 3.26, 4.14, 0.72, 2.01, 1.60, 1.47, 0.52...
## $ exposure <dbl> 0.3039014, 0.6488706, 0.5694730, 0.3175907, 0.648870...
## $ clm <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1...
## $ numclaims <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1...
## $ claimcst0 <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.00...
## $ veh_body <fct> HBACK, HBACK, UTE, STNWG, HBACK, HDTOP, PANVN, HBACK...
## $ veh_age <int> 3, 2, 2, 2, 4, 3, 3, 2, 4, 4, 2, 3, 2, 1, 3, 2, 3, 3...
## $ gender <fct> F, F, F, F, F, M, M, M, F, F, M, M, F, M, M, M, F, M...
## $ area <fct> C, A, E, D, C, C, A, B, A, B, A, C, C, A, B, C, F, C...
## $ agecat <int> 2, 4, 2, 2, 2, 4, 4, 6, 3, 4, 2, 4, 4, 5, 6, 4, 4, 4...
## $ X_OBSTAT_ <fct> 01101 0 0 0, 01101 0 0 0, 01101 ...
# Cross-tabulate each rating factor against the claim indicator `clm`.
# Passing bare column names inside with() keeps the "veh_body" / "clm"
# dimension labels in the printed tables.
with(dataCar, table(veh_body, clm))
## clm
## veh_body 0 1
## BUS 39 9
## CONVT 78 3
## COUPE 712 68
## HBACK 17651 1264
## HDTOP 1449 130
## MCARA 113 14
## MIBUS 674 43
## PANVN 690 62
## RDSTR 25 2
## SEDAN 20757 1476
## STNWG 15088 1173
## TRUCK 1630 120
## UTE 4326 260
with(dataCar, table(veh_age, clm))
## clm
## veh_age 0 1
## 1 11432 825
## 2 15328 1259
## 3 18702 1362
## 4 17770 1178
with(dataCar, table(agecat, clm))
## clm
## agecat 0 1
## 1 5246 496
## 2 11943 932
## 3 14654 1113
## 4 15085 1104
## 5 10122 614
## 6 6182 365
with(dataCar, table(area, clm))
## clm
## area 0 1
## A 15227 1085
## B 12376 965
## C 19128 1412
## D 7677 496
## E 5526 386
## F 3298 280
with(dataCar, table(gender, clm))
## clm
## gender 0 1
## F 35955 2648
## M 27277 1976
# Same factors against the number of claims per row (`numclaims`).
with(dataCar, table(veh_body, numclaims))
## numclaims
## veh_body 0 1 2 3 4
## BUS 39 8 1 0 0
## CONVT 78 3 0 0 0
## COUPE 712 61 7 0 0
## HBACK 17651 1202 58 4 0
## HDTOP 1449 124 6 0 0
## MCARA 113 13 1 0 0
## MIBUS 674 41 2 0 0
## PANVN 690 57 4 1 0
## RDSTR 25 1 1 0 0
## SEDAN 20757 1361 108 7 0
## STNWG 15088 1105 63 3 2
## TRUCK 1630 112 6 2 0
## UTE 4326 245 14 1 0
with(dataCar, table(veh_age, numclaims))
## numclaims
## veh_age 0 1 2 3 4
## 1 11432 775 49 1 0
## 2 15328 1169 85 5 0
## 3 18702 1285 72 3 2
## 4 17770 1104 65 9 0
with(dataCar, table(agecat, numclaims))
## numclaims
## agecat 0 1 2 3 4
## 1 5246 468 27 1 0
## 2 11943 869 58 5 0
## 3 14654 1044 63 5 1
## 4 15085 1027 73 4 0
## 5 10122 583 29 1 1
## 6 6182 342 21 2 0
with(dataCar, table(area, numclaims))
## numclaims
## area 0 1 2 3 4
## A 15227 996 82 7 0
## B 12376 916 43 5 1
## C 19128 1332 79 1 0
## D 7677 469 26 1 0
## E 5526 363 20 2 1
## F 3298 257 21 2 0
with(dataCar, table(gender, numclaims))
## numclaims
## gender 0 1 2 3 4
## F 35955 2477 160 9 2
## M 27277 1856 111 9 0
# 2) Claim occurrence (clm) by rating factor.
# In every chart the bar height is count / sum(count) rather than a raw
# count, and the fill colour separates the values of clm.
# NOTE(review): `..count..` is the legacy dot-dot notation; ggplot2 >= 3.4.0
# deprecates it in favour of after_stat(count). Kept as-is so the script
# still runs on the older package versions this transcript was produced with.

# Claim occurrence by vehicle body type
ggplot(dataCar, aes(x = veh_body, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Same chart for the HBACK / SEDAN / STNWG body types only, with one panel
# per age category (agecat)
ggplot(
  dataCar[dataCar$veh_body %in% c("HBACK", "SEDAN", "STNWG"), ],
  aes(x = veh_body, fill = factor(clm))
) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  facet_wrap(~agecat) +
  theme_bw()

# Claim occurrence by age category
ggplot(dataCar, aes(x = agecat, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Claim occurrence by gender
ggplot(dataCar, aes(x = gender, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()

# Claim occurrence by gender x age category: build a combined key such as
# "F2" or "M6" and plot against it
dataCar$gender_agecat <- paste0(dataCar$gender, dataCar$agecat)
ggplot(dataCar, aes(x = gender_agecat, fill = factor(clm))) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  theme_bw()
# Claim amounts for the five most frequent vehicle body types, looking only
# at rows with a claim (clm == 1, i.e. zero-claim rows excluded).
# `top5_veh` holds the row counts of those five body types, largest first;
# its names are the body-type labels.
top5_veh <- local({
  body_counts <- table(dataCar$veh_body)
  body_counts[order(-body_counts)][1:5]
})
ggplot(
  data = subset(dataCar, clm == 1 & veh_body %in% names(top5_veh)),
  mapping = aes(x = veh_body, y = claimcst0, color = veh_body)
) +
  geom_boxplot() +
  theme_bw()
# Build the data set for the GLM: rows with a claim (clm == 1) whose body
# type is one of the five in `top5_veh` (the five most frequent body types).
dim(subset(dataCar, clm == 1))
## [1] 4624 12
# X_OBSTAT_ is dropped by name rather than by position (it was column 11):
# a positional index would silently remove a different column if the column
# layout of `dataCar` ever changed.
dataCar_final <- subset(
  dataCar,
  clm == 1 & veh_body %in% names(top5_veh),
  select = -X_OBSTAT_
)
# Convert the integer-coded categories to factors
str(dataCar_final)
## 'data.frame': 4293 obs. of 11 variables:
## $ veh_value : num 1.66 1.51 0.76 1.89 4.06 1.39 2.66 0.5 1.16 2.15 ...
## $ exposure : num 0.485 0.994 0.539 0.654 0.851 ...
## $ clm : int 1 1 1 1 1 1 1 1 1 1 ...
## $ numclaims : int 1 1 1 2 1 1 1 1 2 1 ...
## $ claimcst0 : num 670 807 402 1812 5434 ...
## $ veh_body : Factor w/ 13 levels "BUS","CONVT",..: 10 10 4 11 11 4 11 4 11 10 ...
## $ veh_age : int 3 3 3 3 2 3 1 4 4 3 ...
## $ gender : Factor w/ 2 levels "F","M": 2 1 2 2 2 1 1 1 1 1 ...
## $ area : Factor w/ 6 levels "A","B","C","D",..: 2 6 3 6 6 1 6 1 2 1 ...
## $ agecat : int 6 4 4 2 3 4 5 5 2 5 ...
## $ gender_agecat: chr "M6" "F4" "M4" "M2" ...
dataCar_final$veh_age <- as.factor(dataCar_final$veh_age)
dataCar_final$agecat <- as.factor(dataCar_final$agecat)
# Check the frequency of each level of the categorical variables
summary(dataCar_final[c("veh_body", "veh_age", "gender", "area", "agecat")])
## veh_body veh_age gender area agecat
## SEDAN :1476 1: 795 F:2497 A: 994 1: 456
## HBACK :1264 2:1201 M:1796 B: 886 2: 861
## STNWG :1173 3:1259 C:1329 3:1021
## UTE : 260 4:1038 D: 461 4:1021
## TRUCK : 120 E: 367 5: 582
## BUS : 0 F: 256 6: 352
## (Other): 0
# Reset the reference (baseline) level of each factor; the GLM coefficients
# for these variables are then contrasts against SEDAN / age 3 / area C /
# age category 3.
dataCar_final$veh_body <- relevel(dataCar_final$veh_body, "SEDAN")
dataCar_final$veh_age <- relevel(dataCar_final$veh_age, "3")
dataCar_final$area <- relevel(dataCar_final$area, "C")
dataCar_final$agecat <- relevel(dataCar_final$agecat, "3")
# Fit the generalized linear model: claimcst0 regressed on every other
# column of the selected subset, Gamma family with a log link.
names(dataCar_final)
## [1] "veh_value" "exposure" "clm" "numclaims"
## [5] "claimcst0" "veh_body" "veh_age" "gender"
## [9] "area" "agecat" "gender_agecat"
# `clm` and `gender_agecat` are left out of the predictor set via `select`.
fit <- glm(
  formula = claimcst0 ~ .,
  family = Gamma(link = "log"),
  data = subset(
    dataCar_final,
    select = c(
      veh_value, exposure, numclaims, claimcst0, veh_body,
      veh_age, gender, area, agecat
    )
  )
)
summary(fit)
##
## Call:
## glm(formula = claimcst0 ~ ., family = Gamma(link = "log"), data = subset(dataCar_final,
## select = c(veh_value, exposure, numclaims, claimcst0, veh_body,
## veh_age, gender, area, agecat)))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0041 -1.3423 -0.7833 0.1017 6.0774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.30843 0.14278 51.185 < 2e-16 ***
## veh_value 0.03380 0.03397 0.995 0.31985
## exposure -0.83705 0.09502 -8.809 < 2e-16 ***
## numclaims 0.51883 0.09147 5.672 1.5e-08 ***
## veh_bodyHBACK 0.13101 0.06430 2.037 0.04166 *
## veh_bodySTNWG -0.01403 0.07733 -0.181 0.85603
## veh_bodyTRUCK 0.26872 0.15849 1.696 0.09005 .
## veh_bodyUTE 0.10061 0.11385 0.884 0.37693
## veh_age1 -0.07025 0.08288 -0.848 0.39667
## veh_age2 -0.02959 0.06884 -0.430 0.66732
## veh_age4 0.08312 0.07231 1.149 0.25043
## genderM 0.13491 0.05223 2.583 0.00983 **
## areaA -0.05874 0.06783 -0.866 0.38654
## areaB -0.08402 0.07011 -1.198 0.23082
## areaD -0.09870 0.08821 -1.119 0.26321
## areaE 0.02958 0.09649 0.307 0.75922
## areaF 0.24347 0.11435 2.129 0.03330 *
## agecat1 0.24649 0.09170 2.688 0.00722 **
## agecat2 0.12443 0.07480 1.663 0.09629 .
## agecat4 0.05073 0.07189 0.706 0.48042
## agecat5 -0.09018 0.08428 -1.070 0.28469
## agecat6 0.02684 0.10086 0.266 0.79014
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 2.600675)
##
## Null deviance: 6799.2 on 4292 degrees of freedom
## Residual deviance: 6369.8 on 4271 degrees of freedom
## AIC: 73357
##
## Number of Fisher Scoring iterations: 7
### 7. Compare the effect of each factor level relative to its reference level
# List the components stored in the fitted glm object.
ls(fit)
## [1] "aic" "boundary" "call"
## [4] "coefficients" "contrasts" "control"
## [7] "converged" "data" "deviance"
## [10] "df.null" "df.residual" "effects"
## [13] "family" "fitted.values" "formula"
## [16] "iter" "linear.predictors" "method"
## [19] "model" "null.deviance" "offset"
## [22] "prior.weights" "qr" "R"
## [25] "rank" "residuals" "terms"
## [28] "weights" "xlevels" "y"
# Exponentiate the coefficients. The model uses a log link, so each value is
# a multiplicative effect on the expected claimcst0; for factor terms it is
# relative to that factor's reference level.
# The printed column name is derived from this exact expression.
data.frame(exp(fit$coefficients))
## exp.fit.coefficients.
## (Intercept) 1492.8325741
## veh_value 1.0343750
## exposure 0.4329840
## numclaims 1.6800655
## veh_bodyHBACK 1.1399844
## veh_bodySTNWG 0.9860664
## veh_bodyTRUCK 1.3082833
## veh_bodyUTE 1.1058407
## veh_age1 0.9321602
## veh_age2 0.9708419
## veh_age4 1.0866741
## genderM 1.1444302
## areaA 0.9429511
## areaB 0.9194130
## areaD 0.9060118
## areaE 1.0300179
## areaF 1.2756623
## agecat1 1.2795203
## agecat2 1.1325045
## agecat4 1.0520407
## agecat5 0.9137680
## agecat6 1.0272076