Describe Your Data:
# Data preparation
read.csv("insurance.csv")
insurance_data <- read.csv("insurance.csv")
Data review
# Amount of date after cleaning
nrow(insurance_data)
## [1] 8163
# Cleaning of unnecessary symbols
unique(insurance_data$KIDSDRIV)
## [1] 0 1 2 3 4
unique(insurance_data$HOMEKIDS)
## [1] 0 1 2 3 4 5
unique(insurance_data$PARENT1)
## [1] "No" "Yes"
unique(insurance_data$MSTATUS)
## [1] "No" "Yes"
unique(insurance_data$GENDER)
## [1] "M" "F"
unique(insurance_data$EDUCATION)
## [1] "PhD" "High School" "Bachelors" "Masters"
unique(insurance_data$OCCUPATION)
## [1] "Professional" "Blue Collar" "Manager" "Clerical" "Lawyer"
## [6] "" "Home Maker" "Doctor" "Student"
unique(insurance_data$URBANICITY)
## [1] "Urban" "Rural"
# min/max values
sapply(insurance_data, min)
## KIDSDRIV BIRTH AGE HOMEKIDS YOJ INCOME
## "0" "01APR39" "16" "0" "0" "0"
## PARENT1 HOME_VAL MSTATUS GENDER EDUCATION OCCUPATION
## "Yes" "0" "Yes" "F" "Bachelors" ""
## TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR
## "5" "Commercial" "1500" "1" "Minivan" "yes"
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS CLM_AMT CAR_AGE
## "0" "0" "Yes" "0" "0" "-3"
## CLAIM_FLAG URBANICITY
## "0" "Rural"
sapply(insurance_data, max)
## KIDSDRIV BIRTH AGE HOMEKIDS YOJ INCOME PARENT1
## "4" "31OCT73" "81" "5" "23" "367030" "No"
## HOME_VAL MSTATUS GENDER EDUCATION OCCUPATION TRAVTIME CAR_USE
## "885282" "No" "M" "PhD" "Student" "142" "Private"
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED
## "69740" "25" "Van" "no" "57037" "5" "No"
## MVR_PTS CLM_AMT CAR_AGE CLAIM_FLAG URBANICITY
## "13" "85524" "28" "1" "Urban"
# Determine the data types of a data frame's columns
str(insurance_data)
## 'data.frame': 8163 obs. of 26 variables:
## $ KIDSDRIV : int 0 0 0 0 0 1 0 0 0 0 ...
## $ BIRTH : chr "16MAR39" "21JAN56" "18NOV51" "05MAR64" ...
## $ AGE : int 60 43 48 35 34 40 44 34 50 53 ...
## $ HOMEKIDS : int 0 0 0 1 1 1 2 0 0 0 ...
## $ YOJ : int 11 11 11 10 12 11 12 10 7 14 ...
## $ INCOME : num 67349 91449 52881 16039 125301 ...
## $ PARENT1 : chr "No" "No" "No" "No" ...
## $ HOME_VAL : num 0 257252 0 124191 0 ...
## $ MSTATUS : chr "No" "No" "No" "Yes" ...
## $ GENDER : chr "M" "M" "M" "F" ...
## $ EDUCATION : chr "PhD" "High School" "Bachelors" "High School" ...
## $ OCCUPATION: chr "Professional" "Blue Collar" "Manager" "Clerical" ...
## $ TRAVTIME : int 14 22 26 5 46 21 30 34 48 15 ...
## $ CAR_USE : chr "Private" "Commercial" "Private" "Private" ...
## $ BLUEBOOK : num 14230 14940 21970 4010 17430 ...
## $ TIF : int 11 1 1 4 1 6 10 1 7 1 ...
## $ CAR_TYPE : chr "Minivan" "Minivan" "Van" "SUV" ...
## $ RED_CAR : chr "yes" "yes" "yes" "no" ...
## $ OLDCLAIM : num 4461 0 0 38690 0 ...
## $ CLM_FREQ : int 2 0 0 2 0 1 0 0 0 0 ...
## $ REVOKED : chr "No" "No" "No" "No" ...
## $ MVR_PTS : int 3 0 2 3 0 2 0 0 1 0 ...
## $ CLM_AMT : num 0 0 0 0 2946 ...
## $ CAR_AGE : int 18 1 10 10 7 1 10 1 17 11 ...
## $ CLAIM_FLAG: int 0 0 0 0 1 1 0 1 0 1 ...
## $ URBANICITY: chr "Urban" "Urban" "Urban" "Urban" ...
## - attr(*, "na.action")= 'omit' Named int [1:2139] 5 6 8 11 18 26 30 36 37 55 ...
## ..- attr(*, "names")= chr [1:2139] "5" "6" "8" "11" ...
insurance_data <- insurance_data %>% add_column(REPEAT5 = if_else(.$CLM_FREQ == 0, 0, 1))
| Column name | Type | Description |
|---|---|---|
| ID | integer | Customer ID Number |
| KIDSDRIV | integer | # of children (teenagers) driving the car |
| BIRTH | datetime string | Date of birth |
| AGE | integer | Age of driver |
| HOMEKIDS | integer | # of children at home |
| YOJ | integer | Years on job |
| INCOME | integer | Income |
| PARENT1 | character string | Single parent y/n |
| HOME_VAL | integer | Value of home |
| MSTATUS | character | Marital status |
| GENDER | character | Gender |
| EDUCATION | character string | Maximum education level |
| OCCUPATION | character string | Occupation |
| TRAVTIME | integer | Distance to work |
| CAR_USE | character string | Vehicle use |
| BLUEBOOK | integer | Value of vehicle |
| CAR_TYPE | character string | Type of car |
| OLDCLAIM | integer | Payouts, last 5 years |
| CLM_FREQ | integer | # of claims, last 5 years |
| REVOKED | character string | License revoked past 7 years yes/no |
| MVR_PTS | integer | Motor vehicle record points (demerits) |
| CLM_AMT | integer | Claim amount |
| REPEAT5 | boolean (binary) | IF THERE WAS CLAIMS BEFORE=1, OTHER=0 |
ggplot(insurance_data, aes(x=CLM_AMT)) +
geom_histogram(color="darkblue", fill="lightblue", bins = 100) +
labs(title = "Claim amount", x = "Value, $", y = "Count") +
theme_light()
boxplot(insurance_data$CLM_AMT, main = "Claim amount", ylab = "Value, $")
Komentaras:
ggplot(insurance_data, aes(x=YOJ)) +
geom_histogram(color="darkblue", fill="lightblue", bins = 24) +
labs(title = "Drivers Years on current job distribution", x = "Age, years", y = "Count") +
theme_light()
ggplot(insurance_data, aes(x=AGE)) +
geom_histogram(color="darkblue", fill="lightblue", bins = 67) +
labs(title = "Direvers age distribution", x = "Age, years", y = "Count") +
theme_light()
ggplot(insurance_data, aes(x=AGE)) +
geom_histogram(color="darkblue", fill="lightblue", bins = 67) +
labs(title = "Direvers age distribution", x = "Value", y = "Count") +
theme_light() +
geom_density(aes(y = stat(count)))
Komentaras:
par(mfrow = c(1,3))
boxplot(insurance_data$INCOME, main = "Distribution of\n INCOME", ylab = "Value, $")
boxplot(insurance_data$HOME_VAL, main = "Distribution of\n HOME VALUE", ylab = "Value, $")
boxplot(insurance_data$TRAVTIME, main = "Distribution of\n TRAVEL DISTANCE TO WORK", ylab = "Distance, miles")
Komentaras:
ggplot(insurance_data, aes(x=OCCUPATION, y=REPEAT5, fill=OCCUPATION)) +
geom_bar(stat="identity") +
labs(title = "Number of drivers by different groups of Ocupations",
x = "Ocupation groups", y = "Count") +
theme_light()
Komentaras:
### Correlation matrix Visualzation
# head(insurance_data )
insurance_data_cor <- insurance_data[,-c(2,7,9,10,11,12,14,17,18,21,26,27)]
# head(insurance_data_cor )
cor(insurance_data_cor)
## KIDSDRIV AGE HOMEKIDS YOJ INCOME
## KIDSDRIV 1.000000000 -0.0708930532 0.4608542587 0.049193558 -0.041471541
## AGE -0.070893053 1.0000000000 -0.4466382792 0.144778590 0.191496253
## HOMEKIDS 0.460854259 -0.4466382792 1.0000000000 0.084662218 -0.161007972
## YOJ 0.049193558 0.1447785900 0.0846622183 1.000000000 0.287987005
## INCOME -0.041471541 0.1914962532 -0.1610079719 0.287987005 1.000000000
## HOME_VAL -0.011532306 0.2158295117 -0.1067155429 0.272838927 0.581958563
## TRAVTIME 0.009195038 0.0022628219 -0.0051775750 -0.018413761 -0.046438182
## BLUEBOOK -0.019774482 0.1719095615 -0.1040193229 0.146840896 0.434257195
## TIF -0.006083834 -0.0003408653 0.0008658922 0.029198422 0.003059332
## OLDCLAIM 0.021420132 -0.0360179588 0.0311416785 -0.001692864 -0.033134710
## CLM_FREQ 0.039470256 -0.0383173915 0.0382038044 -0.021852281 -0.043903368
## MVR_PTS 0.052658731 -0.0792813001 0.0675903682 -0.040207142 -0.070170590
## CLM_AMT 0.058282849 -0.0581052718 0.0682082798 -0.019393815 -0.054994229
## CAR_AGE -0.056155454 0.1847550727 -0.1635226839 0.060535725 0.414392832
## CLAIM_FLAG 0.100733843 -0.1159015521 0.1238192883 -0.062850633 -0.142441531
## HOME_VAL TRAVTIME BLUEBOOK TIF OLDCLAIM
## KIDSDRIV -0.0115323056 0.009195038 -0.019774482 -0.0060838345 0.021420132
## AGE 0.2158295117 0.002262822 0.171909562 -0.0003408653 -0.036017959
## HOMEKIDS -0.1067155429 -0.005177575 -0.104019323 0.0008658922 0.031141679
## YOJ 0.2728389267 -0.018413761 0.146840896 0.0291984215 -0.001692864
## INCOME 0.5819585635 -0.046438182 0.434257195 0.0030593319 -0.033134710
## HOME_VAL 1.0000000000 -0.028965962 0.266800463 0.0009306881 -0.053118684
## TRAVTIME -0.0289659625 1.000000000 -0.021091451 -0.0156521917 -0.017957660
## BLUEBOOK 0.2668004629 -0.021091451 1.000000000 0.0118722571 -0.030319263
## TIF 0.0009306881 -0.015652192 0.011872257 1.0000000000 -0.012131071
## OLDCLAIM -0.0531186844 -0.017957660 -0.030319263 -0.0121310712 1.000000000
## CLM_FREQ -0.0978039623 0.007829285 -0.046691353 -0.0200443672 0.492361674
## MVR_PTS -0.1017816578 0.004809825 -0.058763715 -0.0310002124 0.268400681
## CLM_AMT -0.0980956834 0.035851212 0.001279025 -0.0426346372 0.077953310
## CAR_AGE 0.2230697831 -0.027963492 0.187622882 0.0133816603 -0.016740911
## CLAIM_FLAG -0.1867082963 0.056980177 -0.102987790 -0.0775456641 0.143025053
## CLM_FREQ MVR_PTS CLM_AMT CAR_AGE CLAIM_FLAG
## KIDSDRIV 0.039470256 0.052658731 0.058282849 -0.05615545 0.10073384
## AGE -0.038317391 -0.079281300 -0.058105272 0.18475507 -0.11590155
## HOMEKIDS 0.038203804 0.067590368 0.068208280 -0.16352268 0.12381929
## YOJ -0.021852281 -0.040207142 -0.019393815 0.06053572 -0.06285063
## INCOME -0.043903368 -0.070170590 -0.054994229 0.41439283 -0.14244153
## HOME_VAL -0.097803962 -0.101781658 -0.098095683 0.22306978 -0.18670830
## TRAVTIME 0.007829285 0.004809825 0.035851212 -0.02796349 0.05698018
## BLUEBOOK -0.046691353 -0.058763715 0.001279025 0.18762288 -0.10298779
## TIF -0.020044367 -0.031000212 -0.042634637 0.01338166 -0.07754566
## OLDCLAIM 0.492361674 0.268400681 0.077953310 -0.01674091 0.14302505
## CLM_FREQ 1.000000000 0.404764717 0.119884936 -0.01532654 0.22663134
## MVR_PTS 0.404764717 1.000000000 0.130610301 -0.03548709 0.22665096
## CLM_AMT 0.119884936 0.130610301 1.000000000 -0.06841157 0.54535984
## CAR_AGE -0.015326542 -0.035487092 -0.068411573 1.00000000 -0.11556306
## CLAIM_FLAG 0.226631344 0.226650959 0.545359836 -0.11556306 1.00000000
corrplot(cor(insurance_data_cor),
method = 'number',
type = "lower")
corrplot(cor(insurance_data_cor), type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
### Correlation matrix Heat map
col<- colorRampPalette(c("blue", "white", "red"))(20)
heatmap(x = cor(insurance_data_cor), col = col, symm = TRUE)
Komentaras: