Homework assignment No.2

Describe Your Data:

  1. Data types.
  2. Statistcs (mean, min, max, etc. depending on the data types), use box plots and other similar plots to illustrate it.
  3. Create basic visualizations of your data.
  4. Check for periodicity in your data, show it (if there is no seasonality, show that there is no seasonality).
# Data preparation
read.csv("insurance.csv")
insurance_data <- read.csv("insurance.csv")

Data review

# Amount of date after cleaning
nrow(insurance_data)
## [1] 8163
# Cleaning of unnecessary symbols
unique(insurance_data$KIDSDRIV)
## [1] 0 1 2 3 4
unique(insurance_data$HOMEKIDS)
## [1] 0 1 2 3 4 5
unique(insurance_data$PARENT1)
## [1] "No"  "Yes"
unique(insurance_data$MSTATUS)
## [1] "No"  "Yes"
unique(insurance_data$GENDER)
## [1] "M" "F"
unique(insurance_data$EDUCATION)
## [1] "PhD"         "High School" "Bachelors"   "Masters"
unique(insurance_data$OCCUPATION)
## [1] "Professional" "Blue Collar"  "Manager"      "Clerical"     "Lawyer"      
## [6] ""             "Home Maker"   "Doctor"       "Student"
unique(insurance_data$URBANICITY)
## [1] "Urban" "Rural"
# min/max values
sapply(insurance_data, min)
##     KIDSDRIV        BIRTH          AGE     HOMEKIDS          YOJ       INCOME 
##          "0"    "01APR39"         "16"          "0"          "0"          "0" 
##      PARENT1     HOME_VAL      MSTATUS       GENDER    EDUCATION   OCCUPATION 
##        "Yes"          "0"        "Yes"          "F"  "Bachelors"           "" 
##     TRAVTIME      CAR_USE     BLUEBOOK          TIF     CAR_TYPE      RED_CAR 
##          "5" "Commercial"       "1500"          "1"    "Minivan"        "yes" 
##     OLDCLAIM     CLM_FREQ      REVOKED      MVR_PTS      CLM_AMT      CAR_AGE 
##          "0"          "0"        "Yes"          "0"          "0"         "-3" 
##   CLAIM_FLAG   URBANICITY 
##          "0"      "Rural"
sapply(insurance_data, max)
##   KIDSDRIV      BIRTH        AGE   HOMEKIDS        YOJ     INCOME    PARENT1 
##        "4"  "31OCT73"       "81"        "5"       "23"   "367030"       "No" 
##   HOME_VAL    MSTATUS     GENDER  EDUCATION OCCUPATION   TRAVTIME    CAR_USE 
##   "885282"       "No"        "M"      "PhD"  "Student"      "142"  "Private" 
##   BLUEBOOK        TIF   CAR_TYPE    RED_CAR   OLDCLAIM   CLM_FREQ    REVOKED 
##    "69740"       "25"      "Van"       "no"    "57037"        "5"       "No" 
##    MVR_PTS    CLM_AMT    CAR_AGE CLAIM_FLAG URBANICITY 
##       "13"    "85524"       "28"        "1"    "Urban"
# Determine the data types of a data frame's columns
str(insurance_data)
## 'data.frame':    8163 obs. of  26 variables:
##  $ KIDSDRIV  : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ BIRTH     : chr  "16MAR39" "21JAN56" "18NOV51" "05MAR64" ...
##  $ AGE       : int  60 43 48 35 34 40 44 34 50 53 ...
##  $ HOMEKIDS  : int  0 0 0 1 1 1 2 0 0 0 ...
##  $ YOJ       : int  11 11 11 10 12 11 12 10 7 14 ...
##  $ INCOME    : num  67349 91449 52881 16039 125301 ...
##  $ PARENT1   : chr  "No" "No" "No" "No" ...
##  $ HOME_VAL  : num  0 257252 0 124191 0 ...
##  $ MSTATUS   : chr  "No" "No" "No" "Yes" ...
##  $ GENDER    : chr  "M" "M" "M" "F" ...
##  $ EDUCATION : chr  "PhD" "High School" "Bachelors" "High School" ...
##  $ OCCUPATION: chr  "Professional" "Blue Collar" "Manager" "Clerical" ...
##  $ TRAVTIME  : int  14 22 26 5 46 21 30 34 48 15 ...
##  $ CAR_USE   : chr  "Private" "Commercial" "Private" "Private" ...
##  $ BLUEBOOK  : num  14230 14940 21970 4010 17430 ...
##  $ TIF       : int  11 1 1 4 1 6 10 1 7 1 ...
##  $ CAR_TYPE  : chr  "Minivan" "Minivan" "Van" "SUV" ...
##  $ RED_CAR   : chr  "yes" "yes" "yes" "no" ...
##  $ OLDCLAIM  : num  4461 0 0 38690 0 ...
##  $ CLM_FREQ  : int  2 0 0 2 0 1 0 0 0 0 ...
##  $ REVOKED   : chr  "No" "No" "No" "No" ...
##  $ MVR_PTS   : int  3 0 2 3 0 2 0 0 1 0 ...
##  $ CLM_AMT   : num  0 0 0 0 2946 ...
##  $ CAR_AGE   : int  18 1 10 10 7 1 10 1 17 11 ...
##  $ CLAIM_FLAG: int  0 0 0 0 1 1 0 1 0 1 ...
##  $ URBANICITY: chr  "Urban" "Urban" "Urban" "Urban" ...
##  - attr(*, "na.action")= 'omit' Named int [1:2139] 5 6 8 11 18 26 30 36 37 55 ...
##   ..- attr(*, "names")= chr [1:2139] "5" "6" "8" "11" ...
insurance_data <- insurance_data %>% add_column(REPEAT5 = if_else(.$CLM_FREQ == 0, 0, 1))

Data types

Column name Type Description
ID integer Customer ID Number
KIDSDRIV integer # of children (teenagers) driving the car
BIRTH datetime string Date of birth
AGE integer Age of driver
HOMEKIDS integer # of children at home
YOJ integer Years on job
INCOME integer Income
PARENT1 character string Single parent y/n
HOME_VAL integer Value of home
MSTATUS character Marital status
GENDER character Gender
EDUCATION character string Maximum education level
OCCUPATION character string Occupation
TRAVTIME integer Distance to work
CAR_USE character string Vehicle use
BLUEBOOK integer Value of vehicle
CAR_TYPE character string Type of car
OLDCLAIM integer Payouts, last 5 years
CLM_FREQ integer # of claims, last 5 years
REVOKED character string License revoked past 7 years yes/no
MVR_PTS integer Motor vehicle record points (demerits)
CLM_AMT integer Claim amount
REPEAT5 boolean (binary) IF THERE WAS CLAIMS BEFORE=1, OTHER=0

Visualialisations

ggplot(insurance_data, aes(x=CLM_AMT)) + 
  geom_histogram(color="darkblue", fill="lightblue", bins = 100) +
  labs(title = "Claim amount", x = "Value, $", y = "Count") + 
  theme_light()

boxplot(insurance_data$CLM_AMT, main = "Claim amount", ylab = "Value, $")


Komentaras:

ggplot(insurance_data, aes(x=YOJ)) + 
  geom_histogram(color="darkblue", fill="lightblue", bins = 24) +
  labs(title = "Drivers Years on current job distribution", x = "Age, years", y = "Count") + 
  theme_light() 

ggplot(insurance_data, aes(x=AGE)) + 
  geom_histogram(color="darkblue", fill="lightblue", bins = 67) +
  labs(title = "Direvers age distribution", x = "Age, years", y = "Count") + 
  theme_light() 

ggplot(insurance_data, aes(x=AGE)) + 
  geom_histogram(color="darkblue", fill="lightblue", bins = 67) +
  labs(title = "Direvers age distribution", x = "Value", y = "Count") + 
  theme_light() + 
  geom_density(aes(y = stat(count)))


Komentaras:

par(mfrow = c(1,3))
boxplot(insurance_data$INCOME, main = "Distribution of\n INCOME", ylab = "Value, $")
boxplot(insurance_data$HOME_VAL, main = "Distribution of\n HOME VALUE", ylab = "Value, $")
boxplot(insurance_data$TRAVTIME, main = "Distribution of\n TRAVEL DISTANCE TO WORK", ylab = "Distance, miles")


Komentaras:

ggplot(insurance_data, aes(x=OCCUPATION, y=REPEAT5, fill=OCCUPATION)) +
  geom_bar(stat="identity") + 
  labs(title = "Number of drivers by different groups of Ocupations",
       x = "Ocupation groups", y = "Count") + 
  theme_light()


Komentaras:


### Correlation matrix Visualzation

# head(insurance_data )
insurance_data_cor <- insurance_data[,-c(2,7,9,10,11,12,14,17,18,21,26,27)]
# head(insurance_data_cor )
cor(insurance_data_cor)
##                KIDSDRIV           AGE      HOMEKIDS          YOJ       INCOME
## KIDSDRIV    1.000000000 -0.0708930532  0.4608542587  0.049193558 -0.041471541
## AGE        -0.070893053  1.0000000000 -0.4466382792  0.144778590  0.191496253
## HOMEKIDS    0.460854259 -0.4466382792  1.0000000000  0.084662218 -0.161007972
## YOJ         0.049193558  0.1447785900  0.0846622183  1.000000000  0.287987005
## INCOME     -0.041471541  0.1914962532 -0.1610079719  0.287987005  1.000000000
## HOME_VAL   -0.011532306  0.2158295117 -0.1067155429  0.272838927  0.581958563
## TRAVTIME    0.009195038  0.0022628219 -0.0051775750 -0.018413761 -0.046438182
## BLUEBOOK   -0.019774482  0.1719095615 -0.1040193229  0.146840896  0.434257195
## TIF        -0.006083834 -0.0003408653  0.0008658922  0.029198422  0.003059332
## OLDCLAIM    0.021420132 -0.0360179588  0.0311416785 -0.001692864 -0.033134710
## CLM_FREQ    0.039470256 -0.0383173915  0.0382038044 -0.021852281 -0.043903368
## MVR_PTS     0.052658731 -0.0792813001  0.0675903682 -0.040207142 -0.070170590
## CLM_AMT     0.058282849 -0.0581052718  0.0682082798 -0.019393815 -0.054994229
## CAR_AGE    -0.056155454  0.1847550727 -0.1635226839  0.060535725  0.414392832
## CLAIM_FLAG  0.100733843 -0.1159015521  0.1238192883 -0.062850633 -0.142441531
##                 HOME_VAL     TRAVTIME     BLUEBOOK           TIF     OLDCLAIM
## KIDSDRIV   -0.0115323056  0.009195038 -0.019774482 -0.0060838345  0.021420132
## AGE         0.2158295117  0.002262822  0.171909562 -0.0003408653 -0.036017959
## HOMEKIDS   -0.1067155429 -0.005177575 -0.104019323  0.0008658922  0.031141679
## YOJ         0.2728389267 -0.018413761  0.146840896  0.0291984215 -0.001692864
## INCOME      0.5819585635 -0.046438182  0.434257195  0.0030593319 -0.033134710
## HOME_VAL    1.0000000000 -0.028965962  0.266800463  0.0009306881 -0.053118684
## TRAVTIME   -0.0289659625  1.000000000 -0.021091451 -0.0156521917 -0.017957660
## BLUEBOOK    0.2668004629 -0.021091451  1.000000000  0.0118722571 -0.030319263
## TIF         0.0009306881 -0.015652192  0.011872257  1.0000000000 -0.012131071
## OLDCLAIM   -0.0531186844 -0.017957660 -0.030319263 -0.0121310712  1.000000000
## CLM_FREQ   -0.0978039623  0.007829285 -0.046691353 -0.0200443672  0.492361674
## MVR_PTS    -0.1017816578  0.004809825 -0.058763715 -0.0310002124  0.268400681
## CLM_AMT    -0.0980956834  0.035851212  0.001279025 -0.0426346372  0.077953310
## CAR_AGE     0.2230697831 -0.027963492  0.187622882  0.0133816603 -0.016740911
## CLAIM_FLAG -0.1867082963  0.056980177 -0.102987790 -0.0775456641  0.143025053
##                CLM_FREQ      MVR_PTS      CLM_AMT     CAR_AGE  CLAIM_FLAG
## KIDSDRIV    0.039470256  0.052658731  0.058282849 -0.05615545  0.10073384
## AGE        -0.038317391 -0.079281300 -0.058105272  0.18475507 -0.11590155
## HOMEKIDS    0.038203804  0.067590368  0.068208280 -0.16352268  0.12381929
## YOJ        -0.021852281 -0.040207142 -0.019393815  0.06053572 -0.06285063
## INCOME     -0.043903368 -0.070170590 -0.054994229  0.41439283 -0.14244153
## HOME_VAL   -0.097803962 -0.101781658 -0.098095683  0.22306978 -0.18670830
## TRAVTIME    0.007829285  0.004809825  0.035851212 -0.02796349  0.05698018
## BLUEBOOK   -0.046691353 -0.058763715  0.001279025  0.18762288 -0.10298779
## TIF        -0.020044367 -0.031000212 -0.042634637  0.01338166 -0.07754566
## OLDCLAIM    0.492361674  0.268400681  0.077953310 -0.01674091  0.14302505
## CLM_FREQ    1.000000000  0.404764717  0.119884936 -0.01532654  0.22663134
## MVR_PTS     0.404764717  1.000000000  0.130610301 -0.03548709  0.22665096
## CLM_AMT     0.119884936  0.130610301  1.000000000 -0.06841157  0.54535984
## CAR_AGE    -0.015326542 -0.035487092 -0.068411573  1.00000000 -0.11556306
## CLAIM_FLAG  0.226631344  0.226650959  0.545359836 -0.11556306  1.00000000
corrplot(cor(insurance_data_cor),
         method = 'number',
         type = "lower")

corrplot(cor(insurance_data_cor), type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)


### Correlation matrix Heat map

col<- colorRampPalette(c("blue", "white", "red"))(20)
heatmap(x = cor(insurance_data_cor), col = col, symm = TRUE)


Komentaras: