Data Exploration

# Preview the first six rows of the data set.
head(Motor, n = 6L)
## # A tibble: 6 × 7
##   Driver_Age Driver_Experience Previous_Accidents `Annual_Mileage (x1000 km)`
##        <dbl>             <dbl>              <dbl>                       <dbl>
## 1         56                32                  4                          17
## 2         46                19                  0                          21
## 3         32                11                  4                          15
## 4         60                 0                  4                          19
## 5         25                 7                  0                          13
## 6         38                13                  2                          11
## # ℹ 3 more variables: Car_Manufacturing_Year <dbl>, Car_Age <dbl>,
## #   Insurance_Premium <dbl>
# List the column names.
colnames(Motor)
## [1] "Driver_Age"                "Driver_Experience"        
## [3] "Previous_Accidents"        "Annual_Mileage (x1000 km)"
## [5] "Car_Manufacturing_Year"    "Car_Age"                  
## [7] "Insurance_Premium"
# Per-column descriptive statistics (n, mean, sd, median, skew, kurtosis, ...).
psych::describe(Motor)
##                           vars    n    mean    sd  median trimmed   mad     min
## Driver_Age                   1 1000   41.58 13.77   42.00   41.61 17.05   18.00
## Driver_Experience            2 1000   14.76 10.54   13.00   14.07 11.86    0.00
## Previous_Accidents           3 1000    2.57  1.70    3.00    2.58  1.48    0.00
## Annual_Mileage (x1000 km)    4 1000   17.93  4.41   18.00   17.92  5.93   11.00
## Car_Manufacturing_Year       5 1000 2007.64 10.36 2008.00 2007.67 13.34 1990.00
## Car_Age                      6 1000   17.36 10.36   17.00   17.33 13.34    0.00
## Insurance_Premium            7 1000  493.74  5.91  493.95  493.83  6.52  477.05
##                               max range  skew kurtosis   se
## Driver_Age                  65.00  47.0 -0.05    -1.14 0.44
## Driver_Experience           40.00  40.0  0.45    -0.84 0.33
## Previous_Accidents           5.00   5.0 -0.06    -1.27 0.05
## Annual_Mileage (x1000 km)   25.00  14.0  0.02    -1.27 0.14
## Car_Manufacturing_Year    2025.00  35.0 -0.04    -1.20 0.33
## Car_Age                     35.00  35.0  0.04    -1.20 0.33
## Insurance_Premium          508.15  31.1 -0.12    -0.51 0.19
# Min / quartiles / mean / max for every column.
Motor %>% summary()
##    Driver_Age    Driver_Experience Previous_Accidents Annual_Mileage (x1000 km)
##  Min.   :18.00   Min.   : 0.00     Min.   :0.000      Min.   :11.00            
##  1st Qu.:30.00   1st Qu.: 6.00     1st Qu.:1.000      1st Qu.:14.00            
##  Median :42.00   Median :13.00     Median :3.000      Median :18.00            
##  Mean   :41.58   Mean   :14.76     Mean   :2.568      Mean   :17.93            
##  3rd Qu.:53.00   3rd Qu.:23.00     3rd Qu.:4.000      3rd Qu.:22.00            
##  Max.   :65.00   Max.   :40.00     Max.   :5.000      Max.   :25.00            
##  Car_Manufacturing_Year    Car_Age      Insurance_Premium
##  Min.   :1990           Min.   : 0.00   Min.   :477.1    
##  1st Qu.:1999           1st Qu.: 8.00   1st Qu.:489.5    
##  Median :2008           Median :17.00   Median :493.9    
##  Mean   :2008           Mean   :17.36   Mean   :493.7    
##  3rd Qu.:2017           3rd Qu.:26.00   3rd Qu.:498.3    
##  Max.   :2025           Max.   :35.00   Max.   :508.1
# Compact column-wise view: one line per column with its type and first values.
Motor %>% glimpse()
## Rows: 1,000
## Columns: 7
## $ Driver_Age                  <dbl> 56, 46, 32, 60, 25, 38, 56, 36, 40, 28, 28…
## $ Driver_Experience           <dbl> 32, 19, 11, 0, 7, 13, 37, 18, 10, 10, 10, …
## $ Previous_Accidents          <dbl> 4, 0, 4, 4, 0, 2, 3, 1, 4, 5, 5, 3, 5, 2, …
## $ `Annual_Mileage (x1000 km)` <dbl> 17, 21, 15, 19, 13, 11, 14, 19, 14, 12, 20…
## $ Car_Manufacturing_Year      <dbl> 2002, 2025, 2020, 1991, 2005, 1998, 1998, …
## $ Car_Age                     <dbl> 23, 0, 5, 34, 20, 27, 27, 25, 14, 3, 0, 13…
## $ Insurance_Premium           <dbl> 488.35, 486.15, 497.55, 498.35, 495.55, 49…
# Internal structure of the object, including the column spec attribute.
Motor %>% str()
## spc_tbl_ [1,000 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Driver_Age               : num [1:1000] 56 46 32 60 25 38 56 36 40 28 ...
##  $ Driver_Experience        : num [1:1000] 32 19 11 0 7 13 37 18 10 10 ...
##  $ Previous_Accidents       : num [1:1000] 4 0 4 4 0 2 3 1 4 5 ...
##  $ Annual_Mileage (x1000 km): num [1:1000] 17 21 15 19 13 11 14 19 14 12 ...
##  $ Car_Manufacturing_Year   : num [1:1000] 2002 2025 2020 1991 2005 ...
##  $ Car_Age                  : num [1:1000] 23 0 5 34 20 27 27 25 14 3 ...
##  $ Insurance_Premium        : num [1:1000] 488 486 498 498 496 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Driver_Age = col_number(),
##   ..   Driver_Experience = col_number(),
##   ..   Previous_Accidents = col_number(),
##   ..   `Annual_Mileage (x1000 km)` = col_number(),
##   ..   Car_Manufacturing_Year = col_number(),
##   ..   Car_Age = col_number(),
##   ..   Insurance_Premium = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Number of rows and columns.
c(nrow(Motor), ncol(Motor))
## [1] 1000    7
# Express annual mileage in km instead of thousands of km under a
# syntactic column name, then drop the original back-ticked column.
# The new column is appended at the end of the table.
Motor <- Motor %>%
  mutate(Annual_Mileage = `Annual_Mileage (x1000 km)` * 1000) %>%
  select(-`Annual_Mileage (x1000 km)`)

Data Visualization

# Distribution of driver ages.
# An explicit title and x-axis label are supplied because the defaults are
# built from the argument expression and would read `Motor$Driver_Age`.
hist(
  Motor$Driver_Age,
  main = "Distribution of Driver Age",
  xlab = "Driver age"
)

# Histogram of insurance premiums with purple bar outlines.
# `colour` is set as a fixed parameter of the geom rather than inside aes():
# inside aes() the string "purple" is treated as a data value, so ggplot2 maps
# it through the default colour scale and adds a legend entry labelled
# "purple" instead of actually drawing purple outlines.
# The bin count is left at the stat_bin() default of 30; pass `bins` or
# `binwidth` to geom_histogram() to tune it.
Motor %>%
  ggplot(aes(x = Insurance_Premium)) +
  geom_histogram(colour = "purple")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Frequency table of Previous_Accidents

# Count how many policies fall into each number of previous accidents.
table(Motor[["Previous_Accidents"]])
## 
##   0   1   2   3   4   5 
## 154 171 144 185 176 170
# Pairwise panel plot of every variable; pch = "." keeps the points small.
Motor %>%
  select(
    Driver_Age, Driver_Experience, Previous_Accidents,
    Car_Manufacturing_Year, Car_Age, Annual_Mileage, Insurance_Premium
  ) %>%
  pairs.panels(pch = ".")

Car_Age and Car_Manufacturing_Year are collinear (in the rows shown, Car_Age = 2025 − Car_Manufacturing_Year), so we remove one of the two variables, Car_Age, and keep Car_Manufacturing_Year.

# Drop Car_Age; Car_Manufacturing_Year stays in the data.
Motor <- select(Motor, -Car_Age)


# Confirm the remaining columns.
colnames(Motor)
## [1] "Driver_Age"             "Driver_Experience"      "Previous_Accidents"    
## [4] "Car_Manufacturing_Year" "Insurance_Premium"      "Annual_Mileage"

Training a model on the data

# Ordinary least-squares fit of the premium on every other column of Motor
# (`.` expands to all columns except the response).
motor_model <- lm(formula = Insurance_Premium ~ ., data = Motor)


### Inspect the estimated beta coefficients.
### scipen = 999 makes R print numbers in fixed rather than scientific
### notation; it is a session-wide option, so it also affects later printouts.
options(scipen = 999)
motor_model
## 
## Call:
## lm(formula = Insurance_Premium ~ ., data = Motor)
## 
## Coefficients:
##            (Intercept)              Driver_Age       Driver_Experience  
##              702.50000                -0.20000                -0.30000  
##     Previous_Accidents  Car_Manufacturing_Year          Annual_Mileage  
##                1.50000                -0.10000                 0.00005

Evaluating the model performance

# Coefficient table, residual summary, R-squared and F-statistic of the fit.
motor_model %>% summary()
## 
## Call:
## lm(formula = Insurance_Premium ~ ., data = Motor)
## 
## Residuals:
##                Min                 1Q             Median                 3Q 
## -0.000000000000297 -0.000000000000108 -0.000000000000042  0.000000000000022 
##                Max 
##  0.000000000047219 
## 
## Coefficients:
##                                        Estimate               Std. Error
## (Intercept)            702.50000000000568434189   0.00000000000922384722
## Driver_Age              -0.19999999999999962252   0.00000000000000436525
## Driver_Experience       -0.29999999999999227285   0.00000000000000569248
## Previous_Accidents       1.50000000000002375877   0.00000000000002798527
## Car_Manufacturing_Year  -0.10000000000000201783   0.00000000000000459239
## Annual_Mileage           0.00004999999999999790   0.00000000000000001081
##                                t value            Pr(>|t|)    
## (Intercept)             76161278860482 <0.0000000000000002 ***
## Driver_Age             -45816346258905 <0.0000000000000002 ***
## Driver_Experience      -52701089127014 <0.0000000000000002 ***
## Previous_Accidents      53599616237820 <0.0000000000000002 ***
## Car_Manufacturing_Year -21775175001694 <0.0000000000000002 ***
## Annual_Mileage           4626527664258 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000000000001501 on 994 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 3.096e+27 on 5 and 994 DF,  p-value: < 0.00000000000000022

Making predictions on the original (training) data

# Store the model's prediction for every row of the data it was fitted on.
Motor$pred <- predict(object = motor_model, newdata = Motor)

Correlation between the predicted and the observed premiums

# Pearson correlation between predicted and observed premiums.
with(Motor, cor(pred, Insurance_Premium))
## [1] 1

Plotting predicted against observed premiums

# Scatter plot of predicted against observed premiums.
# Axis labels are given explicitly because the defaults are built from the
# argument expressions and would read `Motor$pred` / `Motor$Insurance_Premium`.
plot(
  Motor$pred,
  Motor$Insurance_Premium,
  xlab = "Predicted premium",
  ylab = "Observed premium"
)

### Build a one-row data frame describing a new policy to score.
### Column names must match the predictors used when fitting the model.
### NOTE(review): Annual_Mileage is in km after the x1000 conversion, and the
### training data spans 11,000-25,000 km; 1200 is far below that range.
### Confirm whether 12000 was intended.

new_client <- data.frame(
  Driver_Age = 30,
  Driver_Experience = 6,
  Previous_Accidents = 0,
  Car_Manufacturing_Year = 2015,
  Annual_Mileage = 1200
)

### Predicted premium for the new policy.
predict(motor_model, newdata = new_client)
##      1 
## 493.26

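The fitted regression model can be applied to a customer database to produce real-time premium predictions. Note, however, that the fit is exact (R-squared of 1, residuals on the order of 1e-13), which suggests the premiums in this data set were generated by a deterministic linear formula; real-world data would not be expected to fit this well.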