Data Exploration
# Preview the first rows of the data set.
head(Motor)
## # A tibble: 6 × 7
## Driver_Age Driver_Experience Previous_Accidents `Annual_Mileage (x1000 km)`
## <dbl> <dbl> <dbl> <dbl>
## 1 56 32 4 17
## 2 46 19 0 21
## 3 32 11 4 15
## 4 60 0 4 19
## 5 25 7 0 13
## 6 38 13 2 11
## # ℹ 3 more variables: Car_Manufacturing_Year <dbl>, Car_Age <dbl>,
## # Insurance_Premium <dbl>
# List the column names; note the mileage column has a non-syntactic name.
names(Motor)
## [1] "Driver_Age" "Driver_Experience"
## [3] "Previous_Accidents" "Annual_Mileage (x1000 km)"
## [5] "Car_Manufacturing_Year" "Car_Age"
## [7] "Insurance_Premium"
# Per-column descriptive statistics (n, mean, sd, median, skew, kurtosis, se, ...).
describe(Motor)
## vars n mean sd median trimmed mad min
## Driver_Age 1 1000 41.58 13.77 42.00 41.61 17.05 18.00
## Driver_Experience 2 1000 14.76 10.54 13.00 14.07 11.86 0.00
## Previous_Accidents 3 1000 2.57 1.70 3.00 2.58 1.48 0.00
## Annual_Mileage (x1000 km) 4 1000 17.93 4.41 18.00 17.92 5.93 11.00
## Car_Manufacturing_Year 5 1000 2007.64 10.36 2008.00 2007.67 13.34 1990.00
## Car_Age 6 1000 17.36 10.36 17.00 17.33 13.34 0.00
## Insurance_Premium 7 1000 493.74 5.91 493.95 493.83 6.52 477.05
## max range skew kurtosis se
## Driver_Age 65.00 47.0 -0.05 -1.14 0.44
## Driver_Experience 40.00 40.0 0.45 -0.84 0.33
## Previous_Accidents 5.00 5.0 -0.06 -1.27 0.05
## Annual_Mileage (x1000 km) 25.00 14.0 0.02 -1.27 0.14
## Car_Manufacturing_Year 2025.00 35.0 -0.04 -1.20 0.33
## Car_Age 35.00 35.0 0.04 -1.20 0.33
## Insurance_Premium 508.15 31.1 -0.12 -0.51 0.19
# Min, quartiles, median, mean and max for every column.
summary(Motor)
## Driver_Age Driver_Experience Previous_Accidents Annual_Mileage (x1000 km)
## Min. :18.00 Min. : 0.00 Min. :0.000 Min. :11.00
## 1st Qu.:30.00 1st Qu.: 6.00 1st Qu.:1.000 1st Qu.:14.00
## Median :42.00 Median :13.00 Median :3.000 Median :18.00
## Mean :41.58 Mean :14.76 Mean :2.568 Mean :17.93
## 3rd Qu.:53.00 3rd Qu.:23.00 3rd Qu.:4.000 3rd Qu.:22.00
## Max. :65.00 Max. :40.00 Max. :5.000 Max. :25.00
## Car_Manufacturing_Year Car_Age Insurance_Premium
## Min. :1990 Min. : 0.00 Min. :477.1
## 1st Qu.:1999 1st Qu.: 8.00 1st Qu.:489.5
## Median :2008 Median :17.00 Median :493.9
## Mean :2008 Mean :17.36 Mean :493.7
## 3rd Qu.:2017 3rd Qu.:26.00 3rd Qu.:498.3
## Max. :2025 Max. :35.00 Max. :508.1
# Transposed overview: one line per column with its type and leading values.
glimpse(Motor)
## Rows: 1,000
## Columns: 7
## $ Driver_Age <dbl> 56, 46, 32, 60, 25, 38, 56, 36, 40, 28, 28…
## $ Driver_Experience <dbl> 32, 19, 11, 0, 7, 13, 37, 18, 10, 10, 10, …
## $ Previous_Accidents <dbl> 4, 0, 4, 4, 0, 2, 3, 1, 4, 5, 5, 3, 5, 2, …
## $ `Annual_Mileage (x1000 km)` <dbl> 17, 21, 15, 19, 13, 11, 14, 19, 14, 12, 20…
## $ Car_Manufacturing_Year <dbl> 2002, 2025, 2020, 1991, 2005, 1998, 1998, …
## $ Car_Age <dbl> 23, 0, 5, 34, 20, 27, 27, 25, 14, 3, 0, 13…
## $ Insurance_Premium <dbl> 488.35, 486.15, 497.55, 498.35, 495.55, 49…
# Internal structure of the object, including the column spec recorded on import.
str(Motor)
## spc_tbl_ [1,000 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Driver_Age : num [1:1000] 56 46 32 60 25 38 56 36 40 28 ...
## $ Driver_Experience : num [1:1000] 32 19 11 0 7 13 37 18 10 10 ...
## $ Previous_Accidents : num [1:1000] 4 0 4 4 0 2 3 1 4 5 ...
## $ Annual_Mileage (x1000 km): num [1:1000] 17 21 15 19 13 11 14 19 14 12 ...
## $ Car_Manufacturing_Year : num [1:1000] 2002 2025 2020 1991 2005 ...
## $ Car_Age : num [1:1000] 23 0 5 34 20 27 27 25 14 3 ...
## $ Insurance_Premium : num [1:1000] 488 486 498 498 496 ...
## - attr(*, "spec")=
## .. cols(
## .. Driver_Age = col_number(),
## .. Driver_Experience = col_number(),
## .. Previous_Accidents = col_number(),
## .. `Annual_Mileage (x1000 km)` = col_number(),
## .. Car_Manufacturing_Year = col_number(),
## .. Car_Age = col_number(),
## .. Insurance_Premium = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Number of rows and columns.
dim(Motor)
## [1] 1000 7
# Convert annual mileage from thousands of km to km, then drop the original
# column. The new Annual_Mileage column is appended as the last column.
Motor <- Motor %>%
  mutate(Annual_Mileage = `Annual_Mileage (x1000 km)` * 1000) %>%
  select(-`Annual_Mileage (x1000 km)`)
Data Visualization
# Base-graphics histogram of driver age (default breaks and labels).
hist(Motor$Driver_Age)

# Distribution of the response variable. `colour` is passed as a fixed
# parameter (outside aes()) so the bar outlines are actually drawn in purple;
# inside aes() the string is treated as a data value, which yields a
# default-palette colour and an unwanted legend.
Motor %>%
  ggplot(aes(x = Insurance_Premium)) +
  geom_histogram(colour = "purple")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Table count for Previous Accidents
# Frequency of each distinct Previous_Accidents value.
table(Motor$Previous_Accidents)
##
## 0 1 2 3 4 5
## 154 171 144 185 176 170
# Scatterplot matrix of all seven variables, using "." as the plotting
# character to keep the 1,000 points legible.
Motor %>%
  select(
    Driver_Age, Driver_Experience, Previous_Accidents,
    Car_Manufacturing_Year, Car_Age, Annual_Mileage, Insurance_Premium
  ) %>%
  pairs.panels(pch = ".")

Car_Age and Car_Manufacturing_Year are perfectly collinear (each is a linear
function of the other), so we remove one of the two variables.
# Drop Car_Age and keep Car_Manufacturing_Year as the single vehicle-age predictor.
Motor <- select(Motor, -Car_Age)
# Confirm the remaining columns after dropping Car_Age.
names(Motor)
## [1] "Driver_Age" "Driver_Experience" "Previous_Accidents"
## [4] "Car_Manufacturing_Year" "Insurance_Premium" "Annual_Mileage"
Training a model on the data
# Ordinary least-squares fit of Insurance_Premium on every other column
# currently in Motor (the `.` on the right-hand side).
motor_model <- lm(formula = Insurance_Premium ~ ., data = Motor)
### Inspect the estimated beta coefficients.
# A large scipen penalty makes R print fixed notation instead of scientific
# notation (e.g. 0.00005 rather than 5e-05). This changes a session-wide option.
options(scipen = 999)
motor_model
##
## Call:
## lm(formula = Insurance_Premium ~ ., data = Motor)
##
## Coefficients:
## (Intercept) Driver_Age Driver_Experience
## 702.50000 -0.20000 -0.30000
## Previous_Accidents Car_Manufacturing_Year Annual_Mileage
## 1.50000 -0.10000 0.00005
Making a prediction with the original data
# In-sample predictions, stored next to the observed data as column `pred`.
Motor[["pred"]] <- predict(motor_model, newdata = Motor)
Correlation between the predicted values and the observed premiums
# Pearson correlation between fitted and observed premiums.
with(Motor, cor(pred, Insurance_Premium))
## [1] 1
Plotting the graph
# Predicted (x-axis) versus observed (y-axis) premium.
plot(Motor$pred, Motor$Insurance_Premium)

### Making a prediction on a new policy
# One-row data frame whose column names match the model's predictors.
# NOTE(review): Annual_Mileage was rescaled to km earlier in this analysis
# (training values run from 11000 to 25000), so 1200 lies far below the
# observed range. Confirm that 12000 was not intended.
new_client <- data.frame(
  Driver_Age = 30,
  Driver_Experience = 6,
  Previous_Accidents = 0,
  Car_Manufacturing_Year = 2015,
  Annual_Mileage = 1200
)
### Predicted premium for the new policy
predict(motor_model, newdata = new_client)
## 1
## 493.26
The fitted regression model can be applied to a customer database to
produce real-time premium predictions.