library(mlbench)
library(readr)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library("knitr")
library(readr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the Swedish motor insurance data set into `ins1` and print a per-column
# summary.
# NOTE(review): the path below is absolute and machine-specific; the script
# will only run where this exact file exists. Consider a project-relative path.
ins1 <- read.csv(
  "C:\\Users\\huiwu\\OneDrive\\Desktop\\CAU\\Spring2025\\R_DataScience\\SwedishMotorInsurance.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable and can be reassigned
)
summary(ins1)
## Kilometres Zone Bonus Make
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :4.00 Median :4.000 Median :5.000
## Mean :2.986 Mean :3.97 Mean :4.015 Mean :4.992
## 3rd Qu.:4.000 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :5.000 Max. :7.00 Max. :7.000 Max. :9.000
## Insured Claims Payment
## Min. : 0.01 Min. : 0.00 Min. : 0
## 1st Qu.: 21.61 1st Qu.: 1.00 1st Qu.: 2989
## Median : 81.53 Median : 5.00 Median : 27404
## Mean : 1092.19 Mean : 51.87 Mean : 257008
## 3rd Qu.: 389.78 3rd Qu.: 21.00 3rd Qu.: 111954
## Max. :127687.27 Max. :3338.00 Max. :18245026
# Compact structure of ins1: column names, storage types and first values.
str(ins1)
## 'data.frame': 2182 obs. of 7 variables:
## $ Kilometres: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Zone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Bonus : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Make : int 1 2 3 4 5 6 7 8 9 1 ...
## $ Insured : num 455.1 69.2 72.9 1292.4 191 ...
## $ Claims : int 108 19 13 124 40 57 23 14 1704 45 ...
## $ Payment : int 392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...
# First six rows of ins1 (output recorded below).
head(ins1)
## Kilometres Zone Bonus Make Insured Claims Payment
## 1 1 1 1 1 455.13 108 392491
## 2 1 1 1 2 69.17 19 46221
## 3 1 1 1 3 72.88 13 15694
## 4 1 1 1 4 1292.39 124 422201
## 5 1 1 1 5 191.01 40 119373
## 6 1 1 1 6 477.66 57 170913
# DataExplorer supplies plot_intro(), used on the next line.
library(DataExplorer)
# Overview plot of ins1 (presumably column types / missingness — see the
# DataExplorer documentation for plot_intro).
plot_intro(ins1)
# Dimensions of ins1: rows, then columns.
dim(ins1)
## [1] 2182 7
# Column-wise preview of ins1 with each column's type.
glimpse(ins1)
## Rows: 2,182
## Columns: 7
## $ Kilometres <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Zone <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Bonus <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,…
## $ Make <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2,…
## $ Insured <dbl> 455.13, 69.17, 72.88, 1292.39, 191.01, 477.66, 105.58, 32.5…
## $ Claims <int> 108, 19, 13, 124, 40, 57, 23, 14, 1704, 45, 10, 5, 48, 11, …
## $ Payment <int> 392491, 46221, 15694, 422201, 119373, 170913, 56940, 77487,…
# Per-column summary statistics; this repeats the summary(ins1) call made
# right after the data was loaded.
summary(ins1)
## Kilometres Zone Bonus Make
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :4.00 Median :4.000 Median :5.000
## Mean :2.986 Mean :3.97 Mean :4.015 Mean :4.992
## 3rd Qu.:4.000 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :5.000 Max. :7.00 Max. :7.000 Max. :9.000
## Insured Claims Payment
## Min. : 0.01 Min. : 0.00 Min. : 0
## 1st Qu.: 21.61 1st Qu.: 1.00 1st Qu.: 2989
## Median : 81.53 Median : 5.00 Median : 27404
## Mean : 1092.19 Mean : 51.87 Mean : 257008
## 3rd Qu.: 389.78 3rd Qu.: 21.00 3rd Qu.: 111954
## Max. :127687.27 Max. :3338.00 Max. :18245026
# The total value of payments made by the insurance company is an important
# figure to monitor, so the committee wants to know whether Payment is related
# to the number of claims (Claims) and to the Insured column.
#
# Fit Payment on Claims and Insured. The formula uses bare column names and the
# data frame is supplied through `data =` (instead of `ins1$...` terms), so the
# coefficients are labelled with plain column names and
# predict(lm1, newdata = ...) can be used on new data.
# The recorded output below had its labels edited to match; it was not re-run.
lm1 <- lm(Payment ~ Claims + Insured, data = ins1)
lm1
##
## Call:
## lm(formula = Payment ~ Claims + Insured, data = ins1)
##
## Coefficients:
## (Intercept) Claims Insured
## 3250.74 4294.77 28.39
# Full regression table: estimates, standard errors, t-tests and fit statistics.
summary(lm1)
##
## Call:
## lm(formula = Payment ~ Claims + Insured, data = ins1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -799392 -12743 -3733 10591 861235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3250.7447 1582.7077 2.054 0.0401 *
## Claims 4294.7750 18.2819 234.920 <2e-16 ***
## Insured 28.3881 0.6514 43.580 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71270 on 2179 degrees of freedom
## Multiple R-squared: 0.9951, Adjusted R-squared: 0.9951
## F-statistic: 2.211e+05 on 2 and 2179 DF, p-value: < 2.2e-16
# Pearson correlation of Payment with Claims, and of Payment with Insured,
# followed by a scatter plot of each pair (Payment on the vertical axis).
cor(x = ins1$Claims, y = ins1$Payment)
## [1] 0.9954003
cor(x = ins1$Insured, y = ins1$Payment)
## [1] 0.933217
plot(x = ins1$Claims, y = ins1$Payment)
plot(x = ins1$Insured, y = ins1$Payment)
# Linear regression of Payment on every other column of ins1: `.` expands to
# all columns of `data` except the response. The response is written as a bare
# column name (not `ins1$Payment`) so the whole formula is resolved inside
# `data` and the model can be used with predict(lm2, newdata = ...).
# Only the `Call:` line of the recorded output below was edited to match; the
# output was not re-run.
lm2 <- lm(Payment ~ ., data = ins1)
summary(lm2)
##
## Call:
## lm(formula = Payment ~ ., data = ins1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -806775 -16943 -6321 11528 847015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.173e+04 6.338e+03 -3.429 0.000617 ***
## Kilometres 4.769e+03 1.086e+03 4.392 1.18e-05 ***
## Zone 2.323e+03 7.735e+02 3.003 0.002703 **
## Bonus 1.183e+03 7.737e+02 1.529 0.126462
## Make -7.543e+02 6.107e+02 -1.235 0.216917
## Insured 2.788e+01 6.652e-01 41.913 < 2e-16 ***
## Claims 4.316e+03 1.895e+01 227.793 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70830 on 2175 degrees of freedom
## Multiple R-squared: 0.9952, Adjusted R-squared: 0.9952
## F-statistic: 7.462e+04 on 6 and 2175 DF, p-value: < 2.2e-16
# The committee wants to understand what affects their claim rates so that they
# can set the right premiums for a given set of situations: do the insured
# amount, zone, kilometres, bonus, or make affect claims, and to what extent?
#
# Linear model of Claims on the five remaining predictors in ins1.
# NOTE(review): the question above is about claim *rates*, but the response
# here is the raw Claims count — confirm that this is intended.
# NOTE(review): Kilometres, Zone, Bonus and Make are stored as integers and so
# enter the model as numeric slopes; if they are category codes, factor() may
# be more appropriate — confirm.
reg <- lm(
  formula = Claims ~ Kilometres + Zone + Bonus + Make + Insured,
  data = ins1
)
summary(reg)
##
## Call:
## lm(formula = Claims ~ Kilometres + Zone + Bonus + Make + Insured,
## data = ins1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1214.57 -25.18 -9.41 10.04 1301.78
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.1230027 7.1270679 5.209 2.08e-07 ***
## Kilometres -3.9648601 1.2255209 -3.235 0.00123 **
## Zone -6.2924300 0.8647405 -7.277 4.75e-13 ***
## Bonus -4.2468101 0.8707236 -4.877 1.15e-06 ***
## Make 6.7725342 0.6755390 10.025 < 2e-16 ***
## Insured 0.0318697 0.0003158 100.933 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80.14 on 2176 degrees of freedom
## Multiple R-squared: 0.8425, Adjusted R-squared: 0.8421
## F-statistic: 2328 on 5 and 2176 DF, p-value: < 2.2e-16
## R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
```