library(mlbench)
library(readr)

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library("knitr")
library(readr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)


# Load the Swedish motor insurance data; the first row of the CSV holds the
# column names.
# NOTE(review): absolute, machine-specific path - point it at your local copy
# of the file before running this on another machine.
ins1 <- read.csv(
  "C:\\Users\\huiwu\\OneDrive\\Desktop\\CAU\\Spring2025\\R_DataScience\\SwedishMotorInsurance.csv",
  header = TRUE
)

# Per-column summary statistics (min, quartiles, mean, max) of the loaded data.
summary(ins1)
##    Kilometres         Zone          Bonus            Make      
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.00   1st Qu.:2.000   1st Qu.:3.000  
##  Median :3.000   Median :4.00   Median :4.000   Median :5.000  
##  Mean   :2.986   Mean   :3.97   Mean   :4.015   Mean   :4.992  
##  3rd Qu.:4.000   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :5.000   Max.   :7.00   Max.   :7.000   Max.   :9.000  
##     Insured              Claims           Payment        
##  Min.   :     0.01   Min.   :   0.00   Min.   :       0  
##  1st Qu.:    21.61   1st Qu.:   1.00   1st Qu.:    2989  
##  Median :    81.53   Median :   5.00   Median :   27404  
##  Mean   :  1092.19   Mean   :  51.87   Mean   :  257008  
##  3rd Qu.:   389.78   3rd Qu.:  21.00   3rd Qu.:  111954  
##  Max.   :127687.27   Max.   :3338.00   Max.   :18245026
# Compact structure: number of observations and the type of each column.
str(ins1)
## 'data.frame':    2182 obs. of  7 variables:
##  $ Kilometres: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Zone      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Bonus     : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Make      : int  1 2 3 4 5 6 7 8 9 1 ...
##  $ Insured   : num  455.1 69.2 72.9 1292.4 191 ...
##  $ Claims    : int  108 19 13 124 40 57 23 14 1704 45 ...
##  $ Payment   : int  392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...
# First six rows of the data.
head(ins1)
##   Kilometres Zone Bonus Make Insured Claims Payment
## 1          1    1     1    1  455.13    108  392491
## 2          1    1     1    2   69.17     19   46221
## 3          1    1     1    3   72.88     13   15694
## 4          1    1     1    4 1292.39    124  422201
## 5          1    1     1    5  191.01     40  119373
## 6          1    1     1    6  477.66     57  170913
# plot_intro() comes from DataExplorer.
# NOTE(review): presumably draws an overview chart of the data set - confirm
# in the package documentation; the plot is not captured in this text output.
library(DataExplorer)
plot_intro(ins1)

# Dimensions (rows, columns), then a column-wise preview of names, types and
# leading values.
dim(ins1)
## [1] 2182    7
glimpse(ins1)
## Rows: 2,182
## Columns: 7
## $ Kilometres <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Zone       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Bonus      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,…
## $ Make       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2,…
## $ Insured    <dbl> 455.13, 69.17, 72.88, 1292.39, 191.01, 477.66, 105.58, 32.5…
## $ Claims     <int> 108, 19, 13, 124, 40, 57, 23, 14, 1704, 45, 10, 5, 48, 11, …
## $ Payment    <int> 392491, 46221, 15694, 422201, 119373, 170913, 56940, 77487,…
# NOTE(review): same call as the summary(ins1) made right after loading; ins1
# is not modified in between, so this prints identical output and could be
# dropped.
summary(ins1)
##    Kilometres         Zone          Bonus            Make      
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.00   1st Qu.:2.000   1st Qu.:3.000  
##  Median :3.000   Median :4.00   Median :4.000   Median :5.000  
##  Mean   :2.986   Mean   :3.97   Mean   :4.015   Mean   :4.992  
##  3rd Qu.:4.000   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :5.000   Max.   :7.00   Max.   :7.000   Max.   :9.000  
##     Insured              Claims           Payment        
##  Min.   :     0.01   Min.   :   0.00   Min.   :       0  
##  1st Qu.:    21.61   1st Qu.:   1.00   1st Qu.:    2989  
##  Median :    81.53   Median :   5.00   Median :   27404  
##  Mean   :  1092.19   Mean   :  51.87   Mean   :  257008  
##  3rd Qu.:   389.78   3rd Qu.:  21.00   3rd Qu.:  111954  
##  Max.   :127687.27   Max.   :3338.00   Max.   :18245026
# The total value of payment by an insurance company is an important factor to be monitored.

# So the committee has decided to find whether this payment is related to the number of claims
# and to the insured amount (the two predictors used in the model below).

# Regress Payment on Claims and Insured. The variables are named in the
# formula and looked up through `data = ins1` (instead of being written as
# ins1$...), so the coefficients carry plain column names and predict() can
# be given `newdata`. The fitted values are the same as with the ins1$... form.
lm1 <- lm(Payment ~ Claims + Insured, data = ins1)

lm1
## 
## Call:
## lm(formula = Payment ~ Claims + Insured, data = ins1)
## 
## Coefficients:
## (Intercept)       Claims      Insured  
##     3250.74      4294.77        28.39
summary(lm1)
## 
## Call:
## lm(formula = Payment ~ Claims + Insured, data = ins1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -799392  -12743   -3733   10591  861235 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3250.7447  1582.7077   2.054   0.0401 *  
## Claims      4294.7750    18.2819 234.920   <2e-16 ***
## Insured       28.3881     0.6514  43.580   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 71270 on 2179 degrees of freedom
## Multiple R-squared:  0.9951, Adjusted R-squared:  0.9951 
## F-statistic: 2.211e+05 on 2 and 2179 DF,  p-value: < 2.2e-16
# Pearson correlation (the default method of cor()) of Payment with each of
# the two candidate predictors, Claims and Insured.

cor(ins1$Claims, ins1$Payment)
## [1] 0.9954003
cor(ins1$Insured, ins1$Payment)
## [1] 0.933217
# Scatter plots of the same two relationships. The formula interface labels
# the axes with the bare column names instead of "ins1$Claims" etc.
plot(Payment ~ Claims, data = ins1)

plot(Payment ~ Insured, data = ins1)

# Linear regression of Payment on every other column of ins1: with
# `data = ins1`, the `.` stands for all columns except the response. The
# response is written as a bare name so the whole formula is resolved in
# `data`.

lm2 <- lm(Payment ~ ., data = ins1)
summary(lm2)
## 
## Call:
## lm(formula = Payment ~ ., data = ins1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -806775  -16943   -6321   11528  847015 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.173e+04  6.338e+03  -3.429 0.000617 ***
## Kilometres   4.769e+03  1.086e+03   4.392 1.18e-05 ***
## Zone         2.323e+03  7.735e+02   3.003 0.002703 ** 
## Bonus        1.183e+03  7.737e+02   1.529 0.126462    
## Make        -7.543e+02  6.107e+02  -1.235 0.216917    
## Insured      2.788e+01  6.652e-01  41.913  < 2e-16 ***
## Claims       4.316e+03  1.895e+01 227.793  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70830 on 2175 degrees of freedom
## Multiple R-squared:  0.9952, Adjusted R-squared:  0.9952 
## F-statistic: 7.462e+04 on 6 and 2175 DF,  p-value: < 2.2e-16
#The committee wants to understand what affects their claim rates so as to decide the right
#premiums for a certain set of situations. Hence, they need to find whether the insured 
#amount, zone, kilometer, bonus, or make affects the claim rates and to what extent. 

# Linear regression of Claims on five predictors taken from ins1:
# Kilometres, Zone, Bonus, Make and Insured.
# NOTE(review): Kilometres, Zone, Bonus and Make are stored as integer codes
# and therefore enter the model as numeric terms - confirm whether they
# should be treated as factors instead.

reg <- lm(
  Claims ~ Kilometres + Zone + Bonus + Make + Insured,
  data = ins1
)
summary(reg)
## 
## Call:
## lm(formula = Claims ~ Kilometres + Zone + Bonus + Make + Insured, 
##     data = ins1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1214.57   -25.18    -9.41    10.04  1301.78 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.1230027  7.1270679   5.209 2.08e-07 ***
## Kilometres  -3.9648601  1.2255209  -3.235  0.00123 ** 
## Zone        -6.2924300  0.8647405  -7.277 4.75e-13 ***
## Bonus       -4.2468101  0.8707236  -4.877 1.15e-06 ***
## Make         6.7725342  0.6755390  10.025  < 2e-16 ***
## Insured      0.0318697  0.0003158 100.933  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80.14 on 2176 degrees of freedom
## Multiple R-squared:  0.8425, Adjusted R-squared:  0.8421 
## F-statistic:  2328 on 5 and 2176 DF,  p-value: < 2.2e-16







## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```