# Load the Swedish motor insurance data set into `ins`.
# NOTE(review): the path below is absolute and machine-specific; point it at
# the local copy of SwedishMotorInsurance.csv when running elsewhere.
ins <- read.csv(
  "C:/Users/Vaibhav Goyal/Desktop/simpl/DATA SCIENCE WITH R/Insurance/SwedishMotorInsurance.csv",
  header = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
)
# Question 1
# The committee is interested to know each field of the data collected through descriptive analysis
# to gain basic insights into the data set and to prepare for further analysis.
# Per-column descriptive statistics (min, quartiles, mean, max):
summary(ins)
## Kilometres Zone Bonus Make
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :4.00 Median :4.000 Median :5.000
## Mean :2.986 Mean :3.97 Mean :4.015 Mean :4.992
## 3rd Qu.:4.000 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :5.000 Max. :7.00 Max. :7.000 Max. :9.000
## Insured Claims Payment
## Min. : 0.01 Min. : 0.00 Min. : 0
## 1st Qu.: 21.61 1st Qu.: 1.00 1st Qu.: 2989
## Median : 81.53 Median : 5.00 Median : 27404
## Mean : 1092.20 Mean : 51.87 Mean : 257008
## 3rd Qu.: 389.78 3rd Qu.: 21.00 3rd Qu.: 111954
## Max. :127687.27 Max. :3338.00 Max. :18245026
#2ques
# The total value of payment by an insurance company is an important factor to be monitored.
#So the committee has decided to find whether this payment is related to number of claims
#and the number of insured policy years.
#They also want to visualize the results for better understanding.
# Approach 1: multiple linear regression of Payment on Claims and Insured.
# The columns are looked up through `data = ins` instead of writing `ins$...`
# inside the formula: the coefficients then carry plain column names and the
# fitted model can be used with predict(newdata = ...).
lm1 <- lm(Payment ~ Claims + Insured, data = ins)
lm1
##
## Call:
## lm(formula = Payment ~ Claims + Insured, data = ins)
##
## Coefficients:
## (Intercept) Claims Insured
## 3250.74 4294.77 28.39
summary(lm1)
##
## Call:
## lm(formula = Payment ~ Claims + Insured, data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -799392 -12743 -3733 10591 861235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3250.7447 1582.7077 2.054 0.0401 *
## Claims 4294.7750 18.2819 234.920 <2e-16 ***
## Insured 28.3881 0.6514 43.580 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71270 on 2179 degrees of freedom
## Multiple R-squared: 0.9951, Adjusted R-squared: 0.9951
## F-statistic: 2.211e+05 on 2 and 2179 DF, p-value: < 2.2e-16
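# Approach 2: pairwise correlations
# Linear regression alone does not separate the contribution of each variable;
# it only tells us whether Payment is strongly related to the predictors or not.
# The p-values of Claims and Insured are both very small (< 2e-16), which means
# both play a significant role in the total payment.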
# Correlation between the number of claims and the total payment
# (cor() is called with its default method).
with(ins, cor(Claims, Payment))
## [1] 0.9954003
# r = 0.995: Claims has a very strong positive linear correlation with Payment.
# Correlation between the insured policy years and the total payment.
with(ins, cor(Insured, Payment))
## [1] 0.933217
# r = 0.933: Insured is also strongly positively correlated with Payment.
# Visualise both relationships as scatter plots. The formula interface with
# `data = ins` labels the axes with the column names instead of the raw
# `ins$...` expressions, and each plot gets a descriptive title.
plot(Payment ~ Claims, data = ins,
     main = "Payment vs. number of claims")

plot(Payment ~ Insured, data = ins,
     main = "Payment vs. insured policy years")

#3ques
#The committee wants to figure out the reasons for insurance payment increase and decrease.
#So they have decided to find whether distance, location, bonus, make, and insured amount or
#claims are affecting the payment or all or some of these are affecting it.
#Independent variable: insured, claims, make, bonus, zone, and kilometers
# Dependent variable: payment
# Regress Payment on every other column of `ins` (`.` expands to all columns
# of `data` except the response). The response is taken from `data` as well,
# rather than mixing `ins$Payment` with `data = ins`.
# NOTE(review): Kilometres, Zone, Bonus and Make are integer codes and are
# treated as numeric predictors here; if they are really category labels they
# should probably enter the model as factor() - confirm with the data dictionary.
lm2 <- lm(Payment ~ ., data = ins)
summary(lm2)
##
## Call:
## lm(formula = Payment ~ ., data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -806775 -16943 -6321 11528 847015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.173e+04 6.338e+03 -3.429 0.000617 ***
## Kilometres 4.769e+03 1.086e+03 4.392 1.18e-05 ***
## Zone 2.323e+03 7.735e+02 3.003 0.002703 **
## Bonus 1.183e+03 7.737e+02 1.529 0.126462
## Make -7.543e+02 6.107e+02 -1.235 0.216917
## Insured 2.788e+01 6.652e-01 41.913 < 2e-16 ***
## Claims 4.316e+03 1.895e+01 227.793 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70830 on 2175 degrees of freedom
## Multiple R-squared: 0.9952, Adjusted R-squared: 0.9952
## F-statistic: 7.462e+04 on 6 and 2175 DF, p-value: < 2.2e-16
# All predictors except Bonus and Make are significantly related to Payment;
# Kilometres, Insured and Claims have the strongest effects (p < 0.001).
#4ques
#The insurance company is planning to establish a new branch office, so they are interested
#to find at what location, kilometer, and bonus level their insured amount, claims,
#and payment get increased.
# Mean of the measure columns for every level of one grouping column.
#
# data: data frame holding the grouping column and the measure columns.
# by:   name of the grouping column (a single string).
# cols: names of the numeric columns to average; selected by name so the
#       result does not depend on the column order of the CSV.
# Returns, when there is more than one group level, a numeric matrix with one
# row per level of `by` and one column per entry of `cols`.
group_means <- function(data, by, cols = c("Insured", "Claims", "Payment")) {
  stopifnot(
    is.data.frame(data),
    is.character(by), length(by) == 1L,
    all(c(by, cols) %in% names(data))
  )
  # Iterate over the selected columns directly instead of apply(), which would
  # first coerce the data frame to a matrix.
  sapply(data[cols], function(x) tapply(x, data[[by]], mean))
}

# Average insured years, claims and payment per zone.
grupzone <- group_means(ins, "Zone")
grupzone
## Insured Claims Payment
## 1 1036.17175 73.568254 338518.95
## 2 1231.48184 67.625397 319921.52
## 3 1362.95870 63.295238 307550.85
## 4 2689.38041 101.311111 537071.76
## 5 384.80188 19.047923 93001.84
## 6 802.68457 32.577778 175528.47
## 7 64.91071 2.108844 9948.19
# Zone 4 has the highest number of claims, and thus payment as well.
# Zones 1-4 have more insured years, claims, and payments.

# Average insured years, claims and payment per kilometre group.
grupkil <- group_means(ins, "Kilometres")
grupkil
## Insured Claims Payment
## 1 1837.8163 75.59453 361899.35
## 2 1824.0288 89.27664 442523.78
## 3 1081.9714 54.16100 272012.58
## 4 398.9632 20.79493 108213.41
## 5 284.9475 18.04215 93306.12
# Kilometre group 2 has the maximum payments. Although its insured years are
# lower than those of kilometre group 1, its claims and payments are higher.

# Average insured years, claims and payment per bonus level.
grupbon <- group_means(ins, "Bonus")
grupbon
## Insured Claims Payment
## 1 525.5502 62.50489 282921.99
## 2 451.0754 34.23397 163316.62
## 3 397.4737 24.97419 122656.17
## 4 360.3867 20.35161 98498.12
## 5 437.3936 22.82109 108790.50
## 6 805.8167 39.94286 197723.82
## 7 4620.3728 157.22222 819322.48
# Bonus level 7 has by far the highest average insured years, claims and payment.
#5ques
#The committee wants to understand what affects their claim rates so as to decide the right
#premiums for a certain set of situations. Hence, they need to find whether the insured
#amount, zone, kilometer, bonus, or make affects the claim rates and to what extent.
# Linear model of the number of claims on kilometres, zone, bonus, make and
# insured policy years; all columns are looked up in `ins`.
reg <- lm(
  Claims ~ Kilometres + Zone + Bonus + Make + Insured,
  data = ins
)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(reg)
##
## Call:
## lm(formula = Claims ~ Kilometres + Zone + Bonus + Make + Insured,
## data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1214.57 -25.18 -9.41 10.04 1301.78
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.1230027 7.1270679 5.209 2.08e-07 ***
## Kilometres -3.9648601 1.2255209 -3.235 0.00123 **
## Zone -6.2924300 0.8647405 -7.277 4.75e-13 ***
## Bonus -4.2468101 0.8707236 -4.877 1.15e-06 ***
## Make 6.7725342 0.6755390 10.025 < 2e-16 ***
## Insured 0.0318697 0.0003158 100.933 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80.14 on 2176 degrees of freedom
## Multiple R-squared: 0.8425, Adjusted R-squared: 0.8421
## F-statistic: 2328 on 5 and 2176 DF, p-value: < 2.2e-16
#Dependent variable: Claims. Independent variables: Kilometres, Zone, Bonus, Make, and Insured.
#The summary provides the intercept and the estimated coefficient of each variable.
#All independent variables (Kilometres, Zone, Bonus, Make, and Insured) have p-values
#below 0.01, so each of them has a statistically significant impact on the claims.
#Together they explain about 84% of the variance in Claims (adjusted R-squared 0.8421).