# Read the insurance CSV, coerce the result to a plain data.frame, and print
# its structure.
# NOTE(review): the path is an absolute, user-specific Windows location, and
# fread() is presumably data.table's reader - confirm the package is attached
# before this line runs.
insurance <- as.data.frame(
  fread("C:\\Users\\r631758\\Desktop\\test\\medical expense\\Machine Learning with R (2nd Ed.)\\Chapter 06\\insurance.csv")
)
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ expenses: num 16885 1726 4449 21984 3867 ...
# Turn the three character columns into factors in a single assignment, then
# show the five-number summary (plus mean) of the response.
insurance[c("sex", "smoker", "region")] <- lapply(
  insurance[c("sex", "smoker", "region")],
  as.factor
)
summary(insurance[["expenses"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1122 4740 9382 13270 16640 63770
#histogram(insurance$expenses)
# Density-scaled histograms with a kernel-density overlay for the raw, the
# log-transformed and the reciprocal expenses.
#
# Columns are mapped by bare name (aes(x = expenses)) instead of
# aes(insurance$expenses): a `$` reference inside aes() ignores the `data`
# argument, so the plot silently keeps using the global data frame if `data`
# is ever changed, and the axis is labelled "insurance$expenses".
p <- ggplot(data = insurance, aes(x = expenses)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("Expenses Histogram") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)

# Log transform of the response, stored as a new column.
insurance$log.insurance <- log(insurance$expenses)
p <- ggplot(data = insurance, aes(x = log.insurance)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("LOG Expenses Histogram") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)

# Reciprocal transform of the response, stored as a new column.
insurance$reci.insurance <- 1 / insurance$expenses
p <- ggplot(data = insurance, aes(x = reci.insurance)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("reciprocal Expenses Histogram") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)
# Row count per region.
table(insurance[["region"]])
##
## northeast northwest southeast southwest
## 324 325 364 325
# chart.Correlation(cbind(insurance$age, insurance$bmi, insurance$children, insurance$expenses), method = c("pearson"), pch=21)
# Correlation matrix of the numeric predictors and the response.
cor(insurance[, c("age", "bmi", "children", "expenses")])
## age bmi children expenses
## age 1.0000000 0.10934101 0.04246900 0.29900819
## bmi 0.1093410 1.00000000 0.01264471 0.19857626
## children 0.0424690 0.01264471 1.00000000 0.06799823
## expenses 0.2990082 0.19857626 0.06799823 1.00000000
#pairs(insurance[c("age", "bmi", "children", "expenses")])
library(psych)
# Pairwise panel plot (psych) of the numeric predictors and the response.
pairs.panels(insurance[, c("age", "bmi", "children", "expenses")])

library(ggplot2)
# First six rows, now including the derived log/reciprocal columns.
head(insurance, n = 6L)
## age sex bmi children smoker region expenses log.insurance
## 1 19 female 27.9 0 yes southwest 16884.92 9.734176
## 2 18 male 33.8 1 no southeast 1725.55 7.453301
## 3 28 male 33.0 3 no southeast 4449.46 8.400538
## 4 33 male 22.7 0 no northwest 21984.47 9.998092
## 5 32 male 28.9 0 no northwest 3866.86 8.260198
## 6 31 female 25.7 0 no southeast 3756.62 8.231275
## reci.insurance
## 1 5.922444e-05
## 2 5.795254e-04
## 3 2.247464e-04
## 4 4.548665e-05
## 5 2.586078e-04
## 6 2.661967e-04
# Three scatter plots with expenses on the x axis and bmi, age or children on
# the y axis, coloured by smoker status. Each gets hollow-circle points and a
# geom_smooth() layer with its default settings.
p1 <- ggplot(
  data = insurance,
  mapping = aes(x = expenses, y = bmi, colour = smoker)
) +
  geom_point(shape = 1) + # shape 1 = hollow circles
  geom_smooth() + # smoothed fit with confidence region (geom_smooth defaults)
  ggtitle("Scatter plot with smoker Status")

p2 <- ggplot(
  data = insurance,
  mapping = aes(x = expenses, y = age, colour = smoker)
) +
  geom_point(shape = 1) +
  geom_smooth() +
  ggtitle("Scatter plot with smoker Status")

p3 <- ggplot(
  data = insurance,
  mapping = aes(x = expenses, y = children, colour = smoker)
) +
  geom_point(shape = 1) +
  geom_smooth() +
  ggtitle("Scatter plot with smoker Status")

# Static side-by-side layout of the three plots.
# NOTE(review): multiplot() is not defined or attached in the visible code -
# confirm which package or helper supplies it.
multiplot(p1, p2, p3, cols = 3)

# Interactive versions, one widget per plot.
ggplotly(p1)
ggplotly(p2)
ggplotly(p3)
# Derived predictors used by the later models: squared age, and a 0/1
# indicator that is 1 when bmi is 30 or more.
insurance[["age2"]] <- insurance[["age"]]^2
insurance[["bmi30"]] <- as.numeric(insurance[["bmi"]] >= 30)
# 70/30 train/test split on the response, followed by the baseline linear
# model on the training rows.
#
# The seed is fixed so the random partition - and therefore every coefficient,
# AIC and accuracy figure derived from it - can be reproduced from run to run.
# The recorded output further down came from an unseeded run and will not
# match exactly.
set.seed(2017)
trainIndex <- createDataPartition(insurance$expenses, p = 0.7, list = FALSE, times = 1)
#https://www.rdocumentation.org/packages/caret/versions/6.0-76/topics/createDataPartition
training <- insurance[trainIndex, ]
testing <- insurance[-trainIndex, ]
training <- as.data.frame(training)

# Baseline: main effects only, no transformed or interaction terms.
ins.model <- lm(expenses ~ age + children + bmi + sex + smoker + region, data = training)
#ins.model<-lm(expenses~age+children+bmi+sex+smoker+region, data=insurance)
ins.model
##
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker +
## region, data = training)
##
## Coefficients:
## (Intercept) age children bmi
## -11293.3 254.8 402.2 328.2
## sexmale smokeryes regionnorthwest regionsoutheast
## -217.9 23338.1 -594.2 -915.4
## regionsouthwest
## -1054.0
# Coefficient table, residual quantiles and fit statistics for the baseline.
summary(object = ins.model)
##
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker +
## region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10970 -2933 -1041 1256 23560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11293.2 1147.7 -9.840 <2e-16 ***
## age 254.8 13.9 18.334 <2e-16 ***
## children 402.2 159.5 2.522 0.0118 *
## bmi 328.2 33.6 9.769 <2e-16 ***
## sexmale -217.9 391.0 -0.557 0.5774
## smokeryes 23338.1 488.4 47.788 <2e-16 ***
## regionnorthwest -594.1 569.9 -1.043 0.2974
## regionsoutheast -915.4 565.5 -1.619 0.1059
## regionsouthwest -1054.0 560.3 -1.881 0.0603 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5957 on 929 degrees of freedom
## Multiple R-squared: 0.7516, Adjusted R-squared: 0.7494
## F-statistic: 351.3 on 8 and 929 DF, p-value: < 2.2e-16
# Extended model: adds the squared-age term and the bmi30-by-smoker
# interaction (bmi30 * smoker contributes both main effects and bmi30:smoker).
ins_model2 <- lm(
  expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
  data = training
)
summary(object = ins_model2)
##
## Call:
## lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 *
## smoker + region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16730.4 -1696.7 -1314.5 -719.8 23844.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1281.881 1619.570 0.791 0.42886
## age -59.302 70.809 -0.837 0.40253
## age2 4.049 0.888 4.560 5.80e-06 ***
## children 668.031 123.927 5.391 8.92e-08 ***
## bmi 95.130 41.810 2.275 0.02311 *
## sexmale -492.901 292.364 -1.686 0.09215 .
## bmi30 -765.196 512.490 -1.493 0.13575
## smokeryes 13502.656 524.085 25.764 < 2e-16 ***
## regionnorthwest -314.017 426.079 -0.737 0.46131
## regionsoutheast -603.786 423.553 -1.426 0.15434
## regionsouthwest -1267.813 418.824 -3.027 0.00254 **
## bmi30:smokeryes 19109.395 726.802 26.292 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4452 on 926 degrees of freedom
## Multiple R-squared: 0.8617, Adjusted R-squared: 0.8601
## F-statistic: 524.5 on 11 and 926 DF, p-value: < 2.2e-16
#influence(ins_model2)
# Diagnostic plots for the extended model.
plot(x = ins_model2)

library(MASS)
# Stepwise search in both directions, starting from the extended model; the
# search trace is printed as it runs.
step <- stepAIC(object = ins_model2, direction = "both")
## Start: AIC=15772.32
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker +
## region
##
## Df Sum of Sq RSS AIC
## - age 1 1.3900e+07 1.8366e+10 15771
## <none> 1.8352e+10 15772
## - sex 1 5.6330e+07 1.8408e+10 15773
## - bmi 1 1.0260e+08 1.8455e+10 15776
## - region 3 1.9932e+08 1.8551e+10 15776
## - age2 1 4.1214e+08 1.8764e+10 15791
## - children 1 5.7588e+08 1.8928e+10 15799
## - bmi30:smoker 1 1.3700e+10 3.2052e+10 16293
##
## Step: AIC=15771.03
## expenses ~ age2 + children + bmi + sex + bmi30 + smoker + region +
## bmi30:smoker
##
## Df Sum of Sq RSS AIC
## <none> 1.8366e+10 15771
## - sex 1 5.5991e+07 1.8422e+10 15772
## + age 1 1.3900e+07 1.8352e+10 15772
## - bmi 1 9.9576e+07 1.8465e+10 15774
## - region 3 1.9921e+08 1.8565e+10 15775
## - children 1 5.7035e+08 1.8936e+10 15798
## - age2 1 1.2807e+10 3.1173e+10 16265
## - bmi30:smoker 1 1.3687e+10 3.2053e+10 16291
# Path taken by the stepwise search (initial model, final model, steps).
step[["anova"]]
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker +
## region
##
## Final Model:
## expenses ~ age2 + children + bmi + sex + bmi30 + smoker + region +
## bmi30:smoker
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 926 18351914079 15772.32
## 2 - age 1 13900488 927 18365814568 15771.03
# Model 3: the extended model without the linear age term, i.e. the final
# model reported by the stepwise search, written out explicitly.
ins_model3 <- lm(
  expenses ~ age2 + children + bmi + sex + bmi30 * smoker + region,
  data = training
)
summary(object = ins_model3)
##
## Call:
## lm(formula = expenses ~ age2 + children + bmi + sex + bmi30 *
## smoker + region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16915.4 -1679.7 -1323.2 -734.3 23987.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 293.8675 1109.4342 0.265 0.79116
## age2 3.3138 0.1303 25.425 < 2e-16 ***
## children 639.8661 119.2574 5.365 1.02e-07 ***
## bmi 93.6312 41.7647 2.242 0.02521 *
## sexmale -491.4064 292.3113 -1.681 0.09308 .
## bmi30 -733.8031 511.0352 -1.436 0.15136
## smokeryes 13508.8734 523.9479 25.783 < 2e-16 ***
## regionnorthwest -310.3216 425.9873 -0.728 0.46651
## regionsoutheast -602.4374 423.4812 -1.423 0.15519
## regionsouthwest -1266.3579 418.7528 -3.024 0.00256 **
## bmi30:smokeryes 19093.7552 726.4451 26.284 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4451 on 927 degrees of freedom
## Multiple R-squared: 0.8616, Adjusted R-squared: 0.8601
## F-statistic: 577.1 on 10 and 927 DF, p-value: < 2.2e-16
# Model 4: model 3 with the sex term removed as well.
ins_model4 <- lm(
  expenses ~ age2 + children + bmi + bmi30 * smoker + region,
  data = training
)
summary(object = ins_model4)
##
## Call:
## lm(formula = expenses ~ age2 + children + bmi + bmi30 * smoker +
## region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17085.1 -1702.0 -1299.0 -681.5 23727.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.9812 1101.9829 0.057 0.95444
## age2 3.3139 0.1305 25.400 < 2e-16 ***
## children 640.6659 119.3738 5.367 1.01e-07 ***
## bmi 93.4946 41.8057 2.236 0.02556 *
## bmi30 -739.2086 511.5277 -1.445 0.14877
## smokeryes 13470.9115 523.9759 25.709 < 2e-16 ***
## regionnorthwest -320.5873 426.3625 -0.752 0.45229
## regionsoutheast -593.9991 423.8679 -1.401 0.16144
## regionsouthwest -1271.0560 419.1553 -3.032 0.00249 **
## bmi30:smokeryes 19055.0033 726.7933 26.218 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4455 on 928 degrees of freedom
## Multiple R-squared: 0.8612, Adjusted R-squared: 0.8598
## F-statistic: 639.6 on 9 and 928 DF, p-value: < 2.2e-16
# AIC of the three candidate models (lower is better).
# NOTE(review): these values are on a different scale from the AIC column
# printed by stepAIC() for the same models; presumably the two differ by an
# additive constant - confirm before comparing across them.
AIC(object = ins_model4)
## [1] 18435.81
AIC(object = ins_model3)
## [1] 18434.96
AIC(object = ins_model2)
## [1] 18436.24
# Score the held-out rows with models 2 and 3 and report how strongly the
# predictions correlate with the actual expenses.
p.model2 <- predict(object = ins_model2, newdata = testing)
cor(x = p.model2, y = testing$expenses)
## [1] 0.9363043
p.model3 <- predict(object = ins_model3, newdata = testing)
cor(x = p.model3, y = testing$expenses)
## [1] 0.9364331

# Predictions (column p.model2) next to actual values (auto-named V2).
merge.test <- as.data.frame(cbind(p.model2, testing$expenses))
pairs.panels(cbind(p.model2, testing$expenses))

# Min-max accuracy: per row, the smaller of (predicted, actual) divided by the
# larger, averaged over all rows.
min_max_accuracy <- mean(
  pmin(merge.test$p.model2, merge.test$V2) /
    pmax(merge.test$p.model2, merge.test$V2)
)
min_max_accuracy
## [1] 0.7905797

# Mean absolute percentage error relative to the actual value.
mape <- mean(abs(merge.test$p.model2 - merge.test$V2) / merge.test$V2)
mape
## [1] 0.2953863

library(DAAG)
# 5-fold cross-validation of the extended model on the training rows;
# warnings from CVlm() are suppressed and its printed table is switched off.
cvResults <- suppressWarnings(
  CVlm(
    data = training,
    form.lm = expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
    m = 5, dots = FALSE, seed = 29, legend.pos = "topleft", printit = FALSE,
    main = "Small symbols are predicted values while bigger ones are actuals."
  )
)

# Cross-validated mean square stored by CVlm() on its result.
attr(cvResults, "ms")
## [1] 20201566
#http://r-statistics.co/Linear-Regression.html
#http://www.forecastpro.com/Trends/forecasting101August2011.html
#cv.lm(data=training, ins_model2,m=3)
# Use log expenses: repeat the analysis with log(expenses) as the response
# Second pass: model log(expenses) instead of raw expenses.
#
# NOTE: `expenses` is overwritten with its log here, so every later use of
# `expenses` (formulas, testing$expenses, accuracy metrics) is on the log
# scale, and earlier lines that read `expenses` must not be re-run after this
# point without reloading the data.
insurance$expenses <- insurance$log.insurance
insurance$age2 <- insurance$age^2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

# The seed is fixed so the random 70/30 partition, and all results that depend
# on it, are reproducible. The recorded output below came from an unseeded run
# and will not match exactly.
set.seed(2017)
trainIndex <- createDataPartition(insurance$log.insurance, p = 0.7, list = FALSE, times = 1)
#https://www.rdocumentation.org/packages/caret/versions/6.0-76/topics/createDataPartition
training <- insurance[trainIndex, ]
testing <- insurance[-trainIndex, ]
training <- as.data.frame(training)

# Distribution of the (log) response in the training rows. The column is
# mapped by bare name so the plot really uses `data = training` and the axis
# is labelled "expenses" rather than "training$expenses".
p <- ggplot(data = training, aes(x = expenses)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("LOG Expenses Histogram for training data") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)

# Baseline main-effects model on the log response.
ins.model <- lm(expenses ~ age + children + bmi + sex + smoker + region, data = training)
#ins.model<-lm(expenses~age+children+bmi+sex+smoker+region, data=insurance)
ins.model
##
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker +
## region, data = training)
##
## Coefficients:
## (Intercept) age children bmi
## 7.09457 0.03504 0.10737 0.01065
## sexmale smokeryes regionnorthwest regionsoutheast
## -0.08413 1.61106 -0.07127 -0.14203
## regionsouthwest
## -0.15842
# Coefficient table, residual quantiles and fit statistics for the baseline
# model on the log response.
summary(object = ins.model)
##
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker +
## region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00584 -0.18838 -0.04325 0.05908 2.17726
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094574 0.085313 83.159 < 2e-16 ***
## age 0.035045 0.001021 34.310 < 2e-16 ***
## children 0.107374 0.011918 9.009 < 2e-16 ***
## bmi 0.010655 0.002450 4.349 1.52e-05 ***
## sexmale -0.084130 0.028771 -2.924 0.003538 **
## smokeryes 1.611063 0.035668 45.168 < 2e-16 ***
## regionnorthwest -0.071270 0.041500 -1.717 0.086251 .
## regionsoutheast -0.142032 0.041517 -3.421 0.000651 ***
## regionsouthwest -0.158424 0.041095 -3.855 0.000124 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4371 on 929 degrees of freedom
## Multiple R-squared: 0.7761, Adjusted R-squared: 0.7742
## F-statistic: 402.6 on 8 and 929 DF, p-value: < 2.2e-16
# Extended model on the log response: squared-age term plus the
# bmi30-by-smoker interaction (main effects and bmi30:smoker).
ins_model2 <- lm(
  expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
  data = training
)
summary(object = ins_model2)
##
## Call:
## lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 *
## smoker + region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79177 -0.18670 -0.07174 0.06520 2.27167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.028e+00 1.548e-01 45.393 < 2e-16 ***
## age 5.093e-02 6.724e-03 7.574 8.76e-14 ***
## age2 -2.001e-04 8.382e-05 -2.387 0.017179 *
## children 1.025e-01 1.189e-02 8.620 < 2e-16 ***
## bmi 4.588e-03 3.843e-03 1.194 0.232889
## sexmale -9.982e-02 2.743e-02 -3.639 0.000289 ***
## bmi30 -4.152e-02 4.749e-02 -0.874 0.382220
## smokeryes 1.261e+00 5.044e-02 25.005 < 2e-16 ***
## regionnorthwest -4.858e-02 3.959e-02 -1.227 0.220135
## regionsoutheast -1.314e-01 3.964e-02 -3.314 0.000955 ***
## regionsouthwest -1.453e-01 3.918e-02 -3.708 0.000221 ***
## bmi30:smokeryes 6.435e-01 6.817e-02 9.440 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4161 on 926 degrees of freedom
## Multiple R-squared: 0.7978, Adjusted R-squared: 0.7954
## F-statistic: 332.1 on 11 and 926 DF, p-value: < 2.2e-16
#influence(ins_model2)
# Diagnostic plots for the extended log-scale model.
plot(x = ins_model2)

library(MASS)
# Stepwise search in both directions, starting from the extended model; the
# search trace is printed as it runs.
step <- stepAIC(object = ins_model2, direction = "both")
## Start: AIC=-1633.08
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker +
## region
##
## Df Sum of Sq RSS AIC
## - bmi 1 0.2467 160.56 -1633.6
## <none> 160.32 -1633.1
## - age2 1 0.9866 161.30 -1629.3
## - sex 1 2.2920 162.61 -1621.8
## - region 3 3.1468 163.46 -1620.8
## - age 1 9.9311 170.25 -1578.7
## - children 1 12.8632 173.18 -1562.7
## - bmi30:smoker 1 15.4294 175.75 -1548.9
##
## Step: AIC=-1633.63
## expenses ~ age + age2 + children + sex + bmi30 + smoker + region +
## bmi30:smoker
##
## Df Sum of Sq RSS AIC
## <none> 160.56 -1633.6
## + bmi 1 0.2467 160.32 -1633.1
## - age2 1 0.9928 161.56 -1629.8
## - sex 1 2.2318 162.79 -1622.7
## - region 3 2.9400 163.50 -1622.6
## - age 1 9.9760 170.54 -1579.1
## - children 1 13.0399 173.60 -1562.4
## - bmi30:smoker 1 15.6180 176.18 -1548.6
# Path taken by the stepwise search (initial model, final model, steps).
step[["anova"]]
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker +
## region
##
## Final Model:
## expenses ~ age + age2 + children + sex + bmi30 + smoker + region +
## bmi30:smoker
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 926 160.3157 -1633.075
## 2 - bmi 1 0.2467046 927 160.5624 -1633.633
# Model 3 for the log response: the final model chosen by the stepwise search
# on the log-scale data, which drops bmi and keeps both age and age2.
#
# The previous formula (age2 + children + bmi + sex + bmi30 * smoker + region)
# had been copied from the raw-expenses analysis, where the search dropped age
# instead. It therefore did not correspond to the stepwise result for this
# response. The recorded summary below was produced with that old formula.
ins_model3 <- lm(
  expenses ~ age + age2 + children + sex + bmi30 * smoker + region,
  data = training
)
summary(ins_model3)
##
## Call:
## lm(formula = expenses ~ age2 + children + bmi + sex + bmi30 *
## smoker + region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01477 -0.18754 -0.02280 0.09469 2.14588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.903e+00 1.062e-01 74.404 < 2e-16 ***
## age2 4.281e-04 1.248e-05 34.288 < 2e-16 ***
## children 1.292e-01 1.169e-02 11.050 < 2e-16 ***
## bmi 4.988e-03 3.958e-03 1.260 0.207919
## sexmale -9.944e-02 2.826e-02 -3.519 0.000454 ***
## bmi30 -5.914e-02 4.885e-02 -1.211 0.226373
## smokeryes 1.240e+00 5.187e-02 23.906 < 2e-16 ***
## regionnorthwest -4.459e-02 4.077e-02 -1.093 0.274464
## regionsoutheast -1.366e-01 4.082e-02 -3.345 0.000855 ***
## regionsouthwest -1.388e-01 4.034e-02 -3.441 0.000606 ***
## bmi30:smokeryes 6.661e-01 7.014e-02 9.496 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4285 on 927 degrees of freedom
## Multiple R-squared: 0.7853, Adjusted R-squared: 0.783
## F-statistic: 339 on 10 and 927 DF, p-value: < 2.2e-16
# Model 4 on the log response: squared age, children, bmi, region and the
# bmi30-by-smoker interaction.
# NOTE(review): this formula leaves out age and sex, both of which the
# stepwise search on the log-scale data retained - confirm that is intended
# rather than carried over from the raw-expenses analysis.
ins_model4 <- lm(
  expenses ~ age2 + children + bmi + bmi30 * smoker + region,
  data = training
)
summary(object = ins_model4)
##
## Call:
## lm(formula = expenses ~ age2 + children + bmi + bmi30 * smoker +
## region, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99343 -0.17484 -0.02595 0.09662 2.10057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.866e+00 1.064e-01 73.965 < 2e-16 ***
## age2 4.292e-04 1.256e-05 34.181 < 2e-16 ***
## children 1.281e-01 1.176e-02 10.896 < 2e-16 ***
## bmi 4.387e-03 3.979e-03 1.103 0.27045
## bmi30 -5.710e-02 4.915e-02 -1.162 0.24565
## smokeryes 1.242e+00 5.219e-02 23.791 < 2e-16 ***
## regionnorthwest -3.709e-02 4.097e-02 -0.905 0.36558
## regionsoutheast -1.343e-01 4.107e-02 -3.270 0.00111 **
## regionsouthwest -1.334e-01 4.056e-02 -3.289 0.00104 **
## bmi30:smokeryes 6.518e-01 7.045e-02 9.251 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4312 on 928 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.7803
## F-statistic: 370.7 on 9 and 928 DF, p-value: < 2.2e-16
# AIC of the three log-scale candidate models (lower is better).
AIC(object = ins_model4)
## [1] 1095.68
AIC(object = ins_model3)
## [1] 1085.231
AIC(object = ins_model2)
## [1] 1030.853
# Score the held-out rows with the log-scale models 2 and 3. Predictions and
# testing$expenses are both log(expenses) here, so these correlations are on
# the log scale.
p.model2 <- predict(ins_model2, testing)
cor(p.model2, testing$expenses)
## [1] 0.8789698
p.model3 <- predict(ins_model3, testing)
cor(p.model3, testing$expenses)
## [1] 0.8687979

# Predictions (column p.model2) next to actual values (auto-named V2), both
# still on the log scale.
merge.test <- cbind(p.model2, testing$expenses)
merge.test <- as.data.frame(merge.test)
pairs.panels(cbind(p.model2, testing$expenses))

# Back-transform to the original expense scale before computing ratio-based
# accuracy measures. Taken on log values, min-max accuracy and MAPE are
# compressed towards 1 and 0 and cannot be compared with the same measures
# from the raw-expenses models.
# NOTE(review): a plain exp() back-transform of a log-linear fit may
# understate the conditional mean - consider a retransformation correction.
pred_dollars <- exp(merge.test$p.model2)
actual_dollars <- exp(merge.test$V2)

# Per row: smaller of (predicted, actual) over the larger, averaged.
min_max_accuracy <- mean(
  pmin(pred_dollars, actual_dollars) / pmax(pred_dollars, actual_dollars)
)
min_max_accuracy
# Mean absolute percentage error relative to the actual expense.
mape <- mean(abs(pred_dollars - actual_dollars) / actual_dollars)
mape
# (Values previously recorded on the log scale: min_max_accuracy 0.9707279,
# mape 0.02996911.)

library(DAAG)
# 5-fold cross-validation of the extended model on the training rows; the
# reported mean square is on the log scale.
cvResults <- suppressWarnings(
  CVlm(
    data = training,
    form.lm = expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
    m = 5, dots = FALSE, seed = 29, legend.pos = "topleft", printit = FALSE,
    main = "Small symbols are predicted values while bigger ones are actuals."
  )
)

attr(cvResults, "ms")
## [1] 0.1756247
#http://r-statistics.co/Linear-Regression.html
#http://www.forecastpro.com/Trends/forecasting101August2011.html
#cv.lm(data=training, ins_model2,m=3)