# Read the insurance CSV, convert the result to a plain data.frame and print
# the column types as a quick sanity check.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact location exists.
insurance <- as.data.frame(
  fread("C:\\Users\\r631758\\Desktop\\test\\medical expense\\Machine Learning with R (2nd Ed.)\\Chapter 06\\insurance.csv")
)
str(insurance)

## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ expenses: num  16885 1726 4449 21984 3867 ...

# Treat the three character columns as categorical variables so that lm()
# expands them into indicator terms.
insurance[c("sex", "smoker", "region")] <- lapply(
  insurance[c("sex", "smoker", "region")],
  as.factor
)

# Distribution of the response variable.
summary(insurance$expenses)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1122    4740    9382   13270   16640   63770

# Density-scaled histograms with a kernel density overlay for the raw,
# log-transformed and reciprocal expenses.
# Columns are referenced by bare name inside aes(): ggplot2 then looks them up
# in `data`, whereas `insurance$expenses` bypasses the data argument and labels
# the axis "insurance$expenses".
# histogram(insurance$expenses)
p <- ggplot(data = insurance, aes(x = expenses)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("Expenses Histogram") +
  geom_density(color = "black", alpha = 0.5)

ggplotly(p)

# Log transform of the response.
insurance$log.insurance <- log(insurance$expenses)

p <- ggplot(data = insurance, aes(x = log.insurance)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("LOG Expenses Histogram") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)

# Reciprocal transform of the response.
insurance$reci.insurance <- 1 / insurance$expenses

p <- ggplot(data = insurance, aes(x = reci.insurance)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("reciprocal Expenses Histogram") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)

# Number of observations in each region.
table(insurance$region)

## 
## northeast northwest southeast southwest 
##       324       325       364       325

# chart.Correlation(cbind(insurance$age, insurance$bmi, insurance$children, insurance$expenses),  method = c("pearson"), pch=21)

# Correlation matrix of the numeric columns (cor() defaults to Pearson).
cor(insurance[, c("age", "bmi", "children", "expenses")])

##                age        bmi   children   expenses
## age      1.0000000 0.10934101 0.04246900 0.29900819
## bmi      0.1093410 1.00000000 0.01264471 0.19857626
## children 0.0424690 0.01264471 1.00000000 0.06799823
## expenses 0.2990082 0.19857626 0.06799823 1.00000000

# pairs(insurance[c("age", "bmi", "children", "expenses")])

# Scatterplot matrix of the numeric columns, drawn with psych::pairs.panels().
library(psych)
pairs.panels(insurance[, c("age", "bmi", "children", "expenses")])

library(ggplot2)
# First rows of the data, including the derived log and reciprocal columns.
head(insurance)

##   age    sex  bmi children smoker    region expenses log.insurance
## 1  19 female 27.9        0    yes southwest 16884.92      9.734176
## 2  18   male 33.8        1     no southeast  1725.55      7.453301
## 3  28   male 33.0        3     no southeast  4449.46      8.400538
## 4  33   male 22.7        0     no northwest 21984.47      9.998092
## 5  32   male 28.9        0     no northwest  3866.86      8.260198
## 6  31 female 25.7        0     no southeast  3756.62      8.231275
##   reci.insurance
## 1   5.922444e-05
## 2   5.795254e-04
## 3   2.247464e-04
## 4   4.548665e-05
## 5   2.586078e-04
## 6   2.661967e-04

# Expenses (x axis) against bmi, age and number of children, coloured by smoker
# status. Each plot draws hollow points (shape 1) plus geom_smooth()'s default
# smoothed fit with its confidence band, one curve per smoker group; the
# smoothing method is whatever geom_smooth() selects automatically.
p1 <- ggplot(insurance, aes(x = expenses, y = bmi, colour = smoker)) +
  geom_point(shape = 1) +
  geom_smooth() +
  ggtitle("Scatter plot with smoker Status")

p2 <- ggplot(insurance, aes(x = expenses, y = age, colour = smoker)) +
  geom_point(shape = 1) +
  geom_smooth() +
  ggtitle("Scatter plot with smoker Status")

p3 <- ggplot(insurance, aes(x = expenses, y = children, colour = smoker)) +
  geom_point(shape = 1) +
  geom_smooth() +
  ggtitle("Scatter plot with smoker Status")

# Lay the three plots out in three columns.
# NOTE(review): multiplot() is not defined in this section; presumably it is
# the Rmisc / "Cookbook for R" helper loaded elsewhere -- confirm.
multiplot(p1, p2, p3, cols = 3)

# Interactive versions of the same plots.
ggplotly(p1)

ggplotly(p2)

ggplotly(p3)

# Derived predictors: a quadratic age term and an indicator for BMI >= 30.
insurance$age2 <- insurance$age^2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

# Seed the RNG so the 70/30 split, and every model fitted on it, can be
# reproduced. The results printed in this document came from an unseeded run,
# so re-running will give slightly different numbers.
set.seed(123)
trainIndex <- createDataPartition(insurance$expenses, p = 0.7, list = FALSE, times = 1)
# https://www.rdocumentation.org/packages/caret/versions/6.0-76/topics/createDataPartition
# `insurance` is already a data.frame, so both subsets are data.frames as well.
training <- insurance[trainIndex, ]
testing <- insurance[-trainIndex, ]

# Baseline model: main effects of all original predictors.
ins.model <- lm(expenses ~ age + children + bmi + sex + smoker + region, data = training)
# ins.model<-lm(expenses~age+children+bmi+sex+smoker+region, data=insurance)
ins.model

## 
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker + 
##     region, data = training)
## 
## Coefficients:
##     (Intercept)              age         children              bmi  
##        -11293.3            254.8            402.2            328.2  
##         sexmale        smokeryes  regionnorthwest  regionsoutheast  
##          -217.9          23338.1           -594.2           -915.4  
## regionsouthwest  
##         -1054.0

# Coefficient table, residual summary and fit statistics for the baseline model.
summary(ins.model)

## 
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker + 
##     region, data = training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10970  -2933  -1041   1256  23560 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11293.2     1147.7  -9.840   <2e-16 ***
## age                254.8       13.9  18.334   <2e-16 ***
## children           402.2      159.5   2.522   0.0118 *  
## bmi                328.2       33.6   9.769   <2e-16 ***
## sexmale           -217.9      391.0  -0.557   0.5774    
## smokeryes        23338.1      488.4  47.788   <2e-16 ***
## regionnorthwest   -594.1      569.9  -1.043   0.2974    
## regionsoutheast   -915.4      565.5  -1.619   0.1059    
## regionsouthwest  -1054.0      560.3  -1.881   0.0603 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5957 on 929 degrees of freedom
## Multiple R-squared:  0.7516, Adjusted R-squared:  0.7494 
## F-statistic: 351.3 on 8 and 929 DF,  p-value: < 2.2e-16

 # Extended model: adds the quadratic age term and the bmi30 x smoker
 # interaction (bmi30 * smoker expands to both main effects plus bmi30:smoker).
 ins_model2 <- lm(
   expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
   data = training
 )
 summary(ins_model2)

## 
## Call:
## lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 * 
##     smoker + region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16730.4  -1696.7  -1314.5   -719.8  23844.4 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1281.881   1619.570   0.791  0.42886    
## age               -59.302     70.809  -0.837  0.40253    
## age2                4.049      0.888   4.560 5.80e-06 ***
## children          668.031    123.927   5.391 8.92e-08 ***
## bmi                95.130     41.810   2.275  0.02311 *  
## sexmale          -492.901    292.364  -1.686  0.09215 .  
## bmi30            -765.196    512.490  -1.493  0.13575    
## smokeryes       13502.656    524.085  25.764  < 2e-16 ***
## regionnorthwest  -314.017    426.079  -0.737  0.46131    
## regionsoutheast  -603.786    423.553  -1.426  0.15434    
## regionsouthwest -1267.813    418.824  -3.027  0.00254 ** 
## bmi30:smokeryes 19109.395    726.802  26.292  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4452 on 926 degrees of freedom
## Multiple R-squared:  0.8617, Adjusted R-squared:  0.8601 
## F-statistic: 524.5 on 11 and 926 DF,  p-value: < 2.2e-16

 # influence(ins_model2)
 # Standard lm diagnostic plots for the extended model.
 plot(ins_model2)

 # Stepwise selection by AIC; direction = "both" lets terms be dropped and
 # re-added.
 # NOTE(review): the result is stored as `step`, the same name as the
 # stats::step() function; the name is kept because later code reads step$anova.
 library(MASS)
 step <- stepAIC(ins_model2, direction = "both")

## Start:  AIC=15772.32
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + 
##     region
## 
##                Df  Sum of Sq        RSS   AIC
## - age           1 1.3900e+07 1.8366e+10 15771
## <none>                       1.8352e+10 15772
## - sex           1 5.6330e+07 1.8408e+10 15773
## - bmi           1 1.0260e+08 1.8455e+10 15776
## - region        3 1.9932e+08 1.8551e+10 15776
## - age2          1 4.1214e+08 1.8764e+10 15791
## - children      1 5.7588e+08 1.8928e+10 15799
## - bmi30:smoker  1 1.3700e+10 3.2052e+10 16293
## 
## Step:  AIC=15771.03
## expenses ~ age2 + children + bmi + sex + bmi30 + smoker + region + 
##     bmi30:smoker
## 
##                Df  Sum of Sq        RSS   AIC
## <none>                       1.8366e+10 15771
## - sex           1 5.5991e+07 1.8422e+10 15772
## + age           1 1.3900e+07 1.8352e+10 15772
## - bmi           1 9.9576e+07 1.8465e+10 15774
## - region        3 1.9921e+08 1.8565e+10 15775
## - children      1 5.7035e+08 1.8936e+10 15798
## - age2          1 1.2807e+10 3.1173e+10 16265
## - bmi30:smoker  1 1.3687e+10 3.2053e+10 16291

 # Path taken by the stepwise search: initial model, final model, and the
 # term removed or added at each step.
 step$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + 
##     region
## 
## Final Model:
## expenses ~ age2 + children + bmi + sex + bmi30 + smoker + region + 
##     bmi30:smoker
## 
## 
##    Step Df Deviance Resid. Df  Resid. Dev      AIC
## 1                         926 18351914079 15772.32
## 2 - age  1 13900488       927 18365814568 15771.03

  # Refit the formula reported as the final model by the stepwise search above
  # (the extended model without the linear age term).
  ins_model3 <- lm(
    expenses ~ age2 + children + bmi + sex + bmi30 * smoker + region,
    data = training
  )
  summary(ins_model3)

## 
## Call:
## lm(formula = expenses ~ age2 + children + bmi + sex + bmi30 * 
##     smoker + region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16915.4  -1679.7  -1323.2   -734.3  23987.5 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       293.8675  1109.4342   0.265  0.79116    
## age2                3.3138     0.1303  25.425  < 2e-16 ***
## children          639.8661   119.2574   5.365 1.02e-07 ***
## bmi                93.6312    41.7647   2.242  0.02521 *  
## sexmale          -491.4064   292.3113  -1.681  0.09308 .  
## bmi30            -733.8031   511.0352  -1.436  0.15136    
## smokeryes       13508.8734   523.9479  25.783  < 2e-16 ***
## regionnorthwest  -310.3216   425.9873  -0.728  0.46651    
## regionsoutheast  -602.4374   423.4812  -1.423  0.15519    
## regionsouthwest -1266.3579   418.7528  -3.024  0.00256 ** 
## bmi30:smokeryes 19093.7552   726.4451  26.284  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4451 on 927 degrees of freedom
## Multiple R-squared:  0.8616, Adjusted R-squared:  0.8601 
## F-statistic: 577.1 on 10 and 927 DF,  p-value: < 2.2e-16

# Same terms as ins_model3 but without sex.
ins_model4 <- lm(
  expenses ~ age2 + children + bmi + bmi30 * smoker + region,
  data = training
)
summary(ins_model4)

## 
## Call:
## lm(formula = expenses ~ age2 + children + bmi + bmi30 * smoker + 
##     region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17085.1  -1702.0  -1299.0   -681.5  23727.9 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        62.9812  1101.9829   0.057  0.95444    
## age2                3.3139     0.1305  25.400  < 2e-16 ***
## children          640.6659   119.3738   5.367 1.01e-07 ***
## bmi                93.4946    41.8057   2.236  0.02556 *  
## bmi30            -739.2086   511.5277  -1.445  0.14877    
## smokeryes       13470.9115   523.9759  25.709  < 2e-16 ***
## regionnorthwest  -320.5873   426.3625  -0.752  0.45229    
## regionsoutheast  -593.9991   423.8679  -1.401  0.16144    
## regionsouthwest -1271.0560   419.1553  -3.032  0.00249 ** 
## bmi30:smokeryes 19055.0033   726.7933  26.218  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4455 on 928 degrees of freedom
## Multiple R-squared:  0.8612, Adjusted R-squared:  0.8598 
## F-statistic: 639.6 on 9 and 928 DF,  p-value: < 2.2e-16

# Compare the three candidate models by AIC (lower is better); ins_model3 has
# the smallest value here.
# These values are larger than the ones printed by stepAIC() because stepAIC()
# relies on extractAIC(), which omits an additive constant for lm fits. The
# constant is the same for every model fitted to the same data, so rankings
# are unaffected.
AIC(ins_model4)

## [1] 18435.81

AIC(ins_model3)

## [1] 18434.96

AIC(ins_model2)

## [1] 18436.24

 # Predict on the hold-out set and report the correlation between predicted
 # and observed expenses, first for the extended model ...
 p.model2 <- predict(ins_model2, newdata = testing)
 cor(p.model2, testing$expenses)

## [1] 0.9363043

 # ... then for the stepwise-selected model.
 p.model3 <- predict(ins_model3, newdata = testing)
 cor(p.model3, testing$expenses)

## [1] 0.9364331

 # Pair each test-set prediction with the observed value. The resulting column
 # names (p.model2, V2) are kept because the MAPE computation further down
 # reads them.
 merge.test <- as.data.frame(cbind(p.model2, testing$expenses))
 pairs.panels(cbind(p.model2, testing$expenses))

 # Min-max accuracy: mean over observations of
 # min(predicted, actual) / max(predicted, actual); 1 means perfect agreement.
 # pmin()/pmax() compute the row-wise minimum and maximum in one vectorised
 # call instead of looping over data-frame rows with apply().
 min_max_accuracy <- mean(
   pmin(merge.test$p.model2, merge.test$V2) /
     pmax(merge.test$p.model2, merge.test$V2)
 )
 min_max_accuracy

## [1] 0.7905797

  # Mean absolute percentage error of the ins_model2 predictions
  # (column V2 holds the observed expenses).
  mape <- mean(abs(merge.test$p.model2 - merge.test$V2) / merge.test$V2)
  mape

## [1] 0.2953863

# 5-fold cross-validation (m = 5) of the extended model on the training data
# using DAAG::CVlm(); warnings raised by CVlm() are suppressed.
library(DAAG)
cvResults <- suppressWarnings(
  CVlm(
    data = training,
    form.lm = expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
    m = 5,
    dots = FALSE,
    seed = 29,
    legend.pos = "topleft",
    printit = FALSE,
    main = "Small symbols are predicted values while bigger ones are actuals."
  )
)

# Value CVlm() stores in the "ms" attribute of its result (the cross-validation
# mean square, per the r-statistics.co reference linked below).
attr(cvResults, "ms")

## [1] 20201566

 #http://r-statistics.co/Linear-Regression.html
  #http://www.forecastpro.com/Trends/forecasting101August2011.html
#cv.lm(data=training, ins_model2,m=3)

Use log expenses: repeat the modelling with log(expenses) as the response

# From here on `expenses` holds log(expenses), so the model formulas from the
# previous section can be reused unchanged. The raw values in this column are
# overwritten; exp(expenses) recovers them.
insurance$expenses <- insurance$log.insurance
insurance$age2 <- insurance$age^2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

# Seed the RNG so the new 70/30 split is reproducible. The results printed in
# this document came from an unseeded run, so re-running will give slightly
# different numbers.
set.seed(123)
trainIndex <- createDataPartition(insurance$log.insurance, p = 0.7, list = FALSE, times = 1)
# https://www.rdocumentation.org/packages/caret/versions/6.0-76/topics/createDataPartition
# `insurance` is already a data.frame, so both subsets are data.frames as well.
training <- insurance[trainIndex, ]
testing <- insurance[-trainIndex, ]



# Density-scaled histogram of the response in the training split.
# `expenses` is referenced by bare name so ggplot2 takes it from `data`;
# `training$expenses` inside aes() bypasses the data argument and labels the
# axis "training$expenses".
p <- ggplot(data = training, aes(x = expenses)) +
  geom_histogram(aes(y = ..density..), fill = "skyblue", col = "pink") +
  ggtitle("LOG Expenses Histogram for training data") +
  geom_density(color = "black", alpha = 0.5)
ggplotly(p)

# Baseline model: main effects of all original predictors.
ins.model <- lm(expenses ~ age + children + bmi + sex + smoker + region, data = training)
# ins.model<-lm(expenses~age+children+bmi+sex+smoker+region, data=insurance)
ins.model

## 
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker + 
##     region, data = training)
## 
## Coefficients:
##     (Intercept)              age         children              bmi  
##         7.09457          0.03504          0.10737          0.01065  
##         sexmale        smokeryes  regionnorthwest  regionsoutheast  
##        -0.08413          1.61106         -0.07127         -0.14203  
## regionsouthwest  
##        -0.15842

# Coefficient table, residual summary and fit statistics for the baseline model.
summary(ins.model)

## 
## Call:
## lm(formula = expenses ~ age + children + bmi + sex + smoker + 
##     region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00584 -0.18838 -0.04325  0.05908  2.17726 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.094574   0.085313  83.159  < 2e-16 ***
## age              0.035045   0.001021  34.310  < 2e-16 ***
## children         0.107374   0.011918   9.009  < 2e-16 ***
## bmi              0.010655   0.002450   4.349 1.52e-05 ***
## sexmale         -0.084130   0.028771  -2.924 0.003538 ** 
## smokeryes        1.611063   0.035668  45.168  < 2e-16 ***
## regionnorthwest -0.071270   0.041500  -1.717 0.086251 .  
## regionsoutheast -0.142032   0.041517  -3.421 0.000651 ***
## regionsouthwest -0.158424   0.041095  -3.855 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4371 on 929 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7742 
## F-statistic: 402.6 on 8 and 929 DF,  p-value: < 2.2e-16

 # Extended model: adds the quadratic age term and the bmi30 x smoker
 # interaction (bmi30 * smoker expands to both main effects plus bmi30:smoker).
 ins_model2 <- lm(
   expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
   data = training
 )
 summary(ins_model2)

## 
## Call:
## lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 * 
##     smoker + region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79177 -0.18670 -0.07174  0.06520  2.27167 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.028e+00  1.548e-01  45.393  < 2e-16 ***
## age              5.093e-02  6.724e-03   7.574 8.76e-14 ***
## age2            -2.001e-04  8.382e-05  -2.387 0.017179 *  
## children         1.025e-01  1.189e-02   8.620  < 2e-16 ***
## bmi              4.588e-03  3.843e-03   1.194 0.232889    
## sexmale         -9.982e-02  2.743e-02  -3.639 0.000289 ***
## bmi30           -4.152e-02  4.749e-02  -0.874 0.382220    
## smokeryes        1.261e+00  5.044e-02  25.005  < 2e-16 ***
## regionnorthwest -4.858e-02  3.959e-02  -1.227 0.220135    
## regionsoutheast -1.314e-01  3.964e-02  -3.314 0.000955 ***
## regionsouthwest -1.453e-01  3.918e-02  -3.708 0.000221 ***
## bmi30:smokeryes  6.435e-01  6.817e-02   9.440  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4161 on 926 degrees of freedom
## Multiple R-squared:  0.7978, Adjusted R-squared:  0.7954 
## F-statistic: 332.1 on 11 and 926 DF,  p-value: < 2.2e-16

 # influence(ins_model2)
 # Standard lm diagnostic plots for the extended model.
 plot(ins_model2)

 # Stepwise selection by AIC; direction = "both" lets terms be dropped and
 # re-added.
 # NOTE(review): the result is stored as `step`, the same name as the
 # stats::step() function; the name is kept because later code reads step$anova.
 library(MASS)
 step <- stepAIC(ins_model2, direction = "both")

## Start:  AIC=-1633.08
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + 
##     region
## 
##                Df Sum of Sq    RSS     AIC
## - bmi           1    0.2467 160.56 -1633.6
## <none>                      160.32 -1633.1
## - age2          1    0.9866 161.30 -1629.3
## - sex           1    2.2920 162.61 -1621.8
## - region        3    3.1468 163.46 -1620.8
## - age           1    9.9311 170.25 -1578.7
## - children      1   12.8632 173.18 -1562.7
## - bmi30:smoker  1   15.4294 175.75 -1548.9
## 
## Step:  AIC=-1633.63
## expenses ~ age + age2 + children + sex + bmi30 + smoker + region + 
##     bmi30:smoker
## 
##                Df Sum of Sq    RSS     AIC
## <none>                      160.56 -1633.6
## + bmi           1    0.2467 160.32 -1633.1
## - age2          1    0.9928 161.56 -1629.8
## - sex           1    2.2318 162.79 -1622.7
## - region        3    2.9400 163.50 -1622.6
## - age           1    9.9760 170.54 -1579.1
## - children      1   13.0399 173.60 -1562.4
## - bmi30:smoker  1   15.6180 176.18 -1548.6

 # Path taken by the stepwise search: initial model, final model, and the
 # term removed or added at each step.
 step$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + 
##     region
## 
## Final Model:
## expenses ~ age + age2 + children + sex + bmi30 + smoker + region + 
##     bmi30:smoker
## 
## 
##    Step Df  Deviance Resid. Df Resid. Dev       AIC
## 1                          926   160.3157 -1633.075
## 2 - bmi  1 0.2467046       927   160.5624 -1633.633

  # Refit the model selected by the stepwise search for this response: the
  # search kept `age` and dropped `bmi`. The formula previously used here
  # dropped `age` and kept `bmi` instead, which did not match the stepAIC
  # result and gave a worse AIC than ins_model2.
  # The summary printed below this call was produced with that earlier formula.
  ins_model3 <- lm(
    expenses ~ age + age2 + children + sex + bmi30 * smoker + region,
    data = training
  )
  summary(ins_model3)

## 
## Call:
## lm(formula = expenses ~ age2 + children + bmi + sex + bmi30 * 
##     smoker + region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01477 -0.18754 -0.02280  0.09469  2.14588 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.903e+00  1.062e-01  74.404  < 2e-16 ***
## age2             4.281e-04  1.248e-05  34.288  < 2e-16 ***
## children         1.292e-01  1.169e-02  11.050  < 2e-16 ***
## bmi              4.988e-03  3.958e-03   1.260 0.207919    
## sexmale         -9.944e-02  2.826e-02  -3.519 0.000454 ***
## bmi30           -5.914e-02  4.885e-02  -1.211 0.226373    
## smokeryes        1.240e+00  5.187e-02  23.906  < 2e-16 ***
## regionnorthwest -4.459e-02  4.077e-02  -1.093 0.274464    
## regionsoutheast -1.366e-01  4.082e-02  -3.345 0.000855 ***
## regionsouthwest -1.388e-01  4.034e-02  -3.441 0.000606 ***
## bmi30:smokeryes  6.661e-01  7.014e-02   9.496  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4285 on 927 degrees of freedom
## Multiple R-squared:  0.7853, Adjusted R-squared:  0.783 
## F-statistic:   339 on 10 and 927 DF,  p-value: < 2.2e-16

# Model without the linear age term and without sex.
# NOTE(review): this formula is the same one used for ins_model4 earlier in the
# file; confirm that dropping `sex` is still intended for this response.
ins_model4 <- lm(
  expenses ~ age2 + children + bmi + bmi30 * smoker + region,
  data = training
)
summary(ins_model4)

## 
## Call:
## lm(formula = expenses ~ age2 + children + bmi + bmi30 * smoker + 
##     region, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.99343 -0.17484 -0.02595  0.09662  2.10057 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.866e+00  1.064e-01  73.965  < 2e-16 ***
## age2             4.292e-04  1.256e-05  34.181  < 2e-16 ***
## children         1.281e-01  1.176e-02  10.896  < 2e-16 ***
## bmi              4.387e-03  3.979e-03   1.103  0.27045    
## bmi30           -5.710e-02  4.915e-02  -1.162  0.24565    
## smokeryes        1.242e+00  5.219e-02  23.791  < 2e-16 ***
## regionnorthwest -3.709e-02  4.097e-02  -0.905  0.36558    
## regionsoutheast -1.343e-01  4.107e-02  -3.270  0.00111 ** 
## regionsouthwest -1.334e-01  4.056e-02  -3.289  0.00104 ** 
## bmi30:smokeryes  6.518e-01  7.045e-02   9.251  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4312 on 928 degrees of freedom
## Multiple R-squared:  0.7824, Adjusted R-squared:  0.7803 
## F-statistic: 370.7 on 9 and 928 DF,  p-value: < 2.2e-16

# Compare the three candidate models by AIC (lower is better); ins_model2 has
# the smallest value here.
# These values differ from the ones printed by stepAIC() because stepAIC()
# relies on extractAIC(), which omits an additive constant for lm fits. The
# constant is the same for every model fitted to the same data, so rankings
# are unaffected.
AIC(ins_model4)

## [1] 1095.68

AIC(ins_model3)

## [1] 1085.231

AIC(ins_model2)

## [1] 1030.853

 # Predict on the hold-out set and report the correlation between predicted
 # and observed values of `expenses`, first for ins_model2 ...
 p.model2 <- predict(ins_model2, newdata = testing)
 cor(p.model2, testing$expenses)

## [1] 0.8789698

 # ... then for ins_model3.
 p.model3 <- predict(ins_model3, newdata = testing)
 cor(p.model3, testing$expenses)

## [1] 0.8687979

 # Pair each test-set prediction with the observed value. The resulting column
 # names (p.model2, V2) are kept because the MAPE computation further down
 # reads them.
 merge.test <- as.data.frame(cbind(p.model2, testing$expenses))
 pairs.panels(cbind(p.model2, testing$expenses))

 # Min-max accuracy: mean over observations of
 # min(predicted, actual) / max(predicted, actual); 1 means perfect agreement.
 # pmin()/pmax() compute the row-wise minimum and maximum in one vectorised
 # call instead of looping over data-frame rows with apply().
 min_max_accuracy <- mean(
   pmin(merge.test$p.model2, merge.test$V2) /
     pmax(merge.test$p.model2, merge.test$V2)
 )
 min_max_accuracy

## [1] 0.9707279

  # Mean absolute percentage error of the ins_model2 predictions
  # (column V2 holds the observed `expenses` values from the test set).
  mape <- mean(abs(merge.test$p.model2 - merge.test$V2) / merge.test$V2)
  mape

## [1] 0.02996911

# 5-fold cross-validation (m = 5) of the extended model on the training data
# using DAAG::CVlm(); warnings raised by CVlm() are suppressed.
library(DAAG)
cvResults <- suppressWarnings(
  CVlm(
    data = training,
    form.lm = expenses ~ age + age2 + children + bmi + sex + bmi30 * smoker + region,
    m = 5,
    dots = FALSE,
    seed = 29,
    legend.pos = "topleft",
    printit = FALSE,
    main = "Small symbols are predicted values while bigger ones are actuals."
  )
)

# Value CVlm() stores in the "ms" attribute of its result (the cross-validation
# mean square, per the r-statistics.co reference linked below).
attr(cvResults, "ms")

## [1] 0.1756247

 #http://r-statistics.co/Linear-Regression.html
  #http://www.forecastpro.com/Trends/forecasting101August2011.html
#cv.lm(data=training, ins_model2,m=3)

medical insurance expense

Use log expenses