library(MASS); library(ggplot2)
## Warning: package 'MASS' was built under R version 3.1.2
setwd("/Users/SKSN/Documents/NYDSA/Lectures/Week3/Wk3data")
I <- read.csv("insurance.csv")
summary(I)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
str(I)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
lm1 <- lm(charges~., data=I)
summary(lm1)
## 
## Call:
## lm(formula = charges ~ ., data = I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## sexmale           -131.3      332.9  -0.394 0.693348    
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm1, which = c(1:3,5), caption = c("Residuals vs Fitted", "Normal Q-Q","Scale-Location", "Cook's distance","Residuals vs Leverage", "Cook's distance vs Leverage"))

lm2 <- step(lm1)
## Start:  AIC=23316.43
## charges ~ age + sex + bmi + children + smoker + region
## 
##            Df  Sum of Sq        RSS   AIC
## - sex       1 5.7164e+06 4.8845e+10 23315
## <none>                   4.8840e+10 23316
## - region    3 2.3343e+08 4.9073e+10 23317
## - children  1 4.3755e+08 4.9277e+10 23326
## - bmi       1 5.1692e+09 5.4009e+10 23449
## - age       1 1.7124e+10 6.5964e+10 23717
## - smoker    1 1.2245e+11 1.7129e+11 24993
## 
## Step:  AIC=23314.58
## charges ~ age + bmi + children + smoker + region
## 
##            Df  Sum of Sq        RSS   AIC
## <none>                   4.8845e+10 23315
## - region    3 2.3320e+08 4.9078e+10 23315
## - children  1 4.3596e+08 4.9281e+10 23324
## - bmi       1 5.1645e+09 5.4010e+10 23447
## - age       1 1.7151e+10 6.5996e+10 23715
## - smoker    1 1.2301e+11 1.7186e+11 24996
#lm3<- step.up(lm2)
lm3<- lm(formula = charges ~ age + bmi + children + smoker + region + 
             bmi:smoker + bmi:region, data = I)
summary(lm3)
## 
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + region + 
##     bmi:smoker + bmi:region, data = I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13049.4  -1936.9  -1283.3   -305.6  30037.5 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4398.068   1434.667  -3.066  0.00222 ** 
## age                    262.440      9.519  27.570  < 2e-16 ***
## bmi                     91.966     47.110   1.952  0.05113 .  
## children               504.018    110.063   4.579  5.1e-06 ***
## smokeryes           -20812.371   1658.325 -12.550  < 2e-16 ***
## regionnorthwest       -789.909   2061.287  -0.383  0.70162    
## regionsoutheast       4245.168   1899.795   2.235  0.02561 *  
## regionsouthwest        820.042   2000.516   0.410  0.68193    
## bmi:smokeryes         1454.592     52.923  27.485  < 2e-16 ***
## bmi:regionnorthwest      7.005     69.401   0.101  0.91962    
## bmi:regionsoutheast   -172.683     60.067  -2.875  0.00411 ** 
## bmi:regionsouthwest    -70.377     65.616  -1.073  0.28366    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4836 on 1326 degrees of freedom
## Multiple R-squared:  0.8418, Adjusted R-squared:  0.8405 
## F-statistic: 641.6 on 11 and 1326 DF,  p-value: < 2.2e-16
library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.2
## 
## Attaching package: 'DAAG'
## 
## The following object is masked from 'package:MASS':
## 
##     hills
    vif(lm1)
##             age         sexmale             bmi        children 
##          1.0168          1.0089          1.1066          1.0040 
##       smokeryes regionnorthwest regionsoutheast regionsouthwest 
##          1.0121          1.5188          1.6522          1.5294
par(mfrow=c(1,2))
ggplot(data=I, aes(age, charges)) +geom_point() + 
    geom_smooth(method="lm", color="blue")+ 
    geom_smooth(method="loess", color='red')

qplot(age, charges, data=I, color="black") + 
    geom_smooth(method="lm", formula=y~poly(x,2))

ggplot(I, aes(x=bmi, y=charges)) + geom_point() + 
    geom_smooth(method="lm", formula=y~poly(x,2))

library(psych) 
## Warning: package 'psych' was built under R version 3.1.2
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:DAAG':
## 
##     cities
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
par(mfrow=c(1,1))
pairs.panels(I)

i <- I
boxcox(lm(formula = charges ~ 1, data = i))

#ENTROPY
#install.packages("entropy")
library(entropy)
## Warning: package 'entropy' was built under R version 3.1.2
#****Entropy gain***
entropy_gain = function(x, y, value) {
    n = length(y)
    shan_tot = entropy(y)
    subx1 = x[x<=value]
    subx2 = x[x>value]
    suby1 = y[x<=value]
    suby2 = y[x>value]
    n1 = length(suby1)
    n2 = length(suby2)
    shan_sub = (n1*entropy(suby1) + n2*entropy(suby2)) / n 
    gain = shan_tot - shan_sub
    return(gain)
}
#***************
#Best entropy gain for bmi 
result=sapply(I$bmi, function(x) entropy_gain(I$bmi, I$charges, x))
I$bmi[which.max(result)]
## [1] 30.495
I$bmi30 = ifelse(I$bmi >= 30, 1, 0)

ggplot(I, aes(x=bmi, y=charges, color=interaction(smoker, bmi30))) +
    geom_point()  +
    labs(title="bmi vs charges (segregated by max Entropy & colored by smoker)")

#New Regression series with add'l column BMI30
lm1a <- lm(charges~., data=I)
summary(lm1a)
## 
## Call:
## lm(formula = charges ~ ., data = I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11944.9  -3431.6   -102.5   1538.8  28489.6 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -7652.55    1279.06  -5.983 2.81e-09 ***
## age               257.20      11.78  21.826  < 2e-16 ***
## sexmale          -161.10     329.78  -0.489  0.62527    
## bmi               149.07      46.24   3.224  0.00130 ** 
## children          477.73     136.47   3.501  0.00048 ***
## smokeryes       23847.20     409.16  58.283  < 2e-16 ***
## regionnorthwest  -388.48     471.72  -0.824  0.41035    
## regionsoutheast  -885.06     474.94  -1.864  0.06261 .  
## regionsouthwest  -949.51     473.32  -2.006  0.04505 *  
## bmi30            2855.09     548.91   5.201 2.29e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6004 on 1328 degrees of freedom
## Multiple R-squared:  0.7559, Adjusted R-squared:  0.7542 
## F-statistic: 456.9 on 9 and 1328 DF,  p-value: < 2.2e-16
#plot(lm1a)
par(mfrow=c(2,2))
plot(lm1a, which = c(1:3,5), caption = c("Residuals vs Fitted", "Normal Q-Q","Scale-Location", "Cook's distance","Residuals vs Leverage", "Cook's distance vs Leverage"))

# prone regression to remove redundancy
lm1b <- update(lm1a, .~. -bmi)
summary(lm1b)
## 
## Call:
## lm(formula = charges ~ age + sex + children + smoker + region + 
##     bmi30, data = I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13061.1  -3597.8    128.9   1356.1  27567.2 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4044.55     621.29  -6.510 1.06e-10 ***
## age               260.09      11.79  22.057  < 2e-16 ***
## sexmale          -145.17     330.90  -0.439 0.660948    
## children          482.27     136.95   3.522 0.000443 ***
## smokeryes       23834.51     410.59  58.050  < 2e-16 ***
## regionnorthwest  -404.68     473.36  -0.855 0.392762    
## regionsoutheast  -576.79     466.86  -1.235 0.216872    
## regionsouthwest  -865.89     474.28  -1.826 0.068119 .  
## bmi30            4253.79     337.39  12.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6025 on 1329 degrees of freedom
## Multiple R-squared:  0.754,  Adjusted R-squared:  0.7525 
## F-statistic: 509.1 on 8 and 1329 DF,  p-value: < 2.2e-16
# Apply Interactions in the regression
#lm2a <- step.up(lm1b)
#Results of step.up function is 
lm2a<-lm(formula = charges ~ age + sex + children + smoker + region + bmi30 + smoker:bmi30, data = I)
summary(lm2a)
## 
## Call:
## lm(formula = charges ~ age + sex + children + smoker + region + 
##     bmi30 + smoker:bmi30, data = I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18829.3  -1872.4  -1306.7   -582.2  24710.6 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1954.749    468.687  -4.171 3.23e-05 ***
## age               265.486      8.812  30.126  < 2e-16 ***
## sexmale          -479.924    247.474  -1.939  0.05268 .  
## children          524.041    102.338   5.121 3.49e-07 ***
## smokeryes       13360.059    445.408  29.995  < 2e-16 ***
## regionnorthwest  -278.942    353.726  -0.789  0.43050    
## regionsoutheast  -587.138    348.845  -1.683  0.09259 .  
## regionsouthwest -1160.701    354.505  -3.274  0.00109 ** 
## bmi30             201.748    281.354   0.717  0.47346    
## smokeryes:bmi30 19856.483    612.121  32.439  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4502 on 1328 degrees of freedom
## Multiple R-squared:  0.8627, Adjusted R-squared:  0.8618 
## F-statistic: 927.4 on 9 and 1328 DF,  p-value: < 2.2e-16
# prone regression to remove redundancy
lm2b <- update(lm2a, .~. -bmi30)
summary(lm2b)
## 
## Call:
## lm(formula = charges ~ age + sex + children + smoker + region + 
##     smoker:bmi30, data = I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18829.3  -1872.4  -1306.7   -582.2  24710.6 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1954.749    468.687  -4.171 3.23e-05 ***
## age               265.486      8.812  30.126  < 2e-16 ***
## sexmale          -479.924    247.474  -1.939  0.05268 .  
## children          524.041    102.338   5.121 3.49e-07 ***
## smokeryes       13360.059    445.408  29.995  < 2e-16 ***
## regionnorthwest  -278.942    353.726  -0.789  0.43050    
## regionsoutheast  -587.138    348.845  -1.683  0.09259 .  
## regionsouthwest -1160.701    354.505  -3.274  0.00109 ** 
## smokerno:bmi30    201.748    281.354   0.717  0.47346    
## smokeryes:bmi30 20058.231    548.569  36.565  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4502 on 1328 degrees of freedom
## Multiple R-squared:  0.8627, Adjusted R-squared:  0.8618 
## F-statistic: 927.4 on 9 and 1328 DF,  p-value: < 2.2e-16
#plot(lm2b)
par(mfrow=c(2,2))
plot(lm2b, which = c(1:3,5), caption = c("Residuals vs Fitted", "Normal Q-Q","Scale-Location", "Cook's distance","Residuals vs Leverage", "Cook's distance vs Leverage"))