library(MASS); library(ggplot2)
## Warning: package 'MASS' was built under R version 3.1.2
setwd("/Users/SKSN/Documents/NYDSA/Lectures/Week3/Wk3data")
I <- read.csv("insurance.csv")
summary(I)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
str(I)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
lm1 <- lm(charges~., data=I)
summary(lm1)
##
## Call:
## lm(formula = charges ~ ., data = I)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## sexmale -131.3 332.9 -0.394 0.693348
## bmi 339.2 28.6 11.860 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm1, which = c(1:3,5), caption = c("Residuals vs Fitted", "Normal Q-Q","Scale-Location", "Cook's distance","Residuals vs Leverage", "Cook's distance vs Leverage"))

lm2 <- step(lm1)
## Start: AIC=23316.43
## charges ~ age + sex + bmi + children + smoker + region
##
## Df Sum of Sq RSS AIC
## - sex 1 5.7164e+06 4.8845e+10 23315
## <none> 4.8840e+10 23316
## - region 3 2.3343e+08 4.9073e+10 23317
## - children 1 4.3755e+08 4.9277e+10 23326
## - bmi 1 5.1692e+09 5.4009e+10 23449
## - age 1 1.7124e+10 6.5964e+10 23717
## - smoker 1 1.2245e+11 1.7129e+11 24993
##
## Step: AIC=23314.58
## charges ~ age + bmi + children + smoker + region
##
## Df Sum of Sq RSS AIC
## <none> 4.8845e+10 23315
## - region 3 2.3320e+08 4.9078e+10 23315
## - children 1 4.3596e+08 4.9281e+10 23324
## - bmi 1 5.1645e+09 5.4010e+10 23447
## - age 1 1.7151e+10 6.5996e+10 23715
## - smoker 1 1.2301e+11 1.7186e+11 24996
#lm3<- step.up(lm2)
lm3<- lm(formula = charges ~ age + bmi + children + smoker + region +
bmi:smoker + bmi:region, data = I)
summary(lm3)
##
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + region +
## bmi:smoker + bmi:region, data = I)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13049.4 -1936.9 -1283.3 -305.6 30037.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4398.068 1434.667 -3.066 0.00222 **
## age 262.440 9.519 27.570 < 2e-16 ***
## bmi 91.966 47.110 1.952 0.05113 .
## children 504.018 110.063 4.579 5.1e-06 ***
## smokeryes -20812.371 1658.325 -12.550 < 2e-16 ***
## regionnorthwest -789.909 2061.287 -0.383 0.70162
## regionsoutheast 4245.168 1899.795 2.235 0.02561 *
## regionsouthwest 820.042 2000.516 0.410 0.68193
## bmi:smokeryes 1454.592 52.923 27.485 < 2e-16 ***
## bmi:regionnorthwest 7.005 69.401 0.101 0.91962
## bmi:regionsoutheast -172.683 60.067 -2.875 0.00411 **
## bmi:regionsouthwest -70.377 65.616 -1.073 0.28366
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4836 on 1326 degrees of freedom
## Multiple R-squared: 0.8418, Adjusted R-squared: 0.8405
## F-statistic: 641.6 on 11 and 1326 DF, p-value: < 2.2e-16
library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.2
##
## Attaching package: 'DAAG'
##
## The following object is masked from 'package:MASS':
##
## hills
vif(lm1)
## age sexmale bmi children
## 1.0168 1.0089 1.1066 1.0040
## smokeryes regionnorthwest regionsoutheast regionsouthwest
## 1.0121 1.5188 1.6522 1.5294
par(mfrow=c(1,2))
ggplot(data=I, aes(age, charges)) +geom_point() +
geom_smooth(method="lm", color="blue")+
geom_smooth(method="loess", color='red')

qplot(age, charges, data=I, color="black") +
geom_smooth(method="lm", formula=y~poly(x,2))

ggplot(I, aes(x=bmi, y=charges)) + geom_point() +
geom_smooth(method="lm", formula=y~poly(x,2))

library(psych)
## Warning: package 'psych' was built under R version 3.1.2
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:DAAG':
##
## cities
##
## The following object is masked from 'package:ggplot2':
##
## %+%
par(mfrow=c(1,1))
pairs.panels(I)

i <- I
boxcox(lm(formula = charges ~ 1, data = i))

#ENTROPY
#install.packages("entropy")
library(entropy)
## Warning: package 'entropy' was built under R version 3.1.2
#****Entropy gain***
entropy_gain = function(x, y, value) {
n = length(y)
shan_tot = entropy(y)
subx1 = x[x<=value]
subx2 = x[x>value]
suby1 = y[x<=value]
suby2 = y[x>value]
n1 = length(suby1)
n2 = length(suby2)
shan_sub = (n1*entropy(suby1) + n2*entropy(suby2)) / n
gain = shan_tot - shan_sub
return(gain)
}
#***************
#Best entropy gain for bmi
result=sapply(I$bmi, function(x) entropy_gain(I$bmi, I$charges, x))
I$bmi[which.max(result)]
## [1] 30.495
I$bmi30 = ifelse(I$bmi >= 30, 1, 0)
ggplot(I, aes(x=bmi, y=charges, color=interaction(smoker, bmi30))) +
geom_point() +
labs(title="bmi vs charges (segregated by max Entropy & colored by smoker)")

#New Regression series with add'l column BMI30
lm1a <- lm(charges~., data=I)
summary(lm1a)
##
## Call:
## lm(formula = charges ~ ., data = I)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11944.9 -3431.6 -102.5 1538.8 28489.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7652.55 1279.06 -5.983 2.81e-09 ***
## age 257.20 11.78 21.826 < 2e-16 ***
## sexmale -161.10 329.78 -0.489 0.62527
## bmi 149.07 46.24 3.224 0.00130 **
## children 477.73 136.47 3.501 0.00048 ***
## smokeryes 23847.20 409.16 58.283 < 2e-16 ***
## regionnorthwest -388.48 471.72 -0.824 0.41035
## regionsoutheast -885.06 474.94 -1.864 0.06261 .
## regionsouthwest -949.51 473.32 -2.006 0.04505 *
## bmi30 2855.09 548.91 5.201 2.29e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6004 on 1328 degrees of freedom
## Multiple R-squared: 0.7559, Adjusted R-squared: 0.7542
## F-statistic: 456.9 on 9 and 1328 DF, p-value: < 2.2e-16
#plot(lm1a)
par(mfrow=c(2,2))
plot(lm1a, which = c(1:3,5), caption = c("Residuals vs Fitted", "Normal Q-Q","Scale-Location", "Cook's distance","Residuals vs Leverage", "Cook's distance vs Leverage"))

# prone regression to remove redundancy
lm1b <- update(lm1a, .~. -bmi)
summary(lm1b)
##
## Call:
## lm(formula = charges ~ age + sex + children + smoker + region +
## bmi30, data = I)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13061.1 -3597.8 128.9 1356.1 27567.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4044.55 621.29 -6.510 1.06e-10 ***
## age 260.09 11.79 22.057 < 2e-16 ***
## sexmale -145.17 330.90 -0.439 0.660948
## children 482.27 136.95 3.522 0.000443 ***
## smokeryes 23834.51 410.59 58.050 < 2e-16 ***
## regionnorthwest -404.68 473.36 -0.855 0.392762
## regionsoutheast -576.79 466.86 -1.235 0.216872
## regionsouthwest -865.89 474.28 -1.826 0.068119 .
## bmi30 4253.79 337.39 12.608 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6025 on 1329 degrees of freedom
## Multiple R-squared: 0.754, Adjusted R-squared: 0.7525
## F-statistic: 509.1 on 8 and 1329 DF, p-value: < 2.2e-16
# Apply Interactions in the regression
#lm2a <- step.up(lm1b)
#Results of step.up function is
lm2a<-lm(formula = charges ~ age + sex + children + smoker + region + bmi30 + smoker:bmi30, data = I)
summary(lm2a)
##
## Call:
## lm(formula = charges ~ age + sex + children + smoker + region +
## bmi30 + smoker:bmi30, data = I)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18829.3 -1872.4 -1306.7 -582.2 24710.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1954.749 468.687 -4.171 3.23e-05 ***
## age 265.486 8.812 30.126 < 2e-16 ***
## sexmale -479.924 247.474 -1.939 0.05268 .
## children 524.041 102.338 5.121 3.49e-07 ***
## smokeryes 13360.059 445.408 29.995 < 2e-16 ***
## regionnorthwest -278.942 353.726 -0.789 0.43050
## regionsoutheast -587.138 348.845 -1.683 0.09259 .
## regionsouthwest -1160.701 354.505 -3.274 0.00109 **
## bmi30 201.748 281.354 0.717 0.47346
## smokeryes:bmi30 19856.483 612.121 32.439 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4502 on 1328 degrees of freedom
## Multiple R-squared: 0.8627, Adjusted R-squared: 0.8618
## F-statistic: 927.4 on 9 and 1328 DF, p-value: < 2.2e-16
# prone regression to remove redundancy
lm2b <- update(lm2a, .~. -bmi30)
summary(lm2b)
##
## Call:
## lm(formula = charges ~ age + sex + children + smoker + region +
## smoker:bmi30, data = I)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18829.3 -1872.4 -1306.7 -582.2 24710.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1954.749 468.687 -4.171 3.23e-05 ***
## age 265.486 8.812 30.126 < 2e-16 ***
## sexmale -479.924 247.474 -1.939 0.05268 .
## children 524.041 102.338 5.121 3.49e-07 ***
## smokeryes 13360.059 445.408 29.995 < 2e-16 ***
## regionnorthwest -278.942 353.726 -0.789 0.43050
## regionsoutheast -587.138 348.845 -1.683 0.09259 .
## regionsouthwest -1160.701 354.505 -3.274 0.00109 **
## smokerno:bmi30 201.748 281.354 0.717 0.47346
## smokeryes:bmi30 20058.231 548.569 36.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4502 on 1328 degrees of freedom
## Multiple R-squared: 0.8627, Adjusted R-squared: 0.8618
## F-statistic: 927.4 on 9 and 1328 DF, p-value: < 2.2e-16
#plot(lm2b)
par(mfrow=c(2,2))
plot(lm2b, which = c(1:3,5), caption = c("Residuals vs Fitted", "Normal Q-Q","Scale-Location", "Cook's distance","Residuals vs Leverage", "Cook's distance vs Leverage"))
