alisonlo — Apr 13, 2014, 11:41 AM
# Linear Regression predicting Medical Expense
insure = read.csv("insurance.csv")
set.seed(123)
spl = sample(1:nrow(insure), size=1000)
str(spl)
int [1:1000] 385 1054 547 1179 1255 61 704 1188 734 607 ...
train = insure[spl, ]
test = insure[-spl,]
Train1 = lm(charges ~., data=train)
summary(Train1)
Call:
lm(formula = charges ~ ., data = train)
Residuals:
Min 1Q Median 3Q Max
-11560 -2831 -1007 1457 29719
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11672.4 1171.7 -9.96 <2e-16 ***
age 249.8 13.9 18.01 <2e-16 ***
sexmale -181.6 387.3 -0.47 0.639
bmi 342.8 33.4 10.26 <2e-16 ***
children 495.2 159.6 3.10 0.002 **
smokeryes 24256.0 486.6 49.85 <2e-16 ***
regionnorthwest -569.5 557.0 -1.02 0.307
regionsoutheast -1175.0 563.9 -2.08 0.037 *
regionsouthwest -1129.1 558.1 -2.02 0.043 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6100 on 991 degrees of freedom
Multiple R-squared: 0.75, Adjusted R-squared: 0.748
F-statistic: 372 on 8 and 991 DF, p-value: <2e-16
# Train1 R2 = 0.75
# the estimated beta coefficients indicate the increase in charges for an increase in one in each of
# variables when the other variables are held constant. For instances, for each year that age increases,
# we would expect $256.90 higher medical expenses on average. Similarly, each additional child results in
# an average of $275.50 additional medical expenses each year.
# You might notice that although we only specified 6 features in the model, there are 8 coefficients
# reported in addition to the intercept. This happened because the lm(function) automatically applied
# a technique known as dummy codding to each of the factor (nominal) type variables (sex, smoker and region).
# Dummy coding allows a nominal feature to be treated as numeric by creating a binary variable for each
# category of the feature. For instance, the sex variable has two categories, male and female.
# This will be split into two binary values, whhich R names sexmale and sexfemale. For observations where
# sex = male, then sexmale = 1 and sexfemale = 0; if sex = female, then sexmale = 0 and sexfemale = 1.
# The four-category feature region can be split into 4 variables: regionnorthwest, regionnortheast,
# regionsouthwest, regionsoutheast.
# When adding a dummy-coded variable to a regression model, one category is always left out to serve as
# the reference. In our model, R automatically held out the sexfemale, smokerno, and regionnortheast
# variables (alphabetical order), making female non-smokers in the northeast region the reference group.
# Thus males have $182 less medical costs relative to females, smokers cost $24256 more than non-smokers.
# The coefficients for each of the other three regions are negative, which implies that the northeast
# region tends to have the highest average medical expenses.
# Effect of gender on medical expense
# Train1 predicted expense: female age 35 no-children, northwest, non-smoker, bmi 20
F35NWNSmo20= Train1$coef[1] + 35*Train1$coef[2] + 20*Train1$coef[4] + Train1$coef[7]
F35NWNSmo20
(Intercept)
3356
# F35NWNSmo20 = $3356
# Train1 predicted expense: male age 35 no-children, northwest, non-smoker, bmi 20
M35NWNSmo20= Train1$coef[1] + 35*Train1$coef[2] + Train1$coef[3] + 20*Train1$coef[4] + Train1$coef[7]
M35NWNSmo20
(Intercept)
3174
# M35NWNSmo20 = $3174 (3356-3174 = 182 = Train1$coef[3] i.e. coefficient of sexmale)
# Examine medical expenses based on individual's BMI and smoking status
boxplot(charges ~ smoker, data=insure, xlab = "smoker", ylab= "annual medical costs")
tapply(insure$charges, insure$smoker, summary)
$no
Min. 1st Qu. Median Mean 3rd Qu. Max.
1120 3990 7350 8430 11400 36900
$yes
Min. 1st Qu. Median Mean 3rd Qu. Max.
12800 20800 34500 32100 41000 63800
library(car)
scatterplot(charges ~ bmi, data=insure, xlab = "bmi", ylab = "annual medical costs")
scatterplot(charges ~ bmi | smoker , data=insure, xlab = "bmi" , ylab = "annual medical costs" )
# create an interaction term for bmi and smoker: bmi * smoker
Train2 = lm(charges ~. + bmi* smoker, data=insure)
summary(Train2)
Call:
lm(formula = charges ~ . + bmi * smoker, data = insure)
Residuals:
Min 1Q Median 3Q Max
-14581 -1857 -1361 -476 30552
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2223.45 865.61 -2.57 0.0103 *
age 263.62 9.52 27.70 < 2e-16 ***
sexmale -500.15 266.52 -1.88 0.0608 .
bmi 23.53 25.60 0.92 0.3581
children 516.40 110.18 4.69 3.1e-06 ***
smokeryes -20415.61 1648.28 -12.39 < 2e-16 ***
regionnorthwest -585.48 380.86 -1.54 0.1245
regionsoutheast -1210.13 382.75 -3.16 0.0016 **
regionsouthwest -1231.11 382.22 -3.22 0.0013 **
bmi:smokeryes 1443.10 52.65 27.41 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4850 on 1328 degrees of freedom
Multiple R-squared: 0.841, Adjusted R-squared: 0.84
F-statistic: 780 on 9 and 1328 DF, p-value: <2e-16
# Train2 R2 = 0.84
# Train 2 predicted expense: male age 35 no-children, northwest, non-smoker, bmi 40
M35NWNSmo40= Train2$coef[1] + 35*Train2$coef[2] + Train2$coef[3] + 40*Train2$coef[4] + Train2$coef[7]
M35NWNSmo40
(Intercept)
6859
#M35NWNSmo40= $6859
# Train 2 predicted expense: male age 35 no-children, northwest, smoker, bmi 40
M35NWSmo40 = Train2$coef[1] + 35*Train2$coef[2] + Train2$coef[3] + 40*Train2$coef[4] + Train2$coef[6] + Train2$coef[7] + 40*Train2$coef[10]
M35NWSmo40
(Intercept)
44167
#M35NWSmo40= $44167
# Compare prediction with (Train2) vs without (Train1) interaction term
# Train1 predicted expense: male age 35 no-children, northwest, non-smoker, bmi 40
T1M35NWNSmo40= Train1$coef[1] + 35*Train1$coef[2] + Train1$coef[3] + 40*Train1$coef[4] + Train1$coef[7]
T1M35NWNSmo40
(Intercept)
10030
# T1M35NWNSmo40= $10029
# Train1 predicted expense: male age 35 no-children, northwest, smoker, bmi 40
T1M35NWSmo40= Train1$coef[1] + 35*Train1$coef[2] + Train1$coef[3] + 40*Train1$coef[4] + Train1$coef[6] + Train1$coef[7]
T1M35NWSmo40
(Intercept)
34286
#T1M35NWSmo40= $34285
# Adding the interaction term increases the estimated expenses of smokers and reduces the estiamted
# expenses of non-smokers
# Predict test set using Train1
PredTest1 = predict(Train1, newdata= test)
SSE1 = sum ((test$charges - PredTest1)^2)
RMSE1= sqrt(SSE1/nrow(test))
SST = sum ((test$charges - mean(train$charges))^2)
R2_1 = 1 - SSE1/SST
# R2 Test set model 1 = 0.75
# Predict test set using Train2
PredTest2 = predict(Train2, newdata= test)
SSE2 = sum ((test$charges - PredTest2)^2)
RMSE2= sqrt(SSE2/nrow(test))
SST = sum ((test$charges - mean(train$charges))^2)
R2_2 = 1 - SSE2/SST
# R2 Test set model 2 = 0.85
# Model 2 (with interaction term) has higher R2 in both training (0.84) and test set (0.85) than Model 1 (0.75, 0.75)