linear_reg_hw.R

alisonlo — Apr 13, 2014, 11:41 AM

# Linear Regression predicting Medical Expense

insure = read.csv("insurance.csv")

set.seed(123)
spl = sample(1:nrow(insure), size=1000)
str(spl)
 int [1:1000] 385 1054 547 1179 1255 61 704 1188 734 607 ...

train = insure[spl, ]
test = insure[-spl,]

Train1 = lm(charges ~., data=train)
summary(Train1)

Call:
lm(formula = charges ~ ., data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-11560  -2831  -1007   1457  29719 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -11672.4     1171.7   -9.96   <2e-16 ***
age                249.8       13.9   18.01   <2e-16 ***
sexmale           -181.6      387.3   -0.47    0.639    
bmi                342.8       33.4   10.26   <2e-16 ***
children           495.2      159.6    3.10    0.002 ** 
smokeryes        24256.0      486.6   49.85   <2e-16 ***
regionnorthwest   -569.5      557.0   -1.02    0.307    
regionsoutheast  -1175.0      563.9   -2.08    0.037 *  
regionsouthwest  -1129.1      558.1   -2.02    0.043 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6100 on 991 degrees of freedom
Multiple R-squared:  0.75,  Adjusted R-squared:  0.748 
F-statistic:  372 on 8 and 991 DF,  p-value: <2e-16

# Train1 R2 = 0.75

# The estimated beta coefficients indicate the expected increase in charges for a one-unit
# increase in each variable when the other variables are held constant. For instance, for each
# year that age increases, we would expect about $249.80 higher medical expenses on average.
# Similarly, each additional child is associated with about $495.20 of additional medical
# expenses each year.
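# As a quick sanity check on this interpretation, the coefficients can be pulled out by name;
# a minimal sketch (the values come from the Train1 summary above):
Train1$coef["age"]        # ~ 249.8 dollars per additional year of age
10 * Train1$coef["age"]   # ~ 2498 dollars expected for ten extra years, all else held constant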

# You might notice that although we only specified 6 features in the model, there are 8 coefficients 
# reported in addition to the intercept. This happened because the lm() function automatically applied
# a technique known as dummy coding to each of the factor (nominal) variables (sex, smoker and region).

# Dummy coding allows a nominal feature to be treated as numeric by creating a binary variable for each
# category of the feature. For instance, the sex variable has two categories, male and female. 
# It is split into two binary indicators, which R names sexmale and sexfemale. For observations where
# sex = male, sexmale = 1 and sexfemale = 0; if sex = female, then sexmale = 0 and sexfemale = 1. 
# The four-category feature region can likewise be split into four indicators: regionnorthwest, 
# regionnortheast, regionsouthwest and regionsoutheast. 
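# To see exactly which indicator columns lm() created, the design matrix of the fitted model can be
# inspected directly; a minimal sketch using base R's model.matrix():
head(model.matrix(Train1))
# Each non-reference factor level appears as its own 0/1 column (sexmale, smokeryes, regionnorthwest,
# regionsoutheast, regionsouthwest), alongside the numeric predictors.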

# When adding a dummy-coded variable to a regression model, one category is always left out to serve as
# the reference. In our model, R automatically held out the sexfemale, smokerno, and regionnortheast 
# variables (the first level in alphabetical order), making female non-smokers in the northeast region 
# the reference group. Thus males have about $182 lower medical costs than females, and smokers cost 
# about $24,256 more than non-smokers. The coefficients for each of the other three regions are 
# negative, which implies that the northeast region tends to have the highest average medical expenses. 
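# If a different baseline is preferred, the reference level of a factor can be changed with relevel()
# before refitting. A minimal sketch on a copy of the data (the choice of "southwest" as the new
# reference is arbitrary, for illustration only; factor() guards against the column having been read
# in as character):
insure2 = insure
insure2$region = relevel(factor(insure2$region), ref = "southwest")
# lm(charges ~ ., data = insure2[spl, ]) would then report region effects relative to the southwest.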

# Effect of gender on medical expense
# Train1 predicted expense: female age 35 no-children, northwest, non-smoker, bmi 20

F35NWNSmo20= Train1$coef[1] + 35*Train1$coef[2]  + 20*Train1$coef[4] + Train1$coef[7] 
F35NWNSmo20
(Intercept) 
       3356 

# F35NWNSmo20 = $3356

# Train1 predicted expense: male age 35 no-children, northwest, non-smoker, bmi 20

M35NWNSmo20= Train1$coef[1] + 35*Train1$coef[2]  + Train1$coef[3] + 20*Train1$coef[4] + Train1$coef[7] 
M35NWNSmo20
(Intercept) 
       3174 

# M35NWNSmo20 = $3174  (3356 - 3174 = 182, the magnitude of Train1$coef[3], the sexmale coefficient)
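# The same numbers can be obtained, with less risk of indexing the wrong coefficient, by calling
# predict() on a one-row data frame; a minimal sketch (newperson is a hypothetical name, and the
# column values must use levels present in the data):
newperson = data.frame(age = 35, sex = "female", bmi = 20, children = 0,
                       smoker = "no", region = "northwest")
predict(Train1, newdata = newperson)   # should match F35NWNSmo20 above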



# Examine medical expenses based on individual's BMI and smoking status

boxplot(charges ~  smoker, data=insure, xlab = "smoker", ylab= "annual medical costs")



tapply(insure$charges, insure$smoker, summary)
$no
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1120    3990    7350    8430   11400   36900 

$yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  12800   20800   34500   32100   41000   63800 

library(car)
scatterplot(charges ~ bmi, data=insure, xlab = "bmi", ylab = "annual medical costs")


scatterplot(charges ~ bmi | smoker  , data=insure, xlab = "bmi" , ylab = "annual medical costs" )



# create an interaction term for bmi and smoker: bmi * smoker
Train2 = lm(charges ~ . + bmi*smoker, data=insure)
summary(Train2)

Call:
lm(formula = charges ~ . + bmi * smoker, data = insure)

Residuals:
   Min     1Q Median     3Q    Max 
-14581  -1857  -1361   -476  30552 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -2223.45     865.61   -2.57   0.0103 *  
age                263.62       9.52   27.70  < 2e-16 ***
sexmale           -500.15     266.52   -1.88   0.0608 .  
bmi                 23.53      25.60    0.92   0.3581    
children           516.40     110.18    4.69  3.1e-06 ***
smokeryes       -20415.61    1648.28  -12.39  < 2e-16 ***
regionnorthwest   -585.48     380.86   -1.54   0.1245    
regionsoutheast  -1210.13     382.75   -3.16   0.0016 ** 
regionsouthwest  -1231.11     382.22   -3.22   0.0013 ** 
bmi:smokeryes     1443.10      52.65   27.41  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4850 on 1328 degrees of freedom
Multiple R-squared:  0.841, Adjusted R-squared:  0.84 
F-statistic:  780 on 9 and 1328 DF,  p-value: <2e-16

# Train2 R2 = 0.84
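# The interaction changes the bmi slope for smokers: their effective slope is the bmi main effect
# plus the interaction term. A quick check using the fitted coefficients by name:
Train2$coef["bmi"] + Train2$coef["bmi:smokeryes"]
# roughly 23.5 + 1443.1 = 1466.6 dollars per bmi unit for smokers, versus about 23.5 for
# non-smokers, matching the steeper smoker trend seen in the scatterplot above.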
# Train2 predicted expense: male age 35 no-children, northwest, non-smoker, bmi 40
M35NWNSmo40= Train2$coef[1] + 35*Train2$coef[2] + Train2$coef[3] + 40*Train2$coef[4] + Train2$coef[7] 
M35NWNSmo40
(Intercept) 
       6859 
#M35NWNSmo40= $6859

# Train2 predicted expense: male age 35 no-children, northwest, smoker, bmi 40
M35NWSmo40 = Train2$coef[1] + 35*Train2$coef[2] + Train2$coef[3] + 40*Train2$coef[4] +  Train2$coef[6] + Train2$coef[7] + 40*Train2$coef[10]
M35NWSmo40
(Intercept) 
      44167 
#M35NWSmo40= $44167
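# As with Train1, predict() reproduces these hand calculations; a sketch that scores the same
# hypothetical 35-year-old male with bmi 40 as both a non-smoker and a smoker:
predict(Train2, newdata = data.frame(age = 35, sex = "male", bmi = 40, children = 0,
                                     smoker = c("no", "yes"), region = "northwest"))
# the two values should match M35NWNSmo40 and M35NWSmo40 above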

# Compare prediction with (Train2) vs without (Train1) interaction term
# Train1 predicted expense: male age 35 no-children, northwest, non-smoker, bmi 40
T1M35NWNSmo40= Train1$coef[1] + 35*Train1$coef[2] + Train1$coef[3] + 40*Train1$coef[4] + Train1$coef[7] 
T1M35NWNSmo40
(Intercept) 
      10030 
# T1M35NWNSmo40 = $10030

# Train1 predicted expense: male age 35 no-children, northwest, smoker, bmi 40
T1M35NWSmo40= Train1$coef[1] + 35*Train1$coef[2] + Train1$coef[3] + 40*Train1$coef[4] + Train1$coef[6] + Train1$coef[7] 
T1M35NWSmo40
(Intercept) 
      34286 
#T1M35NWSmo40 = $34286
# For this bmi-40 example, adding the interaction term increases the estimated expenses of smokers
# and reduces the estimated expenses of non-smokers

# Predict test set using Train1

PredTest1 = predict(Train1, newdata= test)
SSE1 = sum ((test$charges - PredTest1)^2)
RMSE1= sqrt(SSE1/nrow(test))
SST = sum ((test$charges - mean(train$charges))^2)
R2_1 = 1 - SSE1/SST
# R2 Test set model 1 = 0.75

# Predict test set using Train2

PredTest2 = predict(Train2, newdata= test)
SSE2 = sum ((test$charges - PredTest2)^2)
RMSE2= sqrt(SSE2/nrow(test))
SST = sum ((test$charges - mean(train$charges))^2)
R2_2 = 1 - SSE2/SST
# R2 Test set model 2 = 0.85
# Model 2 (with the interaction term) has a higher R2 than Model 1 on both the training data 
# (0.84 vs 0.75) and the test set (0.85 vs 0.75). Note, however, that Train2 was fit with 
# data=insure, i.e. on the full data set including the test rows, so its test-set R2 is somewhat 
# optimistic; refitting it on train only would give a cleaner comparison.
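# A compact side-by-side of the two models on the test set (RMSE in dollars; the R2 values are the
# ones quoted above):
data.frame(model = c("Train1", "Train2"),
           RMSE  = c(RMSE1, RMSE2),
           R2    = c(R2_1, R2_2))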