# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 245 Homework 1  ~
# ~    Problem 1          ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~

# These data were used to examine the effect of participant characteristics 
# on medical costs billed by health insurance and are comprised of a sample 
# of 1138 beneficiaries currently enrolled in a specific insurance plan along
# with multiple beneficiary characteristics and total medical expenses charged
# to the plan for the calendar year. One set of analyses examined the 
# association between charges and age.  

# Data Set: insurance

# Data Dictionary: 
# (1) age       This is an integer indicating the age of the primary 
#               beneficiary (excluding those above 64 years, since 
#               they are generally covered by the government).
# (2) sex       This is the policy holder's sex at birth, either male or female.
# (3) bmi       This is the body mass index (BMI), which provides a sense of 
#               how over or under-weight a person is relative to their height. 
#               BMI is equal to weight (in kilograms) divided by height 
#               (in meters) squared. Normal BMI is usually considered within 
#               the range of 18.5 to 24.9.
# (4) children  This is an integer indicating the number of children/dependents
#               covered by the insurance plan.
# (5) smoker    This is yes or no depending on whether the insured regularly
#               smokes tobacco.
# (6) region    This is the beneficiary's place of residence in the U.S., 
#               divided into four geographic regions: northeast, southeast, 
#               southwest, or northwest.
# (7) charges   Individual medical costs billed by health insurance

## Download and load the data file = insurance
load(url("http://www.duke.edu/~sgrambow/crp241data/insurance.RData"))
# examine structure of data
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
# summarize data
summary(insurance)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# visualize data using ggplot package
library(ggplot2)
# Scatter Plot with overlaid fitted regression line
# and 95% confidence bands 
ggplot(insurance, aes(x=age, y=charges))+
  geom_point()+
  geom_smooth(method=lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'

# fitting regression model
fit.age <- lm(charges ~ age, data=insurance)
summary(fit.age)
## 
## Call:
## lm(formula = charges ~ age, data = insurance)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8059  -6671  -5939   5440  47829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3165.9      937.1   3.378 0.000751 ***
## age            257.7       22.5  11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared:  0.08941,    Adjusted R-squared:  0.08872 
## F-statistic: 131.2 on 1 and 1336 DF,  p-value: < 2.2e-16
# Key Output
#  Coefficients:
#              Estimate Std. Error t value Pr(>|t|)    
#  (Intercept)   3165.9      937.1   3.378 0.000751 ***
#  age            257.7       22.5  11.453  < 2e-16 ***

#
# Slope estimate is key inference target
# Interpretation of Slope
# For each 1 unit increase in age
# the mean charges increase by $257.7
# 
confint(fit.age)
##                 2.5 %    97.5 %
## (Intercept) 1327.4403 5004.3297
## age          213.5788  301.8665
#                 2.5 %    97.5 %
# (Intercept) 1327.4403 5004.3297
# age          213.5788  301.8665
# standard 95% confidence interval is (213.58, 301.87)
# 
# if we want to rescale the slope and report 
# a point estimate and 95% confidence interval
# for each 5 year increment in age we can multiply 
# the slope estimate and confidence limits by 5
fit.age$coefficients*5
## (Intercept)         age 
##   15829.425    1288.613
# yields
# 
# (Intercept)         age 
#   15829.425    1288.613  
#
# estimate = 1288.61, meaning that 
# For each 5 unit increase in age
# the mean charges increase by 1288.61
# and for 95% CI
confint(fit.age)*5
##                2.5 %    97.5 %
## (Intercept) 6637.201 25021.649
## age         1067.894  1509.332
#                 2.5 %    97.5 %
#  (Intercept) 6637.201 25021.649
#  age         1067.894  1509.332
#
# so point estimate and 95% CI is
# 1288.61 with 95% CI: 1067.89, 1509.33
#
# Determine the estimated mean annual charges for 28 year old beneficiaries
# and associated confidence interval
# We can do this by hand in R or use the predicted function as 
# shown below. Note that we need to provide the desired value of the 
# X variable using the data.frame function and we specify the type 
# of interval we want for that estimate -- we want a confidence interval
# for the mean estimated annual charges for 28 year old beneficiaries
# 
predict(fit.age,data.frame(age=28),interval = "confidence")
##        fit      lwr     upr
## 1 10382.12 9588.939 11175.3
#        fit      lwr        upr
# 1 10382.12  9588.94   11756.30
# Estimated mean annual charges are 10382.12 with 95% CI from 9588.94 to 11756.30
#
# END OF PROGRAM