library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
df<-read_csv( "~/Desktop/ecob2000_lecture1/New_files/empirical_hw/insurance.csv")
## Parsed with column specification:
## cols(
##   age = col_double(),
##   sex = col_character(),
##   bmi = col_double(),
##   children = col_double(),
##   smoker = col_character(),
##   region = col_character(),
##   charges = col_double()
## )
glimpse(df)
## Rows: 1,338
## Columns: 7
## $ age      <dbl> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, …
## $ sex      <chr> "female", "male", "male", "male", "male", "female", "female"…
## $ bmi      <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440, 27.7…
## $ children <dbl> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, …
## $ smoker   <chr> "yes", "no", "no", "no", "no", "no", "no", "no", "no", "no",…
## $ region   <chr> "southwest", "southeast", "southeast", "northwest", "northwe…
## $ charges  <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 3756.622…
summary(df)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# take a look at the distribution of the age, BIM and charges of these policyholders.

d_age<- df %>%
  ggplot(aes(x=age))+
  geom_histogram(binwidth = 4,
   show.legend = F, fill= "red"         )+
  labs(
    x="age of the policyholders",
    y="number of the holders",
    title="distribution of ages"
  )

d_BIM <- df %>%
  ggplot(aes(x=bmi))+
  geom_histogram(binwidth = 4,
                 show.legend = F,
                 fill="blue")+
  labs(
    x="BIM of policyholder",
    y="number of policyholders",
    title = "distribution of IBM "
  )

d_charges <- df %>%
  ggplot(aes(x=charges))+
  geom_histogram(binwidth = 1000,
                 show.legend = F,
                 fill="green")+
  labs(x="charges of policyholders",
       y="number",
       title = "distribution of charges")
d_age

d_BIM

d_charges

#data exploring 2: create scatter plots to observe the pattern between charges and other factors.

sex_age <- df %>%
  ggplot(aes(x=age,y=charges, color=sex))+
  geom_point()+
  labs(x="Age",
       y="Charge $",
       title="Medical Charges Across Different Age and Sex"
  )

sex_bmi <- df %>%
  ggplot(aes(x=bmi,y=charges, color=sex))+
  geom_point()+
  labs(x="BMI",
       y="charges",
       title="Charges VS BMI by SEX")

smoker_age <- df %>%
  ggplot(aes(x=age, y=charges, color=smoker))+
  geom_point()+
  labs(x="Age",
       y="charge",
       title="Charges VS Age by Smoker")

bmi_children <-df %>%
  ggplot(aes(x=bmi,y=charges, color=children))+
  geom_point()+
  labs(x="BMI", y="Charges", title = "The Charge of BMI in Children ")

region_bmi <- df %>%
  ggplot(aes(x=bmi,y=charges, color=region))+
  geom_point()+
  labs(x="BMI", y="Charges", title = "The Charge of BMI in Regions ")

smoker_bmi <-df%>%
  ggplot(aes(x=bmi,y=charges, color=smoker))+
  geom_point()+
  labs(x="BMI", y="Charges", title = "The Charge of BMI of smoker ")
 
sex_age

sex_bmi

smoker_age

bmi_children

region_bmi

smoker_bmi

conclusions: 1. Young policyholders has the largest share and after the range of age 25, they are distributed evenly. BMI demonstrates a normal distribution. The charges of policyholders skews to the right which means the majority charges are lower than $20,000.

2.From the scatter plots, charges are mostly-related to the factors of smoker and age since only graph1 and graph3 demonstrate a linear-relationship. For the graph6, it does show a clustering of smoker and non-smoker but charges doesn’t evolve as the BMI goes higher.

MLR

step1: split the data into two part: training data (80%) and testing data(20%).

step2: build regresison models

step3: evaluation by vif to test the multicollinearity.

set.seed(12345)
training_data <- log1p(df$charges) %>%
  createDataPartition(p=0.8,list = F)
train <-df[training_data,]
test<- df[-training_data,]

H0: there is no significant difference of medical charges by the factors of age, sex,BMI, children covered and regions. Ha: there is significant difference across the factors. set up the significance to be 0.95.

model1 <- lm(log1p(charges)~age+sex+bmi+smoker+children+region, data=train)
summary(model1)
## 
## Call:
## lm(formula = log1p(charges) ~ age + sex + bmi + smoker + children + 
##     region, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07202 -0.19880 -0.05182  0.06021  2.16509 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0137873  0.0808799  86.719  < 2e-16 ***
## age              0.0350575  0.0009756  35.936  < 2e-16 ***
## sexmale         -0.0700323  0.0273526  -2.560 0.010594 *  
## bmi              0.0132526  0.0023686   5.595  2.8e-08 ***
## smokeryes        1.5621925  0.0340308  45.905  < 2e-16 ***
## children         0.0990191  0.0114206   8.670  < 2e-16 ***
## regionnorthwest -0.0550502  0.0392366  -1.403 0.160899    
## regionsoutheast -0.1468597  0.0394551  -3.722 0.000208 ***
## regionsouthwest -0.1293880  0.0389656  -3.321 0.000929 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4448 on 1063 degrees of freedom
## Multiple R-squared:  0.7655, Adjusted R-squared:  0.7637 
## F-statistic: 433.8 on 8 and 1063 DF,  p-value: < 2.2e-16

results of the model1: The result gets a p-value <2.2e-16 with 0.7637 adjusted R-squared, which is significantly different at the significant level of 95%. The model can explain 76.37% of the data.

tstep<-step(model1)
## Start:  AIC=-1728.16
## log1p(charges) ~ age + sex + bmi + smoker + children + region
## 
##            Df Sum of Sq    RSS      AIC
## <none>                  210.27 -1728.16
## - sex       1      1.30 211.57 -1723.57
## - region    3      3.49 213.76 -1716.51
## - bmi       1      6.19 216.46 -1699.04
## - children  1     14.87 225.14 -1656.91
## - age       1    255.45 465.72  -877.72
## - smoker    1    416.84 627.12  -558.75
summary(tstep)
## 
## Call:
## lm(formula = log1p(charges) ~ age + sex + bmi + smoker + children + 
##     region, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07202 -0.19880 -0.05182  0.06021  2.16509 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0137873  0.0808799  86.719  < 2e-16 ***
## age              0.0350575  0.0009756  35.936  < 2e-16 ***
## sexmale         -0.0700323  0.0273526  -2.560 0.010594 *  
## bmi              0.0132526  0.0023686   5.595  2.8e-08 ***
## smokeryes        1.5621925  0.0340308  45.905  < 2e-16 ***
## children         0.0990191  0.0114206   8.670  < 2e-16 ***
## regionnorthwest -0.0550502  0.0392366  -1.403 0.160899    
## regionsoutheast -0.1468597  0.0394551  -3.722 0.000208 ***
## regionsouthwest -0.1293880  0.0389656  -3.321 0.000929 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4448 on 1063 degrees of freedom
## Multiple R-squared:  0.7655, Adjusted R-squared:  0.7637 
## F-statistic: 433.8 on 8 and 1063 DF,  p-value: < 2.2e-16

1)By running step regression, no variable is deleted from model1. So model1 is the best fit.

  1. to check its multicollinearity
vif(model1)
##              GVIF Df GVIF^(1/(2*Df))
## age      1.023014  1        1.011441
## sex      1.012737  1        1.006348
## bmi      1.134090  1        1.064937
## smoker   1.013273  1        1.006614
## children 1.002614  1        1.001306
## region   1.117293  3        1.018657

result: the value from the vif test of each variable is less than 3, which indicates little or no multicollinearity.