library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
df<-read_csv( "~/Desktop/ecob2000_lecture1/New_files/empirical_hw/insurance.csv")
## Parsed with column specification:
## cols(
## age = col_double(),
## sex = col_character(),
## bmi = col_double(),
## children = col_double(),
## smoker = col_character(),
## region = col_character(),
## charges = col_double()
## )
glimpse(df)
## Rows: 1,338
## Columns: 7
## $ age <dbl> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, …
## $ sex <chr> "female", "male", "male", "male", "male", "female", "female"…
## $ bmi <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440, 27.7…
## $ children <dbl> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, …
## $ smoker <chr> "yes", "no", "no", "no", "no", "no", "no", "no", "no", "no",…
## $ region <chr> "southwest", "southeast", "southeast", "northwest", "northwe…
## $ charges <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 3756.622…
summary(df)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# take a look at the distribution of the age, BIM and charges of these policyholders.
d_age<- df %>%
ggplot(aes(x=age))+
geom_histogram(binwidth = 4,
show.legend = F, fill= "red" )+
labs(
x="age of the policyholders",
y="number of the holders",
title="distribution of ages"
)
d_BIM <- df %>%
ggplot(aes(x=bmi))+
geom_histogram(binwidth = 4,
show.legend = F,
fill="blue")+
labs(
x="BIM of policyholder",
y="number of policyholders",
title = "distribution of IBM "
)
d_charges <- df %>%
ggplot(aes(x=charges))+
geom_histogram(binwidth = 1000,
show.legend = F,
fill="green")+
labs(x="charges of policyholders",
y="number",
title = "distribution of charges")
d_age
d_BIM
d_charges
#data exploring 2: create scatter plots to observe the pattern between charges and other factors.
sex_age <- df %>%
ggplot(aes(x=age,y=charges, color=sex))+
geom_point()+
labs(x="Age",
y="Charge $",
title="Medical Charges Across Different Age and Sex"
)
sex_bmi <- df %>%
ggplot(aes(x=bmi,y=charges, color=sex))+
geom_point()+
labs(x="BMI",
y="charges",
title="Charges VS BMI by SEX")
smoker_age <- df %>%
ggplot(aes(x=age, y=charges, color=smoker))+
geom_point()+
labs(x="Age",
y="charge",
title="Charges VS Age by Smoker")
bmi_children <-df %>%
ggplot(aes(x=bmi,y=charges, color=children))+
geom_point()+
labs(x="BMI", y="Charges", title = "The Charge of BMI in Children ")
region_bmi <- df %>%
ggplot(aes(x=bmi,y=charges, color=region))+
geom_point()+
labs(x="BMI", y="Charges", title = "The Charge of BMI in Regions ")
smoker_bmi <-df%>%
ggplot(aes(x=bmi,y=charges, color=smoker))+
geom_point()+
labs(x="BMI", y="Charges", title = "The Charge of BMI of smoker ")
sex_age
sex_bmi
smoker_age
bmi_children
region_bmi
smoker_bmi
conclusions: 1. Young policyholders has the largest share and after the range of age 25, they are distributed evenly. BMI demonstrates a normal distribution. The charges of policyholders skews to the right which means the majority charges are lower than $20,000.
2.From the scatter plots, charges are mostly-related to the factors of smoker and age since only graph1 and graph3 demonstrate a linear-relationship. For the graph6, it does show a clustering of smoker and non-smoker but charges doesn’t evolve as the BMI goes higher.
set.seed(12345)
training_data <- log1p(df$charges) %>%
createDataPartition(p=0.8,list = F)
train <-df[training_data,]
test<- df[-training_data,]
H0: there is no significant difference of medical charges by the factors of age, sex,BMI, children covered and regions. Ha: there is significant difference across the factors. set up the significance to be 0.95.
model1 <- lm(log1p(charges)~age+sex+bmi+smoker+children+region, data=train)
summary(model1)
##
## Call:
## lm(formula = log1p(charges) ~ age + sex + bmi + smoker + children +
## region, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07202 -0.19880 -0.05182 0.06021 2.16509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0137873 0.0808799 86.719 < 2e-16 ***
## age 0.0350575 0.0009756 35.936 < 2e-16 ***
## sexmale -0.0700323 0.0273526 -2.560 0.010594 *
## bmi 0.0132526 0.0023686 5.595 2.8e-08 ***
## smokeryes 1.5621925 0.0340308 45.905 < 2e-16 ***
## children 0.0990191 0.0114206 8.670 < 2e-16 ***
## regionnorthwest -0.0550502 0.0392366 -1.403 0.160899
## regionsoutheast -0.1468597 0.0394551 -3.722 0.000208 ***
## regionsouthwest -0.1293880 0.0389656 -3.321 0.000929 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4448 on 1063 degrees of freedom
## Multiple R-squared: 0.7655, Adjusted R-squared: 0.7637
## F-statistic: 433.8 on 8 and 1063 DF, p-value: < 2.2e-16
results of the model1: The result gets a p-value <2.2e-16 with 0.7637 adjusted R-squared, which is significantly different at the significant level of 95%. The model can explain 76.37% of the data.
tstep<-step(model1)
## Start: AIC=-1728.16
## log1p(charges) ~ age + sex + bmi + smoker + children + region
##
## Df Sum of Sq RSS AIC
## <none> 210.27 -1728.16
## - sex 1 1.30 211.57 -1723.57
## - region 3 3.49 213.76 -1716.51
## - bmi 1 6.19 216.46 -1699.04
## - children 1 14.87 225.14 -1656.91
## - age 1 255.45 465.72 -877.72
## - smoker 1 416.84 627.12 -558.75
summary(tstep)
##
## Call:
## lm(formula = log1p(charges) ~ age + sex + bmi + smoker + children +
## region, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07202 -0.19880 -0.05182 0.06021 2.16509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0137873 0.0808799 86.719 < 2e-16 ***
## age 0.0350575 0.0009756 35.936 < 2e-16 ***
## sexmale -0.0700323 0.0273526 -2.560 0.010594 *
## bmi 0.0132526 0.0023686 5.595 2.8e-08 ***
## smokeryes 1.5621925 0.0340308 45.905 < 2e-16 ***
## children 0.0990191 0.0114206 8.670 < 2e-16 ***
## regionnorthwest -0.0550502 0.0392366 -1.403 0.160899
## regionsoutheast -0.1468597 0.0394551 -3.722 0.000208 ***
## regionsouthwest -0.1293880 0.0389656 -3.321 0.000929 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4448 on 1063 degrees of freedom
## Multiple R-squared: 0.7655, Adjusted R-squared: 0.7637
## F-statistic: 433.8 on 8 and 1063 DF, p-value: < 2.2e-16
1)By running step regression, no variable is deleted from model1. So model1 is the best fit.
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## age 1.023014 1 1.011441
## sex 1.012737 1 1.006348
## bmi 1.134090 1 1.064937
## smoker 1.013273 1 1.006614
## children 1.002614 1 1.001306
## region 1.117293 3 1.018657
result: the value from the vif test of each variable is less than 3, which indicates little or no multicollinearity.