# Load the insurance data set into a data frame.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.csv()
# no longer converts strings to factors by default, and the later
# contrasts(insurance$smoker) call only works on a factor.
# NOTE(review): this absolute Windows path only resolves on the original
# author's machine; consider a project-relative path.
insurance <- read.csv(
  "C:\\Users\\user\\Desktop\\myR\\insurance.csv",
  stringsAsFactors = TRUE
)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Print the structure of the data frame: number of observations, and the
# type and first few values of each column.
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
# Print per-column summaries: quantiles and mean for numeric columns,
# level counts for factor columns.
summary(insurance)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
#A
#response variable: charges (insurance charges $)
#Categorical predictor: smoker (“yes”/ “no”) only has two levels
#Numeric predictor: age (Age of primary beneficiary (years))
# Simple linear regression of charges on age (numeric predictor only),
# followed by its coefficient table and fit statistics.
mod2 <- lm(formula = charges ~ age, data = insurance)
summary(mod2)
##
## Call:
## lm(formula = charges ~ age, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8059 -6671 -5939 5440 47829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3165.9 937.1 3.378 0.000751 ***
## age 257.7 22.5 11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared: 0.08941, Adjusted R-squared: 0.08872
## F-statistic: 131.2 on 1 and 1336 DF, p-value: < 2.2e-16
# Scatterplot of charges against age with the least-squares line overlaid
# and no confidence band.
ggplot(data = insurance, mapping = aes(x = age, y = charges)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#B) simple linear model between charges and age
#(Charges)= 3165.9+ 257.7*(Age)
#According to Milestone3, there is convincing evidence of a significant linear relationship between ‘age’ and ‘charges’. However, the scatterplot shows that the fitted line does not capture the full pattern in the data: age alone explains only about 9% of the variation in charges (R-squared = 0.089).
# Show the dummy (treatment) coding that lm() will use for the smoker factor.
contrasts(insurance[["smoker"]])
## yes
## no 0
## yes 1
#C) dummy variables
#Since the categorical variable ‘smoker’ has only two levels, we only need one dummy variable (1 for “yes”, 0 for “no”).
# Side-by-side boxplots of charges for smokers and non-smokers, filled by
# smoking status.
ggplot(
  data = insurance,
  mapping = aes(x = smoker, y = charges, fill = smoker)
) +
  geom_boxplot()

#D
# Regress charges on smoking status alone (one dummy variable), then print
# the coefficient table and fit statistics.
smokermod <- lm(formula = charges ~ smoker, data = insurance)
summary(smokermod)
##
## Call:
## lm(formula = charges ~ smoker, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19221 -5042 -919 3705 31720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8434.3 229.0 36.83 <2e-16 ***
## smokeryes 23616.0 506.1 46.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7470 on 1336 degrees of freedom
## Multiple R-squared: 0.6198, Adjusted R-squared: 0.6195
## F-statistic: 2178 on 1 and 1336 DF, p-value: < 2.2e-16
# Analysis-of-variance table for the smoker-only model; its F statistic is
# the one used in the hypothesis test written up below.
anova(smokermod)
## Analysis of Variance Table
##
## Response: charges
## Df Sum Sq Mean Sq F value Pr(>F)
## smoker 1 1.2152e+11 1.2152e+11 2177.6 < 2.2e-16 ***
## Residuals 1336 7.4554e+10 5.5804e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#H0: μ(smoker)= μ(nonsmoker) Ha: μ(smoker)!= μ(nonsmoker)
#One-way ANOVA F-test
#test statistic: F= 2177.6, reference distribution is the F-distribution
#p-value < 2.2*10^(-16), which is less than 0.05
#We reject the null hypothesis at the 0.05 significance level (p-value < 2.2*10^(-16))
#There is convincing evidence that the mean insurance charges differ between smokers and non-smokers
#E
# Additive (parallel-lines) model: charges explained by smoking status and
# age with no interaction, followed by its summary.
smokeragemod <- lm(formula = charges ~ smoker + age, data = insurance)
summary(smokeragemod)
##
## Call:
## lm(formula = charges ~ smoker + age, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16088.1 -2046.8 -1336.4 -212.7 28760.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2391.63 528.30 -4.527 6.52e-06 ***
## smokeryes 23855.30 433.49 55.031 < 2e-16 ***
## age 274.87 12.46 22.069 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6397 on 1335 degrees of freedom
## Multiple R-squared: 0.7214, Adjusted R-squared: 0.721
## F-statistic: 1728 on 2 and 1335 DF, p-value: < 2.2e-16
# Analysis-of-variance table for the additive model; terms are listed in
# formula order (smoker, then age).
anova(smokeragemod)
## Analysis of Variance Table
##
## Response: charges
## Df Sum Sq Mean Sq F value Pr(>F)
## smoker 1 1.2152e+11 1.2152e+11 2969.81 < 2.2e-16 ***
## age 1 1.9928e+10 1.9928e+10 487.02 < 2.2e-16 ***
## Residuals 1335 5.4626e+10 4.0918e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(smokers)
#(charges)=21463.67+274.87(age)
#(nonsmokers)
#(charges)=-2391.63+274.87(age)
# Scatterplot of charges against age, coloured by smoking status, with the
# two fitted lines of the additive model drawn on top.
# The intercepts and slope are taken from the fitted model rather than typed
# in by hand, so the lines stay in sync with the data and carry no rounding.
add_coef <- coef(smokeragemod)
ggplot(insurance, aes(x = age, y = charges, color = smoker)) +
  geom_point() +
  # Smokers: baseline intercept shifted by the smokeryes coefficient.
  geom_abline(
    intercept = add_coef[["(Intercept)"]] + add_coef[["smokeryes"]],
    slope = add_coef[["age"]],
    col = "blue"
  ) +
  # Non-smokers: baseline intercept, same age slope.
  geom_abline(
    intercept = add_coef[["(Intercept)"]],
    slope = add_coef[["age"]],
    col = "red"
  )

#F
# Interaction model: smoking status, age, and their interaction, so each
# group gets its own intercept and age slope. The summary follows.
intersmokeragemod <- lm(formula = charges ~ smoker * age, data = insurance)
summary(intersmokeragemod)
##
## Call:
## lm(formula = charges ~ smoker * age, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16072.5 -2043.0 -1348.7 -258.9 28814.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2091.42 582.57 -3.590 0.000343 ***
## smokeryes 22385.55 1278.73 17.506 < 2e-16 ***
## age 267.25 13.93 19.187 < 2e-16 ***
## smokeryes:age 37.99 31.09 1.222 0.222037
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6396 on 1334 degrees of freedom
## Multiple R-squared: 0.7217, Adjusted R-squared: 0.7211
## F-statistic: 1153 on 3 and 1334 DF, p-value: < 2.2e-16
# Analysis-of-variance table for the interaction model; the smoker:age row
# tests whether the interaction term improves the fit.
anova(intersmokeragemod)
## Analysis of Variance Table
##
## Response: charges
## Df Sum Sq Mean Sq F value Pr(>F)
## smoker 1 1.2152e+11 1.2152e+11 2970.9036 <2e-16 ***
## age 1 1.9928e+10 1.9928e+10 487.2022 <2e-16 ***
## smoker:age 1 6.1050e+07 6.1050e+07 1.4925 0.222
## Residuals 1334 5.4565e+10 4.0903e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(smokers)
#(charges)=20294.13+305.24(age)
#(nonsmokers)
#(charges)=-2091.42+267.25(age)
# Scatterplot of charges against age, coloured by smoking status, with the
# two fitted lines of the interaction model drawn on top.
# The intercepts and slopes are taken from the fitted model rather than typed
# in by hand, so the lines stay in sync with the data and carry no rounding.
int_coef <- coef(intersmokeragemod)
ggplot(insurance, aes(x = age, y = charges, color = smoker)) +
  geom_point() +
  # Smokers: intercept and slope each shifted by the smoker terms.
  geom_abline(
    intercept = int_coef[["(Intercept)"]] + int_coef[["smokeryes"]],
    slope = int_coef[["age"]] + int_coef[["smokeryes:age"]],
    col = "blue"
  ) +
  # Non-smokers: baseline intercept and slope.
  geom_abline(
    intercept = int_coef[["(Intercept)"]],
    slope = int_coef[["age"]],
    col = "red"
  )

#G
# Training mean squared error (mean of the squared residuals) of each of
# the four fitted models, in the order they were built.
mean(residuals(mod2)^2)
## [1] 133440979
mean(residuals(smokermod)^2)
## [1] 55720716
mean(residuals(smokeragemod)^2)
## [1] 40826694
mean(residuals(intersmokeragemod)^2)
## [1] 40781066
#The model from F has the smallest MSE (mean squared error). The MSE decreases at each step from model B to model F, and the adjusted R-squared increases from model B to model F, so model F explains the most of the relationship between age, smoker, and charges. However, the improvement from model E to model F is very small (MSE 40826694 vs 40781066; adjusted R-squared 0.721 vs 0.7211).
#H
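#The linear model from F, which includes the interaction between ‘age’ and ‘smoker’, has the smallest MSE and the highest adjusted R-squared, so it fits the data best among the models above. However, its advantage over model E is negligible and the interaction term is not statistically significant (p-value = 0.222), so there is no convincing evidence that the relationship between ‘age’ and ‘charges’ differs by the categorical explanatory variable ‘smoker’. The fitted lines for smokers and nonsmokers have different intercepts and slightly different slopes (305.24 vs 267.25), but the difference in slopes could be due to chance, so the simpler parallel-lines model E is adequate. Smokers do tend to have much higher insurance charges than nonsmokers.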