#Load the insurance data set from a local CSV file into a data frame
insurance <- read.csv(file = "C:\\Users\\user\\Desktop\\myR\\insurance.csv")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Step0
#Numeric response variable: charges (Individual medical costs billed by health insurance)
#Numeric explanatory variable: age (Age of primary beneficiary)
#Step1
#Simple linear regression of charges on age, fitted to the insurance data
mod2 <- lm(formula = charges ~ age, data = insurance)
#Coefficient table, residual standard error, R-squared and overall F-test
summary(mod2)
##
## Call:
## lm(formula = charges ~ age, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8059 -6671 -5939 5440 47829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3165.9 937.1 3.378 0.000751 ***
## age 257.7 22.5 11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared: 0.08941, Adjusted R-squared: 0.08872
## F-statistic: 131.2 on 1 and 1336 DF, p-value: < 2.2e-16
#ANOVA table for the fitted model (sums of squares and F-test for age)
anova(object = mod2)
## Analysis of Variance Table
##
## Response: charges
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 1.7530e+10 1.7530e+10 131.17 < 2.2e-16 ***
## Residuals 1336 1.7854e+11 1.3364e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fitted model: predicted (Charges) = 3165.9 + 257.7*(Age)
#Step2
#H0: B1=0
#Ha: B1!=0
#We reject the null hypothesis at the significance level of 0.05 because the p-value is less than 2*(10^(-16))
#t-test (t distribution with 1336 degrees of freedom); the test statistic is t=11.453
#There is convincing evidence of a significant linear relationship between Age and Charges.
#Step3
#Step4, Step5
#Draw the standard diagnostic plots for the fitted linear model
plot(x = mod2)
#residual plot: the mean of the residuals is not perfectly zero. The residuals do not show a clear pattern.
#QQ plot: The distribution of the residuals is not close to a normal distribution.
#leverage plot: There are no influential outliers. We cannot see the Cook's distance lines in the graph, which means all cases are well inside the Cook's distance lines.