# Read the insurance CSV into a data frame, then attach the tidyverse packages.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
insurance <- read.csv(
  file = "C:\\Users\\user\\Desktop\\myR\\insurance.csv"
)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#Step0
#Numeric response variable: charges (Individual medical costs billed by health insurance)
#Numeric explanatory variable: age (Age of primary beneficiary)
#Step1
##model of charges, age
# Fit charges as a linear function of age, then print the model summary
# (coefficients, residual standard error, R-squared, F-statistic).
mod2 <- lm(formula = charges ~ age, data = insurance)
summary(object = mod2)
## 
## Call:
## lm(formula = charges ~ age, data = insurance)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8059  -6671  -5939   5440  47829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3165.9      937.1   3.378 0.000751 ***
## age            257.7       22.5  11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared:  0.08941,    Adjusted R-squared:  0.08872 
## F-statistic: 131.2 on 1 and 1336 DF,  p-value: < 2.2e-16
# Print the analysis-of-variance table for the fitted model.
anova(object = mod2)
## Analysis of Variance Table
## 
## Response: charges
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## age          1 1.7530e+10 1.7530e+10  131.17 < 2.2e-16 ***
## Residuals 1336 1.7854e+11 1.3364e+08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Fitted regression line: (predicted Charges) = 3165.9 + 257.7*(Age)

#Step2
#H0: B1=0
#Ha: B1!=0
#We reject the null hypothesis at the 0.05 significance level because the p-value (< 2e-16) is far below 0.05.
#Test used: t-test for the slope (t distribution with 1336 degrees of freedom); the test statistic is t = 11.453.
#There is convincing evidence of a significant linear relationship between Age and Charges.

#Step3
#Step4, Step5
# Draw the diagnostic plots for the fitted model; they are interpreted in the
# comments that follow.
plot(x = mod2)

#Residual plot: the mean of the residuals is not exactly zero. The residuals do not show any particular pattern.
#QQ plot: the residuals do not closely follow a normal distribution.
#Leverage plot: there are no influential outliers; the Cook's distance contour lines are not visible in the graph, which means every case lies well inside them.