# Load the insurance data set. The path is machine-specific; adjust it when
# running on another computer.
# stringsAsFactors = TRUE keeps sex, smoker and region as factors on R >= 4.0
# (where the read.csv default became FALSE). The contrasts(insurance$smoker)
# call further down requires a factor and would error on a character column.
insurance <- read.csv(
  "C:\\Users\\user\\Desktop\\myR\\insurance.csv",
  stringsAsFactors = TRUE
)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Compact overview of the column types and first values of the data set.
utils::str(object = insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
# Per-column summaries (quantiles for numeric columns, counts for factors).
summary(object = insurance)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
#A
#response variable: charges (insurance charges $)
#Categorical predictor: smoker (“yes”/ “no”) only has two levels
#Numeric predictor: age (Age of primary beneficiary (years))
# Simple linear regression of charges on age; print the coefficient table.
mod2 <- lm(formula = charges ~ age, data = insurance)
summary(object = mod2)
## 
## Call:
## lm(formula = charges ~ age, data = insurance)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8059  -6671  -5939   5440  47829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3165.9      937.1   3.378 0.000751 ***
## age            257.7       22.5  11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared:  0.08941,    Adjusted R-squared:  0.08872 
## F-statistic: 131.2 on 1 and 1336 DF,  p-value: < 2.2e-16
# Scatterplot of charges against age with the least-squares line overlaid
# (no confidence band).
ggplot(data = insurance, mapping = aes(x = age, y = charges)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#B) simple linear model between charges and age
#(Charges)= 3165.9+ 257.7*(Age)
#According to Milestone3, there is convincing evidence to suggest that there is a significant linear relationship between ‘age’ and ‘charges’. However, looking at the plot, the fitted line does not seem to explain the whole scatterplot.
# Show how the smoker factor is coded into dummy variables.
contrasts(insurance[["smoker"]])
##     yes
## no    0
## yes   1
#C) dummy variables
#Since the categorical variable ‘smoker’ has only two levels, we only need one dummy variable (smokeryes: 1 for smokers, 0 for non-smokers).
# Side-by-side boxplots of charges for smokers and non-smokers.
ggplot(
  data = insurance,
  mapping = aes(x = smoker, y = charges, fill = smoker)
) +
  geom_boxplot()

#D
# Regress charges on the smoker indicator alone; print the coefficient table.
smokermod <- lm(formula = charges ~ smoker, data = insurance)
summary(object = smokermod)
## 
## Call:
## lm(formula = charges ~ smoker, data = insurance)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19221  -5042   -919   3705  31720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8434.3      229.0   36.83   <2e-16 ***
## smokeryes    23616.0      506.1   46.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7470 on 1336 degrees of freedom
## Multiple R-squared:  0.6198, Adjusted R-squared:  0.6195 
## F-statistic:  2178 on 1 and 1336 DF,  p-value: < 2.2e-16
# ANOVA table for the smoker-only model.
stats::anova(object = smokermod)
## Analysis of Variance Table
## 
## Response: charges
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## smoker       1 1.2152e+11 1.2152e+11  2177.6 < 2.2e-16 ***
## Residuals 1336 7.4554e+10 5.5804e+07                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#H0: μ(smoker)= μ(nonsmoker) Ha: μ(smoker)!= μ(nonsmoker)
 #One-way ANOVA F-test
 #test statistic: F= 2177.6, reference distribution is the F-distribution
 #p-value < 2.2*10^(-16), which is < 0.05
 #We reject the null hypothesis at the significance level of 0.05, since the p-value is less than 2.2*(10^(-16))
 #There is convincing evidence to suggest that μ (the average of charges) is different between smokers and non-smokers
#E
# Additive model: charges explained by smoker status and age (common slope).
smokeragemod <- lm(formula = charges ~ smoker + age, data = insurance)
summary(object = smokeragemod)
## 
## Call:
## lm(formula = charges ~ smoker + age, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16088.1  -2046.8  -1336.4   -212.7  28760.0 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2391.63     528.30  -4.527 6.52e-06 ***
## smokeryes   23855.30     433.49  55.031  < 2e-16 ***
## age           274.87      12.46  22.069  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6397 on 1335 degrees of freedom
## Multiple R-squared:  0.7214, Adjusted R-squared:  0.721 
## F-statistic:  1728 on 2 and 1335 DF,  p-value: < 2.2e-16
# Sequential ANOVA table for the additive smoker + age model.
stats::anova(object = smokeragemod)
## Analysis of Variance Table
## 
## Response: charges
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## smoker       1 1.2152e+11 1.2152e+11 2969.81 < 2.2e-16 ***
## age          1 1.9928e+10 1.9928e+10  487.02 < 2.2e-16 ***
## Residuals 1335 5.4626e+10 4.0918e+07                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(smokers)
#(charges)=21463.67+274.87(age)

#(nonsmokers)
#(charges)=-2391.63+274.87(age)
# Scatterplot of charges against age, coloured by smoker status, with the two
# parallel fitted lines of the additive model (smokeragemod) overlaid.
# The intercepts and slope are read from the fitted model instead of being
# hand-copied rounded constants, so the lines always match the current fit.
additive_coefs <- coef(smokeragemod)
ggplot(insurance, aes(x = age, y = charges, color = smoker)) +
  geom_point() +
  # smokers: baseline intercept shifted by the smokeryes coefficient
  geom_abline(
    intercept = additive_coefs[["(Intercept)"]] + additive_coefs[["smokeryes"]],
    slope = additive_coefs[["age"]],
    col = "blue"
  ) +
  # non-smokers: baseline intercept, same slope
  geom_abline(
    intercept = additive_coefs[["(Intercept)"]],
    slope = additive_coefs[["age"]],
    col = "red"
  )

#F
# Interaction model: smoker status, age, and their interaction, so each group
# gets its own intercept and slope.
intersmokeragemod <- lm(formula = charges ~ smoker * age, data = insurance)
summary(object = intersmokeragemod)
## 
## Call:
## lm(formula = charges ~ smoker * age, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16072.5  -2043.0  -1348.7   -258.9  28814.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2091.42     582.57  -3.590 0.000343 ***
## smokeryes     22385.55    1278.73  17.506  < 2e-16 ***
## age             267.25      13.93  19.187  < 2e-16 ***
## smokeryes:age    37.99      31.09   1.222 0.222037    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6396 on 1334 degrees of freedom
## Multiple R-squared:  0.7217, Adjusted R-squared:  0.7211 
## F-statistic:  1153 on 3 and 1334 DF,  p-value: < 2.2e-16
# Sequential ANOVA table for the interaction model.
stats::anova(object = intersmokeragemod)
## Analysis of Variance Table
## 
## Response: charges
##              Df     Sum Sq    Mean Sq   F value Pr(>F)    
## smoker        1 1.2152e+11 1.2152e+11 2970.9036 <2e-16 ***
## age           1 1.9928e+10 1.9928e+10  487.2022 <2e-16 ***
## smoker:age    1 6.1050e+07 6.1050e+07    1.4925  0.222    
## Residuals  1334 5.4565e+10 4.0903e+07                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(smokers)
#(charges)=20294.13+305.24(age)

#(nonsmokers)
#(charges)=-2091.42+267.25(age)
# Scatterplot of charges against age, coloured by smoker status, with the two
# fitted lines of the interaction model (intersmokeragemod) overlaid.
# The intercepts and slopes are read from the fitted model instead of being
# hand-copied rounded constants, so the lines always match the current fit.
inter_coefs <- coef(intersmokeragemod)
ggplot(insurance, aes(x = age, y = charges, color = smoker)) +
  geom_point() +
  # smokers: intercept and slope are both shifted by the smoker terms
  geom_abline(
    intercept = inter_coefs[["(Intercept)"]] + inter_coefs[["smokeryes"]],
    slope = inter_coefs[["age"]] + inter_coefs[["smokeryes:age"]],
    col = "blue"
  ) +
  # non-smokers: baseline intercept and slope
  geom_abline(
    intercept = inter_coefs[["(Intercept)"]],
    slope = inter_coefs[["age"]],
    col = "red"
  )

#G
# Training mean squared error (mean of squared residuals) for each model,
# from the age-only model through the interaction model.
mean(residuals(mod2)^2)
## [1] 133440979
mean(residuals(smokermod)^2)
## [1] 55720716
mean(residuals(smokeragemod)^2)
## [1] 40826694
mean(residuals(intersmokeragemod)^2)
## [1] 40781066
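#The model from F has the smallest MSE (mean squared error). The MSE decreases going from model B to model F, and the adjusted R-squared increases going from model B to model F, which means model F explains the most of the relationship between age, smoker, and charges. Note that the improvement from model E to model F is very small (MSE 40826694 vs. 40781066; adjusted R-squared 0.721 vs. 0.7211).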
#H
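#Since the linear model from F, which includes the interaction between ‘age’ and ‘smoker’, has the smallest MSE and the highest adjusted R-squared, model F is the best of the models above by these two measures. Therefore, the relationship between ‘age’ and ‘charges’ seems to differ by the categorical explanatory variable ‘smoker’: the fitted lines for smokers and nonsmokers have different slopes and intercepts. However, the interaction term is not statistically significant at the 0.05 level (p-value = 0.222), so the evidence that the slopes truly differ is weak, and the simpler additive model from E fits almost as well. Also, smokers tend to have higher insurance charges than nonsmokers.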