Using R, build a multiple regression model for data that interests you.
Include in this model at least one quadratic term,bone dichotomous term, 
and one dichotomous vs. quantitative interaction term. 
Interpret all coefficients. Conduct residual analysis. Was the linear model appropriate? Why or why not?
insurance <- read.csv("insurance.csv")
dim(insurance)
## [1] 1338    7
summary(insurance)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
head(insurance, n = 10)
##    age    sex    bmi children smoker    region   charges
## 1   19 female 27.900        0    yes southwest 16884.924
## 2   18   male 33.770        1     no southeast  1725.552
## 3   28   male 33.000        3     no southeast  4449.462
## 4   33   male 22.705        0     no northwest 21984.471
## 5   32   male 28.880        0     no northwest  3866.855
## 6   31 female 25.740        0     no southeast  3756.622
## 7   46 female 33.440        1     no southeast  8240.590
## 8   37 female 27.740        3     no northwest  7281.506
## 9   37   male 29.830        2     no northeast  6406.411
## 10  60 female 25.840        0     no northwest 28923.137
par(mfrow=c(1,2))
hist(insurance$bmi, xlab = "BMI (Body Mass Index)",
     main = "Histogram of BMI")
hist(insurance$charges, xlab = "Medical Charges",
     main = "Histogram of Medical Charges") # looks more right-skewed

# let's do some plotting of charges against other factors like sex and smoker and region
par(mfrow=c(1,3))
with(insurance, plot(charges ~ smoker + sex + region))

\[ \begin{aligned} \widehat{charges} = \beta_0 + \beta_1 * Sex + \beta_2 * bmi + \beta_3 * age + \beta_4 * smoker + \beta_5 (bmi*sex) \end{aligned} \]

lm_insurance <- lm(charges ~ sex + bmi + age + smoker + bmi*sex, data = insurance)
summary(lm_insurance)
## 
## Call:
## lm(formula = charges ~ sex + bmi + age + smoker + bmi * sex, 
##     data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12396.7  -2983.0   -985.4   1478.3  29015.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11717.369   1282.175  -9.139  < 2e-16 ***
## sexmale         54.096   1712.963   0.032    0.975    
## bmi            325.779     39.341   8.281 2.94e-16 ***
## age            259.469     11.947  21.718  < 2e-16 ***
## smokeryes    23836.067    414.958  57.442  < 2e-16 ***
## sexmale:bmi     -5.326     54.846  -0.097    0.923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6097 on 1332 degrees of freedom
## Multiple R-squared:  0.7475, Adjusted R-squared:  0.7466 
## F-statistic: 788.6 on 5 and 1332 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
hist(lm_insurance$residuals, main = "Histogram of Residuals", xlab= "")
plot(lm_insurance$residuals, fitted(lm_insurance))
qqnorm(lm_insurance$residuals)
qqline(lm_insurance$residuals)

\[ \begin{aligned} \widehat{charges} = -11717.37 + 54.1*sex + 325.78*bmi + 259.47*age + 23836.07*smoker - 5.33(bmi*sex) \end{aligned} \]

my_predicted_medical_cost <- predict(lm_insurance,
                                     data.frame(age=31, smoker="no", 
                                                bmi=31.3, sex="male"))
my_predicted_medical_cost
##        1 
## 6410.444