suppressMessages(library(tidyverse))
suppressMessages(library(GGally))
suppressMessages(library(stats))
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))

# Load the insurance data ----
# The CSV location lives in one variable so it is the only line to edit when
# the file moves. No file-picker dialog is opened here: a bare
# `choose.files()` call throws its result away, so it would only block the
# script on a Windows-only GUI prompt without affecting what gets read.
insurance_path <- "C:\\Users\\iasha.pickens\\Downloads\\insurance.csv"

# Fail early with a readable message instead of read.csv()'s generic
# "cannot open the connection" error.
if (!file.exists(insurance_path)) {
  stop("Insurance data not found at: ", insurance_path, call. = FALSE)
}
insurance <- read.csv(insurance_path)

# Open the data viewer for a quick manual look at the rows.
view(insurance)

# Per-column summary (quartiles for numeric columns, length/class for
# character columns).
summary(insurance)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# Exploratory scatterplots: charges against each numeric predictor ----

# Age on the x-axis
insurance %>%
  ggplot(mapping = aes(age, charges)) +
  geom_point() +
  ggtitle("Charges vs. Age") +
  xlab("Age") +
  ylab("Charges") +
  theme_minimal()

# BMI on the x-axis
insurance %>%
  ggplot(mapping = aes(bmi, charges)) +
  geom_point() +
  ggtitle("Charges vs. BMI") +
  xlab("BMI") +
  ylab("Charges") +
  theme_minimal()

# Number of children on the x-axis
insurance %>%
  ggplot(mapping = aes(children, charges)) +
  geom_point() +
  ggtitle("Charges vs. Number of Children") +
  xlab("Number of Children") +
  ylab("Charges") +
  theme_minimal()

# Simple linear regression of charges on BMI, using every row of `insurance`.
# The formula and data are passed literally so the "Call:" line printed by
# summary() stays readable.
model_bmi <- lm(formula = charges ~ bmi, data = insurance)

# Coefficient table, residual quantiles and fit statistics for the model
summary(object = model_bmi)
## 
## Call:
## lm(formula = charges ~ bmi, data = insurance)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20956  -8118  -3757   4722  49442 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1192.94    1664.80   0.717    0.474    
## bmi           393.87      53.25   7.397 2.46e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11870 on 1336 degrees of freedom
## Multiple R-squared:  0.03934,    Adjusted R-squared:  0.03862 
## F-statistic: 54.71 on 1 and 1336 DF,  p-value: 2.459e-13
# Charges vs. BMI with the least-squares line (charges ~ bmi) drawn on top.
# `formula = y ~ x` is spelled out so geom_smooth() does not fall back to its
# implicit default and announce it with a message on every render.
# `se = FALSE` omits the confidence band around the line.
ggplot(insurance, aes(x = bmi, y = charges)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(
    title = "Linear Regression: Charges vs. BMI",
    x = "BMI",
    y = "Charges"
  ) +
  theme_minimal()

# Re-display the per-column summary before splitting the data by smoker status
summary(object = insurance)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# Split the rows into two data frames by the `smoker` column.
# dplyr::filter is named explicitly because stats::filter also exists.
smokers <- insurance %>%
  dplyr::filter(smoker == "yes")
nonsmokers <- insurance %>%
  dplyr::filter(smoker == "no")

# Charges vs. BMI, drawn separately for each smoking group ----

# Smokers only
smokers %>%
  ggplot(mapping = aes(bmi, charges)) +
  geom_point() +
  ggtitle("Scatterplot of Charges vs. BMI for Smokers") +
  xlab("BMI") +
  ylab("Charges") +
  theme_minimal()

# Non-smokers only
nonsmokers %>%
  ggplot(mapping = aes(bmi, charges)) +
  geom_point() +
  ggtitle("Scatterplot of Charges vs. BMI for Non-Smokers") +
  xlab("BMI") +
  ylab("Charges") +
  theme_minimal()

# Simple linear regression of charges on BMI, restricted to the `smokers` rows
model_smokers <- lm(formula = charges ~ bmi, data = smokers)

# Coefficient table, residual quantiles and fit statistics for the smoker model
summary(object = model_smokers)
## 
## Call:
## lm(formula = charges ~ bmi, data = smokers)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19768.0  -4487.9     34.4   3263.9  31055.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -13186.58    2052.88  -6.423 5.93e-10 ***
## bmi           1473.11      65.48  22.496  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6837 on 272 degrees of freedom
## Multiple R-squared:  0.6504, Adjusted R-squared:  0.6491 
## F-statistic: 506.1 on 1 and 272 DF,  p-value: < 2.2e-16
# Simple linear regression of charges on BMI, restricted to the `nonsmokers`
# rows
model_nonsmokers <- lm(formula = charges ~ bmi, data = nonsmokers)

# Coefficient table, residual quantiles and fit statistics for the non-smoker
# model
summary(object = model_nonsmokers)
## 
## Call:
## lm(formula = charges ~ bmi, data = nonsmokers)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9144  -4360  -1009   2922  28131 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5879.42     947.48   6.205 7.81e-10 ***
## bmi            83.35      30.33   2.748  0.00609 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5975 on 1062 degrees of freedom
## Multiple R-squared:  0.007062,   Adjusted R-squared:  0.006127 
## F-statistic: 7.553 on 1 and 1062 DF,  p-value: 0.006091