suppressMessages(library(tidyverse))
suppressMessages(library(GGally))
suppressMessages(library(stats))
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
# Load the insurance data set into `insurance`.
#
# The default location is the hard-coded Downloads path below. If that file
# does not exist and the session is interactive, a file picker is opened so
# the CSV can be selected by hand; the chosen path is then actually used for
# reading. `file.choose()` is used rather than `choose.files()` because the
# latter is only available on Windows builds of R.
insurance_path <- "C:\\Users\\iasha.pickens\\Downloads\\insurance.csv"
if (!file.exists(insurance_path) && interactive()) {
  insurance_path <- file.choose()
}
insurance <- read.csv(insurance_path)

# Open the data viewer and print per-column summaries.
view(insurance)
summary(insurance)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Charges plotted against age, one point per row of `insurance`
insurance %>%
  ggplot(mapping = aes(x = age, y = charges)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Charges vs. Age",
    x = "Age",
    y = "Charges"
  )

# Charges plotted against BMI, one point per row of `insurance`
insurance %>%
  ggplot(mapping = aes(x = bmi, y = charges)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Charges vs. BMI",
    x = "BMI",
    y = "Charges"
  )

# Charges plotted against the number of children, one point per row
insurance %>%
  ggplot(mapping = aes(x = children, y = charges)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Charges vs. Number of Children",
    x = "Number of Children",
    y = "Charges"
  )

# Simple linear regression of charges on BMI, using every row of `insurance`
model_bmi <- lm(formula = charges ~ bmi, data = insurance)

# Print the coefficient table and fit statistics
summary(object = model_bmi)
##
## Call:
## lm(formula = charges ~ bmi, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20956 -8118 -3757 4722 49442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1192.94 1664.80 0.717 0.474
## bmi 393.87 53.25 7.397 2.46e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11870 on 1336 degrees of freedom
## Multiple R-squared: 0.03934, Adjusted R-squared: 0.03862
## F-statistic: 54.71 on 1 and 1336 DF, p-value: 2.459e-13
# BMI scatterplot with a straight "lm" fit drawn in blue and no
# confidence band (se = FALSE)
insurance %>%
  ggplot(mapping = aes(x = bmi, y = charges)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Linear Regression: Charges vs. BMI",
    x = "BMI",
    y = "Charges"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Print the per-column summaries of `insurance` again
summary(object = insurance)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Split the data by the `smoker` column: rows equal to "yes" and rows
# equal to "no" go into separate data frames
smokers <- insurance %>%
  filter(smoker == "yes")
nonsmokers <- insurance %>%
  filter(smoker == "no")
# Charges vs. BMI, restricted to the `smokers` subset
smokers %>%
  ggplot(mapping = aes(x = bmi, y = charges)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Scatterplot of Charges vs. BMI for Smokers",
    x = "BMI",
    y = "Charges"
  )

# Charges vs. BMI, restricted to the `nonsmokers` subset
nonsmokers %>%
  ggplot(mapping = aes(x = bmi, y = charges)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Scatterplot of Charges vs. BMI for Non-Smokers",
    x = "BMI",
    y = "Charges"
  )

# Regression of charges on BMI fitted to the `smokers` subset only,
# followed by its coefficient table and fit statistics
model_smokers <- lm(formula = charges ~ bmi, data = smokers)
summary(object = model_smokers)
##
## Call:
## lm(formula = charges ~ bmi, data = smokers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19768.0 -4487.9 34.4 3263.9 31055.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13186.58 2052.88 -6.423 5.93e-10 ***
## bmi 1473.11 65.48 22.496 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6837 on 272 degrees of freedom
## Multiple R-squared: 0.6504, Adjusted R-squared: 0.6491
## F-statistic: 506.1 on 1 and 272 DF, p-value: < 2.2e-16
# Regression of charges on BMI fitted to the `nonsmokers` subset only,
# followed by its coefficient table and fit statistics
model_nonsmokers <- lm(formula = charges ~ bmi, data = nonsmokers)
summary(object = model_nonsmokers)
##
## Call:
## lm(formula = charges ~ bmi, data = nonsmokers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9144 -4360 -1009 2922 28131
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5879.42 947.48 6.205 7.81e-10 ***
## bmi 83.35 30.33 2.748 0.00609 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5975 on 1062 degrees of freedom
## Multiple R-squared: 0.007062, Adjusted R-squared: 0.006127
## F-statistic: 7.553 on 1 and 1062 DF, p-value: 0.006091