library(tidyverse)
library(patchwork)
library(ggforce)
library(statsr)
url <- "https://raw.githubusercontent.com/omocharly/DATA606_PROJECT/main/insurance.csv"
insurance <- read.csv(url)
head(insurance)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
Dataset has 7 variable and 1338 Observation
glimpse(insurance)
## Rows: 1,338
## Columns: 7
## $ age <int> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, 1~
## $ sex <chr> "female", "male", "male", "male", "male", "female", "female",~
## $ bmi <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440, 27.74~
## $ children <int> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0~
## $ smoker <chr> "yes", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
## $ region <chr> "southwest", "southeast", "southeast", "northwest", "northwes~
## $ charges <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 3756.622,~
This project aims to:
Determine if the mean insurance charges of male individuals in the dataset is significantly different from the mean charges of female
Determine if the mean insurance charges of Smokers in the dataset is different from the mean charges of Non - smokers
Formulate a multiple Regression model or predicting the insurance charges of individuals
There are 7 variables and 1338 observations in the dataset. six(6) of the Variable in the dataset are potential predictor of the of the 7th variables (Insurance charges). There are no missing value in any of the observation. Each observation represents the likely variable that play vital roles in determining the insurance charge
This dataset was downloaded from kaggle and then uploaded to my github repository. The data can be accessed directly from the repository at Github
This is an observational study as there is no control group.
Data is from kaggle public datasets and can be found online here: https://www.kaggle.com/mirichoi0218/insurance
The Dependent variable is the Insurance Charges and its numerical
There are six(6) independent used. They independent variables are: Age(numeric), sex(numeric), BMI(numeric), Children(numeric), Smoker(categorical), Region(categorical)
summary(insurance)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
plot1<-ggplot(insurance, aes(x=age))+
geom_histogram(color="black", fill="purple", bins=10)+
labs(title="Historam of Age Distribution")+
theme(plot.title = element_text(size=14))
plot1
plot2<-ggplot(insurance, aes(x=smoker, y=age, color=smoker)) +
geom_boxplot()+
scale_color_manual(values=c('green', "violet"))+
labs(title="Age Distribution by smoker")+
theme(plot.title = element_text(size=14))
plot2
p1<-ggplot(insurance, aes(x=charges))+
geom_histogram(color="black", fill="mediumorchid1", bins=40)+
geom_vline(aes(xintercept= 13270), color="blue", linetype="dashed", size=1)+
geom_vline(aes(xintercept= 9382), color="red", linetype="dashed", size=1)+
annotate("text", x= 20000, y=110, size=2, label="Mean=13270", color="blue")+
annotate("text", x= 20000, y=120, size=2, label="Median=9382", color="red")
p2<-ggplot(insurance, aes(x=sex, y=charges)) +
geom_boxplot(fill="mediumorchid1")+
stat_summary(fun=mean, geom="point", color="blue")+
coord_flip()
p3<-ggplot(insurance, aes(x=smoker, y=charges)) +
geom_boxplot(fill="mediumorchid1")+
stat_summary(fun=mean, geom="point", color="blue")+
coord_flip()
p4<-ggplot(insurance, aes(x=region, y=charges)) +
geom_boxplot(fill="mediumorchid1")+
stat_summary(fun=mean, geom="point", color="blue")+
coord_flip()
options(repr.plot.width=9, repr.plot.height=20)
layout<-"
ABB
CDD"
p1 + p2 + p3 + p4 + plot_layout(design = layout)+
plot_annotation(title="Charges Distribution",
theme = theme(plot.title = element_text(size = 28, hjust = 0.5)))
### age, charges, sex
plot3<-ggplot(insurance, aes(x=sex, y=age, color=sex)) +
geom_sina()+
scale_color_manual(values=c('hotpink', "royalblue"))+
labs(title="Age Distribution by sex")+
theme(plot.title = element_text(size=14))
plot4<-ggplot(insurance, aes(x=age, y=charges, color= sex))+
geom_jitter(alpha=0.3, size=2.5)+
scale_color_manual(values=c('hotpink', "royalblue"))+
geom_rug()+
geom_smooth(method=lm, formula=y~x)+
labs(title="Age x Charges by sex")+
theme(plot.title = element_text(size=14))
### age, charges, smoker
plot5<-ggplot(insurance, aes(x=smoker, y=age, color=smoker)) +
geom_sina()+
scale_color_manual(values=c('grey', "brown"))+
labs(title="Age Distribution by smoker")+
theme(plot.title = element_text(size=14))
plot6<-ggplot(insurance, aes(x=age, y=charges, color= smoker))+
geom_jitter(alpha=0.3, size=2.5)+
scale_color_manual(values=c('brown', "grey"))+
geom_rug()+
geom_smooth(method=lm, formula=y~x)+
labs(title="Age x Charges by smoker")+
theme(plot.title = element_text(size=14))
options(repr.plot.width=10, repr.plot.height=35)
layout<-"
ABB
CDD"
plot3 + plot4 + plot5 + plot6 + plot_layout(design = layout)