library(tidyverse)
library(ggplot2)
library(corrplot)
insurance <- read.csv("C:/Users/PC/Documents/R_4DS/Insurance/insurance.csv")
head(insurance)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
colSums(is.na(insurance)) #=> 0
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
Top DISTRIBUTIION
LEt us assume we are trying to PRedict Charges
options(repr.plot.height=4)
insurance %>%
ggplot(aes(x = charges)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
It is important to note the type of Outcome Variable we have as this will determine the type of analysis we will perform on the variable.
## numeric Vs factorVars
numericVars <- select_if(insurance, is.numeric)
factorVars <- select_if(insurance, is.factor)
cat("The dataset has", length(numericVars), "numeric Variables and ", length(factorVars), "factor variables")
## The dataset has 4 numeric Variables and 3 factor variables
names(numericVars)
## [1] "age" "bmi" "children" "charges"
## function to manage Distribution of Numerical Variables
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
plot_numeric_distributions <- function(.data, numericVar) {
# var_col <- enquo(numericVar)
v1 <- .data %>%ggplot(aes(x = {{numericVar}}, y = ..count../sum(..count..))) +
geom_histogram(fill = "cornflowerblue",
color = "white",
binwidth = 5) +
labs(title = "Participants",
y = "Insurance Percentage",
x = "Distribution") +
scale_y_continuous(labels = percent)
v2 <- .data %>%
ggplot(aes(x = {{numericVar}})) +
geom_density(fill = "cornflowerblue") +
labs(title = "Density Distribution",
y = "Insurance Percentage",
x = "Distribution")
gridExtra::grid.arrange(v1, v2)
}
## Check
### age
plot_numeric_distributions(insurance, age)
We have the most participants aged 20, and this distribution withers on and off till around age 70, where the participants are the least.
names(factorVars)
## [1] "sex" "smoker" "region"
## function to tidy plot data
tidy_plot_data <- function(.data, var) {
plotdata <- .data %>%
count({{var}}) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct * 100), "%"))
plotdata %>%
ggplot(aes(x = reorder({{var}}, -pct), y = pct)) +
geom_bar(stat = "identity", fill = "indianred3", color = "orange") +
geom_text(aes(label = pctlabel), vjust = -0.25) +
scale_y_continuous(labels = percent) +
labs(x = "Factor",
y = "Percent",
title = paste0("Participants"))
}
## Check
### sex
tidy_plot_data(insurance, sex)
## Measuring Relationships ### Numeric X Outcome Variable: Correlation Analysis
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
correlation <- cor(numericVars, use = "pairwise.complete.obs")
kable(round(correlation, 2))
| age | bmi | children | charges | |
|---|---|---|---|---|
| age | 1.00 | 0.11 | 0.04 | 0.30 |
| bmi | 0.11 | 1.00 | 0.01 | 0.20 |
| children | 0.04 | 0.01 | 1.00 | 0.07 |
| charges | 0.30 | 0.20 | 0.07 | 1.00 |
library(ggcorrplot)
ggcorrplot(correlation,
hc.order = TRUE, # reorders the variables, placing variables with similar correlation patterns together.
type = "lower", lab = TRUE)
cor(numericVars) %>%
corrplot(method = "color", type = "lower", tl.col = "black", tl.srt = 45,
p.mat = cor.mtest(numericVars)$p,
insig = "p-value", sig.level = -1)
Fig. 12
To use Correlation to analyze for the whole data, we have do some encoding - so that the modelling can deal with numbers and not labels.
encode_df <- insurance
encode_df$sex <- ifelse(encode_df$sex == "female", 0, 1) #=> Female = 0, Male = 1
encode_df$smoker <- ifelse(encode_df$smoker == "no", 0, 1)
encode_df$region <- ifelse(encode_df$region == "northeast", 4, ## if region is NE make it 4, but also
ifelse(encode_df$region == "northwest", 3, ## if region is NW make it 3, but also
ifelse(encode_df$region == "southeast", 2, 1))) ## if region is SE make it 2, else make it 1
str(encode_df) ## Note they are classified as numeric Vars
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : num 0 1 1 1 1 0 0 0 1 0 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : num 1 0 0 0 0 0 0 0 0 0 ...
## $ region : num 1 2 2 3 3 2 2 3 4 3 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
# calulate the correlations
correlation_II <- cor(encode_df, use="complete.obs")
round(correlation_II,2)
## age sex bmi children smoker region charges
## age 1.00 -0.02 0.11 0.04 -0.03 0.00 0.30
## sex -0.02 1.00 0.05 0.02 0.08 0.00 0.06
## bmi 0.11 0.05 1.00 0.01 0.00 -0.16 0.20
## children 0.04 0.02 0.01 1.00 0.01 -0.02 0.07
## smoker -0.03 0.08 0.00 0.01 1.00 0.00 0.79
## region 0.00 0.00 -0.16 -0.02 0.00 1.00 0.01
## charges 0.30 0.06 0.20 0.07 0.79 0.01 1.00
correlation_II %>%
ggcorrplot(hc.order = TRUE,
type = "lower",
lab = TRUE)
correlation_II %>%
corrplot(method = "color", type = "lower", tl.col = "black", tl.srt = 45,
p.mat = cor.mtest(correlation_II)$p,
insig = "p-value", sig.level = -1)
Fig. 12
Where p < 0.05 we can accept that there is a significant relationship between the highlighted Variables. As we can see between charges and Smokers Variable
# plot the distribution of Insurance by Outcome Variable_ Charges
# by rank using jittering
library(ggpol)
library(scales)
insurance %>%
ggplot(aes(x = factor(smoker),
y = charges,
fill = smoker)) +
geom_boxjitter(color = "black",
jitter.color = "darkgrey",
errorbar.draw = TRUE) +
scale_y_continuous(label = dollar) +
labs(title = "Smoker Distribution by Insurance Charges",
x = "Do you smoke?") +
theme_minimal() +
theme(legend.position = "none")
The Bulk of those who do not Smoke are dense at the bottom of Insurance Charges at almost exactly the top of this pack is where does who smoke start to converge an get ever higher charges - the graph is simple and clearly demarcates.
At this stage we can say we have satisfactorily Explore the primary relationship within the dataset, however for good measure let us create a Linear Regression - to atleast have a single coeficient that can describe our dataset.
# insurance_lm <- lm(charges ~ )
insurance_lm <- lm(charges ~ age + sex + bmi + children + smoker + region,
data = insurance)
insurance_lm
##
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker +
## region, data = insurance)
##
## Coefficients:
## (Intercept) age sexmale bmi
## -11938.5 256.9 -131.3 339.2
## children smokeryes regionnorthwest regionsoutheast
## 475.5 23848.5 -353.0 -1035.0
## regionsouthwest
## -960.1
Visualizing our Results.
# conditional plot of price vs. living area
library(ggplot2)
library(visreg)
visreg(insurance_lm, "age", gg = TRUE)
# conditional plot of price vs. waterfront location
visreg(insurance_lm, "smoker", gg = TRUE) +
scale_y_continuous(label = scales::dollar) +
labs(title = "Relationship between Insurance Charges and Smokers",
subtitle = "controlling for age, sex, bmi, children, smoker and region",
caption = "source: Kaggle Dataset",
y = "Insurance Charges",
x = "Smokers")