library(tidyverse)
library(ggplot2)
library(corrplot)
insurance <- read.csv("C:/Users/PC/Documents/R_4DS/Insurance/insurance.csv")

head(insurance)
##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Data Cleaning

colSums(is.na(insurance)) #=> 0
##      age      sex      bmi children   smoker   region  charges 
##        0        0        0        0        0        0        0

Top DISTRIBUTIION

Outcome Variables

LEt us assume we are trying to PRedict Charges

options(repr.plot.height=4)

insurance %>%
  ggplot(aes(x = charges)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It is important to note the type of Outcome Variable we have as this will determine the type of analysis we will perform on the variable.

## numeric  Vs factorVars
numericVars <- select_if(insurance, is.numeric)
factorVars <- select_if(insurance, is.factor)

cat("The dataset has", length(numericVars), "numeric Variables and ", length(factorVars), "factor variables")
## The dataset has 4 numeric Variables and  3 factor variables
names(numericVars)
## [1] "age"      "bmi"      "children" "charges"

Distribution of Numerical Variables: Histogram and Density Plot.

## function to manage Distribution of Numerical Variables
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
plot_numeric_distributions <- function(.data, numericVar) {
  # var_col <- enquo(numericVar)
  v1 <- .data %>%ggplot(aes(x = {{numericVar}}, y = ..count../sum(..count..))) +
              geom_histogram(fill = "cornflowerblue",
                             color = "white",
                             binwidth = 5) +
              labs(title = "Participants",
                   y = "Insurance Percentage",
                   x = "Distribution") +
              scale_y_continuous(labels = percent)
  v2 <- .data %>%
          ggplot(aes(x = {{numericVar}})) +
            geom_density(fill = "cornflowerblue") + 
            labs(title = "Density Distribution",
                 y = "Insurance Percentage",
                 x = "Distribution")

  gridExtra::grid.arrange(v1, v2)
}

## Check
### age
plot_numeric_distributions(insurance, age)

We have the most participants aged 20, and this distribution withers on and off till around age 70, where the participants are the least.

names(factorVars)
## [1] "sex"    "smoker" "region"

Distribution by Factor Variables: Bar Charts

## function to tidy plot data
tidy_plot_data <- function(.data, var) {
  plotdata <- .data %>%
    count({{var}}) %>%
    mutate(pct = n / sum(n),
           pctlabel = paste0(round(pct * 100), "%")) 
  
  plotdata %>%
    ggplot(aes(x = reorder({{var}}, -pct), y = pct)) +
      geom_bar(stat = "identity", fill = "indianred3", color = "orange") +
      geom_text(aes(label = pctlabel), vjust = -0.25) +
      scale_y_continuous(labels = percent) +
      labs(x = "Factor",
           y = "Percent",
           title = paste0("Participants"))
}

## Check
### sex
tidy_plot_data(insurance, sex)

## Measuring Relationships ### Numeric X Outcome Variable: Correlation Analysis

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
correlation <- cor(numericVars, use = "pairwise.complete.obs")
kable(round(correlation, 2))
age bmi children charges
age 1.00 0.11 0.04 0.30
bmi 0.11 1.00 0.01 0.20
children 0.04 0.01 1.00 0.07
charges 0.30 0.20 0.07 1.00
library(ggcorrplot)

ggcorrplot(correlation, 
           hc.order = TRUE, # reorders the variables, placing variables with similar correlation patterns together.
           type = "lower", lab = TRUE)

cor(numericVars) %>%
  corrplot(method = "color", type = "lower", tl.col = "black", tl.srt = 45,
           p.mat = cor.mtest(numericVars)$p,
           insig = "p-value", sig.level = -1)
Fig. 12

Fig. 12

To use Correlation to analyze for the whole data, we have do some encoding - so that the modelling can deal with numbers and not labels.

Correlation Analysis II

encode_df <- insurance
encode_df$sex <- ifelse(encode_df$sex == "female", 0, 1) #=> Female = 0, Male = 1
encode_df$smoker <- ifelse(encode_df$smoker == "no", 0, 1)
encode_df$region <- ifelse(encode_df$region == "northeast", 4, ## if region is NE make it 4, but also 
                           ifelse(encode_df$region == "northwest", 3, ## if region is NW make it 3, but also
                                  ifelse(encode_df$region == "southeast", 2, 1))) ## if region is SE make it 2, else make it 1

str(encode_df) ## Note they are classified as numeric Vars
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : num  0 1 1 1 1 0 0 0 1 0 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ region  : num  1 2 2 3 3 2 2 3 4 3 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
# calulate the correlations
correlation_II <- cor(encode_df, use="complete.obs")
round(correlation_II,2)
##            age   sex   bmi children smoker region charges
## age       1.00 -0.02  0.11     0.04  -0.03   0.00    0.30
## sex      -0.02  1.00  0.05     0.02   0.08   0.00    0.06
## bmi       0.11  0.05  1.00     0.01   0.00  -0.16    0.20
## children  0.04  0.02  0.01     1.00   0.01  -0.02    0.07
## smoker   -0.03  0.08  0.00     0.01   1.00   0.00    0.79
## region    0.00  0.00 -0.16    -0.02   0.00   1.00    0.01
## charges   0.30  0.06  0.20     0.07   0.79   0.01    1.00
correlation_II %>%
  ggcorrplot(hc.order = TRUE, 
           type = "lower",
           lab = TRUE)

correlation_II %>%
  corrplot(method = "color", type = "lower", tl.col = "black", tl.srt = 45,
           p.mat = cor.mtest(correlation_II)$p,
           insig = "p-value", sig.level = -1)
Fig. 12

Fig. 12

Where p < 0.05 we can accept that there is a significant relationship between the highlighted Variables. As we can see between charges and Smokers Variable

Categorical X Outcome Variable: BeesSwarm Plots

# plot the distribution of Insurance by Outcome Variable_ Charges
# by rank using jittering
library(ggpol)
library(scales)

insurance %>%
  ggplot(aes(x = factor(smoker),
         y = charges,
         fill = smoker)) + 
  geom_boxjitter(color = "black",
                 jitter.color = "darkgrey",
                 errorbar.draw = TRUE) +
  scale_y_continuous(label = dollar) +
  labs(title = "Smoker Distribution by Insurance Charges",
       x = "Do you smoke?") +
  theme_minimal() +
  theme(legend.position = "none")

The Bulk of those who do not Smoke are dense at the bottom of Insurance Charges at almost exactly the top of this pack is where does who smoke start to converge an get ever higher charges - the graph is simple and clearly demarcates.

At this stage we can say we have satisfactorily Explore the primary relationship within the dataset, however for good measure let us create a Linear Regression - to atleast have a single coeficient that can describe our dataset.

# insurance_lm <- lm(charges ~ )

insurance_lm <- lm(charges ~ age + sex + bmi + children + smoker + region, 
                data = insurance)

insurance_lm
## 
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker + 
##     region, data = insurance)
## 
## Coefficients:
##     (Intercept)              age          sexmale              bmi  
##        -11938.5            256.9           -131.3            339.2  
##        children        smokeryes  regionnorthwest  regionsoutheast  
##           475.5          23848.5           -353.0          -1035.0  
## regionsouthwest  
##          -960.1

Visualizing our Results.

# conditional plot of price vs. living area
library(ggplot2)
library(visreg)
visreg(insurance_lm, "age", gg = TRUE) 

# conditional plot of price vs. waterfront location
visreg(insurance_lm, "smoker", gg = TRUE) +
  scale_y_continuous(label = scales::dollar) +
  labs(title = "Relationship between Insurance Charges and Smokers",
       subtitle = "controlling for  age, sex, bmi, children, smoker and region",
       caption = "source: Kaggle Dataset",
       y = "Insurance Charges",
       x = "Smokers")