## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'flexdashboard' was built under R version 4.3.3
## Warning: package 'gapminder' was built under R version 4.3.3
## Warning: package 'ggthemes' was built under R version 4.3.3
## Warning: package 'lessR' was built under R version 4.3.3
library(dplyr)
library(ggplot2)
library(caret)
library(DataExplorer)
library(car)
library(ggcorrplot)
library(GGally)
library(kableExtra)
library(dplyr)
library(GGally)
library(ggcorrplot)
library(MLmetrics)
library(performance)
library(lmtest)
library(highcharter)
library(RColorBrewer)
library(DT)
library(moments)
#library(see)
library(MASS)
library(psych)
library(Hmisc)
library(corrplot)
library(ROCR)
library(broom)
library(WVPlots)
library(randomForest)
library(pROC)
Insurance <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA621/main/insurance_data.csv")
str(Insurance)## 'data.frame': 10008 obs. of 13 variables:
## $ age : num 45 64 19 36 19 34 47 63 19 20 ...
## $ sex : chr "male" "male" "female" "male" ...
## $ bmi : num 28.7 34.5 32.1 28.9 24.6 ...
## $ children : num 2 0 0 3 1 0 1 0 1 0 ...
## $ smoker : chr "no" "no" "no" "no" ...
## $ Claim_Amount : num 32994 38448 50778 33741 12198 ...
## $ past_consultations : num 16 8 15 10 16 5 23 7 3 10 ...
## $ num_of_steps : num 902022 956604 758688 879560 793026 ...
## $ Hospital_expenditure : num 8640895 11022389 1642626 1985637 10009377 ...
## $ NUmber_of_past_hospitalizations: num 1 1 0 1 1 1 1 1 0 0 ...
## $ Anual_Salary : num 94365914 230021899 46443495 130616936 61133917 ...
## $ region : chr "southwest" "southwest" "northwest" "northeast" ...
## $ charges : num 8028 13823 2131 6749 2709 ...
## age sex bmi children
## Min. :18 Length:10008 Min. :15.96 Min. :0.000
## 1st Qu.:26 Class :character 1st Qu.:26.12 1st Qu.:0.000
## Median :39 Mode :character Median :30.03 Median :1.000
## Mean :39 Mean :30.44 Mean :1.063
## 3rd Qu.:51 3rd Qu.:34.32 3rd Qu.:2.000
## Max. :64 Max. :53.13 Max. :5.000
## NA's :73 NA's :24 NA's :45
## smoker Claim_Amount past_consultations num_of_steps
## Length:10008 Min. : 1920 Min. : 1.00 Min. : 695430
## Class :character 1st Qu.:19541 1st Qu.: 9.00 1st Qu.: 841902
## Mode :character Median :32800 Median :14.00 Median : 903769
## Mean :32221 Mean :14.42 Mean : 896863
## 3rd Qu.:44148 3rd Qu.:19.00 3rd Qu.: 949238
## Max. :77278 Max. :40.00 Max. :1107872
## NA's :99 NA's :51 NA's :24
## Hospital_expenditure NUmber_of_past_hospitalizations Anual_Salary
## Min. : 29453 Min. :0.0000 Min. : 2747072
## 1st Qu.: 3776053 1st Qu.:1.0000 1st Qu.: 73542243
## Median : 6894887 Median :1.0000 Median : 125901988
## Mean : 11198901 Mean :0.9725 Mean : 263017112
## 3rd Qu.: 9721549 3rd Qu.:1.0000 3rd Qu.: 224741452
## Max. :261631699 Max. :3.0000 Max. :4117196637
## NA's :30 NA's :10 NA's :37
## region charges
## Length:10008 Min. : 1122
## Class :character 1st Qu.: 4441
## Mode :character Median : 8550
## Mean :11025
## 3rd Qu.:13224
## Max. :63770
##
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 10008 13 3 10 0
## total_missing_values complete_rows total_observations memory_usage
## 1 393 9618 130104 1044952
plot_intro(Insurance,
ggtheme = theme_minimal(),
title = "Missing Data",
theme_config = theme(plot.title = element_text(color = "orange")),
geom_label_args = c(hjust = "inward")
)DataExplorer::plot_histogram(
geom_histogram_args = list(alpha = 0.5),
data = Insurance,
ggtheme=theme_bw())categ_cols <- Insurance %>% select_if(~ class(.) == "factor")
for (col in names(categ_cols)) {
t <- Insurance %>%
group_by_(col) %>%
summarise(count = n()) %>%
mutate(frequency = paste0(round(100 * count / sum(count), 0), "%")) %>%
knitr::kable("html", align = "lcc") %>%
kableExtra::kable_styling(full_width = F, position = "left") %>%
print()
}
categ_cols## data frame with 0 columns and 10008 rows
Insurance[ , c("sex", "smoker", "region")] <-
lapply(Insurance[ , c("sex", "smoker", "region")], as.factor)
str(Insurance)## 'data.frame': 10008 obs. of 13 variables:
## $ age : num 45 64 19 36 19 34 47 63 19 20 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 1 1 ...
## $ bmi : num 28.7 34.5 32.1 28.9 24.6 ...
## $ children : num 2 0 0 3 1 0 1 0 1 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Claim_Amount : num 32994 38448 50778 33741 12198 ...
## $ past_consultations : num 16 8 15 10 16 5 23 7 3 10 ...
## $ num_of_steps : num 902022 956604 758688 879560 793026 ...
## $ Hospital_expenditure : num 8640895 11022389 1642626 1985637 10009377 ...
## $ NUmber_of_past_hospitalizations: num 1 1 0 1 1 1 1 1 0 0 ...
## $ Anual_Salary : num 94365914 230021899 46443495 130616936 61133917 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 4 2 1 2 3 4 2 4 3 ...
## $ charges : num 8028 13823 2131 6749 2709 ...
clean<-Insurance %>%
mutate_at(vars(c('age', 'bmi', 'children', 'Claim_Amount', 'past_consultations',
'num_of_steps', 'Hospital_expenditure', 'NUmber_of_past_hospitalizations', 'Anual_Salary')), ~ifelse(is.na(.), median(., na.rm = TRUE), .))
print("Count of total missing values in after imputation ")## [1] "Count of total missing values in after imputation "
## [1] 0
age_group <- cut(clean$age,
breaks = seq(10, 100, by = 10),
include.lowest = TRUE)
InsuranceData <- cbind(clean, age_group)
group<-InsuranceData%>%
count(age_group)
group %>%
hchart('column', hcaes(x = 'age_group', y = 'n'))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 15, beta = 15), dataLabels=list(enabled=TRUE))%>%
hc_plotOptions(series = list(
borderWidth= 0,
dataLabels = list(
enabled = TRUE,
color = "black",
format = '{point.y}')))sm<-Insurance %>%
count(smoker)
sm %>%
hchart('pie', hcaes(x = 'smoker', y = 'n'))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 15, beta = 15), dataLabels=list(enabled=TRUE))BMI<-clean %>% mutate(bmi_cat = cut(bmi,
breaks = c(0, 18.5, 25, 30, 60),
labels = c("Under Weight", "Normal Weight", "Overweight", "Obese")
))
BMI %>%
count(bmi_cat) %>%
hchart('bar', hcaes(x = 'bmi_cat', y = 'n'))%>% hc_chart(type = "bar", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>%
hc_plotOptions(series = list(
borderWidth= 0,
dataLabels = list(
enabled = TRUE,
color = "black",
format = '{point.y}')))#Correlation table
library(corrr)
cor<-clean %>%
correlate() %>%
focus(charges) %>%arrange(desc(charges))
knitr::kable(
cor, caption = "Correlation Table")%>%
kable_styling("striped", full_width = F)| term | charges |
|---|---|
| Anual_Salary | 0.9284001 |
| num_of_steps | 0.8758246 |
| Hospital_expenditure | 0.8315979 |
| NUmber_of_past_hospitalizations | 0.7621982 |
| past_consultations | 0.5392697 |
| Claim_Amount | 0.3502274 |
| age | 0.3216199 |
| bmi | 0.1274681 |
| children | 0.0772186 |
clean%>%
hchart(type = "column", hcaes(x = smoker, y = charges))%>% hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='The cost of health insurance for smokers vs. nonsmokers')clean%>%
hchart(type = "scatter", hcaes(x = age, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='Health Insurance Charges by Age')clean%>%
hchart(type = "scatter", hcaes(x = bmi, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='BMI and Health Insurance Charges')clean%>%
hchart(type = "scatter", hcaes(x = children, y = charges))%>% hc_chart(type = "scatter", options3d = list(enabled = TRUE, alpha = 30, beta = 0))%>% hc_title(text='Amount of children and Health Insurance Charges')RNGkind(sample.kind = "Rounding")
set.seed(100)
index_ins <- sample(x = nrow(clean) , size = nrow(clean)*0.8)
data_train <- clean[index_ins , ]
data_test <- clean[-index_ins, ]
cat(" Size of data train:",dim(data_train),"\n","Size of data test :",dim(data_test))## Size of data train: 8006 13
## Size of data test : 2002 13
#Selecting predictor variable (age) and target variable (charges)
simple_linear_model <- lm(charges ~ age, data = Insurance)
#Summary of the model
summary(simple_linear_model)##
## Call:
## lm(formula = charges ~ age, data = Insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6035 -4758 -3886 -2258 50159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2328.490 273.902 8.501 <0.0000000000000002 ***
## age 224.575 6.613 33.962 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9193 on 9933 degrees of freedom
## (73 observations deleted due to missingness)
## Multiple R-squared: 0.104, Adjusted R-squared: 0.1039
## F-statistic: 1153 on 1 and 9933 DF, p-value: < 0.00000000000000022
Overall, the model suggests that there is a statistically significant relationship between age and charges. However, the relatively low R-squared value indicates that age alone explains only a small portion of the variability in charges.
#Selecting predictor variables (age, bmi, smoker, etc.) and target variable (charges)
multiple_linear_model <- lm(charges ~ age + bmi + smoker + children + region, data = Insurance)
#Summary of the model
summary(multiple_linear_model)##
## Call:
## lm(formula = charges ~ age + bmi + smoker + children + region,
## data = Insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9731 -2205 -945 555 32699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9186.847 311.490 -29.493 < 0.0000000000000002 ***
## age 248.326 3.857 64.388 < 0.0000000000000002 ***
## bmi 243.004 9.207 26.393 < 0.0000000000000002 ***
## smokeryes 21258.245 150.796 140.973 < 0.0000000000000002 ***
## children 460.197 44.555 10.329 < 0.0000000000000002 ***
## regionnorthwest -218.764 151.075 -1.448 0.148
## regionsoutheast -729.285 153.830 -4.741 0.00000216 ***
## regionsouthwest -646.144 152.342 -4.241 0.00002242 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5269 on 9858 degrees of freedom
## (142 observations deleted due to missingness)
## Multiple R-squared: 0.707, Adjusted R-squared: 0.7068
## F-statistic: 3398 on 7 and 9858 DF, p-value: < 0.00000000000000022
Overall, the model suggests that age, BMI, smoking status, and the number of children have significant effects on insurance charges. However, the region variable may not have a significant effect on charges after considering other variables in the model.
#Categorizing charges into two classes based on a threshold value
#For demonstration, let's consider charges above the median as high and all other charges as low
Insurance$charge_category_binary <- ifelse(Insurance$charges > median(Insurance$charges), 1, 0)
#Building logistic regression model with binary target variable
logistic_model <- glm(charge_category_binary ~ age + bmi + smoker + children + region,
data = Insurance, family = binomial(link = "logit"))
#Summary of the model
summary(logistic_model)##
## Call:
## glm(formula = charge_category_binary ~ age + bmi + smoker + children +
## region, family = binomial(link = "logit"), data = Insurance)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.589383 0.251923 -34.095 < 0.0000000000000002 ***
## age 0.189844 0.003975 47.760 < 0.0000000000000002 ***
## bmi 0.019540 0.005882 3.322 0.000894 ***
## smokeryes 22.504310 216.354581 0.104 0.917157
## children 0.207776 0.028368 7.324 0.000000000000240026 ***
## regionnorthwest -0.485189 0.096851 -5.010 0.000000545292976719 ***
## regionsoutheast -0.909917 0.099955 -9.103 < 0.0000000000000002 ***
## regionsouthwest -0.796180 0.097371 -8.177 0.000000000000000292 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13675.6 on 9865 degrees of freedom
## Residual deviance: 5631.1 on 9858 degrees of freedom
## (142 observations deleted due to missingness)
## AIC: 5647.1
##
## Number of Fisher Scoring iterations: 18
In the context of the provided data and the target variable (charge_category_binary), running a logistic regression model might not be the most appropriate choice. Logistic regression is typically used for binary classification problems where the outcome variable is categorical and binary, meaning it has only two possible outcomes or classes.