library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Loading required package: lattice
setwd("C:\\Users\\M.Mando\\Desktop\\M4-data analytics")
df <- read.csv("bank-full.csv",header = TRUE, sep = ";")
attach(df)
head(df)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no
#1. Feature Importance Visualization:
#To visualize the importance of features, you can use a bar plot to display the distribution of a specific feature #(e.g., "age," "balance," etc.). For example, to visualize the distribution of ages:
# Create a histogram of ages
ggplot(df, aes(x = age)) +
geom_histogram(fill = "blue", bins = 20) +
labs(title = "Distribution of Age", x = "Age", y = "Frequency")
#2. Correlation Visualization:
#To visualize the correlation between numeric features, you can create a correlation matrix heatmap. This can help #identify relationships between variables:
# Select numeric columns for correlation matrix
numeric_data <- df[, c("age", "balance", "duration", "campaign", "pdays", "previous")]
# Compute the correlation matrix
correlation_matrix <- cor(numeric_data)
# Create a correlation matrix heatmap
corrplot(correlation_matrix, method = "color", type = "upper", tl.cex = 0.7)
# Define a color palette (you can customize this or use a different palette)
palette2 <- scales::hue_pal()(2)
df %>%
select(y, age, balance) %>%
gather(Variable, value, -y) %>%
ggplot(aes(y, value, fill = y)) +
geom_bar(position = "dodge", stat = "summary", fun = "mean") +
facet_wrap(~Variable, scales = "free") +
scale_fill_manual(values = palette2) +
labs(x = "Churn", y = "Mean",
title = "Feature associations with the decision to take the credit (Yes/No)",
subtitle = "(Continuous outcomes)") +
theme(legend.position = "none")
# Assuming 'df' contains your dataset
library(dplyr)
library(tidyr)
library(ggplot2)
df %>%
select(y, job, marital, education, default, housing, loan, contact, month, poutcome) %>%
pivot_longer(cols = c(job, marital, education, default, housing, loan, contact, month, poutcome),
names_to = "Variable", values_to = "value") %>%
count(Variable, value, y) %>%
filter(value == "yes") %>%
ggplot(aes(y, n, fill = y)) +
geom_bar(position = "dodge", stat = "identity") +
facet_wrap(~Variable, scales = "free", ncol = 3) +
scale_fill_manual(values = c("yes" = "blue", "no" = "red")) + # You can customize colors
labs(x = "Take the credit", y = "Count",
title = "Feature associations with the decision to take the credit (Yes/No)",
subtitle = "Categorical features (Yes and No)") +
theme(legend.position = "none")
df %>%
select(y, job, marital, education, default, contact, month, poutcome) %>%
pivot_longer(cols = c(job, marital, education, contact, month, poutcome),
names_to = "Variable", values_to = "value") %>%
count(Variable, value, y) %>%
filter(!is.na(value)) %>%
ggplot(aes(value, n, fill = y)) +
geom_bar(position = "dodge", stat = "identity") +
facet_wrap(~Variable, scales = "free") +
scale_fill_manual(values = c("yes" = "blue", "no" = "red")) + # You can customize colors
labs(x = "Value", y = "Count",
title = "Feature associations with the decision to take the credit (Yes/No)",
subtitle = "Three-category features or more") +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
# Set a random seed for reproducibility
set.seed(123)
# Create an index for splitting the data
index <- createDataPartition(df$y, p = 0.65, list = FALSE)
# Split the data into training and test sets
df_train <- df[index, ]
df_test <- df[-index, ]
dim(df_train)
## [1] 29388 17
dim(df_test)
## [1] 15823 17
# Encode categorical variables using one-hot encoding
df1 <- df %>%
mutate_if(is.character, as.factor) %>%
select_if(is.factor) %>%
mutate_all(funs(as.numeric(.)))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
##
## # Simple named list: list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
##
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Binning age into age groups
df1 <- df1 %>%
mutate(age_group = cut(age, breaks = c(0, 30, 50, Inf), labels = c("young", "middle-aged", "senior")))
# Perform feature scaling on numeric variables (e.g., z-score standardization)
numeric_vars <- c("age", "balance", "duration", "campaign", "pdays", "previous")
df1[numeric_vars] <- scale(df[numeric_vars])
# Create interaction terms (example: housing_loan_interaction)
df1 <- df1 %>%
mutate(
housing_loan_interaction = housing * loan,
education_job_interaction = education * job,
balance_duration_ratio = balance / duration
)
# Derive new features (balance_to_age_ratio)
df1 <- df1 %>%
mutate(balance_to_age_ratio = balance / age)
The feature engineering steps applied to the dataset involve several transformations to enhance the data’s suitability for predictive modeling. Firstly, categorical variables are one-hot encoded, enabling the model to process them effectively. Next, numeric variables are standardized using z-score standardization, ensuring that they have consistent scales for modeling. Interaction terms are introduced, like the “housing_loan_interaction,” which captures potential interactions between owning a house and having a loan. Additionally, a novel feature, the “balance_to_age_ratio,” is created to gauge the relationship between account balance and age. Lastly, age is binned into age groups, providing a categorical representation of age for modeling. These feature engineering techniques aim to enrich the dataset with valuable information, making it more conducive to uncovering patterns and relationships that can enhance predictive performance.
# Kitchen Sink Model: Using all available features
kitchen_sink_model <- lm(y ~ ., data = df1)
# Engineered Regression Model: Using selected features including engineered ones
engineered_model <- lm(y ~ education_job_interaction + balance_duration_ratio + housing_loan_interaction + balance_to_age_ratio , data = df1)
# Display regression summaries
summary(kitchen_sink_model)
##
## Call:
## lm(formula = y ~ ., data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.38071 -0.13727 -0.05860 0.02169 1.10537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.375e+00 2.319e-02 59.286 < 2e-16 ***
## job 1.836e-03 1.357e-03 1.353 0.1762
## marital 1.379e-02 2.463e-03 5.600 2.15e-08 ***
## education 1.962e-02 3.554e-03 5.520 3.41e-08 ***
## default -1.784e-02 1.012e-02 -1.762 0.0781 .
## housing -1.743e-01 9.167e-03 -19.017 < 2e-16 ***
## loan -1.732e-01 1.249e-02 -13.870 < 2e-16 ***
## contact -3.855e-02 1.710e-03 -22.550 < 2e-16 ***
## month 4.892e-03 4.980e-04 9.823 < 2e-16 ***
## poutcome 2.800e-02 2.734e-03 10.241 < 2e-16 ***
## age_groupmiddle-aged -6.635e-02 4.821e-03 -13.762 < 2e-16 ***
## age_groupsenior -7.591e-02 8.635e-03 -8.791 < 2e-16 ***
## age 2.315e-02 2.731e-03 8.477 < 2e-16 ***
## balance 6.528e-03 1.378e-03 4.737 2.18e-06 ***
## duration 1.242e-01 1.345e-03 92.334 < 2e-16 ***
## campaign -9.039e-03 1.360e-03 -6.644 3.08e-11 ***
## pdays 4.381e-02 2.631e-03 16.653 < 2e-16 ***
## previous 1.836e-02 1.541e-03 11.918 < 2e-16 ***
## housing_loan_interaction 8.026e-02 7.465e-03 10.752 < 2e-16 ***
## education_job_interaction -5.368e-04 6.043e-04 -0.888 0.3743
## balance_duration_ratio 1.338e-05 1.714e-05 0.781 0.4349
## balance_to_age_ratio 4.361e-05 5.531e-05 0.789 0.4304
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2845 on 45189 degrees of freedom
## Multiple R-squared: 0.2166, Adjusted R-squared: 0.2163
## F-statistic: 595.1 on 21 and 45189 DF, p-value: < 2.2e-16
summary(engineered_model)
##
## Call:
## lm(formula = y ~ education_job_interaction + balance_duration_ratio +
## housing_loan_interaction + balance_to_age_ratio, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36158 -0.14128 -0.11273 -0.09111 1.01365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.181e+00 4.298e-03 274.846 <2e-16 ***
## education_job_interaction 1.661e-03 1.721e-04 9.652 <2e-16 ***
## balance_duration_ratio 4.809e-06 1.913e-05 0.251 0.8016
## housing_loan_interaction -4.675e-02 1.762e-03 -26.528 <2e-16 ***
## balance_to_age_ratio 1.404e-04 6.120e-05 2.295 0.0218 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3184 on 45206 degrees of freedom
## Multiple R-squared: 0.019, Adjusted R-squared: 0.01892
## F-statistic: 218.9 on 4 and 45206 DF, p-value: < 2.2e-16
# Create a control object for cross-validation
ctrl <- trainControl(method = "cv", number = 5)
# Kitchen Sink Model: Using all available features
kitchen_sink_model_cv <- train(y ~ ., data = df1, method = "lm", trControl = ctrl)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Engineered Regression Model: Using selected features including engineered ones
engineered_model_cv <- train(y ~ education_job_interaction + balance_duration_ratio + housing_loan_interaction + balance_to_age_ratio , data = df1, method = "lm", trControl = ctrl)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Display cross-validation results
print(kitchen_sink_model_cv)
## Linear Regression
##
## 45211 samples
## 20 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 36168, 36169, 36169, 36169, 36169
## Resampling results:
##
## RMSE Rsquared MAE
## 0.2850687 0.2135844 0.1793623
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(engineered_model_cv)
## Linear Regression
##
## 45211 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 36169, 36169, 36169, 36168, 36169
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3183547 0.01898069 0.2028148
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Load the pROC package
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Create ROC curves and calculate Sensitivity and Specificity for both models
roc_kitchen_sink <- roc(df1$y, predict(kitchen_sink_model, newdata = df1))
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
roc_engineered <- roc(df1$y, predict(engineered_model, newdata = df1))
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
# Create facetted plots
par(mfrow=c(1,2)) # Arrange plots side by side
# Plot ROC curve for the "kitchen sink" model
plot(roc_kitchen_sink, main = "ROC Curve - Kitchen Sink Model", col = "blue")
legend("bottomright", legend = paste("AUC =", round(auc(roc_kitchen_sink), 2)), col = "blue")
# Plot ROC curve for the "engineered" model
plot(roc_engineered, main = "ROC Curve - Engineered Model", col = "red")
legend("bottomright", legend = paste("AUC =", round(auc(roc_engineered), 2)), col = "red")
# Reset the plotting layout
par(mfrow=c(1,1))