Assignment M3 - Data Analytics For Public Policy

library(ggplot2)
library(corrplot)

## corrplot 0.92 loaded

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(caret)

## Loading required package: lattice

setwd("C:\\Users\\M.Mando\\Desktop\\M4-data analytics")
df <- read.csv("bank-full.csv",header = TRUE, sep = ";")
attach(df)
head(df)

##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no

#1. Feature Importance Visualization:
#To visualize the importance of features, you can use a bar plot to display the distribution of a specific feature #(e.g., "age," "balance," etc.). For example, to visualize the distribution of ages:
# Create a histogram of ages
ggplot(df, aes(x = age)) +
  geom_histogram(fill = "blue", bins = 20) +
  labs(title = "Distribution of Age", x = "Age", y = "Frequency")

#2. Correlation Visualization:
#To visualize the correlation between numeric features, you can create a correlation matrix heatmap. This can help #identify relationships between variables:
# Select numeric columns for correlation matrix
numeric_data <- df[, c("age", "balance", "duration", "campaign", "pdays", "previous")]

# Compute the correlation matrix
correlation_matrix <- cor(numeric_data)

# Create a correlation matrix heatmap
corrplot(correlation_matrix, method = "color", type = "upper", tl.cex = 0.7)

# Define a color palette (you can customize this or use a different palette)
palette2 <- scales::hue_pal()(2)
df %>%
  select(y, age, balance) %>%
  gather(Variable, value, -y) %>%
  ggplot(aes(y, value, fill = y)) + 
    geom_bar(position = "dodge", stat = "summary", fun = "mean") + 
    facet_wrap(~Variable, scales = "free") +
    scale_fill_manual(values = palette2) +
    labs(x = "Churn", y = "Mean", 
         title = "Feature associations with the decision to take the credit (Yes/No)",
         subtitle = "(Continuous outcomes)") +
    theme(legend.position = "none")

# Assuming 'df' contains your dataset

library(dplyr)
library(tidyr)
library(ggplot2)

df %>%
  select(y, job, marital, education, default, housing, loan, contact, month, poutcome) %>%
  pivot_longer(cols = c(job, marital, education, default, housing, loan, contact, month, poutcome), 
               names_to = "Variable", values_to = "value") %>%
  count(Variable, value, y) %>%
  filter(value == "yes") %>%
  ggplot(aes(y, n, fill = y)) +   
    geom_bar(position = "dodge", stat = "identity") +
    facet_wrap(~Variable, scales = "free", ncol = 3) +
    scale_fill_manual(values = c("yes" = "blue", "no" = "red")) + # You can customize colors
    labs(x = "Take the credit", y = "Count",
         title = "Feature associations with the decision to take the credit (Yes/No)",
         subtitle = "Categorical features (Yes and No)") +
    theme(legend.position = "none")

df %>%
  select(y, job, marital, education, default, contact, month, poutcome) %>%
  pivot_longer(cols = c(job, marital, education, contact, month, poutcome), 
               names_to = "Variable", values_to = "value") %>%
  count(Variable, value, y) %>%
  filter(!is.na(value)) %>%
  ggplot(aes(value, n, fill = y)) +   
    geom_bar(position = "dodge", stat = "identity") +
    facet_wrap(~Variable, scales = "free") +
    scale_fill_manual(values = c("yes" = "blue", "no" = "red")) + # You can customize colors
    labs(x = "Value", y = "Count",
         title = "Feature associations with the decision to take the credit (Yes/No)",
         subtitle = "Three-category features or more") +
    theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

# Set a random seed for reproducibility
set.seed(123)

# Create an index for splitting the data
index <- createDataPartition(df$y, p = 0.65, list = FALSE)

# Split the data into training and test sets
df_train <- df[index, ]
df_test <- df[-index, ]

dim(df_train)

## [1] 29388    17

dim(df_test)

## [1] 15823    17

# Encode categorical variables using one-hot encoding
df1 <- df %>%
  mutate_if(is.character, as.factor) %>%
  select_if(is.factor) %>%
  mutate_all(funs(as.numeric(.)))

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Binning age into age groups
df1 <- df1 %>%
  mutate(age_group = cut(age, breaks = c(0, 30, 50, Inf), labels = c("young", "middle-aged", "senior")))

# Perform feature scaling on numeric variables (e.g., z-score standardization)
numeric_vars <- c("age", "balance", "duration", "campaign", "pdays", "previous")
df1[numeric_vars] <- scale(df[numeric_vars])

# Create interaction terms (example: housing_loan_interaction)
df1 <- df1 %>%
  mutate(
    housing_loan_interaction = housing * loan,
    education_job_interaction = education * job,
    balance_duration_ratio = balance / duration
  )


# Derive new features (balance_to_age_ratio)
df1 <- df1 %>%
  mutate(balance_to_age_ratio = balance / age)

The feature engineering steps applied to the dataset involve several transformations to enhance the data’s suitability for predictive modeling. Firstly, categorical variables are one-hot encoded, enabling the model to process them effectively. Next, numeric variables are standardized using z-score standardization, ensuring that they have consistent scales for modeling. Interaction terms are introduced, like the “housing_loan_interaction,” which captures potential interactions between owning a house and having a loan. Additionally, a novel feature, the “balance_to_age_ratio,” is created to gauge the relationship between account balance and age. Lastly, age is binned into age groups, providing a categorical representation of age for modeling. These feature engineering techniques aim to enrich the dataset with valuable information, making it more conducive to uncovering patterns and relationships that can enhance predictive performance.

# Kitchen Sink Model: Using all available features
kitchen_sink_model <- lm(y ~ ., data = df1)

# Engineered Regression Model: Using selected features including engineered ones
engineered_model <- lm(y ~ education_job_interaction + balance_duration_ratio + housing_loan_interaction + balance_to_age_ratio , data = df1)

# Display regression summaries
summary(kitchen_sink_model)

## 
## Call:
## lm(formula = y ~ ., data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38071 -0.13727 -0.05860  0.02169  1.10537 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.375e+00  2.319e-02  59.286  < 2e-16 ***
## job                        1.836e-03  1.357e-03   1.353   0.1762    
## marital                    1.379e-02  2.463e-03   5.600 2.15e-08 ***
## education                  1.962e-02  3.554e-03   5.520 3.41e-08 ***
## default                   -1.784e-02  1.012e-02  -1.762   0.0781 .  
## housing                   -1.743e-01  9.167e-03 -19.017  < 2e-16 ***
## loan                      -1.732e-01  1.249e-02 -13.870  < 2e-16 ***
## contact                   -3.855e-02  1.710e-03 -22.550  < 2e-16 ***
## month                      4.892e-03  4.980e-04   9.823  < 2e-16 ***
## poutcome                   2.800e-02  2.734e-03  10.241  < 2e-16 ***
## age_groupmiddle-aged      -6.635e-02  4.821e-03 -13.762  < 2e-16 ***
## age_groupsenior           -7.591e-02  8.635e-03  -8.791  < 2e-16 ***
## age                        2.315e-02  2.731e-03   8.477  < 2e-16 ***
## balance                    6.528e-03  1.378e-03   4.737 2.18e-06 ***
## duration                   1.242e-01  1.345e-03  92.334  < 2e-16 ***
## campaign                  -9.039e-03  1.360e-03  -6.644 3.08e-11 ***
## pdays                      4.381e-02  2.631e-03  16.653  < 2e-16 ***
## previous                   1.836e-02  1.541e-03  11.918  < 2e-16 ***
## housing_loan_interaction   8.026e-02  7.465e-03  10.752  < 2e-16 ***
## education_job_interaction -5.368e-04  6.043e-04  -0.888   0.3743    
## balance_duration_ratio     1.338e-05  1.714e-05   0.781   0.4349    
## balance_to_age_ratio       4.361e-05  5.531e-05   0.789   0.4304    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2845 on 45189 degrees of freedom
## Multiple R-squared:  0.2166, Adjusted R-squared:  0.2163 
## F-statistic: 595.1 on 21 and 45189 DF,  p-value: < 2.2e-16

summary(engineered_model)

## 
## Call:
## lm(formula = y ~ education_job_interaction + balance_duration_ratio + 
##     housing_loan_interaction + balance_to_age_ratio, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36158 -0.14128 -0.11273 -0.09111  1.01365 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.181e+00  4.298e-03 274.846   <2e-16 ***
## education_job_interaction  1.661e-03  1.721e-04   9.652   <2e-16 ***
## balance_duration_ratio     4.809e-06  1.913e-05   0.251   0.8016    
## housing_loan_interaction  -4.675e-02  1.762e-03 -26.528   <2e-16 ***
## balance_to_age_ratio       1.404e-04  6.120e-05   2.295   0.0218 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3184 on 45206 degrees of freedom
## Multiple R-squared:  0.019,  Adjusted R-squared:  0.01892 
## F-statistic: 218.9 on 4 and 45206 DF,  p-value: < 2.2e-16

# Create a control object for cross-validation
ctrl <- trainControl(method = "cv", number = 5)

# Kitchen Sink Model: Using all available features
kitchen_sink_model_cv <- train(y ~ ., data = df1, method = "lm", trControl = ctrl)

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.

# Engineered Regression Model: Using selected features including engineered ones
engineered_model_cv <- train(y ~ education_job_interaction + balance_duration_ratio + housing_loan_interaction + balance_to_age_ratio , data = df1, method = "lm", trControl = ctrl)

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.

# Display cross-validation results
print(kitchen_sink_model_cv)

## Linear Regression 
## 
## 45211 samples
##    20 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 36168, 36169, 36169, 36169, 36169 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.2850687  0.2135844  0.1793623
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

print(engineered_model_cv)

## Linear Regression 
## 
## 45211 samples
##     4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 36169, 36169, 36169, 36168, 36169 
## Resampling results:
## 
##   RMSE       Rsquared    MAE      
##   0.3183547  0.01898069  0.2028148
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# Load the pROC package
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Create ROC curves and calculate Sensitivity and Specificity for both models
roc_kitchen_sink <- roc(df1$y, predict(kitchen_sink_model, newdata = df1))

## Setting levels: control = 1, case = 2

## Setting direction: controls < cases

roc_engineered <- roc(df1$y, predict(engineered_model, newdata = df1))

## Setting levels: control = 1, case = 2
## Setting direction: controls < cases

# Create facetted plots
par(mfrow=c(1,2))  # Arrange plots side by side

# Plot ROC curve for the "kitchen sink" model
plot(roc_kitchen_sink, main = "ROC Curve - Kitchen Sink Model", col = "blue")
legend("bottomright", legend = paste("AUC =", round(auc(roc_kitchen_sink), 2)), col = "blue")

# Plot ROC curve for the "engineered" model
plot(roc_engineered, main = "ROC Curve - Engineered Model", col = "red")
legend("bottomright", legend = paste("AUC =", round(auc(roc_engineered), 2)), col = "red")

# Reset the plotting layout
par(mfrow=c(1,1))

Assignment M3 - Data Analytics For Public Policy

Group M&M

2023-09-28