# file path of the data file
filepath <- "bank-term-deposit-marketing-full.csv"
# Read the CSV file into a data frame
df <- read.csv(filepath, sep = ";", header = T, stringsAsFactors = T)
# Examine dataset structure
head(df, 5)
summary(df)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## Min. : 1.00 may :13766 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 6247 Median : 180.0 Median : 2.000
## Mean :15.81 jun : 5341 Mean : 258.2 Mean : 2.764
## 3rd Qu.:21.00 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2932 Max. :4918.0 Max. :63.000
## (Other): 6060
## pdays previous poutcome y
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
str(df)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Identify missing values in each column
missing_values <- sapply(df, function(x) sum(is.na(x)))
missing_values[missing_values > 0]
## named integer(0)
# Check for infinite values
infinite_values <- sapply(df, function(x) sum(is.infinite(x)))
print(infinite_values[infinite_values > 0])
## named integer(0)
# Histogram of log(age)
# I asked ChatGPT to replicate my base R graphs in ggplot so that I did not have to convert them manually
ggplot(df, aes(x = log(age))) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
labs(title = "Histogram of Log(Age)", x = "Log(Age)", y = "Frequency")
# Histogram of log(balance)
ggplot(df, aes(x = log(balance))) +
geom_histogram(bins = 40, fill = "lightblue", color = "black") +
labs(title = "Histogram of Log(Balance)", x = "Log(Balance)", y = "Frequency")
## Warning in log(balance): NaNs produced
## Warning: Removed 7280 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of sqrt(day)
ggplot(df, aes(x = sqrt(day))) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
labs(title = "Histogram of Sqrt(Day)", x = "Sqrt(Day)", y = "Frequency")
# Histogram of log(duration)
ggplot(df, aes(x = log(duration))) +
geom_histogram(bins = 40, fill = "lightblue", color = "black") +
labs(title = "Histogram of Log(Duration)", x = "Log(Duration)", y = "Frequency")
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of log(campaign)
ggplot(df, aes(x = log(campaign))) +
geom_histogram(bins = 40, fill = "lightblue", color = "black") +
labs(title = "Histogram of Log(Campaign)", x = "Log(Campaign)", y = "Frequency")
# Histogram of sqrt(pdays)
ggplot(df, aes(x = sqrt(pdays))) +
geom_histogram(bins = 40, fill = "lightblue", color = "black") +
labs(title = "Histogram of Sqrt(Pdays)", x = "Sqrt(Pdays)", y = "Frequency")
## Warning in sqrt(pdays): NaNs produced
## Warning: Removed 36954 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of log(previous)
ggplot(df, aes(x = log(previous))) +
geom_histogram(bins = 40, fill = "lightblue", color = "black") +
labs(title = "Histogram of Log(Previous)", x = "Log(Previous)", y = "Frequency")
## Warning: Removed 36954 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Bar plot for job
ggplot(df, aes(x = job)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Jobs", x = "Job", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar plot for marital status
ggplot(df, aes(x = marital)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Marital Status", x = "Marital Status", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar plot for education
ggplot(df, aes(x = education)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Education Levels", x = "Education", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar plot for default
ggplot(df, aes(x = default)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Default Status", x = "Default", y = "Count")
# Bar plot for housing
ggplot(df, aes(x = housing)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Housing Loans", x = "Housing", y = "Count")
# Bar plot for loan
ggplot(df, aes(x = loan)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Personal Loans", x = "Loan", y = "Count")
# Bar plot for contact
ggplot(df, aes(x = contact)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Contact Communication Types", x = "Contact", y = "Count")
# Bar plot for month
ggplot(df, aes(x = month)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Months", x = "Month", y = "Count")
# Bar plot for poutcome
ggplot(df, aes(x = poutcome)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Previous Outcomes", x = "Poutcome", y = "Count")
# Bar plot for y (target variable)
ggplot(df, aes(x = y)) +
geom_bar(fill = "lightblue") +
labs(title = "Count of Target Variable (y)", x = "y", y = "Count")
# Remove rows where balance is less than zero
df <- df %>% filter(balance >= 0)
# Transform to approximately normal distributions
df_transformed <- df %>%
mutate(
log_age = log(age + 1),
log_balance = log(balance + 1),
sqrt_day = sqrt(day),
log_duration = log(duration + 1),
log_campaign = log(campaign + 1),
sqrt_pdays = sqrt(pdays +1),
log_previous = log(previous + 1)
) %>%
select(-age, -balance, -day, -duration, -campaign, -pdays, -previous)
# Pairs.panels took too long to run, but I used it to examine the colinearity
# pairs.panels(df_transformed)
There are no rows with missing data, therefore they do not need to be handled. Logistic regression models are parametric, and therefore require that the data be normally distributed. I was unable to transform the “campaign” and “previous” columns to be gaussian, which may affect the model. I was also unable to handle the case where “balance” was below zero and also perform normalization, so I removed balances below zero. This may also affect the model. The pairs.panel correlations showed some significant multicolinearity, indicating that we have redundancy between some predictive factors, which may increase the standard errors of the coefficients.
# Partition data on target variable y
train_index <- createDataPartition(df_transformed$y, p = 0.8, list = FALSE)
table(df_transformed$y)
##
## no yes
## 36366 5079
# Created test and train sets later due to feature normalization step
# Convert target variable to binary (0 and 1)
df_transformed$y <- as.factor(ifelse(df_transformed$y == "yes", 1, 0))
set.seed(123)
# Create WoE model for categorical variables
woe_model <- woe(y ~ job+marital+education+default+housing+loan+contact+month+poutcome, data = df_transformed[train_index, ], zeroadj = 0.5)
# Apply WoE encoding to both training and validation sets
df_transformed_woe <- predict(woe_model, newdata = df_transformed, type = "woe")
## No woe model for variable(s): y
# Omit any rows with NA values
df_transformed_final <- na.omit(df_transformed_woe)
# Split the data into training and validation sets
train_set <- df_transformed_final[train_index, ]
validation_set <- df_transformed_final[-train_index, ]
# Build the logistic regression model using all features
logistic_model <- glm(formula = y ~ ., data = train_set, family = binomial)
# View the summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = train_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.230640 0.382987 -29.324 < 2e-16 ***
## log_age -0.244574 0.089185 -2.742 0.006100 **
## log_balance 0.060385 0.009298 6.494 8.34e-11 ***
## sqrt_day -0.006050 0.017709 -0.342 0.732617
## log_duration 1.832785 0.031835 57.571 < 2e-16 ***
## log_campaign -0.392917 0.050420 -7.793 6.55e-15 ***
## sqrt_pdays -0.021244 0.005880 -3.613 0.000302 ***
## log_previous 0.156261 0.060968 2.563 0.010378 *
## woe_job -0.493511 0.053324 -9.255 < 2e-16 ***
## woe_marital -0.535863 0.129143 -4.149 3.33e-05 ***
## woe_education -0.418534 0.097183 -4.307 1.66e-05 ***
## woe_default 0.001298 0.360988 0.004 0.997131
## woe_housing -0.669331 0.052412 -12.771 < 2e-16 ***
## woe_loan -0.547599 0.093490 -5.857 4.70e-09 ***
## woe_contact -0.533929 0.046146 -11.570 < 2e-16 ***
## woe_month -0.745875 0.031909 -23.375 < 2e-16 ***
## woe_poutcome -0.873545 0.034153 -25.577 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24670 on 33156 degrees of freedom
## Residual deviance: 15834 on 33140 degrees of freedom
## AIC: 15868
##
## Number of Fisher Scoring iterations: 6
# Build the logistic regression model using all features
# removed sqrt_day, woe_default
logistic_model <- glm(formula = y ~ log_age+log_balance+log_duration+log_campaign+sqrt_pdays+log_previous+woe_job+woe_marital+woe_education+woe_housing+woe_loan+woe_contact+woe_month+woe_poutcome, data = train_set, family = binomial)
# View the summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = y ~ log_age + log_balance + log_duration + log_campaign +
## sqrt_pdays + log_previous + woe_job + woe_marital + woe_education +
## woe_housing + woe_loan + woe_contact + woe_month + woe_poutcome,
## family = binomial, data = train_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.250745 0.378456 -29.728 < 2e-16 ***
## log_age -0.244629 0.089187 -2.743 0.006091 **
## log_balance 0.060334 0.009242 6.528 6.66e-11 ***
## log_duration 1.832685 0.031831 57.575 < 2e-16 ***
## log_campaign -0.394441 0.050218 -7.855 4.01e-15 ***
## sqrt_pdays -0.021228 0.005880 -3.610 0.000306 ***
## log_previous 0.156840 0.060946 2.573 0.010070 *
## woe_job -0.493495 0.053327 -9.254 < 2e-16 ***
## woe_marital -0.535851 0.129134 -4.150 3.33e-05 ***
## woe_education -0.418400 0.097183 -4.305 1.67e-05 ***
## woe_housing -0.669384 0.052392 -12.776 < 2e-16 ***
## woe_loan -0.547339 0.093376 -5.862 4.58e-09 ***
## woe_contact -0.533137 0.046088 -11.568 < 2e-16 ***
## woe_month -0.746522 0.031844 -23.443 < 2e-16 ***
## woe_poutcome -0.873527 0.034154 -25.576 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24670 on 33156 degrees of freedom
## Residual deviance: 15834 on 33142 degrees of freedom
## AIC: 15864
##
## Number of Fisher Scoring iterations: 6
# Predict probabilities on the validation set
predicted_probs <- predict(logistic_model, newdata = validation_set, type = "response")
# Convert probabilities to binary outcomes using a threshold of 0.5
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
# Create a confusion matrix
confusion_matrix <- table(Predicted = predicted_classes, Actual = validation_set$y)
print(confusion_matrix)
## Actual
## Predicted 0 1
## 0 7076 669
## 1 197 346
# Calculate overall accuracy, true positive rate, and true negative rate
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
true_positive_rate <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
true_negative_rate <- confusion_matrix[1, 1] / sum(confusion_matrix[1, ])
The logistic regression model achieved an overall accuracy of 0.9 on the validation dataset. The true positive rate, which indicates the proportion of actual positives correctly identified, was 0.64. Conversely, the true negative rate, reflecting the proportion of actual negatives correctly identified, was 0.91.
These results indicate the model’s capacity to predict both classes. It is important to consider potential class imbalances in the dataset, which may lead to an inflated accuracy rate when the model performs well simply by predicting the majority class. The lower true positive rate (minority class) indicates the presence of class imbalance.
The pruned decision tree model from the previous assignment showed higher true positive rate (0.96) and lower true negative rate (0.48) with an overall higher accuracy (0.91), which is likely inflated due to class imbalance. The p-value eliminated logistic regression had higher accuracy with the minority class (true positive, 0.64), slightly lower accuracy with the majority class (true negative, 0.91), and a similar overall accuracy (0.9). These statistics indicate that the decision tree model more accurately predicts the minority class, which is the class of interest for our data, and is therefore the mroe useful model. These models are different because they take different approaches to modeling the data, and in this case the pruned decision tree model provided a more useful outcome.