library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(ggplot2)
library(patchwork)
library(broom)
library(lindia)
data_frame = read.csv('C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv',header=TRUE, sep = ",")
summary(data_frame)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
1 - age;
2 - job;
3 - marital(marital status);
4 - education;
5 - default: has credit in default?;
6 - balance: average yearly balance, in euros
7 - housing: has housing loan?;
8 - loan: has personal loan?;
9 - contact: contact communication type;
10 - day: last contact day of the month
11 - month: last contact month of year;
12 - duration: last contact duration, in seconds;
13 - campaign: number of contacts performed during this campaign and for this client
14 - pdays: number of days that passed by after the client was last contacted from a previous campaign
15 - previous: number of contacts performed before this campaign and for this client
16 - poutcome: outcome of the previous marketing campaign;
17 - y : has the client subscribed a term deposit?
data_frame <- na.omit(data_frame)
data_frame <- data.frame (data_frame)
Select a binary column of data
Build a logistic regression model for this variable, using between 1-4 explanatory variables. Interpret the coefficients, and explain what they mean in your notebook
Consider a transformation for any explanatory variable, and illustrate why you need the transformation
The binary column of data I have selected is ‘y’. In the data the column has information on if the client has subscribed to a term deposit or not
I have selected the following variables as the explanatory variables-
duration
education
df_loan <- data_frame|>
group_by(loan,education,default)|>
summarise(avg_balance = mean(balance,na.rm=TRUE), avg_age = mean(age,na.rm=TRUE), size=n())
## `summarise()` has grouped output by 'loan', 'education'. You can override using
## the `.groups` argument.
df_loan
## # A tibble: 12 × 6
## # Groups: loan, education [6]
## loan education default avg_balance avg_age size
## <chr> <chr> <chr> <dbl> <dbl> <int>
## 1 no primary no 1542. 47.8 878
## 2 no primary yes -243 36.7 3
## 3 no secondary no 1414. 40.1 3514
## 4 no secondary yes -25.0 37.1 27
## 5 no tertiary no 2092. 39.3 2324
## 6 no tertiary yes 42.9 38.3 7
## 7 yes primary no 766. 43.6 129
## 8 yes primary yes -213 44.5 2
## 9 yes secondary no 842. 40.6 644
## 10 yes secondary yes -139. 35.7 12
## 11 yes tertiary no 1193. 39.8 297
## 12 yes tertiary yes -392. 37.6 5
summary(data_frame$balance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1884 162 595 1552 1734 81204
data_frame <- data_frame |>
mutate(y_value = ifelse(y %in% c("yes"),1, 0))
data_frame <- data_frame |>
mutate(default_value = ifelse(default %in% c("yes"),1, 0))
data_frame <- data_frame |>
mutate(loan_value = ifelse(loan %in% c("yes"),1, 0))
data_frame$poutcome <- unclass(factor(data_frame$poutcome))
tail(data_frame)
## age job marital education default balance housing loan contact
## 45196 68 retired married secondary no 1146 no no cellular
## 45200 34 blue-collar single secondary no 1475 yes no cellular
## 45202 53 management married tertiary no 583 no no cellular
## 45205 73 retired married secondary no 2850 no no cellular
## 45209 72 retired married secondary no 5715 no no cellular
## 45211 37 entrepreneur married secondary no 2971 no no cellular
## day month duration campaign pdays previous poutcome y y_value
## 45196 16 nov 212 1 187 6 3 yes 1
## 45200 16 nov 1166 3 530 12 2 no 0
## 45202 17 nov 226 1 184 4 3 yes 1
## 45205 17 nov 300 1 40 8 1 yes 1
## 45209 17 nov 1127 5 184 3 3 yes 1
## 45211 17 nov 361 2 188 11 2 no 0
## default_value loan_value
## 45196 0 0
## 45200 0 0
## 45202 0 0
## 45205 0 0
## 45209 0 0
## 45211 0 0
summary(data_frame$poutcome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.584 2.000 3.000
head(data_frame)
## age job marital education default balance housing loan contact
## 24061 33 admin. married tertiary no 882 no no telephone
## 24063 42 admin. single secondary no -247 yes yes telephone
## 24065 33 services married secondary no 3444 yes no telephone
## 24073 36 management married tertiary no 2415 yes no telephone
## 24078 36 management married tertiary no 0 yes no telephone
## 24087 44 blue-collar married secondary no 1324 yes no telephone
## day month duration campaign pdays previous poutcome y y_value
## 24061 21 oct 39 1 151 3 1 no 0
## 24063 21 oct 519 1 166 1 2 yes 1
## 24065 21 oct 144 1 91 4 1 yes 1
## 24073 22 oct 73 1 86 4 2 no 0
## 24078 23 oct 140 1 143 3 1 yes 1
## 24087 25 oct 119 1 89 2 2 no 0
## default_value loan_value
## 24061 0 0
## 24063 0 1
## 24065 0 0
## 24073 0 0
## 24078 0 0
## 24087 0 0
data_frame |>
ggplot(mapping = aes(x = duration, y = y_value)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='lightpink') +
geom_smooth(method = 'lm', color = 'lightblue', se = FALSE) +
labs(title = "Modeling a Binary Response with OLS") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Modelling
model <- glm(y_value ~ duration, data = data_frame, family = binomial(link="logit"))
model$coefficients
## (Intercept) duration
## -2.183200878 0.003314537
Applying the ‘sigmoid’ function
sigmoid <- \(x) 1 / (1 + exp(-(-2.140 + 0.0032 * x)))
data_frame |>
ggplot(mapping = aes(x = duration, y = y_value)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='lightpink') +
geom_function(fun = sigmoid, color = 'lightblue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid") +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_minimal()
Summary of the model
model <- glm(y_value ~ duration, data = data_frame, family = binomial(link="logit"))
print(summary(model),show.residuals=TRUE)
##
## Call:
## glm(formula = y_value ~ duration, family = binomial(link = "logit"),
## data = data_frame)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2179 -0.6587 -0.5594 -0.4753 2.0515
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1832009 0.0480225 -45.46 <2e-16 ***
## duration 0.0033145 0.0001281 25.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8415.1 on 7841 degrees of freedom
## Residual deviance: 7569.8 on 7840 degrees of freedom
## AIC: 7573.8
##
## Number of Fisher Scoring iterations: 4
model$coefficients
## (Intercept) duration
## -2.183200878 0.003314537
Interpretation of the Coefficients
These are the estimated values for the effect each explanatory variable has on the response variable in the model.
Now when we take in a value of Duration, and place it into our linear combination, then we are returned with some “probability” between 0 (not subscribing for a term deposit) and 1 (subscribing for a term deposit).
\[ y\_value = log(\frac{p}{1-p})=-2.1832 + 0.0033 × duration \]
So, for every increase in Duration, the probabiliity that the client has subscribed to a term deposit is 1 multiplied by 0.99.
This means that for increase in duration, the probability the client does not subscribe to a term deposite goes down by about 1% (1−0.99).
The Intercept can be used to determine a 50%-probability, i.e.”decision threshold” for any variable.
\[ 0 = -2.1832 + 0.0033 * duration \]
\[ duration = \frac{2.1832}{0.0033} = 661.57 \]
So, when the duration is nearly 662 seconds there is a 50 - 50 chance that the client has subscribed to a term deposit.
model <- glm(y_value ~ duration+campaign, data = data_frame, family = binomial(link="logit"))
model$coefficients
## (Intercept) duration campaign
## -1.855667798 0.003287774 -0.167450354
Interpretation of the Coefficients
\[ y\_value = log(\frac{p}{1-p})=-1.85566 + 0.0032 × duration \]
So, for every increase in Duration, the probability that the client has subscribed to a term deposit is 1 multiplied by 0.99.
This means that for increase in duration, the probability the client does not subscribe to a term deposit goes down by about 1% (1−0.99).
The Intercept can be used to determine a 50%-probability, i.e.”decision threshold” for any variable.
\[ 0 = -1.85566 + 0.0032 * duration \]
\[ duration = \frac{1.85566}{0.0032} = 579 \]
So, when the duration is nearly 579 seconds there is a 50 - 50 chance that the client has subscribed to a term deposit.
\[ y\_value = log(\frac{p}{1-p})=-1.85566 - 0.167 × campaign \]
So, for every increase in Duration, the probability that the client has subscribed to a term deposit is 1 multiplied by -1.18.
This means that for increase in the number of contacts, the probability the client does not subscribe to a term deposit increases by about 18% (1−1.18).
The Intercept can be used to determine a 50%-probability, i.e.”decision threshold” for any variable.
\[ 0 = -1.85566 - 0.167 * campaign \]
\[ campaign = \frac{1.85566}{0.0032} = -11 \]
summary(model)
##
## Call:
## glm(formula = y_value ~ duration + campaign, family = binomial(link = "logit"),
## data = data_frame)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8556678 0.0642589 -28.878 < 2e-16 ***
## duration 0.0032878 0.0001284 25.614 < 2e-16 ***
## campaign -0.1674504 0.0236412 -7.083 1.41e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8415.1 on 7841 degrees of freedom
## Residual deviance: 7512.9 on 7839 degrees of freedom
## AIC: 7518.9
##
## Number of Fisher Scoring iterations: 4
sigmoid_1 <- \(x) 1 / (1 + exp(-(model$coefficients[1] + model$coefficients[2] * x)))
sigmoid_2 <- \(x) 1 / (1 + exp(-(model$coefficients[1] + model$coefficients[3] * x)))
p1 <- data_frame |>
ggplot(mapping = aes(x = duration, y = y_value)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='purple') +
geom_function(fun = sigmoid_1, color = 'grey', linewidth = 1) +
labs(title = "y_value VS duration") +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_minimal()
p2 <- data_frame |>
ggplot(mapping = aes(x = campaign, y = y_value)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='pink') +
geom_function(fun = sigmoid_2, color = 'black', linewidth = 1) +
labs(title = "y_value VS campaign") +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_minimal()
p1+p2
T_model <- lm(y_value ~ campaign,data_frame)
T_rsquared <- summary(T_model)$r.squared
T_rsquared
## [1] 0.009117893
data_frame |>
ggplot(mapping = aes(x = campaign, y = y_value)) +
geom_point(color='pink') +
geom_smooth(method = 'lm', color = 'blue', se = FALSE) +
labs(title = "Price vs. Price Per Sq. Ft.",
subtitle = paste("Linear Fit R-Squared =", round(T_rsquared, 2))) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
data_frame <- data_frame |>
mutate(transformed_var = -log(sqrt(1/campaign))^2) # add new variable
test_model <- lm(y_value ~ transformed_var ,data_frame)
test_rsquared <- summary(test_model)$r.squared
data_frame |>
ggplot(mapping = aes(x = transformed_var, y = y_value)) +
geom_point() +
geom_smooth(method = 'gam', color = 'gray', linetype = 'dashed',
se = FALSE) +
geom_smooth(se = FALSE) +
labs(title = "Out vs. (Price Per Sq. Ft.) ^ 2",
subtitle = paste("Linear Fit R-Squared =", round(test_rsquared, 3))) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
model_pg <- glm(y_value ~ transformed_var, data = data_frame,family = binomial(link = "logit"))
model_pg$coefficients
## (Intercept) transformed_var
## -1.065613 1.124177
sigmoid <- \(x) 1 / (1 + exp(-(model_pg$coefficients[1] + model_pg$coefficients[2] * x)))
data_frame |>
ggplot(mapping = aes(x = transformed_var, y = y_value)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 5, color='lightblue') +
geom_function(fun = sigmoid, color = 'blue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid") +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_minimal()