Data Dive Week 10

Loading data and libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(ggrepel)
library(effsize)
library(pwrss)

## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test

library(ggplot2)
library(patchwork)
library(broom)
library(lindia)

data_frame = read.csv('C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv',header=TRUE, sep = ",")

Summary of data frame

summary(data_frame)

##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

1 - age;

2 - job;

3 - marital(marital status);

4 - education;

5 - default: has credit in default?;

6 - balance: average yearly balance, in euros

7 - housing: has housing loan?;

8 - loan: has personal loan?;

9 - contact: contact communication type;

10 - day: last contact day of the month

11 - month: last contact month of year;

12 - duration: last contact duration, in seconds;

13 - campaign: number of contacts performed during this campaign and for this client

14 - pdays: number of days that passed by after the client was last contacted from a previous campaign

15 - previous: number of contacts performed before this campaign and for this client

16 - poutcome: outcome of the previous marketing campaign;

17 - y : has the client subscribed a term deposit?

data_frame <- na.omit(data_frame)
data_frame <- data.frame (data_frame)

Tasks for the Data Dive

Select a binary column of data
Build a logistic regression model for this variable, using between 1-4 explanatory variables. Interpret the coefficients, and explain what they mean in your notebook
- (Bonus) Using the Standard Error for at least one coefficient, build a C.I. for that coefficient, and interpret its meaning
Consider a transformation for any explanatory variable, and illustrate why you need the transformation

Selecting a binary column of data

The binary column of data I have selected is ‘y’. In the data the column has information on if the client has subscribed to a term deposit or not

Selecting explanatory variables

I have selected the following variables as the explanatory variables-

duration
education

Analysis of response and explanatory variables

df_loan <- data_frame|>
    group_by(loan,education,default)|>
      summarise(avg_balance = mean(balance,na.rm=TRUE), avg_age = mean(age,na.rm=TRUE), size=n())

## `summarise()` has grouped output by 'loan', 'education'. You can override using
## the `.groups` argument.

df_loan

## # A tibble: 12 × 6
## # Groups:   loan, education [6]
##    loan  education default avg_balance avg_age  size
##    <chr> <chr>     <chr>         <dbl>   <dbl> <int>
##  1 no    primary   no           1542.     47.8   878
##  2 no    primary   yes          -243      36.7     3
##  3 no    secondary no           1414.     40.1  3514
##  4 no    secondary yes           -25.0    37.1    27
##  5 no    tertiary  no           2092.     39.3  2324
##  6 no    tertiary  yes            42.9    38.3     7
##  7 yes   primary   no            766.     43.6   129
##  8 yes   primary   yes          -213      44.5     2
##  9 yes   secondary no            842.     40.6   644
## 10 yes   secondary yes          -139.     35.7    12
## 11 yes   tertiary  no           1193.     39.8   297
## 12 yes   tertiary  yes          -392.     37.6     5

summary(data_frame$balance)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1884     162     595    1552    1734   81204

data_frame <- data_frame |>
  mutate(y_value = ifelse(y %in% c("yes"),1, 0))

data_frame <- data_frame |>
  mutate(default_value = ifelse(default %in% c("yes"),1, 0))

data_frame <- data_frame |>
  mutate(loan_value = ifelse(loan %in% c("yes"),1, 0))

data_frame$poutcome <- unclass(factor(data_frame$poutcome))

tail(data_frame)

##       age          job marital education default balance housing loan  contact
## 45196  68      retired married secondary      no    1146      no   no cellular
## 45200  34  blue-collar  single secondary      no    1475     yes   no cellular
## 45202  53   management married  tertiary      no     583      no   no cellular
## 45205  73      retired married secondary      no    2850      no   no cellular
## 45209  72      retired married secondary      no    5715      no   no cellular
## 45211  37 entrepreneur married secondary      no    2971      no   no cellular
##       day month duration campaign pdays previous poutcome   y y_value
## 45196  16   nov      212        1   187        6        3 yes       1
## 45200  16   nov     1166        3   530       12        2  no       0
## 45202  17   nov      226        1   184        4        3 yes       1
## 45205  17   nov      300        1    40        8        1 yes       1
## 45209  17   nov     1127        5   184        3        3 yes       1
## 45211  17   nov      361        2   188       11        2  no       0
##       default_value loan_value
## 45196             0          0
## 45200             0          0
## 45202             0          0
## 45205             0          0
## 45209             0          0
## 45211             0          0

summary(data_frame$poutcome)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.584   2.000   3.000

head(data_frame)

##       age         job marital education default balance housing loan   contact
## 24061  33      admin. married  tertiary      no     882      no   no telephone
## 24063  42      admin.  single secondary      no    -247     yes  yes telephone
## 24065  33    services married secondary      no    3444     yes   no telephone
## 24073  36  management married  tertiary      no    2415     yes   no telephone
## 24078  36  management married  tertiary      no       0     yes   no telephone
## 24087  44 blue-collar married secondary      no    1324     yes   no telephone
##       day month duration campaign pdays previous poutcome   y y_value
## 24061  21   oct       39        1   151        3        1  no       0
## 24063  21   oct      519        1   166        1        2 yes       1
## 24065  21   oct      144        1    91        4        1 yes       1
## 24073  22   oct       73        1    86        4        2  no       0
## 24078  23   oct      140        1   143        3        1 yes       1
## 24087  25   oct      119        1    89        2        2  no       0
##       default_value loan_value
## 24061             0          0
## 24063             0          1
## 24065             0          0
## 24073             0          0
## 24078             0          0
## 24087             0          0

Building a Logistic Regression Model

For one explanatory variable

data_frame |>
  ggplot(mapping = aes(x = duration, y = y_value)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='lightpink') +
  geom_smooth(method = 'lm', color = 'lightblue', se = FALSE) +
  labs(title = "Modeling a Binary Response with OLS") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Modelling

model <- glm(y_value ~ duration, data = data_frame, family = binomial(link="logit"))
model$coefficients

##  (Intercept)     duration 
## -2.183200878  0.003314537

Applying the ‘sigmoid’ function

sigmoid <- \(x) 1 / (1 + exp(-(-2.140 +  0.0032 * x)))

data_frame |>
  ggplot(mapping = aes(x = duration, y = y_value)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='lightpink') +
  geom_function(fun = sigmoid, color = 'lightblue', linewidth = 1) +
  labs(title = "Modeling a Binary Response with Sigmoid") +
  scale_y_continuous(breaks = c(0, 0.5, 1)) +
  theme_minimal()

Summary of the model

model <- glm(y_value ~ duration, data = data_frame, family = binomial(link="logit"))
print(summary(model),show.residuals=TRUE)

## 
## Call:
## glm(formula = y_value ~ duration, family = binomial(link = "logit"), 
##     data = data_frame)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2179  -0.6587  -0.5594  -0.4753   2.0515  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.1832009  0.0480225  -45.46   <2e-16 ***
## duration     0.0033145  0.0001281   25.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8415.1  on 7841  degrees of freedom
## Residual deviance: 7569.8  on 7840  degrees of freedom
## AIC: 7573.8
## 
## Number of Fisher Scoring iterations: 4

model$coefficients

##  (Intercept)     duration 
## -2.183200878  0.003314537

Interpretation of the Coefficients

These are the estimated values for the effect each explanatory variable has on the response variable in the model.

Now when we take in a value of Duration, and place it into our linear combination, then we are returned with some “probability” between 0 (not subscribing for a term deposit) and 1 (subscribing for a term deposit).

\[ y\_value = log(\frac{p}{1-p})=-2.1832 + 0.0033 × duration \]

So, for every increase in Duration, the probabiliity that the client has subscribed to a term deposit is 1 multiplied by 0.99.

This means that for increase in duration, the probability the client does not subscribe to a term deposite goes down by about 1% (1−0.99).

The Intercept can be used to determine a 50%-probability, i.e.”decision threshold” for any variable.

\[ 0 = -2.1832 + 0.0033 * duration \]

\[ duration = \frac{2.1832}{0.0033} = 661.57 \]

So, when the duration is nearly 662 seconds there is a 50 - 50 chance that the client has subscribed to a term deposit.

For multiple explanatory variables

model <- glm(y_value ~ duration+campaign, data = data_frame, family = binomial(link="logit"))

model$coefficients

##  (Intercept)     duration     campaign 
## -1.855667798  0.003287774 -0.167450354

Interpretation of the Coefficients

\[ y\_value = log(\frac{p}{1-p})=-1.85566 + 0.0032 × duration \]

So, for every increase in Duration, the probability that the client has subscribed to a term deposit is 1 multiplied by 0.99.

This means that for increase in duration, the probability the client does not subscribe to a term deposit goes down by about 1% (1−0.99).

The Intercept can be used to determine a 50%-probability, i.e.”decision threshold” for any variable.

\[ 0 = -1.85566 + 0.0032 * duration \]

\[ duration = \frac{1.85566}{0.0032} = 579 \]

So, when the duration is nearly 579 seconds there is a 50 - 50 chance that the client has subscribed to a term deposit.

\[ y\_value = log(\frac{p}{1-p})=-1.85566 - 0.167 × campaign \]

So, for every increase in Duration, the probability that the client has subscribed to a term deposit is 1 multiplied by -1.18.

This means that for increase in the number of contacts, the probability the client does not subscribe to a term deposit increases by about 18% (1−1.18).

The Intercept can be used to determine a 50%-probability, i.e.”decision threshold” for any variable.

\[ 0 = -1.85566 - 0.167 * campaign \]

\[ campaign = \frac{1.85566}{0.0032} = -11 \]

Summary of the model

summary(model)

## 
## Call:
## glm(formula = y_value ~ duration + campaign, family = binomial(link = "logit"), 
##     data = data_frame)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.8556678  0.0642589 -28.878  < 2e-16 ***
## duration     0.0032878  0.0001284  25.614  < 2e-16 ***
## campaign    -0.1674504  0.0236412  -7.083 1.41e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8415.1  on 7841  degrees of freedom
## Residual deviance: 7512.9  on 7839  degrees of freedom
## AIC: 7518.9
## 
## Number of Fisher Scoring iterations: 4

Plotting each explanatory variable

sigmoid_1 <- \(x) 1 / (1 + exp(-(model$coefficients[1] +  model$coefficients[2] * x)))
sigmoid_2 <- \(x) 1 / (1 + exp(-(model$coefficients[1] +  model$coefficients[3] * x)))

p1 <- data_frame |>
  ggplot(mapping = aes(x = duration, y = y_value)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='purple') +
  geom_function(fun = sigmoid_1, color = 'grey', linewidth = 1) +
  labs(title = "y_value VS duration") +
  scale_y_continuous(breaks = c(0, 0.5, 1)) +
  theme_minimal()

p2 <- data_frame |>
  ggplot(mapping = aes(x = campaign, y = y_value)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3, color='pink') +
  geom_function(fun = sigmoid_2, color = 'black', linewidth = 1) +
  labs(title = "y_value VS campaign") +
  scale_y_continuous(breaks = c(0, 0.5, 1)) +
  theme_minimal()

p1+p2

Transforming Explanatory variable

T_model <- lm(y_value ~ campaign,data_frame)

T_rsquared <- summary(T_model)$r.squared
T_rsquared

## [1] 0.009117893

data_frame |> 
  ggplot(mapping = aes(x = campaign, y = y_value)) +
  geom_point(color='pink') +
  geom_smooth(method = 'lm', color = 'blue', se = FALSE) +
  labs(title = "Price vs. Price Per Sq. Ft.",
       subtitle = paste("Linear Fit R-Squared =", round(T_rsquared, 2))) +
  theme_classic()

## `geom_smooth()` using formula = 'y ~ x'

data_frame <- data_frame |>
  mutate(transformed_var = -log(sqrt(1/campaign))^2) # add new variable

test_model <- lm(y_value ~ transformed_var ,data_frame)

test_rsquared <- summary(test_model)$r.squared

data_frame |> 
  ggplot(mapping = aes(x = transformed_var, y = y_value)) +
  geom_point() +
  geom_smooth(method = 'gam', color = 'gray', linetype = 'dashed',
              se = FALSE) +
  geom_smooth(se = FALSE) +
  labs(title = "Out vs. (Price Per Sq. Ft.) ^ 2",
       subtitle = paste("Linear Fit R-Squared =", round(test_rsquared, 3))) +
  theme_classic()

## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

model_pg <- glm(y_value ~ transformed_var, data = data_frame,family = binomial(link = "logit"))
model_pg$coefficients

##     (Intercept) transformed_var 
##       -1.065613        1.124177

sigmoid <- \(x) 1 / (1 + exp(-(model_pg$coefficients[1] +  model_pg$coefficients[2] * x)))

data_frame |>
  ggplot(mapping = aes(x = transformed_var, y = y_value)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 5, color='lightblue') +
  geom_function(fun = sigmoid, color = 'blue', linewidth = 1) +
  labs(title = "Modeling a Binary Response with Sigmoid") +
  scale_y_continuous(breaks = c(0, 0.5, 1)) +
  theme_minimal()

Data Dive Week 10

PreranaRajole

2023-10-25

Loading data and libraries

Summary of data frame

Tasks for the Data Dive

Selecting a binary column of data

Selecting explanatory variables

Analysis of response and explanatory variables

Building a Logistic Regression Model

For one explanatory variable

For multiple explanatory variables

Summary of the model

Plotting each explanatory variable

Transforming Explanatory variable