Code
bank_trn_data <- read.csv("bank_trn.csv")Group Members : Utsav Raj, Rohit Menon, Satarupa Saha, Mounish Sunkara
bank_trn_data <- read.csv("bank_trn.csv")summ <- summary(bank_trn_data)
#knitr::kable(summary(bank_trn_data), "simple")
knitr::kable(summ, align = "lccrr", caption = "Bank Data Summary")| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :17.00 | Length:37061 | Length:37061 | Length:37061 | Length:37061 | Length:37061 | Length:37061 | Length:37061 | Length:37061 | Length:37061 | Min. : 0.0 | Min. : 1.000 | Min. : 0.0 | Min. :0.0000 | Length:37061 | Min. :-3.40000 | Min. :92.20 | Min. :-50.80 | Min. :0.634 | Min. :4964 | Length:37061 | |
| 1st Qu.:32.00 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 102.0 | 1st Qu.: 1.000 | 1st Qu.:999.0 | 1st Qu.:0.0000 | Class :character | 1st Qu.:-1.80000 | 1st Qu.:93.08 | 1st Qu.:-42.70 | 1st Qu.:1.344 | 1st Qu.:5099 | Class :character | |
| Median :38.00 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median : 180.0 | Median : 2.000 | Median :999.0 | Median :0.0000 | Mode :character | Median : 1.10000 | Median :93.75 | Median :-41.80 | Median :4.857 | Median :5191 | Mode :character | |
| Mean :40.04 | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mean : 258.5 | Mean : 2.565 | Mean :962.2 | Mean :0.1738 | NA | Mean : 0.08039 | Mean :93.58 | Mean :-40.49 | Mean :3.620 | Mean :5167 | NA | |
| 3rd Qu.:47.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.: 320.0 | 3rd Qu.: 3.000 | 3rd Qu.:999.0 | 3rd Qu.:0.0000 | NA | 3rd Qu.: 1.40000 | 3rd Qu.:93.99 | 3rd Qu.:-36.40 | 3rd Qu.:4.961 | 3rd Qu.:5228 | NA | |
| Max. :98.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | Max. :4918.0 | Max. :56.000 | Max. :999.0 | Max. :6.0000 | NA | Max. : 1.40000 | Max. :94.77 | Max. :-26.90 | Max. :5.045 | Max. :5228 | NA |
Checking missing and duplicate values
colSums(is.na(bank_trn_data)) age job marital education default
0 0 0 0 0
housing loan contact month day_of_week
0 0 0 0 0
duration campaign pdays previous poutcome
0 0 0 0 0
emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
0 0 0 0 0
subscribed
0
bank_trn_data_fnl <-unique(bank_trn_data)We have 9 duplicate records.
Plotting the Graphs
par(mfrow = c(3,2))
hist(bank_trn_data_fnl$age, main = "Distribution of Age", xlab = "Age")
hist(bank_trn_data_fnl$duration, main = "Distribution of Duration", xlab = "Duration")
barplot(table(bank_trn_data_fnl$default), main = "Frequency of Default Status")
barplot(table(bank_trn_data_fnl$marital), main = "Marital Status")
barplot(table(bank_trn_data_fnl$loan), main = "Frequency by Loan Status")
barplot(table(bank_trn_data_fnl$contact), main = "Distribution by Contact Status")par(mfrow = c(2,2))
barplot(table(bank_trn_data_fnl$pdays), main = "Distribution by PDays")
barplot(table(bank_trn_data_fnl$previous), main = "Distribution by Previous")
#bank_trn_data_fnl %>% group_by(pdays) %>% count()
hist(bank_trn_data_fnl$nr_employed, main = "Distribution of Number Employed", xlab = "Number of Employees")
barplot(table(bank_trn_data_fnl$subscribed), main = "Frequency by Subscribed Status")Checking correlation on all numeric variables
# Look at correlations between numeric features
num <- sapply(bank_trn_data_fnl, FUN = is.numeric)
corx <- cor(bank_trn_data_fnl[, num], use="pairwise", method="spearman")
# Visualize correlations
corrplot(corx, method = "color", order = "FPC", type = "lower",addCoef.col = "black",number.cex = 0.65, diag = TRUE)High Correlation between emp_var_rate with nr_emolpyed and euribor3m and b/w euribor3m and nr_emolpyed.
3 variables to choose in terms of degree of association to the response.
bank_trn_data_fnl$subscribed <- as.factor(bank_trn_data_fnl$subscribed)
full_model <- glm(subscribed ~ ., data = bank_trn_data_fnl,family = binomial(link = "logit") )
summary(full_model)
Call:
glm(formula = subscribed ~ ., family = binomial(link = "logit"),
data = bank_trn_data_fnl)
Deviance Residuals:
Min 1Q Median 3Q Max
-6.0002 -0.2991 -0.1858 -0.1339 3.3550
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.978e+02 4.040e+01 -4.896 9.76e-07 ***
age 8.825e-04 2.564e-03 0.344 0.730658
jobblue-collar -2.040e-01 8.420e-02 -2.423 0.015407 *
jobentrepreneur -1.333e-01 1.327e-01 -1.005 0.314813
jobhousemaid 7.199e-02 1.515e-01 0.475 0.634756
jobmanagement 1.335e-02 8.940e-02 0.149 0.881328
jobretired 3.641e-01 1.123e-01 3.242 0.001186 **
jobself-employed -8.727e-02 1.237e-01 -0.706 0.480367
jobservices -1.073e-01 9.107e-02 -1.179 0.238568
jobstudent 2.216e-01 1.190e-01 1.862 0.062630 .
jobtechnician 3.994e-02 7.491e-02 0.533 0.593883
jobunemployed 1.768e-02 1.364e-01 0.130 0.896927
jobunknown 7.241e-02 2.409e-01 0.301 0.763750
maritalmarried 5.464e-03 7.212e-02 0.076 0.939606
maritalsingle 8.122e-02 8.258e-02 0.983 0.325362
maritalunknown 6.766e-02 4.204e-01 0.161 0.872146
educationbasic.6y 1.274e-01 1.269e-01 1.004 0.315296
educationbasic.9y 1.791e-02 1.003e-01 0.178 0.858341
educationhigh.school 8.749e-02 9.672e-02 0.905 0.365721
educationilliterate 1.075e+00 7.537e-01 1.426 0.153938
educationprofessional.course 1.068e-01 1.066e-01 1.001 0.316788
educationuniversity.degree 2.160e-01 9.696e-02 2.228 0.025890 *
educationunknown 1.307e-01 1.259e-01 1.039 0.299028
defaultunknown -3.039e-01 7.098e-02 -4.281 1.86e-05 ***
defaultyes -7.330e+00 1.134e+02 -0.065 0.948460
housingunknown -9.865e-02 1.472e-01 -0.670 0.502716
housingyes 7.738e-03 4.361e-02 0.177 0.859161
loanunknown NA NA NA NA
loanyes -1.077e-01 6.134e-02 -1.756 0.079027 .
contacttelephone -6.229e-01 8.032e-02 -7.755 8.81e-15 ***
monthaug 7.853e-01 1.276e-01 6.154 7.58e-10 ***
monthdec 2.869e-01 2.227e-01 1.288 0.197634
monthjul 1.423e-01 1.015e-01 1.403 0.160719
monthjun -3.999e-01 1.331e-01 -3.004 0.002667 **
monthmar 1.910e+00 1.520e-01 12.567 < 2e-16 ***
monthmay -4.793e-01 8.711e-02 -5.502 3.76e-08 ***
monthnov -4.498e-01 1.276e-01 -3.525 0.000424 ***
monthoct 1.126e-01 1.626e-01 0.692 0.488688
monthsep 2.619e-01 1.892e-01 1.384 0.166326
day_of_weekmon -1.118e-01 6.981e-02 -1.602 0.109257
day_of_weekthu 5.488e-02 6.768e-02 0.811 0.417466
day_of_weektue 1.316e-01 6.918e-02 1.902 0.057151 .
day_of_weekwed 1.585e-01 6.949e-02 2.280 0.022589 *
duration 4.693e-03 7.855e-05 59.750 < 2e-16 ***
campaign -3.689e-02 1.204e-02 -3.064 0.002187 **
pdays -9.286e-04 2.270e-04 -4.091 4.29e-05 ***
previous -4.340e-02 6.301e-02 -0.689 0.490969
poutcomenonexistent 4.428e-01 9.999e-02 4.429 9.48e-06 ***
poutcomesuccess 9.704e-01 2.212e-01 4.386 1.15e-05 ***
emp_var_rate -1.621e+00 1.505e-01 -10.776 < 2e-16 ***
cons_price_idx 1.924e+00 2.666e-01 7.219 5.23e-13 ***
cons_conf_idx 1.589e-02 8.163e-03 1.947 0.051554 .
euribor3m 3.756e-01 1.369e-01 2.743 0.006079 **
nr_employed 2.633e-03 3.279e-03 0.803 0.421975
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 26093 on 37060 degrees of freedom
Residual deviance: 15383 on 37008 degrees of freedom
AIC: 15489
Number of Fisher Scoring iterations: 10