system2("echo", args = "Output that would normally be lost")

#require library
required_packages <- c("GGally", "naniar", "gridExtra", "scales", "ggplot2",
                       "dplyr", "tidyr", "corrplot", "ggcorrplot", "caret",
                       "naivebayes", "pROC", "car", "DataExplorer", "knitr")

for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
#;-) Loading required package: GGally
#;-) Loading required package: ggplot2
#;-) Registered S3 method overwritten by 'GGally':
#;-)   method from   
#;-)   +.gg   ggplot2
#;-) Loading required package: naniar
#;-) Loading required package: gridExtra
#;-) Loading required package: scales
#;-) Loading required package: dplyr
#;-) 
#;-) Attaching package: 'dplyr'
#;-) The following object is masked from 'package:gridExtra':
#;-) 
#;-)     combine
#;-) The following objects are masked from 'package:stats':
#;-) 
#;-)     filter, lag
#;-) The following objects are masked from 'package:base':
#;-) 
#;-)     intersect, setdiff, setequal, union
#;-) Loading required package: tidyr
#;-) Loading required package: corrplot
#;-) corrplot 0.95 loaded
#;-) Loading required package: ggcorrplot
#;-) Loading required package: caret
#;-) Loading required package: lattice
#;-) Loading required package: naivebayes
#;-) naivebayes 1.0.0 loaded
#;-) For more information please visit:
#;-) https://majkamichal.github.io/naivebayes/
#;-) Loading required package: pROC
#;-) Type 'citation("pROC")' for a citation.
#;-) 
#;-) Attaching package: 'pROC'
#;-) The following objects are masked from 'package:stats':
#;-) 
#;-)     cov, smooth, var
#;-) Loading required package: car
#;-) Loading required package: carData
#;-) 
#;-) Attaching package: 'car'
#;-) The following object is masked from 'package:dplyr':
#;-) 
#;-)     recode
#;-) Loading required package: DataExplorer
#;-) Loading required package: knitr

#install.packages("rmarkdown")
#install.packages("knitr")
#install.packages("ggplot2")  # Install other missing libraries if needed

library(rmarkdown)
library(knitr)
library(ggplot2)



#loding date and overview 
#data_path <- "Google Drive/Hunter /ML Big data 622/assignment 1/bank-additional-full.csv"
data_path <- "/Users/ahmhamza/Google Drive/Hunter /ML Big data 622/assignment 1/bank-additional-full.csv"
df <- read.csv(data_path, sep = ";", stringsAsFactors = TRUE)

# Overview of the dataset
str(df)
#;-) 'data.frame':  41188 obs. of  21 variables:
#;-)  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
#;-)  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
#;-)  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
#;-)  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
#;-)  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
#;-)  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
#;-)  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
#;-)  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
#;-)  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
#;-)  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
#;-)  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
#;-)  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
#;-)  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
#;-)  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
#;-)  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
#;-)  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
#;-)  $ cons.price.idx: num  94 94 94 94 94 ...
#;-)  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
#;-)  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
#;-)  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
#;-)  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(df)
#;-)       age                 job            marital     
#;-)  Min.   :17.00   admin.     :10422   divorced: 4612  
#;-)  1st Qu.:32.00   blue-collar: 9254   married :24928  
#;-)  Median :38.00   technician : 6743   single  :11568  
#;-)  Mean   :40.02   services   : 3969   unknown :   80  
#;-)  3rd Qu.:47.00   management : 2924                   
#;-)  Max.   :98.00   retired    : 1720                   
#;-)                  (Other)    : 6156                   
#;-)                education        default         housing           loan      
#;-)  university.degree  :12168   no     :32588   no     :18622   no     :33950  
#;-)  high.school        : 9515   unknown: 8597   unknown:  990   unknown:  990  
#;-)  basic.9y           : 6045   yes    :    3   yes    :21576   yes    : 6248  
#;-)  professional.course: 5243                                                  
#;-)  basic.4y           : 4176                                                  
#;-)  basic.6y           : 2292                                                  
#;-)  (Other)            : 1749                                                  
#;-)       contact          month       day_of_week    duration     
#;-)  cellular :26144   may    :13769   fri:7827    Min.   :   0.0  
#;-)  telephone:15044   jul    : 7174   mon:8514    1st Qu.: 102.0  
#;-)                    aug    : 6178   thu:8623    Median : 180.0  
#;-)                    jun    : 5318   tue:8090    Mean   : 258.3  
#;-)                    nov    : 4101   wed:8134    3rd Qu.: 319.0  
#;-)                    apr    : 2632               Max.   :4918.0  
#;-)                    (Other): 2016                               
#;-)     campaign          pdays          previous            poutcome    
#;-)  Min.   : 1.000   Min.   :  0.0   Min.   :0.000   failure    : 4252  
#;-)  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000   nonexistent:35563  
#;-)  Median : 2.000   Median :999.0   Median :0.000   success    : 1373  
#;-)  Mean   : 2.568   Mean   :962.5   Mean   :0.173                      
#;-)  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000                      
#;-)  Max.   :56.000   Max.   :999.0   Max.   :7.000                      
#;-)                                                                      
#;-)   emp.var.rate      cons.price.idx  cons.conf.idx     euribor3m    
#;-)  Min.   :-3.40000   Min.   :92.20   Min.   :-50.8   Min.   :0.634  
#;-)  1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344  
#;-)  Median : 1.10000   Median :93.75   Median :-41.8   Median :4.857  
#;-)  Mean   : 0.08189   Mean   :93.58   Mean   :-40.5   Mean   :3.621  
#;-)  3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961  
#;-)  Max.   : 1.40000   Max.   :94.77   Max.   :-26.9   Max.   :5.045  
#;-)                                                                    
#;-)   nr.employed     y        
#;-)  Min.   :4964   no :36548  
#;-)  1st Qu.:5099   yes: 4640  
#;-)  Median :5191              
#;-)  Mean   :5167              
#;-)  3rd Qu.:5228              
#;-)  Max.   :5228              
#;-) 

#auto data report 
create_report(df)
#;-) processing file: report.rmd
#;-) output file: /Users/ahmhamza/Downloads/report.knit.md
#;-) /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/x86_64/pandoc +RTS -K512m -RTS /Users/ahmhamza/Downloads/report.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /Users/ahmhamza/Downloads/report.html --lua-filter /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library/rmarkdown/rmarkdown/lua/latex-div.lua --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /var/folders/m9/w8sx54nj2jz7d6t1yldp4qmc0000gn/T//Rtmp4Zwh16/rmarkdown-str33cc2709940f.html
#;-) 
#;-) Output created: report.html

#checking for missing values 
missing_vals <- colSums(is.na(df))
print("Missing values in each column:")
#;-) [1] "Missing values in each column:"
print(missing_vals)
#;-)            age            job        marital      education        default 
#;-)              0              0              0              0              0 
#;-)        housing           loan        contact          month    day_of_week 
#;-)              0              0              0              0              0 
#;-)       duration       campaign          pdays       previous       poutcome 
#;-)              0              0              0              0              0 
#;-)   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
#;-)              0              0              0              0              0 
#;-)              y 
#;-)              0


#summary stat 

numerical_vars <- c("age", "duration", "campaign", "pdays", "previous", 
                    "emp.var.rate", "cons.price.idx", "euribor3m", "nr.employed")
summary(df[, numerical_vars])
#;-)       age           duration         campaign          pdays      
#;-)  Min.   :17.00   Min.   :   0.0   Min.   : 1.000   Min.   :  0.0  
#;-)  1st Qu.:32.00   1st Qu.: 102.0   1st Qu.: 1.000   1st Qu.:999.0  
#;-)  Median :38.00   Median : 180.0   Median : 2.000   Median :999.0  
#;-)  Mean   :40.02   Mean   : 258.3   Mean   : 2.568   Mean   :962.5  
#;-)  3rd Qu.:47.00   3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.:999.0  
#;-)  Max.   :98.00   Max.   :4918.0   Max.   :56.000   Max.   :999.0  
#;-)     previous      emp.var.rate      cons.price.idx    euribor3m    
#;-)  Min.   :0.000   Min.   :-3.40000   Min.   :92.20   Min.   :0.634  
#;-)  1st Qu.:0.000   1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:1.344  
#;-)  Median :0.000   Median : 1.10000   Median :93.75   Median :4.857  
#;-)  Mean   :0.173   Mean   : 0.08189   Mean   :93.58   Mean   :3.621  
#;-)  3rd Qu.:0.000   3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:4.961  
#;-)  Max.   :7.000   Max.   : 1.40000   Max.   :94.77   Max.   :5.045  
#;-)   nr.employed  
#;-)  Min.   :4964  
#;-)  1st Qu.:5099  
#;-)  Median :5191  
#;-)  Mean   :5167  
#;-)  3rd Qu.:5228  
#;-)  Max.   :5228


## cor for numerical feauters 
numeric_cols <- df %>% select_if(is.numeric)
cor_matrix <- cor(numeric_cols, use = "pairwise.complete.obs")
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", 
         tl.srt = 60, col = colorRampPalette(c("blue", "white", "red"))(200),
         addCoef.col = "black", number.cex = 0.6)


#distri of numeric vars
numeric_cols %>% 
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>% 
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Numeric Variables")




#boxplot outliers 
ggplot(stack(df[, numerical_vars]), aes(x = ind, y = values)) +
  geom_boxplot(fill = "lightgreen") +
  coord_flip() +
  ggtitle("Outlier Detection for Numeric Features")



#central tendency stat 
df %>%
  select(all_of(numerical_vars)) %>%
  summarise(across(everything(), list(mean = mean, sd = sd, median = median), na.rm = TRUE)) %>%
  pivot_longer(everything(), names_to = "stat", values_to = "value") %>%
  separate(stat, into = c("variable", "stat"), sep = "_") %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  knitr::kable()
#;-) Warning: There was 1 warning in `summarise()`.
#;-) ℹ In argument: `across(...)`.
#;-) Caused by warning:
#;-) ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
#;-) Supply arguments directly to `.fns` through an anonymous function instead.
#;-) 
#;-)   # Previously
#;-)   across(a:b, mean, na.rm = TRUE)
#;-) 
#;-)   # Now
#;-)   across(a:b, \(x) mean(x, na.rm = TRUE))

variable	mean	sd	median
age	40.0240604	10.4212500	38.000
duration	258.2850102	259.2792488	180.000
campaign	2.5675925	2.7700135	2.000
pdays	962.4754540	186.9109073	999.000
previous	0.1729630	0.4949011	0.000
emp.var.rate	0.0818855	1.5709597	1.100
cons.price.idx	93.5756644	0.5788400	93.749
euribor3m	3.6212908	1.7344474	4.857
nr.employed	5167.0359109	72.2515277	5191.000



#categorical vars 
categorical_vars <- c("job", "marital", "education", "default", "housing", 
                      "loan", "contact", "month", "day_of_week", "poutcome")

df_cat <- df %>% 
  select(all_of(categorical_vars)) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")

df_summary <- df_cat %>%
  group_by(variable, value) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(variable) %>%
  mutate(percent = count / sum(count)) %>%
  ungroup()

# Bar chart for categorical variables
ggplot(df_summary, aes(x = value, y = count)) +
  geom_bar(stat = "identity", fill = "dodgerblue", color = "black") +
  geom_text(aes(label = scales::percent(percent, accuracy = 1)), 
            vjust = -0.5, size = 2) +
  facet_wrap(~ variable, scales = "free_x") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Frequency Distribution of Categorical Variables",
       x = "Category", y = "Frequency")










#data preprocessing

unknown_vars <- c("job", "marital", "education", "default", "housing", "loan", "contact")
df <- df %>%
  mutate(across(all_of(unknown_vars), ~ ifelse(. == "unknown", NA, .)))

# Function to calculate mode
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# Impute for numerical variables (using median)
for (var in numerical_vars) {
  med_val <- median(df[[var]], na.rm = TRUE)
  df[[var]][is.na(df[[var]])] <- med_val
}

# Impute for categorical variables (using mode)
for (var in categorical_vars) {
  mode_val <- getmode(df[[var]])
  df[[var]][is.na(df[[var]])] <- mode_val
}

# Convert categorical variables to factors
df[categorical_vars] <- lapply(df[categorical_vars], factor)


#splitting the data 
set.seed(1234)
train_index <- createDataPartition(df$y, p = 0.8, list = FALSE)
train <- df[train_index, ]
test  <- df[-train_index, ]


#building model ---------- 

#logistic reg 

logit_model <- glm(y ~ ., data = train, family = binomial)
pred_logit_prob <- predict(logit_model, test, type = "response")
pred_logit_class <- ifelse(pred_logit_prob > 0.5, "yes", "no")
conf_matrix_logit <- table(Predicted = pred_logit_class, Actual = test$y)

print("Logistic Regression Confusion Matrix:")
#;-) [1] "Logistic Regression Confusion Matrix:"
print(conf_matrix_logit)
#;-)          Actual
#;-) Predicted   no  yes
#;-)       no  7111  515
#;-)       yes  198  413

misclass_logit <- mean(pred_logit_class != test$y)
print(paste("Logistic Regression Misclassification Error:", misclass_logit))
#;-) [1] "Logistic Regression Misclassification Error: 0.0865606410100765"

# Check for multicollinearity
vif(logit_model)
#;-)                      GVIF Df GVIF^(1/(2*Df))
#;-) age              2.256226  1        1.502074
#;-) job              5.188047 10        1.085801
#;-) marital          1.461178  2        1.099451
#;-) education        2.843305  6        1.090985
#;-) default          1.000001  1        1.000000
#;-) housing          1.012374  1        1.006168
#;-) loan             1.006373  1        1.003181
#;-) contact          2.309911  1        1.519839
#;-) month           63.447258  9        1.259314
#;-) day_of_week      1.067990  4        1.008256
#;-) duration         1.233953  1        1.110834
#;-) campaign         1.053757  1        1.026527
#;-) pdays           11.159807  1        3.340630
#;-) previous         4.514174  1        2.124659
#;-) poutcome        24.842833  2        2.232545
#;-) emp.var.rate   144.022282  1       12.000928
#;-) cons.price.idx  67.940605  1        8.242609
#;-) cons.conf.idx    5.337276  1        2.310255
#;-) euribor3m      135.344539  1       11.633767
#;-) nr.employed    170.604033  1       13.061548


#naive bayes 
nb_model <- naive_bayes(y ~ ., data = train)
#;-) Warning: naive_bayes(): Feature default - zero probabilities are present.
#;-) Consider Laplace smoothing.
print("Naive Bayes Model Summary:")
#;-) [1] "Naive Bayes Model Summary:"
print(nb_model)
#;-) 
#;-) ================================= Naive Bayes ==================================
#;-) 
#;-) Call:
#;-) naive_bayes.formula(formula = y ~ ., data = train)
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-)  
#;-) Laplace smoothing: 0
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-)  
#;-) A priori probabilities: 
#;-) 
#;-)        no       yes 
#;-) 0.8873479 0.1126521 
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-)  
#;-) Tables: 
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-) :: age (Gaussian) 
#;-) -------------------------------------------------------------------------------- 
#;-)       
#;-) age           no       yes
#;-)   mean 39.912548 41.015625
#;-)   sd    9.889584 13.897216
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-) :: job (Categorical) 
#;-) -------------------------------------------------------------------------------- 
#;-)     
#;-) job          no        yes
#;-)   1  0.25459147 0.29391164
#;-)   2  0.23697801 0.13981681
#;-)   3  0.03591094 0.02532328
#;-)   4  0.02609528 0.02505388
#;-)   5  0.07127467 0.07219828
#;-)   6  0.03556893 0.09482759
#;-)   7  0.03444030 0.03286638
#;-)   8  0.10024283 0.07004310
#;-)   9  0.01583501 0.05818966
#;-)   10 0.16556654 0.15544181
#;-)   11 0.02349602 0.03232759
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-) :: marital (Categorical) 
#;-) -------------------------------------------------------------------------------- 
#;-)        
#;-) marital        no       yes
#;-)       1 0.1128972 0.1066810
#;-)       2 0.6141113 0.5449892
#;-)       3 0.2729916 0.3483297
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-) :: education (Categorical) 
#;-) -------------------------------------------------------------------------------- 
#;-)          
#;-) education          no         yes
#;-)         1 0.101645063 0.096174569
#;-)         2 0.058415130 0.040948276
#;-)         3 0.152741202 0.102370690
#;-)         4 0.232224084 0.221443966
#;-)         5 0.000342009 0.001077586
#;-)         6 0.127466740 0.127963362
#;-)         7 0.327165772 0.410021552
#;-) 
#;-) -------------------------------------------------------------------------------- 
#;-) :: default (Bernoulli) 
#;-) -------------------------------------------------------------------------------- 
#;-)        
#;-) default           no          yes
#;-)       1 0.9998973973 1.0000000000
#;-)       3 0.0001026027 0.0000000000
#;-) 
#;-) --------------------------------------------------------------------------------
#;-) 
#;-) # ... and 15 more tables
#;-) 
#;-) --------------------------------------------------------------------------------

pred_nb_prob <- predict(nb_model, test, type = "prob")
#;-) Warning: predict.naive_bayes(): more features in the newdata are provided as
#;-) there are probability tables in the object. Calculation is performed based on
#;-) features to be found in the tables.
head(pred_nb_prob)
#;-)             no          yes
#;-) [1,] 0.9999475 5.249120e-05
#;-) [2,] 0.9999710 2.902459e-05
#;-) [3,] 0.9998522 1.477800e-04
#;-) [4,] 0.9999676 3.243193e-05
#;-) [5,] 0.9999001 9.991972e-05
#;-) [6,] 0.9997024 2.975520e-04

pred_nb_class <- predict(nb_model, test)
#;-) Warning: predict.naive_bayes(): more features in the newdata are provided as
#;-) there are probability tables in the object. Calculation is performed based on
#;-) features to be found in the tables.
conf_matrix_nb <- table(Predicted = pred_nb_class, Actual = test$y)

print("Naive Bayes Confusion Matrix:")
#;-) [1] "Naive Bayes Confusion Matrix:"
print(conf_matrix_nb)
#;-)          Actual
#;-) Predicted   no  yes
#;-)       no  6514  357
#;-)       yes  795  571

misclass_nb <- mean(pred_nb_class != test$y)
print(paste("Naive Bayes Misclassification Error:", misclass_nb))
#;-) [1] "Naive Bayes Misclassification Error: 0.13985674396018"

Session info

Because session_info is TRUE, the rendered result includes session info, even though no such code is included here in the source document.