## Load the libraries

# Load required packages
library(tibble)
library(skimr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart)
library(rpart.plot)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(vip) # for feature importance
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## 1. Data Understanding

# 1. Load and inspect the dataset (IDA)
data <- read.csv("Data_Science_Social_Network_Ads_and_purchase_Data.csv", header = TRUE)

# Attach the data frame so columns can be referenced by bare name below
# (note: later code relies on this attached copy of the ORIGINAL data)
attach(data)

#convert dataframe to a tibble for readability
data <- as_tibble(data)

#view data
view(data)

# Check the first and last few rows
head(data) # top 6 rows
## # A tibble: 6 × 5
##    User.ID Gender   Age EstimatedSalary Purchased
##      <int> <chr>  <int>           <int>     <int>
## 1 15624510 Male      19           19000         0
## 2 15810944 Male      35           20000         0
## 3 15668575 Female    26           43000         0
## 4 15603246 Female    27           57000         0
## 5 15804002 Male      19           76000         0
## 6 15728773 Male      27           58000         0
tail(data) # bottom 6 rows
## # A tibble: 6 × 5
##    User.ID Gender   Age EstimatedSalary Purchased
##      <int> <chr>  <int>           <int>     <int>
## 1 15757632 Female    39           59000         0
## 2 15691863 Female    46           41000         1
## 3 15706071 Male      51           23000         1
## 4 15654296 Female    50           20000         1
## 5 15755018 Male      36           33000         0
## 6 15594041 Female    49           36000         1
# Check dimensions and structure
dim(data)
## [1] 400   5
str(data)
## tibble [400 × 5] (S3: tbl_df/tbl/data.frame)
##  $ User.ID        : int [1:400] 15624510 15810944 15668575 15603246 15804002 15728773 15598044 15694829 15600575 15727311 ...
##  $ Gender         : chr [1:400] "Male" "Male" "Female" "Female" ...
##  $ Age            : int [1:400] 19 35 26 27 19 27 27 32 25 35 ...
##  $ EstimatedSalary: int [1:400] 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
##  $ Purchased      : int [1:400] 0 0 0 0 0 0 0 1 0 0 ...
# Check for duplicate rows (this compares whole rows, not just User.ID)
print(paste0("The dataset contains ", sum(duplicated(data)), " duplicated values"))
## [1] "The dataset contains 0 duplicated values"
# Remove User.ID (the unique identifier is not predictive)
data <- data[, -1]

print('**********************************')
## [1] "**********************************"
# Check data structure (str() gives a similar view)
glimpse(data)
## Rows: 400
## Columns: 4
## $ Gender          <chr> "Male", "Male", "Female", "Female", "Male", "Male", "F…
## $ Age             <int> 19, 35, 26, 27, 19, 27, 27, 32, 25, 35, 26, 26, 20, 32…
## $ EstimatedSalary <int> 19000, 20000, 43000, 57000, 76000, 58000, 84000, 15000…
## $ Purchased       <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
print('**********************************')
## [1] "**********************************"
# Concise statistical summary
summary(data) # numeric columns get five-number summaries; Gender stays character
##     Gender               Age        EstimatedSalary    Purchased     
##  Length:400         Min.   :18.00   Min.   : 15000   Min.   :0.0000  
##  Class :character   1st Qu.:29.75   1st Qu.: 43000   1st Qu.:0.0000  
##  Mode  :character   Median :37.00   Median : 70000   Median :0.0000  
##                     Mean   :37.66   Mean   : 69743   Mean   :0.3575  
##                     3rd Qu.:46.00   3rd Qu.: 88000   3rd Qu.:1.0000  
##                     Max.   :60.00   Max.   :150000   Max.   :1.0000
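skimr is loaded above but never used; as an optional alternative, skim() gives a richer per-variable summary (missing counts, quantiles, inline histograms) in one call:

# Optional richer summary via the already-loaded skimr package
skim(data)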

## 2. Data Preparation

### 1. Data cleaning

# Work on a copy so the original dataset stays intact
data1 <- data

# Check for missing (NA) values
print(paste0("The dataset contains ", sum(is.na(data1)), " null values"))
## [1] "The dataset contains 0 null values"
# Check for outliers using boxplots (a numeric cross-check follows below)
boxplot(EstimatedSalary, main = "Box Plot of Estimated Salary", ylab = "Values")

boxplot(Age, main = "Box Plot of Age", ylab = "Values")
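The boxplots flag outliers visually; as a numeric cross-check, here is a small sketch that counts points outside the 1.5 * IQR fences, the same whisker rule boxplot uses by default. count_outliers is an ad-hoc helper, not part of the original analysis.

# Sketch: count values beyond the 1.5 * IQR fences (boxplot's default rule)
count_outliers <- function(x) {
  q <- quantile(x, c(0.25, 0.75))
  fence <- 1.5 * IQR(x)
  sum(x < q[1] - fence | x > q[2] + fence)
}
count_outliers(data1$Age)
count_outliers(data1$EstimatedSalary)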

### 2. Exploratory Data Analysis

a. Univariate analysis

# Univariate analysis: Purchased variable
# Frequency table
table(Purchased)
## Purchased
##   0   1 
## 257 143
# Get counts and percentages
purchased_counts <- table(Purchased)
percentages <- round(prop.table(purchased_counts) * 100, 1)  # Calculate percentages
# Create labels
labels <- paste0(c("Not Purchased (0)", "Purchased (1)"), "\n", 
                 " (", percentages, "%)")
# Plot pie chart
pie(purchased_counts,
    labels = labels,
    main = "Distribution of Purchased",
    col = c("#9ACBD0", "#48A6A7"),
    border = "#F2EFE7")

print("***********************************************")
## [1] "***********************************************"
# Univariate analysis: Gender variable
# Frequency table
table(Gender)
## Gender
## Female   Male 
##    204    196
# Get counts and percentages
gender_counts <- table(Gender)
gender_pct <- round(prop.table(gender_counts) * 100, 1)  # Calculate percentages
# Create labels
labels1 <- paste0(c("Female", "Male"), "\n", gender_counts,
                  " (", gender_pct, "%)")
# Plot pie chart
pie(gender_counts,
    labels = labels1,
    main = "Gender distribution",
    col = c("lightgreen", "#ACC572"),
    border = "white")

print("*********************************")
## [1] "*********************************"
# Univariate analysis: distributions of the numerical variables
par(mfrow = c(1, 2)) # two histograms side by side
hist(Age, main = "Distribution of Age", xlab = "Age", col = "skyblue")
hist(EstimatedSalary, main = "Distribution of Salary", xlab = "Salary", col = "darkgreen")

b. Bivariate analysis

# Gender vs. Purchased bar chart
ggplot(data1, aes(x = factor(Gender), fill = factor(Purchased))) +
  geom_bar(position = "dodge") +
  labs(title = "Gender vs. Purchases", x = "Gender", fill = "Purchased")

# Estimated salary vs. age scatterplot
ggplot(data1, aes(x = Age, y = EstimatedSalary)) +
  geom_point(color = "darkblue") +
  labs(title = "Salary vs. Age", x = "Age", y = "Salary")

print("*********************************************")
## [1] "*********************************************"
# Correlation matrix
# Temporarily encode Gender as numeric (restored to a factor in preprocessing)
data1$Gender <- ifelse(data$Gender == "Male", 1, 0)

#compute corr matrix
corr_matrix <- cor(data1[, c("EstimatedSalary", "Age", "Purchased", "Gender")])
print(corr_matrix)
##                 EstimatedSalary         Age   Purchased      Gender
## EstimatedSalary      1.00000000  0.15523802  0.36208303 -0.06043469
## Age                  0.15523802  1.00000000  0.62245420 -0.07374133
## Purchased            0.36208303  0.62245420  1.00000000 -0.04246946
## Gender              -0.06043469 -0.07374133 -0.04246946  1.00000000
# Visualize the correlation matrix
corrplot(corr_matrix, method = "color", type = "upper", order = "hclust",
         addCoef.col = "black", number.cex = 0.7, insig = "blank",
         number.digits = 2, tl.col = "black", tl.srt = 45)

c. Multivariate analysis

# Estimated salary vs. age by Gender scatterplot
ggplot(data1, aes(x = Age, y = EstimatedSalary, color = factor(Gender))) +
  geom_point() +
  labs(title = "Salary vs Age by Gender", x = "Age", y = "Salary")

# Estimated salary vs. age by Purchased scatterplot
ggplot(data1, aes(x = Age, y = EstimatedSalary, color = factor(Purchased))) +
  geom_point() +
  labs(title = "Salary vs Age by Purchased", x = "Age", y = "Salary")

### 3. Data Preprocessing

# Convert categorical variables to factors
# (Gender and Purchased resolve to the attached original data, which also
#  restores the character Gender that was overwritten with 0/1 above)
data1$Gender <- as.factor(Gender)
data1$Purchased <- as.factor(Purchased)

#Confirm the conversion
str(data1)
## tibble [400 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Gender         : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 2 1 ...
##  $ Age            : int [1:400] 19 35 26 27 19 27 27 32 25 35 ...
##  $ EstimatedSalary: int [1:400] 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
##  $ Purchased      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
# Splitting data into training (80%) and testing (20%) sets
set.seed(123) # For reproducibility
index <- createDataPartition(data1$Purchased, p = 0.8, list = FALSE)
train <- data1[index, ]
test <- data1[-index, ]

#confirm the splitting
dim(train)
## [1] 321   4
dim(test)
## [1] 79  4

### 4. Summary statistics for the training and testing sets

# Summary statistics for both sets (skim() would produce a similar result)

# Create combined summary table
merged_table <- bind_rows(
  # Training set summary
  train %>%
    summarize(
      Dataset = "Training Set",
      Age_mean = mean(Age),
      Age_sd = sd(Age),
      Salary_mean = mean(EstimatedSalary),
      Salary_sd = sd(EstimatedSalary),
      Male = sum(Gender == "Male"),
      Female = sum(Gender == "Female")
    ),
  # Test set summary
  test %>%
    summarize(
      Dataset = "Test Set",
      Age_mean = mean(Age),
      Age_sd = sd(Age),
      Salary_mean = mean(EstimatedSalary),
      Salary_sd = sd(EstimatedSalary),
      Male = sum(Gender == "Male"),
      Female = sum(Gender == "Female")
    )
) %>%
  select(Dataset, everything()) %>%
  mutate(across(where(is.numeric), ~ round(., 2)))

# View formatted table
merged_table
## # A tibble: 2 × 7
##   Dataset      Age_mean Age_sd Salary_mean Salary_sd  Male Female
##   <chr>           <dbl>  <dbl>       <dbl>     <dbl> <dbl>  <dbl>
## 1 Training Set     37.3   10.5      71252.    34733.   158    163
## 2 Test Set         39.0   10.4      63608.    30829.    38     41
# Purchased variable summary statistics after splitting
train_summary_purchased <- table(train$Purchased)
test_summary_purchased <- table(test$Purchased)
train_summary_purchased 
## 
##   0   1 
## 206 115
test_summary_purchased
## 
##  0  1 
## 51 28
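Because createDataPartition stratifies on the outcome, the class balance should be preserved across the splits; a quick check as proportions:

# Stratification check: class proportions should be similar in both splits
round(prop.table(train_summary_purchased), 3)
round(prop.table(test_summary_purchased), 3)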

## 3. Modeling

# Model 1: Logistic Regression
lr_model1 <- glm(Purchased ~ ., data = train, family = "binomial")
summary(lr_model1)
## 
## Call:
## glm(formula = Purchased ~ ., family = "binomial", data = train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.172e+01  1.389e+00  -8.441  < 2e-16 ***
## GenderMale       2.526e-01  3.345e-01   0.755     0.45    
## Age              2.169e-01  2.735e-02   7.931 2.17e-15 ***
## EstimatedSalary  3.400e-05  5.733e-06   5.931 3.01e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 418.85  on 320  degrees of freedom
## Residual deviance: 228.37  on 317  degrees of freedom
## AIC: 236.37
## 
## Number of Fisher Scoring iterations: 6
# Odds ratios (exponentiated coefficients)
exp(coef(lr_model1))
##     (Intercept)      GenderMale             Age EstimatedSalary 
##    8.095921e-06    1.287423e+00    1.242257e+00    1.000034e+00
print("***********************************************")
## [1] "***********************************************"
# 95% confidence intervals for the odds ratios
exp(confint(lr_model1))
## Waiting for profiling to be done...
##                        2.5 %       97.5 %
## (Intercept)     4.110395e-07 9.796939e-05
## GenderMale      6.711233e-01 2.502182e+00
## Age             1.182443e+00 1.317013e+00
## EstimatedSalary 1.000023e+00 1.000046e+00
# Predict and evaluate
log_probs <- predict(lr_model1, test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0) %>% factor(levels = c(0, 1))
# Confusion Matrix and Metrics
log_cm <- confusionMatrix(log_pred, test$Purchased)
log_auc <- auc(roc(test$Purchased, log_probs))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
log_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 46  6
##          1  5 22
##                                           
##                Accuracy : 0.8608          
##                  95% CI : (0.7645, 0.9284)
##     No Information Rate : 0.6456          
##     P-Value [Acc > NIR] : 1.657e-05       
##                                           
##                   Kappa : 0.6933          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9020          
##             Specificity : 0.7857          
##          Pos Pred Value : 0.8846          
##          Neg Pred Value : 0.8148          
##              Prevalence : 0.6456          
##          Detection Rate : 0.5823          
##    Detection Prevalence : 0.6582          
##       Balanced Accuracy : 0.8438          
##                                           
##        'Positive' Class : 0               
## 
log_auc
## Area under the curve: 0.9524
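Note that caret treats the first factor level ("0") as the positive class, so the sensitivity above measures how well non-purchasers are detected. If the purchase event is of interest, the positive class can be switched; a minimal sketch:

# Re-report the metrics with class "1" (purchased) as the positive class
confusionMatrix(log_pred, test$Purchased, positive = "1")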
# Model 2: Logistic Regression with balanced data

# Check original class distribution
table(train$Purchased)
## 
##   0   1 
## 206 115
# Oversample the minority class (upSample resamples it with replacement)
# Set seed for reproducibility
set.seed(123)

# Get column index dynamically 
purchased_col <- which(names(train) == "Purchased")

train_balanced <- upSample(
  x = train[, -purchased_col],  
  y = train$Purchased,          
  yname = "Purchased"  
)

#confirm the oversampling worked
print(table(train_balanced$Purchased))
## 
##   0   1 
## 206 206
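upSample balances by resampling minority-class rows with replacement; caret's downSample is the mirror-image option, discarding majority-class rows instead. A sketch for comparison only (train_down is not used below):

# Alternative, not used below: down-sample the majority class instead
set.seed(123)
train_down <- downSample(
  x = train[, -purchased_col],
  y = train$Purchased,
  yname = "Purchased"
)
table(train_down$Purchased) # both classes reduced to the minority count (115)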
# 1. Build Logistic Regression Model with Balanced Data
lr_model2 <- glm(Purchased ~ .,
                 data = train_balanced,
                 family = "binomial")

summary(lr_model2)
## 
## Call:
## glm(formula = Purchased ~ ., family = "binomial", data = train_balanced)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.259e+01  1.351e+00  -9.314  < 2e-16 ***
## GenderMale       2.686e-01  2.939e-01   0.914    0.361    
## Age              2.419e-01  2.643e-02   9.150  < 2e-16 ***
## EstimatedSalary  3.934e-05  5.519e-06   7.127 1.02e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 571.15  on 411  degrees of freedom
## Residual deviance: 289.77  on 408  degrees of freedom
## AIC: 297.77
## 
## Number of Fisher Scoring iterations: 6
# Odds ratios (exponentiated coefficients)
exp(coef(lr_model2))
##     (Intercept)      GenderMale             Age EstimatedSalary 
##    3.412025e-06    1.308110e+00    1.273625e+00    1.000039e+00
print("***********************************************")
## [1] "***********************************************"
# 95% confidence intervals for the odds ratios
exp(confint(lr_model2))
## Waiting for profiling to be done...
##                        2.5 %       97.5 %
## (Intercept)     1.916413e-07 0.0000391967
## GenderMale      7.371774e-01 2.3398311592
## Age             1.213986e+00 1.3471064581
## EstimatedSalary 1.000029e+00 1.0000508495

Observation: with 95% confidence, each one-unit increase in EstimatedSalary multiplies the odds of purchase by a factor between about 1.00003 and 1.00005, holding the other predictors constant.
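Because a one-unit (one currency unit) change is tiny, scaling the coefficient makes the effect easier to read; a hypothetical illustration for a 10,000-unit salary increase:

# Illustration: odds multiplier for a 10,000-unit rise in EstimatedSalary
b_salary <- coef(lr_model2)["EstimatedSalary"]
exp(10000 * b_salary) # exp(10000 * 3.934e-05), roughly 1.48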

# Make predictions on the test set
lr_probs <- predict(lr_model2, newdata = test, type = "response")
lr_pred <- factor(ifelse(lr_probs > 0.5, 1, 0),
                  levels = levels(test$Purchased))


lr_cm2 <- confusionMatrix(lr_pred, test$Purchased)
lr_auc2 <- auc(roc(test$Purchased, lr_probs))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lr_cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 45  3
##          1  6 25
##                                           
##                Accuracy : 0.8861          
##                  95% CI : (0.7947, 0.9466)
##     No Information Rate : 0.6456          
##     P-Value [Acc > NIR] : 1.161e-06       
##                                           
##                   Kappa : 0.7569          
##                                           
##  Mcnemar's Test P-Value : 0.505           
##                                           
##             Sensitivity : 0.8824          
##             Specificity : 0.8929          
##          Pos Pred Value : 0.9375          
##          Neg Pred Value : 0.8065          
##              Prevalence : 0.6456          
##          Detection Rate : 0.5696          
##    Detection Prevalence : 0.6076          
##       Balanced Accuracy : 0.8876          
##                                           
##        'Positive' Class : 0               
## 
lr_auc2
## Area under the curve: 0.9517
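The 0.5 cutoff used above is only a default. pROC can suggest the cutoff that maximizes Youden's J on the test ROC; a sketch (the chosen threshold would itself need validation, ideally on a separate set):

# Sketch: cutoff maximizing Youden's J (sensitivity + specificity - 1)
roc_bal <- roc(test$Purchased, lr_probs)
coords(roc_bal, x = "best", ret = c("threshold", "sensitivity", "specificity"))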
# Feature importance for the original logistic regression (lr_model1)
vip(lr_model1) + ggtitle("Feature Importance")

### Model 3: Decision Tree

#Create the decision tree model
tree_model <- rpart(Purchased ~ ., data = train, method = "class")

#visualize the model
rpart.plot(tree_model)

#model prediction
tree_pred <- predict(tree_model, test, type = "class")
tree_probs <- predict(tree_model, test, type = "prob")[, 2]
#Confusion Matrix and Metrics
tree_cm <- confusionMatrix(tree_pred, test$Purchased)
tree_auc <- auc(roc(test$Purchased, tree_probs))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
tree_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 46  2
##          1  5 26
##                                           
##                Accuracy : 0.9114          
##                  95% CI : (0.8259, 0.9636)
##     No Information Rate : 0.6456          
##     P-Value [Acc > NIR] : 5.075e-08       
##                                           
##                   Kappa : 0.8109          
##                                           
##  Mcnemar's Test P-Value : 0.4497          
##                                           
##             Sensitivity : 0.9020          
##             Specificity : 0.9286          
##          Pos Pred Value : 0.9583          
##          Neg Pred Value : 0.8387          
##              Prevalence : 0.6456          
##          Detection Rate : 0.5823          
##    Detection Prevalence : 0.6076          
##       Balanced Accuracy : 0.9153          
##                                           
##        'Positive' Class : 0               
## 
tree_auc
## Area under the curve: 0.9065
# 1. View variable importance
print("Variable Importance:")
## [1] "Variable Importance:"
print(tree_model$variable.importance)
##             Age EstimatedSalary 
##        60.51107        44.66782
# 2. Estimate out-of-sample accuracy via 10-fold cross-validation
#    (note: this refit uses the balanced training set, not the original one)
set.seed(123)
tree_cv <- train(Purchased ~ .,
                 data = train_balanced,
                 method = "rpart",
                 trControl = trainControl(method = "cv", number = 10))

print("Cross-Validated Accuracy:")
## [1] "Cross-Validated Accuracy:"
print(tree_cv$results)
##            cp  Accuracy     Kappa AccuracySD    KappaSD
## 1 0.001213592 0.8835453 0.7667259 0.04530318 0.09062592
## 2 0.223300971 0.8302381 0.6609638 0.08622506 0.17068969
## 3 0.606796117 0.5952149 0.2011829 0.13837852 0.26923491
# 3. Plot the tree structure
rpart.plot(tree_model, 
           box.palette = "GnBu",
           shadow.col = "gray",
           nn = TRUE)
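The cross-validation above selects a best complexity parameter; one way to carry it back to the tree is to prune at the CV-chosen cp. A sketch (note tree_cv was fit on the balanced data, so this mixes the two training sets):

# Sketch: prune the tree at the cp selected by cross-validation
best_cp <- tree_cv$bestTune$cp
tree_pruned <- prune(tree_model, cp = best_cp)
rpart.plot(tree_pruned, main = paste("Pruned tree, cp =", round(best_cp, 4)))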

### Model Comparison

# Create comparison table (sensitivity here refers to the positive class "0")
performance_table <- data.frame(
  Model = c("Logistic Regression", "Balanced Logistic Regression", "Decision Tree"),
  Accuracy = c(round(log_cm$overall["Accuracy"], 3),
               round(lr_cm2$overall["Accuracy"], 3),
               round(tree_cm$overall["Accuracy"], 3)),
  Sensitivity = c(round(log_cm$byClass["Sensitivity"], 3),
                  round(lr_cm2$byClass["Sensitivity"], 3),
                  round(tree_cm$byClass["Sensitivity"], 3)),
  AUC = c(round(log_auc, 3),
          round(lr_auc2, 3),
          round(tree_auc, 3))
)

# Format table using kable
kable(performance_table, align = 'c', caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model Performance Comparison

| Model                        | Accuracy | Sensitivity |  AUC  |
|------------------------------|:--------:|:-----------:|:-----:|
| Logistic Regression          |  0.861   |    0.902    | 0.952 |
| Balanced Logistic Regression |  0.886   |    0.882    | 0.952 |
| Decision Tree                |  0.911   |    0.902    | 0.907 |
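Since the two logistic models have nearly identical AUCs, overlaying the ROC curves can show where they actually differ; a plotting sketch using pROC on the test-set probabilities already computed above:

# Overlay the test-set ROC curves for all three models
roc_log  <- roc(test$Purchased, log_probs)
roc_bal2 <- roc(test$Purchased, lr_probs)
roc_tree <- roc(test$Purchased, tree_probs)
plot(roc_log, col = "darkblue", main = "ROC Curves (Test Set)")
lines(roc_bal2, col = "darkgreen")
lines(roc_tree, col = "firebrick")
legend("bottomright",
       legend = c("Logistic", "Balanced Logistic", "Decision Tree"),
       col = c("darkblue", "darkgreen", "firebrick"), lwd = 2)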

### Next Steps

* Address the class imbalance with the SMOTE or ROSE method.
* Apply hyperparameter tuning (a combined sketch of both steps follows below).
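As a rough sketch of how both steps might look (assumes the ROSE package is installed; the cp grid is an arbitrary illustration, not a tuned choice):

# Sketch of the planned next steps (requires install.packages("ROSE"))
library(ROSE)

# 1. Synthetic balancing with ROSE instead of simple up-sampling
set.seed(123)
train_rose <- ROSE(Purchased ~ ., data = train)$data
table(train_rose$Purchased)

# 2. Hyperparameter tuning: search a cp grid for the decision tree
set.seed(123)
tree_tuned <- train(Purchased ~ .,
                    data = train_rose,
                    method = "rpart",
                    tuneGrid = expand.grid(cp = seq(0.001, 0.1, by = 0.005)),
                    trControl = trainControl(method = "cv", number = 10))
tree_tuned$bestTune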