# Load required packages
library(tibble)
library(skimr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
library(rpart.plot)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(vip) # for feature importance
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## 1. Data understanding
#1. Load and inspect the dataset (IDA)
data <- read.csv("Data_Science_Social_Network_Ads_and_purchase_Data.csv", header = TRUE)
#attach so columns can be referenced directly by name
attach(data)
#convert dataframe to a tibble for readability
data <- as_tibble(data)
#view data
view(data)
#check few top and last rows
head(data) # top 6 rows
## # A tibble: 6 × 5
## User.ID Gender Age EstimatedSalary Purchased
## <int> <chr> <int> <int> <int>
## 1 15624510 Male 19 19000 0
## 2 15810944 Male 35 20000 0
## 3 15668575 Female 26 43000 0
## 4 15603246 Female 27 57000 0
## 5 15804002 Male 19 76000 0
## 6 15728773 Male 27 58000 0
tail(data) # bottom 6 rows
## # A tibble: 6 × 5
## User.ID Gender Age EstimatedSalary Purchased
## <int> <chr> <int> <int> <int>
## 1 15757632 Female 39 59000 0
## 2 15691863 Female 46 41000 1
## 3 15706071 Male 51 23000 1
## 4 15654296 Female 50 20000 1
## 5 15755018 Male 36 33000 0
## 6 15594041 Female 49 36000 1
#check dimension and structure
dim(data)
## [1] 400 5
str(data)
## tibble [400 × 5] (S3: tbl_df/tbl/data.frame)
## $ User.ID : int [1:400] 15624510 15810944 15668575 15603246 15804002 15728773 15598044 15694829 15600575 15727311 ...
## $ Gender : chr [1:400] "Male" "Male" "Female" "Female" ...
## $ Age : int [1:400] 19 35 26 27 19 27 27 32 25 35 ...
## $ EstimatedSalary: int [1:400] 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
## $ Purchased : int [1:400] 0 0 0 0 0 0 0 1 0 0 ...
#check for duplicates based on unique identifier
print(paste0("The dataset contains ", sum(duplicated(data)), " duplicated values"))
## [1] "The dataset contains 0 duplicated values"
#Remove the User.ID column (unique identifier, not a predictor)
data <- data[, -1]
print('**********************************')
## [1] "**********************************"
#check data structure # you can also use str()
glimpse(data)
## Rows: 400
## Columns: 4
## $ Gender <chr> "Male", "Male", "Female", "Female", "Male", "Male", "F…
## $ Age <int> 19, 35, 26, 27, 19, 27, 27, 32, 25, 35, 26, 26, 20, 32…
## $ EstimatedSalary <int> 19000, 20000, 43000, 57000, 76000, 58000, 84000, 15000…
## $ Purchased <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
print('**********************************')
## [1] "**********************************"
#concise statistical summary
summary(data) # for numerical variables
## Gender Age EstimatedSalary Purchased
## Length:400 Min. :18.00 Min. : 15000 Min. :0.0000
## Class :character 1st Qu.:29.75 1st Qu.: 43000 1st Qu.:0.0000
## Mode :character Median :37.00 Median : 70000 Median :0.0000
## Mean :37.66 Mean : 69743 Mean :0.3575
## 3rd Qu.:46.00 3rd Qu.: 88000 3rd Qu.:1.0000
## Max. :60.00 Max. :150000 Max. :1.0000
## 2. Data Preparation
### 1. Data cleaning
#Create a copy so that we don't modify the original dataset
data1 <- data
#check the null values
print(paste0("The dataset contains ", sum(is.na(data1)), " null values"))
## [1] "The dataset contains 0 null values"
#check for outliers
boxplot(EstimatedSalary, main = "Box Plot of Estimated Salary", ylab = "Values")
boxplot(Age, main = "Box Plot of Age", ylab = "Values")
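Beyond the visual check, the 1.5 × IQR whisker rule can be queried directly; a minimal sketch using base R's boxplot.stats():
#list any points beyond the 1.5*IQR whiskers (empty output means no flagged outliers)
boxplot.stats(EstimatedSalary)$out
boxplot.stats(Age)$out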
### 2. Exploratory Data Analysis
#Univariate analysis: purchased variable
#Get the crosstab table
table(Purchased)
## Purchased
## 0 1
## 257 143
# Get counts and percentages
purchased_counts <- table(Purchased)
percentages <- round(prop.table(purchased_counts) * 100, 1) # Calculate percentages
# Create labels
labels <- paste0(c("Not Purchased (0)", "Purchased (1)"), "\n",
" (", percentages, "%)")
# Plot pie chart
pie(purchased_counts,
labels = labels,
main = "Distribution of Purchased",
col = c("#9ACBD0", "#48A6A7"),
border = "#F2EFE7")
print("***********************************************")
## [1] "***********************************************"
#Univariate analysis: Gender variable
#Get the crosstab table
table(Gender)
## Gender
## Female Male
## 204 196
# Get counts and percentages
gender_counts <- table(Gender)
gender_pct <- round(prop.table(gender_counts) * 100, 1) # Calculate percentages
# Create labels
labels1 <- paste0(c("Female", "Male"),"\n", gender_counts,
" (",gender_pct , "%)")
# Plot pie chart
pie(gender_counts,
labels = labels1,
main = "Gender distribution",
col = c("lightgreen", "#ACC572"),
border = "white")
print("*********************************")
## [1] "*********************************"
#Univariate analysis: numerical variable distributions
par(mfrow = c(1, 2)) # Split plot window into a 1x2 grid for the two histograms
hist(Age, main = "Distribution of Age", xlab = "Age", col = "skyblue")
hist(EstimatedSalary, main = "Distribution of Salary", xlab = "Salary", col = "darkgreen")
#Gender vs purchased Barchart
ggplot(data1, aes(x = factor(Gender), fill = factor(Purchased))) +
geom_bar(position = "dodge") +
labs(title = "Gender vs. Purchases", x = "Gender", fill = "Purchased")
#Estimated salary vs Age scatterplot
ggplot(data1, aes(x = Age, y = EstimatedSalary)) +
geom_point(color = "darkblue") +
labs(title = "Salary vs. Age", x = "Age", y = "Salary")
print("*********************************************")
## [1] "*********************************************"
#correlation matrix
# Create numerical version of Gender
data1$Gender <- ifelse(data$Gender == "Male", 1, 0)
#compute corr matrix
corr_matrix <- cor(data1[, c("EstimatedSalary", "Age", "Purchased", "Gender")])
print(corr_matrix)
## EstimatedSalary Age Purchased Gender
## EstimatedSalary 1.00000000 0.15523802 0.36208303 -0.06043469
## Age 0.15523802 1.00000000 0.62245420 -0.07374133
## Purchased 0.36208303 0.62245420 1.00000000 -0.04246946
## Gender -0.06043469 -0.07374133 -0.04246946 1.00000000
#Visualize corr
corrplot(corr_matrix, method = "color", type = "upper", order = "hclust",
addCoef.col = "black", number.cex = 0.7, insig = "blank" ,
number.digits = 2, tl.col = "black", tl.srt = 45)
#Estimated salary vs Age by Gender scatterplot
ggplot(data1, aes(x = Age, y = EstimatedSalary, color = factor(Gender))) +
geom_point() +
labs(title = "Salary vs Age by Gender", x = "Age", y = "Salary")
#Estimated salary vs Age by Purchased scatterplot
ggplot(data1, aes(x = Age, y = EstimatedSalary, color = factor(Purchased))) +
geom_point() +
labs(title = "Salary vs Age by purchased", x = "Age", y = "Salary")
### 3. Data Preprocessing
# Convert categorical variables to factors
# (the attached Gender and Purchased columns still hold the original values; data1$Gender was recoded to 0/1 above)
data1$Gender <- as.factor(Gender)
data1$Purchased <- as.factor(Purchased)
#Confirm the conversion
str(data1)
## tibble [400 × 4] (S3: tbl_df/tbl/data.frame)
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 2 1 ...
## $ Age : int [1:400] 19 35 26 27 19 27 27 32 25 35 ...
## $ EstimatedSalary: int [1:400] 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
## $ Purchased : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
# Splitting data into training (80%) and testing (20%) sets
set.seed(123) # For reproducibility
index <- createDataPartition(data1$Purchased, p = 0.8, list = FALSE)
train <- data1[index, ]
test <- data1[-index, ]
#confirm the splitting
dim(train)
## [1] 321 4
dim(test)
## [1] 79 4
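Because createDataPartition() samples within each outcome class, the class proportions should stay close across the two splits; a quick sanity check:
#compare class proportions in the two splits (stratified sampling keeps them similar)
round(prop.table(table(train$Purchased)), 3)
round(prop.table(table(test$Purchased)), 3)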
### 4. Summary statistics tables for the training and testing sets
#Summary statistics for both sets # you can achieve the result below with skim() as well
# Create combined summary table
merged_table <- bind_rows(
# Training set summary
train %>%
summarize(
Dataset = "Training Set",
Age_mean = mean(Age),
Age_sd = sd(Age),
Salary_mean = mean(EstimatedSalary),
Salary_sd = sd(EstimatedSalary),
Male = sum(Gender == "Male"),
Female = sum(Gender == "Female")
),
# Test set summary
test %>%
summarize(
Dataset = "Test Set",
Age_mean = mean(Age),
Age_sd = sd(Age),
Salary_mean = mean(EstimatedSalary),
Salary_sd = sd(EstimatedSalary),
Male = sum(Gender == "Male"),
Female = sum(Gender == "Female")
)
) %>%
select(Dataset, everything()) %>%
mutate(across(where(is.numeric), ~round(., 2)))
# View formatted table
merged_table
## # A tibble: 2 × 7
## Dataset Age_mean Age_sd Salary_mean Salary_sd Male Female
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Training Set 37.3 10.5 71252. 34733. 158 163
## 2 Test Set 39.0 10.4 63608. 30829. 38 41
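As the comment above notes, skimr (already loaded) can produce a similar per-variable overview in one call; a minimal sketch:
#one-call summary of every variable in the training set
skim(train)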
#Purchased variable summary statistics after splitting
train_summary_purchased <- table(train$Purchased)
test_summary_purchased <- table(test$Purchased)
train_summary_purchased
##
## 0 1
## 206 115
test_summary_purchased
##
## 0 1
## 51 28
# Model 1: Logistic Regression
lr_model1 <- glm(Purchased ~ ., data = train, family = "binomial")
summary(lr_model1)
##
## Call:
## glm(formula = Purchased ~ ., family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.172e+01 1.389e+00 -8.441 < 2e-16 ***
## GenderMale 2.526e-01 3.345e-01 0.755 0.45
## Age 2.169e-01 2.735e-02 7.931 2.17e-15 ***
## EstimatedSalary 3.400e-05 5.733e-06 5.931 3.01e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 418.85 on 320 degrees of freedom
## Residual deviance: 228.37 on 317 degrees of freedom
## AIC: 236.37
##
## Number of Fisher Scoring iterations: 6
#Odds ratios (exponentiated coefficients)
exp(coef(lr_model1))
## (Intercept) GenderMale Age EstimatedSalary
## 8.095921e-06 1.287423e+00 1.242257e+00 1.000034e+00
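To make the Age odds ratio concrete: each additional year multiplies the odds of purchase by about 1.24, so a larger age gap compounds via exp(years * coefficient); a quick check:
#odds-ratio for a 10-year increase in Age (roughly exp(2.17), about 8.7)
exp(coef(lr_model1)["Age"] * 10)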
print("***********************************************")
## [1] "***********************************************"
#confidence interval
exp(confint(lr_model1))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 4.110395e-07 9.796939e-05
## GenderMale 6.711233e-01 2.502182e+00
## Age 1.182443e+00 1.317013e+00
## EstimatedSalary 1.000023e+00 1.000046e+00
# Predict and evaluate
log_probs <- predict(lr_model1, test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0) %>% factor(levels = c(0, 1))
# Confusion Matrix and Metrics
log_cm <- confusionMatrix(log_pred, test$Purchased)
log_auc <- auc(roc(test$Purchased, log_probs))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
log_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 46 6
## 1 5 22
##
## Accuracy : 0.8608
## 95% CI : (0.7645, 0.9284)
## No Information Rate : 0.6456
## P-Value [Acc > NIR] : 1.657e-05
##
## Kappa : 0.6933
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9020
## Specificity : 0.7857
## Pos Pred Value : 0.8846
## Neg Pred Value : 0.8148
## Prevalence : 0.6456
## Detection Rate : 0.5823
## Detection Prevalence : 0.6582
## Balanced Accuracy : 0.8438
##
## 'Positive' Class : 0
##
log_auc
## Area under the curve: 0.9524
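The 0.5 cutoff used above is only the default; pROC can suggest the threshold that maximizes Youden's J (sensitivity + specificity - 1). A sketch reusing the test-set probabilities:
#find the probability cutoff that maximizes Youden's J statistic
roc_lr <- roc(test$Purchased, log_probs)
coords(roc_lr, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))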
# Model 2: Logistic Regression with balanced data
# Check original class distribution
table(train$Purchased)
##
## 0 1
## 206 115
#oversample the minority class
# Set seed for reproducibility
set.seed(123)
# Get column index dynamically
purchased_col <- which(names(train) == "Purchased")
train_balanced <- upSample(
x = train[, -purchased_col],
y = train$Purchased,
yname = "Purchased"
)
#confirm the oversampling worked
print(table(train_balanced$Purchased))
##
## 0 1
## 206 206
# 1. Build Logistic Regression Model with Balanced Data
lr_model2 <- glm(Purchased ~.,
data = train_balanced,
family = "binomial")
summary(lr_model2)
##
## Call:
## glm(formula = Purchased ~ ., family = "binomial", data = train_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.259e+01 1.351e+00 -9.314 < 2e-16 ***
## GenderMale 2.686e-01 2.939e-01 0.914 0.361
## Age 2.419e-01 2.643e-02 9.150 < 2e-16 ***
## EstimatedSalary 3.934e-05 5.519e-06 7.127 1.02e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 571.15 on 411 degrees of freedom
## Residual deviance: 289.77 on 408 degrees of freedom
## AIC: 297.77
##
## Number of Fisher Scoring iterations: 6
#Odds ratios (exponentiated coefficients)
exp(coef(lr_model2))
## (Intercept) GenderMale Age EstimatedSalary
## 3.412025e-06 1.308110e+00 1.273625e+00 1.000039e+00
print("***********************************************")
## [1] "***********************************************"
#confidence interval
exp(confint(lr_model2))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.916413e-07 0.0000391967
## GenderMale 7.371774e-01 2.3398311592
## Age 1.213986e+00 1.3471064581
## EstimatedSalary 1.000029e+00 1.0000508495
Observation: with 95% confidence, each one-unit increase in EstimatedSalary multiplies the odds of purchase by a factor between roughly 1.00003 and 1.00005.
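Because one salary unit is a single currency unit, the effect is easier to read at a larger increment; for example, a 10,000-unit salary increase scales the odds by exp(10000 * coefficient):
#odds-ratio for a 10,000-unit salary increase (roughly exp(0.39), about 1.48)
exp(coef(lr_model2)["EstimatedSalary"] * 10000)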
#Make Predictions on Test Set
lr_probs <- predict(lr_model2, newdata = test, type = "response")
lr_pred <- factor(ifelse(lr_probs > 0.5, 1, 0),
levels = levels(test$Purchased))
lr_cm2 <- confusionMatrix(lr_pred, test$Purchased)
lr_auc2 <- auc(roc(test$Purchased, lr_probs))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lr_cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 3
## 1 6 25
##
## Accuracy : 0.8861
## 95% CI : (0.7947, 0.9466)
## No Information Rate : 0.6456
## P-Value [Acc > NIR] : 1.161e-06
##
## Kappa : 0.7569
##
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.8824
## Specificity : 0.8929
## Pos Pred Value : 0.9375
## Neg Pred Value : 0.8065
## Prevalence : 0.6456
## Detection Rate : 0.5696
## Detection Prevalence : 0.6076
## Balanced Accuracy : 0.8876
##
## 'Positive' Class : 0
##
lr_auc2
## Area under the curve: 0.9517
# Feature importance visualization for the first logistic regression model
vip(lr_model1) + ggtitle("Feature Importance")
# Model 3: Decision Tree
#Create the decision tree model
tree_model <- rpart(Purchased ~ ., data = train, method = "class")
#visualize the model
rpart.plot(tree_model)
#model prediction
tree_pred <- predict(tree_model, test, type = "class")
tree_probs <- predict(tree_model, test, type = "prob")[, 2]
#Confusion Matrix and Metrics
tree_cm <- confusionMatrix(tree_pred, test$Purchased)
tree_auc <- auc(roc(test$Purchased, tree_probs))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
tree_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 46 2
## 1 5 26
##
## Accuracy : 0.9114
## 95% CI : (0.8259, 0.9636)
## No Information Rate : 0.6456
## P-Value [Acc > NIR] : 5.075e-08
##
## Kappa : 0.8109
##
## Mcnemar's Test P-Value : 0.4497
##
## Sensitivity : 0.9020
## Specificity : 0.9286
## Pos Pred Value : 0.9583
## Neg Pred Value : 0.8387
## Prevalence : 0.6456
## Detection Rate : 0.5823
## Detection Prevalence : 0.6076
## Balanced Accuracy : 0.9153
##
## 'Positive' Class : 0
##
tree_auc
## Area under the curve: 0.9065
# 1. View variable importance
print("Variable Importance:")
## [1] "Variable Importance:"
print(tree_model$variable.importance)
## Age EstimatedSalary
## 60.51107 44.66782
# 2. Estimate out-of-sample accuracy via 10-fold cross-validation
set.seed(123)
tree_cv <- train(Purchased ~ .,
data = train_balanced,
method = "rpart",
trControl = trainControl(method = "cv", number = 10))
print("Cross-Validated Accuracy:")
## [1] "Cross-Validated Accuracy:"
print(tree_cv$results)
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.001213592 0.8835453 0.7667259 0.04530318 0.09062592
## 2 0.223300971 0.8302381 0.6609638 0.08622506 0.17068969
## 3 0.606796117 0.5952149 0.2011829 0.13837852 0.26923491
# 3. Plot the tree structure
rpart.plot(tree_model,
box.palette = "GnBu",
shadow.col = "gray",
nn = TRUE)
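caret tuned cp on the balanced data above; the original tree_model also stores its own cross-validation results in cptable, which can drive pruning directly. A minimal sketch:
#prune at the complexity parameter with the lowest cross-validated error (xerror)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_model, cp = best_cp)
rpart.plot(tree_pruned)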
### Model comparison
# Create comparison table
performance_table <- data.frame(
Model = c("Logistic Regression","balanced Logistic regresion", "Decision Tree"),
Accuracy = c(round(log_cm$overall["Accuracy"], 3),
round(lr_cm2$overall["Accuracy"], 3),
round(tree_cm$overall["Accuracy"], 3)),
Sensitivity = c(round(log_cm$byClass["Sensitivity"], 3),
round(lr_cm2$byClass["Sensitivity"], 3),
round(tree_cm$byClass["Sensitivity"], 3)),
AUC = c(round(log_auc, 3),
round(lr_auc2, 3),
round(tree_auc, 3))
)
# Format table using kable
kable(performance_table, align = 'c', caption = "Model Performance Comparison") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  column_spec(4, bold = TRUE) # highlight the AUC column
| Model                        | Accuracy | Sensitivity | AUC   |
|------------------------------|----------|-------------|-------|
| Logistic Regression          | 0.861    | 0.902       | 0.952 |
| Balanced Logistic Regression | 0.886    | 0.882       | 0.952 |
| Decision Tree                | 0.911    | 0.902       | 0.907 |
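The AUC column compresses each ROC curve into one number; overlaying the curves makes the comparison visual. A sketch reusing the probability vectors computed above:
#overlay the three ROC curves on a single plot
roc_lr1  <- roc(test$Purchased, log_probs)
roc_lr2  <- roc(test$Purchased, lr_probs)
roc_tree <- roc(test$Purchased, tree_probs)
plot(roc_lr1, col = "steelblue", main = "ROC Curves")
plot(roc_lr2, add = TRUE, col = "darkgreen")
plot(roc_tree, add = TRUE, col = "firebrick")
legend("bottomright", legend = c("Logistic", "Balanced logistic", "Decision tree"),
       col = c("steelblue", "darkgreen", "firebrick"), lwd = 2)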
### Next steps
* 1. Rectify class imbalance using the SMOTE or ROSE method (sketched below).
* 2. Apply hyperparameter tuning (sketched below).
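A minimal sketch of both next steps, assuming the ROSE package is installed (it is not loaded above):
#rebalance with ROSE (synthetic sampling) instead of simple oversampling
library(ROSE)
set.seed(123)
train_rose <- ROSE(Purchased ~ ., data = train)$data
table(train_rose$Purchased)
#tune the decision tree's complexity parameter over an explicit grid
tree_tuned <- train(Purchased ~ ., data = train_rose, method = "rpart",
                    trControl = trainControl(method = "cv", number = 10),
                    tuneGrid = expand.grid(cp = seq(0.001, 0.05, by = 0.005)))
tree_tuned$bestTune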