#This script provides a comprehensive guide to performing data cleaning and exploratory data analysis (EDA) on a sales and marketing dataset # SECTION 1: Setup and Data Loading
tidyverse for data manipulation and
visualization, and skimr for quick data summaries.library(tidyverse) # Includes dplyr, ggplot2, etc.
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr) # For quick and comprehensive data summaries
set.seed(123)
df <- read_csv("sales_marketing_data.csv")
Rows: 2500 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): gender, education, region, campaign_type
dbl (9): customer_id, age, income, marketing_spend, impressions, clicks, co...
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(head(df))
# A tibble: 6 × 14
customer_id age gender income education region campaign_type marketing_spend
<dbl> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl>
1 1 56 Female 52778. High Sch… North Social Media 114.
2 2 69 Female 56382. Bachelors East Print Ad 72.5
3 3 46 Female 53334. High Sch… South Email 89.0
4 4 32 Female 69183. Bachelors West Email 101.
5 5 60 Male 35715. Bachelors East Social Media 89.5
6 6 25 Male 39845. Bachelors East Print Ad 127.
# ℹ 6 more variables: impressions <dbl>, clicks <dbl>, conversion_rate <dbl>,
# converted <dbl>, sales_amount <dbl>, date <date>
print(str(df))
spc_tbl_ [2,500 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ customer_id : num [1:2500] 1 2 3 4 5 6 7 8 9 10 ...
$ age : num [1:2500] 56 69 46 32 60 25 38 56 36 40 ...
$ gender : chr [1:2500] "Female" "Female" "Female" "Female" ...
$ income : num [1:2500] 52778 56382 53334 69183 35715 ...
$ education : chr [1:2500] "High School" "Bachelors" "High School" "Bachelors" ...
$ region : chr [1:2500] "North" "East" "South" "West" ...
$ campaign_type : chr [1:2500] "Social Media" "Print Ad" "Email" "Email" ...
$ marketing_spend: num [1:2500] 114.3 72.5 89 101 89.5 ...
$ impressions : num [1:2500] 2886 1030 2709 4563 4368 ...
$ clicks : num [1:2500] 69 388 11 406 459 66 436 310 296 147 ...
$ conversion_rate: num [1:2500] 0.025 0.028 0.03 0.04 0.02 0.023 0.035 0.038 0.04 0.028 ...
$ converted : num [1:2500] 0 0 0 1 0 0 0 0 0 0 ...
$ sales_amount : num [1:2500] 0 0 0 242 0 ...
$ date : Date[1:2500], format: "2024-12-26" "2024-04-18" ...
- attr(*, "spec")=
.. cols(
.. customer_id = col_double(),
.. age = col_double(),
.. gender = col_character(),
.. income = col_double(),
.. education = col_character(),
.. region = col_character(),
.. campaign_type = col_character(),
.. marketing_spend = col_double(),
.. impressions = col_double(),
.. clicks = col_double(),
.. conversion_rate = col_double(),
.. converted = col_double(),
.. sales_amount = col_double(),
.. date = col_date(format = "")
.. )
- attr(*, "problems")=<externalptr>
NULL
print(skim(df))
── Data Summary ────────────────────────
Values
Name df
Number of rows 2500
Number of columns 14
_______________________
Column type frequency:
character 4
Date 1
numeric 9
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 gender 0 1 4 6 0 3 0
2 education 0 1 3 11 0 4 0
3 region 0 1 4 5 0 4 0
4 campaign_type 0 1 5 12 0 4 0
── Variable type: Date ─────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max median
1 date 0 1 2024-01-01 2024-12-30 2024-06-25
n_unique
1 365
── Variable type: numeric ──────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0
1 customer_id 0 1 1250. 722. 1
2 age 75 0.97 43.6 15.0 18
3 income 75 0.97 49807. 14989. 2349.
4 marketing_spend 0 1 101. 30.3 7.52
5 impressions 0 1 2774. 1306. 500
6 clicks 75 0.97 252. 143. 10
7 conversion_rate 0 1 0.0308 0.00720 0.02
8 converted 0 1 0.0308 0.173 0
9 sales_amount 0 1 7.47 44.1 0
p25 p50 p75 p100 hist
1 626. 1250. 1875. 2500 ▇▇▇▇▇
2 31 44 56 69 ▇▇▇▇▇
3 39812. 49958. 59931. 98646. ▁▃▇▃▁
4 80.3 101. 121. 203. ▁▅▇▃▁
5 1622. 2806 3886 4999 ▇▇▇▇▇
6 125 256 372 499 ▇▇▇▇▇
7 0.025 0.03 0.035 0.05 ▇▆▃▃▁
8 0 0 0 1 ▇▁▁▁▁
9 0 0 0 397. ▇▁▁▁▁
#SECTION 2: Data Cleaning # 2.1. Handle Missing Values # From the skim() output, we can see missing values in ‘income’, ‘age’, and ‘clicks’. # For numerical columns, median imputation is often preferred over mean imputation because it is less sensitive to outliers.
print(colSums(is.na(df)))
customer_id age gender income education
0 75 0 75 0
region campaign_type marketing_spend impressions clicks
0 0 0 0 75
conversion_rate converted sales_amount date
0 0 0 0
median_income <- median(df$income, na.rm = TRUE)
df$income[is.na(df$income)] <- median_income
median_age <- median(df$age, na.rm = TRUE)
df$age[is.na(df$age)] <- median_age
print(paste("Imputed missing 'age' with median:", median_age))
[1] "Imputed missing 'age' with median: 44"
median_clicks <- median(df$clicks, na.rm = TRUE)
df$clicks[is.na(df$clicks)] <- median_clicks
print(paste("Imputed missing 'clicks' with median:", median_clicks))
[1] "Imputed missing 'clicks' with median: 256"
print(colSums(is.na(df)))
customer_id age gender income education
0 0 0 0 0
region campaign_type marketing_spend impressions clicks
0 0 0 0 0
conversion_rate converted sales_amount date
0 0 0 0
read_csv from tidyverse often does a good
job, but it’s good to explicitly check.df$gender <- as.factor(df$gender)
df$education <- as.factor(df$education)
df$region <- as.factor(df$region)
df$campaign_type <- as.factor(df$campaign_type)
df$converted <- as.factor(df$converted) # Target variable for classification
df$date <- as.Date(df$date)
print(str(df))
spc_tbl_ [2,500 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ customer_id : num [1:2500] 1 2 3 4 5 6 7 8 9 10 ...
$ age : num [1:2500] 56 69 46 32 60 25 38 56 36 40 ...
$ gender : Factor w/ 3 levels "Female","Male",..: 1 1 1 1 2 2 1 2 1 1 ...
$ income : num [1:2500] 52778 56382 53334 69183 35715 ...
$ education : Factor w/ 4 levels "Bachelors","High School",..: 2 1 2 1 1 1 1 3 3 1 ...
$ region : Factor w/ 4 levels "East","North",..: 2 1 3 4 1 1 1 2 3 1 ...
$ campaign_type : Factor w/ 4 levels "Email","Print Ad",..: 3 2 1 1 3 2 4 2 1 2 ...
$ marketing_spend: num [1:2500] 114.3 72.5 89 101 89.5 ...
$ impressions : num [1:2500] 2886 1030 2709 4563 4368 ...
$ clicks : num [1:2500] 69 388 11 406 459 66 436 310 296 147 ...
$ conversion_rate: num [1:2500] 0.025 0.028 0.03 0.04 0.02 0.023 0.035 0.038 0.04 0.028 ...
$ converted : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
$ sales_amount : num [1:2500] 0 0 0 242 0 ...
$ date : Date[1:2500], format: "2024-12-26" "2024-04-18" ...
- attr(*, "spec")=
.. cols(
.. customer_id = col_double(),
.. age = col_double(),
.. gender = col_character(),
.. income = col_double(),
.. education = col_character(),
.. region = col_character(),
.. campaign_type = col_character(),
.. marketing_spend = col_double(),
.. impressions = col_double(),
.. clicks = col_double(),
.. conversion_rate = col_double(),
.. converted = col_double(),
.. sales_amount = col_double(),
.. date = col_date(format = "")
.. )
- attr(*, "problems")=<externalptr>
NULL
library(psych)
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
selected_variables<-df %>%
select(income,marketing_spend,impressions,clicks,sales_amount)
describe(selected_variables)
vars n mean sd median trimmed mad min
income 1 2500 49811.57 14762.85 49958.21 49860.72 14392.47 2349.44
marketing_spend 2 2500 100.73 30.30 100.61 100.63 30.17 7.52
impressions 3 2500 2773.97 1305.95 2806.00 2783.16 1668.67 500.00
clicks 4 2500 252.15 140.55 256.00 251.93 177.91 10.00
sales_amount 5 2500 7.47 44.11 0.00 0.00 0.00 0.00
max range skew kurtosis se
income 98646.39 96296.95 -0.02 0.10 295.26
marketing_spend 202.87 195.35 0.04 0.04 0.61
impressions 4999.00 4499.00 -0.06 -1.21 26.12
clicks 499.00 489.00 -0.01 -1.16 2.81
sales_amount 397.36 397.36 6.21 38.91 0.88
plot_income_dist <- ggplot(df, aes(x = income)) +
geom_histogram(binwidth = 5000, fill = "skyblue", color = "black") +
labs(title = "Distribution of Customer Income", x = "Income", y = "Frequency")
print(plot_income_dist)
ggsave("income_distribution.png", plot_income_dist, width = 8, height = 6)
print("Saved: income_distribution.png")
[1] "Saved: income_distribution.png"
plot_sales_dist <- ggplot(filter(df, converted == 1), aes(x = sales_amount)) +
geom_histogram(binwidth = 25, fill = "lightgreen", color = "black") +
labs(title = "Distribution of Sales Amount (Converted Customers)", x = "Sales Amount", y = "Frequency")
print(plot_sales_dist)
ggsave("sales_amount_distribution.png", plot_sales_dist, width = 8, height = 6)
print("Saved: sales_amount_distribution.png")
[1] "Saved: sales_amount_distribution.png"
plot_spend_clicks <- ggplot(df, aes(x = marketing_spend, y = clicks)) +
geom_point(alpha = 0.6, color = "darkblue") +
labs(title = "Marketing Spend vs. Clicks", x = "Marketing Spend", y = "Clicks")
print(plot_spend_clicks)
ggsave("marketing_spend_vs_clicks.png", plot_spend_clicks, width = 8, height = 6)
print("Saved: marketing_spend_vs_clicks.png")
[1] "Saved: marketing_spend_vs_clicks.png"
cor() requires a matrix of numerical values.num_vars <- df %>% select(income, marketing_spend, impressions, clicks, sales_amount)
print(cor(num_vars))
income marketing_spend impressions clicks
income 1.000000000 -0.007837892 0.020403122 0.010405013
marketing_spend -0.007837892 1.000000000 -0.005066432 -0.008536314
impressions 0.020403122 -0.005066432 1.000000000 -0.048113517
clicks 0.010405013 -0.008536314 -0.048113517 1.000000000
sales_amount 0.025573430 -0.014129812 0.014787840 -0.005072715
sales_amount
income 0.025573430
marketing_spend -0.014129812
impressions 0.014787840
clicks -0.005072715
sales_amount 1.000000000
plot_gender_counts <- ggplot(df, aes(x = gender, fill = gender)) +
geom_bar() +
labs(title = "Customer Count by Gender", x = "Gender", y = "Count")
print(plot_gender_counts)
ggsave("gender_counts.png", plot_gender_counts, width = 8, height = 6)
print("Saved: gender_counts.png")
[1] "Saved: gender_counts.png"
plot_sales_by_campaign <- ggplot(df, aes(x = campaign_type, y = sales_amount, fill = campaign_type)) +
geom_boxplot() +
labs(title = "Sales Amount by Campaign Type", x = "Campaign Type", y = "Sales Amount")
print(plot_sales_by_campaign)
conversion_by_education <- df %>%
group_by(education) %>%
summarise(conversion_rate = mean(converted == 1)) # Calculate mean of TRUE (1) for conversion rate
plot_conversion_by_education <- ggplot(conversion_by_education, aes(x = education, y = conversion_rate, fill = education)) +
geom_bar(stat = "identity") +
labs(title = "Conversion Rate by Education Level", x = "Education Level", y = "Conversion Rate")
print(plot_conversion_by_education)
ggsave("conversion_by_education.png", plot_conversion_by_education, width = 8, height = 6)
print("Saved: conversion_by_education.png")
[1] "Saved: conversion_by_education.png"
daily_sales <- df %>%
group_by(date) %>%
summarise(total_sales = sum(sales_amount))
plot_daily_sales <- ggplot(daily_sales, aes(x = date, y = total_sales)) +
geom_line(color = "purple") +
labs(title = "Daily Total Sales Amount", x = "Date", y = "Total Sales")
print(plot_daily_sales)
ggsave("daily_total_sales.png", plot_daily_sales, width = 10, height = 6)
print("Saved: daily_total_sales.png")
[1] "Saved: daily_total_sales.png"
caret for streamlined model training and
evaluation, and randomForest for a popular ensemble
model.library(caret) # For model training and evaluation workflow
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
library(randomForest) # For Random Forest model
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 'randomForest'
The following object is masked from 'package:psych':
outlier
The following object is masked from 'package:dplyr':
combine
The following object is masked from 'package:ggplot2':
margin
library(e1071) # Contains SVM and other functions, often a caret dependency
library(glmnet) # For regularized regression (Lasso, Ridge)
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loaded glmnet 4.1-8
sales_amount.converted.customer_id and date as they are
not features for modeling.conversion_rate is also a calculated outcome, not a
direct feature for prediction.model_df <- df %>% select(-customer_id, -date, -conversion_rate)
caretsales_amountconvertedregression_df <- model_df %>% filter(sales_amount > 0)
createDataPartition from caret ensures
stratified sampling for classification, but works for regression
too.set.seed(123) # For reproducibility
regression_index <- createDataPartition(regression_df$sales_amount, p = 0.8, list = FALSE)
regression_train <- regression_df[regression_index, ]
regression_test <- regression_df[-regression_index, ]
print(paste("Regression Training set size:", nrow(regression_train)))
[1] "Regression Training set size: 64"
print(paste("Regression Test set size:", nrow(regression_test)))
[1] "Regression Test set size: 13"
converted (0 or 1).set.seed(123) # For reproducibility
classification_index <- createDataPartition(model_df$converted, p = 0.8, list = FALSE)
classification_train <- model_df[classification_index, ]
classification_test <- model_df[-classification_index, ]
print(paste("Classification Training set size:", nrow(classification_train)))
[1] "Classification Training set size: 2001"
print(paste("Classification Test set size:", nrow(classification_test)))
[1] "Classification Test set size: 499"
caret for training and evaluating
models.trainControl sets up the resampling method (e.g.,
cross-validation).method = "cv" for cross-validation,
number = 10 for 10-fold CV.verboseIter = FALSE to suppress detailed output during
training.train_control <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
set.seed(123)
linear_model <- train(sales_amount ~ ., data = regression_train, method = "lm", trControl = train_control)
print(linear_model)
Linear Regression
64 samples
10 predictors
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 58, 58, 57, 57, 57, 60, ...
Resampling results:
RMSE Rsquared MAE
93.14125 0.1716143 80.27489
Tuning parameter 'intercept' was held constant at a value of TRUE
linear_predictions <- predict(linear_model, newdata = regression_test)
linear_metrics <- postResample(linear_predictions, regression_test$sales_amount)
print("Linear Regression Metrics:")
[1] "Linear Regression Metrics:"
print(linear_metrics)
RMSE Rsquared MAE
68.54136400 0.05299308 53.05824823
set.seed(123)
rf_model <- train(sales_amount ~ ., data = regression_train, method = "rf", trControl = train_control)
print(rf_model)
Random Forest
64 samples
10 predictors
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 58, 58, 57, 57, 57, 60, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE
2 82.13044 0.1219000 70.31225
9 84.51320 0.1079994 71.91191
17 85.16276 0.1174431 71.79690
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 2.
rf_predictions <- predict(rf_model, newdata = regression_test)
rf_metrics <- postResample(rf_predictions, regression_test$sales_amount)
print("Random Forest Regression Metrics:")
[1] "Random Forest Regression Metrics:"
print(rf_metrics)
RMSE Rsquared MAE
66.76279754 0.07003723 54.26066152
family = "binomial" specifies logistic regression.set.seed(123)
logistic_model <- train(converted ~ ., data = classification_train, method = "glm", family = "binomial", trControl = train_control)
print(logistic_model)
Generalized Linear Model
2001 samples
10 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1801, 1801, 1800, 1801, 1801, 1801, ...
Resampling results:
Accuracy Kappa
1 1
logistic_predictions <- predict(logistic_model, newdata = classification_test)
logistic_metrics <- confusionMatrix(logistic_predictions, classification_test$converted)
print("Logistic Regression Confusion Matrix:")
[1] "Logistic Regression Confusion Matrix:"
print(logistic_metrics)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 484 0
1 0 15
Accuracy : 1
95% CI : (0.9926, 1)
No Information Rate : 0.9699
P-Value [Acc > NIR] : 2.43e-07
Kappa : 1
Mcnemar's Test P-Value : NA
Sensitivity : 1.0000
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 1.0000
Prevalence : 0.9699
Detection Rate : 0.9699
Detection Prevalence : 0.9699
Balanced Accuracy : 1.0000
'Positive' Class : 0
set.seed(123)
svm_model <- train(converted ~ ., data = classification_train, method = "svmRadial", trControl = train_control)
print(svm_model)
Support Vector Machines with Radial Basis Function Kernel
2001 samples
10 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1801, 1801, 1800, 1801, 1801, 1801, ...
Resampling results across tuning parameters:
C Accuracy Kappa
0.25 0.9989975 0.9813071
0.50 1.0000000 1.0000000
1.00 1.0000000 1.0000000
Tuning parameter 'sigma' was held constant at a value of 0.0387194
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.0387194 and C = 0.5.
svm_predictions <- predict(svm_model, newdata = classification_test)
svm_metrics <- confusionMatrix(svm_predictions, classification_test$converted)
print("SVM Confusion Matrix:")
[1] "SVM Confusion Matrix:"
print(svm_metrics)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 484 0
1 0 15
Accuracy : 1
95% CI : (0.9926, 1)
No Information Rate : 0.9699
P-Value [Acc > NIR] : 2.43e-07
Kappa : 1
Mcnemar's Test P-Value : NA
Sensitivity : 1.0000
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 1.0000
Prevalence : 0.9699
Detection Rate : 0.9699
Detection Prevalence : 0.9699
Balanced Accuracy : 1.0000
'Positive' Class : 0
set.seed(123)
knn_model <- train(converted ~ ., data = classification_train, method = "knn", trControl = train_control)
print(knn_model)
k-Nearest Neighbors
2001 samples
10 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1801, 1801, 1800, 1801, 1801, 1801, ...
Resampling results across tuning parameters:
k Accuracy Kappa
5 0.9690198 0
7 0.9690198 0
9 0.9690198 0
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 9.
knn_predictions <- predict(knn_model, newdata = classification_test)
knn_metrics <- confusionMatrix(knn_predictions, classification_test$converted)
print("KNN Confusion Matrix:")
[1] "KNN Confusion Matrix:"
print(knn_metrics)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 484 15
1 0 0
Accuracy : 0.9699
95% CI : (0.9509, 0.9831)
No Information Rate : 0.9699
P-Value [Acc > NIR] : 0.5680975
Kappa : 0
Mcnemar's Test P-Value : 0.0003006
Sensitivity : 1.0000
Specificity : 0.0000
Pos Pred Value : 0.9699
Neg Pred Value : NaN
Prevalence : 0.9699
Detection Rate : 0.9699
Detection Prevalence : 1.0000
Balanced Accuracy : 0.5000
'Positive' Class : 0
clustering_df <- df %>%
select(age, income, marketing_spend, impressions, clicks, sales_amount) %>%
na.omit() # K-Means cannot handle missing values
scaled_clustering_df <- scale(clustering_df)
print("\n--- SECTION 5: Unsupervised Learning (Clustering) ---")
[1] "\n--- SECTION 5: Unsupervised Learning (Clustering) ---"
print("Data prepared and scaled for clustering.")
[1] "Data prepared and scaled for clustering."
wss <- (nrow(scaled_clustering_df)-1)*sum(apply(scaled_clustering_df,2,var))
for (i in 2:10) wss[i] <- sum(kmeans(scaled_clustering_df, centers=i)$withinss)
plot(1:10, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares",
main="Elbow Method for K-Means Clustering")
k_clusters <- 3
set.seed(123)
kmeans_result <- kmeans(scaled_clustering_df, centers = k_clusters, nstart = 25)
df_clustered <- df %>%
filter(!is.na(income) & !is.na(age) & !is.na(clicks)) # Filter to match clustering_df
df_clustered$cluster <- as.factor(kmeans_result$cluster)
print(paste("K-Means Clustering performed with K =", k_clusters))
[1] "K-Means Clustering performed with K = 3"
print("Cluster sizes:")
[1] "Cluster sizes:"
print(table(df_clustered$cluster))
1 2 3
1261 69 1170
print("\nMean values of features per cluster:")
[1] "\nMean values of features per cluster:"
print(df_clustered %>%
group_by(cluster) %>%
summarise(across(c(age, income, marketing_spend, impressions, clicks, sales_amount), mean)))
# A tibble: 3 × 7
cluster age income marketing_spend impressions clicks sales_amount
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 44.0 50193. 99.4 2519. 366. 0.278
2 2 42.1 51305. 99.0 2948. 249. 257.
3 3 43.3 49312. 102. 3039. 130. 0.502
table() or prop.table()print("\nGender distribution per cluster:")
[1] "\nGender distribution per cluster:"
print(prop.table(table(df_clustered$cluster, df_clustered$gender), margin = 1))
Female Male Other
1 0.47581285 0.48374306 0.04044409
2 0.47826087 0.52173913 0.00000000
3 0.47521368 0.48034188 0.04444444
print("\nCampaign Type distribution per cluster:")
[1] "\nCampaign Type distribution per cluster:"
print(prop.table(table(df_clustered$cluster, df_clustered$campaign_type), margin = 1))
Email Print Ad Social Media TV Ad
1 0.4147502 0.1562252 0.2862807 0.1427439
2 0.4202899 0.1739130 0.2898551 0.1159420
3 0.4000000 0.1572650 0.2965812 0.1461538
print("\nSales and Marketing R Script (Predictive Modeling and Unsupervised Learning) completed.")
[1] "\nSales and Marketing R Script (Predictive Modeling and Unsupervised Learning) completed."