```r
# Install the required packages (run once)
install.packages("ggplot2")
install.packages("caret")
install.packages("randomForest")
install.packages("xgboost")
install.packages("e1071")
install.packages("vctrs")
install.packages("dplyr")
install.packages("cowplot")
install.packages("corrplot")
install.packages("h2o")
install.packages("factoextra")
install.packages("gridExtra")
install.packages("cvms")
install.packages("reshape2")
install.packages("ROCR")
install.packages("mice")
install.packages("MLmetrics")
```
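The chunk above only installs packages; nothing in this excerpt loads them explicitly. A minimal loading chunk, assuming these are the libraries used below (glmnet and tidyr are not in the install list but are needed for `cv.glmnet()` and `complete()`; `class` ships with R):

```r
# Load the libraries used throughout the analysis
library(dplyr)     # %>%, select(), mutate(), across()
library(tidyr)     # complete(), nesting(), drop_na()
library(corrplot)  # correlation plot
library(caret)     # trainControl(), train(), and preprocessing helpers
library(glmnet)    # cv.glmnet() for ridge/LASSO logistic regression
library(class)     # knn()
```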







```r
# Load the training and test datasets
train_data <- read.csv("client_attrition_train.csv")
test_data <- read.csv("client_attrition_test.csv")

dim(train_data)
## [1] 10127    21
dim(test_data)
## [1] 5063   20
str(train_data)
## 'data.frame':    10127 obs. of  21 variables:
##  $ customer_id                    : int  755410 568093 595389 287252 231901 418821 896187 258495 881720 367251 ...
##  $ customer_age                   : int  38 46 43 46 40 51 45 50 45 40 ...
##  $ customer_sex                   : chr  "F" "F" "M" "F" ...
##  $ customer_number_of_dependents  : int  2 2 1 4 4 4 3 1 5 2 ...
##  $ customer_education             : chr  "High School" "Graduate" "High School" "High School" ...
##  $ customer_civil_status          : chr  "Married" "Unknown" "Married" "Married" ...
##  $ customer_salary_range          : chr  "40-60K" "below 40K" "80-120K" "below 40K" ...
##  $ customer_relationship_length   : int  31 40 30 36 29 42 39 36 40 29 ...
##  $ customer_available_credit_limit: num  1593 6568 34516 2374 12978 ...
##  $ credit_card_classification     : chr  "Blue" "Blue" "Silver" "Blue" ...
##  $ total_products                 : int  4 5 1 5 3 6 5 1 3 5 ...
##  $ period_inactive                : int  2 2 1 2 3 1 4 1 2 1 ...
##  $ contacts_in_last_year          : int  4 2 3 1 2 2 1 3 2 2 ...
##  $ credit_card_debt_balance       : int  1091 0 2045 1332 0 2517 2253 2238 0 1953 ...
##  $ remaining_credit_limit         : num  502 6568 32471 1042 12978 ...
##  $ transaction_amount_ratio       : num  0.87 0.101 0.59 0.686 0.628 0.853 0.688 0.625 0.708 0.916 ...
##  $ total_transaction_amount       : int  4136 1507 4081 4253 14134 2090 NA 4686 4660 4584 ...
##  $ total_transaction_count        : int  67 33 54 81 85 47 63 81 85 70 ...
##  $ transaction_count_ratio        : num  0.718 0.222 0.421 0.884 0.7 0.88 0.909 0.761 0.735 0.591 ...
##  $ average_utilization            : num  0.685 0 0.059 0.561 0 0.174 0.883 0.495 0 0.741 ...
##  $ account_status                 : chr  "open" "closed" "open" "open" ...
str(test_data)
## 'data.frame':    5063 obs. of  20 variables:
##  $ customer_id                    : int  699849 268360 274908 877343 634148 277346 159337 680814 476166 859216 ...
##  $ customer_age                   : int  45 49 52 44 47 51 46 63 58 42 ...
##  $ customer_sex                   : chr  "M" "M" "F" "F" ...
##  $ customer_number_of_dependents  : int  2 2 2 1 2 1 4 1 0 2 ...
##  $ customer_education             : chr  "College" "High School" "Uneducated" "Graduate" ...
##  $ customer_civil_status          : chr  "Married" "Married" "Married" "Married" ...
##  $ customer_salary_range          : chr  "Unknown" "60-80K" "below 40K" "below 40K" ...
##  $ customer_relationship_length   : int  39 39 35 39 30 44 36 45 43 33 ...
##  $ customer_available_credit_limit: num  32305 6034 1923 3117 23467 ...
##  $ credit_card_classification     : chr  "Blue" "Blue" "Blue" "Blue" ...
##  $ total_products                 : int  1 4 2 6 6 5 5 3 2 3 ...
##  $ period_inactive                : int  2 1 1 1 2 1 2 2 3 3 ...
##  $ contacts_in_last_year          : int  1 2 2 2 2 3 0 2 3 2 ...
##  $ credit_card_debt_balance       : int  1616 472 1003 1385 1692 316 1189 180 729 0 ...
##  $ remaining_credit_limit         : num  30689 5562 919 1732 21774 ...
##  $ transaction_amount_ratio       : num  0.734 0.601 0.71 0.707 0.741 ...
##  $ total_transaction_amount       : int  7129 2449 33472 4606 3603 2461 4317 3826 2367 2574 ...
##  $ total_transaction_count        : int  84 54 77 77 71 42 75 58 48 45 ...
##  $ transaction_count_ratio        : num  0.623 0.756 0.853 0.535 0.735 ...
##  $ average_utilization            : num  0.053 0.0501 0.5265 0.4684 0.0712 ...
# Check missing values

missing_train <- colSums(is.na(train_data))
missing_train
##                     customer_id                    customer_age 
##                               0                             624 
##                    customer_sex   customer_number_of_dependents 
##                            1018                               0 
##              customer_education           customer_civil_status 
##                               0                               0 
##           customer_salary_range    customer_relationship_length 
##                             681                               0 
## customer_available_credit_limit      credit_card_classification 
##                               0                               0 
##                  total_products                 period_inactive 
##                               0                               0 
##           contacts_in_last_year        credit_card_debt_balance 
##                               0                               0 
##          remaining_credit_limit        transaction_amount_ratio 
##                               0                               0 
##        total_transaction_amount         total_transaction_count 
##                             407                               0 
##         transaction_count_ratio             average_utilization 
##                               0                               0 
##                  account_status 
##                               0
missing_test <- colSums(is.na(test_data))
missing_test
##                     customer_id                    customer_age 
##                               0                             294 
##                    customer_sex   customer_number_of_dependents 
##                             472                               0 
##              customer_education           customer_civil_status 
##                               0                               0 
##           customer_salary_range    customer_relationship_length 
##                             331                               0 
## customer_available_credit_limit      credit_card_classification 
##                               0                               0 
##                  total_products                 period_inactive 
##                               0                               0 
##           contacts_in_last_year        credit_card_debt_balance 
##                               0                               0 
##          remaining_credit_limit        transaction_amount_ratio 
##                               0                               0 
##        total_transaction_amount         total_transaction_count 
##                             186                               0 
##         transaction_count_ratio             average_utilization 
##                               0                               0

```

Customer age, sex, salary range, and total transaction amount contain missing values that we need to handle. First, let's do some EDA on those variables.
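A quick sketch to put these counts in context as percentages of the training set (for example, `customer_sex` is missing for roughly 10% of customers):

```r
# Share of missing values per column, in percent (training set)
round(colSums(is.na(train_data)) / nrow(train_data) * 100, 2)
```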

## Exploratory data analysis (EDA)

For the categorical variables, I first create frequency count tables, which give a quick overview of how each category is distributed.

### Customer age

From the histogram, I have the impression that customer age is close to a normal distribution, but it needs further analysis.

```r
# Histogram of customer_age
hist(train_data$customer_age, main = "Distribution of Customer Age (Training)")

hist(test_data$customer_age, main = "Distribution of Customer Age (Test)")

# Create a Q-Q plot
qqnorm(train_data$customer_age)
qqline(train_data$customer_age, col = "red")

qqnorm(test_data$customer_age)
qqline(test_data$customer_age, col = "red")

# Create a box plot
boxplot(train_data$customer_age, main = "Box Plot - Customer Age (Training)")

boxplot(test_data$customer_age, main = "Box Plot - Customer Age (Test)")
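# Optional formal check of the normality impression above (illustrative sketch):
# shapiro.test() accepts at most 5000 values, so test a random sample of the
# non-missing training ages
set.seed(123)
shapiro.test(sample(na.omit(train_data$customer_age), 4500))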

# Sex distribution with percentage labels
sex_counts <- table(train_data$customer_sex)
sex_percentages <- round(prop.table(sex_counts) * 100, 2)
text(x = barplot(sex_counts), y = sex_counts, labels = paste0(sex_percentages, "%"), pos = 1)

sex_counts <- table(test_data$customer_sex)
sex_percentages <- round(prop.table(sex_counts) * 100, 2)
text(x = barplot(sex_counts), y = sex_counts, labels = paste0(sex_percentages, "%"), pos = 1)

# Bar plot of customer_salary_range in the training dataset
barplot(table(train_data$customer_salary_range), main = "Distribution of Customer Salary Range (Training)",
        xlab = "Salary Range", ylab = "Count", las = 2)

barplot(table(test_data$customer_salary_range), main = "Distribution of Customer Salary Range (Test)",
        xlab = "Salary Range", ylab = "Count", las = 2)

# total_transaction_amount 
# Histogram
hist(train_data$total_transaction_amount, breaks = 30, main = "Distribution of Total Transaction Amount (Training)",
     xlab = "Total Transaction Amount")

hist(test_data$total_transaction_amount, breaks = 30, main = "Distribution of Total Transaction Amount (Test)",
     xlab = "Total Transaction Amount")

```

### More EDA

```r

# Bar plot of customer_education in the training dataset
barplot(table(train_data$customer_education), main = "Distribution of Customer Education (Training)",
        xlab = "Education Level", ylab = "Count", las = 2)

# Bar plot of customer_civil_status in the training dataset
barplot(table(train_data$customer_civil_status), main = "Distribution of Customer Civil Status (Training)")

# Calculate percentages
card_counts <- table(train_data$credit_card_classification)
card_percentages <- round(prop.table(card_counts) * 100,2)


# Add percentage labels
text(x = barplot(card_counts), y = card_counts, labels = paste0(card_percentages, "%"), pos = 3)

```

Customer age is close to a normal distribution. There are more female than male customers (53.38% female). Most customers are married, and 93.05% of customers use the Blue card.

```r
# Correlation matrix of the numeric variables (pairwise complete observations,
# since customer_age and total_transaction_amount still contain NAs at this point)
train_data %>% select(where(is.numeric)) %>% as.matrix() %>%
  cor(use = "pairwise.complete.obs") %>% corrplot(method = "number", type = "lower")
```

From the correlation plot, we observe that customer relationship length is highly correlated with customer age (0.7). Remaining credit limit and customer available credit limit are also highly correlated (0.6). Average utilization is strongly positively correlated (0.6) with credit card debt balance and strongly negatively correlated with remaining credit limit. Total transaction amount and total transaction count are positively correlated as well (+0.4). We therefore need to be careful when choosing among highly correlated variables. Let's examine them in more detail and remove some of them to avoid multicollinearity.
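The same screening can be automated; a small sketch (assuming `caret` is available) that lists numeric columns whose pairwise correlation exceeds the 0.6 threshold discussed above:

```r
# Flag numeric columns with pairwise correlation above 0.6
num_cor <- cor(select(train_data, where(is.numeric)), use = "pairwise.complete.obs")
caret::findCorrelation(num_cor, cutoff = 0.6, names = TRUE)
```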

```r
# Scatter plot of Customer Relationship Length vs. Customer Age
plot(train_data$customer_age, train_data$customer_relationship_length, 
     xlab = "Customer Age", ylab = "Customer Relationship Length", 
     main = "Scatter plot of Customer Relationship Length vs. Customer Age")

# Scatter plot of Remaining Credit Limit vs. Customer Available Credit Limit
plot(train_data$remaining_credit_limit, train_data$customer_available_credit_limit, 
     xlab = "Remaining Credit Limit", ylab = "Customer Available Credit Limit", 
     main = "Scatter plot of Remaining Credit Limit vs. Customer Available Credit Limit")

# Scatter plot of Average Utilization vs. Credit Card Debt Balance
plot(train_data$average_utilization, train_data$credit_card_debt_balance, 
     xlab = "Average Utilization", ylab = "Credit Card Debt Balance", 
     main = "Scatter plot of Average Utilization vs. Credit Card Debt Balance")

# Scatter plot of Average Utilization vs. Remaining Credit Limit
plot(train_data$average_utilization, train_data$remaining_credit_limit, 
     xlab = "Average Utilization", ylab = "Remaining Credit Limit", 
     main = "Scatter plot of Average Utilization vs. Remaining Credit Limit")

# Dropping features
train_data_new <- subset(train_data, select = -c(customer_relationship_length, remaining_credit_limit,credit_card_debt_balance, customer_id ))

# Checking the dimensions and structure of the new training dataset
dim(train_data_new)
## [1] 10127    17
str(train_data_new)
## 'data.frame':    10127 obs. of  17 variables:
##  $ customer_age                   : int  38 46 43 46 40 51 45 50 45 40 ...
##  $ customer_sex                   : chr  "F" "F" "M" "F" ...
##  $ customer_number_of_dependents  : int  2 2 1 4 4 4 3 1 5 2 ...
##  $ customer_education             : chr  "High School" "Graduate" "High School" "High School" ...
##  $ customer_civil_status          : chr  "Married" "Unknown" "Married" "Married" ...
##  $ customer_salary_range          : chr  "40-60K" "below 40K" "80-120K" "below 40K" ...
##  $ customer_available_credit_limit: num  1593 6568 34516 2374 12978 ...
##  $ credit_card_classification     : chr  "Blue" "Blue" "Silver" "Blue" ...
##  $ total_products                 : int  4 5 1 5 3 6 5 1 3 5 ...
##  $ period_inactive                : int  2 2 1 2 3 1 4 1 2 1 ...
##  $ contacts_in_last_year          : int  4 2 3 1 2 2 1 3 2 2 ...
##  $ transaction_amount_ratio       : num  0.87 0.101 0.59 0.686 0.628 0.853 0.688 0.625 0.708 0.916 ...
##  $ total_transaction_amount       : int  4136 1507 4081 4253 14134 2090 NA 4686 4660 4584 ...
##  $ total_transaction_count        : int  67 33 54 81 85 47 63 81 85 70 ...
##  $ transaction_count_ratio        : num  0.718 0.222 0.421 0.884 0.7 0.88 0.909 0.761 0.735 0.591 ...
##  $ average_utilization            : num  0.685 0 0.059 0.561 0 0.174 0.883 0.495 0 0.741 ...
##  $ account_status                 : chr  "open" "closed" "open" "open" ...
# Dropping the same features (customer_relationship_length, remaining_credit_limit, credit_card_debt_balance, customer_id) from test_data
test_data_new <- subset(test_data, select = -c(customer_relationship_length, remaining_credit_limit,credit_card_debt_balance,customer_id ))

# Checking the dimensions and structure of the new test dataset
dim(test_data_new)
## [1] 5063   16
str(test_data_new)
## 'data.frame':    5063 obs. of  16 variables:
##  $ customer_age                   : int  45 49 52 44 47 51 46 63 58 42 ...
##  $ customer_sex                   : chr  "M" "M" "F" "F" ...
##  $ customer_number_of_dependents  : int  2 2 2 1 2 1 4 1 0 2 ...
##  $ customer_education             : chr  "College" "High School" "Uneducated" "Graduate" ...
##  $ customer_civil_status          : chr  "Married" "Married" "Married" "Married" ...
##  $ customer_salary_range          : chr  "Unknown" "60-80K" "below 40K" "below 40K" ...
##  $ customer_available_credit_limit: num  32305 6034 1923 3117 23467 ...
##  $ credit_card_classification     : chr  "Blue" "Blue" "Blue" "Blue" ...
##  $ total_products                 : int  1 4 2 6 6 5 5 3 2 3 ...
##  $ period_inactive                : int  2 1 1 1 2 1 2 2 3 3 ...
##  $ contacts_in_last_year          : int  1 2 2 2 2 3 0 2 3 2 ...
##  $ transaction_amount_ratio       : num  0.734 0.601 0.71 0.707 0.741 ...
##  $ total_transaction_amount       : int  7129 2449 33472 4606 3603 2461 4317 3826 2367 2574 ...
##  $ total_transaction_count        : int  84 54 77 77 71 42 75 58 48 45 ...
##  $ transaction_count_ratio        : num  0.623 0.756 0.853 0.535 0.735 ...
##  $ average_utilization            : num  0.053 0.0501 0.5265 0.4684 0.0712 ...

```

For customer age, we fill missing values with the mean, since the distribution is close to normal.

For gender, female and male are roughly balanced in the data (female slightly higher), and almost 10% of the values are missing. Filling them all with a single category could distort the relationship between sex and the target, and dropping that many rows would lose too much data, so missing values are filled at random with "F" or "M".

For salary range, a categorical variable, we fill missing values with the mode. This is one of the simplest approaches, although it should be noted that it strengthens the majority class and may introduce some bias.

For total transaction amount, a numeric variable, we use the median because the distribution is highly skewed.
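A uniform 50/50 fill slightly dilutes the observed female majority; as a sketch of an alternative (not applied below), the missing values could be drawn from the observed F/M proportions, with a fixed seed for reproducibility:

```r
# Alternative sex imputation: sample from the observed proportions (sketch only,
# the result is not written back into train_data_new here)
set.seed(123)
sex_shares <- prop.table(table(train_data_new$customer_sex))
n_missing_sex <- sum(is.na(train_data_new$customer_sex))
imputed_sex <- sample(names(sex_shares), n_missing_sex, replace = TRUE, prob = sex_shares)
```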

# Preprocessing

## Fill missing values

### Customer age

```r
# Calculate mean value for customer_age in the training dataset
mean_age_train <- mean(train_data_new$customer_age, na.rm = TRUE)

# Fill missing values in customer_age with the mean value in the training dataset
train_data_new$customer_age[is.na(train_data_new$customer_age)] <- mean_age_train

# Calculate mean value for customer_age in the test dataset
mean_age_test <- mean(test_data_new$customer_age, na.rm = TRUE)

# Fill missing values in customer_age with the mean value in the test dataset
test_data_new$customer_age[is.na(test_data_new$customer_age)] <- mean_age_test

```

### Gender

```r

# Get the number of missing values in customer_sex
missing_count_train <- sum(is.na(train_data_new$customer_sex))
missing_count_test <- sum(is.na(test_data_new$customer_sex))

# Generate random values for customer_sex (F or M)
random_values <- sample(c("F", "M"), size = missing_count_train, replace = TRUE)
random_values2 <- sample(c("F", "M"), size = missing_count_test, replace = TRUE)
# Replace missing values with the randomly generated values
train_data_new$customer_sex[is.na(train_data_new$customer_sex)] <- random_values
test_data_new$customer_sex[is.na(test_data_new$customer_sex)] <- random_values2


```

### Salary range

```r

# Convert the "customer_salary_range" variable to a factor
train_data_new$customer_salary_range <- factor(train_data_new$customer_salary_range)
test_data_new$customer_salary_range <- factor(test_data_new$customer_salary_range)

# Calculate the mode (most frequent level) of the "customer_salary_range" variable
# names() is needed here: which.max() alone returns the level's index, not its label
mode_salary_range <- names(which.max(table(train_data_new$customer_salary_range)))

# Fill missing values with the mode
train_data_new$customer_salary_range[is.na(train_data_new$customer_salary_range)] <- mode_salary_range
test_data_new$customer_salary_range[is.na(test_data_new$customer_salary_range)] <- mode_salary_range


```

### Total transaction amount

```r
# Calculate median value for total_transaction_amount in the training dataset
median_amount_train <- median(train_data_new$total_transaction_amount, na.rm = TRUE)

# Fill missing values in total_transaction_amount with the median value in the training dataset
train_data_new$total_transaction_amount[is.na(train_data_new$total_transaction_amount)] <- median_amount_train

# Calculate median value for total_transaction_amount in the test dataset
median_amount_test <- median(test_data_new$total_transaction_amount, na.rm = TRUE)

# Fill missing values in total_transaction_amount with the median value in the test dataset
test_data_new$total_transaction_amount[is.na(test_data_new$total_transaction_amount)] <- median_amount_test
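# Note: each split is imputed with its own statistics above. A common alternative
# (sketch only, not applied in this pipeline) is to reuse the training-set
# statistics for the test set, so preprocessing never looks at the test data:
# test_data_new$customer_age[is.na(test_data_new$customer_age)] <- mean_age_train
# test_data_new$total_transaction_amount[is.na(test_data_new$total_transaction_amount)] <- median_amount_train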
# Check missing values in the training dataset
sum(is.na(train_data_new))
## [1] 0
# Check missing values in the test dataset
sum(is.na(test_data_new))
## [1] 0
dim(train_data_new)
## [1] 10127    17
dim(test_data_new)
## [1] 5063   16
# Save train_scaled as CSV
write.csv(train_data_new, "train_data_new.csv", row.names = TRUE)

# Save test_scaled as CSV
write.csv(test_data_new, "test_data_new.csv", row.names = TRUE)
# Get the names of categorical variables
categorical_vars <- c("customer_sex", "customer_education", "customer_civil_status", "customer_salary_range", "credit_card_classification")

# Perform one-hot encoding
train_encoded <- train_data_new %>%
  mutate(across(all_of(categorical_vars), as.factor))

# complete()/drop_na() below only re-sorts the rows by (customer_age, total_transaction_amount);
# no rows are added or dropped, since all missing values were imputed above
train_encoded <- train_encoded %>%
  tidyr::complete(nesting(customer_age, total_transaction_amount), fill = list()) %>%
  tidyr::drop_na()

# Create dummy variables using model.matrix()
dummy_vars <- model.matrix(~.-1, data = train_encoded[, names(train_encoded) %in% categorical_vars])

# Convert the resulting matrix to a data frame
dummy_vars <- as.data.frame(dummy_vars)

# Combine the original variables with the dummy variables
train_encoded <- cbind(train_encoded[, !names(train_encoded) %in% categorical_vars], dummy_vars)


# Replace "account_status" values with binary values
train_encoded$account_status <- ifelse(train_encoded$account_status == "closed", 1, 0)
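# Check the class balance of the recoded target (1 = closed account)
prop.table(table(train_encoded$account_status))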

# Print the structure of the data frame
str(train_encoded)
## 'data.frame':    10127 obs. of  31 variables:
##  $ customer_age                      : num  26 26 26 26 26 26 26 26 26 26 ...
##  $ total_transaction_amount          : num  869 947 968 982 1027 ...
##  $ customer_number_of_dependents     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_available_credit_limit   : num  1730 1438 6152 29916 2469 ...
##  $ total_products                    : int  2 3 1 3 5 3 4 6 4 5 ...
##  $ period_inactive                   : int  2 2 2 1 2 2 1 3 1 3 ...
##  $ contacts_in_last_year             : int  4 4 4 3 4 3 3 3 2 3 ...
##  $ transaction_amount_ratio          : num  0.331 0.506 0.391 0.537 0.544 0.496 0.766 0.418 0.472 0.758 ...
##  $ total_transaction_count           : int  27 20 23 22 20 26 27 44 47 42 ...
##  $ transaction_count_ratio           : num  0.286 0.053 0.533 0.692 0.176 0.238 0.588 0.419 0.469 0.355 ...
##  $ average_utilization               : num  0.934 0 0 0 0.976 0.99 0.785 0.755 0 0 ...
##  $ account_status                    : num  1 1 1 1 1 1 0 0 0 0 ...
##  $ customer_sexF                     : num  1 1 1 0 0 1 0 0 0 1 ...
##  $ customer_sexM                     : num  0 0 0 1 1 0 1 1 1 0 ...
##  $ customer_educationDoctorate       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_educationGraduate        : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ customer_educationHigh School     : num  1 0 1 1 0 0 0 0 0 0 ...
##  $ customer_educationPost-Graduate   : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ customer_educationUneducated      : num  0 1 0 0 0 0 1 0 0 0 ...
##  $ customer_educationUnknown         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_civil_statusMarried      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_civil_statusSingle       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ customer_civil_statusUnknown      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_range40-60K       : num  1 0 0 1 0 0 1 0 0 1 ...
##  $ customer_salary_range60-80K       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_range80-120K      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_rangebelow 40K    : num  0 1 0 0 1 1 0 1 0 0 ...
##  $ customer_salary_rangeUnknown      : num  0 0 1 0 0 0 0 0 1 0 ...
##  $ credit_card_classificationGold    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationPlatinum: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationSilver  : num  0 0 0 0 0 0 0 0 0 0 ...
dim(train_encoded)
## [1] 10127    31
# Perform one-hot encoding on test_data
test_encoded <- test_data_new %>%
  mutate(across(all_of(categorical_vars), as.factor))

# Same re-sorting step on test_encoded (no rows are added or dropped)
test_encoded <- test_encoded %>%
  tidyr::complete(nesting(customer_age, total_transaction_amount), fill = list()) %>%
  tidyr::drop_na()

# Create dummy variables using model.matrix() for test_encoded
test_dummy_vars <- model.matrix(~.-1, data = test_encoded[, names(test_encoded) %in% categorical_vars])

# Convert the resulting matrix to a data frame
test_dummy_vars <- as.data.frame(test_dummy_vars)

# Combine the original variables with the dummy variables for test_encoded
test_encoded <- cbind(test_encoded[, !names(test_encoded) %in% categorical_vars], test_dummy_vars)

# Print the structure of the test_encoded data frame
str(test_encoded)
## 'data.frame':    5063 obs. of  29 variables:
##  $ customer_age                    : num  26 26 26 26 26 26 26 27 27 27 ...
##  $ total_transaction_amount        : int  2211 2214 2335 2485 2558 3137 22720 1920 2272 4591 ...
##  $ customer_number_of_dependents   : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ customer_available_credit_limit : num  4232 2267 1972 6933 2595 ...
##  $ total_products                  : int  4 5 4 3 3 4 5 5 5 2 ...
##  $ period_inactive                 : int  1 2 0 3 2 3 2 1 1 2 ...
##  $ contacts_in_last_year           : int  2 2 2 3 2 3 3 2 2 2 ...
##  $ transaction_amount_ratio        : num  0.648 0.657 0.451 0.546 0.605 ...
##  $ total_transaction_count         : int  42 33 60 42 44 70 38 43 48 75 ...
##  $ transaction_count_ratio         : num  0.626 0.949 0.454 0.302 0.364 ...
##  $ average_utilization             : num  0.3776 0.5631 0.5464 0.0291 0.7177 ...
##  $ customer_sexF                   : num  1 1 0 0 1 0 1 1 1 1 ...
##  $ customer_sexM                   : num  0 0 1 1 0 1 0 0 0 0 ...
##  $ customer_educationDoctorate     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_educationGraduate      : num  1 0 0 0 1 0 0 1 1 0 ...
##  $ customer_educationHigh School   : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ customer_educationPost-Graduate : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_educationUneducated    : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ customer_educationUnknown       : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ customer_civil_statusMarried    : num  1 1 0 0 1 0 0 0 0 1 ...
##  $ customer_civil_statusSingle     : num  0 0 1 1 0 1 1 0 1 0 ...
##  $ customer_civil_statusUnknown    : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ customer_salary_range40-60K     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_range60-80K     : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ customer_salary_range80-120K    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_rangebelow 40K  : num  0 1 0 0 1 1 1 1 1 1 ...
##  $ customer_salary_rangeUnknown    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationGold  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationSilver: num  0 0 0 0 0 0 0 0 0 0 ...
dim(test_encoded)
## [1] 5063   29
# Define columns to scale
columns_to_scale <- c('customer_age', 'customer_number_of_dependents',
                      'customer_available_credit_limit',
                      'total_transaction_amount', 'total_transaction_count', 'average_utilization',
                      'period_inactive', 'contacts_in_last_year')

# Perform feature scaling
train_scaled <- train_encoded
train_scaled[columns_to_scale] <- scale(train_scaled[columns_to_scale])

test_scaled <- test_encoded
test_scaled[columns_to_scale] <- scale(test_scaled[columns_to_scale])
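# Note: scale() above standardizes each split with its own means and SDs.
# A common alternative (sketch only): learn the centering/scaling parameters on
# the training data with caret::preProcess() and apply them to both sets.
pp <- caret::preProcess(train_encoded[columns_to_scale], method = c("center", "scale"))
train_scaled_alt <- predict(pp, train_encoded[columns_to_scale])
test_scaled_alt  <- predict(pp, test_encoded[columns_to_scale])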

# Add "credit_card_classificationPlatinum" variable to the test dataset
test_scaled$credit_card_classificationPlatinum <- 0
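# Sketch of an alternative to the manual column fix above: align the factor
# levels with the training data and build the dummies with caret::dummyVars(),
# so both sets always get identical columns (including the Platinum level)
train_cat <- train_data_new[categorical_vars]
test_cat  <- test_data_new[categorical_vars]
for (v in categorical_vars) {
  train_cat[[v]] <- factor(train_cat[[v]])
  test_cat[[v]]  <- factor(test_cat[[v]], levels = levels(train_cat[[v]]))
}
dv <- caret::dummyVars(~ ., data = train_cat)
head(as.data.frame(predict(dv, newdata = test_cat)))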
# Save train_scaled as CSV
write.csv(train_scaled, "train_scaled.csv", row.names = TRUE)

# Save test_scaled as CSV
write.csv(test_scaled, "test_scaled.csv", row.names = TRUE)
str(train_scaled)
## 'data.frame':    10127 obs. of  31 variables:
##  $ customer_age                      : num  -2.62 -2.62 -2.62 -2.62 -2.62 ...
##  $ total_transaction_amount          : num  -0.597 -0.586 -0.584 -0.582 -0.575 ...
##  $ customer_number_of_dependents     : num  -1.81 -1.81 -1.81 -1.81 -1.81 ...
##  $ customer_available_credit_limit   : num  -0.471 -0.488 -0.22 1.128 -0.429 ...
##  $ total_products                    : int  2 3 1 3 5 3 4 6 4 5 ...
##  $ period_inactive                   : num  -0.338 -0.338 -0.338 -1.327 -0.338 ...
##  $ contacts_in_last_year             : num  1.396 1.396 1.396 0.492 1.396 ...
##  $ transaction_amount_ratio          : num  0.331 0.506 0.391 0.537 0.544 0.496 0.766 0.418 0.472 0.758 ...
##  $ total_transaction_count           : num  -1.61 -1.91 -1.78 -1.83 -1.91 ...
##  $ transaction_count_ratio           : num  0.286 0.053 0.533 0.692 0.176 0.238 0.588 0.419 0.469 0.355 ...
##  $ average_utilization               : num  2.391 -0.997 -0.997 -0.997 2.543 ...
##  $ account_status                    : num  1 1 1 1 1 1 0 0 0 0 ...
##  $ customer_sexF                     : num  1 1 1 0 0 1 0 0 0 1 ...
##  $ customer_sexM                     : num  0 0 0 1 1 0 1 1 1 0 ...
##  $ customer_educationDoctorate       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_educationGraduate        : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ customer_educationHigh School     : num  1 0 1 1 0 0 0 0 0 0 ...
##  $ customer_educationPost-Graduate   : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ customer_educationUneducated      : num  0 1 0 0 0 0 1 0 0 0 ...
##  $ customer_educationUnknown         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_civil_statusMarried      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_civil_statusSingle       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ customer_civil_statusUnknown      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_range40-60K       : num  1 0 0 1 0 0 1 0 0 1 ...
##  $ customer_salary_range60-80K       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_range80-120K      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_rangebelow 40K    : num  0 1 0 0 1 1 0 1 0 0 ...
##  $ customer_salary_rangeUnknown      : num  0 0 1 0 0 0 0 0 1 0 ...
##  $ credit_card_classificationGold    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationPlatinum: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationSilver  : num  0 0 0 0 0 0 0 0 0 0 ...
# Define the target variable and input features for training data
target <- train_scaled$account_status
features <- train_scaled[, !names(train_scaled) %in% "account_status"]
set.seed(123)  # Set seed for reproducibility
train_indices <- sample(1:nrow(train_scaled), 0.7 * nrow(train_scaled))
valid_indices <- sample(setdiff(1:nrow(train_scaled), train_indices), 0.15 * nrow(train_scaled))
test_indices <- setdiff(1:nrow(train_scaled), c(train_indices, valid_indices))

train_data <- train_scaled[train_indices, ]
valid_data <- train_scaled[valid_indices, ]
test_data <- train_scaled[test_indices, ]

target_train <- factor(train_data$account_status, levels = c(0, 1))
features_train <- train_data[, !names(train_data) %in% "account_status"]

target_valid <- factor(valid_data$account_status, levels = c(0, 1))
features_valid <- valid_data[, !names(valid_data) %in% "account_status"]

target_test <- factor(test_data$account_status, levels = c(0, 1))
features_test <- test_data[, !names(test_data) %in% "account_status"]
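# Sanity check: the share of closed accounts should be similar across the splits
c(train = mean(target_train == "1"),
  valid = mean(target_valid == "1"),
  test  = mean(target_test == "1"))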
# Fit ridge logistic regression
logit_model <- cv.glmnet(as.matrix(features_train), target_train, family = "binomial", alpha = 0)

# Predict on validation data
logit_pred <- predict(logit_model, newx = as.matrix(features_valid), type = "response")
logit_accuracy <- sum(ifelse(logit_pred >= 0.5, 1, 0) == target_valid) / length(target_valid)

# K-nearest neighbors (KNN)
# Define cross-validation configuration
ctrl <- trainControl(method = "cv", number = 5)  # 5-fold cross-validation

# Train KNN model
knn_model <- train(features_train, target_train, method = "knn", trControl = ctrl)
knn_pred <- predict(knn_model, newdata = valid_data)
knn_accuracy <- sum(knn_pred == target_valid) / length(target_valid)
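# The number of neighbours chosen by caret's resampling can be inspected with
knn_model$bestTune   # selected k
knn_model$results    # accuracy/kappa for each candidate k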
# Calculate accuracy
logit_accuracy <- sum(ifelse(logit_pred >= 0.5, 1, 0) == target_valid) / length(target_valid)

# Calculate confusion matrix
logit_confusion <- table(ifelse(logit_pred >= 0.5, 1, 0), target_valid)
print(logit_confusion)
##    target_valid
##        0    1
##   0 1250  170
##   1   25   74
# Calculate precision, recall, and F1-score
# (rows of the confusion table are predictions, columns are the actual labels,
#  so precision uses the row sum and recall the column sum)
logit_precision <- logit_confusion[2, 2] / sum(logit_confusion[2, ])
logit_recall <- logit_confusion[2, 2] / sum(logit_confusion[, 2])
logit_f1 <- 2 * logit_precision * logit_recall / (logit_precision + logit_recall)

# Print evaluation metrics
print(paste("Accuracy:", logit_accuracy))
## [1] "Accuracy: 0.871626069782752"
print(paste("Precision:", logit_precision))
## [1] "Precision: 0.747474747474748"
print(paste("Recall:", logit_recall))
## [1] "Recall: 0.30327868852459"
print(paste("F1-score:", logit_f1))
## [1] "F1-score: 0.431486880466472"
# Calculate balanced accuracy for logistic regression
logit_sensitivity <- logit_confusion[2, 2] / sum(logit_confusion[, 2])
logit_specificity <- logit_confusion[1, 1] / sum(logit_confusion[, 1])
logit_balanced_accuracy <- (logit_sensitivity + logit_specificity) / 2

# Print balanced accuracy
print(paste("Balanced Accuracy (Logistic Regression):", round(logit_balanced_accuracy, 3)))
## [1] "Balanced Accuracy (Logistic Regression): 0.642"
# Calculate accuracy
knn_accuracy <- sum(knn_pred == target_valid) / length(target_valid)

# Calculate confusion matrix
knn_confusion <- table(knn_pred, target_valid)
print(knn_confusion)
##         target_valid
## knn_pred    0    1
##        0 1249  129
##        1   26  115
# Calculate precision, recall, and F1-score
# (rows are predictions, columns are the actual labels)
knn_precision <- knn_confusion[2, 2] / sum(knn_confusion[2, ])
knn_recall <- knn_confusion[2, 2] / sum(knn_confusion[, 2])
knn_f1 <- 2 * knn_precision * knn_recall / (knn_precision + knn_recall)

# Print evaluation metrics
print(paste("Accuracy:", knn_accuracy))
## [1] "Accuracy: 0.897959183673469"
print(paste("Precision:", knn_precision))
## [1] "Precision: 0.815602836879433"
print(paste("Recall:", knn_recall))
## [1] "Recall: 0.471311475409836"
print(paste("F1-score:", knn_f1))
## [1] "F1-score: 0.597402597402597"
# Calculate balanced accuracy for KNN
knn_sensitivity <- knn_confusion[2, 2] / sum(knn_confusion[, 2])
knn_specificity <- knn_confusion[1, 1] / sum(knn_confusion[, 1])
knn_balanced_accuracy <- (knn_sensitivity + knn_specificity) / 2

# Print balanced accuracy
print(paste("Balanced Accuracy (KNN):", round(knn_balanced_accuracy, 3)))
## [1] "Balanced Accuracy (KNN): 0.725"
# LASSO
lasso_model <- cv.glmnet(as.matrix(features_train), target_train, family = "binomial", alpha = 1)
lasso_pred <- predict(lasso_model, newx = as.matrix(features_valid), type = "response")
lasso_accuracy <- sum(ifelse(lasso_pred >= 0.5, 1, 0) == target_valid) / length(target_valid)

# Ridge
ridge_model <- cv.glmnet(as.matrix(features_train), target_train, family = "binomial", alpha = 0)
ridge_pred <- predict(ridge_model, newx = as.matrix(features_valid), type = "response")
ridge_accuracy <- sum(ifelse(ridge_pred >= 0.5, 1, 0) == target_valid) / length(target_valid)
# Calculate accuracy for LASSO
lasso_accuracy <- sum(ifelse(lasso_pred >= 0.5, 1, 0) == target_valid) / length(target_valid)

# Calculate confusion matrix for LASSO
lasso_confusion <- table(ifelse(lasso_pred >= 0.5, 1, 0), target_valid)
print(lasso_confusion)
##    target_valid
##        0    1
##   0 1242  162
##   1   33   82
# Calculate precision, recall, and F1-score for LASSO
# (rows are predictions, columns are the actual labels)
lasso_precision <- lasso_confusion[2, 2] / sum(lasso_confusion[2, ])
lasso_recall <- lasso_confusion[2, 2] / sum(lasso_confusion[, 2])
lasso_f1 <- 2 * lasso_precision * lasso_recall / (lasso_precision + lasso_recall)

# Print evaluation metrics for LASSO
print(paste("Accuracy (LASSO):", lasso_accuracy))
## [1] "Accuracy (LASSO): 0.871626069782752"
print(paste("Precision (LASSO):", lasso_precision))
## [1] "Precision (LASSO): 0.71304347826087"
print(paste("Recall (LASSO):", lasso_recall))
## [1] "Recall (LASSO): 0.336065573770492"
print(paste("F1-score (LASSO):", lasso_f1))
## [1] "F1-score (LASSO): 0.456824512534819"
# Calculate balanced accuracy for LASSO
lasso_sensitivity <- lasso_confusion[2, 2] / sum(lasso_confusion[, 2])
lasso_specificity <- lasso_confusion[1, 1] / sum(lasso_confusion[, 1])
lasso_balanced_accuracy <- (lasso_sensitivity + lasso_specificity) / 2

# Print balanced accuracy for LASSO
print(paste("Balanced Accuracy (LASSO):", round(lasso_balanced_accuracy, 3)))
## [1] "Balanced Accuracy (LASSO): 0.655"
# Calculate accuracy for Ridge
ridge_accuracy <- sum(ifelse(ridge_pred >= 0.5, 1, 0) == target_valid) / length(target_valid)

# Calculate confusion matrix for Ridge
ridge_confusion <- table(ifelse(ridge_pred >= 0.5, 1, 0), target_valid)
print(ridge_confusion)
##    target_valid
##        0    1
##   0 1250  170
##   1   25   74
# Calculate precision, recall, and F1-score for Ridge
# (rows are predictions, columns are the actual labels)
ridge_precision <- ridge_confusion[2, 2] / sum(ridge_confusion[2, ])
ridge_recall <- ridge_confusion[2, 2] / sum(ridge_confusion[, 2])
ridge_f1 <- 2 * ridge_precision * ridge_recall / (ridge_precision + ridge_recall)

# Print evaluation metrics for Ridge
print(paste("Accuracy (Ridge):", ridge_accuracy))
## [1] "Accuracy (Ridge): 0.871626069782752"
print(paste("Precision (Ridge):", ridge_precision))
## [1] "Precision (Ridge): 0.747474747474748"
print(paste("Recall (Ridge):", ridge_recall))
## [1] "Recall (Ridge): 0.30327868852459"
print(paste("F1-score (Ridge):", ridge_f1))
## [1] "F1-score (Ridge): 0.431486880466472"
# Calculate balanced accuracy for Ridge
ridge_sensitivity <- ridge_confusion[2, 2] / sum(ridge_confusion[, 2])
ridge_specificity <- ridge_confusion[1, 1] / sum(ridge_confusion[, 1])
ridge_balanced_accuracy <- (ridge_sensitivity + ridge_specificity) / 2

# Print balanced accuracy for Ridge
print(paste("Balanced Accuracy (Ridge):", round(ridge_balanced_accuracy, 3)))
## [1] "Balanced Accuracy (Ridge): 0.642"

```

Based on these results, the K-Nearest Neighbors (KNN) model has the highest accuracy, precision, recall, and F1-score among the evaluated models, and also the highest balanced accuracy.

```r
str(test_scaled)
## 'data.frame':    5063 obs. of  30 variables:
##  $ customer_age                      : num  -3.14 -3.14 -3.14 -3.14 -3.14 ...
##  $ total_transaction_amount          : num  -0.45 -0.45 -0.431 -0.408 -0.396 ...
##  $ customer_number_of_dependents     : num  -1.84 -1.84 -1.84 -1.84 -1.84 ...
##  $ customer_available_credit_limit   : num  -0.314 -0.434 -0.452 -0.15 -0.414 ...
##  $ total_products                    : int  4 5 4 3 3 4 5 5 5 2 ...
##  $ period_inactive                   : num  -1.22253 0.00655 -2.45162 1.23564 0.00655 ...
##  $ contacts_in_last_year             : num  -0.0988 -0.0988 -0.0988 0.9959 -0.0988 ...
##  $ transaction_amount_ratio          : num  0.648 0.657 0.451 0.546 0.605 ...
##  $ total_transaction_count           : num  -1.114 -1.578 -0.188 -1.114 -1.011 ...
##  $ transaction_count_ratio           : num  0.626 0.949 0.454 0.302 0.364 ...
##  $ average_utilization               : num  0.414 1.2 1.129 -1.062 1.855 ...
##  $ customer_sexF                     : num  1 1 0 0 1 0 1 1 1 1 ...
##  $ customer_sexM                     : num  0 0 1 1 0 1 0 0 0 0 ...
##  $ customer_educationDoctorate       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_educationGraduate        : num  1 0 0 0 1 0 0 1 1 0 ...
##  $ customer_educationHigh School     : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ customer_educationPost-Graduate   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_educationUneducated      : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ customer_educationUnknown         : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ customer_civil_statusMarried      : num  1 1 0 0 1 0 0 0 0 1 ...
##  $ customer_civil_statusSingle       : num  0 0 1 1 0 1 1 0 1 0 ...
##  $ customer_civil_statusUnknown      : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ customer_salary_range40-60K       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_range60-80K       : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ customer_salary_range80-120K      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_salary_rangebelow 40K    : num  0 1 0 0 1 1 1 1 1 1 ...
##  $ customer_salary_rangeUnknown      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationGold    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationSilver  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ credit_card_classificationPlatinum: num  0 0 0 0 0 0 0 0 0 0 ...
# Load the necessary library
library(class)

# Fit the KNN model on the training/validation split with the chosen k (k = 5 here)
knn_model <- knn(train = features_train, test = features_valid, cl = target_train, k = 5)

# Predict on test_scaled, aligning its column order with the training features
# (test_scaled has credit_card_classificationPlatinum as its last column)
knn_pred <- knn(train = features_train, test = test_scaled[, colnames(features_train)],
                cl = target_train, k = 5)

# Convert the predictions to a data frame
predictions <- data.frame(knn_pred)

# Save the predictions as a CSV file
write.csv(predictions, file = "knn_predictions.csv", row.names = FALSE)
```
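The 15% internal test split created earlier (`features_test`, `target_test`) is never scored in this excerpt; a minimal sketch of that final check, reusing k = 5:

```r
# Evaluate the chosen KNN model on the internal hold-out split
knn_holdout_pred <- knn(train = features_train, test = features_test,
                        cl = target_train, k = 5)
mean(knn_holdout_pred == target_test)                      # hold-out accuracy
table(predicted = knn_holdout_pred, actual = target_test)  # confusion matrix
```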