library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
# Load the dataset from the working directory.
df <- read.csv(file = "preprocessing_dataset.csv")
# Preview the first six rows.
head(df, n = 6L)
## Age Income Credit_Score Gender Region
## 1 20.28857 132075.04 426.9625 Female East
## 2 69.21595 44869.27 704.3927 Male West
## 3 65.62840 203902.12 597.6344 Other East
## 4 43.11777 237117.77 585.4740 Male South
## 5 27.40355 61800.95 717.4642 Female North
## 6 92.94005 266309.56 395.4117 Female East
# Count the missing values in each column.
df |>
  is.na() |>
  colSums()
## Age Income Credit_Score Gender Region
## 51 100 0 0 0
# Handle missing data by replacing NAs in numeric columns with the column median.
# median() is only meaningful for numeric data, so non-numeric columns are
# skipped instead of being passed to it.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split performed later in this script, so rows that end up in the
# test set influence the imputed values. Consider imputing after the split
# using training-set medians only.
for (col in names(df)) {
  # Compute the NA mask once and reuse it for both the check and the assignment.
  na_mask <- is.na(df[[col]])
  if (is.numeric(df[[col]]) && any(na_mask)) {
    df[[col]][na_mask] <- median(df[[col]], na.rm = TRUE)
  }
}
# Exploratory Data Analysis (EDA)
# Visualize Data Distributions: one histogram per numeric column.
# vapply() always yields a logical mask, even when df has no columns;
# sapply() would return an empty list there and break the subsetting below.
numeric_columns <- vapply(df, is.numeric, logical(1))
# Use a single-panel layout so each histogram gets its own plot.
par(mfrow = c(1, 1))
for (col in names(df)[numeric_columns]) {
  hist(
    df[[col]],
    main = paste("Histogram of", col),
    xlab = col,
    col = "skyblue",
    border = "black"
  )
}



# Re-count the NAs per column to confirm that none remain after imputation.
df |>
  is.na() |>
  colSums()
## Age Income Credit_Score Gender Region
## 0 0 0 0 0
# Split dataset into training and test sets (without requiring a target variable)
set.seed(123) # Ensure reproducibility
# 80/20 split. sample() expects an integer size, so round the training-set
# size down explicitly instead of passing a possibly fractional product.
n_train <- floor(0.8 * nrow(df))
train_index <- sample(seq_len(nrow(df)), size = n_train)
train_data <- df[train_index, ]
# Select the complement explicitly: df[-train_index, ] would return zero rows
# (instead of every row) if train_index were empty.
test_index <- setdiff(seq_len(nrow(df)), train_index)
test_data <- df[test_index, ]
# Scale features using standardization (mean = 0, SD = 1)
# The centering and scaling parameters are estimated from the training set
# only and then applied to both sets, so the test set does not influence them.
preproc <- preProcess(train_data, method = c("center", "scale"))
train_data <- predict(preproc, train_data)
test_data <- predict(preproc, test_data)
# Output Results
# Full dataset after median imputation (not split, not scaled).
head(df, n = 6L)
## Age Income Credit_Score Gender Region
## 1 20.28857 132075.04 426.9625 Female East
## 2 69.21595 44869.27 704.3927 Male West
## 3 65.62840 203902.12 597.6344 Other East
## 4 43.11777 237117.77 585.4740 Male South
## 5 27.40355 61800.95 717.4642 Female North
## 6 92.94005 266309.56 395.4117 Female East
# Training set after imputation and standardization.
head(train_data, n = 6L)
## Age Income Credit_Score Gender Region
## 415 0.3576519 -0.06004865 0.2446674 Male East
## 463 0.8899069 -0.02330524 -1.3751446 Female East
## 179 1.4091048 -0.95074046 -1.2761786 Female South
## 526 1.1786298 1.01827012 0.8118637 Female East
## 195 -1.2084732 1.43737313 -0.4212498 Female North
## 938 -1.0509714 -0.06004865 0.5319868 Male South
# Test set after imputation and standardization (scaled with the training-set parameters).
head(test_data, n = 6L)
## Age Income Credit_Score Gender Region
## 1 -1.7181878 -0.5660368 -1.00465336 Female East
## 3 0.2808138 0.2214363 0.04779272 Other East
## 7 -0.3039057 -1.3426138 0.71801213 Male North
## 9 1.4560090 0.8910361 -1.53995092 Male North
## 12 1.2855930 0.8490707 -1.23298338 Female South
## 15 1.3466984 -0.8651915 -0.84959953 Other West