# assignment01-data.mining

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.2

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: lattice

# Load the dataset from a CSV file in the working directory
df <- read.csv(file = "preprocessing_dataset.csv")

# Preview the first six rows
head(df, n = 6L)

##        Age    Income Credit_Score Gender Region
## 1 20.28857 132075.04     426.9625 Female   East
## 2 69.21595  44869.27     704.3927   Male   West
## 3 65.62840 203902.12     597.6344  Other   East
## 4 43.11777 237117.77     585.4740   Male  South
## 5 27.40355  61800.95     717.4642 Female  North
## 6 92.94005 266309.56     395.4117 Female   East

# Count the missing (NA) entries in each column
vapply(df, function(column) sum(is.na(column)), integer(1))

##          Age       Income Credit_Score       Gender       Region 
##           51          100            0            0            0

# Handle missing data by replacing NAs in numeric columns with the column
# median. Non-numeric columns (e.g. Gender, Region) are skipped on purpose:
# a median is not defined for them, so they must be handled separately if
# they ever contain NAs.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
# Confirm this is acceptable, or compute the medians on the training rows only.
for (col in names(df)[vapply(df, is.numeric, logical(1))]) {
  na_mask <- is.na(df[[col]])
  if (any(na_mask)) {
    df[[col]][na_mask] <- median(df[[col]], na.rm = TRUE)
  }
}

# Exploratory Data Analysis (EDA)
# Visualize data distributions: draw one histogram per numeric column.
# vapply() is used instead of sapply() so the result is always a logical
# vector (sapply() returns a list for a data frame with no columns, which
# cannot be used as a subscript below).
numeric_columns <- vapply(df, is.numeric, logical(1))
par(mfrow = c(1, 1))  # one plot per page
for (col in names(df)[numeric_columns]) {
  hist(
    df[[col]],
    main = paste("Histogram of", col),
    xlab = col,
    col = "skyblue",
    border = "black"
  )
}

# Confirm that no NA entries remain in any column after imputation
vapply(df, function(column) sum(is.na(column)), integer(1))

##          Age       Income Credit_Score       Gender       Region 
##            0            0            0            0            0

# Randomly partition the rows 80/20 into training and test sets
# (the split is purely random, so no target variable is needed)
set.seed(123)  # fixed seed so the partition is reproducible
n_rows <- nrow(df)
train_index <- sample.int(n_rows, size = 0.8 * n_rows)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Standardize features (center and scale): the transformation is estimated
# from the training rows only and then applied to both partitions
preproc <- preProcess(train_data, method = c("center", "scale"))
train_data <- predict(preproc, newdata = train_data)
test_data <- predict(preproc, newdata = test_data)

# Output Results
head(df, n = 6L)  # Dataset after median imputation (not scaled)

##        Age    Income Credit_Score Gender Region
## 1 20.28857 132075.04     426.9625 Female   East
## 2 69.21595  44869.27     704.3927   Male   West
## 3 65.62840 203902.12     597.6344  Other   East
## 4 43.11777 237117.77     585.4740   Male  South
## 5 27.40355  61800.95     717.4642 Female  North
## 6 92.94005 266309.56     395.4117 Female   East

head(train_data, n = 6L)  # Imputed and standardized training rows

##            Age      Income Credit_Score Gender Region
## 415  0.3576519 -0.06004865    0.2446674   Male   East
## 463  0.8899069 -0.02330524   -1.3751446 Female   East
## 179  1.4091048 -0.95074046   -1.2761786 Female  South
## 526  1.1786298  1.01827012    0.8118637 Female   East
## 195 -1.2084732  1.43737313   -0.4212498 Female  North
## 938 -1.0509714 -0.06004865    0.5319868   Male  South

head(test_data, n = 6L)  # Imputed and standardized test rows

##           Age     Income Credit_Score Gender Region
## 1  -1.7181878 -0.5660368  -1.00465336 Female   East
## 3   0.2808138  0.2214363   0.04779272  Other   East
## 7  -0.3039057 -1.3426138   0.71801213   Male  North
## 9   1.4560090  0.8910361  -1.53995092   Male  North
## 12  1.2855930  0.8490707  -1.23298338 Female  South
## 15  1.3466984 -0.8651915  -0.84959953  Other   West

# assignment01-data.mining

# Author: Dhwani Pathak

# Date: 2025-02-02