#Homework Week 2
#STEP 1
#STEP2
#Load the raw data
#The file is addressed by its full path instead of calling setwd(), so running
#this script no longer changes the working directory of the R session.
data_dir <- "C:/Users/antho/OneDrive/Bureau/Data Mining/Week 2"
dataset <- read.csv(file.path(data_dir, "preprocessing_dataset.csv"))
#Preview the first rows to check how the columns were parsed
head(dataset)
## Age Income Credit_Score Gender Region
## 1 20.28857 132075.04 426.9625 Female East
## 2 69.21595 44869.27 704.3927 Male West
## 3 65.62840 203902.12 597.6344 Other East
## 4 43.11777 237117.77 585.4740 Male South
## 5 27.40355 61800.95 717.4642 Female North
## 6 92.94005 266309.56 395.4117 Female East
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
#STEP 3
#Histogram of Credit_Score
#The plotted variable is Credit_Score, so the title and axis labels name it.
#Each bar covers a 35-point range of scores (binwidth = 35).
ggplot(data = dataset, aes(x = Credit_Score)) +
  geom_histogram(binwidth = 35, fill = "cyan", color = "black") +
  #The label argument must be lowercase `y`; a capital `Y` is not matched
  #to the y axis and the label is not applied.
  labs(title = "Distribution of Credit Score",
       x = "Credit Score", y = "Frequency")

#Correlation matrix
#Keep only the numeric columns, since cor() requires numeric input
numeric_dataset <- Filter(is.numeric, dataset)
#use = "complete.obs" computes the correlations on rows without any NA
correlation_matrix <- cor(numeric_dataset, use = "complete.obs")
#Draw the matrix as a grid of coloured cells
corrplot(correlation_matrix, method = "color")

#Comment: based on this correlation matrix, Income and Credit Score show no noticeable correlation.
#Age and Income have a slightly positive correlation, but nothing major.
#Age and Credit Score have a slightly negative correlation, but nothing major.
#Missing values: count the NA entries in every column
missing_values <- vapply(
  dataset,
  function(column) sum(is.na(column)),
  FUN.VALUE = integer(1)
)
print(missing_values)
## Age Income Credit_Score Gender Region
## 51 100 0 0 0
#STEP 4
#Replace missing values in every numeric column with that column's mean
#NOTE(review): each mean is computed on the full dataset, i.e. before the
#train/test split done in STEP 5, so test rows contribute to the imputed values.
dataset_imputed <- dataset
is_numeric_column <- vapply(dataset_imputed, is.numeric, logical(1))
for (col in which(is_numeric_column)) {
  column_values <- dataset_imputed[[col]]
  #na.rm = TRUE so the mean is taken over the observed values only
  column_values[is.na(column_values)] <- mean(column_values, na.rm = TRUE)
  dataset_imputed[[col]] <- column_values
}
#Check: recount the NA entries per column after imputation
missing_values <- vapply(
  dataset_imputed,
  function(column) sum(is.na(column)),
  FUN.VALUE = integer(1)
)
print(missing_values)
## Age Income Credit_Score Gender Region
## 0 0 0 0 0
#I decided to replace the missing values with the mean: almost 15% of the values were missing
# in a sample of 1020, so replacing them with the mean is more relevant and has less impact on the data
library(caret)
## Loading required package: lattice
#STEP 5
#Split the dataset into training and test sets
#Fix the random seed so that the partition is reproducible
set.seed(123)
#Share of the rows assigned to the training set (80/20 split)
train_fraction <- 0.8
#Row indices of the training set, returned as a matrix (list = FALSE)
train_index <- createDataPartition(
  y = dataset_imputed[["Credit_Score"]],
  p = train_fraction,
  list = FALSE
)
#Rows listed in train_index form the training set, all other rows the test set
training_set <- dataset_imputed[train_index, ]
test_set <- dataset_imputed[-train_index, ]
#STEP 6
#Scale the features: min-max normalization (method = "range") was chosen
#because the dataset does not show big outliers; with multiple outliers,
#standardization would have been chosen instead.
is_numeric_feature <- vapply(training_set, is.numeric, logical(1))
numeric_features <- which(is_numeric_feature)
#The scaling parameters are estimated on the training set only.
#drop = FALSE keeps a data frame even when there is a single numeric column.
normalization_params <- preProcess(
  training_set[, numeric_features, drop = FALSE],
  method = "range"
)
#Apply the training-set parameters to both the training and the test set
normalized_training_set <- predict(
  normalization_params,
  training_set[, numeric_features, drop = FALSE]
)
normalized_test_set <- predict(
  normalization_params,
  test_set[, numeric_features, drop = FALSE]
)
#Add the non-numeric columns back. A logical mask is used instead of
#-numeric_features, because a negative empty index selects no columns at all.
normalized_training_set <- cbind(
  normalized_training_set,
  training_set[, !is_numeric_feature, drop = FALSE]
)
normalized_test_set <- cbind(
  normalized_test_set,
  test_set[, !is_numeric_feature, drop = FALSE]
)
#Outcome: the numeric training data now lie between 0 and 1 and can be
#compared more easily. The test set is scaled with the training-set
#parameters, so a test value outside the training range can fall
#slightly outside [0, 1].
#STEP 7
#Stage 1: raw data as loaded from the CSV file
head(dataset, n = 6L)
## Age Income Credit_Score Gender Region
## 1 20.28857 132075.04 426.9625 Female East
## 2 69.21595 44869.27 704.3927 Male West
## 3 65.62840 203902.12 597.6344 Other East
## 4 43.11777 237117.77 585.4740 Male South
## 5 27.40355 61800.95 717.4642 Female North
## 6 92.94005 266309.56 395.4117 Female East
#Stage 2: data after mean imputation of the missing values
head(dataset_imputed, n = 6L)
## Age Income Credit_Score Gender Region
## 1 20.28857 132075.04 426.9625 Female East
## 2 69.21595 44869.27 704.3927 Male West
## 3 65.62840 203902.12 597.6344 Other East
## 4 43.11777 237117.77 585.4740 Male South
## 5 27.40355 61800.95 717.4642 Female North
## 6 92.94005 266309.56 395.4117 Female East
#Stage 3: normalized training set
head(normalized_training_set, n = 6L)
## Age Income Credit_Score Gender Region
## 1 0.02777433 0.33791613 0.23157185 Female East
## 2 0.62545933 0.07292689 0.74018693 Male West
## 3 0.58163472 0.55617467 0.54446603 Other East
## 7 0.41962737 0.12267757 0.74372350 Male North
## 9 0.90724432 0.74176311 0.07242689 Male North
## 10 0.64900776 0.14894351 0.03216853 Male North
#Stage 4: normalized test set
head(normalized_test_set, n = 6L)
## Age Income Credit_Score Gender Region
## 4 0.30665028 0.6571059 0.5221722 Male South
## 5 0.11468919 0.1243766 0.7641510 Female North
## 6 0.91526716 0.7458101 0.1737295 Female East
## 8 0.12136652 0.4397601 0.1232998 Male West
## 14 0.04683336 0.4422337 0.7099313 Male North
## 35 0.53995746 0.9569101 0.1034591 Male East
#Summary: We loaded the dataset, performed EDA to check for outliers, cleaned the dataset and replaced the missing values.
#We split the dataset into two sets, applied normalization to scale the dataset between 0 and 1, and showed the dataset at each stage.
#STEP 8
#Reflection: The insight I gained from this EDA and the dataset is that you need to follow a step-by-step process to determine the best next steps.
#Without performing the EDA and understanding the dataset, I wouldn't have made the right choices for handling missing values or selecting the appropriate scaling method.
#You have to be pragmatic and make decisions that make sense for your specific dataset. This is why I decided to replace the missing values with the mean instead of removing rows,
#as removing them would have had a significant impact on my dataset. The same logic applied to the scaling method: at first, I thought standardization would be the right choice because I assumed a financial dataset would have multiple outliers.
#However, after handling missing values with the mean, I realized there weren't many outliers, making Min-Max Normalization the better approach in this case.
#I really enjoyed this exercise, Professor! God bless!