#Homework Week 2
#STEP 1
#STEP2
# Load the raw homework dataset and preview its first rows.
# The data folder is kept in one variable and joined to the file name with
# file.path(), so the script no longer changes the session's working
# directory through setwd().
data_dir <- "C:/Users/antho/OneDrive/Bureau/Data Mining/Week 2"
dataset <- read.csv(file.path(data_dir, "preprocessing_dataset.csv"))
head(dataset)
##        Age    Income Credit_Score Gender Region
## 1 20.28857 132075.04     426.9625 Female   East
## 2 69.21595  44869.27     704.3927   Male   West
## 3 65.62840 203902.12     597.6344  Other   East
## 4 43.11777 237117.77     585.4740   Male  South
## 5 27.40355  61800.95     717.4642 Female  North
## 6 92.94005 266309.56     395.4117 Female   East
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
#STEP 3
# Histogram of Credit_Score, using bins of width 35.
# labs() matches aesthetic names case-sensitively, so the y-axis label has to
# be passed as lowercase `y`; the title names the variable actually plotted.
ggplot(data = dataset, aes(x = Credit_Score)) +
  geom_histogram(binwidth = 35, fill = "cyan", color = "black") +
  labs(title = "Distribution of Credit Score",
       x = "Credit Score", y = "Frequency")

# Correlation matrix of the numeric columns
# Keep only the numeric columns, correlate them using the rows that have no
# missing value in any of those columns, and draw the matrix as colored tiles.
numeric_dataset <- Filter(is.numeric, dataset)
correlation_matrix <- cor(x = numeric_dataset, use = "complete.obs")
corrplot(corr = correlation_matrix, method = "color")

#Comment: based on this correlation matrix, Income and Credit Score show no noticeable correlation.
#Age and Income have a slightly positive correlation, but nothing major.
#Age and Credit Score have a slightly negative correlation, but nothing major.

# Missing values
# Count the NA entries in every column of the raw data. vapply() pins the
# result to exactly one integer per column, whereas sapply() can change its
# return type depending on the input.
missing_values <- vapply(dataset, function(column) sum(is.na(column)), integer(1))
print(missing_values)
##          Age       Income Credit_Score       Gender       Region 
##           51          100            0            0            0
#STEP 4
# Replace missing values
# Work on a copy so the raw `dataset` stays untouched. In every numeric column
# each NA is replaced by the mean of that column's non-missing values;
# non-numeric columns are left as they are.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split in STEP 5, so test rows influence the imputed values.
# Consider computing the means on the training rows only.
dataset_imputed <- dataset
numeric_cols <- names(dataset_imputed)[vapply(dataset_imputed, is.numeric, logical(1))]
for (col_name in numeric_cols) {
  # [[ ]] always extracts the column as a vector, independent of `drop` rules
  values <- dataset_imputed[[col_name]]
  values[is.na(values)] <- mean(values, na.rm = TRUE)
  dataset_imputed[[col_name]] <- values
}
# Check: recount the NA entries per column on the imputed copy.
# vapply() guarantees one integer per column (type-stable, unlike sapply()).
missing_values <- vapply(dataset_imputed, function(column) sum(is.na(column)), integer(1))
print(missing_values)
##          Age       Income Credit_Score       Gender       Region 
##            0            0            0            0            0
#I decided to replace the missing values with the column mean because, after seeing the number of missing values (151, almost 15%
# in a sample of 1020), replacing them with the mean was more relevant and has less impact on the data than removing the rows.

library(caret)
## Loading required package: lattice
#STEP 5
# Split the imputed data into a training set and a test set (80/20).
# Fix the random seed so the same partition is drawn on every run.
set.seed(123)
# Draw the training row indices, using Credit_Score as the outcome and p = 0.8.
train_index <- createDataPartition(
  y = dataset_imputed[["Credit_Score"]],
  p = 0.8,
  list = FALSE
)
# Rows listed in train_index form the training set; all remaining rows form
# the test set.
training_set <- dataset_imputed[train_index, ]
test_set <- dataset_imputed[-train_index, ]

#STEP 6
#Scale the features: I chose Normalization (min-max) because the dataset doesn't have big outliers. If the data had multiple outliers, I would have chosen
#Standardization, because it copes better with outliers since it uses the standard deviation.

# Min-max normalization of the numeric columns.
# The scaling parameters are estimated on the training set only and then
# applied to both the training and the test set.
is_numeric_col <- vapply(training_set, is.numeric, logical(1))
numeric_features <- which(is_numeric_col)
# drop = FALSE keeps a data frame even if only one numeric column exists
normalization_params <- preProcess(
  training_set[, is_numeric_col, drop = FALSE],
  method = "range"
)
#Apply normalization to training and test sets
normalized_training_set <- predict(normalization_params, training_set[, is_numeric_col, drop = FALSE])
normalized_test_set <- predict(normalization_params, test_set[, is_numeric_col, drop = FALSE])
# Add the non-numeric data back to the dataset.
# A logical mask is used because negative indexing with an empty which()
# result would select no columns at all.
normalized_training_set <- cbind(normalized_training_set, training_set[, !is_numeric_col, drop = FALSE])
normalized_test_set <- cbind(normalized_test_set, test_set[, !is_numeric_col, drop = FALSE])
#Outcome: the numeric training data are now between 0 and 1 and can be compared more easily.
#Test values are scaled with the training parameters, so they can fall slightly outside [0, 1].

#STEP 7
# First rows of the raw data as read from the CSV (before imputation)
head(dataset)
##        Age    Income Credit_Score Gender Region
## 1 20.28857 132075.04     426.9625 Female   East
## 2 69.21595  44869.27     704.3927   Male   West
## 3 65.62840 203902.12     597.6344  Other   East
## 4 43.11777 237117.77     585.4740   Male  South
## 5 27.40355  61800.95     717.4642 Female  North
## 6 92.94005 266309.56     395.4117 Female   East
# First rows after mean imputation of the numeric columns
head(dataset_imputed)
##        Age    Income Credit_Score Gender Region
## 1 20.28857 132075.04     426.9625 Female   East
## 2 69.21595  44869.27     704.3927   Male   West
## 3 65.62840 203902.12     597.6344  Other   East
## 4 43.11777 237117.77     585.4740   Male  South
## 5 27.40355  61800.95     717.4642 Female  North
## 6 92.94005 266309.56     395.4117 Female   East
# First rows of the training set after min-max scaling of the numeric columns
head(normalized_training_set)
##           Age     Income Credit_Score Gender Region
## 1  0.02777433 0.33791613   0.23157185 Female   East
## 2  0.62545933 0.07292689   0.74018693   Male   West
## 3  0.58163472 0.55617467   0.54446603  Other   East
## 7  0.41962737 0.12267757   0.74372350   Male  North
## 9  0.90724432 0.74176311   0.07242689   Male  North
## 10 0.64900776 0.14894351   0.03216853   Male  North
# First rows of the test set after min-max scaling of the numeric columns
head(normalized_test_set)
##           Age    Income Credit_Score Gender Region
## 4  0.30665028 0.6571059    0.5221722   Male  South
## 5  0.11468919 0.1243766    0.7641510 Female  North
## 6  0.91526716 0.7458101    0.1737295 Female   East
## 8  0.12136652 0.4397601    0.1232998   Male   West
## 14 0.04683336 0.4422337    0.7099313   Male  North
## 35 0.53995746 0.9569101    0.1034591   Male   East
#Summary: We loaded the dataset, performed EDA to check for outliers, and cleaned the dataset by replacing missing values.
#We split the dataset into two sets, applied Normalization to scale the numeric data between 0 and 1, and showed the dataset at each stage.

#STEP 8
#Reflection : The insights I gained from this EDA and the dataset are that you need to follow a step-by-step process to determine the best next steps. 
#Without performing the EDA and understanding the dataset, I wouldn't have made the right choices for handling missing values or selecting the appropriate scaling method.

#You have to be pragmatic and make decisions that make sense for your specific dataset. This is why I decided to replace the missing values with the mean instead of removing rows, 
#as that would have had a significant impact on my dataset. The same logic applied to scaling methods—at first, I thought standardization would be the right choice because I assumed a financial dataset would have multiple outliers. 
#However, after handling missing values with the mean, I realized there weren’t many outliers, making Min-Max Normalization the better approach in this case.

#I really enjoyed this exercise, Professor! God bless!