#install.packages(c("tidyverse", "skimr", "naniar", "DataExplorer"))

library(tidyverse)    # For data manipulation (dplyr) and plotting (ggplot2)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'forcats' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)        # For detailed data summaries

## Warning: package 'skimr' was built under R version 4.5.2

library(naniar)       # For visualizing missing data

## Warning: package 'naniar' was built under R version 4.5.2

## 
## Attaching package: 'naniar'
## 
## The following object is masked from 'package:skimr':
## 
##     n_complete

library(DataExplorer) # For automated EDA reports

## Warning: package 'DataExplorer' was built under R version 4.5.2

# Load the loan training data.
# The file location is machine-specific, so it can be overridden with the
# LOAN_DATA_PATH environment variable. The original location stays the
# default, so existing setups keep working unchanged.
data_path <- Sys.getenv(
  "LOAN_DATA_PATH",
  unset = "C:\\Users\\Valued Customer\\Downloads\\train_u6lujuX_CVtuZ9i (1).csv"
)

# Fail early with a readable message instead of a low-level connection error.
if (!file.exists(data_path)) {
  stop("Loan data file not found: ", data_path, call. = FALSE)
}

df <- read.csv(data_path)

# Look at the first few rows
print(head(df))

##    Loan_ID Gender Married Dependents    Education Self_Employed ApplicantIncome
## 1 LP001002   Male      No          0     Graduate            No            5849
## 2 LP001003   Male     Yes          1     Graduate            No            4583
## 3 LP001005   Male     Yes          0     Graduate           Yes            3000
## 4 LP001006   Male     Yes          0 Not Graduate            No            2583
## 5 LP001008   Male      No          0     Graduate            No            6000
## 6 LP001011   Male     Yes          2     Graduate           Yes            5417
##   CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1                 0         NA              360              1         Urban
## 2              1508        128              360              1         Rural
## 3                 0         66              360              1         Urban
## 4              2358        120              360              1         Urban
## 5                 0        141              360              1         Urban
## 6              4196        267              360              1         Urban
##   Loan_Status
## 1           Y
## 2           N
## 3           Y
## 4           Y
## 5           Y
## 6           Y

# Get the structure of the data
# This shows column names, data types, and the first few values.
# str() prints its report itself and returns NULL invisibly, so it must not
# be wrapped in print(): that only appends a stray "NULL" line to the output.
str(df)

## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : chr  "LP001002" "LP001003" "LP001005" "LP001006" ...
##  $ Gender           : chr  "Male" "Male" "Male" "Male" ...
##  $ Married          : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Dependents       : chr  "0" "1" "0" "0" ...
##  $ Education        : chr  "Graduate" "Graduate" "Graduate" "Not Graduate" ...
##  $ Self_Employed    : chr  "No" "No" "Yes" "No" ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : chr  "Urban" "Rural" "Urban" "Urban" ...
##  $ Loan_Status      : chr  "Y" "N" "Y" "Y" ...
## NULL

# Base-R overview of every column: quantiles and NA counts for numeric
# columns, and length/class/mode for character columns.
df %>%
  summary() %>%
  print()

##    Loan_ID             Gender            Married           Dependents       
##  Length:614         Length:614         Length:614         Length:614        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Education         Self_Employed      ApplicantIncome CoapplicantIncome
##  Length:614         Length:614         Min.   :  150   Min.   :    0    
##  Class :character   Class :character   1st Qu.: 2878   1st Qu.:    0    
##  Mode  :character   Mode  :character   Median : 3812   Median : 1188    
##                                        Mean   : 5403   Mean   : 1621    
##                                        3rd Qu.: 5795   3rd Qu.: 2297    
##                                        Max.   :81000   Max.   :41667    
##                                                                         
##    LoanAmount    Loan_Amount_Term Credit_History   Property_Area     
##  Min.   :  9.0   Min.   : 12      Min.   :0.0000   Length:614        
##  1st Qu.:100.0   1st Qu.:360      1st Qu.:1.0000   Class :character  
##  Median :128.0   Median :360      Median :1.0000   Mode  :character  
##  Mean   :146.4   Mean   :342      Mean   :0.8422                     
##  3rd Qu.:168.0   3rd Qu.:360      3rd Qu.:1.0000                     
##  Max.   :700.0   Max.   :480      Max.   :1.0000                     
##  NA's   :22      NA's   :14       NA's   :50                         
##  Loan_Status       
##  Length:614        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# 'skim()' from the skimr package gives a more detailed summary than summary().
# For character columns it reports the number of empty strings and unique
# values; for numeric columns it reports missing counts, mean, sd, quantiles,
# and a small inline histogram.
# NOTE: blank strings show up under 'empty' and are not counted as missing,
# which is why the character columns report complete_rate = 1 here.
skim(df)

Data summary
Name	df
Number of rows	614
Number of columns	13
_______________________
Column type frequency:
character	8
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	empty	n_unique
Loan_ID	1	8	8	0	614
Gender	1	0	6	13	3
Married	1	0	3	3	3
Dependents	1	0	2	15	5
Education	1	8	12	0	2
Self_Employed	1	0	3	32	3
Property_Area	1	5	9	0	3
Loan_Status	1	1	1	0	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ApplicantIncome	0	1.00	5403.46	6109.04	150	2877.5	3812.5	5795.00	81000	▇▁▁▁▁
CoapplicantIncome	0	1.00	1621.25	2926.25	0	0.0	1188.5	2297.25	41667	▇▁▁▁▁
LoanAmount	22	0.96	146.41	85.59	9	100.0	128.0	168.00	700	▇▃▁▁▁
Loan_Amount_Term	14	0.98	342.00	65.12	12	360.0	360.0	360.00	480	▁▁▁▇▁
Credit_History	50	0.92	0.84	0.36	0	1.0	1.0	1.00	1	▂▁▁▁▇

# Count rows that are exact copies of an earlier row
df %>%
  duplicated() %>%
  sum() %>%
  paste("Total duplicate rows:", .) %>%
  print()

## [1] "Total duplicate rows: 0"

# Every application should carry its own 'Loan_ID'; count repeated values
df %>%
  pull(Loan_ID) %>%
  duplicated() %>%
  sum() %>%
  paste("Duplicate Loan_IDs:", .) %>%
  print()

## [1] "Duplicate Loan_IDs: 0"

# Note: R treats blank strings "" as different from NA, and the character
# columns ('Gender', 'Married', 'Dependents', 'Self_Employed') record missing
# answers as blanks. Convert those blanks to NA *before* plotting; otherwise
# the missingness plot only reflects the numeric columns.
# across(where(...)) is the current replacement for the superseded mutate_if().
df <- df %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))

# Visualize the missing data locations.
# The upset plot shows which columns are missing together. By default only
# the top 5 columns are drawn, so request every column that has missing values.
# Credit_History has the most missing values.
gg_miss_upset(df, nsets = n_var_miss(df))

# Fill the gaps in the numeric loan columns (LoanAmount, Loan_Amount_Term)
# with each column's own MEDIAN. The median is used rather than the mean
# because the income/amount data is likely skewed.
# coalesce() keeps existing values and substitutes the median only for NA.
df <- df %>%
  mutate(
    LoanAmount = coalesce(LoanAmount, median(LoanAmount, na.rm = TRUE)),
    Loan_Amount_Term = coalesce(
      Loan_Amount_Term,
      median(Loan_Amount_Term, na.rm = TRUE)
    )
  )

# Impute CATEGORICAL missing values (Gender, Married, Dependents, Self_Employed)
# with the MODE (the most frequent non-missing value).
# The mode is computed from the data instead of being hard-coded, so the step
# stays correct if the data changes. For this dataset it resolves to
# "Male", "Yes", "No" and "0" respectively.

# Return the most frequent non-NA value of `x` as a string
# (ties: first in sorted order), or NA if `x` has no non-missing values.
most_frequent <- function(x) {
  counts <- table(x, useNA = "no")
  if (length(counts) == 0) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}

df <- df %>%
  mutate(
    across(
      c(Gender, Married, Self_Employed, Dependents),
      ~ coalesce(.x, most_frequent(.x))
    )
  )

# Impute 'Credit_History'
# This is a critical variable, and most applicants have a credit history
# (value 1), so missing entries are filled with that mode.
df <- df %>%
  mutate(Credit_History = coalesce(Credit_History, 1.0))

# Confirm that imputation left no missing values anywhere in the data frame
df %>%
  is.na() %>%
  sum() %>%
  paste("Total NAs left in data:", .) %>%
  print()

## [1] "Total NAs left in data: 0"

#Correcting Data Types and Values

# Normalize 'Dependents': relabel the open-ended "3+" bucket as "3";
# every other value is left as it is.
df <- df %>%
  mutate(
    Dependents = replace(Dependents, Dependents %in% "3+", "3")
  )

# Turn every categorical column into a factor (R's type for categories).
# across() applies as.factor() to each listed column in place.
df <- df %>%
  mutate(
    across(
      c(
        Gender, Married, Dependents, Education, Self_Employed,
        Credit_History, Property_Area, Loan_Status
      ),
      as.factor
    )
  )

Feature Engineering

# Derive two features that might be more predictive than the raw columns:
#   Total_Income    - applicant plus co-applicant income
#   Income_per_Loan - total income divided by the requested loan amount
df <- df %>%
  mutate(Total_Income = ApplicantIncome + CoapplicantIncome) %>%
  mutate(Income_per_Loan = Total_Income / LoanAmount)

# Check the structure again to see our changes.
# str() prints its report itself and returns NULL invisibly, so it is called
# directly; wrapping it in print() only appends a stray "NULL" line.
str(df)

## 'data.frame':    614 obs. of  15 variables:
##  $ Loan_ID          : chr  "LP001002" "LP001003" "LP001005" "LP001006" ...
##  $ Gender           : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Married          : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Dependents       : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 3 1 4 3 2 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : num  128 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : num  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
##  $ Total_Income     : num  5849 6091 3000 4941 6000 ...
##  $ Income_per_Loan  : num  45.7 47.6 45.5 41.2 42.6 ...
## NULL

Exploratory Data Analysis

# install.packages("reshape2") 
library(reshape2)     # For the correlation heatmap (melt function)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

Analyze the Target Variable (Loan_Status)

# Bar chart with one bar per Loan_Status value (Y/N), showing row counts
ggplot(data = df, mapping = aes(x = Loan_Status, fill = Loan_Status)) +
  geom_bar() +
  theme_minimal() +
  ggtitle("Loan Approval Status (Y/N)") +
  xlab("Loan Status") +
  ylab("Count")

# Analyze Categorical Variables vs. Target

# Draw a 100%-stacked bar chart of Loan_Status within each level of one
# categorical column of df.
#   column : name of the column to put on the x axis (string)
#   title  : plot title
#   x_lab  : x-axis label
plot_status_share <- function(column, title, x_lab) {
  ggplot(df, aes(x = .data[[column]], fill = Loan_Status)) +
    geom_bar(position = "fill") +
    labs(title = title, x = x_lab, y = "Proportion") +
    theme_minimal()
}

# Credit_History vs. Loan_Status
plot_status_share(
  "Credit_History",
  title = "Loan Status by Credit History",
  x_lab = "Credit History (1 = Yes, 0 = No)"
)

# (This plot shows Credit_History is the strongest predictor)

# Property_Area vs. Loan_Status
plot_status_share(
  "Property_Area",
  title = "Loan Status by Property Area",
  x_lab = "Property Area"
)

# Married vs. Loan_Status
plot_status_share(
  "Married",
  title = "Loan Status by Marital Status",
  x_lab = "Married"
)

Analyze Numerical Variables

# Plot a histogram of our new Total_Income feature.
# Every layer must be joined with `+`: without the `+` after scale_x_log10()
# the plot ends there, and theme_minimal() runs as a separate statement that
# just prints the theme object instead of styling the plot.
ggplot(df, aes(x = Total_Income)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of Total Income", x = "Total Income", y = "Frequency") +
  scale_x_log10() + # Use a log scale because income is highly skewed
  theme_minimal()

## List of 136
##  $ line                            :List of 6
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                            :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                            :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ title                           : NULL
##  $ aspect.ratio                    : NULL
##  $ axis.title                      : NULL
##  $ axis.title.x                    :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.75points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.bottom             : NULL
##  $ axis.title.y                    :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.75points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.left               : NULL
##  $ axis.title.y.right              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.75points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                       :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x                     :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.2points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.bottom              : NULL
##  $ axis.text.y                     :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.left                : NULL
##  $ axis.text.y.right               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.theta                 : NULL
##  $ axis.text.r                     :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0.5
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                      : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.ticks.x                    : NULL
##  $ axis.ticks.x.top                : NULL
##  $ axis.ticks.x.bottom             : NULL
##  $ axis.ticks.y                    : NULL
##  $ axis.ticks.y.left               : NULL
##  $ axis.ticks.y.right              : NULL
##  $ axis.ticks.theta                : NULL
##  $ axis.ticks.r                    : NULL
##  $ axis.minor.ticks.x.top          : NULL
##  $ axis.minor.ticks.x.bottom       : NULL
##  $ axis.minor.ticks.y.left         : NULL
##  $ axis.minor.ticks.y.right        : NULL
##  $ axis.minor.ticks.theta          : NULL
##  $ axis.minor.ticks.r              : NULL
##  $ axis.ticks.length               : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ axis.ticks.length.x             : NULL
##  $ axis.ticks.length.x.top         : NULL
##  $ axis.ticks.length.x.bottom      : NULL
##  $ axis.ticks.length.y             : NULL
##  $ axis.ticks.length.y.left        : NULL
##  $ axis.ticks.length.y.right       : NULL
##  $ axis.ticks.length.theta         : NULL
##  $ axis.ticks.length.r             : NULL
##  $ axis.minor.ticks.length         : 'rel' num 0.75
##  $ axis.minor.ticks.length.x       : NULL
##  $ axis.minor.ticks.length.x.top   : NULL
##  $ axis.minor.ticks.length.x.bottom: NULL
##  $ axis.minor.ticks.length.y       : NULL
##  $ axis.minor.ticks.length.y.left  : NULL
##  $ axis.minor.ticks.length.y.right : NULL
##  $ axis.minor.ticks.length.theta   : NULL
##  $ axis.minor.ticks.length.r       : NULL
##  $ axis.line                       : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x                     : NULL
##  $ axis.line.x.top                 : NULL
##  $ axis.line.x.bottom              : NULL
##  $ axis.line.y                     : NULL
##  $ axis.line.y.left                : NULL
##  $ axis.line.y.right               : NULL
##  $ axis.line.theta                 : NULL
##  $ axis.line.r                     : NULL
##  $ legend.background               : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.margin                   : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing                  : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing.x                : NULL
##  $ legend.spacing.y                : NULL
##  $ legend.key                      : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.key.size                 : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height               : NULL
##  $ legend.key.width                : NULL
##  $ legend.key.spacing              : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.key.spacing.x            : NULL
##  $ legend.key.spacing.y            : NULL
##  $ legend.frame                    : NULL
##  $ legend.ticks                    : NULL
##  $ legend.ticks.length             : 'rel' num 0.2
##  $ legend.axis.line                : NULL
##  $ legend.text                     :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.position            : NULL
##  $ legend.title                    :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.position           : NULL
##  $ legend.position                 : chr "right"
##  $ legend.position.inside          : NULL
##  $ legend.direction                : NULL
##  $ legend.byrow                    : NULL
##  $ legend.justification            : chr "center"
##  $ legend.justification.top        : NULL
##  $ legend.justification.bottom     : NULL
##  $ legend.justification.left       : NULL
##  $ legend.justification.right      : NULL
##  $ legend.justification.inside     : NULL
##  $ legend.location                 : NULL
##  $ legend.box                      : NULL
##  $ legend.box.just                 : NULL
##  $ legend.box.margin               : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "unit")= int 1
##  $ legend.box.background           : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing              : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##   [list output truncated]
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE

# Box plots comparing numeric features across the two Loan_Status groups

# Total Income by Loan Status (log y axis keeps the boxes readable
# despite the long upper tail)
ggplot(data = df, mapping = aes(x = Loan_Status, y = Total_Income, fill = Loan_Status)) +
  geom_boxplot() +
  scale_y_log10() +
  theme_minimal() +
  ggtitle("Total Income by Loan Status") +
  xlab("Loan Status") +
  ylab("Total Income")

# Loan Amount by Loan Status (linear y axis)
ggplot(data = df, mapping = aes(x = Loan_Status, y = LoanAmount, fill = Loan_Status)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Loan Amount by Loan Status") +
  xlab("Loan Status") +
  ylab("Loan Amount")

Automated EDA Report

# This one command will generate a full HTML report with visualizations!
#create_report(df, y = "Loan_Status")

Data Visualization

Density Plots (Comparing Distributions)

# Overlaid density curves, one per Loan_Status group (Approved = Y,
# Rejected = N). alpha = 0.5 makes the fills translucent so both stay visible.

# Total_Income, drawn on a log x axis because income is skewed
ggplot(data = df, mapping = aes(x = Total_Income, fill = Loan_Status)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  scale_x_log10() +
  ggtitle("Distribution of Total Income by Loan Status") +
  xlab("Total Income (Log Scale)") +
  ylab("Density") +
  labs(fill = "Loan Status")

# LoanAmount, drawn on a linear x axis
ggplot(data = df, mapping = aes(x = LoanAmount, fill = Loan_Status)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  ggtitle("Distribution of Loan Amount by Loan Status") +
  xlab("Loan Amount") +
  ylab("Density") +
  labs(fill = "Loan Status")

Scatter Plots (Bivariate Analysis)

# Scatter of Total_Income (log x axis) against LoanAmount, colored by
# Loan_Status; points are semi-transparent so overlaps remain visible.
ggplot(data = df, mapping = aes(x = Total_Income, y = LoanAmount, color = Loan_Status)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  scale_x_log10() +
  ggtitle("Total Income vs. Loan Amount by Loan Status") +
  xlab("Total Income (Log Scale)") +
  ylab("Loan Amount") +
  labs(color = "Loan Status")

Facet Grids (Multivariate Analysis)

# Same income-vs-amount scatter, split into panels by other variables

# One panel per Property_Area value
ggplot(data = df, mapping = aes(x = Total_Income, y = LoanAmount, color = Loan_Status)) +
  geom_point(alpha = 0.4) +
  facet_wrap(vars(Property_Area)) +
  scale_x_log10() +
  theme_minimal() +
  ggtitle("Total Income vs. Loan Amount (Faceted by Property Area)") +
  xlab("Total Income (Log Scale)") +
  ylab("Loan Amount")

# Grid of panels: Education in rows, Credit_History in columns
ggplot(data = df, mapping = aes(x = Total_Income, y = LoanAmount, color = Loan_Status)) +
  geom_point(alpha = 0.4) +
  facet_grid(rows = vars(Education), cols = vars(Credit_History)) +
  scale_x_log10() +
  theme_minimal() +
  ggtitle("Total Income vs. Loan Amount (by Education and Credit History)") +
  xlab("Total Income (Log Scale)") +
  ylab("Loan Amount")

Correlation Heatmap (for Numeric Variables)

# First, create a new data frame with only the numeric columns
numeric_df <- df %>%
  select(ApplicantIncome, CoapplicantIncome, LoanAmount,
         Loan_Amount_Term, Total_Income, Income_per_Loan)

# Calculate the pairwise correlation matrix
cor_matrix <- cor(numeric_df)

# "Melt" the matrix into long format (Var1, Var2, value) that ggplot can use
melted_cor <- melt(cor_matrix)

# Create the heatmap
ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() + # This creates the colored tiles
  # Diverging palette centred on 0. The argument is spelled out as `limits`;
  # the abbreviated `limit` only worked through partial argument matching.
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limits = c(-1, 1),
                       name = "Correlation") +
  geom_text(aes(label = round(value, 2)), size = 3) + # Add the numbers
  theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset it
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 10, hjust = 1)) +
  coord_fixed() + # Makes the plot square
  labs(title = "Correlation Heatmap of Numeric Variables",
       x = "", y = "")

Loan Prediction

Mustapha Abdulyekeen

2025-11-03

Feature Engineering

Exploratory Data Analysis

Analyze the Target Variable (Loan_Status)

Analyze Numerical Variables

Automated EDA Report

Data Visualization

Density Plots (Comparing Distributions)

Scatter Plots (Bivariate Analysis)

Facet Grids (Multivariate Analysis)

Correlation Heatmap (for Numeric Variables)