#install.packages(c("tidyverse", "skimr", "naniar", "DataExplorer"))
library(tidyverse) # For data manipulation (dplyr) and plotting (ggplot2)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr) # For detailed data summaries
## Warning: package 'skimr' was built under R version 4.5.2
library(naniar) # For visualizing missing data
## Warning: package 'naniar' was built under R version 4.5.2
##
## Attaching package: 'naniar'
##
## The following object is masked from 'package:skimr':
##
## n_complete
library(DataExplorer) # For automated EDA reports
## Warning: package 'DataExplorer' was built under R version 4.5.2
# Load the loan training data.
# NOTE: the absolute Windows path below is machine-specific; point it at your
# own copy of the CSV (or set the working directory) before running.
df <- read.csv(
  file = "C:\\Users\\Valued Customer\\Downloads\\train_u6lujuX_CVtuZ9i (1).csv"
)
# Preview the first six rows
print(head(df, n = 6L))
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1 0 NA 360 1 Urban
## 2 1508 128 360 1 Rural
## 3 0 66 360 1 Urban
## 4 2358 120 360 1 Urban
## 5 0 141 360 1 Urban
## 6 4196 267 360 1 Urban
## Loan_Status
## 1 Y
## 2 N
## 3 Y
## 4 Y
## 5 Y
## 6 Y
# Inspect the structure: column names, data types, and the first few values.
# str() prints its report itself and returns NULL invisibly, so it is called
# directly; wrapping it in print() emits a stray "NULL" after the report.
str(df)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : chr "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Married : chr "No" "Yes" "Yes" "Yes" ...
## $ Dependents : chr "0" "1" "0" "0" ...
## $ Education : chr "Graduate" "Graduate" "Graduate" "Not Graduate" ...
## $ Self_Employed : chr "No" "No" "Yes" "No" ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: num 0 1508 0 2358 0 ...
## $ LoanAmount : int NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : chr "Urban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : chr "Y" "N" "Y" "Y" ...
## NULL
# Base R summary: quartiles, mean and NA counts for numeric columns;
# length, class and mode for character columns
df %>%
  summary() %>%
  print()
## Loan_ID Gender Married Dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## Loan_Status
## Length:614
## Class :character
## Mode :character
##
##
##
##
# Richer per-column overview from skimr: missing counts and completeness for
# every column, empty-string and unique counts for character columns, and
# mean/sd/quantiles plus an inline histogram for numeric columns.
# (Called directly rather than piped so the reported data name stays "df".)
skimr::skim(df)
## | Name | df |
## | Number of rows | 614 |
## | Number of columns | 13 |
## | _______________________ | |
## | Column type frequency: | |
## | character | 8 |
## | numeric | 5 |
## | ________________________ | |
## | Group variables | None |
## Variable type: character
## | skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
## |---|---|---|---|---|---|---|---|
## | Loan_ID | 0 | 1 | 8 | 8 | 0 | 614 | 0 |
## | Gender | 0 | 1 | 0 | 6 | 13 | 3 | 0 |
## | Married | 0 | 1 | 0 | 3 | 3 | 3 | 0 |
## | Dependents | 0 | 1 | 0 | 2 | 15 | 5 | 0 |
## | Education | 0 | 1 | 8 | 12 | 0 | 2 | 0 |
## | Self_Employed | 0 | 1 | 0 | 3 | 32 | 3 | 0 |
## | Property_Area | 0 | 1 | 5 | 9 | 0 | 3 | 0 |
## | Loan_Status | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | ApplicantIncome | 0 | 1.00 | 5403.46 | 6109.04 | 150 | 2877.5 | 3812.5 | 5795.00 | 81000 | ▇▁▁▁▁ |
## | CoapplicantIncome | 0 | 1.00 | 1621.25 | 2926.25 | 0 | 0.0 | 1188.5 | 2297.25 | 41667 | ▇▁▁▁▁ |
## | LoanAmount | 22 | 0.96 | 146.41 | 85.59 | 9 | 100.0 | 128.0 | 168.00 | 700 | ▇▃▁▁▁ |
## | Loan_Amount_Term | 14 | 0.98 | 342.00 | 65.12 | 12 | 360.0 | 360.0 | 360.00 | 480 | ▁▁▁▇▁ |
## | Credit_History | 50 | 0.92 | 0.84 | 0.36 | 0 | 1.0 | 1.0 | 1.00 | 1 | ▂▁▁▁▇ |
# Count rows that are exact copies of an earlier row
n_dup_rows <- sum(duplicated(df))
print(paste("Total duplicate rows:", n_dup_rows))
## [1] "Total duplicate rows: 0"
# Loan_ID is expected to identify each application uniquely; count repeats
n_dup_ids <- sum(duplicated(df$Loan_ID))
print(paste("Duplicate Loan_IDs:", n_dup_ids))
## [1] "Duplicate Loan_IDs: 0"
# I will fill missing values (NAs).
# Note: R treats blank strings "" as different from NA, and read.csv() left
# the gaps in the character columns (Gender, Married, Dependents,
# Self_Employed) as "". Convert those blanks to NA first; otherwise the
# missingness plot below only sees the NAs in the numeric columns.
df <- df %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
# Visualize the missing data: one bar per combination of variables that are
# missing together. Credit_History has the most NAs (50).
# nsets = n_var_miss(df) includes every variable that has NAs; the default
# only displays up to five sets.
gg_miss_upset(df, nsets = n_var_miss(df))
# Fill gaps in the numeric loan columns (LoanAmount, Loan_Amount_Term) with
# each column's own median. The median is used instead of the mean because
# the income/amount data is likely skewed.
df <- df %>%
  mutate(
    across(
      c(LoanAmount, Loan_Amount_Term),
      ~ coalesce(.x, median(.x, na.rm = TRUE))
    )
  )
# Impute CATEGORICAL missing values (Gender, Married, Dependents,
# Self_Employed) with the MODE (the most frequent value) of each column.
# The mode is computed from the data rather than hard-coded, so the step stays
# correct if the input file changes. (The values previously typed in by hand
# were "Male", "Yes", "No" and "0".)

# Most frequent non-NA value of `x`, returned as a character string.
# Ties resolve to the first value in table()'s sorted order; returns
# NA_character_ when `x` has no non-NA values.
most_frequent <- function(x) {
  counts <- table(x) # table() drops NA by default
  if (length(counts) == 0L) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}

df <- df %>%
  mutate(
    across(
      c(Gender, Married, Self_Employed, Dependents),
      ~ coalesce(.x, most_frequent(.x))
    )
  )
# Credit_History is a critical variable, and most applicants have the value 1.
# Fill its missing entries with that mode (1.0).
df <- df %>%
  mutate(Credit_History = coalesce(Credit_History, 1.0))
# Sanity check: count any missing cells that survived imputation
remaining_na <- sum(is.na(df))
print(paste("Total NAs left in data:", remaining_na))
## [1] "Total NAs left in data: 0"
# Correcting data types and values
# Dependents uses the label "3+"; relabel it as plain "3" and leave every
# other value untouched
df <- df %>%
  mutate(Dependents = if_else(Dependents == "3+", "3", Dependents))
# Turn every categorical column into a factor (R's type for categories)
categorical_cols <- c(
  "Gender", "Married", "Dependents", "Education",
  "Self_Employed", "Credit_History", "Property_Area", "Loan_Status"
)
df <- df %>%
  mutate(across(all_of(categorical_cols), as.factor))
# Derived features that might be more predictive than the raw columns:
#   Total_Income    - applicant income plus co-applicant income
#   Income_per_Loan - total income divided by the requested loan amount
df$Total_Income <- df$ApplicantIncome + df$CoapplicantIncome
df$Income_per_Loan <- df$Total_Income / df$LoanAmount
# Check the structure again to confirm the new types and columns.
# str() prints its report itself and returns NULL invisibly; wrapping it in
# print() emits a stray "NULL" line after the report.
str(df)
## 'data.frame': 614 obs. of 15 variables:
## $ Loan_ID : chr "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
## $ Dependents : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 3 1 4 3 2 ...
## $ Education : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
## $ Self_Employed : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: num 0 1508 0 2358 0 ...
## $ LoanAmount : num 128 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : num 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
## $ Property_Area : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
## $ Loan_Status : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
## $ Total_Income : num 5849 6091 3000 4941 6000 ...
## $ Income_per_Loan : num 45.7 47.6 45.5 41.2 42.6 ...
## NULL
# install.packages("reshape2")
library(reshape2) # For the correlation heatmap (melt function)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Class balance of the target: number of loans per Loan_Status value (Y / N)
ggplot(data = df, mapping = aes(x = Loan_Status, fill = Loan_Status)) +
  geom_bar() +
  theme_minimal() +
  ggtitle("Loan Approval Status (Y/N)") +
  xlab("Loan Status") +
  ylab("Count")
# Analyze categorical variables vs. the target

# 100%-stacked bar chart: share of each Loan_Status within every level of a
# categorical column.
#   data    - data frame containing Loan_Status and the column given in `var`
#   var     - unquoted column name to place on the x axis
#   title   - plot title
#   x_label - x-axis label
# Returns the ggplot object, which auto-prints when called at top level.
plot_status_share <- function(data, var, title, x_label) {
  ggplot(data, aes(x = {{ var }}, fill = Loan_Status)) +
    geom_bar(position = "fill") +
    labs(title = title, x = x_label, y = "Proportion") +
    theme_minimal()
}

# Credit_History vs. Loan_Status
# (This plot shows that Credit_History is the strongest predictor)
plot_status_share(
  df, Credit_History,
  title = "Loan Status by Credit History",
  x_label = "Credit History (1 = Yes, 0 = No)"
)

# Property_Area vs. Loan_Status
plot_status_share(
  df, Property_Area,
  title = "Loan Status by Property Area",
  x_label = "Property Area"
)

# Married vs. Loan_Status
plot_status_share(
  df, Married,
  title = "Loan Status by Marital Status",
  x_label = "Married"
)
# Plot a histogram of the engineered Total_Income feature.
# Every layer must be joined with `+`: without the `+` after scale_x_log10(),
# theme_minimal() runs as a separate statement, so the theme is never applied
# to the plot and the theme object itself is printed to the console.
ggplot(df, aes(x = Total_Income)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  labs(
    title = "Distribution of Total Income",
    x = "Total Income",
    y = "Frequency"
  ) +
  scale_x_log10() + # Use a log scale because income is highly skewed
  theme_minimal()
## List of 136
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ title : NULL
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.75points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.bottom : NULL
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.75points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.left : NULL
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.75points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.2points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.bottom : NULL
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.left : NULL
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.theta : NULL
## $ axis.text.r :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.position : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.position : NULL
## $ legend.position : chr "right"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## $ legend.location : NULL
## $ legend.box : NULL
## $ legend.box.just : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "unit")= int 1
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## [list output truncated]
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
# Box plots comparing numeric features across the target classes

# Total_Income per Loan_Status (log10 y axis makes the boxes easier to read)
ggplot(
  data = df,
  mapping = aes(x = Loan_Status, y = Total_Income, fill = Loan_Status)
) +
  geom_boxplot() +
  scale_y_log10() +
  theme_minimal() +
  ggtitle("Total Income by Loan Status") +
  xlab("Loan Status") +
  ylab("Total Income")

# LoanAmount per Loan_Status (linear y axis)
ggplot(
  data = df,
  mapping = aes(x = Loan_Status, y = LoanAmount, fill = Loan_Status)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Loan Amount by Loan Status") +
  xlab("Loan Status") +
  ylab("Loan Amount")
# This one command will generate a full HTML report with visualizations!
#create_report(df, y = "Loan_Status")
# Overlaid density curves: Total_Income for approved (Y) vs. rejected (N)
# loans. alpha = 0.5 keeps both curves visible where they overlap; the x axis
# is log10 because income is skewed.
ggplot(data = df, mapping = aes(x = Total_Income, fill = Loan_Status)) +
  geom_density(alpha = 0.5) +
  scale_x_log10() +
  theme_minimal() +
  ggtitle("Distribution of Total Income by Loan Status") +
  xlab("Total Income (Log Scale)") +
  ylab("Density") +
  labs(fill = "Loan Status")

# Same comparison for LoanAmount, on a linear x axis
ggplot(data = df, mapping = aes(x = LoanAmount, fill = Loan_Status)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  ggtitle("Distribution of Loan Amount by Loan Status") +
  xlab("Loan Amount") +
  ylab("Density") +
  labs(fill = "Loan Status")
# Scatter of Total_Income (log10 x axis) against LoanAmount, colored by
# Loan_Status; semi-transparent points keep overlapping observations visible
ggplot(
  data = df,
  mapping = aes(x = Total_Income, y = LoanAmount, color = Loan_Status)
) +
  geom_point(alpha = 0.6) +
  scale_x_log10() +
  theme_minimal() +
  ggtitle("Total Income vs. Loan Amount by Loan Status") +
  xlab("Total Income (Log Scale)") +
  ylab("Loan Amount") +
  labs(color = "Loan Status")
# Shared base for the two faceted scatter plots below:
# Total_Income (log10 x axis) vs. LoanAmount, colored by Loan_Status
income_amount_base <- ggplot(
  df,
  aes(x = Total_Income, y = LoanAmount, color = Loan_Status)
) +
  geom_point(alpha = 0.4) +
  scale_x_log10() +
  theme_minimal()

# Facet by Property Area: one panel per Property_Area level
income_amount_base +
  facet_wrap(vars(Property_Area)) +
  labs(
    title = "Total Income vs. Loan Amount (Faceted by Property Area)",
    x = "Total Income (Log Scale)",
    y = "Loan Amount"
  )

# Facet by Education and Credit History:
# Education levels in rows, Credit_History levels in columns
income_amount_base +
  facet_grid(rows = vars(Education), cols = vars(Credit_History)) +
  labs(
    title = "Total Income vs. Loan Amount (by Education and Credit History)",
    x = "Total Income (Log Scale)",
    y = "Loan Amount"
  )
# Columns to correlate: the numeric measurements plus the engineered features
numeric_cols <- c(
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount",
  "Loan_Amount_Term", "Total_Income", "Income_per_Loan"
)
numeric_df <- df[, numeric_cols]
# Pairwise correlations between those columns (cor()'s default method)
cor_matrix <- cor(numeric_df)
# Reshape the square matrix to long form (Var1, Var2, value) for ggplot
melted_cor <- reshape2::melt(cor_matrix)
# Heatmap of the correlation matrix: one tile per variable pair, colored on a
# diverging blue-white-red scale centered at 0 and labeled with the rounded
# correlation value.
ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() + # This creates the colored tiles
  # `limits` is spelled out in full; the abbreviated `limit` only worked
  # through R's partial argument matching.
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1),
    name = "Correlation"
  ) +
  geom_text(aes(label = round(value, 2)), size = 3) + # Add the numbers
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)
  ) +
  coord_fixed() + # Makes the tiles square
  labs(
    title = "Correlation Heatmap of Numeric Variables",
    x = "", y = ""
  )