1. Data preparation

Read file

# Load the accepted-loan and rejected-loan histories from the input folder.
dataSukses <- read.csv(file = "input/accepted_history.csv")
dataGagal <- read.csv(file = "input/rejected_history.csv")

# Size (rows, columns) of each table.
dim(dataSukses)
## [1] 1048575     151
dim(dataGagal)
## [1] 27648741        9

#{r} #str(dataSukses) #

# Count how many columns there are of each storage class.
# vapply() with class(col)[1L] always yields exactly one label per column, so
# the tally still works if a column carries more than one class; sapply() would
# return a list in that case and table() could not tabulate it.
type_counts <- table(
  vapply(dataSukses, function(col) class(col)[1L], character(1))
)
type_counts
## 
## character   integer   logical   numeric 
##        38        84         1        28
# Interactive inspection helpers; the FALSE guard keeps them from running
# when the document is rendered.
if (FALSE) {
  head(dataSukses)
  names(dataSukses)
  str(dataSukses)
  dim(dataSukses)
}

# Column names of the rejected-loan table.
colnames(dataGagal)
## [1] "Amount.Requested"     "Application.Date"     "Loan.Title"          
## [4] "Risk_Score"           "Debt.To.Income.Ratio" "Zip.Code"            
## [7] "State"                "Employment.Length"    "Policy.Code"

2. High level analysis

Inspect column types, i.e., numeric (int, num), factor, chr, logi Numerical columns (quantitative)

# Names of the numeric (integer or double) columns.
quanti <- colnames(select_if(dataSukses, is.numeric))
# quanti

# Positions of those numeric columns within dataSukses.
quantivar <- which(is.element(colnames(dataSukses), quanti))
quantivar
##   [1]   3   4   5   7   8  14  25  26  28  29  30  31  32  33  34  35  36  37
##  [19]  39  40  41  42  43  44  45  46  47  49  52  53  54  55  56  58  59  61
##  [37]  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79
##  [55]  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97
##  [73]  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
##  [91] 116 117 118 120 121 122 123 124 125 126 127 128 133 134 138 139 141 142
## [109] 143 149 150 151

Categorical columns (qualitative): we have not assigned any factors yet, so we expect no factor-type columns.

# Names of the factor columns (none are expected at this stage).
quali <- colnames(select_if(dataSukses, is.factor))
# quali

# Positions of the factor columns within dataSukses.
qualivar <- which(is.element(colnames(dataSukses), quali))
qualivar
## integer(0)

character columns

# Names of the character (free-text / label) columns.
colChar <- colnames(select_if(dataSukses, is.character))
# colChar

# Positions of the character columns within dataSukses.
colCharVar <- which(is.element(colnames(dataSukses), colChar))
colCharVar
##  [1]   1   6   9  10  11  12  13  15  16  17  18  19  20  21  22  23  24  27  38
## [20]  48  50  51  57  60 119 129 130 131 132 135 136 137 140 144 145 146 147 148

logical columns

# Names of the logical columns.
colLog <- colnames(select_if(dataSukses, is.logical))
# colLog

# Positions of the logical columns within dataSukses.
colLogVar <- which(is.element(colnames(dataSukses), colLog))
colLogVar
## [1] 2

correlation check

library(GGally)
# Correlation matrix plot over the whole accepted-loan table. Non-numeric
# columns are skipped by ggcorr(), which reports them in the warning below.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ggcorr(dataSukses, label = TRUE, hjust = 1, layout.exp = 2)
## Warning in ggcorr(dataSukses, label = TRUE, hjust = 1, layout.exp = 2): data
## in column(s) 'id', 'member_id', 'term', 'grade', 'sub_grade', 'emp_title',
## 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status',
## 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
## 'earliest_cr_line', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d',
## 'last_credit_pull_d', 'application_type', 'verification_status_joint',
## 'sec_app_earliest_cr_line', 'hardship_flag', 'hardship_type', 'hardship_reason',
## 'hardship_status', 'hardship_start_date', 'hardship_end_date',
## 'payment_plan_start_date', 'hardship_loan_status', 'disbursement_method',
## 'debt_settlement_flag', 'debt_settlement_flag_date', 'settlement_status',
## 'settlement_date' are not numeric and were ignored
## Warning in cor(data, use = method[1], method = method[2]): the standard
## deviation is zero

It seems difficult to get insight using ggcorr on the full table.

Check loan status in success Loan

# Tally the accepted loans by their current status.
table(dataSukses[["loan_status"]])
## 
##                           Charged Off            Current            Default 
##                 12             129187             419550                 21 
##         Fully Paid    In Grace Period  Late (16-30 days) Late (31-120 days) 
##             482595               4227               2104              10879

Plot loan status

# Bar chart of the loan-status counts.
# No xlim is supplied: with barplot()'s default width (1) and spacing (0.2) the
# eight status bars extend to about x = 9.6, so the former xlim = c(0, 8) was
# too narrow for the last bars. The default limits always fit every bar.
tableLoanFull <- table(dataSukses$loan_status)
barplot(tableLoanFull, ylim = c(0, 750000))

Choose to focus on Fully paid and charged off customer

# Keep only loans with a final outcome: fully paid or charged off.
# %in% never returns NA, so a missing loan_status cannot slip through as an
# all-NA row the way it can when subsetting with `==`.
dataPilih <- dataSukses[dataSukses$loan_status %in% c("Fully Paid", "Charged Off"), ]
nrow(dataPilih)
## [1] 611782

check class proportion

# Share of each outcome among the selected loans.
prop.table(table(dataPilih[["loan_status"]]))
## 
## Charged Off  Fully Paid 
##   0.2111651   0.7888349

We can see that about 21.1% of the loans are Charged Off, whereas about 78.9% are Fully Paid, so the two classes are imbalanced.

Plot loan status for a new selected data

# Bar chart of the two remaining outcomes.
# No xlim is supplied: with barplot()'s default width (1) and spacing (0.2) two
# bars extend to about x = 2.4, so the former xlim = c(0, 2) was too narrow for
# the second bar.
tableLoan <- table(dataPilih$loan_status)
barplot(tableLoan, ylim = c(0, 750000), xlab = "Loan Status", ylab = "count")

3. Data cleaning

Plotting statistics of null rows in each column

check NA

# Is anything missing at all in the selected rows?
anyNA(dataPilih)
## [1] TRUE

# Missing-value count per column, largest first.
na_counts <- colSums(is.na(dataPilih))
na_counts_sorted <- na_counts[order(na_counts, decreasing = TRUE)]
# na_counts_sorted

Plot Null count Note: las = 2: The axis labels are perpendicular to the axis, with vertical labels

# Missing-value counts per column; las = 2 turns the labels perpendicular
# to the axis.
barplot(
  na_counts_sorted,
  xlim = c(0, 190),
  las = 2,
  main = "Statistics of null rows",
  xlab = "Column Name",
  ylab = "Values"
)

Display percentage instead of value

# Fraction (0 to 1) of missing values per column, largest first.
na_percent <- colSums(is.na(dataPilih)) / nrow(dataPilih)
na_percent_sorted <- na_percent[order(na_percent, decreasing = TRUE)]

# Same bar chart as before, but on the fraction scale.
barplot(
  na_percent_sorted,
  xlim = c(0, 190),
  las = 2,
  main = "Percentage of null rows",
  xlab = "Column Name",
  ylab = "Values"
)

Plotting statistics of null rows with ggplot

Frame for ggplot

# Per-column fraction of missing values, packaged as a data frame for ggplot.
nullData <- data.frame(colNames = names(dataPilih), null_count = na_percent)

# Sort from the largest fraction to the smallest. The former
# order(-x, decreasing = TRUE) negated the direction twice and therefore
# sorted ascending.
sortedNull <- nullData[order(nullData$null_count, decreasing = TRUE), ]
# sortedNull

using ggPlot

# One viridis colour per column, sized from the data instead of a
# hard-coded 151.
color_palette <- viridisLite::viridis(nrow(sortedNull))

# Fix the bar order (largest missing fraction first) as factor levels on a
# plotting copy, so that x position and fill colour follow the same ordering.
# This replaces reorder(..., decreasing = TRUE), whose `decreasing` argument is
# only honoured by newer R releases, and the external fill vector that was
# matched to rows purely by position.
plotNull <- sortedNull
plotNull$colNames <- factor(
  plotNull$colNames,
  levels = plotNull$colNames[order(plotNull$null_count, decreasing = TRUE)]
)

# Create the bar plot
ggplot(plotNull, aes(x = colNames, y = null_count, fill = colNames)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage of null data in each column", x = "Column Name", y = "Values") +
  scale_fill_manual(values = color_palette) + # unnamed values are used in level order
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  guides(fill = "none") # "none" replaces the deprecated fill = FALSE

Focus on data <50% null

# Keep the columns in which less than half of the values are missing.
kolomPilih <- dataPilih[, na_percent < 0.5]
dim(kolomPilih)
## [1] 611782    108

what are the selected columns?

# Names of the columns that survived the missing-value filter, and how many.
kolomPilih_names <- colnames(kolomPilih)
length(kolomPilih_names)
## [1] 108

Plot a new null statistics (barplot)

# Fraction (0 to 1) of missing values per retained column, largest first.
na_percent_kolom <- colSums(is.na(kolomPilih)) / nrow(kolomPilih)
na_percent_kolom_sorted <- na_percent_kolom[order(na_percent_kolom, decreasing = TRUE)]

# No xlim is supplied: with barplot()'s default width (1) and spacing (0.2) the
# 108 bars extend to about x = 130, so the former xlim = c(0, 110) was too
# narrow for the last bars.
barplot(
  na_percent_kolom_sorted,
  las = 2,
  main = "New statistics of null rows",
  xlab = "Column Name",
  ylab = "Values"
)

Plot a new null statistics with ggplot Frame for ggplot

# Per-column fraction of missing values for the retained columns, as a data
# frame for ggplot.
nullData_kolom <- data.frame(colNames = names(kolomPilih), null_count = na_percent_kolom)

# Sort from the largest fraction to the smallest. The former
# order(-x, decreasing = TRUE) negated the direction twice and therefore
# sorted ascending.
sortedNull_kolom <- nullData_kolom[order(nullData_kolom$null_count, decreasing = TRUE), ]
# sortedNull_kolom

using ggPlot

# One viridis colour per retained column, sized from the data instead of a
# hard-coded 108.
color_palette <- viridisLite::viridis(nrow(sortedNull_kolom))

# Fix the bar order (largest missing fraction first) as factor levels on a
# plotting copy, so that x position and fill colour follow the same ordering.
# This replaces reorder(..., decreasing = TRUE), whose `decreasing` argument is
# only honoured by newer R releases, and the external fill vector that was
# matched to rows purely by position.
plotNull_kolom <- sortedNull_kolom
plotNull_kolom$colNames <- factor(
  plotNull_kolom$colNames,
  levels = plotNull_kolom$colNames[order(plotNull_kolom$null_count, decreasing = TRUE)]
)

# Create the bar plot
ggplot(plotNull_kolom, aes(x = colNames, y = null_count, fill = colNames)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage of null (selected) data in each column", x = "Column Name", y = "Values") +
  scale_fill_manual(values = color_palette) + # unnamed values are used in level order
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  guides(fill = "none") # "none" replaces the deprecated fill = FALSE

Focus on selected features

# Features kept for the analysis. The order is preserved because it fixes the
# column order of finalPilih.
selected_columns <- c(
  "addr_state", "annual_inc", "earliest_cr_line", "emp_length", "emp_title",
  "fico_range_high", "fico_range_low", "grade", "home_ownership",
  "application_type", "initial_list_status", "int_rate", "loan_amnt",
  "num_actv_bc_tl", "loan_status", "mort_acc", "tot_cur_bal", "open_acc",
  "pub_rec", "pub_rec_bankruptcies", "purpose", "revol_bal", "revol_util",
  "sub_grade", "term", "title", "total_acc", "verification_status"
)
# selected_columns

# Restrict the table to those features and confirm its size.
finalPilih <- kolomPilih[, selected_columns]
# head(finalPilih)
dim(finalPilih)
## [1] 611782     28

4. Exploratory data analysis

The sapply() function is used to apply a specified function to each element of a vector or a list and simplify the results into a vector, matrix, or array.

# Count how many of the selected columns there are of each storage class.
# vapply() with class(col)[1L] always yields exactly one label per column, so
# the tally still works if a column carries more than one class; sapply() would
# return a list in that case and table() could not tabulate it.
type_counts <- table(
  vapply(finalPilih, function(col) class(col)[1L], character(1))
)
type_counts
## 
## character   integer   numeric 
##        14        11         3

We still keep the original type of data (char, int, numeric)

check correlation

change data types, put in temp-datasets

NOTE:
Use sapply(): if you want a simplified output in the form of a vector, matrix, or array

Use lapply(): to preserve the structure of the original data and obtain the results as a list

# Work on a copy so that finalPilih keeps its original column types.
finalPilih_temp <- finalPilih

# Step 1: character columns become factors.
ubahChar <- vapply(finalPilih_temp, is.character, logical(1))
finalPilih_temp[ubahChar] <- lapply(finalPilih_temp[ubahChar], as.factor)

# Step 2: integer columns become doubles (the next step would do this too).
ubahInt <- vapply(finalPilih_temp, is.integer, logical(1))
finalPilih_temp[ubahInt] <- lapply(finalPilih_temp[ubahInt], as.numeric)

# Step 3: every column becomes numeric; factors turn into their integer level
# codes, which as.factor() assigns in sorted order of the labels.
finalPilih_temp <- as.data.frame(lapply(finalPilih_temp, as.numeric))
# head(finalPilih_temp)

Let’s check the type of column

# Confirm that every column of the converted copy is now numeric.
# vapply() with class(col)[1L] always yields exactly one label per column, so
# the tally still works if a column carries more than one class; sapply() would
# return a list in that case and table() could not tabulate it.
type_counts <- table(
  vapply(finalPilih_temp, function(col) class(col)[1L], character(1))
)
type_counts
## 
## numeric 
##      28

check correlation

library(GGally)
# Correlation matrix plot of the all-numeric copy, stored first and then shown.
corr_plot <- ggcorr(data = finalPilih_temp)
corr_plot

We can see that loan_status has a relationship with: (1) annual income, (2) loan amount, (3) fico_range_low and fico_range_high, (4) num_actv_bc_tl, and (5) interest rate (negative correlation). However, the correlation is still not very clear from this plot, so let’s try another plot using ggplot.

ggplot

library(ggplot2)
library(viridisLite)

# Correlation of every feature with loan_status.
# loan_status is removed by name: the former `[-1, "loan_status"]` dropped the
# first row (addr_state) instead and left loan_status in with a trivial
# correlation of 1. Pairwise-complete observations are used so that the column
# containing missing values still gets a correlation instead of NA.
predictor_cols <- setdiff(names(finalPilih_temp), "loan_status")
correlation <- cor(
  finalPilih_temp[predictor_cols],
  finalPilih_temp$loan_status,
  use = "pairwise.complete.obs"
)[, 1] # one-column matrix -> named vector
sorted_correlation <- sort(correlation) # ascending, names kept

# Fix the bar order through factor levels; a plain character column would be
# drawn in alphabetical order rather than in sorted-correlation order.
data <- data.frame(
  variable = factor(names(sorted_correlation), levels = names(sorted_correlation)),
  correlation = sorted_correlation
)

# Plot correlation with loan_status for every feature
ggplot(data, aes(x = variable, y = correlation, fill = correlation)) +
  geom_bar(stat = "identity") +
  coord_flip() + # flip x and y axis so the feature names read horizontally
  scale_fill_viridis_c() + # ggplot2's own continuous viridis scale
  labs(x = "", y = "Correlation with loan_status") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) # vertical tick labels, centred

# NOTE: the variable column holds the feature names; loan_status itself is
# excluded because its correlation with itself is always 1.

Check the proportion of each value in loan status (1 = Charged Off, 2 = Fully Paid; the factor codes follow the alphabetical order of the labels)

# Counts per outcome code in the numeric copy.
table(finalPilih_temp[["loan_status"]])
## 
##      1      2 
## 129187 482595
# The same tally expressed as proportions.
prop.table(table(finalPilih_temp[["loan_status"]]))
## 
##         1         2 
## 0.2111651 0.7888349
# Bar chart of the counts.
barplot(
  table(finalPilih_temp[["loan_status"]]),
  xlim = c(0, 3),
  ylim = c(0, 500000),
  main = "the number of Fully Paid and Charge Off",
  xlab = "Loan Status",
  ylab = "Values"
)

# Bar chart of the proportions.
barplot(
  prop.table(table(finalPilih_temp[["loan_status"]])),
  xlim = c(0, 3),
  ylim = c(0, 1),
  main = "percentage of Fully Paid and Charge Off",
  xlab = "Loan Status",
  ylab = "Values"
)

There is unbalanced proportion of the loan status

#finalPilih_temp$loan_status
#finalPilih_temp$loan_status <- as.integer(factor(as.character(finalPilih_temp$loan_status)))
#finalPilih_temp

Plotting statistics of null rows with ggplot

Remember: now we only focus on selected columns(with missing values<50%, and use features that are available before loan is succeeded)

library(ggplot2)
library(viridis)

# Calculate null counts
library(ggplot2)
library(viridis)

# Calculate null counts per selected column
na_counts <- colSums(is.na(finalPilih_temp))
nullData <- data.frame(colNames = names(finalPilih_temp), null_count = na_counts)

# Sort from the largest count to the smallest. The former
# order(-x, decreasing = TRUE) negated the direction twice and therefore
# sorted ascending.
sortedNull <- nullData[order(nullData$null_count, decreasing = TRUE), ]

# Plot null counts, with each bar labelled by its count
ggplot(sortedNull, aes(x = colNames, y = null_count, fill = null_count)) +
  geom_bar(stat = "identity") +
  scale_fill_viridis(option = "plasma", direction = -1) +
  labs(x = "", y = "Null Count", title = "Null Count by Column") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_text(aes(label = null_count), vjust = -0.5)

# How many columns share each null count
#table(sortedNull)
table(sortedNull$null_count)
## 
##   0 326 
##  27   1

we have only 1 column left with null values. The remaining 27 columns have no null

5. Investigate each column

addr_state

# Any missing states in the numeric copy?
anyNA(finalPilih_temp$addr_state)
## [1] FALSE
# Distinct values: in this copy they are numeric codes, not state abbreviations.
unique(finalPilih_temp$addr_state)
##  [1] 38 41 14 11 23 40 39 27  5 45  4 15 20 34 43 16 32  2 47 35 18 10  6 22 24
## [26]  8 19 48 12 46 31  9 42 30 29 37  7  3 33 49 26 50 36 17 25 44 28 21  1 13

Oops, we should go back to the original version, before the conversion to numeric: use finalPilih to investigate each column.

# Distinct state abbreviations in the untransformed table.
unique(finalPilih[["addr_state"]])
##  [1] "PA" "SD" "IL" "GA" "MN" "SC" "RI" "NC" "CA" "VA" "AZ" "IN" "MD" "NY" "TX"
## [16] "KS" "NM" "AL" "WA" "OH" "LA" "FL" "CO" "MI" "MO" "DC" "MA" "WI" "HI" "VT"
## [31] "NJ" "DE" "TN" "NH" "NE" "OR" "CT" "AR" "NV" "WV" "MT" "WY" "OK" "KY" "MS"
## [46] "UT" "ND" "ME" "AK" "ID"
# Number of distinct states.
length(unique(finalPilih[["addr_state"]]))
## [1] 50
# Loans per state, most frequent first.
addr_state_counts <- sort(table(finalPilih[["addr_state"]]), decreasing = TRUE)
addr_state_counts
## 
##    CA    TX    NY    FL    IL    NJ    PA    OH    GA    NC    VA    MI    AZ 
## 86062 50691 48842 43967 23619 21377 20586 20312 19888 17472 16955 16622 14939 
##    MD    MA    CO    WA    MN    IN    TN    MO    NV    CT    WI    AL    OR 
## 14236 13874 13638 13043 11205 10629  9964  9576  9277  8782  8248  7517  7261 
##    SC    LA    KY    OK    KS    AR    UT    MS    NM    HI    NH    RI    NE 
##  7162  7117  5895  5694  5008  4576  4468  3722  3412  2937  2928  2658  2233 
##    WV    DE    MT    DC    AK    WY    SD    VT    ME    ND    ID 
##  2222  1780  1707  1445  1407  1262  1254  1232  1153  1006   922

Plot addr_state

library(ggplot2)

# Loans per state, most frequent first.
freq_table <- sort(table(finalPilih$addr_state), decreasing = TRUE)

# Two-column data frame (state, count) for plotting; counts stored as numeric.
data <- data.frame(
  addr_state = names(freq_table),
  count = as.numeric(freq_table)
)

# Bars ordered from the most to the least frequent state, each labelled with
# its count rotated by 90 degrees.
ggplot(data, aes(x = reorder(addr_state, -count), y = count, fill = count)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = count), vjust = 0, size = 3, color = "darkgray", angle = 90) +
  labs(title = "Frequency of addr_state", x = "addr_state", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis(option = "plasma") # continuous fill keyed to count, shown in the legend

Optional This is just additional code, because I am not comfortable with how R studio display data horizontally instead of vertically

library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.2.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(knitr)

# Loans per state, most frequent first.
addr_state_counts <- sort(table(finalPilih$addr_state), decreasing = TRUE)

# Two-column data frame (state, count) built from the frequency table.
data <- data.frame(
  addr_state = names(addr_state_counts),
  count = as.numeric(addr_state_counts)
)

# Order the rows by count, largest first.
data <- data[order(-data$count), ]

# Stack each state code vertically: strsplit(state, "") breaks the string into
# single characters, [[1]] takes that character vector out of the returned
# list, and paste(..., collapse = "\n") joins the characters with line breaks.
data$addr_state <- vapply(
  data$addr_state,
  function(state) paste(strsplit(state, "")[[1]], collapse = "\n"),
  character(1)
)

# Render the table with striped rows across the full output width.
kable_styling(
  kable(data, align = "c", caption = "Frequency of addr_state"),
  bootstrap_options = "striped",
  full_width = TRUE
)
Frequency of addr_state
addr_state count
C A 86062
T X 50691
N Y 48842
F L 43967
I L 23619
N J 21377
P A 20586
O H 20312
G A 19888
N C 17472
V A 16955
M I 16622
A Z 14939
M D 14236
M A 13874
C O 13638
W A 13043
M N 11205
I N 10629
T N 9964
M O 9576
N V 9277
C T 8782
W I 8248
A L 7517
O R 7261
S C 7162
L A 7117
K Y 5895
O K 5694
K S 5008
A R 4576
U T 4468
M S 3722
N M 3412
H I 2937
N H 2928
R I 2658
N E 2233
W V 2222
D E 1780
M T 1707
D C 1445
A K 1407
W Y 1262
S D 1254
V T 1232
M E 1153
N D 1006
I D 922
#The kable_styling() function with the bootstrap_options = "striped" argument applies striped row styling to the table generated by kable(). This means that alternate rows in the table will have different background colors, making it easier to distinguish between rows.
#The full_width = TRUE argument specifies that the table should occupy the full width of the output document (use full_width = FALSE for a table that is only as wide as its content)

One hot encoding the categorical value

# One indicator column per state; the "0 +" term removes the intercept so that
# every state gets its own column.
state_dummies <- model.matrix(~ 0 + addr_state, data = finalPilih)
finalPilih_encoded <- cbind(finalPilih, state_dummies)

# The indicator columns replace the original text column.
finalPilih_encoded[["addr_state"]] <- NULL

# Preview the encoded table.
head(finalPilih_encoded)

annual_inc

Income type is numeric

# Any missing incomes?
anyNA(finalPilih$annual_inc)
## [1] FALSE
# Number of distinct income values.
length(unique(finalPilih$annual_inc))
## [1] 35807

check statistics

# Five-number summary plus mean of annual income.
summary(finalPilih[["annual_inc"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0    45600    65000    76875    91687 10999200
library(ggplot2)

# Histogram of annual income in bins 100,000 wide.
ggplot(data = finalPilih, mapping = aes(x = annual_inc)) +
  geom_histogram(binwidth = 100000, fill = "steelblue", color = "white") +
  theme_minimal() +
  labs(title = "Distribution of annual_inc", x = "annual_inc", y = "Frequency")

Try geom_density() instead

library(ggplot2)
# Print plain numbers on the axes (e.g. 3000000 rather than 3e+06). This
# option stays set for the rest of the session.
options(scipen = 999)

# Smoothed distribution of annual income.
ggplot(data = finalPilih, mapping = aes(x = annual_inc)) +
  geom_density(alpha = 0.5, fill = "steelblue") +
  theme_minimal() +
  labs(title = "Distribution of annual_inc", x = "annual_inc", y = "Density")

try boxplot

library(ggplot2)

# Vertical boxplot: annual income on the y axis.
ggplot(data = dataPilih, mapping = aes(y = annual_inc)) +
  geom_boxplot(color = "darkblue", fill = "lightblue") +
  theme_minimal() +
  labs(title = "Boxplot of annual_inc", y = "annual_inc")

try boxplot with horizontal view

# Horizontal boxplot: annual income on the x axis.
ggplot(data = dataPilih, mapping = aes(x = annual_inc)) +
  geom_boxplot(color = "darkblue", fill = "lightblue") +
  theme_minimal() +
  labs(title = "Boxplot of annual_inc", x = "annual_inc")

> it seems too much data are concentrated below 3000000

# Count rows below / at-or-above 3,000,000 straight from the logical test.
# This avoids copying the wide table just to count its rows, and na.rm keeps a
# missing income from being counted as a match.
sum(dataPilih$annual_inc < 3000000, na.rm = TRUE)
## [1] 611741
sum(dataPilih$annual_inc >= 3000000, na.rm = TRUE)
## [1] 41
nrow(dataPilih)
## [1] 611782
# Percentage of all selected loans whose income is below 3,000,000.
percent_of_rows_of_customers_with_annual_inc_less_than_3000K <-
  (sum(dataPilih$annual_inc < 3000000, na.rm = TRUE) / nrow(dataPilih)) * 100
percent_of_rows_of_customers_with_annual_inc_less_than_3000K
## [1] 99.9933

Plot < 3000K (annual_inc below 3,000,000)

# Rows with an annual income below 3,000,000.
filtered_data <- dataPilih[dataPilih$annual_inc < 3000000, ]

# Density of annual income for that subset.
ggplot(data = filtered_data, mapping = aes(x = annual_inc)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  theme_minimal() +
  labs(title = "Density of annual_inc (annual_inc < 3000.000)", x = "annual_inc", y = "Density")

# Count rows below / at-or-above 700,000 straight from the logical test.
# This avoids copying the wide table just to count its rows, and na.rm keeps a
# missing income from being counted as a match.
sum(dataPilih$annual_inc < 700000, na.rm = TRUE)
## [1] 611370
sum(dataPilih$annual_inc >= 700000, na.rm = TRUE)
## [1] 412
nrow(dataPilih)
## [1] 611782
# Percentage of all selected loans whose income is below 700,000.
percent_of_rows_of_customers_with_annual_inc_less_than_700K <-
  (sum(dataPilih$annual_inc < 700000, na.rm = TRUE) / nrow(dataPilih)) * 100
percent_of_rows_of_customers_with_annual_inc_less_than_700K
## [1] 99.93266
# Percentage of all selected loans whose income is 700,000 or more.
percent_of_rows_of_customers_with_annual_inc_more_than_700K <-
  (sum(dataPilih$annual_inc >= 700000, na.rm = TRUE) / nrow(dataPilih)) * 100
percent_of_rows_of_customers_with_annual_inc_more_than_700K
## [1] 0.06734425

focus on 99.93% of data

# Rows with an annual income below 700,000.
filtered_data <- dataPilih[dataPilih$annual_inc < 700000, ]

# Density of annual income for that subset.
ggplot(data = filtered_data, mapping = aes(x = annual_inc)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  theme_minimal() +
  labs(title = "Density of annual_inc (annual_inc < 700000)", x = "annual_inc", y = "Density")

> it is still too skewed

# Count rows below / at-or-above 400,000 straight from the logical test.
# This avoids copying the wide table just to count its rows, and na.rm keeps a
# missing income from being counted as a match.
sum(dataPilih$annual_inc < 400000, na.rm = TRUE)
## [1] 609819
sum(dataPilih$annual_inc >= 400000, na.rm = TRUE)
## [1] 1963
nrow(dataPilih)
## [1] 611782
# Percentage of all selected loans whose income is below 400,000.
percent_of_rows_of_customers_with_annual_inc_less_than_400K <-
  (sum(dataPilih$annual_inc < 400000, na.rm = TRUE) / nrow(dataPilih)) * 100
percent_of_rows_of_customers_with_annual_inc_less_than_400K
## [1] 99.67913
# Percentage of all selected loans whose income is 400,000 or more.
percent_of_rows_of_customers_with_annual_inc_more_than_400K <-
  (sum(dataPilih$annual_inc >= 400000, na.rm = TRUE) / nrow(dataPilih)) * 100
percent_of_rows_of_customers_with_annual_inc_more_than_400K
## [1] 0.3208659

focus on 99.67% of data

# Rows with an annual income below 400,000.
filtered_data <- dataPilih[dataPilih$annual_inc < 400000, ]

# Density of annual income for that subset.
ggplot(data = filtered_data, mapping = aes(x = annual_inc)) +
  geom_density(color = "darkblue", fill = "lightblue") +
  theme_minimal() +
  labs(title = "Density of annual_inc (annual_inc < 400.000)", x = "annual_inc", y = "Density")

it looks better than the initial distribution, after removing 0.32% data

Put the data in temporary dataset

# Selected-feature table restricted to incomes below 400,000, and its row count.
finalPilih_under400k <- finalPilih[finalPilih[["annual_inc"]] < 400000, ]
nrow(finalPilih_under400k)
## [1] 609819

Convert loan_status to factor: `finalPilih_under400k$loan_status <- as.factor(finalPilih_under400k$loan_status)`

Create the histogram and density plot: `ggplot(finalPilih_under400k, aes(x = annual_inc, fill = loan_status)) + geom_histogram(binwidth = 80, color = "black", alpha = 0.7) + geom_density(alpha = 0.7, color = "black", fill = "lightblue") + labs(title = "Distribution of annual_inc by loan_status", x = "annual_inc", y = "Count") + theme_minimal() + scale_fill_viridis()`

The error message “Discrete value supplied to continuous scale” typically occurs when a discrete variable is mapped to an aesthetic that is drawn with a continuous scale. In the plot above, loan_status (a discrete variable) is used for the fill colour while `scale_fill_viridis()` is a continuous scale by default. To resolve this issue, we keep loan_status as a character (or a factor with explicit levels) and pair it with a discrete fill scale when plotting.

We will use scale_fill_manual() to manually specify the fill colors for each level of the loan_status factor.

# Distinct outcome labels present in the income-filtered table.
unique(finalPilih_under400k[["loan_status"]])
## [1] "Fully Paid"  "Charged Off"

ggplot annual_inc < 400.000

# Make sure the outcome is stored as plain text before it is mapped to fill.
finalPilih_under400k[["loan_status"]] <- as.character(finalPilih_under400k[["loan_status"]])

# Histogram of income in bins 5,000 wide, with one fixed colour per outcome.
ggplot(data = finalPilih_under400k, mapping = aes(x = annual_inc, fill = loan_status)) +
  geom_histogram(binwidth = 5000, alpha = 0.7, color = "black") +
  scale_fill_manual(values = c("Charged Off" = "red", "Fully Paid" = "blue")) +
  theme_minimal() +
  labs(title = "Distribution of annual_inc by loan_status, for income < 400.000", x = "annual_inc", y = "Count")

compare the statistic of loan_status based on annual_income

library(dplyr)

# Mean, standard deviation, minimum and maximum income for each outcome.
summary_data <- summarise(
  group_by(finalPilih_under400k, loan_status),
  mean = mean(annual_inc),
  sd = sd(annual_inc),
  min = min(annual_inc),
  max = max(annual_inc)
)
print(summary_data)
## # A tibble: 2 × 5
##   loan_status   mean     sd   min    max
##   <chr>        <dbl>  <dbl> <dbl>  <dbl>
## 1 Charged Off 69373. 39736.     0 395000
## 2 Fully Paid  76382. 44738.     0 399821

Charged off data has a mean of 69373.07 annual income, whereas Fully paid data has a mean of 76382.47. There is a difference of 7009 annual income

# Mean income per outcome, one row per loan_status value.
loan_status_mean <- aggregate(
  annual_inc ~ loan_status,
  data = finalPilih_under400k,
  FUN = mean
)
loan_status_mean

# Difference between consecutive rows of the means table.
mean_diff <- diff(loan_status_mean[["annual_inc"]])
mean_diff
## [1] 7009.405

earliest_cr_line

The month the borrower's earliest reported credit line was opened

check type

# Storage type of the column in the untransformed table.
class(finalPilih$earliest_cr_line)
## [1] "character"
# Check for missing values on the same untransformed table, so that the whole
# column investigation uses finalPilih rather than the numeric copy.
anyNA(finalPilih$earliest_cr_line)
## [1] FALSE

# Preview the first few values. The former call `head(finalPilih$)` had no
# column name after `$` and could not be parsed.
head(finalPilih$earliest_cr_line)

END