Read file
dataSukses<- read.csv("input/accepted_history.csv")
dataGagal<- read.csv("input/rejected_history.csv")dim(dataSukses)## [1] 1048575 151
dim(dataGagal)## [1] 27648741 9
#{r} #str(dataSukses) #
type_counts <- table(sapply(dataSukses, class))
type_counts##
## character integer logical numeric
## 38 84 1 28
if(FALSE) {
head(dataSukses)
names(dataSukses)
str(dataSukses)
dim(dataSukses)
}names(dataGagal)## [1] "Amount.Requested" "Application.Date" "Loan.Title"
## [4] "Risk_Score" "Debt.To.Income.Ratio" "Zip.Code"
## [7] "State" "Employment.Length" "Policy.Code"
Inspect column types, i.e., numeric (int, num), factor, chr, logi Numerical columns (quantitative)
# Names of the numeric columns (integer or double) of dataSukses.
quanti <- colnames(dataSukses)[vapply(dataSukses, is.numeric, logical(1))]
# quanti
# Positions of the numeric columns within dataSukses.
quantivar <- seq_along(dataSukses)[colnames(dataSukses) %in% quanti]
quantivar## [1] 3 4 5 7 8 14 25 26 28 29 30 31 32 33 34 35 36 37
## [19] 39 40 41 42 43 44 45 46 47 49 52 53 54 55 56 58 59 61
## [37] 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
## [55] 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
## [73] 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## [91] 116 117 118 120 121 122 123 124 125 126 127 128 133 134 138 139 141 142
## [109] 143 149 150 151
Categorical columns (qualitative): we have not assigned any factor type yet, so we expect no factor columns.
# Names of the factor columns of dataSukses.
quali <- colnames(dataSukses)[vapply(dataSukses, is.factor, logical(1))]
# quali
# Positions of the factor columns within dataSukses.
qualivar <- seq_along(dataSukses)[colnames(dataSukses) %in% quali]
qualivar## integer(0)
character columns
# Names of the character columns of dataSukses.
colChar <- colnames(dataSukses)[vapply(dataSukses, is.character, logical(1))]
# colChar
# Positions of the character columns within dataSukses.
colCharVar <- seq_along(dataSukses)[colnames(dataSukses) %in% colChar]
colCharVar## [1] 1 6 9 10 11 12 13 15 16 17 18 19 20 21 22 23 24 27 38
## [20] 48 50 51 57 60 119 129 130 131 132 135 136 137 140 144 145 146 147 148
logical columns
# Names of the logical columns of dataSukses.
colLog <- colnames(dataSukses)[vapply(dataSukses, is.logical, logical(1))]
# colLog
# Positions of the logical columns within dataSukses.
colLogVar <- seq_along(dataSukses)[colnames(dataSukses) %in% colLog]
colLogVar## [1] 2
correlation check
library(GGally)
ggcorr(dataSukses, label = T, hjust = 1, layout.exp = 2)## Warning in ggcorr(dataSukses, label = T, hjust = 1, layout.exp = 2): data
## in column(s) 'id', 'member_id', 'term', 'grade', 'sub_grade', 'emp_title',
## 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status',
## 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state',
## 'earliest_cr_line', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d',
## 'last_credit_pull_d', 'application_type', 'verification_status_joint',
## 'sec_app_earliest_cr_line', 'hardship_flag', 'hardship_type', 'hardship_reason',
## 'hardship_status', 'hardship_start_date', 'hardship_end_date',
## 'payment_plan_start_date', 'hardship_loan_status', 'disbursement_method',
## 'debt_settlement_flag', 'debt_settlement_flag_date', 'settlement_status',
## 'settlement_date' are not numeric and were ignored
## Warning in cor(data, use = method[1], method = method[2]): the standard
## deviation is zero
It seems difficult to get insight using ggcorr.
table(dataSukses$loan_status)##
## Charged Off Current Default
## 12 129187 419550 21
## Fully Paid In Grace Period Late (16-30 days) Late (31-120 days)
## 482595 4227 2104 10879
Plot loan status
tableLoanFull <- table(dataSukses$loan_status)
barplot(tableLoanFull, xlim=c(0,8), ylim=c(0,750000))Choose to focus on Fully paid and
charged off customer
dataPilih <- (dataSukses[dataSukses$loan_status=="Fully Paid" | dataSukses$loan_status=="Charged Off", ])
nrow(dataPilih)## [1] 611782
check class proportion
prop.table(table(dataPilih$loan_status))##
## Charged Off Fully Paid
## 0.2111651 0.7888349
We can see that 21.1% of the data are Charged Off, whereas 78.9% are Fully Paid, so the classes are not balanced.
Plot loan status for a new selected data
tableLoan <- table(dataPilih$loan_status)
barplot(tableLoan, xlim=c(0,2), ylim=c(0,750000), xlab = "Loan Status", ylab = "count")check NA
anyNA(dataPilih)## [1] TRUE
na_counts <- colSums(is.na(dataPilih))
na_counts_sorted <- na_counts[order(na_counts, decreasing = TRUE)]
#na_counts_sortedPlot Null count Note: las = 2: The axis labels are
perpendicular to the axis, with vertical labels
barplot(na_counts_sorted, xlim=c(0,190), las = 2, main = "Statistics of null rows", xlab = "Column Name", ylab = "Values")Display percentage instead of value
na_percent <- colSums(is.na(dataPilih))/nrow(dataPilih)
na_percent_sorted <- na_percent[order(na_percent, decreasing = TRUE)]
barplot(na_percent_sorted, xlim=c(0,190), las = 2, main = "Percentage of null rows", xlab = "Column Name", ylab = "Values")Frame for ggplot
# Plotting frame: one row per column of dataPilih with the share of missing
# values in that column (na_percent, computed above).
nullData <- data.frame(colNames = names(dataPilih), null_count = na_percent)
# Sort from the most to the least incomplete column. The previous
# order(-x, decreasing = TRUE) negated twice and therefore sorted ascending.
sortedNull <- nullData[order(nullData$null_count, decreasing = TRUE), ]
# sortedNull

# One fill colour per row; sized from the data instead of a hard-coded 151.
color_category <- factor(seq_len(nrow(sortedNull)))
color_palette <- viridisLite::viridis(nrow(sortedNull))

# Bar plot with the columns ordered by decreasing null share.
# reorder(x, -value) is used because the `decreasing` argument of reorder()
# only exists from R 4.3.0 on; older versions silently ignore it.
ggplot(
  sortedNull,
  aes(
    x = reorder(colNames, -null_count),
    y = null_count,
    fill = color_category
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Percentage of null data in each column",
    x = "Column Name",
    y = "Values"
  ) +
  scale_fill_manual(values = color_palette) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # `guides(fill = FALSE)` is deprecated as of ggplot2 3.3.4; use "none".
  guides(fill = "none")
# Select columns where the null percentage is less than 50%
kolomPilih <- (dataPilih[, (na_percent< 0.5)])
dim(kolomPilih)## [1] 611782 108
what are the selected columns?
kolomPilih_names <- names(kolomPilih)
length(kolomPilih_names)## [1] 108
Plot a new null statistics (barplot)
na_percent_kolom <- colSums(is.na(kolomPilih))/nrow(kolomPilih)
na_percent_kolom_sorted <- na_percent_kolom[order(na_percent_kolom, decreasing = TRUE)]
barplot(na_percent_kolom_sorted, xlim=c(0,110), las = 2, main = "New statistics of null rows", xlab = "Column Name", ylab = "Values")Plot a new null statistics with ggplot Frame for ggplot
# Plotting frame: one row per retained column of kolomPilih with the share of
# missing values in that column (na_percent_kolom, computed above).
nullData_kolom <- data.frame(
  colNames = names(kolomPilih),
  null_count = na_percent_kolom
)
# Sort from the most to the least incomplete column. The previous
# order(-x, decreasing = TRUE) negated twice and therefore sorted ascending.
sortedNull_kolom <- nullData_kolom[
  order(nullData_kolom$null_count, decreasing = TRUE),
]
# sortedNull_kolom

# One fill colour per row; sized from the data instead of a hard-coded 108.
color_category <- factor(seq_len(nrow(sortedNull_kolom)))
color_palette <- viridisLite::viridis(nrow(sortedNull_kolom))

# Bar plot with the columns ordered by decreasing null share.
# reorder(x, -value) is used because the `decreasing` argument of reorder()
# only exists from R 4.3.0 on; older versions silently ignore it.
ggplot(
  sortedNull_kolom,
  aes(
    x = reorder(colNames, -null_count),
    y = null_count,
    fill = color_category
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Percentage of null (selected) data in each column",
    x = "Column Name",
    y = "Values"
  ) +
  scale_fill_manual(values = color_palette) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # `guides(fill = FALSE)` is deprecated as of ggplot2 3.3.4; use "none".
  guides(fill = "none")

# Features kept for the rest of the analysis.
selected_columns <- c(
  "addr_state", "annual_inc", "earliest_cr_line", "emp_length", "emp_title",
  "fico_range_high", "fico_range_low", "grade", "home_ownership",
  "application_type", "initial_list_status", "int_rate", "loan_amnt",
  "num_actv_bc_tl", "loan_status", "mort_acc", "tot_cur_bal", "open_acc",
  "pub_rec", "pub_rec_bankruptcies", "purpose", "revol_bal", "revol_util",
  "sub_grade", "term", "title", "total_acc", "verification_status"
)
# selected_columns

# Restrict the data to the selected features (column order follows
# selected_columns, so addr_state is the first column).
finalPilih <- kolomPilih[, selected_columns]
# head(finalPilih)
dim(finalPilih)## [1] 611782 28
The sapply() function is used to apply a specified
function to each element of a vector or a list and simplify the results
into a vector, matrix, or array.
type_counts <- table(sapply(finalPilih, class))
type_counts##
## character integer numeric
## 14 11 3
We still keep the original type of data (char,
int, numeric)
NOTE:
Use sapply(): if you want a simplified output in the form
of a vector, matrix, or array
Use lapply(): to preserve the structure of the original
data and obtain the results as a list
# Work on a copy so that finalPilih keeps its original column types.
finalPilih_temp <- finalPilih

# Character columns become factors (so they can be turned into level codes).
ubahChar <- vapply(finalPilih_temp, is.character, logical(1))
finalPilih_temp[ubahChar] <- finalPilih_temp[ubahChar] %>% lapply(as.factor)

# Integer columns become doubles.
ubahInt <- vapply(finalPilih_temp, is.integer, logical(1))
finalPilih_temp[ubahInt] <- finalPilih_temp[ubahInt] %>% lapply(as.numeric)

# Finally coerce every column to numeric; factors are replaced by their
# integer level codes.
finalPilih_temp <- finalPilih_temp %>%
  lapply(as.numeric) %>%
  as.data.frame()
#head(finalPilih_temp) Let’s check the type of column
type_counts <- table(sapply(finalPilih_temp, class))
type_counts##
## numeric
## 28
library(GGally)
# Apply ggcorr to the numeric data frame
corr_plot <- ggcorr(finalPilih_temp)
corr_plotwe can see that
`loan_status` has a strong relationship with: 1. annual income; 2. loan amount; 3. fico_range_low and fico_range_high; 4. num_actv_bc_tl; 5. interest rate (negative correlation). However, the correlation is still not very clear. Let's try another plot using ggplot.
ggplot
library(ggplot2)
library(viridisLite)

# Correlation of every feature with loan_status.
# - Pairwise-complete observations are used so that a column containing NAs
#   still gets a coefficient instead of NA (which sort() would drop silently).
# - loan_status is removed by name. The previous `[-1, ]` dropped the first
#   row (addr_state) and kept loan_status, whose self-correlation is 1.
cor_matrix <- cor(finalPilih_temp, use = "pairwise.complete.obs")
correlation <- cor_matrix[rownames(cor_matrix) != "loan_status", "loan_status"]
sorted_correlation <- sort(correlation) # ascending correlation values
data <- data.frame(
  variable = names(sorted_correlation),
  correlation = sorted_correlation
)

# Plot correlation with loan_status for each feature.
# reorder() keeps the bars in correlation order; a plain character x axis
# would be drawn alphabetically and lose the sort.
ggplot(
  data,
  aes(x = reorder(variable, correlation), y = correlation, fill = correlation)
) +
  geom_bar(stat = "identity") +
  coord_flip() + # flip x and y axes so the feature names read horizontally
  # ggplot2's own continuous viridis scale, so this chunk does not rely on
  # the viridis package having been attached elsewhere.
  scale_fill_viridis_c() +
  labs(x = "", y = "Correlation with loan_status") +
  theme_minimal() +
  # angle 90 for vertical tick labels, vjust 0.5 to centre them on the tick
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
# NOTE: the `variable` column holds the names of the features only;
# loan_status itself is left out because its correlation with itself is 1.
Check the proportion of each value in loan_status (1 = Charged Off, 2 = Fully Paid; the codes follow the alphabetical order of the factor levels)
table(finalPilih_temp$loan_status)##
## 1 2
## 129187 482595
prop.table(table(finalPilih_temp$loan_status))##
## 1 2
## 0.2111651 0.7888349
barplot(table(finalPilih_temp$loan_status), xlim=c(0,3), ylim=c(0,500000), main = "the number of Fully Paid and Charge Off", xlab = "Loan Status", ylab = "Values")barplot(prop.table(table(finalPilih_temp$loan_status)), xlim=c(0,3), ylim=c(0,1), main = "percentage of Fully Paid and Charge Off", xlab = "Loan Status", ylab = "Values")There is unbalanced proportion of the
loan status
#finalPilih_temp$loan_status#finalPilih_temp$loan_status <- as.integer(factor(as.character(finalPilih_temp$loan_status)))
#finalPilih_tempRemember: now we only focus on selected columns(with
missing values<50%, and use features that are available before loan
is succeeded)
library(ggplot2)
library(viridis)

# Number of missing values in each column of the numeric-encoded frame.
na_counts <- colSums(is.na(finalPilih_temp))
nullData <- data.frame(
  colNames = names(finalPilih_temp),
  null_count = na_counts
)
# Sort from the most to the least incomplete column. The previous
# order(-x, decreasing = TRUE) negated twice and therefore sorted ascending.
sortedNull <- nullData[order(nullData$null_count, decreasing = TRUE), ]

# Plot null counts, labelling each bar with its count.
ggplot(sortedNull, aes(x = colNames, y = null_count, fill = null_count)) +
  geom_bar(stat = "identity") +
  scale_fill_viridis(option = "plasma", direction = -1) +
  labs(x = "", y = "Null Count", title = "Null Count by Column") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_text(aes(label = null_count), vjust = -0.5)
# table(sortedNull)
table(sortedNull$null_count)##
## 0 326
## 27 1
we have only 1 column left with null values. The remaining 27 columns have no
null
#na_counts <- sum(is.na(finalPilih_temp$addr_state))
#na_counts
any(is.na(finalPilih_temp$addr_state))## [1] FALSE
unique(finalPilih_temp$addr_state)## [1] 38 41 14 11 23 40 39 27 5 45 4 15 20 34 43 16 32 2 47 35 18 10 6 22 24
## [26] 8 19 48 12 46 31 9 42 30 29 37 7 3 33 49 26 50 36 17 25 44 28 21 1 13
Oops, we should go back to the original version, from before the conversion to numeric: use finalPilih to investigate each column.
unique(finalPilih$addr_state)## [1] "PA" "SD" "IL" "GA" "MN" "SC" "RI" "NC" "CA" "VA" "AZ" "IN" "MD" "NY" "TX"
## [16] "KS" "NM" "AL" "WA" "OH" "LA" "FL" "CO" "MI" "MO" "DC" "MA" "WI" "HI" "VT"
## [31] "NJ" "DE" "TN" "NH" "NE" "OR" "CT" "AR" "NV" "WV" "MT" "WY" "OK" "KY" "MS"
## [46] "UT" "ND" "ME" "AK" "ID"
length(unique(finalPilih$addr_state))## [1] 50
addr_state_counts <- sort(table(finalPilih$addr_state), decreasing = TRUE)
addr_state_counts##
## CA TX NY FL IL NJ PA OH GA NC VA MI AZ
## 86062 50691 48842 43967 23619 21377 20586 20312 19888 17472 16955 16622 14939
## MD MA CO WA MN IN TN MO NV CT WI AL OR
## 14236 13874 13638 13043 11205 10629 9964 9576 9277 8782 8248 7517 7261
## SC LA KY OK KS AR UT MS NM HI NH RI NE
## 7162 7117 5895 5694 5008 4576 4468 3722 3412 2937 2928 2658 2233
## WV DE MT DC AK WY SD VT ME ND ID
## 2222 1780 1707 1445 1407 1262 1254 1232 1153 1006 922
Plot addr_state
library(ggplot2)

# Frequency of each state, most frequent first.
freq_table <- sort(table(finalPilih$addr_state), decreasing = TRUE)

# Data frame for plotting; the table counts are converted to plain numerics.
data <- data.frame(
  addr_state = names(freq_table),
  count = as.numeric(freq_table)
)

# Bar plot ordered by decreasing count, with the count written on each bar.
# (The labs() and theme_minimal() layers were previously added twice; one
# copy of each is enough.)
ggplot(data, aes(x = reorder(addr_state, -count), y = count, fill = count)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = count),
    vjust = 0, size = 3, color = "darkgray", angle = 90
  ) +
  labs(title = "Frequency of addr_state", x = "addr_state", y = "Count") +
  theme_minimal() +
  # theme() must come after theme_minimal(), which would otherwise reset it
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis(option = "plasma") # continuous fill scale shown in the legend
Optional This is just additional code, because I am not
comfortable with how R studio display data horizontally instead of
vertically
library(kableExtra)## Warning: package 'kableExtra' was built under R version 4.2.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(knitr)
# Frequency of each state, most frequent first.
addr_state_counts <- sort(table(finalPilih$addr_state), decreasing = TRUE)

# Data frame built from the frequency table. It is already in descending
# order of count, so no further sorting is needed.
data <- data.frame(
  addr_state = names(addr_state_counts),
  count = as.numeric(addr_state_counts)
)

# Stack the letters of each state code vertically:
# - strsplit(x, "") splits the string into its individual characters,
# - [[1]] extracts that character vector from the returned list,
# - paste(..., collapse = "\n") joins the characters back into ONE string
#   with a newline between them (e.g. "CA" becomes "C\nA").
# vapply() guarantees exactly one string per state.
data$addr_state <- vapply(
  data$addr_state,
  function(x) paste(strsplit(x, "")[[1]], collapse = "\n"),
  character(1)
)

# Print the table
kable(data, align = "c", caption = "Frequency of addr_state") %>%
kable_styling(bootstrap_options = "striped", full_width = TRUE)| addr_state | count |
|---|---|
| C A | 86062 |
| T X | 50691 |
| N Y | 48842 |
| F L | 43967 |
| I L | 23619 |
| N J | 21377 |
| P A | 20586 |
| O H | 20312 |
| G A | 19888 |
| N C | 17472 |
| V A | 16955 |
| M I | 16622 |
| A Z | 14939 |
| M D | 14236 |
| M A | 13874 |
| C O | 13638 |
| W A | 13043 |
| M N | 11205 |
| I N | 10629 |
| T N | 9964 |
| M O | 9576 |
| N V | 9277 |
| C T | 8782 |
| W I | 8248 |
| A L | 7517 |
| O R | 7261 |
| S C | 7162 |
| L A | 7117 |
| K Y | 5895 |
| O K | 5694 |
| K S | 5008 |
| A R | 4576 |
| U T | 4468 |
| M S | 3722 |
| N M | 3412 |
| H I | 2937 |
| N H | 2928 |
| R I | 2658 |
| N E | 2233 |
| W V | 2222 |
| D E | 1780 |
| M T | 1707 |
| D C | 1445 |
| A K | 1407 |
| W Y | 1262 |
| S D | 1254 |
| V T | 1232 |
| M E | 1153 |
| N D | 1006 |
| I D | 922 |
#The kable_styling() function with the bootstrap_options = "striped" argument applies striped row styling to the table generated by kable(). This means that alternate rows in the table will have different background colors, making it easier to distinguish between rows.
#The full_width = TRUE argument used above makes the table span the full width of the output document (full_width = FALSE would shrink the table to fit its content).
One-hot encode the categorical value
# Convert addr_state into one-hot encoding
finalPilih_encoded <- cbind(finalPilih, model.matrix(~ addr_state - 1, data = finalPilih))
# Remove the original addr_state column
finalPilih_encoded$addr_state <- NULL
# View the encoded dataset
head(finalPilih_encoded)Income type is numeric
#na_counts <- sum(is.na(finalPilih_temp$addr_state))
#na_counts
any(is.na(finalPilih$annual_inc))## [1] FALSE
length(unique(finalPilih$annual_inc))## [1] 35807
check statistics
summary(finalPilih$annual_inc)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 45600 65000 76875 91687 10999200
library(ggplot2)
ggplot(finalPilih, aes(x = annual_inc)) +
geom_histogram(binwidth = 100000, color = "white", fill = "steelblue") +
labs(title = "Distribution of annual_inc", x = "annual_inc", y = "Frequency") +
theme_minimal()Try geom_density() instead
library(ggplot2)
# Disable scientific notation
options(scipen = 999)#display real value instead of, for ex 3e+06
ggplot(finalPilih, aes(x = annual_inc)) +
geom_density(fill = "steelblue", alpha = 0.5) +
labs(title = "Distribution of annual_inc", x = "annual_inc", y = "Density") +
theme_minimal()try boxplot
library(ggplot2)
# Create a boxplot of annual_inc with color
ggplot(dataPilih, aes(y = annual_inc)) +
geom_boxplot(fill = "lightblue", color = "darkblue") +
labs(title = "Boxplot of annual_inc", y = "annual_inc") +
theme_minimal()
try
boxplot with horizontal view
# Create a boxplot of annual_inc with color
ggplot(dataPilih, aes(x = annual_inc)) +
geom_boxplot(fill = "lightblue", color = "darkblue") +
labs(title = "Boxplot of annual_inc", x = "annual_inc") +
theme_minimal()
> it seems too much data are concentrated below 3000000
nrow(dataPilih[dataPilih$annual_inc < 3000000, ])## [1] 611741
nrow(dataPilih[dataPilih$annual_inc >= 3000000, ])## [1] 41
nrow(dataPilih)## [1] 611782
percent_of_rows_of_customers_with_annual_inc_less_than_3000K = (nrow(dataPilih[dataPilih$annual_inc < 3000000, ])/nrow(dataPilih))*100
percent_of_rows_of_customers_with_annual_inc_less_than_3000K## [1] 99.9933
Plot annual_inc < 3,000,000
# Filter the data
filtered_data <- dataPilih[dataPilih$annual_inc < 3000000, ]
# Create the density plot
ggplot(filtered_data, aes(x = annual_inc)) +
geom_density(fill = "lightblue", color = "darkblue") +
labs(title = "Density of annual_inc (annual_inc < 3000.000)", x = "annual_inc", y = "Density") +
theme_minimal()nrow(dataPilih[dataPilih$annual_inc < 700000, ])## [1] 611370
nrow(dataPilih[dataPilih$annual_inc >= 700000, ])## [1] 412
nrow(dataPilih)## [1] 611782
percent_of_rows_of_customers_with_annual_inc_less_than_700K = (nrow(dataPilih[dataPilih$annual_inc < 700000, ])/nrow(dataPilih))*100
percent_of_rows_of_customers_with_annual_inc_less_than_700K## [1] 99.93266
percent_of_rows_of_customers_with_annual_inc_more_than_700K = (nrow(dataPilih[dataPilih$annual_inc >= 700000, ])/nrow(dataPilih))*100
percent_of_rows_of_customers_with_annual_inc_more_than_700K## [1] 0.06734425
Focus on 99.93% of the data (annual_inc < 700,000)
# Filter the data
filtered_data <- dataPilih[dataPilih$annual_inc < 700000, ]
# Create the density plot
ggplot(filtered_data, aes(x = annual_inc)) +
geom_density(fill = "lightblue", color = "darkblue") +
labs(title = "Density of annual_inc (annual_inc < 700000)", x = "annual_inc", y = "Density") +
theme_minimal()
> it is still too skewed
nrow(dataPilih[dataPilih$annual_inc < 400000, ])## [1] 609819
nrow(dataPilih[dataPilih$annual_inc >= 400000, ])## [1] 1963
nrow(dataPilih)## [1] 611782
percent_of_rows_of_customers_with_annual_inc_less_than_400K = (nrow(dataPilih[dataPilih$annual_inc < 400000, ])/nrow(dataPilih))*100
percent_of_rows_of_customers_with_annual_inc_less_than_400K## [1] 99.67913
percent_of_rows_of_customers_with_annual_inc_more_than_400K = (nrow(dataPilih[dataPilih$annual_inc >= 400000, ])/nrow(dataPilih))*100
percent_of_rows_of_customers_with_annual_inc_more_than_400K## [1] 0.3208659
focus on 99.67% of data
# Filter the data
filtered_data <- dataPilih[dataPilih$annual_inc < 400000, ]
# Create the density plot
ggplot(filtered_data, aes(x = annual_inc)) +
geom_density(fill = "lightblue", color = "darkblue") +
labs(title = "Density of annual_inc (annual_inc < 400.000)", x = "annual_inc", y = "Density") +
theme_minimal()it looks better than the initial distribution, after removing 0.32% data
Put the data in temporary dataset
finalPilih_under400k <- finalPilih[finalPilih$annual_inc < 400000, ]
nrow(finalPilih_under400k)## [1] 609819
**Convert loan_status to factor:** `finalPilih_under400k$loan_status <- as.factor(finalPilih_under400k$loan_status)`
**Create the histogram and density plot:** `ggplot(finalPilih_under400k, aes(x = annual_inc, fill = loan_status)) + geom_histogram(binwidth = 80, color = "black", alpha = 0.7) + geom_density(alpha = 0.7, color = "black", fill = "lightblue") + labs(title = "Distribution of annual_inc by loan_status", x = "annual_inc", y = "Count") + theme_minimal() + scale_fill_viridis()`
The error message "Discrete value supplied to continuous scale" typically occurs when a discrete variable is mapped to an aesthetic that uses a continuous scale. Here `loan_status` is discrete (whether it is stored as a character or as a factor with explicit levels) and is mapped to `fill`, while `scale_fill_viridis()` is a continuous scale by default. To resolve this issue, the `loan_status` column needs a discrete fill scale.
We will use scale_fill_manual() to manually specify the
fill colors for each level of the loan_status factor.
unique(finalPilih_under400k$loan_status)## [1] "Fully Paid" "Charged Off"
ggplot annual_inc < 400.000
finalPilih_under400k$loan_status <- as.character(finalPilih_under400k$loan_status)
# Create the histogram plot with fill color
ggplot(finalPilih_under400k, aes(x = annual_inc, fill = loan_status)) +
geom_histogram(binwidth = 5000, color = "black", alpha = 0.7) +
labs(title = "Distribution of annual_inc by loan_status, for income < 400.000", x = "annual_inc", y = "Count") +
theme_minimal() +
scale_fill_manual(values = c("Charged Off" = "red", "Fully Paid" = "blue"))compare the statistic of loan_status based on
annual_income
library(dplyr)
summary_data <- finalPilih_under400k %>%
group_by(loan_status) %>%
summarise(mean = mean(annual_inc),
sd = sd(annual_inc),
min = min(annual_inc),
max = max(annual_inc))
print(summary_data)## # A tibble: 2 × 5
## loan_status mean sd min max
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Charged Off 69373. 39736. 0 395000
## 2 Fully Paid 76382. 44738. 0 399821
The Charged Off data has a mean annual income of 69373.07, whereas the Fully Paid data has a mean of 76382.47. That is a difference of about 7009 in annual income.
# Mean annual income for each loan_status group.
loan_status_mean <- aggregate(
  annual_inc ~ loan_status,
  data = finalPilih_under400k,
  FUN = mean
)
loan_status_mean
# Difference between the two group means (second row minus first row).
mean_diff <- diff(loan_status_mean$annual_inc)
mean_diff## [1] 7009.405
The month the borrower's earliest reported credit line was opened
check type
class(finalPilih$earliest_cr_line)## [1] "character"
any(is.na(finalPilih_temp$earliest_cr_line))## [1] FALSE
head(finalPilih$)