1 Latar Belakang

Membangun model yang dapat memprediksi credit risk menggunakan dataset yang disediakan oleh company yang terdiri dari data pinjaman yang diterima dan yang ditolak. Dilakukan dengan pengerjaan end-to-end solution.

Tujuan utama adalah untuk melakukan prediksi terhadap suatu individu akan kemampuan mereka untuk melakukan pembayaran terhadap pinjaman/kredit yang diberikan.

setwd("D:/Internship/IDX Partner/Final Project")
# df <- read.csv("loan_data_2007_2014.csv")
library(feather)
## Warning: package 'feather' was built under R version 4.3.2
# feather_filename <- "loan_data_2007_2014.feather"
# if (!file.exists(feather_filename)) {
#   # Convert to feather
#   write_feather(df, "loan_data_2007_2014.feather")
# }

df <- read_feather("loan_data_2007_2014.feather")
head(df,5)
## # A tibble: 5 × 75
##       X      id member_id loan_amnt funded_amnt funded_amnt_inv term    int_rate
##   <int>   <int>     <int>     <int>       <int>           <dbl> <chr>      <dbl>
## 1     0 1077501   1296599      5000        5000            4975 " 36 m…     10.6
## 2     1 1077430   1314167      2500        2500            2500 " 60 m…     15.3
## 3     2 1077175   1313524      2400        2400            2400 " 36 m…     16.0
## 4     3 1076863   1277178     10000       10000           10000 " 36 m…     13.5
## 5     4 1075358   1311748      3000        3000            3000 " 60 m…     12.7
## # ℹ 67 more variables: installment <dbl>, grade <chr>, sub_grade <chr>,
## #   emp_title <chr>, emp_length <chr>, home_ownership <chr>, annual_inc <dbl>,
## #   verification_status <chr>, issue_d <chr>, loan_status <chr>,
## #   pymnt_plan <chr>, url <chr>, desc <chr>, purpose <chr>, title <chr>,
## #   zip_code <chr>, addr_state <chr>, dti <dbl>, delinq_2yrs <dbl>,
## #   earliest_cr_line <chr>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## #   mths_since_last_record <dbl>, open_acc <dbl>, pub_rec <dbl>, …
colnames(df)
##  [1] "X"                           "id"                         
##  [3] "member_id"                   "loan_amnt"                  
##  [5] "funded_amnt"                 "funded_amnt_inv"            
##  [7] "term"                        "int_rate"                   
##  [9] "installment"                 "grade"                      
## [11] "sub_grade"                   "emp_title"                  
## [13] "emp_length"                  "home_ownership"             
## [15] "annual_inc"                  "verification_status"        
## [17] "issue_d"                     "loan_status"                
## [19] "pymnt_plan"                  "url"                        
## [21] "desc"                        "purpose"                    
## [23] "title"                       "zip_code"                   
## [25] "addr_state"                  "dti"                        
## [27] "delinq_2yrs"                 "earliest_cr_line"           
## [29] "inq_last_6mths"              "mths_since_last_delinq"     
## [31] "mths_since_last_record"      "open_acc"                   
## [33] "pub_rec"                     "revol_bal"                  
## [35] "revol_util"                  "total_acc"                  
## [37] "initial_list_status"         "out_prncp"                  
## [39] "out_prncp_inv"               "total_pymnt"                
## [41] "total_pymnt_inv"             "total_rec_prncp"            
## [43] "total_rec_int"               "total_rec_late_fee"         
## [45] "recoveries"                  "collection_recovery_fee"    
## [47] "last_pymnt_d"                "last_pymnt_amnt"            
## [49] "next_pymnt_d"                "last_credit_pull_d"         
## [51] "collections_12_mths_ex_med"  "mths_since_last_major_derog"
## [53] "policy_code"                 "application_type"           
## [55] "annual_inc_joint"            "dti_joint"                  
## [57] "verification_status_joint"   "acc_now_delinq"             
## [59] "tot_coll_amt"                "tot_cur_bal"                
## [61] "open_acc_6m"                 "open_il_6m"                 
## [63] "open_il_12m"                 "open_il_24m"                
## [65] "mths_since_rcnt_il"          "total_bal_il"               
## [67] "il_util"                     "open_rv_12m"                
## [69] "open_rv_24m"                 "max_bal_bc"                 
## [71] "all_util"                    "total_rev_hi_lim"           
## [73] "inq_fi"                      "total_cu_tl"                
## [75] "inq_last_12m"

Deskripsi data

library(readxl)
desc <- read_xlsx("LCDataDictionary.xlsx", sheet = "LoanStats")
## New names:
## • `` -> `...3`
desc
## # A tibble: 98 × 3
##    Feature                    Description                                  ...3 
##    <chr>                      <chr>                                        <chr>
##  1 _rec                       The total amount committed by investors for… Juml…
##  2 acc_now_delinq             The number of accounts on which the borrowe… Juml…
##  3 addr_state                 The state provided by the borrower in the l… Nega…
##  4 all_util                   Balance to credit limit on all trades        Sald…
##  5 annual_inc                 The self-reported annual income provided by… v    
##  6 annual_inc_joint           The combined self-reported annual income pr… Peng…
##  7 application_type           Indicates whether the loan is an individual… <NA> 
##  8 collection_recovery_fee    post charge off collection fee               Biay…
##  9 collections_12_mths_ex_med Number of collections in 12 months excludin… Juml…
## 10 delinq_2yrs                The number of 30+ days past-due incidences … Juml…
## # ℹ 88 more rows
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(df)
## Rows: 466,285
## Columns: 75
## $ X                           <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ id                          <int> 1077501, 1077430, 1077175, 1076863, 107535…
## $ member_id                   <int> 1296599, 1314167, 1313524, 1277178, 131174…
## $ loan_amnt                   <int> 5000, 2500, 2400, 10000, 3000, 5000, 7000,…
## $ funded_amnt                 <int> 5000, 2500, 2400, 10000, 3000, 5000, 7000,…
## $ funded_amnt_inv             <dbl> 4975.00, 2500.00, 2400.00, 10000.00, 3000.…
## $ term                        <chr> " 36 months", " 60 months", " 36 months", …
## $ int_rate                    <dbl> 10.65, 15.27, 15.96, 13.49, 12.69, 7.90, 1…
## $ installment                 <dbl> 162.87, 59.83, 84.33, 339.31, 67.79, 156.4…
## $ grade                       <chr> "B", "C", "C", "C", "B", "A", "C", "E", "F…
## $ sub_grade                   <chr> "B2", "C4", "C5", "C1", "B5", "A4", "C5", …
## $ emp_title                   <chr> "", "Ryder", "", "AIR RESOURCES BOARD", "U…
## $ emp_length                  <chr> "10+ years", "< 1 year", "10+ years", "10+…
## $ home_ownership              <chr> "RENT", "RENT", "RENT", "RENT", "RENT", "R…
## $ annual_inc                  <dbl> 24000.00, 30000.00, 12252.00, 49200.00, 80…
## $ verification_status         <chr> "Verified", "Source Verified", "Not Verifi…
## $ issue_d                     <chr> "Dec-11", "Dec-11", "Dec-11", "Dec-11", "D…
## $ loan_status                 <chr> "Fully Paid", "Charged Off", "Fully Paid",…
## $ pymnt_plan                  <chr> "n", "n", "n", "n", "n", "n", "n", "n", "n…
## $ url                         <chr> "https://www.lendingclub.com/browse/loanDe…
## $ desc                        <chr> "  Borrower added on 12/22/11 > I need to …
## $ purpose                     <chr> "credit_card", "car", "small_business", "o…
## $ title                       <chr> "Computer", "bike", "real estate business"…
## $ zip_code                    <chr> "860xx", "309xx", "606xx", "917xx", "972xx…
## $ addr_state                  <chr> "AZ", "GA", "IL", "CA", "OR", "AZ", "NC", …
## $ dti                         <dbl> 27.65, 1.00, 8.72, 20.00, 17.94, 11.20, 23…
## $ delinq_2yrs                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ earliest_cr_line            <chr> "Jan-85", "Apr-99", "Nov-01", "Feb-96", "J…
## $ inq_last_6mths              <dbl> 1, 5, 2, 1, 0, 3, 1, 2, 2, 0, 2, 0, 1, 2, …
## $ mths_since_last_delinq      <dbl> NA, NA, NA, 35, 38, NA, NA, NA, NA, NA, NA…
## $ mths_since_last_record      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_acc                    <dbl> 3, 3, 2, 10, 15, 9, 7, 4, 11, 2, 14, 12, 4…
## $ pub_rec                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ revol_bal                   <int> 13648, 1687, 2956, 5598, 27783, 7963, 1772…
## $ revol_util                  <dbl> 83.70, 9.40, 98.50, 21.00, 53.90, 28.30, 8…
## $ total_acc                   <dbl> 9, 4, 10, 37, 38, 12, 11, 4, 13, 3, 23, 34…
## $ initial_list_status         <chr> "f", "f", "f", "f", "f", "f", "f", "f", "f…
## $ out_prncp                   <dbl> 0.00, 0.00, 0.00, 0.00, 766.90, 0.00, 1889…
## $ out_prncp_inv               <dbl> 0.00, 0.00, 0.00, 0.00, 766.90, 0.00, 1889…
## $ total_pymnt                 <dbl> 5861.071, 1008.710, 3003.654, 12226.302, 3…
## $ total_pymnt_inv             <dbl> 5831.78, 1008.71, 3003.65, 12226.30, 3242.…
## $ total_rec_prncp             <dbl> 5000.00, 456.46, 2400.00, 10000.00, 2233.1…
## $ total_rec_int               <dbl> 861.07, 435.17, 603.65, 2209.33, 1009.07, …
## $ total_rec_late_fee          <dbl> 0.00, 0.00, 0.00, 16.97, 0.00, 0.00, 0.00,…
## $ recoveries                  <dbl> 0.00, 117.08, 0.00, 0.00, 0.00, 0.00, 0.00…
## $ collection_recovery_fee     <dbl> 0.0000, 1.1100, 0.0000, 0.0000, 0.0000, 0.…
## $ last_pymnt_d                <chr> "Jan-15", "Apr-13", "Jun-14", "Jan-15", "J…
## $ last_pymnt_amnt             <dbl> 171.62, 119.66, 649.91, 357.48, 67.79, 161…
## $ next_pymnt_d                <chr> "", "", "", "", "Feb-16", "", "Feb-16", ""…
## $ last_credit_pull_d          <chr> "Jan-16", "Sep-13", "Jan-16", "Jan-15", "J…
## $ collections_12_mths_ex_med  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mths_since_last_major_derog <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ policy_code                 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ application_type            <chr> "INDIVIDUAL", "INDIVIDUAL", "INDIVIDUAL", …
## $ annual_inc_joint            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ dti_joint                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ verification_status_joint   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ acc_now_delinq              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tot_coll_amt                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ tot_cur_bal                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_acc_6m                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_il_6m                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_il_12m                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_il_24m                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ mths_since_rcnt_il          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ total_bal_il                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ il_util                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_rv_12m                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ open_rv_24m                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ max_bal_bc                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ all_util                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ total_rev_hi_lim            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ inq_fi                      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ total_cu_tl                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ inq_last_12m                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Akan dilakukan pemisahan kolom yang bertipe kategorik dan numerik

categorical <- df[,sapply(df, is.character)]
categorical <- lapply(categorical, is.factor)

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.3.2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
describe(df[,names(categorical)])
## df[, names(categorical)] 
## 
##  22  Variables      466285  Observations
## --------------------------------------------------------------------------------
## term 
##        n  missing distinct 
##   466285        0        2 
##                               
## Value      36 months 60 months
## Frequency     337953    128332
## Proportion     0.725     0.275
## --------------------------------------------------------------------------------
## grade 
##        n  missing distinct 
##   466285        0        7 
##                                                            
## Value           A      B      C      D      E      F      G
## Frequency   74867 136929 125293  76888  35757  13229   3322
## Proportion  0.161  0.294  0.269  0.165  0.077  0.028  0.007
## --------------------------------------------------------------------------------
## sub_grade 
##        n  missing distinct 
##   466285        0       35 
## 
## lowest : A1 A2 A3 A4 A5, highest: G1 G2 G3 G4 G5
## --------------------------------------------------------------------------------
## emp_title 
##        n  missing distinct 
##   438697    27588   205475 
## 
## lowest : 'roduction manager                         NSA Industries llc               MOTHER  THERESA  HOME               Quitman Street Community School    Bus Operator and Instructor     
## highest: Zynga Inc.                          Zynga, Inc                          Zynx Health                         Zyquest Inc                         Zytron Control Products, Inc.      
## --------------------------------------------------------------------------------
## emp_length 
##        n  missing distinct 
##   445277    21008       11 
##                                                                       
## Value       < 1 year    1 year 10+ years   2 years   3 years   4 years
## Frequency      36265     29622    150049     41373     36596     28023
## Proportion     0.081     0.067     0.337     0.093     0.082     0.063
##                                                             
## Value        5 years   6 years   7 years   8 years   9 years
## Frequency      30774     26112     26180     22395     17888
## Proportion     0.069     0.059     0.059     0.050     0.040
## --------------------------------------------------------------------------------
## home_ownership 
##        n  missing distinct 
##   466285        0        6 
##                                                                 
## Value           ANY MORTGAGE     NONE    OTHER      OWN     RENT
## Frequency         1   235875       50      182    41704   188473
## Proportion    0.000    0.506    0.000    0.000    0.089    0.404
## --------------------------------------------------------------------------------
## verification_status 
##        n  missing distinct 
##   466285        0        3 
##                                                           
## Value         Not Verified Source Verified        Verified
## Frequency           148237          149993          168055
## Proportion           0.318           0.322           0.360
## --------------------------------------------------------------------------------
## issue_d 
##        n  missing distinct 
##   466285        0       91 
## 
## lowest : Apr-08 Apr-09 Apr-10 Apr-11 Apr-12, highest: Sep-10 Sep-11 Sep-12 Sep-13 Sep-14
## --------------------------------------------------------------------------------
## loan_status 
##        n  missing distinct 
##   466285        0        9 
## 
## lowest : Charged Off                                         Current                                             Default                                             Does not meet the credit policy. Status:Charged Off Does not meet the credit policy. Status:Fully Paid 
## highest: Does not meet the credit policy. Status:Fully Paid  Fully Paid                                          In Grace Period                                     Late (16-30 days)                                   Late (31-120 days)                                 
## --------------------------------------------------------------------------------
## pymnt_plan 
##        n  missing distinct 
##   466285        0        2 
##                         
## Value           n      y
## Frequency  466276      9
## Proportion      1      0
## --------------------------------------------------------------------------------
## url 
##        n  missing distinct 
##   466285        0   466285 
## 
## lowest : https://www.lendingclub.com/browse/loanDetail.action?loan_id=1000007 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1000030 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1000033 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1000045 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1000067
## highest: https://www.lendingclub.com/browse/loanDetail.action?loan_id=999951  https://www.lendingclub.com/browse/loanDetail.action?loan_id=999955  https://www.lendingclub.com/browse/loanDetail.action?loan_id=999981  https://www.lendingclub.com/browse/loanDetail.action?loan_id=999983  https://www.lendingclub.com/browse/loanDetail.action?loan_id=999989 
## --------------------------------------------------------------------------------
## desc 
##        n  missing distinct 
##   125749   340536   124435 
## 
## lowest : - Pay off Dell Financial: $ 1300.00 - Pay off IRS for 2005: $ 1400.00 - Pay off Mac Comp    : $ 1700.00 - Pay off Bill Me Later : $ 1000.00 - Pay off WAMU card  : $ 1000.00 - Pay off Becu Visa     : $ 1000.00                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                -To pay off loans and Credit Card at 17-25% interest -Daughter College tuition and books -Bring VW BUS 1970 and House Good storage in NM. Storage not covered by US Army when assign to Oversee duty. -Replace Matters for better sleeps                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Looking to expand seafood buisness. Purchasing a new boat with gear. Market to sell my catch is very good and outlook is good. I also work for sea tow so I always have steady income coming in.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           I would like to increase my cash availability mainly to potentially be used for emergencies. While my income to expese flow is solvent and I have some spare cash, I would like to protect myself against uncertainty and am willing to take a small loan out for it.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
## highest: www.music-housecalls.com--The Amadeus Home Music School will be using the loan to expand our Home Music Lesson business which currently has 250 instructors in 17 states.  The expansion will be to add Ontario and other states within the United States of America.  This will assist us in paying our first few months of Google charges.  With Google we have generated a 5 to 1 return. Our web presence is at the top of Google, Yahoo and MSN in the sponsored links.  Please take a look at our work and consider funding our expansion.                                                                                                                                                                                                                                Yes I have a line of credit that the interest rate just went up to 20.99 so I am looking to refi for a lower rate.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Young couple just married.  Own home but have two car loans and wife has school loans.  Husband never had credit card debt.  Was lucky enough that parents paid school college tuition.  Husband is Senior Finance Officer at large denominational healthcare facility chain.  Wife is an Electronic Medical Record Intermediate Analyst and BSN RN by training.    Wife accumulated $15000 in credit card debt during school and throughout her twenties.  Attempting to consolidate this debt into a more manageable rate and not just open up another credit card line.                                                                                                                                                                                                      Young professional who needs to pay down debt acquired from college.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Zensah Performance Apparel is a start-up, and we are now seeking to expand our product line as we enter the midwya point in our 3rd year! We currently sell through our website at www.zensah.com and directly to professional teams. Type ZENSAH in YOU TUBE, and you will see one of the most famous athletes in the world who sports ZENSAH!  Zensah has create a proprietary moisture wicking fabric that doesn't carry odor and is not coarse. Unlike Under Armour and Nike you won't chafe with our fabric.  We want to try Lending Club even though we are well capitalized. If this works out we thinks it could be a great way to allow our fans helps us grow even more!  I encourage you to visit our website to learn more.  Thanks for your interest.  TEAM ZENSAH
## --------------------------------------------------------------------------------
## purpose 
##        n  missing distinct 
##   466285        0       14 
## 
## lowest : car                credit_card        debt_consolidation educational        home_improvement  
## highest: other              renewable_energy   small_business     vacation           wedding           
## --------------------------------------------------------------------------------
## title 
##        n  missing distinct 
##   466265       20    63099 
## 
## lowest : '08 & '09 Roth IRA Investments '08 Rehab                      '56 Dream Trailer              '71 Bobbed Duece               'Cut the Cards'               
## highest: ZMH                            zonball Loan                   Zx                             Zx6r                           zxcvb                         
## --------------------------------------------------------------------------------
## zip_code 
##        n  missing distinct 
##   466285        0      888 
## 
## lowest : 007xx 008xx 010xx 011xx 012xx, highest: 995xx 996xx 997xx 998xx 999xx
## --------------------------------------------------------------------------------
## addr_state 
##        n  missing distinct 
##   466285        0       50 
## 
## lowest : AK AL AR AZ CA, highest: VT WA WI WV WY
## --------------------------------------------------------------------------------
## earliest_cr_line 
##        n  missing distinct 
##   466256       29      664 
## 
## lowest : Apr-00 Apr-01 Apr-02 Apr-03 Apr-04, highest: Sep-95 Sep-96 Sep-97 Sep-98 Sep-99
## --------------------------------------------------------------------------------
## initial_list_status 
##        n  missing distinct 
##   466285        0        2 
##                         
## Value           f      w
## Frequency  303005 163280
## Proportion   0.65   0.35
## --------------------------------------------------------------------------------
## last_pymnt_d 
##        n  missing distinct 
##   465909      376       98 
## 
## lowest : Apr-08 Apr-09 Apr-10 Apr-11 Apr-12, highest: Sep-11 Sep-12 Sep-13 Sep-14 Sep-15
## --------------------------------------------------------------------------------
## next_pymnt_d 
##        n  missing distinct 
##   239071   227214      100 
## 
## lowest : Apr-08 Apr-09 Apr-10 Apr-11 Apr-12, highest: Sep-11 Sep-12 Sep-13 Sep-14 Sep-15
## --------------------------------------------------------------------------------
## last_credit_pull_d 
##        n  missing distinct 
##   466243       42      103 
## 
## lowest : Apr-09 Apr-10 Apr-11 Apr-12 Apr-13, highest: Sep-11 Sep-12 Sep-13 Sep-14 Sep-15
## --------------------------------------------------------------------------------
## application_type 
##          n    missing   distinct      value 
##     466285          0          1 INDIVIDUAL 
##                      
## Value      INDIVIDUAL
## Frequency      466285
## Proportion          1
## --------------------------------------------------------------------------------
numerical <- df[,sapply(df, is.numeric)]
summary(df[,names(numerical)])
##        X                id             member_id          loan_amnt    
##  Min.   :     0   Min.   :   54734   Min.   :   70473   Min.   :  500  
##  1st Qu.:116571   1st Qu.: 3639987   1st Qu.: 4379705   1st Qu.: 8000  
##  Median :233142   Median :10107897   Median :11941075   Median :12000  
##  Mean   :233142   Mean   :13079730   Mean   :14597661   Mean   :14317  
##  3rd Qu.:349713   3rd Qu.:20731209   3rd Qu.:23001541   3rd Qu.:20000  
##  Max.   :466284   Max.   :38098114   Max.   :40860827   Max.   :35000  
##                                                                        
##   funded_amnt    funded_amnt_inv    int_rate      installment     
##  Min.   :  500   Min.   :    0   Min.   : 5.42   Min.   :  15.67  
##  1st Qu.: 8000   1st Qu.: 8000   1st Qu.:10.99   1st Qu.: 256.69  
##  Median :12000   Median :12000   Median :13.66   Median : 379.89  
##  Mean   :14292   Mean   :14222   Mean   :13.83   Mean   : 432.06  
##  3rd Qu.:20000   3rd Qu.:19950   3rd Qu.:16.49   3rd Qu.: 566.58  
##  Max.   :35000   Max.   :35000   Max.   :26.06   Max.   :1409.99  
##                                                                   
##    annual_inc           dti         delinq_2yrs      inq_last_6mths   
##  Min.   :   1896   Min.   : 0.00   Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.:  45000   1st Qu.:11.36   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median :  63000   Median :16.87   Median : 0.0000   Median : 0.0000  
##  Mean   :  73277   Mean   :17.22   Mean   : 0.2847   Mean   : 0.8047  
##  3rd Qu.:  88960   3rd Qu.:22.78   3rd Qu.: 0.0000   3rd Qu.: 1.0000  
##  Max.   :7500000   Max.   :39.99   Max.   :29.0000   Max.   :33.0000  
##  NA's   :4                         NA's   :29        NA's   :29       
##  mths_since_last_delinq mths_since_last_record    open_acc    
##  Min.   :  0.0          Min.   :  0.0          Min.   : 0.00  
##  1st Qu.: 16.0          1st Qu.: 53.0          1st Qu.: 8.00  
##  Median : 31.0          Median : 76.0          Median :10.00  
##  Mean   : 34.1          Mean   : 74.3          Mean   :11.19  
##  3rd Qu.: 49.0          3rd Qu.:102.0          3rd Qu.:14.00  
##  Max.   :188.0          Max.   :129.0          Max.   :84.00  
##  NA's   :250351         NA's   :403647         NA's   :29     
##     pub_rec          revol_bal         revol_util       total_acc     
##  Min.   : 0.0000   Min.   :      0   Min.   :  0.00   Min.   :  1.00  
##  1st Qu.: 0.0000   1st Qu.:   6413   1st Qu.: 39.20   1st Qu.: 17.00  
##  Median : 0.0000   Median :  11764   Median : 57.60   Median : 23.00  
##  Mean   : 0.1606   Mean   :  16230   Mean   : 56.18   Mean   : 25.06  
##  3rd Qu.: 0.0000   3rd Qu.:  20333   3rd Qu.: 74.70   3rd Qu.: 32.00  
##  Max.   :63.0000   Max.   :2568995   Max.   :892.30   Max.   :156.00  
##  NA's   :29                          NA's   :340      NA's   :29      
##    out_prncp       out_prncp_inv      total_pymnt    total_pymnt_inv
##  Min.   :    0.0   Min.   :    0.0   Min.   :    0   Min.   :    0  
##  1st Qu.:    0.0   1st Qu.:    0.0   1st Qu.: 5552   1st Qu.: 5499  
##  Median :  441.5   Median :  441.4   Median : 9419   Median : 9355  
##  Mean   : 4410.1   Mean   : 4408.4   Mean   :11541   Mean   :11470  
##  3rd Qu.: 7341.6   3rd Qu.: 7338.4   3rd Qu.:15308   3rd Qu.:15231  
##  Max.   :32160.4   Max.   :32160.4   Max.   :57778   Max.   :57778  
##                                                                     
##  total_rec_prncp total_rec_int     total_rec_late_fee   recoveries      
##  Min.   :    0   Min.   :    0.0   Min.   :  0.0000   Min.   :    0.00  
##  1st Qu.: 3709   1st Qu.:  957.3   1st Qu.:  0.0000   1st Qu.:    0.00  
##  Median : 6818   Median : 1818.9   Median :  0.0000   Median :    0.00  
##  Mean   : 8866   Mean   : 2588.7   Mean   :  0.6501   Mean   :   85.34  
##  3rd Qu.:12000   3rd Qu.: 3304.5   3rd Qu.:  0.0000   3rd Qu.:    0.00  
##  Max.   :35000   Max.   :24205.6   Max.   :358.6800   Max.   :33520.27  
##                                                                         
##  collection_recovery_fee last_pymnt_amnt   collections_12_mths_ex_med
##  Min.   :   0.000        Min.   :    0.0   Min.   : 0.00000          
##  1st Qu.:   0.000        1st Qu.:  312.6   1st Qu.: 0.00000          
##  Median :   0.000        Median :  546.0   Median : 0.00000          
##  Mean   :   8.962        Mean   : 3123.9   Mean   : 0.00909          
##  3rd Qu.:   0.000        3rd Qu.: 3187.5   3rd Qu.: 0.00000          
##  Max.   :7002.190        Max.   :36234.4   Max.   :20.00000          
##                                            NA's   :145               
##  mths_since_last_major_derog  policy_code acc_now_delinq      tot_coll_amt    
##  Min.   :  0.0               Min.   :1    Min.   :0.000000   Min.   :      0  
##  1st Qu.: 26.0               1st Qu.:1    1st Qu.:0.000000   1st Qu.:      0  
##  Median : 42.0               Median :1    Median :0.000000   Median :      0  
##  Mean   : 42.9               Mean   :1    Mean   :0.004002   Mean   :    192  
##  3rd Qu.: 59.0               3rd Qu.:1    3rd Qu.:0.000000   3rd Qu.:      0  
##  Max.   :188.0               Max.   :1    Max.   :5.000000   Max.   :9152545  
##  NA's   :367311                           NA's   :29         NA's   :70276    
##   tot_cur_bal      total_rev_hi_lim 
##  Min.   :      0   Min.   :      0  
##  1st Qu.:  28618   1st Qu.:  13500  
##  Median :  81539   Median :  22800  
##  Mean   : 138802   Mean   :  30379  
##  3rd Qu.: 208953   3rd Qu.:  37900  
##  Max.   :8000078   Max.   :9999999  
##  NA's   :70276     NA's   :70276
library(dplyr)
library(tidyr)
df_null <- df %>%
  summarise_all(~ mean(is.na(.))*100) %>%
  gather(key = "column", value = "df_null") %>%
  arrange(desc(df_null))
df_null
## # A tibble: 75 × 2
##    column                    df_null
##    <chr>                       <dbl>
##  1 annual_inc_joint              100
##  2 dti_joint                     100
##  3 verification_status_joint     100
##  4 open_acc_6m                   100
##  5 open_il_6m                    100
##  6 open_il_12m                   100
##  7 open_il_24m                   100
##  8 mths_since_rcnt_il            100
##  9 total_bal_il                  100
## 10 il_util                       100
## # ℹ 65 more rows

2 Data Cleaning

Menghapus kolom yang tidak diperlukan:

df <- df %>%
  select((-one_of(c('X', 'id', 'url', 'desc','member_id', 'url', 'title', 'addr_state', 'zip_code', 'policy_code', 'application_type', 'emp_title'))))

Pengecekan Data Duplicated

df[duplicated(df),]
## # A tibble: 0 × 64
## # ℹ 64 variables: loan_amnt <int>, funded_amnt <int>, funded_amnt_inv <dbl>,
## #   term <chr>, int_rate <dbl>, installment <dbl>, grade <chr>,
## #   sub_grade <chr>, emp_length <chr>, home_ownership <chr>, annual_inc <dbl>,
## #   verification_status <chr>, issue_d <chr>, loan_status <chr>,
## #   pymnt_plan <chr>, purpose <chr>, dti <dbl>, delinq_2yrs <dbl>,
## #   earliest_cr_line <chr>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## #   mths_since_last_record <dbl>, open_acc <dbl>, pub_rec <dbl>, …
df %>%
  distinct() %>%
  anti_join(df, by = names(df))
## # A tibble: 0 × 64
## # ℹ 64 variables: loan_amnt <int>, funded_amnt <int>, funded_amnt_inv <dbl>,
## #   term <chr>, int_rate <dbl>, installment <dbl>, grade <chr>,
## #   sub_grade <chr>, emp_length <chr>, home_ownership <chr>, annual_inc <dbl>,
## #   verification_status <chr>, issue_d <chr>, loan_status <chr>,
## #   pymnt_plan <chr>, purpose <chr>, dti <dbl>, delinq_2yrs <dbl>,
## #   earliest_cr_line <chr>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## #   mths_since_last_record <dbl>, open_acc <dbl>, pub_rec <dbl>, …

Tidak ada data yang duplicated. Hal ini mengindikasikan setiap kolom telah mewakili satu individu unik.

df %>%
  summarise_all(n_distinct, na.rm = TRUE)
## # A tibble: 1 × 64
##   loan_amnt funded_amnt funded_amnt_inv  term int_rate installment grade
##       <int>       <int>           <int> <int>    <int>       <int> <int>
## 1      1352        1354            9854     2      506       55622     7
## # ℹ 57 more variables: sub_grade <int>, emp_length <int>, home_ownership <int>,
## #   annual_inc <int>, verification_status <int>, issue_d <int>,
## #   loan_status <int>, pymnt_plan <int>, purpose <int>, dti <int>,
## #   delinq_2yrs <int>, earliest_cr_line <int>, inq_last_6mths <int>,
## #   mths_since_last_delinq <int>, mths_since_last_record <int>, open_acc <int>,
## #   pub_rec <int>, revol_bal <int>, revol_util <int>, total_acc <int>,
## #   initial_list_status <int>, out_prncp <int>, out_prncp_inv <int>, …

Missing Values

Menghapus kolom yang memiliki percentage missing values > 50%.

df <- df %>%
  select(-which(colMeans(is.na(.)) > 0.5))

Pada kolom tot_coll_amt, tot_cur_bal, total_rev_hi_lim, terdapat 15% missing values. Penggisian missing values pada kolom ini dilakukan dengan nilai 0. Hal ini dilakukan dengan asumsi bahwa customer tersebut hanya meminjam sekali.

tot_coll_amt Total collection amounts ever owed
tot_cur_bal Total current balance of all accounts
total_rev_hi_lim Total revolving high credit/credit limit
df <- df %>%
  mutate(across(c('tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim'), ~ifelse(is.na(.), 0, .)))

Selanjutnya, pada kolom numerik dilakukan pengisian missing values dengan nilai rata-rata.

df <- df %>%
  mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))

Pada kolom kategorik, penggisian missing values dengan nilai modusnya.

# membangun fungsi modus terlebih dahulu 
mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
# Mendapatkan nama-nama kolom kategorikal
categorical_columns <- colnames(df[, sapply(df, function(x) is.factor(x) | is.character(x))])

# Mengisi nilai yang hilang dengan modus pada kolom kategorikal
df <- df %>%
  mutate(across(all_of(categorical_columns), ~ifelse(is.na(.), mode(., na.rm = TRUE), .)))
# df %>%
#   summarise(across(everything(), ~sum(is.na(.))))

colSums(is.na(df))
##                  loan_amnt                funded_amnt 
##                          0                          0 
##            funded_amnt_inv                       term 
##                          0                          0 
##                   int_rate                installment 
##                          0                          0 
##                      grade                  sub_grade 
##                          0                          0 
##                 emp_length             home_ownership 
##                          0                          0 
##                 annual_inc        verification_status 
##                          0                          0 
##                    issue_d                loan_status 
##                          0                          0 
##                 pymnt_plan                    purpose 
##                          0                          0 
##                        dti                delinq_2yrs 
##                          0                          0 
##           earliest_cr_line             inq_last_6mths 
##                          0                          0 
##                   open_acc                    pub_rec 
##                          0                          0 
##                  revol_bal                 revol_util 
##                          0                          0 
##                  total_acc        initial_list_status 
##                          0                          0 
##                  out_prncp              out_prncp_inv 
##                          0                          0 
##                total_pymnt            total_pymnt_inv 
##                          0                          0 
##            total_rec_prncp              total_rec_int 
##                          0                          0 
##         total_rec_late_fee                 recoveries 
##                          0                          0 
##    collection_recovery_fee               last_pymnt_d 
##                          0                          0 
##            last_pymnt_amnt               next_pymnt_d 
##                          0                          0 
##         last_credit_pull_d collections_12_mths_ex_med 
##                          0                          0 
##             acc_now_delinq               tot_coll_amt 
##                          0                          0 
##                tot_cur_bal           total_rev_hi_lim 
##                          0                          0

Semua kolom tidak lagi memiliki missing values.

3 Labelling

Dalam project credit risk modeling, tujuan utama adalah untuk melakukan prediksi terhadap suatu individu akan kemampuan mereka untuk melakukan pembayaran terhadap pinjaman/kredit yang diberikan. Oleh karena itu, variabel target yang digunakan harus mencerminkan kemampuan individu dalam hal tersebut.

Dalam dataset ini, variabel loan_status adalah variabel yang dapat dijadikan variabel target karena mencerminkan performa masing-masing individu dalam melakukan pembayaran terhadap pinjaman/kredit selama ini.

library(ggplot2)
df %>%
  ggplot(aes(y = reorder(loan_status, -table(loan_status)[loan_status]))) +
  geom_bar() +
  geom_text(stat = "count",aes(label = scales::percent(
    ..count../sum(after_stat(count))),
    x = ..count..),
    position = position_dodge(width = 0.9),
    hjust = -0.1)+
  labs(y = "Loan Status",x = "Count")+
  theme_bw() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

prop.table(table(df$loan_status))*100
## 
##                                         Charged Off 
##                                           9.1092358 
##                                             Current 
##                                          48.0877575 
##                                             Default 
##                                           0.1784316 
## Does not meet the credit policy. Status:Charged Off 
##                                           0.1632049 
##  Does not meet the credit policy. Status:Fully Paid 
##                                           0.4263487 
##                                          Fully Paid 
##                                          39.6193315 
##                                     In Grace Period 
##                                           0.6746947 
##                                   Late (16-30 days) 
##                                           0.2612136 
##                                  Late (31-120 days) 
##                                           1.4797817

Pada kolom loan_status memiliki beberapa nilai:

Current artinya pembayaran lancar; Charged Off artinya pembayaran macet sehingga dihapusbukukan; Late artinya pembayaran telat dilakukan; In Grace Period artinya dalam masa tenggang; Fully Paid artinya pembayaran lunas; Default artinya pembayaran macet

Dari definisi-definisi tersebut, masing-masing individu dapat ditandai apakah mereka merupakan bad loan (peminjam yang buruk) atau good loan (peminjam yang baik)

Definisi bad dan good loan terkadang bisa berbeda tergantung dari kebutuhan bisnis. Dalam project ini akan menggunakan keterlambatan pembayaran di atas 30 hari dan yang lebih buruk dari itu sebagai penanda bad loan.

library(dplyr)
dfa <- df # untuk backup data
df <- df %>%
  mutate(bad_flag = ifelse(loan_status %in% c('Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off', 'Late (31-120 days)'), 1, 0)) %>%
  select(-loan_status)
prop.table(table(df$bad_flag))*100
## 
##        0        1 
## 89.06935 10.93065
library(ggplot2)
df %>%
  ggplot(aes(x = factor(bad_flag))) +
  geom_bar(aes(y = ..count.., fill  = factor(bad_flag)),
           position = "dodge") +
  geom_text(stat = "count",aes(label = scales::percent(
    ..count../sum(..count..)),
    y = ..count.., group = factor(bad_flag)),
    position = position_dodge(width = 0.9),
    vjust = -0.5)+
  scale_y_continuous(breaks = seq(0, 300000, by = 50000))+
  labs(title = "Proportion of Clients Loan Status",
       x = "Loan Status", y = "Count") +
  scale_fill_manual(values = c("skyblue", "pink"),
                    name = "Loan Status", 
                    labels = c("Good", "Bad")) +
  scale_x_discrete(labels = c("0" = "Good", "1" = "Bad")) +
  theme_bw()

Diperoleh imbalanced dataset problem setelah melakukan labelling terhadap bad/good loan.

4 Feature Enggineering

4.1 emp_length & term

table(unique(df$emp_length))
## 
##            < 1 year    1 year 10+ years   2 years   3 years   4 years   5 years 
##         1         1         1         1         1         1         1         1 
##   6 years   7 years   8 years   9 years 
##         1         1         1         1
unique(df$term)
## [1] " 36 months" " 60 months"
df <- df %>%
 mutate(emp_length = as.numeric(gsub("\\+ years| years| year", "", gsub("< 1 year", "0", emp_length)))) %>%
  mutate(term = as.numeric(gsub("months", "", term)))
sum(is.na(df$emp_length))
## [1] 21008
df <- df %>%
  mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .)))
sum(is.na(df$emp_length))
## [1] 0

4.2 variabel tanggal

earliest_cr_line The month the borrower’s earliest reported credit line was opened Bulan jalur kredit yang paling awal yang dilaporkan peminjam dibuka
issue_d The month which the loan was funded Bulan yang didanai pinjaman
last_pymnt_d Last month payment was received Bulan lalu pembayaran diterima
next_pymnt_d Next scheduled payment date Tanggal Pembayaran Terjadwal Berikutnya
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
date_columns <- c('earliest_cr_line', 'last_credit_pull_d', 'last_pymnt_d', 'issue_d', 'next_pymnt_d')

df<- df %>%
  mutate(across(all_of(date_columns),
                ~as.Date(parse_date_time(., orders = '%b-%y'))))
class(df$next_pymnt_d)
## [1] "Date"
class(df$issue_d)
## [1] "Date"
head(df[, c('earliest_cr_line', 'last_credit_pull_d', 'last_pymnt_d', 'issue_d', 'next_pymnt_d')])
## # A tibble: 6 × 5
##   earliest_cr_line last_credit_pull_d last_pymnt_d issue_d    next_pymnt_d
##   <date>           <date>             <date>       <date>     <date>      
## 1 1985-01-01       2016-01-01         2015-01-01   2011-12-01 NA          
## 2 1999-04-01       2013-09-01         2013-04-01   2011-12-01 NA          
## 3 2001-11-01       2016-01-01         2014-06-01   2011-12-01 NA          
## 4 1996-02-01       2015-01-01         2015-01-01   2011-12-01 NA          
## 5 1996-01-01       2016-01-01         2016-01-01   2011-12-01 2016-02-01  
## 6 2004-11-01       2015-09-01         2015-01-01   2011-12-01 NA
# mengisi nilai kosong pada kolom tangga dengan modus
df <- df %>%
  mutate(across(all_of(date_columns), ~replace_na(as.Date(., format = "%Y-%m-%d"), mode(as.Date(.[!is.na(.)], format = "%Y-%m-%d")))))
class(df$issue_d)
## [1] "Date"
df[, c('earliest_cr_line', 'last_credit_pull_d', 'last_pymnt_d', 'issue_d', 'next_pymnt_d')]
## # A tibble: 466,285 × 5
##    earliest_cr_line last_credit_pull_d last_pymnt_d issue_d    next_pymnt_d
##    <date>           <date>             <date>       <date>     <date>      
##  1 1985-01-01       2016-01-01         2015-01-01   2011-12-01 2016-02-01  
##  2 1999-04-01       2013-09-01         2013-04-01   2011-12-01 2016-02-01  
##  3 2001-11-01       2016-01-01         2014-06-01   2011-12-01 2016-02-01  
##  4 1996-02-01       2015-01-01         2015-01-01   2011-12-01 2016-02-01  
##  5 1996-01-01       2016-01-01         2016-01-01   2011-12-01 2016-02-01  
##  6 2004-11-01       2015-09-01         2015-01-01   2011-12-01 2016-02-01  
##  7 2005-07-01       2016-01-01         2016-01-01   2011-12-01 2016-02-01  
##  8 2007-01-01       2014-12-01         2015-01-01   2011-12-01 2016-02-01  
##  9 2004-04-01       2012-08-01         2012-04-01   2011-12-01 2016-02-01  
## 10 2004-09-01       2013-03-01         2012-11-01   2011-12-01 2016-02-01  
## # ℹ 466,275 more rows

Selanjutnya ditambahkan variabel payment_time yang berisi jangka waktu pembayaran loan dan variabel credit_time sebagai jangka waktu credit telah berlangsung.

df<- df %>%
  mutate(payment_time = mapply(function(d1, d2) (year(d1) - year(d2)) * 12 + (month(d1) - month(d2)), next_pymnt_d, last_pymnt_d),
         credit_time = mapply(function(d1, d2) year(d1) - year(d2), last_credit_pull_d, earliest_cr_line),
         loan_time = mapply(function(d1, d2) (year(d1) - year(d2)) * 12 + (month(d1) - month(d2)), next_pymnt_d, issue_d))

head(df[,c('last_pymnt_d', 'next_pymnt_d', 'payment_time')])
## # A tibble: 6 × 3
##   last_pymnt_d next_pymnt_d payment_time
##   <date>       <date>              <dbl>
## 1 2015-01-01   2016-02-01             13
## 2 2013-04-01   2016-02-01             34
## 3 2014-06-01   2016-02-01             20
## 4 2015-01-01   2016-02-01             13
## 5 2016-01-01   2016-02-01              1
## 6 2015-01-01   2016-02-01             13
summary(df$payment_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -85.000   1.000   2.000   8.909  13.000  97.000
head(df[,c('last_credit_pull_d', 'earliest_cr_line',
            'credit_time')])
## # A tibble: 6 × 3
##   last_credit_pull_d earliest_cr_line credit_time
##   <date>             <date>                 <dbl>
## 1 2016-01-01         1985-01-01                31
## 2 2013-09-01         1999-04-01                14
## 3 2016-01-01         2001-11-01                15
## 4 2015-01-01         1996-02-01                19
## 5 2016-01-01         1996-01-01                20
## 6 2015-09-01         2004-11-01                11

Terdapat nilai yang negatif pada kolom payment_time dengan nilai ini akan diganti menjadi 0. Penyebab diperoleh nilai yang negatif adalah adanya clients yang tidak dapat melakukan pembayaran.

df[df$payment_time<0,c('last_pymnt_d', 'next_pymnt_d', 'payment_time')]
## # A tibble: 12 × 3
##    last_pymnt_d next_pymnt_d payment_time
##    <date>       <date>              <dbl>
##  1 2016-01-01   2011-02-01            -59
##  2 2016-01-01   2010-10-01            -63
##  3 2016-01-01   2010-10-01            -63
##  4 2016-01-01   2010-06-01            -67
##  5 2016-01-01   2010-06-01            -67
##  6 2016-01-01   2010-06-01            -67
##  7 2016-01-01   2010-05-01            -68
##  8 2016-01-01   2010-04-01            -69
##  9 2016-01-01   2010-04-01            -69
## 10 2016-01-01   2009-11-01            -74
## 11 2016-01-01   2008-12-01            -85
## 12 2016-01-01   2009-01-01            -84
summary(df$credit_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -57.00   13.00   17.00   18.03   22.00   47.00
df[df$credit_time<0,c('last_credit_pull_d',
                         'earliest_cr_line','credit_time')]
## # A tibble: 1,169 × 3
##    last_credit_pull_d earliest_cr_line credit_time
##    <date>             <date>                 <dbl>
##  1 2013-09-01         2062-09-01               -49
##  2 2015-09-01         2068-09-01               -53
##  3 2016-01-01         2064-09-01               -48
##  4 2015-05-01         2067-09-01               -52
##  5 2014-11-01         2065-02-01               -51
##  6 2013-04-01         2067-06-01               -54
##  7 2014-04-01         2067-08-01               -53
##  8 2014-11-01         2063-12-01               -49
##  9 2016-01-01         2068-09-01               -52
## 10 2015-10-01         2068-09-01               -53
## # ℹ 1,159 more rows
summary(df$loan_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   25.00   28.88   35.00  104.00
head(df[,c('issue_d', 'next_pymnt_d',
            'loan_time')])
## # A tibble: 6 × 3
##   issue_d    next_pymnt_d loan_time
##   <date>     <date>           <dbl>
## 1 2011-12-01 2016-02-01          50
## 2 2011-12-01 2016-02-01          50
## 3 2011-12-01 2016-02-01          50
## 4 2011-12-01 2016-02-01          50
## 5 2011-12-01 2016-02-01          50
## 6 2011-12-01 2016-02-01          50

Juga terdapat nilai yang negatif pada kolom credit_time dengan nilai ini akan diganti menjadi maksimum. Penyebab diperoleh nilai yang negatif adalah adanya kesalahan pengubahan data tanggal yaitu misalnya sept-62 diubah menjadi 2062-09-01. Data ini harusnya diubah menjadi 1962-09-01.

df <- df %>%
  mutate(
    payment_time = ifelse(payment_time<0, 0, payment_time),
    credit_time = ifelse(credit_time<0, max(credit_time, na.rm = TRUE), credit_time)
  )

Perubahan summary pada kolom payment_time dan credit_time.

summary(df$payment_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   8.911  13.000  97.000
summary(df$credit_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.00   17.00   18.27   22.00   47.00

Selanjutnya, kolom tanggal akan dihapus dari dataset

df <- df[,!names(df) %in% date_columns]

5 EDA

Exploratory Data Analysis

5.1 Correlation Check

library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(df[,sapply(df, is.numeric)]), method="number",
         type="upper", diag=FALSE, tl.cex=0.7)

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.2
pmat <- df %>%
  select_if(is.numeric) %>%
  cor_pmat()

df %>%
  select_if(is.numeric) %>%
  cor() %>%
  ggcorrplot(colors = c("firebrick", "white", "green"),
             hc.order = TRUE, p.mat = pmat, 
             lab = TRUE, lab_size = 4, digits = 1, 
             ggtheme = theme_bw())

data.frame(abs(cor(df[,sapply(df, is.numeric)])))
##                              loan_amnt funded_amnt funded_amnt_inv         term
## loan_amnt                  1.000000000 0.998547547     0.994347021 0.4128702547
## funded_amnt                0.998547547 1.000000000     0.996125407 0.4108621600
## funded_amnt_inv            0.994347021 0.996125407     1.000000000 0.4120047058
## term                       0.412870255 0.410862160     0.412004706 1.0000000000
## int_rate                   0.167182789 0.167920761     0.169353404 0.4433946928
## installment                0.949665620 0.951786604     0.947387015 0.1596305798
## emp_length                 0.117569597 0.118105464     0.121100727 0.0795046945
## annual_inc                 0.370892683 0.370310580     0.367805276 0.0699496280
## dti                        0.057318779 0.058672721     0.062606252 0.0919121413
## delinq_2yrs                0.006868767 0.007413858     0.008177758 0.0041488967
## inq_last_6mths             0.020268657 0.020910467     0.027503013 0.0100029233
## open_acc                   0.204189445 0.204990936     0.205813634 0.0777218963
## pub_rec                    0.081107663 0.080564033     0.078855470 0.0227776455
## revol_bal                  0.331422648 0.331039167     0.327436495 0.0925771092
## revol_util                 0.117675554 0.118803283     0.120684027 0.0841125934
## total_acc                  0.235263056 0.235136164     0.235454597 0.1013299180
## out_prncp                  0.518699588 0.521450679     0.525143168 0.4343065006
## out_prncp_inv              0.518659672 0.521412227     0.525115350 0.4342730598
## total_pymnt                0.743840619 0.743362844     0.738036877 0.1749232411
## total_pymnt_inv            0.743580430 0.743440906     0.745459871 0.1773277404
## total_rec_prncp            0.612776695 0.612317698     0.607027005 0.0097980128
## total_rec_int              0.715875859 0.715664457     0.713460135 0.5343329157
## total_rec_late_fee         0.043649948 0.043350435     0.038984991 0.0119152816
## recoveries                 0.110691145 0.110338153     0.107944429 0.0900185914
## collection_recovery_fee    0.077336200 0.077189551     0.073781226 0.0559001087
## last_pymnt_amnt            0.304243272 0.304178424     0.303104762 0.1115968183
## collections_12_mths_ex_med 0.008039230 0.007793597     0.007131168 0.0003727981
## acc_now_delinq             0.006287670 0.006476418     0.006806056 0.0077262582
## tot_coll_amt               0.002023684 0.001989434     0.001890075 0.0014347596
## tot_cur_bal                0.330107186 0.333024613     0.337985206 0.1187821441
## total_rev_hi_lim           0.292122180 0.294750624     0.299241516 0.0672086473
## bad_flag                   0.011819203 0.011003248     0.007191411 0.0635648402
## payment_time               0.121654654 0.127028170     0.147383752 0.1007395526
## credit_time                0.184766244 0.184309567     0.184472644 0.0637074295
## loan_time                  0.120368906 0.127753649     0.148743161 0.0805462217
##                               int_rate installment   emp_length   annual_inc
## loan_amnt                  0.167182789 0.949665620 0.1175695974 0.3708926827
## funded_amnt                0.167920761 0.951786604 0.1181054637 0.3703105805
## funded_amnt_inv            0.169353404 0.947387015 0.1211007268 0.3678052764
## term                       0.443394693 0.159630580 0.0795046945 0.0699496280
## int_rate                   1.000000000 0.148790096 0.0297775049 0.0460689629
## installment                0.148790096 1.000000000 0.1024024650 0.3681450476
## emp_length                 0.029777505 0.102402465 1.0000000000 0.0729817218
## annual_inc                 0.046068963 0.368145048 0.0729817218 1.0000000000
## dti                        0.159657407 0.049918721 0.0486368488 0.1885421251
## delinq_2yrs                0.079209110 0.017027938 0.0297582840 0.0588947748
## inq_last_6mths             0.205676570 0.002304978 0.0160189611 0.0566743497
## open_acc                   0.012321458 0.196847258 0.0416596265 0.1577782383
## pub_rec                    0.066712130 0.070064477 0.0380229074 0.0154371450
## revol_bal                  0.004588639 0.320239374 0.0903617294 0.3323159370
## revol_util                 0.323249068 0.136759333 0.0434388712 0.0375630079
## total_acc                  0.032999652 0.215908000 0.1114628944 0.2246824781
## out_prncp                  0.137427089 0.408837770 0.0856896406 0.1683168076
## out_prncp_inv              0.137451034 0.408810796 0.0856914062 0.1683170848
## total_pymnt                0.127035449 0.764360214 0.0782186118 0.2997844369
## total_pymnt_inv            0.129165807 0.763894699 0.0818547608 0.2987280965
## total_rec_prncp            0.032130464 0.663398905 0.0574376624 0.2758236095
## total_rec_int              0.485153918 0.641947759 0.0961843664 0.2129518016
## total_rec_late_fee         0.057875379 0.052340847 0.0116547697 0.0202729256
## recoveries                 0.127892527 0.105666476 0.0068811325 0.0169540700
## collection_recovery_fee    0.082197687 0.075344082 0.0046930307 0.0136803950
## last_pymnt_amnt            0.075608886 0.299867139 0.0373457042 0.1421560231
## collections_12_mths_ex_med 0.020319236 0.006027600 0.0002173682 0.0004448236
## acc_now_delinq             0.030340591 0.008927168 0.0096787831 0.0171326649
## tot_coll_amt               0.001415883 0.001522130 0.0010212463 0.0019074386
## tot_cur_bal                0.044509453 0.304368976 0.1133953970 0.4309300561
## total_rev_hi_lim           0.093074431 0.276346626 0.0867805332 0.2651156981
## bad_flag                   0.172360667 0.013935632 0.0161638181 0.0498630960
## payment_time               0.050143572 0.102778148 0.0843159591 0.0274456304
## credit_time                0.084212604 0.164275000 0.2210533559 0.1712608663
## loan_time                  0.076691358 0.109865608 0.0836697587 0.0351246765
##                                     dti  delinq_2yrs inq_last_6mths
## loan_amnt                  0.0573187794 0.0068687670    0.020268657
## funded_amnt                0.0586727211 0.0074138584    0.020910467
## funded_amnt_inv            0.0626062525 0.0081777577    0.027503013
## term                       0.0919121413 0.0041488967    0.010002923
## int_rate                   0.1596574074 0.0792091100    0.205676570
## installment                0.0499187207 0.0170279380    0.002304978
## emp_length                 0.0486368488 0.0297582840    0.016018961
## annual_inc                 0.1885421251 0.0588947748    0.056674350
## dti                        1.0000000000 0.0036671069    0.012485003
## delinq_2yrs                0.0036671069 1.0000000000    0.018000976
## inq_last_6mths             0.0124850030 0.0180009761    1.000000000
## open_acc                   0.3039144471 0.0591390343    0.092797307
## pub_rec                    0.0461659793 0.0108032590    0.038341369
## revol_bal                  0.1433802655 0.0311032392    0.015678670
## revol_util                 0.2002844038 0.0131672201    0.094671264
## total_acc                  0.2282215407 0.1307964968    0.124077802
## out_prncp                  0.1199841574 0.0438714904    0.070368170
## out_prncp_inv              0.1199721397 0.0438670711    0.070357459
## total_pymnt                0.0260071365 0.0196764626    0.020683323
## total_pymnt_inv            0.0217879468 0.0189634781    0.014826606
## total_rec_prncp            0.0638953484 0.0320802184    0.005339648
## total_rec_int              0.0895823585 0.0242924213    0.044105180
## total_rec_late_fee         0.0057469635 0.0243884608    0.029925981
## recoveries                 0.0214618789 0.0044794288    0.042931465
## collection_recovery_fee    0.0180835458 0.0053636240    0.032670847
## last_pymnt_amnt            0.0430676308 0.0137276775    0.041252185
## collections_12_mths_ex_med 0.0003350904 0.0391319901    0.001824517
## acc_now_delinq             0.0094944460 0.1265332731    0.006914717
## tot_coll_amt               0.0025062606 0.0004267052    0.001323279
## tot_cur_bal                0.0354641236 0.0894899630    0.028149856
## total_rev_hi_lim           0.1002964574 0.0110366785    0.012436673
## bad_flag                   0.0481018913 0.0012276433    0.073099143
## payment_time               0.1491317191 0.0597273145    0.065690449
## credit_time                0.0446264520 0.0944119875    0.006707277
## loan_time                  0.1673186105 0.0819043421    0.042821424
##                                open_acc     pub_rec   revol_bal  revol_util
## loan_amnt                  0.2041894452 0.081107663 0.331422648 0.117675554
## funded_amnt                0.2049909360 0.080564033 0.331039167 0.118803283
## funded_amnt_inv            0.2058136340 0.078855470 0.327436495 0.120684027
## term                       0.0777218963 0.022777646 0.092577109 0.084112593
## int_rate                   0.0123214582 0.066712130 0.004588639 0.323249068
## installment                0.1968472576 0.070064477 0.320239374 0.136759333
## emp_length                 0.0416596265 0.038022907 0.090361729 0.043438871
## annual_inc                 0.1577782383 0.015437145 0.332315937 0.037563008
## dti                        0.3039144471 0.046165979 0.143380265 0.200284404
## delinq_2yrs                0.0591390343 0.010803259 0.031103239 0.013167220
## inq_last_6mths             0.0927973074 0.038341369 0.015678670 0.094671264
## open_acc                   1.0000000000 0.030473338 0.222908137 0.124231591
## pub_rec                    0.0304733383 1.000000000 0.097960889 0.062494937
## revol_bal                  0.2229081366 0.097960889 1.000000000 0.206277024
## revol_util                 0.1242315913 0.062494937 0.206277024 1.000000000
## total_acc                  0.6821143279 0.007262237 0.198631658 0.094287508
## out_prncp                  0.1445640375 0.002379727 0.181300649 0.096677529
## out_prncp_inv              0.1445665986 0.002370931 0.181284730 0.096678224
## total_pymnt                0.1202069882 0.091161390 0.242185883 0.085910770
## total_pymnt_inv            0.1219014957 0.089689836 0.240251030 0.088216421
## total_rec_prncp            0.0972274820 0.088756684 0.209921839 0.024331798
## total_rec_int              0.1218005672 0.049249874 0.206982234 0.209598626
## total_rec_late_fee         0.0064886760 0.011815211 0.005888523 0.024641687
## recoveries                 0.0134323572 0.012689274 0.020966149 0.033102850
## collection_recovery_fee    0.0120704976 0.007016956 0.015156119 0.021145916
## last_pymnt_amnt            0.0549004486 0.027225452 0.087131869 0.010453890
## collections_12_mths_ex_med 0.0118596988 0.021527715 0.016146221 0.027665449
## acc_now_delinq             0.0181945005 0.002349158 0.001002554 0.022747353
## tot_coll_amt               0.0001830299 0.004372794 0.003635221 0.005136353
## tot_cur_bal                0.2537101368 0.038368880 0.388440351 0.081904808
## total_rev_hi_lim           0.2940651915 0.061555463 0.732382216 0.085171079
## bad_flag                   0.0061736872 0.009110424 0.018348468 0.050187844
## payment_time               0.1028287321 0.072875486 0.054071746 0.084595528
## credit_time                0.1479151986 0.071914752 0.208978888 0.007199994
## loan_time                  0.1332687216 0.116584698 0.043303384 0.047833406
##                              total_acc    out_prncp out_prncp_inv  total_pymnt
## loan_amnt                  0.235263056 0.5186995880  0.5186596717 0.7438406193
## funded_amnt                0.235136164 0.5214506792  0.5214122270 0.7433628439
## funded_amnt_inv            0.235454597 0.5251431685  0.5251153496 0.7380368769
## term                       0.101329918 0.4343065006  0.4342730598 0.1749232411
## int_rate                   0.032999652 0.1374270895  0.1374510340 0.1270354495
## installment                0.215908000 0.4088377704  0.4088107958 0.7643602142
## emp_length                 0.111462894 0.0856896406  0.0856914062 0.0782186118
## annual_inc                 0.224682478 0.1683168076  0.1683170848 0.2997844369
## dti                        0.228221541 0.1199841574  0.1199721397 0.0260071365
## delinq_2yrs                0.130796497 0.0438714904  0.0438670711 0.0196764626
## inq_last_6mths             0.124077802 0.0703681701  0.0703574590 0.0206833230
## open_acc                   0.682114328 0.1445640375  0.1445665986 0.1202069882
## pub_rec                    0.007262237 0.0023797267  0.0023709310 0.0911613902
## revol_bal                  0.198631658 0.1813006490  0.1812847304 0.2421858834
## revol_util                 0.094287508 0.0966775294  0.0966782244 0.0859107697
## total_acc                  1.000000000 0.1234039130  0.1233937266 0.1686782810
## out_prncp                  0.123403913 1.0000000000  0.9999982477 0.0220724107
## out_prncp_inv              0.123393727 0.9999982477  1.0000000000 0.0221411096
## total_pymnt                0.168678281 0.0220724107  0.0221411096 1.0000000000
## total_pymnt_inv            0.169945815 0.0166246723  0.0166813859 0.9958617153
## total_rec_prncp            0.152217528 0.1899325238  0.1899831211 0.9566581894
## total_rec_int              0.126851435 0.4881063840  0.4880193852 0.6244233556
## total_rec_late_fee         0.005511287 0.0066997764  0.0067116512 0.0261583352
## recoveries                 0.016007882 0.1072481328  0.1072407148 0.0224211213
## collection_recovery_fee    0.016140045 0.0727418693  0.0727368380 0.0008899632
## last_pymnt_amnt            0.113015554 0.3173808031  0.3173586917 0.6142763499
## collections_12_mths_ex_med 0.013274555 0.0203460992  0.0203392535 0.0247290523
## acc_now_delinq             0.027718622 0.0164777057  0.0164736264 0.0042887059
## tot_coll_amt               0.005127203 0.0006327966  0.0006323573 0.0025907744
## tot_cur_bal                0.307199521 0.2451596496  0.2451719538 0.1959408990
## total_rev_hi_lim           0.232427774 0.2195087733  0.2195095329 0.1612131873
## bad_flag                   0.023068054 0.1586758643  0.1586622653 0.1944376532
## payment_time               0.054888526 0.4361718206  0.4361407161 0.0489334273
## credit_time                0.298908651 0.1381579071  0.1381279611 0.1262558938
## loan_time                  0.103985776 0.4052968919  0.4053500877 0.1786651253
##                            total_pymnt_inv total_rec_prncp total_rec_int
## loan_amnt                      0.743580430     0.612776695   0.715875859
## funded_amnt                    0.743440906     0.612317698   0.715664457
## funded_amnt_inv                0.745459871     0.607027005   0.713460135
## term                           0.177327740     0.009798013   0.534332916
## int_rate                       0.129165807     0.032130464   0.485153918
## installment                    0.763894699     0.663398905   0.641947759
## emp_length                     0.081854761     0.057437662   0.096184366
## annual_inc                     0.298728097     0.275823610   0.212951802
## dti                            0.021787947     0.063895348   0.089582358
## delinq_2yrs                    0.018963478     0.032080218   0.024292421
## inq_last_6mths                 0.014826606     0.005339648   0.044105180
## open_acc                       0.121901496     0.097227482   0.121800567
## pub_rec                        0.089689836     0.088756684   0.049249874
## revol_bal                      0.240251030     0.209921839   0.206982234
## revol_util                     0.088216421     0.024331798   0.209598626
## total_acc                      0.169945815     0.152217528   0.126851435
## out_prncp                      0.016624672     0.189932524   0.488106384
## out_prncp_inv                  0.016681386     0.189983121   0.488019385
## total_pymnt                    0.995861715     0.956658189   0.624423356
## total_pymnt_inv                1.000000000     0.952157884   0.623730606
## total_rec_prncp                0.952157884     1.000000000   0.378303926
## total_rec_int                  0.623730606     0.378303926   1.000000000
## total_rec_late_fee             0.022136760     0.001048272   0.071733703
## recoveries                     0.023904134     0.115783587   0.030690965
## collection_recovery_fee        0.003727803     0.073920417   0.028152277
## last_pymnt_amnt                0.614216281     0.709675976   0.050888129
## collections_12_mths_ex_med     0.024087677     0.026160178   0.008005301
## acc_now_delinq                 0.003949829     0.008614144   0.009493475
## tot_coll_amt                   0.002496250     0.002335063   0.001864433
## tot_cur_bal                    0.202371552     0.174675988   0.157870720
## total_rev_hi_lim               0.167005460     0.151075701   0.109082261
## bad_flag                       0.195169168     0.253874586   0.026725785
## payment_time                   0.028562726     0.124622260   0.222452198
## credit_time                    0.127163874     0.104464559   0.128424173
## loan_time                      0.155873010     0.190346367   0.046253350
##                            total_rec_late_fee   recoveries
## loan_amnt                        0.0436499484 0.1106911453
## funded_amnt                      0.0433504353 0.1103381525
## funded_amnt_inv                  0.0389849914 0.1079444294
## term                             0.0119152816 0.0900185914
## int_rate                         0.0578753794 0.1278925270
## installment                      0.0523408470 0.1056664759
## emp_length                       0.0116547697 0.0068811325
## annual_inc                       0.0202729256 0.0169540700
## dti                              0.0057469635 0.0214618789
## delinq_2yrs                      0.0243884608 0.0044794288
## inq_last_6mths                   0.0299259811 0.0429314651
## open_acc                         0.0064886760 0.0134323572
## pub_rec                          0.0118152108 0.0126892737
## revol_bal                        0.0058885234 0.0209661493
## revol_util                       0.0246416873 0.0331028500
## total_acc                        0.0055112867 0.0160078819
## out_prncp                        0.0066997764 0.1072481328
## out_prncp_inv                    0.0067116512 0.1072407148
## total_pymnt                      0.0261583352 0.0224211213
## total_pymnt_inv                  0.0221367603 0.0239041337
## total_rec_prncp                  0.0010482724 0.1157835873
## total_rec_int                    0.0717337027 0.0306909648
## total_rec_late_fee               1.0000000000 0.0727026464
## recoveries                       0.0727026464 1.0000000000
## collection_recovery_fee          0.0690150543 0.8006664413
## last_pymnt_amnt                  0.0348203742 0.0707220589
## collections_12_mths_ex_med       0.0005565259 0.0010219600
## acc_now_delinq                   0.0034496846 0.0027615654
## tot_coll_amt                     0.0006081156 0.0006534373
## tot_cur_bal                      0.0067132556 0.0014090735
## total_rev_hi_lim                 0.0176665653 0.0011510894
## bad_flag                         0.1477504480 0.4411705780
## payment_time                     0.0290911992 0.1458420790
## credit_time                      0.0077855875 0.0179608157
## loan_time                        0.0557540533 0.0419128259
##                            collection_recovery_fee last_pymnt_amnt
## loan_amnt                             0.0773362002    0.3042432724
## funded_amnt                           0.0771895506    0.3041784240
## funded_amnt_inv                       0.0737812260    0.3031047618
## term                                  0.0559001087    0.1115968183
## int_rate                              0.0821976869    0.0756088858
## installment                           0.0753440819    0.2998671391
## emp_length                            0.0046930307    0.0373457042
## annual_inc                            0.0136803950    0.1421560231
## dti                                   0.0180835458    0.0430676308
## delinq_2yrs                           0.0053636240    0.0137276775
## inq_last_6mths                        0.0326708466    0.0412521854
## open_acc                              0.0120704976    0.0549004486
## pub_rec                               0.0070169564    0.0272254517
## revol_bal                             0.0151561190    0.0871318688
## revol_util                            0.0211459160    0.0104538899
## total_acc                             0.0161400455    0.1130155536
## out_prncp                             0.0727418693    0.3173808031
## out_prncp_inv                         0.0727368380    0.3173586917
## total_pymnt                           0.0008899632    0.6142763499
## total_pymnt_inv                       0.0037278034    0.6142162807
## total_rec_prncp                       0.0739204166    0.7096759757
## total_rec_int                         0.0281522772    0.0508881289
## total_rec_late_fee                    0.0690150543    0.0348203742
## recoveries                            0.8006664413    0.0707220589
## collection_recovery_fee               1.0000000000    0.0477920275
## last_pymnt_amnt                       0.0477920275    1.0000000000
## collections_12_mths_ex_med            0.0003710405    0.0097890508
## acc_now_delinq                        0.0013778417    0.0009956164
## tot_coll_amt                          0.0003216863    0.0014050910
## tot_cur_bal                           0.0017849011    0.1238012055
## total_rev_hi_lim                      0.0008177373    0.0842468716
## bad_flag                              0.2992273310    0.1680295309
## payment_time                          0.0860448933    0.2866262033
## credit_time                           0.0026421432    0.0039942499
## loan_time                             0.0222862492    0.0213078869
##                            collections_12_mths_ex_med acc_now_delinq
## loan_amnt                                0.0080392300   0.0062876700
## funded_amnt                              0.0077935972   0.0064764177
## funded_amnt_inv                          0.0071311677   0.0068060557
## term                                     0.0003727981   0.0077262582
## int_rate                                 0.0203192362   0.0303405912
## installment                              0.0060276003   0.0089271681
## emp_length                               0.0002173682   0.0096787831
## annual_inc                               0.0004448236   0.0171326649
## dti                                      0.0003350904   0.0094944460
## delinq_2yrs                              0.0391319901   0.1265332731
## inq_last_6mths                           0.0018245171   0.0069147168
## open_acc                                 0.0118596988   0.0181945005
## pub_rec                                  0.0215277146   0.0023491577
## revol_bal                                0.0161462205   0.0010025544
## revol_util                               0.0276654494   0.0227473535
## total_acc                                0.0132745554   0.0277186225
## out_prncp                                0.0203460992   0.0164777057
## out_prncp_inv                            0.0203392535   0.0164736264
## total_pymnt                              0.0247290523   0.0042887059
## total_pymnt_inv                          0.0240876771   0.0039498292
## total_rec_prncp                          0.0261601779   0.0086141441
## total_rec_int                            0.0080053005   0.0094934745
## total_rec_late_fee                       0.0005565259   0.0034496846
## recoveries                               0.0010219600   0.0027615654
## collection_recovery_fee                  0.0003710405   0.0013778417
## last_pymnt_amnt                          0.0097890508   0.0009956164
## collections_12_mths_ex_med               1.0000000000   0.0192869119
## acc_now_delinq                           0.0192869119   1.0000000000
## tot_coll_amt                             0.0068958753   0.0001059103
## tot_cur_bal                              0.0075629776   0.0304194293
## total_rev_hi_lim                         0.0004373544   0.0161110094
## bad_flag                                 0.0051194698   0.0001968804
## payment_time                             0.0330503497   0.0189002678
## credit_time                              0.0077897286   0.0252586599
## loan_time                                0.0514619288   0.0289237634
##                            tot_coll_amt tot_cur_bal total_rev_hi_lim
## loan_amnt                  0.0020236840 0.330107186     0.2921221804
## funded_amnt                0.0019894338 0.333024613     0.2947506243
## funded_amnt_inv            0.0018900751 0.337985206     0.2992415155
## term                       0.0014347596 0.118782144     0.0672086473
## int_rate                   0.0014158831 0.044509453     0.0930744308
## installment                0.0015221297 0.304368976     0.2763466260
## emp_length                 0.0010212463 0.113395397     0.0867805332
## annual_inc                 0.0019074386 0.430930056     0.2651156981
## dti                        0.0025062606 0.035464124     0.1002964574
## delinq_2yrs                0.0004267052 0.089489963     0.0110366785
## inq_last_6mths             0.0013232787 0.028149856     0.0124366735
## open_acc                   0.0001830299 0.253710137     0.2940651915
## pub_rec                    0.0043727941 0.038368880     0.0615554629
## revol_bal                  0.0036352206 0.388440351     0.7323822157
## revol_util                 0.0051363534 0.081904808     0.0851710785
## total_acc                  0.0051272028 0.307199521     0.2324277741
## out_prncp                  0.0006327966 0.245159650     0.2195087733
## out_prncp_inv              0.0006323573 0.245171954     0.2195095329
## total_pymnt                0.0025907744 0.195940899     0.1612131873
## total_pymnt_inv            0.0024962498 0.202371552     0.1670054599
## total_rec_prncp            0.0023350633 0.174675988     0.1510757012
## total_rec_int              0.0018644332 0.157870720     0.1090822605
## total_rec_late_fee         0.0006081156 0.006713256     0.0176665653
## recoveries                 0.0006534373 0.001409074     0.0011510894
## collection_recovery_fee    0.0003216863 0.001784901     0.0008177373
## last_pymnt_amnt            0.0014050910 0.123801206     0.0842468716
## collections_12_mths_ex_med 0.0068958753 0.007562978     0.0004373544
## acc_now_delinq             0.0001059103 0.030419429     0.0161110094
## tot_coll_amt               1.0000000000 0.002397711     0.0015358893
## tot_cur_bal                0.0023977108 1.000000000     0.4210577115
## total_rev_hi_lim           0.0015358893 0.421057712     1.0000000000
## bad_flag                   0.0013937395 0.063741222     0.0503257477
## payment_time               0.0045787456 0.207107599     0.1951475629
## credit_time                0.0032554656 0.172766847     0.1929114148
## loan_time                  0.0066111235 0.266781870     0.2457667167
##                                bad_flag payment_time credit_time   loan_time
## loan_amnt                  0.0118192031  0.121654654 0.184766244 0.120368906
## funded_amnt                0.0110032476  0.127028170 0.184309567 0.127753649
## funded_amnt_inv            0.0071914109  0.147383752 0.184472644 0.148743161
## term                       0.0635648402  0.100739553 0.063707430 0.080546222
## int_rate                   0.1723606670  0.050143572 0.084212604 0.076691358
## installment                0.0139356323  0.102778148 0.164275000 0.109865608
## emp_length                 0.0161638181  0.084315959 0.221053356 0.083669759
## annual_inc                 0.0498630960  0.027445630 0.171260866 0.035124677
## dti                        0.0481018913  0.149131719 0.044626452 0.167318610
## delinq_2yrs                0.0012276433  0.059727315 0.094411987 0.081904342
## inq_last_6mths             0.0730991426  0.065690449 0.006707277 0.042821424
## open_acc                   0.0061736872  0.102828732 0.147915199 0.133268722
## pub_rec                    0.0091104237  0.072875486 0.071914752 0.116584698
## revol_bal                  0.0183484678  0.054071746 0.208978888 0.043303384
## revol_util                 0.0501878437  0.084595528 0.007199994 0.047833406
## total_acc                  0.0230680539  0.054888526 0.298908651 0.103985776
## out_prncp                  0.1586758643  0.436171821 0.138157907 0.405296892
## out_prncp_inv              0.1586622653  0.436140716 0.138127961 0.405350088
## total_pymnt                0.1944376532  0.048933427 0.126255894 0.178665125
## total_pymnt_inv            0.1951691683  0.028562726 0.127163874 0.155873010
## total_rec_prncp            0.2538745860  0.124622260 0.104464559 0.190346367
## total_rec_int              0.0267257852  0.222452198 0.128424173 0.046253350
## total_rec_late_fee         0.1477504480  0.029091199 0.007785588 0.055754053
## recoveries                 0.4411705780  0.145842079 0.017960816 0.041912826
## collection_recovery_fee    0.2992273310  0.086044893 0.002642143 0.022286249
## last_pymnt_amnt            0.1680295309  0.286626203 0.003994250 0.021307887
## collections_12_mths_ex_med 0.0051194698  0.033050350 0.007789729 0.051461929
## acc_now_delinq             0.0001968804  0.018900268 0.025258660 0.028923763
## tot_coll_amt               0.0013937395  0.004578746 0.003255466 0.006611124
## tot_cur_bal                0.0637412217  0.207107599 0.172766847 0.266781870
## total_rev_hi_lim           0.0503257477  0.195147563 0.192911415 0.245766717
## bad_flag                   1.0000000000  0.237647430 0.053208439 0.076956923
## payment_time               0.2376474302  1.000000000 0.098204034 0.736315466
## credit_time                0.0532084388  0.098204034 1.000000000 0.037916352
## loan_time                  0.0769569226  0.736315466 0.037916352 1.000000000

Terdapat pasangan variabel yang memiliki korelai tinggi > 0.8, akan dipilih salah satu variabel saja.

dff <- df # untuk backup data
# Calculate the correlation matrix
corr_matrix <- abs(cor(dff[,sapply(dff, is.numeric)]))

# Create and apply the mask
mask <- upper.tri(corr_matrix, diag = TRUE)
corr_matrix[mask] <- NA

# Find columns with correlation greater than 0.7
high_col <- colnames(corr_matrix)[apply(corr_matrix, 1, function(x) any(x > 0.8, na.rm = TRUE))]
high_col
## [1] "funded_amnt"             "funded_amnt_inv"        
## [3] "installment"             "out_prncp_inv"          
## [5] "total_pymnt_inv"         "total_rec_prncp"        
## [7] "collection_recovery_fee"

Variabel dengan correlation greater than 0.7 dihapus dari dataset.

dff <- dff[,!names(dff) %in% high_col]

5.2 Categorical Check

colnames(dff[,sapply(dff, is.character)])
## [1] "grade"               "sub_grade"           "home_ownership"     
## [4] "verification_status" "pymnt_plan"          "purpose"            
## [7] "initial_list_status"
summarise_all(dff[,sapply(dff,is.character)], list(Unique_Count = ~length(unique(.)))) %>%
  gather(Column, Unique_Count) %>%
  filter(!is.na(Unique_Count))
## # A tibble: 7 × 2
##   Column                           Unique_Count
##   <chr>                                   <int>
## 1 grade_Unique_Count                          7
## 2 sub_grade_Unique_Count                     35
## 3 home_ownership_Unique_Count                 6
## 4 verification_status_Unique_Count            3
## 5 pymnt_plan_Unique_Count                     2
## 6 purpose_Unique_Count                       14
## 7 initial_list_status_Unique_Count            2

Kolom sub_grade akan dihapus karena memiliki nilai unik yang sangat banyak.

dff <- dff %>%
  select(-sub_grade)
sapply(dff %>% select_if(is.character), function(x) table(x) / length(x) * 100)
## $grade
## x
##          A          B          C          D          E          F          G 
## 16.0560601 29.3659457 26.8704762 16.4894860  7.6684860  2.8371061  0.7124398 
## 
## $home_ownership
## x
##          ANY     MORTGAGE         NONE        OTHER          OWN         RENT 
## 2.144611e-04 5.058601e+01 1.072306e-02 3.903192e-02 8.943886e+00 4.042013e+01 
## 
## $verification_status
## x
##    Not Verified Source Verified        Verified 
##        31.79107        32.16767        36.04126 
## 
## $pymnt_plan
## x
##           n           y 
## 99.99806985  0.00193015 
## 
## $purpose
## x
##                car        credit_card debt_consolidation        educational 
##         1.15744663        22.33762613        58.80416483         0.09050259 
##   home_improvement              house     major_purchase            medical 
##         5.69115455         0.48661227         2.10772382         0.98695004 
##             moving              other   renewable_energy     small_business 
##         0.64209657         5.08058376         0.07527585         1.50401578 
##           vacation            wedding 
##         0.53336479         0.50248239 
## 
## $initial_list_status
## x
##        f        w 
## 64.98279 35.01721

Pada kolom pymnt_plan terdiri dari 2 data unik yaitu n dan y. Namun, perbandingan jumlah nilai n dan y sangat jauh berbeda (kolom ini sangat didominasi oleh nilai n saja). Akibatnya, variabel pymnt_plan akan dihapus dari dataset.

dff <- dff %>%
  select(-pymnt_plan)
colnames(dff)
##  [1] "loan_amnt"                  "term"                      
##  [3] "int_rate"                   "grade"                     
##  [5] "emp_length"                 "home_ownership"            
##  [7] "annual_inc"                 "verification_status"       
##  [9] "purpose"                    "dti"                       
## [11] "delinq_2yrs"                "inq_last_6mths"            
## [13] "open_acc"                   "pub_rec"                   
## [15] "revol_bal"                  "revol_util"                
## [17] "total_acc"                  "initial_list_status"       
## [19] "out_prncp"                  "total_pymnt"               
## [21] "total_rec_int"              "total_rec_late_fee"        
## [23] "recoveries"                 "last_pymnt_amnt"           
## [25] "collections_12_mths_ex_med" "acc_now_delinq"            
## [27] "tot_coll_amt"               "tot_cur_bal"               
## [29] "total_rev_hi_lim"           "bad_flag"                  
## [31] "payment_time"               "credit_time"               
## [33] "loan_time"
colnames(dff[,sapply(dff, is.character)])
## [1] "grade"               "home_ownership"      "verification_status"
## [4] "purpose"             "initial_list_status"

Prediksi yang dilakukan menggunakan regresi logistik. Namun dalam pemodelan regresi logistik, seluruh variabel haruslah bertipe numerik. Oleh sebab itu, untuk variabel kategorik akan dihitung WoE dan IV.

6 WoE & IV

crs <- dff %>%
  group_by(grade, bad_flag) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(grade) %>%
  mutate(probability = count / sum(count)) %>%
  # filter(bad_flag == 1) %>%
  select(grade, bad_flag, probability) %>%
  pivot_wider(names_from = bad_flag, values_from = probability) %>%
  arrange(grade)
crs
## # A tibble: 7 × 3
## # Groups:   grade [7]
##   grade   `0`    `1`
##   <chr> <dbl>  <dbl>
## 1 A     0.961 0.0387
## 2 B     0.921 0.0786
## 3 C     0.885 0.115 
## 4 D     0.846 0.154 
## 5 E     0.805 0.195 
## 6 F     0.759 0.241 
## 7 G     0.722 0.278
dff <- dff %>%
  mutate_if(is.character,as.factor)
IV <- Information::create_infotables(dff, y = "bad_flag")
format(IV$Summary, scientific = FALSE)
##                      Variable             IV
## 30               payment_time 1.672809480038
## 24            last_pymnt_amnt 1.465967583644
## 19                  out_prncp 0.763527398792
## 20                total_pymnt 0.609577273280
## 3                    int_rate 0.325231670615
## 23                 recoveries 0.310996853162
## 4                       grade 0.290782139541
## 22         total_rec_late_fee 0.169374635760
## 32                  loan_time 0.097538855862
## 29           total_rev_hi_lim 0.059994509496
## 28                tot_cur_bal 0.055526858355
## 7                  annual_inc 0.053941606223
## 12             inq_last_6mths 0.046715671352
## 2                        term 0.038859834388
## 9                     purpose 0.036980194720
## 31                credit_time 0.035722840355
## 16                 revol_util 0.026880760157
## 10                        dti 0.025982966556
## 18        initial_list_status 0.024801321308
## 21              total_rec_int 0.023441581240
## 6              home_ownership 0.021661906308
## 8         verification_status 0.020831404235
## 17                  total_acc 0.007103883123
## 5                  emp_length 0.005983625175
## 15                  revol_bal 0.005743621809
## 1                   loan_amnt 0.003629138665
## 27               tot_coll_amt 0.001404716364
## 13                   open_acc 0.000676399516
## 14                    pub_rec 0.000581987123
## 25 collections_12_mths_ex_med 0.000238378880
## 11                delinq_2yrs 0.000005419885
## 26             acc_now_delinq 0.000002470560
IV_drop <- format(IV$Summary, scientific = FALSE) %>%
  filter(IV < 0.02) %>%
  pull(Variable)

akan didrop variabel yang memiliki nilai Information Value < 0.02

colnames(dff)
##  [1] "loan_amnt"                  "term"                      
##  [3] "int_rate"                   "grade"                     
##  [5] "emp_length"                 "home_ownership"            
##  [7] "annual_inc"                 "verification_status"       
##  [9] "purpose"                    "dti"                       
## [11] "delinq_2yrs"                "inq_last_6mths"            
## [13] "open_acc"                   "pub_rec"                   
## [15] "revol_bal"                  "revol_util"                
## [17] "total_acc"                  "initial_list_status"       
## [19] "out_prncp"                  "total_pymnt"               
## [21] "total_rec_int"              "total_rec_late_fee"        
## [23] "recoveries"                 "last_pymnt_amnt"           
## [25] "collections_12_mths_ex_med" "acc_now_delinq"            
## [27] "tot_coll_amt"               "tot_cur_bal"               
## [29] "total_rev_hi_lim"           "bad_flag"                  
## [31] "payment_time"               "credit_time"               
## [33] "loan_time"
dffs <- dff %>%
  select (-one_of(IV_drop))

head(dffs)
## # A tibble: 6 × 23
##    term int_rate grade home_ownership annual_inc verification_status purpose    
##   <dbl>    <dbl> <fct> <fct>               <dbl> <fct>               <fct>      
## 1    36     10.6 B     RENT                24000 Verified            credit_card
## 2    60     15.3 C     RENT                30000 Source Verified     car        
## 3    36     16.0 C     RENT                12252 Not Verified        small_busi…
## 4    36     13.5 C     RENT                49200 Source Verified     other      
## 5    60     12.7 B     RENT                80000 Source Verified     other      
## 6    36      7.9 A     RENT                36000 Source Verified     wedding    
## # ℹ 16 more variables: dti <dbl>, inq_last_6mths <dbl>, revol_util <dbl>,
## #   initial_list_status <fct>, out_prncp <dbl>, total_pymnt <dbl>,
## #   total_rec_int <dbl>, total_rec_late_fee <dbl>, recoveries <dbl>,
## #   last_pymnt_amnt <dbl>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## #   bad_flag <dbl>, payment_time <dbl>, credit_time <dbl>, loan_time <dbl>

Terdapat kolom yang bertipe kategorik. Dilakukan transformasi feature encoding untuk kolom kategorik ini.

7 Feature Encoding

sapply(dffs %>% select_if(is.factor), function(x) table(x))
## $grade
## x
##      A      B      C      D      E      F      G 
##  74867 136929 125293  76888  35757  13229   3322 
## 
## $home_ownership
## x
##      ANY MORTGAGE     NONE    OTHER      OWN     RENT 
##        1   235875       50      182    41704   188473 
## 
## $verification_status
## x
##    Not Verified Source Verified        Verified 
##          148237          149993          168055 
## 
## $purpose
## x
##                car        credit_card debt_consolidation        educational 
##               5397             104157             274195                422 
##   home_improvement              house     major_purchase            medical 
##              26537               2269               9828               4602 
##             moving              other   renewable_energy     small_business 
##               2994              23690                351               7013 
##           vacation            wedding 
##               2487               2343 
## 
## $initial_list_status
## x
##      f      w 
## 303005 163280

pada kolom home_ownership terdapat nilai any, none, other yang jumlahnya cukup kecil. Oleh sebab itu, ketiga nilai ini akan disatukan.

dffs <- dffs %>%
  mutate(home_ownership = factor(case_when(
    home_ownership %in% c("ANY", "NONE") ~ "OTHER",
    TRUE ~ as.character(home_ownership)
  )))

head(dffs)
## # A tibble: 6 × 23
##    term int_rate grade home_ownership annual_inc verification_status purpose    
##   <dbl>    <dbl> <fct> <fct>               <dbl> <fct>               <fct>      
## 1    36     10.6 B     RENT                24000 Verified            credit_card
## 2    60     15.3 C     RENT                30000 Source Verified     car        
## 3    36     16.0 C     RENT                12252 Not Verified        small_busi…
## 4    36     13.5 C     RENT                49200 Source Verified     other      
## 5    60     12.7 B     RENT                80000 Source Verified     other      
## 6    36      7.9 A     RENT                36000 Source Verified     wedding    
## # ℹ 16 more variables: dti <dbl>, inq_last_6mths <dbl>, revol_util <dbl>,
## #   initial_list_status <fct>, out_prncp <dbl>, total_pymnt <dbl>,
## #   total_rec_int <dbl>, total_rec_late_fee <dbl>, recoveries <dbl>,
## #   last_pymnt_amnt <dbl>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## #   bad_flag <dbl>, payment_time <dbl>, credit_time <dbl>, loan_time <dbl>
sapply(dffs %>% select_if(is.factor), function(x) table(x))
## $grade
## x
##      A      B      C      D      E      F      G 
##  74867 136929 125293  76888  35757  13229   3322 
## 
## $home_ownership
## x
## MORTGAGE    OTHER      OWN     RENT 
##   235875      233    41704   188473 
## 
## $verification_status
## x
##    Not Verified Source Verified        Verified 
##          148237          149993          168055 
## 
## $purpose
## x
##                car        credit_card debt_consolidation        educational 
##               5397             104157             274195                422 
##   home_improvement              house     major_purchase            medical 
##              26537               2269               9828               4602 
##             moving              other   renewable_energy     small_business 
##               2994              23690                351               7013 
##           vacation            wedding 
##               2487               2343 
## 
## $initial_list_status
## x
##      f      w 
## 303005 163280

7.1 Numerical Features Encode

Pada kolom numerik dilakukan proses standarisasi (penskalaan)

dffs_encoded <- dffs %>%
  select(-starts_with("bad_flag")) %>%
  mutate(across(where(is.numeric), scale)) %>%
  bind_cols(select(df, starts_with("bad_flag")), .)
head(dffs_encoded)
## # A tibble: 6 × 23
##   bad_flag term[,1] int_rate[,1] grade home_ownership annual_inc[,1]
##      <dbl>    <dbl>        <dbl> <fct> <fct>                   <dbl>
## 1        0   -0.616      -0.730  B     RENT                   -0.897
## 2        1    1.62        0.331  C     RENT                   -0.787
## 3        0   -0.616       0.489  C     RENT                   -1.11 
## 4        0   -0.616      -0.0778 C     RENT                   -0.438
## 5        0    1.62       -0.261  B     RENT                    0.122
## 6        0   -0.616      -1.36   A     RENT                   -0.678
## # ℹ 17 more variables: verification_status <fct>, purpose <fct>, dti <dbl[,1]>,
## #   inq_last_6mths <dbl[,1]>, revol_util <dbl[,1]>, initial_list_status <fct>,
## #   out_prncp <dbl[,1]>, total_pymnt <dbl[,1]>, total_rec_int <dbl[,1]>,
## #   total_rec_late_fee <dbl[,1]>, recoveries <dbl[,1]>,
## #   last_pymnt_amnt <dbl[,1]>, tot_cur_bal <dbl[,1]>,
## #   total_rev_hi_lim <dbl[,1]>, payment_time <dbl[,1]>, credit_time <dbl[,1]>,
## #   loan_time <dbl[,1]>

7.2 Label Encoding

kolom grade menunjukkan nilai pinjaman, berisi nilai A,B,C,D,E,F,G. Akan diberikan nilai numerik sesuai dengan urutan performa dari A (paling baik) hingga G paling buruk.

dffs_encoded <- dffs_encoded %>%
  mutate(grade_encoded = case_when(
    grade == "A" ~ 7,
    grade == "B" ~ 6,
    grade == "C" ~ 5,
    grade == "D" ~ 4,
    grade == "E" ~ 3,
    grade == "F" ~ 2,
    grade == "G" ~ 1,
    TRUE ~ NA_real_
  )) %>%
  select(-grade)
head(dffs_encoded)
## # A tibble: 6 × 23
##   bad_flag term[,1] int_rate[,1] home_ownership annual_inc[,1]
##      <dbl>    <dbl>        <dbl> <fct>                   <dbl>
## 1        0   -0.616      -0.730  RENT                   -0.897
## 2        1    1.62        0.331  RENT                   -0.787
## 3        0   -0.616       0.489  RENT                   -1.11 
## 4        0   -0.616      -0.0778 RENT                   -0.438
## 5        0    1.62       -0.261  RENT                    0.122
## 6        0   -0.616      -1.36   RENT                   -0.678
## # ℹ 18 more variables: verification_status <fct>, purpose <fct>, dti <dbl[,1]>,
## #   inq_last_6mths <dbl[,1]>, revol_util <dbl[,1]>, initial_list_status <fct>,
## #   out_prncp <dbl[,1]>, total_pymnt <dbl[,1]>, total_rec_int <dbl[,1]>,
## #   total_rec_late_fee <dbl[,1]>, recoveries <dbl[,1]>,
## #   last_pymnt_amnt <dbl[,1]>, tot_cur_bal <dbl[,1]>,
## #   total_rev_hi_lim <dbl[,1]>, payment_time <dbl[,1]>, credit_time <dbl[,1]>,
## #   loan_time <dbl[,1]>, grade_encoded <dbl>

7.3 One-Hot Encoding

Digunakan untuk variabel kategorik nominal (tidak memiliki tingkatan/order).

dffs_encoded <- dffs_encoded %>%
  select_if(is.factor) %>%
  model.matrix(~ . - 1, data = .) %>%
  bind_cols(dffs_encoded, .) %>%
  select(-one_of(colnames(dffs_encoded[,sapply(dffs_encoded, is.factor)]))) %>%
  rename(verification_status_Source_Verified = `verification_statusSource Verified`)

head(dffs_encoded)
## # A tibble: 6 × 39
##   bad_flag term[,1] int_rate[,1] annual_inc[,1] dti[,1] inq_last_6mths[,1]
##      <dbl>    <dbl>        <dbl>          <dbl>   <dbl>              <dbl>
## 1        0   -0.616      -0.730          -0.897  1.33                0.179
## 2        1    1.62        0.331          -0.787 -2.07                3.84 
## 3        0   -0.616       0.489          -1.11  -1.08                1.10 
## 4        0   -0.616      -0.0778         -0.438  0.354               0.179
## 5        0    1.62       -0.261           0.122  0.0919             -0.737
## 6        0   -0.616      -1.36           -0.678 -0.767               2.01 
## # ℹ 33 more variables: revol_util <dbl[,1]>, out_prncp <dbl[,1]>,
## #   total_pymnt <dbl[,1]>, total_rec_int <dbl[,1]>,
## #   total_rec_late_fee <dbl[,1]>, recoveries <dbl[,1]>,
## #   last_pymnt_amnt <dbl[,1]>, tot_cur_bal <dbl[,1]>,
## #   total_rev_hi_lim <dbl[,1]>, payment_time <dbl[,1]>, credit_time <dbl[,1]>,
## #   loan_time <dbl[,1]>, grade_encoded <dbl>, home_ownershipMORTGAGE <dbl>,
## #   home_ownershipOTHER <dbl>, home_ownershipOWN <dbl>, …

8 Modeling

Dataset masih memiliki masalah imbalanced data yang belum diatasi. Problem ini akan diatasi dengan SMOTE (synthetic minority oversampling technique)

library(rsample)
## Warning: package 'rsample' was built under R version 4.3.2
set.seed(100)
df_split <- dffs_encoded %>%
  initial_split(prop = 0.8, strata = bad_flag)
df_split
## <Training/Testing/Total>
## <373027/93258/466285>
df_train <- df_split %>% training()
head(df_train)
## # A tibble: 6 × 39
##   bad_flag term[,1] int_rate[,1] annual_inc[,1] dti[,1] inq_last_6mths[,1]
##      <dbl>    <dbl>        <dbl>          <dbl>   <dbl>              <dbl>
## 1        0   -0.616      -0.730          -0.897  1.33                0.179
## 2        0   -0.616       0.489          -1.11  -1.08                1.10 
## 3        0   -0.616      -0.0778         -0.438  0.354               0.179
## 4        0    1.62       -0.261           0.122  0.0919             -0.737
## 5        0   -0.616      -1.36           -0.678 -0.767               2.01 
## 6        0    1.62        0.489          -0.478  0.801               0.179
## # ℹ 33 more variables: revol_util <dbl[,1]>, out_prncp <dbl[,1]>,
## #   total_pymnt <dbl[,1]>, total_rec_int <dbl[,1]>,
## #   total_rec_late_fee <dbl[,1]>, recoveries <dbl[,1]>,
## #   last_pymnt_amnt <dbl[,1]>, tot_cur_bal <dbl[,1]>,
## #   total_rev_hi_lim <dbl[,1]>, payment_time <dbl[,1]>, credit_time <dbl[,1]>,
## #   loan_time <dbl[,1]>, grade_encoded <dbl>, home_ownershipMORTGAGE <dbl>,
## #   home_ownershipOTHER <dbl>, home_ownershipOWN <dbl>, …
df_test <- df_split %>% testing()
head(df_test)
## # A tibble: 6 × 39
##   bad_flag term[,1] int_rate[,1] annual_inc[,1] dti[,1] inq_last_6mths[,1]
##      <dbl>    <dbl>        <dbl>          <dbl>   <dbl>              <dbl>
## 1        1    1.62        1.71          -0.605   -1.49               1.10 
## 2        0    1.62        0.188         -0.0232  -0.140              1.10 
## 3        1   -0.616      -0.0778        -0.787   -0.909              0.179
## 4        0   -0.616      -1.79           0.668   -0.853             -0.737
## 5        1   -0.616      -0.486          0.0495  -1.89               0.179
## 6        1    1.62        0.675         -0.423   -0.414             -0.737
## # ℹ 33 more variables: revol_util <dbl[,1]>, out_prncp <dbl[,1]>,
## #   total_pymnt <dbl[,1]>, total_rec_int <dbl[,1]>,
## #   total_rec_late_fee <dbl[,1]>, recoveries <dbl[,1]>,
## #   last_pymnt_amnt <dbl[,1]>, tot_cur_bal <dbl[,1]>,
## #   total_rev_hi_lim <dbl[,1]>, payment_time <dbl[,1]>, credit_time <dbl[,1]>,
## #   loan_time <dbl[,1]>, grade_encoded <dbl>, home_ownershipMORTGAGE <dbl>,
## #   home_ownershipOTHER <dbl>, home_ownershipOWN <dbl>, …

Melakukan SMOTE

# # Mengonversi kolom menjadi numerik dengan penanganan nilai-nilai yang hilang
# df_train <- df_train %>%
#   mutate(across(everything(), ~as.numeric(coalesce(as.character(.), "NA"))))
# 
# library(DMwR)
# 
# df_train[] <- lapply(df_train, as.numeric)
# df_train <- as.data.frame(df_train) %>%
#   rename(verification_status_Source_Verified = `verification_statusSource Verified`)
# 
# 
# # # Apply ROSE to address imbalanced data
# # df_train_balanced <- SMOTE(bad_flag ~ ., data = df_train, perc.over = 100, k = 5)

library(ROSE)
## Warning: package 'ROSE' was built under R version 4.3.2
## Loaded ROSE 0.0-4
# Specify the target variable and predictors
# Specify the target variable and predictors
target_var <- "bad_flag"
predictors <- setdiff(colnames(df_train), target_var)

# Create the formula
formula_str <- paste(target_var, "~", paste(predictors, collapse = " + "))


# Use ovun.sample for oversampling
df_train_balanced <- ovun.sample(as.formula(formula_str), data = df_train, method = "over")$data
prop.table(table(df_train_balanced$bad_flag))
## 
##        0        1 
## 0.499401 0.500599

8.1 Binary Logistic Regression

X_train <- df_train_balanced[,-1]
y_train <- df_train_balanced$bad_flag
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
head(X_train)
##         term   int_rate annual_inc         dti inq_last_6mths  revol_util
## 1 -0.6162244 -0.7295864 -0.8965484  1.32863088      0.1789195  1.16009143
## 2 -0.6162244  0.4889780 -1.1102908 -1.08248971      1.0950206  1.78393214
## 3 -0.6162244 -0.0778495 -0.4380609  0.35424776      0.1789195 -1.48280129
## 4  1.6227821 -0.2614374  0.1223126  0.09186485     -0.7371815 -0.09602026
## 5 -0.6162244 -1.3606697 -0.6782210 -0.76661126      2.0111216 -1.17509607
## 6  1.6227821  0.4889780 -0.4780149  0.80131766      0.1789195  1.24017909
##    out_prncp total_pymnt total_rec_int total_rec_late_fee recoveries
## 1 -0.6939430 -0.68713659    -0.6955473         -0.1234642 -0.1545486
## 2 -0.6939430 -1.03283544    -0.7991865         -0.1234642 -0.1545486
## 3 -0.6939430  0.08294785    -0.1527280          3.0992610 -0.1545486
## 4 -0.5732679 -1.00397902    -0.6359615         -0.1234642 -0.1545486
## 5 -0.6939430 -0.71492561    -0.7880222         -0.1234642 -0.1545486
## 6 -0.3966768 -0.41180738     0.1760653         -0.1234642 -0.1545486
##   last_pymnt_amnt tot_cur_bal total_rev_hi_lim payment_time credit_time
## 1      -0.5314912  -0.7926475       -0.7165747    0.3336908   1.7490531
## 2      -0.4453863  -0.7926475       -0.7165747    0.9049419  -0.4488870
## 3      -0.4980314  -0.7926475       -0.7165747    0.3336908   0.1005980
## 4      -0.5501833  -0.7926475       -0.7165747   -0.6455968   0.2379693
## 5      -0.5333976  -0.7926475       -0.7165747    0.3336908  -0.9983720
## 6      -0.5317684  -0.7926475       -0.7165747   -0.6455968  -0.9983720
##   loan_time grade_encoded home_ownershipMORTGAGE home_ownershipOTHER
## 1  1.532957             6                      0                   0
## 2  1.532957             5                      0                   0
## 3  1.532957             5                      0                   0
## 4  1.532957             6                      0                   0
## 5  1.532957             7                      0                   0
## 6  1.532957             5                      0                   0
##   home_ownershipOWN home_ownershipRENT verification_status_Source_Verified
## 1                 0                  1                                   0
## 2                 0                  1                                   0
## 3                 0                  1                                   1
## 4                 0                  1                                   1
## 5                 0                  1                                   1
## 6                 0                  1                                   0
##   verification_statusVerified purposecredit_card purposedebt_consolidation
## 1                           1                  1                         0
## 2                           0                  0                         0
## 3                           0                  0                         0
## 4                           0                  0                         0
## 5                           0                  0                         0
## 6                           0                  0                         1
##   purposeeducational purposehome_improvement purposehouse purposemajor_purchase
## 1                  0                       0            0                     0
## 2                  0                       0            0                     0
## 3                  0                       0            0                     0
## 4                  0                       0            0                     0
## 5                  0                       0            0                     0
## 6                  0                       0            0                     0
##   purposemedical purposemoving purposeother purposerenewable_energy
## 1              0             0            0                       0
## 2              0             0            0                       0
## 3              0             0            1                       0
## 4              0             0            1                       0
## 5              0             0            0                       0
## 6              0             0            0                       0
##   purposesmall_business purposevacation purposewedding initial_list_statusw
## 1                     0               0              0                    0
## 2                     1               0              0                    0
## 3                     0               0              0                    0
## 4                     0               0              0                    0
## 5                     0               0              1                    0
## 6                     0               0              0                    0
# Create a logistic regression model
logreg <- glm(bad_flag ~ ., data = df_train_balanced, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logreg %>%
  summary
## 
## Call:
## glm(formula = bad_flag ~ ., family = "binomial", data = df_train_balanced)
## 
## Coefficients: (1 not defined because of singularities)
##                                       Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                          5.680e+02  4.750e+03    0.120 0.904816    
## term                                -7.307e-02  6.448e-03  -11.333  < 2e-16 ***
## int_rate                             8.197e-01  1.596e-02   51.371  < 2e-16 ***
## annual_inc                           3.768e-02  6.779e-03    5.559 2.72e-08 ***
## dti                                  9.148e-02  4.589e-03   19.935  < 2e-16 ***
## inq_last_6mths                       6.863e-02  4.348e-03   15.787  < 2e-16 ***
## revol_util                           6.289e-02  4.991e-03   12.600  < 2e-16 ***
## out_prncp                           -1.244e+00  7.251e-03 -171.565  < 2e-16 ***
## total_pymnt                         -1.225e+00  1.354e-02  -90.507  < 2e-16 ***
## total_rec_int                        1.856e+00  1.238e-02  150.004  < 2e-16 ***
## total_rec_late_fee                   4.625e-01  4.514e-03  102.471  < 2e-16 ***
## recoveries                           3.726e+03  3.074e+04    0.121 0.903506    
## last_pymnt_amnt                     -6.736e+00  4.837e-02 -139.274  < 2e-16 ***
## tot_cur_bal                          2.516e-03  6.801e-03    0.370 0.711401    
## total_rev_hi_lim                     1.547e-01  7.408e-03   20.883  < 2e-16 ***
## payment_time                         2.665e+00  1.180e-02  225.911  < 2e-16 ***
## credit_time                          1.515e-02  4.572e-03    3.313 0.000924 ***
## loan_time                           -2.215e+00  1.104e-02 -200.503  < 2e-16 ***
## grade_encoded                        5.845e-01  1.147e-02   50.947  < 2e-16 ***
## home_ownershipMORTGAGE              -6.477e-02  1.089e-02   -5.946 2.75e-09 ***
## home_ownershipOTHER                 -3.247e-01  2.299e-01   -1.413 0.157756    
## home_ownershipOWN                   -9.575e-02  1.568e-02   -6.107 1.02e-09 ***
## home_ownershipRENT                          NA         NA       NA       NA    
## verification_status_Source_Verified  2.386e-01  1.135e-02   21.029  < 2e-16 ***
## verification_statusVerified          3.917e-01  1.172e-02   33.424  < 2e-16 ***
## purposecredit_card                   7.412e-01  4.749e-02   15.608  < 2e-16 ***
## purposedebt_consolidation            7.686e-01  4.685e-02   16.406  < 2e-16 ***
## purposeeducational                  -5.147e-01  1.654e-01   -3.112 0.001857 ** 
## purposehome_improvement              5.099e-01  5.026e-02   10.146  < 2e-16 ***
## purposehouse                         6.527e-01  8.082e-02    8.075 6.73e-16 ***
## purposemajor_purchase                1.758e-01  5.629e-02    3.124 0.001787 ** 
## purposemedical                       1.420e-01  6.212e-02    2.285 0.022301 *  
## purposemoving                        2.239e-01  6.751e-02    3.317 0.000911 ***
## purposeother                         2.319e-01  4.987e-02    4.649 3.34e-06 ***
## purposerenewable_energy              2.543e-01  1.698e-01    1.497 0.134330    
## purposesmall_business                7.910e-01  5.871e-02   13.474  < 2e-16 ***
## purposevacation                     -1.485e-01  7.285e-02   -2.038 0.041536 *  
## purposewedding                       1.239e-01  9.351e-02    1.325 0.185071    
## initial_list_statusw                 1.469e-03  9.390e-03    0.156 0.875649    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 922305  on 665302  degrees of freedom
## Residual deviance: 335807  on 665265  degrees of freedom
## AIC: 335883
## 
## Number of Fisher Scoring iterations: 20
# Make predictions on the training set
y_pred_proba_train <- predict(logreg, newdata = df_train_balanced, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Make predictions on the test set
data_test <- model.matrix(as.formula(formula_str), data = df_test)
X_test <- data_test[, -1]
y_pred_proba_test <- predict(logreg, newdata = df_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
library(pROC)
# Calculate AUC-ROC for training set
auc_train <- roc(df_train_balanced$bad_flag, y_pred_proba_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste('AUC train probability:', auc_train$auc))
## [1] "AUC train probability: 0.957483917229818"
# Calculate AUC-ROC for test set
auc_test <- roc(df_test$bad_flag, y_pred_proba_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste('AUC test probability:', auc_test$auc))
## [1] "AUC test probability: 0.956787299471255"

Diperoleh Nilai AUC yang sangat baik untuk data training dan data testing. Hal ini menunjukkan bahwa model memiliki kemampuan yang baik dalam melakukan klasifikasi antara dua kelas.

library(caret)
## Loading required package: lattice
# Function to calculate precision, recall, and F1 score
calculate_metrics <- function(actual, predicted) {
  confusion_matrix <- confusionMatrix(factor(predicted, levels = c(0, 1)), factor(actual, levels = c(0, 1)))
  
  precision <- confusion_matrix$byClass["Pos Pred Value"]
  recall <- confusion_matrix$byClass["Sensitivity"]
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  metrics <- c(precision = precision, recall = recall, f1_score = f1_score)
  return(metrics)
}

# Apply the function to training set
metrics_train <- calculate_metrics(df_train_balanced$bad_flag, ifelse(y_pred_proba_train > 0.5, 1, 0))
print("Metrics on Training Set:")
## [1] "Metrics on Training Set:"
print(metrics_train)
## precision.Pos Pred Value       recall.Sensitivity  f1_score.Pos Pred Value 
##                0.8694850                0.9409065                0.9037869
# Apply the function to test set
metrics_test <- calculate_metrics(df_test$bad_flag, ifelse(y_pred_proba_test > 0.5, 1, 0))
print("Metrics on Test Set:")
## [1] "Metrics on Test Set:"
print(metrics_test)
## precision.Pos Pred Value       recall.Sensitivity  f1_score.Pos Pred Value 
##                0.9815343                0.9406843                0.9606752
# Convert predicted probabilities to class labels using a threshold of 0.5
predicted_labels_train <- ifelse(y_pred_proba_train > 0.5, 1, 0)
predicted_labels_test <- ifelse(y_pred_proba_test > 0.5, 1, 0)

# Confusion matrices
conf_matrix_train <- confusionMatrix(factor(predicted_labels_train, levels = c(0, 1)),
                                     factor(df_train_balanced$bad_flag, levels = c(0, 1)))
conf_matrix_test <- confusionMatrix(factor(predicted_labels_test, levels = c(0, 1)),
                                    factor(df_test$bad_flag, levels = c(0, 1)))

# Function to create a classification report
classification_report <- function(conf_matrix) {
  precision <- conf_matrix$byClass["Pos Pred Value"]
  recall <- conf_matrix$byClass["Sensitivity"]
  f1_score <- conf_matrix$byClass["F1"]
  support <- table(factor(df_test$bad_flag, levels = c(0, 1)))
  
  report <- data.frame(Precision = precision,
                       Recall = recall,
                       F1_Score = f1_score,
                       Support = support)
  
  return(report)
}

# Create classification reports for training and test sets
report_train <- classification_report(conf_matrix_train)
## Warning in data.frame(Precision = precision, Recall = recall, F1_Score =
## f1_score, : row names were found from a short variable and have been discarded
report_test <- classification_report(conf_matrix_test)
## Warning in data.frame(Precision = precision, Recall = recall, F1_Score =
## f1_score, : row names were found from a short variable and have been discarded
# Print the reports
print("Classification Report for Training Set:")
## [1] "Classification Report for Training Set:"
print(report_train)
##   Precision    Recall  F1_Score Support.Var1 Support.Freq
## 1  0.869485 0.9409065 0.9037869            0        83064
## 2  0.869485 0.9409065 0.9037869            1        10194
print("Classification Report for Test Set:")
## [1] "Classification Report for Test Set:"
print(report_test)
##   Precision    Recall  F1_Score Support.Var1 Support.Freq
## 1 0.9815343 0.9406843 0.9606752            0        83064
## 2 0.9815343 0.9406843 0.9606752            1        10194
# Classification Table for training set
table_train <- table(Reference = df_train_balanced$bad_flag, Prediction = predicted_labels_train)
print("Classification Table - Training Set:")
## [1] "Classification Table - Training Set:"
print(table_train)
##          Prediction
## Reference      0      1
##         0 312619  19634
##         1  46926 286124
# Calculate accuracy for training set using table_train
accuracy_train <- sum(diag(table_train)) / sum(table_train)
print(paste("Accuracy - Training Set:", accuracy_train))
## [1] "Accuracy - Training Set: 0.899955358686193"
# Classification Table for test set
table_test <- table(Reference = df_test$bad_flag, Prediction = predicted_labels_test)
print("Classification Table - Test Set:")
## [1] "Classification Table - Test Set:"
print(table_test)
##          Prediction
## Reference     0     1
##         0 78137  4927
##         1  1470  8724
# Calculate accuracy for training set using table_train
accuracy_test <- sum(diag(table_test)) / sum(table_test)
print(paste("Accuracy - Testing Set:", accuracy_test))
## [1] "Accuracy - Testing Set: 0.931405348602801"

Diperoleh Model Binary ogistic Regression dengan tingkat akurasi 93% pada data testing.