Downloaded the historical loan files from Lending Club as zipped csv files.
files <- dir(pattern='*.zip');
for (f in files) {
unzip(zipfile=f, exdir ='.');
}
Also add a column to indicate which file it came from.
library(data.table);
data.table 1.10.0
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
library(tibble);
files <- dir(pattern='*.csv$');
n <- 1
d <- tibble()
for (f in files) {
a <- as_tibble(fread(file = f,showProgress = TRUE, skip=1));
a[, "file"] <- f
d <- rbind(d, a);
}
Read 52.3% of 133891 rows
Read 89.6% of 133891 rows
Read 133887 rows and 111 (of 111) columns from 0.098 GB file in 00:00:04
Read 80.7% of 99124 rows
Read 99120 rows and 111 (of 111) columns from 0.072 GB file in 00:00:03
Read 69.1% of 188185 rows
Read 188181 rows and 111 (of 111) columns from 0.146 GB file in 00:00:04
Read 42.4% of 235633 rows
Read 76.4% of 235633 rows
Read 235629 rows and 111 (of 111) columns from 0.170 GB file in 00:00:05
Read 0.0% of 421099 rows
Read 19.0% of 421099 rows
Read 38.0% of 421099 rows
Read 54.6% of 421099 rows
Read 71.2% of 421099 rows
Read 87.9% of 421099 rows
Read 421095 rows and 111 (of 111) columns from 0.300 GB file in 00:00:10
Let’s double check that the data frame has all the data from each file:
library(ggplot2)
table(d[,"file"])
LoanStats_2016Q1.csv LoanStats_2016Q2.csv LoanStats_2016Q3.csv LoanStats3a.csv LoanStats3b.csv
133887 97854 99120 39786 188181
LoanStats3c.csv LoanStats3d.csv
235629 421095
glimpse(d)
Observations: 1,215,552
Variables: 112
$ id <chr> "75983127", "75933549", "75800404", "76022756", "76041549", "75993583"...
$ member_id <chr> "81463882", "81424322", "81268205", "81513504", "81532277", "81484367"...
$ loan_amnt <chr> "30000", "24000", "12500", "25000", "16800", "22000", "12000", "35000"...
$ funded_amnt <chr> "30000", "24000", "12500", "25000", "16800", "22000", "12000", "35000"...
$ funded_amnt_inv <chr> "30000", "24000", "12500", "25000", "16800", "22000", "12000", "35000"...
$ term <chr> " 60 months", " 60 months", " 36 months", " 60 months", " 60 months", ...
$ int_rate <chr> " 9.75%", " 19.53%", " 12.99%", " 12.99%", " 17.27%", " 6.49%", " 11...
$ installment <chr> "633.73", "629.6", "421.12", "568.7", "419.97", "674.18", "263.74", "8...
$ grade <chr> "B", "D", "C", "C", "D", "A", "B", "D", "B", "B", "C", "C", "D", "B", ...
$ sub_grade <chr> "B3", "D5", "C2", "C2", "D2", "A2", "B5", "D3", "B4", "B3", "C4", "C5"...
$ emp_title <chr> "Senior Systems Administrator", "Consultant - Gemba Coach", "Modular t...
$ emp_length <chr> "2 years", "1 year", "5 years", "10+ years", "10+ years", "10+ years",...
$ home_ownership <chr> "MORTGAGE", "OWN", "RENT", "RENT", "RENT", "MORTGAGE", "RENT", "MORTGA...
$ annual_inc <chr> "88000", "135000", "55000", "138000", "40000", "134000", "30000", "134...
$ verification_status <chr> "Source Verified", "Verified", "Verified", "Not Verified", "Source Ver...
$ issue_d <chr> "Mar-2016", "Mar-2016", "Mar-2016", "Mar-2016", "Mar-2016", "Mar-2016"...
$ loan_status <chr> "Current", "Current", "Current", "Current", "Current", "Current", "Cur...
$ pymnt_plan <chr> "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", ...
$ url <chr> "https://lendingclub.com/browse/loanDetail.action?loan_id=75983127", "...
$ desc <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""...
$ purpose <chr> "debt_consolidation", "credit_card", "debt_consolidation", "debt_conso...
$ title <chr> "Debt consolidation", "Credit card refinancing", "Debt consolidation",...
$ zip_code <chr> "230xx", "270xx", "980xx", "078xx", "460xx", "338xx", "802xx", "605xx"...
$ addr_state <chr> "VA", "NC", "WA", "NJ", "IN", "FL", "CO", "IL", "TX", "CT", "OH", "NC"...
$ dti <chr> "22.98", "20.2", "33.71", "12.44", "36.18", "26.33", "40.76", "25.69",...
$ delinq_2yrs <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", ...
$ earliest_cr_line <chr> "Jul-2003", "Sep-2002", "Oct-2007", "Aug-1995", "Jan-1998", "May-1972"...
$ inq_last_6mths <chr> "0", "0", "0", "1", "0", "1", "1", "0", "0", "0", "0", "1", "2", "0", ...
$ mths_since_last_delinq <chr> "", "49", "", "", "", "", "", "", "", "20", "", "44", "", "", "67", "1...
$ mths_since_last_record <chr> "", "", "", "64", "74", "", "", "", "", "", "", "70", "", "", "", "", ...
$ open_acc <chr> "15", "14", "11", "9", "14", "20", "29", "23", "20", "25", "9", "10", ...
$ pub_rec <chr> "0", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "1", "0", "0", ...
$ revol_bal <chr> "18628", "25934", "18170", "8577", "9906", "60963", "23705", "41602", ...
$ revol_util <chr> "43.5%", "67.9%", "73.3%", "37.1%", "41.6%", "67%", "16.7%", "63%", "5...
$ total_acc <chr> "36", "19", "18", "24", "35", "34", "43", "26", "43", "38", "11", "14"...
$ initial_list_status <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w", "w", "f", "w", "w", "w", ...
$ out_prncp <chr> "27202.69", "22243.07", "10433.19", "22844.48", "15497.51", "18050.00"...
$ out_prncp_inv <chr> "27202.69", "22243.07", "10433.19", "22844.48", "15497.51", "18050.00"...
$ total_pymnt <chr> "4403.61", "4535.58", "2929.8", "3944.82", "2907.55", "4715.29", "1830...
$ total_pymnt_inv <chr> "4403.61", "4535.58", "2929.80", "3944.82", "2907.55", "4715.29", "183...
$ total_rec_prncp <chr> "2797.31", "1756.93", "2066.81", "2155.52", "1302.49", "3950.00", "107...
$ total_rec_int <chr> "1606.30", "2778.65", "862.99", "1789.30", "1605.06", "765.29", "757.2...
$ total_rec_late_fee <chr> "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", ...
$ recoveries <chr> "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", ...
$ collection_recovery_fee <chr> "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", ...
$ last_pymnt_d <chr> "Nov-2016", "Nov-2016", "Nov-2016", "Nov-2016", "Nov-2016", "Nov-2016"...
$ last_pymnt_amnt <chr> "633.73", "629.6", "421.12", "568.7", "419.97", "674.18", "263.74", "8...
$ next_pymnt_d <chr> "Dec-2016", "Dec-2016", "Dec-2016", "Dec-2016", "Dec-2016", "Dec-2016"...
$ last_credit_pull_d <chr> "Nov-2016", "Nov-2016", "Nov-2016", "Nov-2016", "Nov-2016", "Nov-2016"...
$ collections_12_mths_ex_med <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", ...
$ mths_since_last_major_derog <chr> "", "49", "", "", "", "", "", "108", "", "53", "", "63", "", "", "", "...
$ policy_code <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", ...
$ application_type <chr> "INDIVIDUAL", "INDIVIDUAL", "INDIVIDUAL", "INDIVIDUAL", "INDIVIDUAL", ...
$ annual_inc_joint <chr> "", "", "", "", "", "", "", "", "", "", "90000", "", "", "", "", "", "...
$ dti_joint <chr> "", "", "", "", "", "", "", "", "", "", "30.17", "", "", "", "", "", "...
$ verification_status_joint <chr> "", "", "", "", "", "", "", "", "", "", "Verified", "", "", "", "", ""...
$ acc_now_delinq <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ tot_coll_amt <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "350", "0", "95", "0", "0...
$ tot_cur_bal <chr> "230635", "118667", "46568", "11565", "51348", "405151", "36850", "310...
$ open_acc_6m <chr> "1", "2", "2", "2", "0", "2", "0", "1", "0", "2", "1", "1", "2", "1", ...
$ open_il_6m <chr> "5", "4", "3", "1", "4", "2", "2", "4", "13", "3", "1", "1", "1", "1",...
$ open_il_12m <chr> "0", "4", "2", "0", "1", "1", "0", "3", "0", "2", "1", "0", "0", "0", ...
$ open_il_24m <chr> "2", "5", "3", "0", "5", "1", "0", "4", "1", "2", "1", "0", "1", "1", ...
$ mths_since_rcnt_il <chr> "17", "1", "4", "33", "8", "1", "54", "7", "19", "2", "3", "31", "18",...
$ total_bal_il <chr> "65991", "92733", "28398", "2988", "41442", "39179", "13145", "44024",...
$ il_util <chr> "56", "94", "82", "", "35", "71", "33", "82", "81", "68", "92", "48", ...
$ open_rv_12m <chr> "1", "5", "0", "2", "2", "0", "0", "3", "0", "1", "0", "2", "3", "3", ...
$ open_rv_24m <chr> "2", "6", "0", "6", "4", "1", "4", "9", "3", "7", "0", "2", "3", "5", ...
$ max_bal_bc <chr> "7784", "10614", "9810", "4350", "1196", "11746", "3595", "6819", "132...
$ all_util <chr> "50", "83", "78", "37", "39", "67", "20", "63", "66", "46", "68", "70"...
$ total_rev_hi_lim <chr> "42800", "38200", "24800", "23100", "23800", "90900", "141900", "66400...
$ inq_fi <chr> "0", "2", "1", "1", "0", "0", "1", "0", "0", "5", "0", "0", "0", "1", ...
$ total_cu_tl <chr> "0", "1", "4", "0", "10", "0", "3", "0", "3", "1", "0", "0", "0", "0",...
$ inq_last_12m <chr> "0", "3", "2", "2", "1", "2", "1", "1", "0", "11", "1", "2", "2", "0",...
$ acc_open_past_24mths <chr> "4", "11", "3", "6", "9", "3", "4", "13", "4", "11", "1", "2", "4", "6...
$ avg_cur_bal <chr> "15376", "8476", "4233", "1285", "3668", "21323", "1474", "16354", "12...
$ bc_open_to_buy <chr> "15887", "7068", "291", "9281", "1574", "7333", "109815", "6438", "130...
$ bc_util <chr> "52.7", "69.9", "97.6", "45.4", "55", "91", "15.7", "72", "95.3", "48....
$ chargeoff_within_12_mths <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ delinq_amnt <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ mo_sin_old_il_acct <chr> "152", "162", "55", "136", "112", "133", "147", "19", "213", "157", "9...
$ mo_sin_old_rev_tl_op <chr> "151", "121", "101", "247", "218", "526", "241", "232", "205", "256", ...
$ mo_sin_rcnt_rev_tl_op <chr> "5", "4", "41", "2", "8", "20", "15", "5", "19", "12", "31", "6", "1",...
$ mo_sin_rcnt_tl <chr> "5", "1", "4", "2", "8", "1", "15", "5", "19", "2", "3", "6", "1", "3"...
$ mort_acc <chr> "1", "0", "0", "0", "3", "4", "0", "2", "2", "1", "2", "2", "0", "0", ...
$ mths_since_recent_bc <chr> "5", "4", "41", "2", "8", "20", "15", "5", "19", "14", "56", "6", "1",...
$ mths_since_recent_bc_dlq <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "81", "", "", ...
$ mths_since_recent_inq <chr> "20", "1", "4", "0", "10", "2", "0", "7", "19", "2", "3", "6", "1", "1...
$ mths_since_recent_revol_delinq <chr> "", "", "", "", "", "", "", "", "", "20", "", "46", "", "", "81", "", ...
$ num_accts_ever_120_pd <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0", ...
$ num_actv_bc_tl <chr> "6", "5", "3", "5", "2", "6", "16", "8", "3", "5", "3", "4", "2", "3",...
$ num_actv_rev_tl <chr> "7", "10", "5", "8", "8", "14", "18", "13", "3", "13", "5", "8", "4", ...
$ num_bc_sats <chr> "7", "5", "3", "5", "2", "11", "20", "9", "3", "6", "4", "4", "2", "3"...
$ num_bc_tl <chr> "12", "5", "4", "11", "5", "11", "25", "9", "7", "10", "4", "4", "2", ...
$ num_il_tl <chr> "17", "8", "7", "3", "16", "5", "3", "4", "26", "8", "2", "2", "1", "1...
$ num_op_rev_tl <chr> "9", "10", "8", "8", "10", "14", "27", "13", "6", "19", "7", "8", "5",...
$ num_rev_accts <chr> "18", "11", "11", "20", "16", "24", "40", "20", "15", "28", "7", "10",...
$ num_rev_tl_bal_gt_0 <chr> "7", "10", "5", "8", "8", "13", "18", "10", "3", "13", "5", "8", "4", ...
$ num_sats <chr> "15", "14", "11", "9", "14", "19", "29", "19", "20", "25", "9", "10", ...
$ num_tl_120dpd_2m <chr> "0", "0", "0", "0", "0", "", "0", "", "0", "0", "0", "0", "0", "0", "0...
$ num_tl_30dpd <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ num_tl_90g_dpd_24m <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ num_tl_op_past_12m <chr> "1", "9", "2", "2", "3", "2", "0", "6", "0", "5", "1", "2", "3", "3", ...
$ pct_tl_nvr_dlq <chr> "100", "94.7", "100", "100", "100", "100", "100", "100", "100", "94.6"...
$ percent_bc_gt_75 <chr> "0", "20", "100", "20", "0", "100", "0", "50", "100", "16.7", "75", "7...
$ pub_rec_bankruptcies <chr> "0", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ tax_liens <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
$ tot_hi_cred_lim <chr> "290300", "134314", "59392", "58967", "79107", "456923", "182147", "35...
$ total_bal_ex_mort <chr> "84619", "118667", "46568", "11565", "51348", "101287", "36850", "8562...
$ total_bc_limit <chr> "33600", "23500", "11900", "17000", "3500", "44000", "130300", "51500"...
$ total_il_high_credit_limit <chr> "89607", "96114", "34592", "35867", "55307", "55537", "40247", "53811"...
$ file <chr> "LoanStats_2016Q1.csv", "LoanStats_2016Q1.csv", "LoanStats_2016Q1.csv"...
library(knitr)
kable(table(d[,c("purpose","loan_status")]))
Charged Off | Current | Default | Fully Paid | In Grace Period | Late (16-30 days) | Late (31-120 days) | |
---|---|---|---|---|---|---|---|
car | 717 | 6739 | 0 | 4760 | 59 | 36 | 140 |
credit_card | 15526 | 177293 | 39 | 77950 | 1683 | 820 | 3622 |
debt_consolidation | 54490 | 425137 | 114 | 211844 | 5794 | 2544 | 12382 |
educational | 56 | 0 | 0 | 270 | 0 | 0 | 0 |
home_improvement | 4515 | 46251 | 12 | 22634 | 519 | 274 | 1198 |
house | 494 | 2465 | 0 | 2072 | 38 | 17 | 105 |
major_purchase | 1613 | 14331 | 5 | 8478 | 166 | 100 | 398 |
medical | 1038 | 7360 | 5 | 3719 | 80 | 42 | 210 |
moving | 751 | 4309 | 3 | 2517 | 63 | 28 | 156 |
other | 5107 | 37718 | 11 | 18965 | 471 | 248 | 1039 |
renewable_energy | 87 | 365 | 0 | 303 | 7 | 1 | 16 |
small_business | 1985 | 6598 | 3 | 4724 | 109 | 78 | 294 |
vacation | 535 | 4227 | 1 | 2229 | 41 | 19 | 98 |
wedding | 276 | 59 | 0 | 1955 | 2 | 0 | 0 |
library(dplyr)
kable(table(d[,c("grade", "loan_status")]))
Charged Off | Current | Default | Fully Paid | In Grace Period | Late (16-30 days) | Late (31-120 days) | |
---|---|---|---|---|---|---|---|
AK | 212 | 1741 | 1 | 984 | 26 | 12 | 41 |
AL | 1306 | 9035 | 3 | 4339 | 122 | 64 | 332 |
AR | 697 | 5635 | 0 | 2509 | 70 | 34 | 199 |
AZ | 2068 | 16620 | 8 | 8859 | 183 | 84 | 462 |
CA | 13322 | 96486 | 27 | 59334 | 1117 | 582 | 2617 |
CO | 1538 | 14437 | 0 | 8833 | 166 | 63 | 325 |
CT | 1151 | 11766 | 1 | 5228 | 149 | 59 | 268 |
DC | 160 | 1704 | 0 | 1208 | 21 | 8 | 53 |
DE | 231 | 2150 | 0 | 964 | 19 | 12 | 55 |
FL | 6649 | 50949 | 13 | 24668 | 661 | 283 | 1432 |
GA | 2590 | 24667 | 7 | 11720 | 347 | 158 | 543 |
HI | 493 | 3378 | 1 | 2026 | 54 | 30 | 96 |
IA | 1 | 0 | 0 | 6 | 0 | 0 | 0 |
ID | 5 | 868 | 1 | 67 | 9 | 6 | 10 |
IL | 3020 | 31029 | 6 | 13585 | 302 | 147 | 672 |
IN | 1342 | 13046 | 4 | 4636 | 145 | 80 | 324 |
KS | 622 | 6647 | 1 | 3130 | 64 | 35 | 118 |
KY | 826 | 7356 | 0 | 3203 | 92 | 44 | 159 |
LA | 1182 | 8616 | 0 | 4159 | 102 | 54 | 258 |
MA | 1888 | 16763 | 2 | 8663 | 200 | 90 | 426 |
MD | 2091 | 17214 | 4 | 8408 | 238 | 102 | 489 |
ME | 11 | 1447 | 0 | 136 | 15 | 7 | 17 |
MI | 2250 | 19648 | 4 | 8923 | 243 | 106 | 490 |
MN | 1615 | 13099 | 1 | 6440 | 134 | 73 | 328 |
MO | 1440 | 11891 | 6 | 5555 | 124 | 61 | 321 |
MS | 336 | 4529 | 2 | 912 | 56 | 31 | 147 |
MT | 207 | 2050 | 0 | 1120 | 19 | 10 | 56 |
NC | 2567 | 20512 | 8 | 9944 | 284 | 165 | 611 |
ND | 22 | 1130 | 0 | 103 | 11 | 7 | 19 |
NE | 75 | 2319 | 2 | 267 | 16 | 14 | 60 |
NH | 276 | 3758 | 2 | 1727 | 50 | 12 | 66 |
NJ | 3361 | 27394 | 5 | 13216 | 364 | 171 | 784 |
NM | 523 | 3955 | 1 | 1990 | 49 | 21 | 137 |
NV | 1573 | 9785 | 4 | 5230 | 121 | 60 | 273 |
NY | 7872 | 60776 | 19 | 29156 | 955 | 418 | 1883 |
OH | 2914 | 25838 | 6 | 11073 | 297 | 159 | 703 |
OK | 918 | 6853 | 2 | 3058 | 80 | 39 | 240 |
OR | 945 | 8275 | 0 | 4989 | 87 | 36 | 153 |
PA | 3064 | 25686 | 13 | 12126 | 299 | 142 | 753 |
RI | 355 | 3268 | 0 | 1529 | 25 | 18 | 93 |
SC | 868 | 9389 | 2 | 4217 | 110 | 37 | 195 |
SD | 214 | 1445 | 0 | 768 | 15 | 4 | 39 |
TN | 1291 | 12255 | 4 | 4301 | 172 | 56 | 322 |
TX | 6330 | 61186 | 18 | 28752 | 774 | 331 | 1726 |
UT | 663 | 4398 | 2 | 3104 | 47 | 19 | 114 |
VA | 2742 | 20615 | 5 | 10858 | 266 | 124 | 576 |
VT | 116 | 1674 | 0 | 668 | 20 | 4 | 24 |
WA | 1753 | 14948 | 5 | 8560 | 154 | 73 | 324 |
WI | 995 | 9998 | 1 | 4599 | 101 | 34 | 221 |
WV | 329 | 3028 | 1 | 1679 | 35 | 11 | 63 |
WY | 171 | 1596 | 1 | 891 | 22 | 17 | 41 |