library(tidyverse)
## -- Attaching packages ------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  2.0.1     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Downloading Data

There are two files to download, the 2012-2013 data (LoanStats3b) and the 2014 data (LoanStats3c).

# Fetch both LendingClub archives over HTTPS and extract them to ./Files/data.
# mode = "wb" requests a binary transfer explicitly, so the zip archives cannot
# be damaged by text-mode newline translation (relevant on Windows).
URL <- "https://resources.lendingclub.com/LoanStats3b.csv.zip"
download.file(URL, destfile = "./LoanStats3b.zip", mode = "wb")

URL <- "https://resources.lendingclub.com/LoanStats3c.csv.zip"
download.file(URL, destfile = "./LoanStats3c.zip", mode = "wb")

# unzip() creates the target directory when it does not exist yet.
unzip("./LoanStats3b.zip", exdir = "./Files/data")
unzip("./LoanStats3c.zip", exdir = "./Files/data")
# Delete each downloaded archive; the existence check avoids a warning from
# file.remove() when the archive is not there.
fn <- "./LoanStats3b.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
## [1] TRUE
fn <- "./LoanStats3c.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
## [1] TRUE
# Read both extracted CSV files into d3b (LoanStats3b) and d3c (LoanStats3c).
#
# The column names are read from line 2 of each file (skip = 1, nrows = 1), so
# the records start on line 3. The data read therefore skips exactly two lines;
# skipping three would silently discard the first record of every file.
#
# stringsAsFactors = TRUE is spelled out because the cleaning code further down
# reads levels() of several columns, and the read.csv() default became FALSE in
# R 4.0.
file <- "./Files/data/LoanStats3b.csv"
headers <- read.csv(file, skip = 1, header = FALSE, nrows = 1, as.is = TRUE)
d3b <- read.csv(file, skip = 2, header = FALSE, stringsAsFactors = TRUE)
# headers is a one-row data frame; flatten it to a plain character vector.
colnames(d3b) <- unlist(headers, use.names = FALSE)

file <- "./Files/data/LoanStats3c.csv"
headers <- read.csv(file, skip = 1, header = FALSE, nrows = 1, as.is = TRUE)
d3c <- read.csv(file, skip = 2, header = FALSE, stringsAsFactors = TRUE)
colnames(d3c) <- unlist(headers, use.names = FALSE)

Merging Data

# Stack the two tables row-wise. Factor columns whose level sets differ between
# the inputs come back as character, which is what the warnings below report.
d <- bind_rows(list(d3b, d3c))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

Cleaning Data

### Empty strings become NA everywhere; zeros are NOT treated as blanks
is.na(d) <- d == ""

### Columns 1-10 ###
str(d[1:10])
## 'data.frame':    423812 obs. of  10 variables:
##  $ id             : chr  NA NA NA NA ...
##  $ member_id      : logi  NA NA NA NA NA NA ...
##  $ loan_amnt      : int  3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
##  $ funded_amnt    : int  3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
##  $ funded_amnt_inv: num  3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
##  $ term           : Factor w/ 3 levels ""," 36 months",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ int_rate       : chr  " 12.85%" " 13.53%" " 10.99%" " 12.85%" ...
##  $ installment    : num  101 706 157 471 516 ...
##  $ grade          : Factor w/ 8 levels "","A","B","C",..: 3 3 3 3 4 4 3 4 2 3 ...
##  $ sub_grade      : Factor w/ 36 levels "","A1","A2","A3",..: 10 11 8 10 13 14 11 12 6 7 ...
### term - drop the blank level and strip the leading space from the labels
# Values are matched to the two target labels by their text rather than by
# integer code, so the result does not depend on the order or number of levels,
# works for a character column as well as a factor, and is safe to re-run.
# Anything that is not one of the two terms (including blanks) ends up NA.
d$term <- factor(trimws(as.character(d$term)), levels = c("36 months", "60 months"))
table(d$term); sum(is.na(d$term))
## 
## 36 months 60 months 
##    306461    117347
## [1] 4
str(d$term)
##  Factor w/ 2 levels "36 months","60 months": 1 1 1 1 1 1 1 1 1 1 ...
### int_rate - remove the percent sign so the text converts to a number
d[["int_rate"]] <- as.numeric(sub("%", "", d[["int_rate"]], fixed = TRUE))
### grade - keep only the seven grade labels; blanks stay NA
# Rebuilding the factor against explicit levels matches values by label, so the
# mapping cannot shift if a grade is absent or the level order differs, and
# running the line a second time leaves the column unchanged.
d$grade <- factor(d$grade, levels = c("A", "B", "C", "D", "E", "F", "G"))
table(d$grade); sum(is.na(d$grade))
## 
##      A      B      C      D      E      F      G 
##  64688 124558 116572  70884  32365  11931   2810
## [1] 4
sum(is.na(d$grade))
## [1] 4
### sub_grade - drop the blank level, keep every real sub-grade
# TEMP holds the non-blank levels. Rebuilding the factor against them matches
# values by label instead of by integer code, so nothing is mislabelled when a
# level is unused or "" is not the first level; as.factor() also covers the
# case where the column arrives as character. Re-running is harmless.
TEMP <- setdiff(levels(as.factor(d$sub_grade)), "")
d$sub_grade <- factor(d$sub_grade, levels = TEMP)
table(d$sub_grade); sum(is.na(d$sub_grade))
## 
##    A1    A2    A3    A4    A5    B1    B2    B3    B4    B5    C1    C2 
##  9400  9436 10746 16140 18966 20997 24499 28693 27918 22451 24694 24588 
##    C3    C4    C5    D1    D2    D3    D4    D5    E1    E2    E3    E4 
## 23665 22738 20887 18209 15562 13600 12961 10552  8150  7878  6309  5440 
##    E5    F1    F2    F3    F4    F5    G1    G2    G3    G4    G5 
##  4588  3549  2694  2472  1857  1359   968   716   504   323   299
## [1] 4
sum(is.na(d$sub_grade))
## [1] 4
### Columns 11-20 ###
str(d[11:20])
## 'data.frame':    423812 obs. of  10 variables:
##  $ emp_title          : chr  "Auditor" "Operations Manager" "Surgical Technician" "Assistant Director - Human Resources" ...
##  $ emp_length         : Factor w/ 13 levels "","< 1 year",..: 4 4 5 7 4 4 4 3 5 10 ...
##  $ home_ownership     : chr  "RENT" "RENT" "MORTGAGE" "RENT" ...
##  $ annual_inc         : num  25000 81500 39600 88000 98000 90000 40000 26000 63000 102000 ...
##  $ verification_status: Factor w/ 4 levels "","Not Verified",..: 4 4 3 2 2 2 3 2 2 2 ...
##  $ issue_d            : chr  "Dec-2013" "Dec-2013" "Dec-2013" "Dec-2013" ...
##  $ loan_status        : chr  "Fully Paid" "Fully Paid" "Fully Paid" "Fully Paid" ...
##  $ pymnt_plan         : chr  "n" "n" "n" "n" ...
##  $ url                : logi  NA NA NA NA NA NA ...
##  $ desc               : chr  NA "  Borrower added on 12/31/13 > My goal is to purchase a home. I am consolidating my debt to lower interest rate"| __truncated__ "  Borrower added on 12/31/13 > Just bought a house, and would like a little extra funds to improve aspects of t"| __truncated__ NA ...
### emp_title - nothing to do here: blanks were already set to NA table-wide
### emp_length - drop the blank level
# TEMP holds the non-blank levels; the factor is rebuilt against them by label,
# which stays correct when a level is unused, when "" is not the first level,
# or when the column arrives as character, and is safe to re-run.
# NOTE(review): "n/a" is kept as an ordinary level - confirm that is intended.
TEMP <- setdiff(levels(as.factor(d$emp_length)), "")
d$emp_length <- factor(d$emp_length, levels = TEMP)
table(d$emp_length); sum(is.na(d$emp_length))
## 
##  < 1 year    1 year 10+ years   2 years   3 years   4 years   5 years 
##     31206     26027    140703     36633     32233     24375     27321 
##   6 years   7 years   8 years   9 years       n/a 
##     23739     24310     20805     16550     19906
## [1] 4
sum(is.na(d$emp_length)); str(d$emp_length)
## [1] 4
##  Factor w/ 12 levels "< 1 year","1 year",..: 3 3 4 6 3 3 3 2 4 9 ...
### verification_status - drop the blank level
# TEMP holds the non-blank levels; matching by label (not by integer code)
# keeps every value attached to its own label regardless of which levels are
# present, also works on a character column, and can be run repeatedly.
TEMP <- setdiff(levels(as.factor(d$verification_status)), "")
d$verification_status <- factor(d$verification_status, levels = TEMP)
table(d$verification_status); sum(is.na(d$verification_status))
## 
##    Not Verified Source Verified        Verified 
##          127340          141850          154618
## [1] 4
sum(is.na(d$verification_status)); str(d$verification_status)
## [1] 4
##  Factor w/ 3 levels "Not Verified",..: 3 3 2 1 1 1 2 1 1 1 ...
### issue_d - month-year text to Date
d$issue_d <- as_date(parse_date_time(d$issue_d, orders = "my"))
### loan_status and pymnt_plan - store both as factors
d[c("loan_status", "pymnt_plan")] <-
  lapply(d[c("loan_status", "pymnt_plan")], as.factor)
###URL is weird... exclude from analysis
###desc is definitely character.

### Columns 21-30 ###
str(d[21:30])
## 'data.frame':    423812 obs. of  10 variables:
##  $ purpose               : Factor w/ 14 levels "","car","credit_card",..: 4 4 5 4 4 10 4 4 4 4 ...
##  $ title                 : chr  "debt" "Reducing Debt to Purchase Home" "For The House" "Debt consolidation" ...
##  $ zip_code              : chr  "322xx" "100xx" "782xx" "282xx" ...
##  $ addr_state            : chr  "FL" "NY" "TX" "NC" ...
##  $ dti                   : num  24.68 16.73 2.49 10.02 6.15 ...
##  $ delinq_2yrs           : int  0 0 0 1 0 1 0 0 0 2 ...
##  $ earliest_cr_line      : chr  "May-1991" "Jun-1998" "Aug-1995" "Jun-1988" ...
##  $ inq_last_6mths        : int  0 2 2 0 2 0 0 0 0 0 ...
##  $ mths_since_last_delinq: int  58 64 NA 16 NA 16 53 NA 34 11 ...
##  $ mths_since_last_record: int  53 NA NA 115 NA NA 33 NA NA NA ...
### purpose - drop the blank level
# TEMP holds the non-blank levels. The factor is rebuilt against them by label
# rather than via its integer codes, so labels cannot shift when a level is
# unused or "" is not first; as.factor() covers a character column too, and
# running this twice gives the same result.
TEMP <- setdiff(levels(as.factor(d$purpose)), "")
d$purpose <- factor(d$purpose, levels = TEMP)
table(d$purpose); sum(is.na(d$purpose))
## 
##                car        credit_card debt_consolidation 
##               3783              98692             254455 
##   home_improvement              house     major_purchase 
##              23342               1843               7517 
##            medical             moving              other 
##               3850               2366              19267 
##   renewable_energy     small_business           vacation 
##                245               5022               2087 
##            wedding 
##               1339
## [1] 4
sum(is.na(d$purpose)); str(d$purpose)
## [1] 4
##  Factor w/ 13 levels "car","credit_card",..: 3 3 4 3 3 9 3 3 3 3 ...
### earliest_cr_line - month-year text to Date
d$earliest_cr_line <- as_date(parse_date_time(d$earliest_cr_line, orders = "my"))

### Columns 31-40 ###
str(d[31:40])
## 'data.frame':    423812 obs. of  10 variables:
##  $ open_acc           : int  5 29 3 6 16 9 7 12 8 9 ...
##  $ pub_rec            : int  2 0 0 1 0 0 2 0 0 0 ...
##  $ revol_bal          : int  2875 23473 4136 3686 5749 6619 5572 7967 11431 9912 ...
##  $ revol_util         : chr  "54.2%" "54.5%" "16.1%" "81.9%" ...
##  $ total_acc          : int  26 41 8 14 16 12 32 28 29 22 ...
##  $ initial_list_status: Factor w/ 3 levels "","f","w": 2 2 3 2 2 2 3 2 3 2 ...
##  $ out_prncp          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ out_prncp_inv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_pymnt        : num  3182 23927 5158 16945 15699 ...
##  $ total_pymnt_inv    : num  3182 23927 5158 16945 15699 ...
### revol_util - remove the percent sign so the text converts to a number
d[["revol_util"]] <- as.numeric(sub("%", "", d[["revol_util"]], fixed = TRUE))
### initial_list_status - drop the blank level
# TEMP holds the non-blank levels; rebuilding the factor against them matches
# by label, so the outcome does not rely on "" being the first level or on
# every level being present, works for character input, and is re-runnable.
TEMP <- setdiff(levels(as.factor(d$initial_list_status)), "")
d$initial_list_status <- factor(d$initial_list_status, levels = TEMP)
table(d$initial_list_status); sum(is.na(d$initial_list_status))
## 
##      f      w 
## 260520 163288
## [1] 4
sum(is.na(d$initial_list_status)); str(d$initial_list_status)
## [1] 4
##  Factor w/ 2 levels "f","w": 1 1 2 1 1 1 2 1 2 1 ...
### Columns 41-50 ###
str(d[41:50])
## 'data.frame':    423812 obs. of  10 variables:
##  $ total_rec_prncp           : num  3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
##  $ total_rec_int             : num  182 3127 358 2945 699 ...
##  $ total_rec_late_fee        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ recoveries                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ collection_recovery_fee   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ last_pymnt_d              : chr  "Jul-2014" "May-2015" "Sep-2014" "Jan-2017" ...
##  $ last_pymnt_amnt           : num  2677 13335 3900 470 14151 ...
##  $ next_pymnt_d              : Factor w/ 2 levels "","Feb-2019": NA NA NA NA NA NA NA NA NA NA ...
##  $ last_credit_pull_d        : chr  "Oct-2016" "Jan-2019" "Jan-2017" "Jan-2019" ...
##  $ collections_12_mths_ex_med: int  0 0 0 0 0 0 0 0 0 0 ...
### last_pymnt_d - month-year text to Date
d$last_pymnt_d <- as_date(parse_date_time(d$last_pymnt_d, orders = "my"))
### next_pymnt_d - drop the blank level
# TEMP holds the non-blank levels; values are matched to them by label instead
# of by integer code, which stays correct whatever levels are present, handles
# a character column as well, and gives the same result when re-run.
TEMP <- setdiff(levels(as.factor(d$next_pymnt_d)), "")
d$next_pymnt_d <- factor(d$next_pymnt_d, levels = TEMP)
table(d$next_pymnt_d); sum(is.na(d$next_pymnt_d))
## 
## Feb-2019 
##    15491
## [1] 408321
sum(is.na(d$next_pymnt_d)); str(d$next_pymnt_d)
## [1] 408321
##  Factor w/ 1 level "Feb-2019": NA NA NA NA NA NA NA NA NA NA ...
### hardship_end_date is deliberately NOT converted in this section
# The column is parsed once, next to the other hardship_* columns further down.
# Converting it here as well meant that the later parse_date_time(..., "my")
# call received a Date instead of month-year text, matched no format for any
# value and overwrote the whole column with NA.

### Columns 51-60 ###
str(d[51:60])
## 'data.frame':    423812 obs. of  10 variables:
##  $ mths_since_last_major_derog: int  69 71 NA NA NA 16 53 NA 34 54 ...
##  $ policy_code                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ application_type           : Factor w/ 2 levels "","Individual": 2 2 2 2 2 2 2 2 2 2 ...
##  $ annual_inc_joint           : logi  NA NA NA NA NA NA ...
##  $ dti_joint                  : logi  NA NA NA NA NA NA ...
##  $ verification_status_joint  : logi  NA NA NA NA NA NA ...
##  $ acc_now_delinq             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tot_coll_amt               : int  154 0 0 0 0 0 15386 0 1514 0 ...
##  $ tot_cur_bal                : int  19530 23473 4136 17672 13038 353402 13605 14123 272492 39143 ...
##  $ open_acc_6m                : logi  NA NA NA NA NA NA ...
### application_type - drop the blank level
# TEMP holds the non-blank levels; the factor is rebuilt against them by label,
# so the result does not depend on "" being the first level, also works when
# the column arrives as character, and is unchanged by a second run.
TEMP <- setdiff(levels(as.factor(d$application_type)), "")
d$application_type <- factor(d$application_type, levels = TEMP)
table(d$application_type); sum(is.na(d$application_type))
## 
## Individual 
##     423808
## [1] 4
sum(is.na(d$application_type)); str(d$application_type)
## [1] 4
##  Factor w/ 1 level "Individual": 1 1 1 1 1 1 1 1 1 1 ...
### Columns 61-70 ###
str(d[61:70])
## 'data.frame':    423812 obs. of  10 variables:
##  $ open_act_il       : logi  NA NA NA NA NA NA ...
##  $ open_il_12m       : logi  NA NA NA NA NA NA ...
##  $ open_il_24m       : logi  NA NA NA NA NA NA ...
##  $ mths_since_rcnt_il: logi  NA NA NA NA NA NA ...
##  $ total_bal_il      : logi  NA NA NA NA NA NA ...
##  $ il_util           : logi  NA NA NA NA NA NA ...
##  $ open_rv_12m       : logi  NA NA NA NA NA NA ...
##  $ open_rv_24m       : logi  NA NA NA NA NA NA ...
##  $ max_bal_bc        : logi  NA NA NA NA NA NA ...
##  $ all_util          : logi  NA NA NA NA NA NA ...
### Columns 71-80 ###
str(d[71:80])
## 'data.frame':    423812 obs. of  10 variables:
##  $ total_rev_hi_lim        : int  5300 43100 25700 4500 25800 10000 8100 15100 15400 22300 ...
##  $ inq_fi                  : logi  NA NA NA NA NA NA ...
##  $ total_cu_tl             : logi  NA NA NA NA NA NA ...
##  $ inq_last_12m            : logi  NA NA NA NA NA NA ...
##  $ acc_open_past_24mths    : int  3 9 0 3 6 2 4 2 3 3 ...
##  $ avg_cur_bal             : int  3906 869 1379 2945 815 39267 2268 1177 38927 4349 ...
##  $ bc_open_to_buy          : int  2050 6811 21564 480 15051 1016 1428 1752 2969 973 ...
##  $ bc_util                 : num  52.3 54.6 16.1 87.7 27.6 74.6 79.6 75.7 79.1 89.4 ...
##  $ chargeoff_within_12_mths: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ delinq_amnt             : int  0 0 0 0 0 0 0 0 0 0 ...
### Columns 81-90 ###
str(d[81:90])
## 'data.frame':    423812 obs. of  10 variables:
##  $ mo_sin_old_il_acct            : int  164 115 104 111 2 NA 124 67 147 243 ...
##  $ mo_sin_old_rev_tl_op          : int  271 186 220 103 257 150 182 83 189 290 ...
##  $ mo_sin_rcnt_rev_tl_op         : int  7 0 25 24 7 11 1 12 24 23 ...
##  $ mo_sin_rcnt_tl                : int  7 0 25 13 2 11 1 12 13 8 ...
##  $ mort_acc                      : int  6 0 0 0 0 1 0 0 4 0 ...
##  $ mths_since_recent_bc          : int  14 0 25 38 7 11 11 12 24 25 ...
##  $ mths_since_recent_bc_dlq      : int  69 70 NA 16 NA 35 53 NA 75 11 ...
##  $ mths_since_recent_inq         : int  8 0 3 NA 2 11 17 20 12 8 ...
##  $ mths_since_recent_revol_delinq: int  69 70 NA 16 NA 35 53 NA 75 11 ...
##  $ num_accts_ever_120_pd         : int  1 1 0 0 0 1 6 0 3 1 ...
### Columns 91-100 ###
str(d[91:100])
## 'data.frame':    423812 obs. of  10 variables:
##  $ num_actv_bc_tl     : int  2 8 2 3 8 4 2 6 3 3 ...
##  $ num_actv_rev_tl    : int  3 24 2 4 8 8 2 7 4 4 ...
##  $ num_bc_sats        : int  3 11 3 3 13 4 3 6 3 3 ...
##  $ num_bc_tl          : int  6 17 4 9 13 4 14 11 10 6 ...
##  $ num_il_tl          : int  11 1 1 3 1 0 8 8 8 9 ...
##  $ num_op_rev_tl      : int  4 29 3 4 15 8 6 9 6 6 ...
##  $ num_rev_accts      : int  9 40 7 10 15 11 24 20 17 13 ...
##  $ num_rev_tl_bal_gt_0: int  3 24 2 4 8 8 2 7 4 4 ...
##  $ num_sats           : int  5 29 3 6 16 9 7 12 8 9 ...
##  $ num_tl_120dpd_2m   : int  0 0 0 0 0 0 0 0 0 0 ...
### Columns 101-110 ###
str(d[101:110])
## 'data.frame':    423812 obs. of  10 variables:
##  $ num_tl_30dpd        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ num_tl_90g_dpd_24m  : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ num_tl_op_past_12m  : int  1 3 0 0 2 1 2 2 0 1 ...
##  $ pct_tl_nvr_dlq      : num  91.3 90.2 100 78.6 100 75 81.2 100 89.3 77.3 ...
##  $ percent_bc_gt_75    : num  66.7 50 0 100 7.7 50 33.3 66.7 66.7 66.7 ...
##  $ pub_rec_bankruptcies: int  2 0 0 1 0 0 0 0 0 0 ...
##  $ tax_liens           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tot_hi_cred_lim     : int  32082 43100 25700 31840 33300 385000 18130 21314 288195 58486 ...
##  $ total_bal_ex_mort   : int  19530 23473 4136 17672 13038 6619 13605 14123 39448 39143 ...
##  $ total_bc_limit      : int  4300 15000 25700 3900 20800 4000 7000 7200 14200 9200 ...
### Columns 111-120 ###
str(d[111:120])
## 'data.frame':    423812 obs. of  10 variables:
##  $ total_il_high_credit_limit      : int  26782 0 0 27340 7500 0 10030 6214 33895 36186 ...
##  $ revol_bal_joint                 : logi  NA NA NA NA NA NA ...
##  $ sec_app_earliest_cr_line        : logi  NA NA NA NA NA NA ...
##  $ sec_app_inq_last_6mths          : logi  NA NA NA NA NA NA ...
##  $ sec_app_mort_acc                : logi  NA NA NA NA NA NA ...
##  $ sec_app_open_acc                : logi  NA NA NA NA NA NA ...
##  $ sec_app_revol_util              : logi  NA NA NA NA NA NA ...
##  $ sec_app_open_act_il             : logi  NA NA NA NA NA NA ...
##  $ sec_app_num_rev_accts           : logi  NA NA NA NA NA NA ...
##  $ sec_app_chargeoff_within_12_mths: logi  NA NA NA NA NA NA ...
### Columns 121-130 ###
str(d[121:130])
## 'data.frame':    423812 obs. of  10 variables:
##  $ sec_app_collections_12_mths_ex_med : logi  NA NA NA NA NA NA ...
##  $ sec_app_mths_since_last_major_derog: logi  NA NA NA NA NA NA ...
##  $ hardship_flag                      : chr  "N" "N" "N" "N" ...
##  $ hardship_type                      : Factor w/ 2 levels "","INTEREST ONLY-3 MONTHS DEFERRAL": NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_reason                    : chr  NA NA NA NA ...
##  $ hardship_status                    : chr  NA NA NA NA ...
##  $ deferral_term                      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_amount                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_start_date                : chr  NA NA NA NA ...
##  $ hardship_end_date                  : Date, format: NA NA ...
### hardship_type - drop the blank level
# TEMP holds the non-blank levels; rebuilding the factor against them matches
# values by label rather than by integer code, so it stays correct whichever
# levels are present, handles a character column too, and can be re-run.
TEMP <- setdiff(levels(as.factor(d$hardship_type)), "")
d$hardship_type <- factor(d$hardship_type, levels = TEMP)
table(d$hardship_type); sum(is.na(d$hardship_type))
## 
## INTEREST ONLY-3 MONTHS DEFERRAL 
##                             576
## [1] 423236
sum(is.na(d$hardship_type)); str(d$hardship_type)
## [1] 423236
##  Factor w/ 1 level "INTEREST ONLY-3 MONTHS DEFERRAL": NA NA NA NA NA NA NA NA NA NA ...
### hardship_start_date - month-year text to Date
d$hardship_start_date <- as_date(parse_date_time(d$hardship_start_date, orders = "my"))
### hardship_end_date as date
# Convert only while the column still holds month-year text. Passing a column
# that is already a Date through parse_date_time(..., "my") matches no format
# ("All formats failed to parse") and would replace every value with NA.
if (!inherits(d$hardship_end_date, "Date")) {
  d$hardship_end_date <- as_date(parse_date_time(d$hardship_end_date, "my"))
}

### Columns 131-140 ###
str(d[131:140])
## 'data.frame':    423812 obs. of  10 variables:
##  $ payment_plan_start_date                   : chr  NA NA NA NA ...
##  $ hardship_length                           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_dpd                              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_loan_status                      : Factor w/ 5 levels "","Current","In Grace Period",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ orig_projected_additional_accrued_interest: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_payoff_balance_amount            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hardship_last_payment_amount              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ disbursement_method                       : Factor w/ 2 levels "","Cash": 2 2 2 2 2 2 2 2 2 2 ...
##  $ debt_settlement_flag                      : Factor w/ 3 levels "","N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ debt_settlement_flag_date                 : chr  NA NA NA NA ...
### payment_plan_start_date - month-year text to Date
d$payment_plan_start_date <- as_date(
  parse_date_time(d$payment_plan_start_date, orders = "my")
)
### hardship_loan_status - drop the blank level
# TEMP holds the non-blank levels; the factor is rebuilt against them by label,
# so values keep their own label even if a level is unused or "" is not first.
# as.factor() covers a character column, and a second run changes nothing.
TEMP <- setdiff(levels(as.factor(d$hardship_loan_status)), "")
d$hardship_loan_status <- factor(d$hardship_loan_status, levels = TEMP)
table(d$hardship_loan_status); sum(is.na(d$hardship_loan_status))
## 
##            Current    In Grace Period  Late (16-30 days) 
##                119                161                259 
## Late (31-120 days) 
##                 37
## [1] 423236
sum(is.na(d$hardship_loan_status)); str(d$hardship_loan_status)
## [1] 423236
##  Factor w/ 4 levels "Current","In Grace Period",..: NA NA NA NA NA NA NA NA NA NA ...
### disbursement_method - drop the blank level
# TEMP holds the non-blank levels; matching by label instead of by integer code
# keeps the mapping correct whichever levels are present, also works for a
# character column, and gives the same result when the chunk is re-run.
TEMP <- setdiff(levels(as.factor(d$disbursement_method)), "")
d$disbursement_method <- factor(d$disbursement_method, levels = TEMP)
table(d$disbursement_method); sum(is.na(d$disbursement_method))
## 
##   Cash 
## 423808
## [1] 4
sum(is.na(d$disbursement_method)); str(d$disbursement_method)
## [1] 4
##  Factor w/ 1 level "Cash": 1 1 1 1 1 1 1 1 1 1 ...
### debt_settlement_flag - drop the blank level
# TEMP holds the non-blank levels; the factor is rebuilt against them by label
# rather than through its integer codes, so it does not rely on "" being the
# first level, handles character input, and is safe to run more than once.
TEMP <- setdiff(levels(as.factor(d$debt_settlement_flag)), "")
d$debt_settlement_flag <- factor(d$debt_settlement_flag, levels = TEMP)
table(d$debt_settlement_flag); sum(is.na(d$debt_settlement_flag))
## 
##      N      Y 
## 417627   6181
## [1] 4
sum(is.na(d$debt_settlement_flag)); str(d$debt_settlement_flag)
## [1] 4
##  Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
### Columns 141-145 ###
str(d[141:145])
## 'data.frame':    423812 obs. of  5 variables:
##  $ settlement_status    : Factor w/ 4 levels "","ACTIVE","BROKEN",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ settlement_date      : chr  NA NA NA NA ...
##  $ settlement_amount    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ settlement_percentage: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ settlement_term      : int  NA NA NA NA NA NA NA NA NA NA ...
### settlement_status - drop the blank level
# TEMP holds the non-blank levels; rebuilding the factor against them matches
# values by label, so nothing is mislabelled when a level is unused or "" is
# not first. as.factor() covers a character column, and re-running is harmless.
TEMP <- setdiff(levels(as.factor(d$settlement_status)), "")
d$settlement_status <- factor(d$settlement_status, levels = TEMP)
table(d$settlement_status); sum(is.na(d$settlement_status))
## 
##   ACTIVE   BROKEN COMPLETE 
##     1198      919     4064
## [1] 417631
sum(is.na(d$settlement_status)); str(d$settlement_status)
## [1] 417631
##  Factor w/ 3 levels "ACTIVE","BROKEN",..: NA NA NA NA NA NA NA NA NA NA ...
# Write the cleaned data frame to disk in RDS format.
saveRDS(object = d, file = "./Files/data/d")