library(tidyverse)
## -- Attaching packages ------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
There are two files to download, the 2012-2013 data (LoanStats3b) and the 2014 data (LoanStats3c).
# Download both LendingClub archives (the URLs use https), extract the CSVs
# into ./Files/data, then delete the local zip once it has been unpacked.
for (stem in c("LoanStats3b", "LoanStats3c")) {
  zip_path <- paste0("./", stem, ".zip")
  # mode = "wb": the archive is binary, so request a binary transfer explicitly
  # instead of relying on the platform default.
  download.file(paste0("https://resources.lendingclub.com/", stem, ".csv.zip"),
                destfile = zip_path, mode = "wb")
  unzip(zip_path, exdir = "./Files/data")
  if (file.exists(zip_path)) file.remove(zip_path)
}
# Read one exported CSV. The column names are taken from the line that follows
# the first line of the file (skip = 1, nrows = 1); the loan rows follow it.
read_loan_stats <- function(path) {
  header_row <- read.csv(path, skip = 1, header = FALSE, nrows = 1,
                         as.is = TRUE)
  # skip = 2, not 3: with the header on line 2, skipping three lines discards
  # the first data row of each file.
  # NOTE(review): confirm the row count against the published loan totals.
  # stringsAsFactors is pinned because the later clean-up steps expect factor
  # columns and the read.csv default changed to FALSE in R 4.0.
  loans <- read.csv(path, skip = 2, header = FALSE, stringsAsFactors = TRUE)
  colnames(loans) <- unlist(header_row, use.names = FALSE)
  loans
}
d3b <- read_loan_stats("./Files/data/LoanStats3b.csv")
d3c <- read_loan_stats("./Files/data/LoanStats3c.csv")
d <- bind_rows(d3b, d3c) #lots of character coercion
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
### Empty strings become NA; only "" is matched, so zeros are left untouched
is.na(d) <- d == ""
### Columns 1-10: inspect types ###
str(d[1:10])
## 'data.frame': 423812 obs. of 10 variables:
## $ id : chr NA NA NA NA ...
## $ member_id : logi NA NA NA NA NA NA ...
## $ loan_amnt : int 3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
## $ funded_amnt : int 3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
## $ funded_amnt_inv: num 3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
## $ term : Factor w/ 3 levels ""," 36 months",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ int_rate : chr " 12.85%" " 13.53%" " 10.99%" " 12.85%" ...
## $ installment : num 101 706 157 471 516 ...
## $ grade : Factor w/ 8 levels "","A","B","C",..: 3 3 3 3 4 4 3 4 2 3 ...
## $ sub_grade : Factor w/ 36 levels "","A1","A2","A3",..: 10 11 8 10 13 14 11 12 6 7 ...
###terms- drop the blank level and tidy the labels
# Levels are matched by their trimmed label rather than by relabelling integer
# codes by position: positional relabelling mislabels rows when a level is
# unused and breaks on character columns.
d$term <- factor(trimws(as.character(d$term)), levels = c("36 months", "60 months"))
table(d$term); sum(is.na(d$term))
##
## 36 months 60 months
## 306461 117347
## [1] 4
str(d$term)
## Factor w/ 2 levels "36 months","60 months": 1 1 1 1 1 1 1 1 1 1 ...
###interest rate- strip the percent sign, then read the text as a number
d$int_rate <- as.numeric(sub("%", "", d$int_rate, fixed = TRUE))
###grade- drop the blank level
# Levels are matched by label rather than by relabelling integer codes by
# position: positional relabelling mislabels rows when a level is unused and
# breaks on character columns.
d$grade <- factor(as.character(d$grade), levels = c("A", "B", "C", "D", "E", "F", "G"))
table(d$grade); sum(is.na(d$grade))
##
## A B C D E F G
## 64688 124558 116572 70884 32365 11931 2810
## [1] 4
sum(is.na(d$grade))
## [1] 4
###sub_grade- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$sub_grade <- factor(as.character(d$sub_grade), exclude = c("", NA))
table(d$sub_grade); sum(is.na(d$sub_grade))
##
## A1 A2 A3 A4 A5 B1 B2 B3 B4 B5 C1 C2
## 9400 9436 10746 16140 18966 20997 24499 28693 27918 22451 24694 24588
## C3 C4 C5 D1 D2 D3 D4 D5 E1 E2 E3 E4
## 23665 22738 20887 18209 15562 13600 12961 10552 8150 7878 6309 5440
## E5 F1 F2 F3 F4 F5 G1 G2 G3 G4 G5
## 4588 3549 2694 2472 1857 1359 968 716 504 323 299
## [1] 4
sum(is.na(d$sub_grade))
## [1] 4
### Columns 11-20: inspect types ###
str(d[11:20])
## 'data.frame': 423812 obs. of 10 variables:
## $ emp_title : chr "Auditor" "Operations Manager" "Surgical Technician" "Assistant Director - Human Resources" ...
## $ emp_length : Factor w/ 13 levels "","< 1 year",..: 4 4 5 7 4 4 4 3 5 10 ...
## $ home_ownership : chr "RENT" "RENT" "MORTGAGE" "RENT" ...
## $ annual_inc : num 25000 81500 39600 88000 98000 90000 40000 26000 63000 102000 ...
## $ verification_status: Factor w/ 4 levels "","Not Verified",..: 4 4 3 2 2 2 3 2 2 2 ...
## $ issue_d : chr "Dec-2013" "Dec-2013" "Dec-2013" "Dec-2013" ...
## $ loan_status : chr "Fully Paid" "Fully Paid" "Fully Paid" "Fully Paid" ...
## $ pymnt_plan : chr "n" "n" "n" "n" ...
## $ url : logi NA NA NA NA NA NA ...
## $ desc : chr NA " Borrower added on 12/31/13 > My goal is to purchase a home. I am consolidating my debt to lower interest rate"| __truncated__ " Borrower added on 12/31/13 > Just bought a house, and would like a little extra funds to improve aspects of t"| __truncated__ NA ...
###emp_title: left as character
###emp_length- drop the blank level (the "n/a" level is kept)
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$emp_length <- factor(as.character(d$emp_length), exclude = c("", NA))
table(d$emp_length); sum(is.na(d$emp_length))
##
## < 1 year 1 year 10+ years 2 years 3 years 4 years 5 years
## 31206 26027 140703 36633 32233 24375 27321
## 6 years 7 years 8 years 9 years n/a
## 23739 24310 20805 16550 19906
## [1] 4
sum(is.na(d$emp_length)); str(d$emp_length)
## [1] 4
## Factor w/ 12 levels "< 1 year","1 year",..: 3 3 4 6 3 3 3 2 4 9 ...
###verification_status- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$verification_status <- factor(as.character(d$verification_status), exclude = c("", NA))
table(d$verification_status); sum(is.na(d$verification_status))
##
## Not Verified Source Verified Verified
## 127340 141850 154618
## [1] 4
sum(is.na(d$verification_status)); str(d$verification_status)
## [1] 4
## Factor w/ 3 levels "Not Verified",..: 3 3 2 1 1 1 2 1 1 1 ...
###issue_d: month-year text to Date
d$issue_d <- as_date(parse_date_time(d$issue_d, "my"))
###loan status- store as factor
d[["loan_status"]] <- as.factor(d[["loan_status"]])
###pymnt_plan- store as factor
d[["pymnt_plan"]] <- as.factor(d[["pymnt_plan"]])
###url is left unconverted and excluded from the analysis
###desc is definitely character
### Columns 21-30: inspect types ###
str(d[21:30])
## 'data.frame': 423812 obs. of 10 variables:
## $ purpose : Factor w/ 14 levels "","car","credit_card",..: 4 4 5 4 4 10 4 4 4 4 ...
## $ title : chr "debt" "Reducing Debt to Purchase Home" "For The House" "Debt consolidation" ...
## $ zip_code : chr "322xx" "100xx" "782xx" "282xx" ...
## $ addr_state : chr "FL" "NY" "TX" "NC" ...
## $ dti : num 24.68 16.73 2.49 10.02 6.15 ...
## $ delinq_2yrs : int 0 0 0 1 0 1 0 0 0 2 ...
## $ earliest_cr_line : chr "May-1991" "Jun-1998" "Aug-1995" "Jun-1988" ...
## $ inq_last_6mths : int 0 2 2 0 2 0 0 0 0 0 ...
## $ mths_since_last_delinq: int 58 64 NA 16 NA 16 53 NA 34 11 ...
## $ mths_since_last_record: int 53 NA NA 115 NA NA 33 NA NA NA ...
###purpose- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$purpose <- factor(as.character(d$purpose), exclude = c("", NA))
table(d$purpose); sum(is.na(d$purpose))
##
## car credit_card debt_consolidation
## 3783 98692 254455
## home_improvement house major_purchase
## 23342 1843 7517
## medical moving other
## 3850 2366 19267
## renewable_energy small_business vacation
## 245 5022 2087
## wedding
## 1339
## [1] 4
sum(is.na(d$purpose)); str(d$purpose)
## [1] 4
## Factor w/ 13 levels "car","credit_card",..: 3 3 4 3 3 9 3 3 3 3 ...
###earliest_cr_line: month-year text to Date
d$earliest_cr_line <- as_date(parse_date_time(d$earliest_cr_line, "my"))
### Columns 31-40: inspect types ###
str(d[31:40])
## 'data.frame': 423812 obs. of 10 variables:
## $ open_acc : int 5 29 3 6 16 9 7 12 8 9 ...
## $ pub_rec : int 2 0 0 1 0 0 2 0 0 0 ...
## $ revol_bal : int 2875 23473 4136 3686 5749 6619 5572 7967 11431 9912 ...
## $ revol_util : chr "54.2%" "54.5%" "16.1%" "81.9%" ...
## $ total_acc : int 26 41 8 14 16 12 32 28 29 22 ...
## $ initial_list_status: Factor w/ 3 levels "","f","w": 2 2 3 2 2 2 3 2 3 2 ...
## $ out_prncp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ out_prncp_inv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_pymnt : num 3182 23927 5158 16945 15699 ...
## $ total_pymnt_inv : num 3182 23927 5158 16945 15699 ...
###revol_util- strip the percent sign, then read the text as a number
d$revol_util <- as.numeric(sub("%", "", d$revol_util, fixed = TRUE))
###initial_list_status- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$initial_list_status <- factor(as.character(d$initial_list_status), exclude = c("", NA))
table(d$initial_list_status); sum(is.na(d$initial_list_status))
##
## f w
## 260520 163288
## [1] 4
sum(is.na(d$initial_list_status)); str(d$initial_list_status)
## [1] 4
## Factor w/ 2 levels "f","w": 1 1 2 1 1 1 2 1 2 1 ...
### Columns 41-50: inspect types ###
str(d[41:50])
## 'data.frame': 423812 obs. of 10 variables:
## $ total_rec_prncp : num 3000 20800 4800 14000 15000 11100 12000 9750 15000 10000 ...
## $ total_rec_int : num 182 3127 358 2945 699 ...
## $ total_rec_late_fee : num 0 0 0 0 0 0 0 0 0 0 ...
## $ recoveries : num 0 0 0 0 0 0 0 0 0 0 ...
## $ collection_recovery_fee : num 0 0 0 0 0 0 0 0 0 0 ...
## $ last_pymnt_d : chr "Jul-2014" "May-2015" "Sep-2014" "Jan-2017" ...
## $ last_pymnt_amnt : num 2677 13335 3900 470 14151 ...
## $ next_pymnt_d : Factor w/ 2 levels "","Feb-2019": NA NA NA NA NA NA NA NA NA NA ...
## $ last_credit_pull_d : chr "Oct-2016" "Jan-2019" "Jan-2017" "Jan-2019" ...
## $ collections_12_mths_ex_med: int 0 0 0 0 0 0 0 0 0 0 ...
###last_pymnt_d: month-year text to Date
d$last_pymnt_d <- as_date(parse_date_time(d$last_pymnt_d, "my"))
###next_pymnt_d- drop the blank level (kept as a factor, not parsed as a date)
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$next_pymnt_d <- factor(as.character(d$next_pymnt_d), exclude = c("", NA))
table(d$next_pymnt_d); sum(is.na(d$next_pymnt_d))
##
## Feb-2019
## 15491
## [1] 408321
sum(is.na(d$next_pymnt_d)); str(d$next_pymnt_d)
## [1] 408321
## Factor w/ 1 level "Feb-2019": NA NA NA NA NA NA NA NA NA NA ...
###hardship_end_date: month-year text to Date
# NOTE(review): hardship_end_date is listed with columns 121-130, not 41-50,
# while last_credit_pull_d in this range stays character. Confirm which column
# this step was meant to convert.
d$hardship_end_date <- as_date(parse_date_time(d$hardship_end_date, "my"))
### Columns 51-60: inspect types ###
str(d[51:60])
## 'data.frame': 423812 obs. of 10 variables:
## $ mths_since_last_major_derog: int 69 71 NA NA NA 16 53 NA 34 54 ...
## $ policy_code : int 1 1 1 1 1 1 1 1 1 1 ...
## $ application_type : Factor w/ 2 levels "","Individual": 2 2 2 2 2 2 2 2 2 2 ...
## $ annual_inc_joint : logi NA NA NA NA NA NA ...
## $ dti_joint : logi NA NA NA NA NA NA ...
## $ verification_status_joint : logi NA NA NA NA NA NA ...
## $ acc_now_delinq : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tot_coll_amt : int 154 0 0 0 0 0 15386 0 1514 0 ...
## $ tot_cur_bal : int 19530 23473 4136 17672 13038 353402 13605 14123 272492 39143 ...
## $ open_acc_6m : logi NA NA NA NA NA NA ...
###application type- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$application_type <- factor(as.character(d$application_type), exclude = c("", NA))
table(d$application_type); sum(is.na(d$application_type))
##
## Individual
## 423808
## [1] 4
sum(is.na(d$application_type)); str(d$application_type)
## [1] 4
## Factor w/ 1 level "Individual": 1 1 1 1 1 1 1 1 1 1 ...
### Columns 61-70: inspect types ###
str(d[61:70])
## 'data.frame': 423812 obs. of 10 variables:
## $ open_act_il : logi NA NA NA NA NA NA ...
## $ open_il_12m : logi NA NA NA NA NA NA ...
## $ open_il_24m : logi NA NA NA NA NA NA ...
## $ mths_since_rcnt_il: logi NA NA NA NA NA NA ...
## $ total_bal_il : logi NA NA NA NA NA NA ...
## $ il_util : logi NA NA NA NA NA NA ...
## $ open_rv_12m : logi NA NA NA NA NA NA ...
## $ open_rv_24m : logi NA NA NA NA NA NA ...
## $ max_bal_bc : logi NA NA NA NA NA NA ...
## $ all_util : logi NA NA NA NA NA NA ...
### Columns 71-80: inspect types ###
str(d[71:80])
## 'data.frame': 423812 obs. of 10 variables:
## $ total_rev_hi_lim : int 5300 43100 25700 4500 25800 10000 8100 15100 15400 22300 ...
## $ inq_fi : logi NA NA NA NA NA NA ...
## $ total_cu_tl : logi NA NA NA NA NA NA ...
## $ inq_last_12m : logi NA NA NA NA NA NA ...
## $ acc_open_past_24mths : int 3 9 0 3 6 2 4 2 3 3 ...
## $ avg_cur_bal : int 3906 869 1379 2945 815 39267 2268 1177 38927 4349 ...
## $ bc_open_to_buy : int 2050 6811 21564 480 15051 1016 1428 1752 2969 973 ...
## $ bc_util : num 52.3 54.6 16.1 87.7 27.6 74.6 79.6 75.7 79.1 89.4 ...
## $ chargeoff_within_12_mths: int 0 0 0 0 0 0 0 0 0 0 ...
## $ delinq_amnt : int 0 0 0 0 0 0 0 0 0 0 ...
### Columns 81-90: inspect types ###
str(d[81:90])
## 'data.frame': 423812 obs. of 10 variables:
## $ mo_sin_old_il_acct : int 164 115 104 111 2 NA 124 67 147 243 ...
## $ mo_sin_old_rev_tl_op : int 271 186 220 103 257 150 182 83 189 290 ...
## $ mo_sin_rcnt_rev_tl_op : int 7 0 25 24 7 11 1 12 24 23 ...
## $ mo_sin_rcnt_tl : int 7 0 25 13 2 11 1 12 13 8 ...
## $ mort_acc : int 6 0 0 0 0 1 0 0 4 0 ...
## $ mths_since_recent_bc : int 14 0 25 38 7 11 11 12 24 25 ...
## $ mths_since_recent_bc_dlq : int 69 70 NA 16 NA 35 53 NA 75 11 ...
## $ mths_since_recent_inq : int 8 0 3 NA 2 11 17 20 12 8 ...
## $ mths_since_recent_revol_delinq: int 69 70 NA 16 NA 35 53 NA 75 11 ...
## $ num_accts_ever_120_pd : int 1 1 0 0 0 1 6 0 3 1 ...
### Columns 91-100: inspect types ###
str(d[91:100])
## 'data.frame': 423812 obs. of 10 variables:
## $ num_actv_bc_tl : int 2 8 2 3 8 4 2 6 3 3 ...
## $ num_actv_rev_tl : int 3 24 2 4 8 8 2 7 4 4 ...
## $ num_bc_sats : int 3 11 3 3 13 4 3 6 3 3 ...
## $ num_bc_tl : int 6 17 4 9 13 4 14 11 10 6 ...
## $ num_il_tl : int 11 1 1 3 1 0 8 8 8 9 ...
## $ num_op_rev_tl : int 4 29 3 4 15 8 6 9 6 6 ...
## $ num_rev_accts : int 9 40 7 10 15 11 24 20 17 13 ...
## $ num_rev_tl_bal_gt_0: int 3 24 2 4 8 8 2 7 4 4 ...
## $ num_sats : int 5 29 3 6 16 9 7 12 8 9 ...
## $ num_tl_120dpd_2m : int 0 0 0 0 0 0 0 0 0 0 ...
### Columns 101-110: inspect types ###
str(d[101:110])
## 'data.frame': 423812 obs. of 10 variables:
## $ num_tl_30dpd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ num_tl_90g_dpd_24m : int 0 0 0 0 0 1 0 0 0 0 ...
## $ num_tl_op_past_12m : int 1 3 0 0 2 1 2 2 0 1 ...
## $ pct_tl_nvr_dlq : num 91.3 90.2 100 78.6 100 75 81.2 100 89.3 77.3 ...
## $ percent_bc_gt_75 : num 66.7 50 0 100 7.7 50 33.3 66.7 66.7 66.7 ...
## $ pub_rec_bankruptcies: int 2 0 0 1 0 0 0 0 0 0 ...
## $ tax_liens : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tot_hi_cred_lim : int 32082 43100 25700 31840 33300 385000 18130 21314 288195 58486 ...
## $ total_bal_ex_mort : int 19530 23473 4136 17672 13038 6619 13605 14123 39448 39143 ...
## $ total_bc_limit : int 4300 15000 25700 3900 20800 4000 7000 7200 14200 9200 ...
### Columns 111-120: inspect types ###
str(d[111:120])
## 'data.frame': 423812 obs. of 10 variables:
## $ total_il_high_credit_limit : int 26782 0 0 27340 7500 0 10030 6214 33895 36186 ...
## $ revol_bal_joint : logi NA NA NA NA NA NA ...
## $ sec_app_earliest_cr_line : logi NA NA NA NA NA NA ...
## $ sec_app_inq_last_6mths : logi NA NA NA NA NA NA ...
## $ sec_app_mort_acc : logi NA NA NA NA NA NA ...
## $ sec_app_open_acc : logi NA NA NA NA NA NA ...
## $ sec_app_revol_util : logi NA NA NA NA NA NA ...
## $ sec_app_open_act_il : logi NA NA NA NA NA NA ...
## $ sec_app_num_rev_accts : logi NA NA NA NA NA NA ...
## $ sec_app_chargeoff_within_12_mths: logi NA NA NA NA NA NA ...
### Columns 121-130: inspect types ###
str(d[121:130])
## 'data.frame': 423812 obs. of 10 variables:
## $ sec_app_collections_12_mths_ex_med : logi NA NA NA NA NA NA ...
## $ sec_app_mths_since_last_major_derog: logi NA NA NA NA NA NA ...
## $ hardship_flag : chr "N" "N" "N" "N" ...
## $ hardship_type : Factor w/ 2 levels "","INTEREST ONLY-3 MONTHS DEFERRAL": NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_reason : chr NA NA NA NA ...
## $ hardship_status : chr NA NA NA NA ...
## $ deferral_term : int NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_amount : num NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_start_date : chr NA NA NA NA ...
## $ hardship_end_date : Date, format: NA NA ...
###hardship_type- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$hardship_type <- factor(as.character(d$hardship_type), exclude = c("", NA))
table(d$hardship_type); sum(is.na(d$hardship_type))
##
## INTEREST ONLY-3 MONTHS DEFERRAL
## 576
## [1] 423236
sum(is.na(d$hardship_type)); str(d$hardship_type)
## [1] 423236
## Factor w/ 1 level "INTEREST ONLY-3 MONTHS DEFERRAL": NA NA NA NA NA NA NA NA NA NA ...
###hardship_start_date: month-year text to Date
d$hardship_start_date <- as_date(parse_date_time(d$hardship_start_date, "my"))
###hardship_end_date as date
# The column is already converted to Date earlier in the script. Re-parsing a
# Date with the "my" order fails for every value ("All formats failed to
# parse") and overwrites the column with NA, so only parse when it is not yet
# a Date.
if (!inherits(d$hardship_end_date, "Date")) {
  d$hardship_end_date <- as_date(parse_date_time(d$hardship_end_date, "my"))
}
### Columns 131-140: inspect types ###
str(d[131:140])
## 'data.frame': 423812 obs. of 10 variables:
## $ payment_plan_start_date : chr NA NA NA NA ...
## $ hardship_length : int NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_dpd : int NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_loan_status : Factor w/ 5 levels "","Current","In Grace Period",..: NA NA NA NA NA NA NA NA NA NA ...
## $ orig_projected_additional_accrued_interest: num NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_payoff_balance_amount : num NA NA NA NA NA NA NA NA NA NA ...
## $ hardship_last_payment_amount : num NA NA NA NA NA NA NA NA NA NA ...
## $ disbursement_method : Factor w/ 2 levels "","Cash": 2 2 2 2 2 2 2 2 2 2 ...
## $ debt_settlement_flag : Factor w/ 3 levels "","N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ debt_settlement_flag_date : chr NA NA NA NA ...
###payment_plan_start_date: month-year text to Date
d$payment_plan_start_date <- as_date(parse_date_time(d$payment_plan_start_date, "my"))
###hardship_loan_status- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$hardship_loan_status <- factor(as.character(d$hardship_loan_status), exclude = c("", NA))
table(d$hardship_loan_status); sum(is.na(d$hardship_loan_status))
##
## Current In Grace Period Late (16-30 days)
## 119 161 259
## Late (31-120 days)
## 37
## [1] 423236
sum(is.na(d$hardship_loan_status)); str(d$hardship_loan_status)
## [1] 423236
## Factor w/ 4 levels "Current","In Grace Period",..: NA NA NA NA NA NA NA NA NA NA ...
###disbursement_method- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$disbursement_method <- factor(as.character(d$disbursement_method), exclude = c("", NA))
table(d$disbursement_method); sum(is.na(d$disbursement_method))
##
## Cash
## 423808
## [1] 4
sum(is.na(d$disbursement_method)); str(d$disbursement_method)
## [1] 4
## Factor w/ 1 level "Cash": 1 1 1 1 1 1 1 1 1 1 ...
###debt_settlement_flag- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$debt_settlement_flag <- factor(as.character(d$debt_settlement_flag), exclude = c("", NA))
table(d$debt_settlement_flag); sum(is.na(d$debt_settlement_flag))
##
## N Y
## 417627 6181
## [1] 4
sum(is.na(d$debt_settlement_flag)); str(d$debt_settlement_flag)
## [1] 4
## Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
### Columns 141-145: inspect types ###
str(d[141:145])
## 'data.frame': 423812 obs. of 5 variables:
## $ settlement_status : Factor w/ 4 levels "","ACTIVE","BROKEN",..: NA NA NA NA NA NA NA NA NA NA ...
## $ settlement_date : chr NA NA NA NA ...
## $ settlement_amount : num NA NA NA NA NA NA NA NA NA NA ...
## $ settlement_percentage: num NA NA NA NA NA NA NA NA NA NA ...
## $ settlement_term : int NA NA NA NA NA NA NA NA NA NA ...
###settlement_status- drop the blank level
# Rebuild the factor from its labels and exclude "" (and NA) as levels. The
# previous code relabelled integer codes by position, which mislabels rows
# when a level is unused and breaks on character columns.
d$settlement_status <- factor(as.character(d$settlement_status), exclude = c("", NA))
table(d$settlement_status); sum(is.na(d$settlement_status))
##
## ACTIVE BROKEN COMPLETE
## 1198 919 4064
## [1] 417631
sum(is.na(d$settlement_status)); str(d$settlement_status)
## [1] 417631
## Factor w/ 3 levels "ACTIVE","BROKEN",..: NA NA NA NA NA NA NA NA NA NA ...
# Persist the cleaned data frame as an RDS file (the target path has no extension)
saveRDS(object = d, file = "./Files/data/d")