#load packages
library(dplyr)
library(tidyr)
#library(plyr)
library(stringr)
library(partykit)
library(grid)
library(PerformanceAnalytics)
library(GGally)
library(ggplot2)
library(knitr)
library(rcompanion)
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(plyr)
#library(VIM)
library(mice)
library(usmap)
library(maps)
library(ggthemes)

For the final project project I selected the data set that I found on Lending Club’s website (https://www.lendingclub.com). The data is provided for potential investors. The data set contains information about loans that were issued from 2007 to the third quarter of 2017.

Lending Club is the world’s largest peer-to-peer lending platform that enables borrowers to obtain a loan, and investors to purchase notes backed by payments made on loans.

PROPOSAL

The goal of the project is to create the visualization that will allow users see different features like anual loan grade, loan amount, employment length and home ownershiop status across all US States. After user selects a factor from a drop down menu he will be able to see US map where states are shaded according to selected factor. The shade of states will depend on the groups (or “bins”) that states are placed into. Binning strategy (Equal Interval Bins, Data Distribution Bins or Arbitrary Bins) will be chosen based on the variable (selected factor) distribution.

Also, the second visualization will show up below the US map. The second visualization (in a form of bar chart) will allow rank states by a selected factor.

DATASET

In order to collect the data I downloaded (data source: https://www.lendingclub.com/info/download-data.action ) and merged 11 files that contain data from 2007 to the third quarter of 2017. To reduce the loading time I implemented the following steps.

#1. read in a few records of the input file to identify the classes of the input file and assign that column class to the input file while reading the entire data set
data_2007_2011 <- read.csv(file="https://cdn-stage.fedweb.org/fed-2/13/LoanStats3a.csv",  
                           stringsAsFactors=T, header=T, nrows=5)

data_2012_2013 <- read.csv(file="https://cdn-stage.fedweb.org/fed-2/13/LoanStats3b.csv",  
                           stringsAsFactors=T, header=T, nrows=5) 

data_2014 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3c.csv",  
                           stringsAsFactors=T, header=T, nrows=5) 

data_2015 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3d.csv",
                           stringsAsFactors=T, header=T, nrows=5) 

data_2016_q1 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q1.csv",
                          stringsAsFactors=T, header=T, nrows=5) 

data_2016_q2 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q2.csv",  
                          stringsAsFactors=T, header=T, nrows=5)

data_2016_q3 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q3.csv",  
                          stringsAsFactors=T, header=T, nrows=5) 

data_2016_q4 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q4.csv",  
                          stringsAsFactors=T, header=T, nrows=5)



#2. replace all missing values with NAs
data_2007_2011 <- data_2007_2011[is.na(data_2007_2011)]
data_2012_2013 <- data_2012_2013[is.na(data_2012_2013)]
data_2014 <- data_2014[is.na(data_2014)]
data_2015 <- data_2015[is.na(data_2015)]
data_2016_q1 <- data_2016_q1[is.na(data_2016_q1)]
data_2016_q2 <- data_2016_q1[is.na(data_2016_q2)]
data_2016_q3 <- data_2016_q1[is.na(data_2016_q3)]
data_2016_q4 <- data_2016_q1[is.na(data_2016_q4)]



#3. determine classes
data_2007_2011.colclass <- sapply(data_2007_2011,class)
data_2012_2013.colclass <- sapply(data_2012_2013,class)
data_2014.colclass <- sapply(data_2014,class)
data_2015.colclass <- sapply(data_2015,class)
data_2016_q1.colclass <- sapply(data_2016_q1,class)
data_2016_q2.colclass <- sapply(data_2016_q2,class)
data_2016_q3.colclass <- sapply(data_2016_q3,class)
data_2016_q4.colclass <- sapply(data_2016_q4,class)



#4. assign that column class to the input file while reading the entire data set and define comment.char parameter.
data_2007_2011 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3a.csv",  
                           stringsAsFactors=T,
                           header=T,colClasses=data_2007_2011.colclass, comment.char="",na.strings=c(""," ","NA"))

data_2012_2013 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3b.csv",  
                           stringsAsFactors=T,
                           header=T,colClasses=data_2007_2011.colclass, comment.char="",na.strings=c(""," ","NA"))

data_2014 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3c.csv",  
                       stringsAsFactors=T, colClasses=data_2014.colclass, comment.char="",na.strings=c(""," ","NA")) 

data_2015 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3d.csv",
                      stringsAsFactors=T, header=T, colClasses=data_2015.colclass, comment.char="",na.strings=c(""," ","NA")) 

data_2016_q1 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q1.csv", 
                         stringsAsFactors=T, header=T,colClasses=data_2016_q1.colclass, comment.char="",na.strings=c(""," ","NA")) 

data_2016_q2 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q2.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2016_q2.colclass, comment.char="",na.strings=c(""," ","NA"))

data_2016_q3 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q3.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2016_q3.colclass, comment.char="",na.strings=c(""," ","NA")) 

data_2016_q4 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q4.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2016_q4.colclass, comment.char="",na.strings=c(""," ","NA"))
#5. merge csv files
data1 <- rbind.fill(data_2007_2011,data_2012_2013,data_2014,data_2015,data_2016_q1,data_2016_q2,data_2016_q3,data_2016_q4)

head(data1[,1:8])
##     id member_id loan_amnt funded_amnt funded_amnt_inv       term int_rate
## 1 <NA>        NA      5000        5000            4975  36 months   10.65%
## 2 <NA>        NA      2500        2500            2500  60 months   15.27%
## 3 <NA>        NA      2400        2400            2400  36 months   15.96%
## 4 <NA>        NA     10000       10000           10000  36 months   13.49%
## 5 <NA>        NA      3000        3000            3000  60 months   12.69%
## 6 <NA>        NA      5000        5000            5000  36 months    7.90%
##   installment
## 1      162.87
## 2       59.83
## 3       84.33
## 4      339.31
## 5       67.79
## 6      156.46

I excluded loans issued before 2009 (due to financial crises) and after 2016 (as those loans might be still be in progress).

#specify date variables
vars <- c("issue_d", "last_pymnt_d", "last_credit_pull_d","earliest_cr_line")

#function that converts dates to proper date format
convert_date <- function(x){
as.Date(paste0("01-",x), format = "%d-%b-%y")
  
  } 

#convert dates to proper date format
data <- data %>% mutate_at(.funs = funs(convert_date), .vars = vars)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
##   # Before:
##   funs(name = f(.))
## 
##   # After: 
##   list(name = ~ f(.))
## This warning is displayed once per session.
#select loans that were issued between 2009 and 2016
data <- subset(data, as.Date(issue_d) > as.Date("2008-12-31"))

#verify that issue dates fall into the interval (2009,2016)
summary(data$issue_d)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2009-01-01" "2014-06-01" "2015-07-01" "2015-03-02" "2016-03-01" 
##         Max. 
## "2016-12-01"

Also, I excluded all loans with 60-month duration as they were firstly introduced in 2010 and majority of such loans haven’t reached their maturity.

#select 36 months loans
data <- data %>% filter(term ==" 36 months") %>% select(-term)

Respose Variable

Response variable ‘loan_status’ can take 9 different categories that are shown below.

#loan status castegories
levels(factor(data$loan_status))
## [1] "Charged Off"                                        
## [2] "Does not meet the credit policy. Status:Charged Off"
## [3] "Does not meet the credit policy. Status:Fully Paid" 
## [4] "Fully Paid"                                         
## [5] "Current"                                            
## [6] "In Grace Period"                                    
## [7] "Late (31-120 days)"                                 
## [8] "Default"                                            
## [9] "Late (16-30 days)"

I don’t consider loans with statuses ‘In Grace Period and Late (16–30 days)’ as Charged off as these loans are not delayed by more than 30 days and in theoretically might be paid off. Lending Club statistics shows that 75% of loans with status ‘Late (31–120 days)’ are never fully paid. The dataset contains 91 loans with status ‘Late (31–120 days)’ and 50 of them are delayed by more than 90 days. I labeled them as ‘Charged Off’ since I assumed that those loans would never be paid off. Loans with marked as ‘Default’ have delayed instalment by more than 120 days. They are labeled as ‘Charged Off’ in the project as well.

#replace "Late (16-30 days)"and "Default" with "Charged Off"
data <- data %>% mutate(loan_status=as.factor(str_replace(loan_status, "Does not meet the credit policy. Status:", "")),loan_status=as.factor(str_replace(loan_status, "Default" | "Late (16-30 days)", "Charged Off")))

levels(factor(data$loan_status))
## [1] "Charged Off"        "Current"            "Default"           
## [4] "Fully Paid"         "In Grace Period"    "Late (16-30 days)" 
## [7] "Late (31-120 days)"
#select only loans that were either paid off or charged off
data <- subset(data, loan_status == "Fully Paid" | loan_status =="Charged Off")
levels(factor(data$loan_status))
## [1] "Charged Off" "Fully Paid"
#select 36 months loans
data <- data %>% select(-emp_title,-id,-member_id)

MISSING VALUES

The dataset contains a lot of missing values. For example, the variable Joint Revolving Balance and Hardship Status are missing more than 99% of their values. The variables that miss more than 96% of values were removed since they don’t have any statistical significance and have a very little or no effect on the response variable. By looking at the dataset it can be concluded that missing values of several variables might be predictive of the response variable Loan Status. For instance, missing values of Home Ownership were replaced by “Not Provided”.

#build function that counts missing values
count_nas <- function(data){
  
variable_name_column <- c()
number_missing_column <- c()

for (i in 2:ncol(data)){
  variable_name <- colnames(data[i])
  number_missing <- sum(is.na(data[i]))
  variable_name_column <- c(variable_name_column,variable_name)
  number_missing_column <- c(number_missing_column,number_missing)
}

missing_table <- data.frame(variable_name_column,number_missing_column)
missing_table <- missing_table %>% mutate(percentage=round(number_missing_column*100/nrow(data),4)) %>% arrange(desc(percentage))
missing_table %>% select(-number_missing_column)
}

#count NAs
missing <- count_nas(data)
head(missing)
##       variable_name_column percentage
## 1                      url        100
## 2          revol_bal_joint        100
## 3 sec_app_earliest_cr_line        100
## 4   sec_app_inq_last_6mths        100
## 5         sec_app_mort_acc        100
## 6         sec_app_open_acc        100
#remove all varables that miss more than 96% of values
var_list <- subset(missing,percentage > 96)
vars <- as.character(list(var_list$variable_name_column)[[1]])
data <- data %>% select(-vars)

#count NAs
head(count_nas(data))
##     variable_name_column percentage
## 1    disbursement_method    92.2540
## 2                   desc    87.8406
## 3 mths_since_last_record    82.9892
## 4                il_util    81.0205
## 5     mths_since_rcnt_il    78.7419
## 6               all_util    78.1585

Instead of throwing out missing values of the remaining variables containing missing data were restored. So that, I got a lot more data to feed to a model. In order to restore the missing data, I applied multiple imputation technique. The method ‘PMM’ (replacement with mean) that is also known as Predictive Mean Matching was chosen for imputation procedure.

data_imputes <- mice(data, m = 5)
data <- complete(data_imputes,5)

DATA MODIFICATIONS

#count number of characters for loan title and description
data$title <- nchar(as.character(data$title), allowNA = TRUE, keepNA = NA)
data$desc <- nchar(as.character(data$desc), allowNA = TRUE, keepNA = NA)
#adjust zip code
data$zip_code <- as.numeric(substring(as.character(data$zip_code),1,3))
#adjust dates
data$issue_d <- as.factor(substring(as.character(data$issue_d),1,4))
data$earliest_cr_line <- abs(data$earliest_cr_line)
data$revol_util <- as.numeric(data$revol_util)

#convert interest rate and zip code to numeric
data <- data %>% mutate(int_rate = as.double(str_replace(int_rate, "%", "")),revol_util = str_replace(revol_util, "%", ""),revol_util = as.integer(revol_util))
#draw distribution of loan description character count
ggplot(data, aes(x=grade, y=desc, color=grade)) +
  geom_boxplot() +
  ggtitle("Distribution of Loan Description Character Count by Grade") +
  ylab("loan description character count")
## Warning: Removed 677921 rows containing non-finite values (stat_boxplot).

OUTLIERS

The dataset contains about 15 records with a self-reported income exceeding 1M USD. Such observations were considered to be outliers and were removed from the dataset. The graph below displays the distribution of annual income before outliers were removed.

#draw annual income distribution
ggplot(data, aes(x=data$annual_inc)) + 
  geom_histogram(aes(fill=..count..)) +
  scale_fill_gradient("Count", low=" light blue", high=" darkblue") + 
  ggtitle("Distribution of Annual Income") +
  xlab("annual income")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#check annual income statistics
summary(data$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   44000   61000   73404   88366 9000000

Removing outliers.

#determine and remove outliers
data <- subset(data, data$annual_inc < 44000+(1.5*(88366-44000)))

#annual income statistics
summary(data$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   41000   56000   59113   75000  110531

The figure below shows the distribution of annual income after outliers were removed.

#draw annual income distribution
ggplot(data, aes(x=data$annual_inc)) + 
  geom_histogram(aes(fill=..count..)) +
  scale_fill_gradient("Count", low=" light blue", high=" darkblue") + 
  ggtitle("Distribution of Annual Income") +
  xlab("annual income")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#find variables that have missing values   
missing <- count_nas(data)
var_list <- subset(missing,percentage > 0)
vars <- as.character(list(var_list$variable_name_column)[[1]])

#function that replace categorical missing values with 'Not Provided' and categorical missing values by 0
replace_nas <- function(x){
  
   if(is.factor(x)){
      factor(ifelse(as.character(is.na(x)),'Not Provided', x))
   }
  else if(is.numeric(x)){
      replace_na(x,0)
      } 
 
  else{
     x
  }
}

#replace categorical missing values with 'Not Provided' and categorical missing values by 0
data <- data %>% mutate_at(.funs = funs(replace_nas), .vars = vars)
#count NAs
head(count_nas(data))
##   variable_name_column percentage
## 1           revol_util     0.0531
## 2          funded_amnt     0.0000
## 3      funded_amnt_inv     0.0000
## 4             int_rate     0.0000
## 5          installment     0.0000
## 6                grade     0.0000
#draw loan status by grade
ggplot(data, aes(grade))+
  geom_bar(aes(fill=loan_status), position="fill") +
  ggtitle("Loan Status by Grade") +
  ylab("percentage")

#add states information
data$abbr <- as.character(data$addr_state)
statepop <- statepop %>% select(-pop_2015)

data <- merge(data,statepop, by="abbr")
all_states <- map_data("state")
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
data$region <- tolower(data$full)
#create sample
set.seed(1234)
data_train <- data[sample(nrow(data)),][1:round(0.66*nrow(data)),]
#write modified dataset to a csv file
write.csv(data_train, file = "/Users/olyafomicheva/desktop/shiny_app/train2.csv")
data <- merge(data_train[1:1000,], all_states, by="region")

VIDSUALIZATION

Loan Status

plot_usmap(data = data_train[1:10000,], values = "loan_status", labels = TRUE, lines = "white", label_color = "white") + 
  scale_fill_discrete(name = "Loan Status") + 
  theme(legend.position = "right") +
  ggtitle("Loan Status by State")

Loan Amount

plot_usmap(data = data_train[1:10000,], values = "loan_amnt", labels = TRUE, lines = "white", label_color = "white") + 
  scale_fill_continuous(name = "Loan Amount", label = scales::comma) + 
  theme(legend.position = "right") +
  ggtitle("Distribution of Loan Amount")

Interest Rate

plot_usmap(data = data_train[1:10000,], values = "int_rate",  labels = TRUE, lines = "white", label_color = "white") + 
  scale_fill_continuous(name = "Interest Rate", label = scales::comma) + 
  theme(legend.position = "right") +
  ggtitle("Distribution of Interest Rate")

Loan Grade

plot_usmap(data = data_train[1:10000,], values = "grade", labels = TRUE, lines = "white", label_color = "white") + 
  scale_fill_discrete(name = "Grade") + 
  theme(legend.position = "right") +
  ggtitle("Grade by State")

Employment Length

plot_usmap(data = data_train[1:10000,], values = "emp_length",  labels = TRUE, lines = "white", label_color = "white") + 
  scale_fill_discrete(name = "Employment Length") + 
  theme(legend.position = "right") +
  ggtitle("Employment Length by State")

Loan Purpose

plot_usmap(data = data_train[1:10000,], values = "purpose", labels = TRUE, lines = "white", label_color = "white") + 
  scale_fill_discrete(name = "credit purpose") + 
  theme(legend.position = "right") +
  ggtitle("Credit Purpose by State")

DEMO

Link to the shiny app:

Project Challenges

Unfortrunatly, shiny app failed while the deployment process due to memory limit for free shiny subscription. It works only locally.