Project Proposal

For the final project project I selected the data set that I found on Lending Club’s website (https://www.lendingclub.com). The data is provided for potential investors. The data set contains information about loans that were issued from 2007 to the third quarter of 2017.

Lending Club is the world’s largest peer-to-peer lending platform that enables borrowers to obtain a loan, and investors to purchase notes backed by payments made on loans.

The goal of the project is to create the visualization that will allow users see different factors like average FICO score, income, age of applicants, loan amount across all US States. After user selects a factor from a drop down menu he will be able to see US map where states are shaded according to selected factor. The shade of states will depend on the groups (or “bins”) that states are placed into. Binning strategy (Equal Interval Bins, Data Distribution Bins or Arbitrary Bins) will be chosen based on the variable (selected factor) distribution.

Also, the second visualization will show up below the US map. The second visualization (in a form of bar chart) will allow rank states by a selected factor.

In order to collect the data I downloaded (data source: https://www.lendingclub.com/info/download-data.action ) and merged 11 files that contain data from 2007 to the third quarter of 2017. To reduce the loading time I implemented the following steps.

#1. read in a few records of the input file to identify the classes of the input file and assign that column class to the input file while reading the entire data set
data_2007_2011 <- read.csv(file="https://cdn-stage.fedweb.org/fed-2/13/LoanStats3a.csv",  
                           stringsAsFactors=T, header=T, nrows=5)

data_2012_2013 <- read.csv(file="https://cdn-stage.fedweb.org/fed-2/13/LoanStats3b.csv",  
                           stringsAsFactors=T, header=T, nrows=5) 

data_2014 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3c.csv",  
                           stringsAsFactors=T, header=T, nrows=5) 

data_2015 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3d.csv",
                           stringsAsFactors=T, header=T, nrows=5) 

data_2016_q1 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q1.csv",
                          stringsAsFactors=T, header=T, nrows=5) 

data_2016_q2 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q2.csv",  
                          stringsAsFactors=T, header=T, nrows=5)

data_2016_q3 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q3.csv",  
                          stringsAsFactors=T, header=T, nrows=5) 

data_2016_q4 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q4.csv",  
                          stringsAsFactors=T, header=T, nrows=5)

data_2017_q1 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2017Q1.csv",  
                          stringsAsFactors=T, header=T, nrows=5)

data_2017_q2 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2017Q2.csv",  
                          stringsAsFactors=T, header=T, nrows=5)

data_2017_q3 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2017Q3.csv",  
                          stringsAsFactors=T, header=T, nrows=5)


#2. replace all missing values with NAs
data_2007_2011 <- data_2007_2011[is.na(data_2007_2011)]
data_2012_2013 <- data_2012_2013[is.na(data_2012_2013)]
data_2014 <- data_2014[is.na(data_2014)]
data_2015 <- data_2015[is.na(data_2015)]
data_2016_q1 <- data_2016_q1[is.na(data_2016_q1)]
data_2016_q2 <- data_2016_q1[is.na(data_2016_q2)]
data_2016_q3 <- data_2016_q1[is.na(data_2016_q3)]
data_2016_q4 <- data_2016_q1[is.na(data_2016_q4)]
data_2017_q1 <- data_2017_q1[is.na(data_2017_q1)]
data_2017_q2 <- data_2017_q2[is.na(data_2017_q2)]
data_2017_q3 <- data_2017_q3[is.na(data_2017_q3)]


#3. determine classes
data_2007_2011.colclass <- sapply(data_2007_2011,class)
data_2012_2013.colclass <- sapply(data_2012_2013,class)
data_2014.colclass <- sapply(data_2014,class)
data_2015.colclass <- sapply(data_2015,class)
data_2016_q1.colclass <- sapply(data_2016_q1,class)
data_2016_q2.colclass <- sapply(data_2016_q2,class)
data_2016_q3.colclass <- sapply(data_2016_q3,class)
data_2016_q4.colclass <- sapply(data_2016_q4,class)
data_2017_q1.colclass <- sapply(data_2017_q1,class)
data_2017_q2.colclass <- sapply(data_2017_q2,class)
data_2017_q3.colclass <- sapply(data_2017_q3,class)


#4. assign that column class to the input file while reading the entire data set and define comment.char parameter.
data_2007_2011 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3a.csv",  
                           stringsAsFactors=T,
                           header=T,colClasses=data_2007_2011.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2012_2013 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3b.csv",  
                           stringsAsFactors=T,
                           header=T,colClasses=data_2007_2011.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2014 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3c.csv",  
                       stringsAsFactors=T, colClasses=data_2014.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2015 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats3d.csv",
                      stringsAsFactors=T, header=T, colClasses=data_2015.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2016_q1 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q1.csv", 
                         stringsAsFactors=T, header=T,colClasses=data_2016_q1.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2016_q2 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q2.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2016_q2.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2016_q3 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q3.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2016_q3.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2016_q4 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2016Q4.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2016_q4.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2017_q1 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2017Q1.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2017_q1.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2017_q2 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2017Q2.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2017_q2.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data_2017_q3 <- read.csv("https://cdn-stage.fedweb.org/fed-2/13/LoanStats_2017Q3.csv",  
                          stringsAsFactors=T, header=T,colClasses=data_2017_q3.colclass, comment.char="")

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

#5. merge csv files
full_data <- rbind(data_2007_2011[1:51],data_2012_2013[1:51],data_2014[1:51],data_2015[1:51],data_2016_q1[1:51],data_2016_q2[1:51],data_2016_q3[1:51],data_2016_q4[1:51],data_2017_q1[1:51],data_2017_q2[1:51],data_2017_q3[1:51])

The dataset contains the following variables. For the visualization I will select 6-8 the most important variables.

names(full_data)

##  [1] "id"                          "member_id"                  
##  [3] "loan_amnt"                   "funded_amnt"                
##  [5] "funded_amnt_inv"             "term"                       
##  [7] "int_rate"                    "installment"                
##  [9] "grade"                       "sub_grade"                  
## [11] "emp_title"                   "emp_length"                 
## [13] "home_ownership"              "annual_inc"                 
## [15] "verification_status"         "issue_d"                    
## [17] "loan_status"                 "pymnt_plan"                 
## [19] "url"                         "desc"                       
## [21] "purpose"                     "title"                      
## [23] "zip_code"                    "addr_state"                 
## [25] "dti"                         "delinq_2yrs"                
## [27] "earliest_cr_line"            "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "collections_12_mths_ex_med" 
## [51] "mths_since_last_major_derog"