Below we will explore a sample dataset of actual loans originated by Zopa, in SQL.

Reading the dataset and setting up the db connection

library(RSQLite)

drv <- dbDriver("SQLite")
dir <- 'Desktop'
dbFilename <- 'zopa_data_for_test.db'

db <- dbConnect(drv, dbname = file.path(dir, dbFilename))

Checking how many tables are in the database and what are the columns in each dataset.

dbListTables(db)
## [1] "default_rate_estimate" "loan_book"
dbListFields(db, "loan_book")
##  [1] "market"               "Amount"               "Term_nMonths"        
##  [4] "Annual_interest_rate" "Date_of_disbursal"    "Date_of_default"     
##  [7] "Declared_income"      "loan_id"              "member_id"           
## [10] "Declared_incom"
dbListFields(db, "default_rate_estimate")
## [1] "market"                       "Term_nMonths"                
## [3] "Annual_default_rate_estimate"

Below is an example of how to query the SQLite database. SQLite might have different syntax from the SQL flavour but for the simple tasks, this is no big difference. The first table “loan_book” contains a sample of actual loans disbursed through Zopa, but limited to 5 rows.

query2 <- dbSendQuery(db, "select * from loan_book")
results2 <- fetch (query2, 5)

results2
##   market Amount Term_nMonths Annual_interest_rate
## 1      E  10000           60               0.2850
## 2     C1   8600           60               0.1482
## 3     A1   5000           60               0.0653
## 4      D   5000           48               0.1820
## 5     A2  20000           60               0.0842
##            Date_of_disbursal Date_of_default Declared_income
## 1 2016-09-13 09:57:07.347000              NA           22000
## 2 2016-09-13 10:17:41.043000              NA           41000
## 3 2016-09-14 08:33:32.090000              NA           28000
## 4 2016-09-14 16:49:36.867000              NA           25000
## 5 2016-09-13 14:40:43.777000              NA           20100
##                loan_id            member_id Declared_incom
## 1 -8536411584091361334  5448905234911242152          22000
## 2  2586587516215590820 -6961035832149058614          41000
## 3  5812729143149005631  7002479678934768268          28000
## 4  2455674728913840239 -6107882007346940517          25000
## 5 -5412804541701733109  8229682404651592959          20100

The second table “default_rate_estimate” contains a lookup table of the expected default rate for various type of loans, limited to 5 rows.

query3 <- dbSendQuery(db, "select * from default_rate_estimate")
## Warning: Closing open result set, pending rows
results3 <- fetch(query3, 5)

results3
##   market Term_nMonths Annual_default_rate_estimate
## 1     A*           12                      0.00344
## 2     A*           24                      0.00345
## 3     A*           36                      0.00303
## 4     A*           48                      0.00356
## 5     A*           60                      0.00446

What are the different “market” we have?

query4 <- dbSendQuery(db, 'SELECT DISTINCT market FROM loan_book')
## Warning: Closing open result set, pending rows
results4 <- fetch(query4)
results4
##   market
## 1      E
## 2     C1
## 3     A1
## 4      D
## 5     A2
## 6      B
## 7     A*

How many loans we have in each market?

query5 <- dbSendQuery(db, 'SELECT market, COUNT(*) as AMOUNT
                      FROM loan_book
                      GROUP BY market')
## Warning: Closing open result set, pending rows
results5 <- fetch(query5)
results5
##   market AMOUNT
## 1     A*  21877
## 2     A1  13097
## 3     A2  13657
## 4      B  14527
## 5     C1  16849
## 6      D  11841
## 7      E   7776

Find top 3 calendar months with highest average values of estimated annual default rate

query7 <- dbSendQuery(db, "SELECT loan_book.Date_of_disbursal, 
                                  strftime('%Y-%m', Date_of_disbursal) as 'Year&Month',
                                  avg(Annual_default_rate_estimate) as 'Avg - Annual Default_rate_estimate'
                          FROM default_rate_estimate
                          INNER JOIN loan_book ON loan_book.market = default_rate_estimate.market
                          GROUP BY 'Year&Month'
                          ORDER BY 'Avg - Annual Default_rate_estimate' DESC LIMIT 3 ")
## Warning: Closing open result set, pending rows
results7 <- fetch(query7)
results7
##            Date_of_disbursal Year&Month Avg - Annual Default_rate_estimate
## 1 2016-12-31 21:26:41.137000    2016-12                          0.0496692

In each market, what is the average of estimated annual default rate, and what is the actual default rate in the first year after disbursal?¶

query8 <- dbSendQuery(db, "SELECT loan_book.Date_of_disbursal,
                                  strftime('%Y-%m', Date_of_disbursal) as 'Year&Month', 
                                  avg(Annual_default_rate_estimate) as 'Avg - Annual Default_rate_estimate'
                      FROM default_rate_estimate
                      INNER JOIN loan_book ON loan_book.market = default_rate_estimate.market
                      GROUP BY 'Year&Month'
                      ORDER BY 'Avg - Annual Default_rate_estimate' DESC 
                      LIMIT 3")
## Warning: Closing open result set, pending rows
result8 <- fetch(query8)
result8
##            Date_of_disbursal Year&Month Avg - Annual Default_rate_estimate
## 1 2016-12-31 21:26:41.137000    2016-12                          0.0496692

In each market, what is the average of estimated annual default rate, and what is the actual default rate in the first year after disbursal?

query9 <- dbSendQuery(db, "SELECT loan_book.market,
                                  SUM(Date_of_default) as 'Actual Default Rate 1st Year from Disbursal',
                                  AVG(Annual_default_rate_estimate) as 'Avg - Annual Default_rate_estimate'
                           FROM default_rate_estimate
                                 INNER JOIN loan_book ON loan_book.market = default_rate_estimate.market
                           WHERE strftime('%Y', Date_of_default)= '2016'
                           GROUP BY loan_book.market
                           ORDER BY 'Actual Default Rate 1st Year from Disbursal' DESC")
## Warning: Closing open result set, pending rows
results9 <- fetch(query9)
results9
##   market Actual Default Rate 1st Year from Disbursal
## 1     A*                                       40320
## 2     A1                                      171360
## 3     A2                                      252000
## 4      B                                      695520
## 5     C1                                     1451520
## 6      D                                     1653120
## 7      E                                     1058400
##   Avg - Annual Default_rate_estimate
## 1                           0.003588
## 2                           0.010130
## 3                           0.018248
## 4                           0.035702
## 5                           0.064888
## 6                           0.134922
## 7                           0.164392

How many borrowers had more than one loans from us?

query10 <- dbSendQuery(db, "SELECT  
                                COUNT(member_id) as 'Borrowers more than one loans'
                            FROM loan_book
                            WHERE
                                member_id >1")
## Warning: Closing open result set, pending rows
results10 <- fetch(query10)
results10
##   Borrowers more than one loans
## 1                         49886

How many of these re-borrowers got a new loan with lower interest rate than their existing/old ones?

query11 <- dbSendQuery(db, "SELECT 
                              COUNT(OldLoan.member_id) as 'Total of borrowers with a new lower interest loan'
                        FROM 
                            loan_book AS OldLoan, loan_book AS NewLoan
                        WHERE NEWLoan.member_id = OldLoan.member_id
                            AND NewLoan.loan_id > OldLoan.loan_id
                            AND NewLoan.Annual_interest_rate < OldLoan.Annual_interest_Rate")
## Warning: Closing open result set, pending rows
results11 <- fetch(query11)
results11
##   Total of borrowers with a new lower interest loan
## 1                                              1941