Below we will explore a sample dataset of actual loans originated by Zopa, in SQL.
Reading the dataset and setting up the db connection
library(RSQLite)
drv <- dbDriver("SQLite")
dir <- 'Desktop'
dbFilename <- 'zopa_data_for_test.db'
db <- dbConnect(drv, dbname = file.path(dir, dbFilename))
Checking how many tables are in the database and what are the columns in each dataset.
dbListTables(db)
## [1] "default_rate_estimate" "loan_book"
dbListFields(db, "loan_book")
## [1] "market" "Amount" "Term_nMonths"
## [4] "Annual_interest_rate" "Date_of_disbursal" "Date_of_default"
## [7] "Declared_income" "loan_id" "member_id"
## [10] "Declared_incom"
dbListFields(db, "default_rate_estimate")
## [1] "market" "Term_nMonths"
## [3] "Annual_default_rate_estimate"
Below is an example of how to query the SQLite database. SQLite might have different syntax from the SQL flavour but for the simple tasks, this is no big difference. The first table “loan_book” contains a sample of actual loans disbursed through Zopa, but limited to 5 rows.
query2 <- dbSendQuery(db, "select * from loan_book")
results2 <- fetch (query2, 5)
results2
## market Amount Term_nMonths Annual_interest_rate
## 1 E 10000 60 0.2850
## 2 C1 8600 60 0.1482
## 3 A1 5000 60 0.0653
## 4 D 5000 48 0.1820
## 5 A2 20000 60 0.0842
## Date_of_disbursal Date_of_default Declared_income
## 1 2016-09-13 09:57:07.347000 NA 22000
## 2 2016-09-13 10:17:41.043000 NA 41000
## 3 2016-09-14 08:33:32.090000 NA 28000
## 4 2016-09-14 16:49:36.867000 NA 25000
## 5 2016-09-13 14:40:43.777000 NA 20100
## loan_id member_id Declared_incom
## 1 -8536411584091361334 5448905234911242152 22000
## 2 2586587516215590820 -6961035832149058614 41000
## 3 5812729143149005631 7002479678934768268 28000
## 4 2455674728913840239 -6107882007346940517 25000
## 5 -5412804541701733109 8229682404651592959 20100
The second table “default_rate_estimate” contains a lookup table of the expected default rate for various type of loans, limited to 5 rows.
query3 <- dbSendQuery(db, "select * from default_rate_estimate")
## Warning: Closing open result set, pending rows
results3 <- fetch(query3, 5)
results3
## market Term_nMonths Annual_default_rate_estimate
## 1 A* 12 0.00344
## 2 A* 24 0.00345
## 3 A* 36 0.00303
## 4 A* 48 0.00356
## 5 A* 60 0.00446
What are the different “market” we have?
query4 <- dbSendQuery(db, 'SELECT DISTINCT market FROM loan_book')
## Warning: Closing open result set, pending rows
results4 <- fetch(query4)
results4
## market
## 1 E
## 2 C1
## 3 A1
## 4 D
## 5 A2
## 6 B
## 7 A*
How many loans we have in each market?
query5 <- dbSendQuery(db, 'SELECT market, COUNT(*) as AMOUNT
FROM loan_book
GROUP BY market')
## Warning: Closing open result set, pending rows
results5 <- fetch(query5)
results5
## market AMOUNT
## 1 A* 21877
## 2 A1 13097
## 3 A2 13657
## 4 B 14527
## 5 C1 16849
## 6 D 11841
## 7 E 7776
Find top 3 calendar months with highest average values of estimated annual default rate
query7 <- dbSendQuery(db, "SELECT loan_book.Date_of_disbursal,
strftime('%Y-%m', Date_of_disbursal) as 'Year&Month',
avg(Annual_default_rate_estimate) as 'Avg - Annual Default_rate_estimate'
FROM default_rate_estimate
INNER JOIN loan_book ON loan_book.market = default_rate_estimate.market
GROUP BY 'Year&Month'
ORDER BY 'Avg - Annual Default_rate_estimate' DESC LIMIT 3 ")
## Warning: Closing open result set, pending rows
results7 <- fetch(query7)
results7
## Date_of_disbursal Year&Month Avg - Annual Default_rate_estimate
## 1 2016-12-31 21:26:41.137000 2016-12 0.0496692
In each market, what is the average of estimated annual default rate, and what is the actual default rate in the first year after disbursal?¶
query8 <- dbSendQuery(db, "SELECT loan_book.Date_of_disbursal,
strftime('%Y-%m', Date_of_disbursal) as 'Year&Month',
avg(Annual_default_rate_estimate) as 'Avg - Annual Default_rate_estimate'
FROM default_rate_estimate
INNER JOIN loan_book ON loan_book.market = default_rate_estimate.market
GROUP BY 'Year&Month'
ORDER BY 'Avg - Annual Default_rate_estimate' DESC
LIMIT 3")
## Warning: Closing open result set, pending rows
result8 <- fetch(query8)
result8
## Date_of_disbursal Year&Month Avg - Annual Default_rate_estimate
## 1 2016-12-31 21:26:41.137000 2016-12 0.0496692
In each market, what is the average of estimated annual default rate, and what is the actual default rate in the first year after disbursal?
query9 <- dbSendQuery(db, "SELECT loan_book.market,
SUM(Date_of_default) as 'Actual Default Rate 1st Year from Disbursal',
AVG(Annual_default_rate_estimate) as 'Avg - Annual Default_rate_estimate'
FROM default_rate_estimate
INNER JOIN loan_book ON loan_book.market = default_rate_estimate.market
WHERE strftime('%Y', Date_of_default)= '2016'
GROUP BY loan_book.market
ORDER BY 'Actual Default Rate 1st Year from Disbursal' DESC")
## Warning: Closing open result set, pending rows
results9 <- fetch(query9)
results9
## market Actual Default Rate 1st Year from Disbursal
## 1 A* 40320
## 2 A1 171360
## 3 A2 252000
## 4 B 695520
## 5 C1 1451520
## 6 D 1653120
## 7 E 1058400
## Avg - Annual Default_rate_estimate
## 1 0.003588
## 2 0.010130
## 3 0.018248
## 4 0.035702
## 5 0.064888
## 6 0.134922
## 7 0.164392
How many borrowers had more than one loans from us?
query10 <- dbSendQuery(db, "SELECT
COUNT(member_id) as 'Borrowers more than one loans'
FROM loan_book
WHERE
member_id >1")
## Warning: Closing open result set, pending rows
results10 <- fetch(query10)
results10
## Borrowers more than one loans
## 1 49886
How many of these re-borrowers got a new loan with lower interest rate than their existing/old ones?
query11 <- dbSendQuery(db, "SELECT
COUNT(OldLoan.member_id) as 'Total of borrowers with a new lower interest loan'
FROM
loan_book AS OldLoan, loan_book AS NewLoan
WHERE NEWLoan.member_id = OldLoan.member_id
AND NewLoan.loan_id > OldLoan.loan_id
AND NewLoan.Annual_interest_rate < OldLoan.Annual_interest_Rate")
## Warning: Closing open result set, pending rows
results11 <- fetch(query11)
results11
## Total of borrowers with a new lower interest loan
## 1 1941