# load data of 2016 Q1
set.seed(7340)
#Getting the sample size of 1000 from loans stats 2016 Q1 dataset
loans_sample <- read.csv ("C:/Users/Shyam/Google Drive/CUNY/Assignment_Repositories/606 - Project/LoanStats_securev1_2016Q1.csv",header = TRUE,skip = 1, sep = ",",stringsAsFactors = FALSE,skipNul = TRUE) %>% sample_n(1000)
#loans_sample <- read.csv ("C:/Users/paperspace/Google Drive/CUNY/Assignment_Repositories/606 - Project/LoanStats_securev1_2016Q1.csv",header = TRUE,skip = 1, sep = ",",stringsAsFactors = FALSE,skipNul = TRUE) %>% sample_n(1000,replace = FALSE)
# 1. Select the required columns
# 2. Convert the column term to numeric
# 3. Remove unwanted columns and change the NA data(if any)
# 4. Calculate the total payment by each customer adding the interest rate
loans_summary <-
loans_sample %>% select(c(id,loan_amnt,funded_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,addr_state,fico_range_low,fico_range_high,issue_d,loan_status,purpose,total_pymnt_inv)) %>%
mutate(fico_score=(fico_range_low+fico_range_high)/2) %>%
mutate(term1=as.numeric(str_trim(str_replace(term,"months","")))) %>%
select(-term,-fico_range_low,-fico_range_high) %>%
mutate(emp_length=str_replace(emp_length,c("n/a"),"NA")) %>%
mutate(int_rate=(as.numeric(str_replace(as.character(int_rate),"%","")))) %>% mutate(total_pymnt=(funded_amnt+(funded_amnt*int_rate*(term1/12))/100))
#Change interest rate to numeric
loans_summary$int_rate <- as.numeric(loans_summary$int_rate)
#Changing required columns as factors and numeric
loans_summary$home_ownership <- as.factor(loans_summary$home_ownership)
loans_summary$verification_status <- as.factor(loans_summary$verification_status)
loans_summary$purpose <- as.factor(loans_summary$purpose)
loans_summary$loan_status <- as.factor(loans_summary$loan_status)
#As we are concentrated on home ownership, we are replacing OWN to
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Below are some research questions from Sample data
What are the cases, and how many are there?
For 2016 Q1, there are around 133K observations. For modelling purposes, I have taken a sample of 1000 rows.
Describe the method of data collection.
Data is collected from Lending loan website (https://www.lendingclub.com/info/download-data.action).
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
Link: https://www.lendingclub.com/info/download-data.action
What is the response variable, and what type is it (numerical/categorical)?
What is the explanatory variable, and what type is it (numerical/categorical)?
Below table summarizes the question, response and explanatory variable. It also shows whether it is Numerical or Categorical.
Question | Response Variable | Explanatory Variable |
---|---|---|
1. Is loan interest rate % predictive of credit score? | Interest rate % (Numerical) | Credit score (Numerical) |
2. Is loan funded amount are equal for different purpose of loan request? | Funded amount (Numerical) | Different purpose (Categorical) |
3. Does different states have equal interest rate? | Interest rate % (Numerical) | States (Categorical) |
4. Does home ownership really impact FICO scores or just by chance? | FICO scores (Numerical) | Home Ownership (Categorical) |
5. Number of loan requests which will be received in next month | Loan requests (Numerical) | Months (Categorical) |
Provide relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
Sample size = 1000
summary(loans_summary)
## id loan_amnt funded_amnt int_rate
## Length:1000 Min. : 1000 Min. : 1000 Min. : 5.32
## Class :character 1st Qu.: 8000 1st Qu.: 8000 1st Qu.: 9.16
## Mode :character Median :14650 Median :14650 Median :11.48
## Mean :15559 Mean :15559 Mean :12.30
## 3rd Qu.:21000 3rd Qu.:21000 3rd Qu.:14.96
## Max. :40000 Max. :40000 Max. :28.34
##
## emp_length home_ownership annual_inc
## Length:1000 MORTGAGE:488 Min. : 12500
## Class :character OWN :125 1st Qu.: 47828
## Mode :character RENT :387 Median : 68000
## Mean : 82010
## 3rd Qu.: 99000
## Max. :1000000
##
## verification_status addr_state issue_d
## Not Verified :359 Length:1000 Length:1000
## Source Verified:368 Class :character Class :character
## Verified :273 Mode :character Mode :character
##
##
##
##
## loan_status purpose total_pymnt_inv
## Charged Off : 2 debt_consolidation:589 Min. : 0
## Current :920 credit_card :239 1st Qu.: 1803
## Fully Paid : 53 home_improvement : 56 Median : 2875
## In Grace Period : 9 other : 42 Mean : 4039
## Late (16-30 days) : 5 major_purchase : 23 3rd Qu.: 4611
## Late (31-120 days): 11 car : 12 Max. :36773
## (Other) : 39
## fico_score term1 total_pymnt
## Min. :662.0 Min. :36.00 Min. : 1390
## 1st Qu.:672.0 1st Qu.:36.00 1st Qu.:11297
## Median :689.5 Median :36.00 Median :20396
## Mean :696.2 Mean :42.41 Mean :23058
## 3rd Qu.:712.0 3rd Qu.:60.00 3rd Qu.:32359
## Max. :827.0 Max. :60.00 Max. :76500
##
loandatamean <- group_by(loans_summary,purpose) %>% summarise(mean(funded_amnt))
loandatasd <- group_by(loans_summary,purpose) %>% summarise(sd(funded_amnt))
ggplot(loans_summary,aes(x=funded_amnt)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Histogram of funded amount
ggplot(loans_summary,aes(x=funded_amnt)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Histogram of Interest rate %
ggplot(loans_summary,aes(x=int_rate)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#fico score vs interest rate
ggplotly(ggplot(loans_summary,aes(x=fico_score,y=int_rate)) + geom_point(aes(color=purpose),size=1,alpha=0.8) + geom_smooth(method = "lm"))
#funded amount are equal for different purpose
ggplot(loans_summary,aes(x=purpose,y=funded_amnt)) + geom_boxplot()
#Interest rate vs funded amount
ggplot(loans_summary,aes(x=int_rate,y=funded_amnt)) + geom_point(aes(color=term1))
# State vs Interest rate %
state_interestrate <- loans_summary %>% select(addr_state,int_rate) %>% group_by(addr_state) %>% summarise(avg =mean(int_rate))
a1 <- list(autotick = TRUE, title = "State")
b1 <- list(title = "Interest rate %")
plot_ly(x = ~state_interestrate$addr_state, y = ~state_interestrate$avg, type = "area") %>% layout(xaxis=a1,yaxis=b1,title="State vs Average interest rate")
## Warning: 'area' objects don't have these attributes: 'x', 'y'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'hoverinfo', 'stream', 'r', 't', 'marker', 'rsrc', 'tsrc', 'key'
#Home ownership vs FICO scores
ggplot(loans_summary,aes(x=fico_score)) + geom_histogram() + facet_grid(.~home_ownership)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Loan request vs month
plot_ly(x = ~loans_summary$issue_d)
## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#histogram
#remove(loans_sample,loans_summary,loandatamean,loandatasd,a1,p2,p3,p4,p5,a1,b1,g,h)
Confidence interval of the funded amount
#Point estimate for the population funded amount
p = mean(loans_summary$funded_amnt)
n=1000
se = (sd(loans_summary$funded_amnt))/sqrt(1000)
# 95% confidence interval
me = 1.96*se
# Confidence interval of funded amount
normalPlot(p,se,bounds=c(p-me,p+me))
Hypothesis test between the funded amount of RENT vs MORTGAGE
loan_summary1 <- filter(loans_summary,home_ownership==c("RENT","MORTGAGE"))
loan_summary1$home_ownership <- as.character(loan_summary1$home_ownership)
#Hypothesys test
inference(y=loan_summary1$funded_amnt,x=loan_summary1$home_ownership,est="mean",null=0,alternative="twosided",type="ht",method="theoretical")
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_MORTGAGE = 236, mean_MORTGAGE = 17059.43, sd_MORTGAGE = 8780.721
## n_RENT = 187, mean_RENT = 13962.97, sd_RENT = 8424.964
## Observed difference between means (MORTGAGE-RENT) = 3096.46
##
## H0: mu_MORTGAGE - mu_RENT = 0
## HA: mu_MORTGAGE - mu_RENT != 0
## Standard error = 840.4
## Test statistic: Z = 3.685
## p-value = 2e-04
# As the p-value is low, we reject the hypothesys
#Confidence interval test
inference(y=loan_summary1$funded_amnt,x=loan_summary1$home_ownership,est="mean",null=0,alternative="twosided",type="ci",conflevel=0.95,method="theoretical")
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_MORTGAGE = 236, mean_MORTGAGE = 17059.43, sd_MORTGAGE = 8780.721
## n_RENT = 187, mean_RENT = 13962.97, sd_RENT = 8424.964
## Observed difference between means (MORTGAGE-RENT) = 3096.46
##
## Standard error = 840.3997
## 95 % Confidence interval = ( 1449.3068 , 4743.6133 )