# load data of 2016 Q1

set.seed(7340)

#Getting the sample size of 1000 from loans stats 2016 Q1 dataset 

loans_sample <- read.csv ("C:/Users/Shyam/Google Drive/CUNY/Assignment_Repositories/606 - Project/LoanStats_securev1_2016Q1.csv",header = TRUE,skip = 1, sep = ",",stringsAsFactors = FALSE,skipNul = TRUE) %>% sample_n(1000)

#loans_sample <- read.csv ("C:/Users/paperspace/Google Drive/CUNY/Assignment_Repositories/606 - Project/LoanStats_securev1_2016Q1.csv",header = TRUE,skip = 1, sep = ",",stringsAsFactors = FALSE,skipNul = TRUE) %>% sample_n(1000,replace = FALSE)



# 1. Select the required columns
# 2. Convert the column term to numeric
# 3. Remove unwanted columns and change the NA data(if any)
# 4. Calculate the total payment by each customer adding the interest rate

loans_summary <-  
loans_sample %>% select(c(id,loan_amnt,funded_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,addr_state,fico_range_low,fico_range_high,issue_d,loan_status,purpose,total_pymnt_inv)) %>% 
  mutate(fico_score=(fico_range_low+fico_range_high)/2) %>% 
    mutate(term1=as.numeric(str_trim(str_replace(term,"months","")))) %>% 
      select(-term,-fico_range_low,-fico_range_high) %>% 
        mutate(emp_length=str_replace(emp_length,c("n/a"),"NA")) %>% 
          mutate(int_rate=(as.numeric(str_replace(as.character(int_rate),"%",""))))  %>%  mutate(total_pymnt=(funded_amnt+(funded_amnt*int_rate*(term1/12))/100))



#Change interest rate to numeric

loans_summary$int_rate <- as.numeric(loans_summary$int_rate)

#Changing required columns as factors and numeric

loans_summary$home_ownership <- as.factor(loans_summary$home_ownership)

loans_summary$verification_status <- as.factor(loans_summary$verification_status)

loans_summary$purpose <- as.factor(loans_summary$purpose)
loans_summary$loan_status <- as.factor(loans_summary$loan_status)

#As we are concentrated on home ownership, we are replacing OWN to 

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

Below are some research questions from Sample data

  1. Is loan interest % predictive of credit score?
  2. Is loan funded amount are equal for different purpose of loan request?
  3. Does different states have equal interest rate?
  4. Does home ownership really impact FICO scores or just by chance?
  5. Number of loan requests which will be received in next month

Cases

What are the cases, and how many are there?

For 2016 Q1, there are around 133K observations. For modelling purposes, I have taken a sample of 1000 rows.

Data collection

Describe the method of data collection.

Data is collected from Lending loan website (https://www.lendingclub.com/info/download-data.action).

Type of study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

Link: https://www.lendingclub.com/info/download-data.action


Response

What is the response variable, and what type is it (numerical/categorical)?

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)?

Below table summarizes the question, response and explanatory variable. It also shows whether it is Numerical or Categorical.

Question Response Variable Explanatory Variable
1. Is loan interest rate % predictive of credit score? Interest rate % (Numerical) Credit score (Numerical)
2. Is loan funded amount are equal for different purpose of loan request? Funded amount (Numerical) Different purpose (Categorical)
3. Does different states have equal interest rate? Interest rate % (Numerical) States (Categorical)
4. Does home ownership really impact FICO scores or just by chance? FICO scores (Numerical) Home Ownership (Categorical)
5. Number of loan requests which will be received in next month Loan requests (Numerical) Months (Categorical)

Relevant summary statistics

Provide relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

Sample size = 1000

summary(loans_summary)
##       id              loan_amnt      funded_amnt       int_rate    
##  Length:1000        Min.   : 1000   Min.   : 1000   Min.   : 5.32  
##  Class :character   1st Qu.: 8000   1st Qu.: 8000   1st Qu.: 9.16  
##  Mode  :character   Median :14650   Median :14650   Median :11.48  
##                     Mean   :15559   Mean   :15559   Mean   :12.30  
##                     3rd Qu.:21000   3rd Qu.:21000   3rd Qu.:14.96  
##                     Max.   :40000   Max.   :40000   Max.   :28.34  
##                                                                    
##   emp_length         home_ownership   annual_inc     
##  Length:1000        MORTGAGE:488    Min.   :  12500  
##  Class :character   OWN     :125    1st Qu.:  47828  
##  Mode  :character   RENT    :387    Median :  68000  
##                                     Mean   :  82010  
##                                     3rd Qu.:  99000  
##                                     Max.   :1000000  
##                                                      
##       verification_status  addr_state          issue_d         
##  Not Verified   :359      Length:1000        Length:1000       
##  Source Verified:368      Class :character   Class :character  
##  Verified       :273      Mode  :character   Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##              loan_status                purpose    total_pymnt_inv
##  Charged Off       :  2   debt_consolidation:589   Min.   :    0  
##  Current           :920   credit_card       :239   1st Qu.: 1803  
##  Fully Paid        : 53   home_improvement  : 56   Median : 2875  
##  In Grace Period   :  9   other             : 42   Mean   : 4039  
##  Late (16-30 days) :  5   major_purchase    : 23   3rd Qu.: 4611  
##  Late (31-120 days): 11   car               : 12   Max.   :36773  
##                           (Other)           : 39                  
##    fico_score        term1        total_pymnt   
##  Min.   :662.0   Min.   :36.00   Min.   : 1390  
##  1st Qu.:672.0   1st Qu.:36.00   1st Qu.:11297  
##  Median :689.5   Median :36.00   Median :20396  
##  Mean   :696.2   Mean   :42.41   Mean   :23058  
##  3rd Qu.:712.0   3rd Qu.:60.00   3rd Qu.:32359  
##  Max.   :827.0   Max.   :60.00   Max.   :76500  
## 
loandatamean <- group_by(loans_summary,purpose) %>% summarise(mean(funded_amnt))

loandatasd <- group_by(loans_summary,purpose) %>% summarise(sd(funded_amnt))

ggplot(loans_summary,aes(x=funded_amnt)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Histogram of funded amount

ggplot(loans_summary,aes(x=funded_amnt)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Histogram of Interest rate %

ggplot(loans_summary,aes(x=int_rate)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#fico score vs interest rate
ggplotly(ggplot(loans_summary,aes(x=fico_score,y=int_rate)) + geom_point(aes(color=purpose),size=1,alpha=0.8) + geom_smooth(method = "lm"))
#funded amount are equal for different purpose

ggplot(loans_summary,aes(x=purpose,y=funded_amnt)) + geom_boxplot()


#Interest rate vs funded amount

ggplot(loans_summary,aes(x=int_rate,y=funded_amnt)) + geom_point(aes(color=term1)) 



# State vs  Interest rate % 

state_interestrate <- loans_summary %>% select(addr_state,int_rate) %>%  group_by(addr_state) %>% summarise(avg =mean(int_rate))

a1 <- list(autotick = TRUE, title = "State")

b1 <- list(title = "Interest rate %")

plot_ly(x = ~state_interestrate$addr_state, y = ~state_interestrate$avg, type = "area") %>% layout(xaxis=a1,yaxis=b1,title="State vs Average interest rate")
## Warning: 'area' objects don't have these attributes: 'x', 'y'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'hoverinfo', 'stream', 'r', 't', 'marker', 'rsrc', 'tsrc', 'key'
#Home ownership vs FICO scores

ggplot(loans_summary,aes(x=fico_score)) + geom_histogram() + facet_grid(.~home_ownership)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Loan request vs month

plot_ly(x = ~loans_summary$issue_d)
## No trace type specified:
##   Based on info supplied, a 'histogram' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#histogram
#remove(loans_sample,loans_summary,loandatamean,loandatasd,a1,p2,p3,p4,p5,a1,b1,g,h)

Confidence interval of the funded amount

#Point estimate for the population funded amount

p = mean(loans_summary$funded_amnt)
n=1000

se = (sd(loans_summary$funded_amnt))/sqrt(1000)

# 95% confidence interval

me = 1.96*se

# Confidence interval of funded amount

normalPlot(p,se,bounds=c(p-me,p+me))

Hypothesis test between the funded amount of RENT vs MORTGAGE

loan_summary1 <- filter(loans_summary,home_ownership==c("RENT","MORTGAGE"))

loan_summary1$home_ownership <- as.character(loan_summary1$home_ownership)


#Hypothesys test
inference(y=loan_summary1$funded_amnt,x=loan_summary1$home_ownership,est="mean",null=0,alternative="twosided",type="ht",method="theoretical")
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_MORTGAGE = 236, mean_MORTGAGE = 17059.43, sd_MORTGAGE = 8780.721
## n_RENT = 187, mean_RENT = 13962.97, sd_RENT = 8424.964
## Observed difference between means (MORTGAGE-RENT) = 3096.46
## 
## H0: mu_MORTGAGE - mu_RENT = 0 
## HA: mu_MORTGAGE - mu_RENT != 0 
## Standard error = 840.4 
## Test statistic: Z =  3.685 
## p-value =  2e-04
# As the p-value is low, we reject the hypothesys

#Confidence interval test
inference(y=loan_summary1$funded_amnt,x=loan_summary1$home_ownership,est="mean",null=0,alternative="twosided",type="ci",conflevel=0.95,method="theoretical")
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_MORTGAGE = 236, mean_MORTGAGE = 17059.43, sd_MORTGAGE = 8780.721
## n_RENT = 187, mean_RENT = 13962.97, sd_RENT = 8424.964
## Observed difference between means (MORTGAGE-RENT) = 3096.46
## 
## Standard error = 840.3997 
## 95 % Confidence interval = ( 1449.3068 , 4743.6133 )