library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.88 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.5
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.0.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.0.5
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
loan = read_csv("/Users/Sandra/Desktop/NEC/MS-Data Analytics/Data Analytics/Term 1/Data Science Tools/Week 11/Final Case Analysis/data2013.csv" , na = "")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   X1 = col_double(),
##   loan_amnt = col_double(),
##   annual_inc = col_double(),
##   dti = col_double(),
##   delinq_2yrs = col_double(),
##   inq_last_6mths = col_double(),
##   mths_since_last_delinq = col_double(),
##   open_acc = col_double(),
##   total_pymnt = col_double(),
##   total_rec_int = col_double(),
##   last_pymnt_amnt = col_double(),
##   tot_cur_bal = col_double(),
##   acc_open_past_24mths = col_double(),
##   pub_rec_bankruptcies = col_double(),
##   `Orig. Index` = col_double(),
##   issue_Year = col_double()
## )
## i Use `spec()` for the full column specifications.
dim(loan)
## [1] 134814     32
colnames(loan)
##  [1] "X1"                     "loan_amnt"              "term"                  
##  [4] "int_rate"               "grade"                  "sub_grade"             
##  [7] "emp_title"              "emp_length"             "home_ownership"        
## [10] "annual_inc"             "issue_d"                "loan_status"           
## [13] "purpose"                "title"                  "addr_state"            
## [16] "dti"                    "delinq_2yrs"            "inq_last_6mths"        
## [19] "mths_since_last_delinq" "open_acc"               "total_pymnt"           
## [22] "total_rec_int"          "last_pymnt_d"           "last_pymnt_amnt"       
## [25] "last_credit_pull_d"     "application_type"       "tot_cur_bal"           
## [28] "acc_open_past_24mths"   "pub_rec_bankruptcies"   "Orig. Index"           
## [31] "issue_Month"            "issue_Year"
ggplot(data=loan, aes(loan_amnt))+geom_histogram(bins = 40,color="blue",fill="purple")
## Warning: Removed 1 rows containing non-finite values (stat_bin).

Desc(loan$loan_amnt, main = "Loan amount distribution", plotit = TRUE)
## ------------------------------------------------------------------------------ 
## Loan amount distribution
## 
##      length         n       NAs     unique         0s       mean     meanCI'
##     134'814   134'813         1      1'221          0  14'698.89  14'655.67
##                100.0%      0.0%                  0.0%             14'742.12
##                                                                            
##         .05       .10       .25     median        .75        .90        .95
##    4'000.00  5'000.00  8'500.00  13'000.00  20'000.00  25'995.00  30'000.00
##                                                                            
##       range        sd     vcoef        mad        IQR       skew       kurt
##   34'000.00  8'097.72      0.55   7'783.65  11'500.00       0.66      -0.20
##                                                                            
## lowest : 1'000.0 (350), 1'025.0 (2), 1'075.0, 1'100.0 (7), 1'125.0 (3)
## highest: 34'825.0, 34'900.0 (2), 34'925.0, 34'975.0 (6), 35'000.0 (4'316)
## 
## heap(?): remarkable frequency (7.3%) for the mode(s) (= 10000)
## 
## ' 95%-CI (classic)

Desc(loan$purpose, main = "Loan purposes", plotit = TRUE)
## ------------------------------------------------------------------------------ 
## Loan purposes
## 
##    length       n     NAs  unique  levels   dupes
##   134'814 134'813       1      13      13       y
##            100.0%    0.0%                        
## 
##                  level    freq   perc  cumfreq  cumperc
## 1   debt_consolidation  80'635  59.8%   80'635    59.8%
## 2          credit_card  32'797  24.3%  113'432    84.1%
## 3     home_improvement   7'394   5.5%  120'826    89.6%
## 4                other   5'851   4.3%  126'677    94.0%
## 5       major_purchase   2'302   1.7%  128'979    95.7%
## 6       small_business   1'356   1.0%  130'335    96.7%
## 7                  car   1'052   0.8%  131'387    97.5%
## 8              medical     890   0.7%  132'277    98.1%
## 9                house     676   0.5%  132'953    98.6%
## 10              moving     643   0.5%  133'596    99.1%
## 11             wedding     600   0.4%  134'196    99.5%
## 12            vacation     566   0.4%  134'762   100.0%
## ... etc.
##  [list output truncated]

ggplot(loan, aes(x=grade, y=loan_amnt, fill=grade)) +
  stat_summary(fun.y="sum", geom="bar") +
  labs(y ="Total Loan Amount",title="Total loan amount based on loan grade")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 1 rows containing non-finite values (stat_summary).

ggplot(data=loan, aes(grade,int_rate,fill=grade))+geom_boxplot(outlier.color = "blue")+labs(title="Box plot of Interest rate")

ggplot(data=loan,aes(loan_amnt, fill=grade))+
  geom_density(alpha=0.25) + 
  facet_grid(grade ~ .)
## Warning: Removed 1 rows containing non-finite values (stat_density).

hist1=sqldf("select count(*) as freq,grade,int_rate from loan group by grade,int_rate")

p <- ggplot(data = hist1, aes(x = int_rate, y = freq)) +
  geom_smooth(aes(colour = grade, fill = grade)) + facet_wrap(~ grade) + labs(x = "Interest Rate", y="Frequency")
 

ggplotly(p) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
unique(loan$addr_state)
##  [1] "MA" "NY" "CO" "CA" "TX" "MI" "CT" "NC" "FL" "NM" "IN" "SC" "KY" "UT" "GA"
## [16] "NJ" "OH" "DE" "PA" "AZ" "IL" "WI" "MO" "WY" "TN" "WA" "MD" "VA" "MN" "NV"
## [31] "NH" "KS" "AL" "RI" "WV" "LA" "OR" "AR" "VT" "OK" "HI" "MT" "AK" "SD" "DC"
## [46] "NE" "IA" "MS" "ID" NA
suppressPackageStartupMessages(library(maps))
## Warning: package 'maps' was built under R version 4.0.5
loan$region <- loan$addr_state
loan$region <- as.factor(loan$region)
levels(loan$region)<- c("alaska", "alabama","arkansas", "arizona", "california","colorado","connecticut","district of columbia","delaware","florida","georgia","hawaii","iowa","idaho","illinois","indiana","kansas","kentucky","louisiana","massachusetts","maryland","maine","michigan","minnesota","missouri","mississippi","montana","north carolina","north dakota","nebraska","new hampshire","new jersey","new mexico","nevada","new york","ohio","oklahoma","oregon","pennsylvania","rhode island","south carolina","south dakota","tennessee","texas","utah","virginia","vermont","washington","wisconsin","west virginia","wyoming")

all_states <- map_data("state")
state_by_loan <-loan %>% group_by(region) %>%
                summarise(value = sum(loan_amnt, na.rm=TRUE))
state_by_loan$region <- as.character(state_by_loan$region)

Total <- merge(all_states, state_by_loan, by="region")

p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$value),colour="white"
      ) + scale_fill_continuous(low = "skyblue", high = "darkblue", guide="colorbar")
P1 <- p + theme_bw()  + labs(fill = "Gradient of loan amount" 
                            ,title = "Heat Map of loan amount in all states", x="", y="")
P1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border =  element_blank())
## Warning: Use of `Total$value` is discouraged. Use `value` instead.

ggplot(data = loan, aes(x=addr_state, fill=loan_status)) +
  labs(x="State", y="Total Loan issued") +
  geom_bar() +
  coord_flip()