library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.88 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.5
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.0.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.0.5
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
loan = read_csv("/Users/Sandra/Desktop/NEC/MS-Data Analytics/Data Analytics/Term 1/Data Science Tools/Week 11/Final Case Analysis/data2013.csv" , na = "")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## X1 = col_double(),
## loan_amnt = col_double(),
## annual_inc = col_double(),
## dti = col_double(),
## delinq_2yrs = col_double(),
## inq_last_6mths = col_double(),
## mths_since_last_delinq = col_double(),
## open_acc = col_double(),
## total_pymnt = col_double(),
## total_rec_int = col_double(),
## last_pymnt_amnt = col_double(),
## tot_cur_bal = col_double(),
## acc_open_past_24mths = col_double(),
## pub_rec_bankruptcies = col_double(),
## `Orig. Index` = col_double(),
## issue_Year = col_double()
## )
## i Use `spec()` for the full column specifications.
dim(loan)
## [1] 134814 32
colnames(loan)
## [1] "X1" "loan_amnt" "term"
## [4] "int_rate" "grade" "sub_grade"
## [7] "emp_title" "emp_length" "home_ownership"
## [10] "annual_inc" "issue_d" "loan_status"
## [13] "purpose" "title" "addr_state"
## [16] "dti" "delinq_2yrs" "inq_last_6mths"
## [19] "mths_since_last_delinq" "open_acc" "total_pymnt"
## [22] "total_rec_int" "last_pymnt_d" "last_pymnt_amnt"
## [25] "last_credit_pull_d" "application_type" "tot_cur_bal"
## [28] "acc_open_past_24mths" "pub_rec_bankruptcies" "Orig. Index"
## [31] "issue_Month" "issue_Year"
ggplot(data=loan, aes(loan_amnt))+geom_histogram(bins = 40,color="blue",fill="purple")
## Warning: Removed 1 rows containing non-finite values (stat_bin).

Desc(loan$loan_amnt, main = "Loan amount distribution", plotit = TRUE)
## ------------------------------------------------------------------------------
## Loan amount distribution
##
## length n NAs unique 0s mean meanCI'
## 134'814 134'813 1 1'221 0 14'698.89 14'655.67
## 100.0% 0.0% 0.0% 14'742.12
##
## .05 .10 .25 median .75 .90 .95
## 4'000.00 5'000.00 8'500.00 13'000.00 20'000.00 25'995.00 30'000.00
##
## range sd vcoef mad IQR skew kurt
## 34'000.00 8'097.72 0.55 7'783.65 11'500.00 0.66 -0.20
##
## lowest : 1'000.0 (350), 1'025.0 (2), 1'075.0, 1'100.0 (7), 1'125.0 (3)
## highest: 34'825.0, 34'900.0 (2), 34'925.0, 34'975.0 (6), 35'000.0 (4'316)
##
## heap(?): remarkable frequency (7.3%) for the mode(s) (= 10000)
##
## ' 95%-CI (classic)

Desc(loan$purpose, main = "Loan purposes", plotit = TRUE)
## ------------------------------------------------------------------------------
## Loan purposes
##
## length n NAs unique levels dupes
## 134'814 134'813 1 13 13 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 debt_consolidation 80'635 59.8% 80'635 59.8%
## 2 credit_card 32'797 24.3% 113'432 84.1%
## 3 home_improvement 7'394 5.5% 120'826 89.6%
## 4 other 5'851 4.3% 126'677 94.0%
## 5 major_purchase 2'302 1.7% 128'979 95.7%
## 6 small_business 1'356 1.0% 130'335 96.7%
## 7 car 1'052 0.8% 131'387 97.5%
## 8 medical 890 0.7% 132'277 98.1%
## 9 house 676 0.5% 132'953 98.6%
## 10 moving 643 0.5% 133'596 99.1%
## 11 wedding 600 0.4% 134'196 99.5%
## 12 vacation 566 0.4% 134'762 100.0%
## ... etc.
## [list output truncated]

ggplot(loan, aes(x=grade, y=loan_amnt, fill=grade)) +
stat_summary(fun.y="sum", geom="bar") +
labs(y ="Total Loan Amount",title="Total loan amount based on loan grade")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 1 rows containing non-finite values (stat_summary).

ggplot(data=loan, aes(grade,int_rate,fill=grade))+geom_boxplot(outlier.color = "blue")+labs(title="Box plot of Interest rate")

ggplot(data=loan,aes(loan_amnt, fill=grade))+
geom_density(alpha=0.25) +
facet_grid(grade ~ .)
## Warning: Removed 1 rows containing non-finite values (stat_density).

hist1=sqldf("select count(*) as freq,grade,int_rate from loan group by grade,int_rate")
p <- ggplot(data = hist1, aes(x = int_rate, y = freq)) +
geom_smooth(aes(colour = grade, fill = grade)) + facet_wrap(~ grade) + labs(x = "Interest Rate", y="Frequency")
ggplotly(p)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
unique(loan$addr_state)
## [1] "MA" "NY" "CO" "CA" "TX" "MI" "CT" "NC" "FL" "NM" "IN" "SC" "KY" "UT" "GA"
## [16] "NJ" "OH" "DE" "PA" "AZ" "IL" "WI" "MO" "WY" "TN" "WA" "MD" "VA" "MN" "NV"
## [31] "NH" "KS" "AL" "RI" "WV" "LA" "OR" "AR" "VT" "OK" "HI" "MT" "AK" "SD" "DC"
## [46] "NE" "IA" "MS" "ID" NA
suppressPackageStartupMessages(library(maps))
## Warning: package 'maps' was built under R version 4.0.5
loan$region <- loan$addr_state
loan$region <- as.factor(loan$region)
levels(loan$region)<- c("alaska", "alabama","arkansas", "arizona", "california","colorado","connecticut","district of columbia","delaware","florida","georgia","hawaii","iowa","idaho","illinois","indiana","kansas","kentucky","louisiana","massachusetts","maryland","maine","michigan","minnesota","missouri","mississippi","montana","north carolina","north dakota","nebraska","new hampshire","new jersey","new mexico","nevada","new york","ohio","oklahoma","oregon","pennsylvania","rhode island","south carolina","south dakota","tennessee","texas","utah","virginia","vermont","washington","wisconsin","west virginia","wyoming")
all_states <- map_data("state")
state_by_loan <-loan %>% group_by(region) %>%
summarise(value = sum(loan_amnt, na.rm=TRUE))
state_by_loan$region <- as.character(state_by_loan$region)
Total <- merge(all_states, state_by_loan, by="region")
p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$value),colour="white"
) + scale_fill_continuous(low = "skyblue", high = "darkblue", guide="colorbar")
P1 <- p + theme_bw() + labs(fill = "Gradient of loan amount"
,title = "Heat Map of loan amount in all states", x="", y="")
P1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border = element_blank())
## Warning: Use of `Total$value` is discouraged. Use `value` instead.

ggplot(data = loan, aes(x=addr_state, fill=loan_status)) +
labs(x="State", y="Total Loan issued") +
geom_bar() +
coord_flip()
