import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import datetime as dt
import sys
import warnings
from matplotlib import style
import seaborn as sns
from matplotlib import rcParams
from scipy import stats
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
path = 'C:/Users/pptallon/Desktop/'
filename = path + 'loanstats3a.csv'
import pandas as pd
df = pd.read_csv(filename)
def resumetable(df):
print(f"Dataset Shape: {df.shape}")
summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
summary = summary.reset_index()
summary['Name'] = summary['index']
summary = summary[['Name','dtypes']]
summary['Missing'] = df.isnull().sum().values
summary['Uniques'] = df.nunique().values
summary['First Value'] = df.loc[0].values
summary['Second Value'] = df.loc[1].values
summary['Third Value'] = df.loc[2].values
for name in summary['Name'].value_counts().index:
summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2)
return summary
#print(summary)
resumetable(df.reset_index())[:50]
#*************************Create Subset of dataset with desired variables *************************************
df_1 = df[['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade',
'sub_grade', 'emp_title','emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status',
'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'earliest_cr_line', 'inq_last_6mths', 'open_acc','revol_bal', 'revol_util',
'total_acc', 'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d']]
# Creating regions
import us
north_east = ['CT','MA','ME','NH','RI','VT']
mid_atlantic = [ 'NJ', 'NY','PA', 'DE', 'DC', 'MD','VA', 'WV']
south_east = ['AL', 'AR','FL', 'GA', 'KY', 'LA','NC', 'SC', 'MS','TN' ]
south_west = ['AZ', 'TX', 'NM', 'OK']
mid_west = ['IA','IL','IN','KS', 'MO', 'NE', 'MI','MN', 'OH', 'SD', 'WI', 'ND']
west = ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
# define the regions
df_1['region'] = np.nan
def finding_regions(state):
if state in north_east:
return 'North_East'
elif state in mid_atlantic:
return 'Mid_Atlantic'
elif state in south_east:
return 'South_East'
elif state in south_west:
return 'South_West'
elif state in mid_west:
return 'Mid_West'
elif state in west:
return 'West'
df_1['region'] = df_1['addr_state'].apply(finding_regions)
regional = df_1.groupby(['region']).agg({'loan_amnt':['sum']}).reset_index()
fig= plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
sns.set(style="white", color_codes=True)
p2=sns.boxplot(x="region", y="loan_amnt", data=df_1)
sns.despine(offset=5, trim=True)
plt.xlabel('region')
ax.set_ylabel('Sum Total Loan by Purpose')
p2.set_title("Median Loan Funded by Region", fontsize=20)
for item in p2.get_xticklabels():
item.set_rotation(45)
plt.show()
# Distribution of loan funded by investors vers loan amount requested by applicants

fig = plt.figure(figsize=(12,12))
p4=sns.kdeplot(df_1['loan_amnt'], shade=True, bw=.05, color="olive")
p4=sns.kdeplot(df_1['funded_amnt_inv'], shade=True, bw=.05, color="orange")
p4.set_title("Distribution of Loan Amount funded and Loan Request", fontsize=18)
sns.despine(offset=5, trim=True)
for item in p4.get_xticklabels():
item.set_rotation(45)
# Note: Fairly consistant; nothing unusual. Looks like the majaority of the amounts requested matched what the investers funded
fig = plt.figure(figsize=(12,12))
# Grouped violinplot
p3=sns.violinplot(x="emp_length", y="loan_amnt", data=df_1, palette="Pastel1")
p3.set_title("Loan Amount and Length of Employment", fontsize=18, color='blue')
for item in p3.get_xticklabels():
item.set_rotation(45)
plt.show()
#Note: looks likes length of employment >= 8 had more loans at higher rate; also less than one year of employment
#had success in obtaining loans as well.
#***********Exploring proportion of loan grades awarded
# counting : this dataframe creates a count for the different grades of loans

x1=df_1.groupby(['grade']).agg({'grade':['count']})
fig = plt.figure(figsize=(12,12))
# Data to plot
labels = 'A', 'B', 'C', 'D', 'E', 'F', 'G'
colors = ['pink', 'blue', 'yellowgreen', 'purple', 'gold', 'turquoise', 'magenta']
explode = (0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.8)
# Plot
patches, texts, autotexts = plt.pie(x1, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=True, startangle=360)
[eachlabel.set_fontsize(12) for eachlabel in texts]
[eachlabel.set_fontsize(12) for eachlabel in autotexts]
[eachlabel.set_color('black') for eachlabel in autotexts]
autotexts[1].set_color('red')
texts[1].set_fontsize(8)
texts[2].set_fontsize(8)
plt.title('Proportion of Loans by Grade', fontsize=20, color='blue')
plt.axis('equal')
plt.show()
#Note: Most (75%) of the loans are of grade: A,B,C
#**************Exploring the home ownership profile of Applicants
# counting : this dataframe creates a count for the different homeownership

x=df_1.groupby(['home_ownership']).agg({'home_ownership':['count']})
#print(x)
fig = plt.figure(figsize=(12,12))
# Data to plot
labels = 'MORTGAGE', 'NONE','OTHER', 'OWN', 'RENT'
colors = ['gold', 'blue', 'yellowgreen', 'coral', 'lightskyblue']
explode = (0.1, 3, 2, 0, 0)
# Plot
patches, texts, autotexts = plt.pie(x, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=True, startangle=160)
[eachlabel.set_fontsize(16) for eachlabel in texts]
[eachlabel.set_fontsize(16) for eachlabel in autotexts]
[eachlabel.set_color('red') for eachlabel in autotexts]
autotexts[4].set_color('black')
texts[1].set_fontsize(10)
texts[2].set_fontsize(10)
plt.title('Home Ownsership Types of Loan Applicants', fontsize=20, color='blue')
plt.axis('equal')
plt.show()
#Note: almost half of the applicants rent, then 44.6 percent have a mortage
#**************Creating a dataframe with new summary variables

x5=df_1.groupby(['purpose']).agg({'purpose':['count'], 'funded_amnt':['mean'], 'annual_inc':['mean'], 'dti':['mean'], 'open_acc':['mean']}).reset_index()
x5.columns = ['Purpose', 'Count', 'Loan', 'Ann_Inc', 'DTI', 'Acc_aver']
x5= x5.sort_values('Count', ascending=True)
x5['Loan']=round(x5 ['Loan'],2)
x5['Ann_Inc']=round(x5['Ann_Inc'],)
x5['DTI']=round(x5['DTI'],2)
x5['Acc_aver']=round(x5['Acc_aver'],2)
#print(x5)
#Purpose of loan and relationship with number of accoutns open
fig = plt.figure(figsize=(18,18))
ax = fig.add_subplot(111)
sns.set(style="white", color_codes=True)
sns.boxplot(x="Purpose", y="Acc_aver", data=x5)
p1=sns.boxplot(y=df_1['open_acc'].astype('float'), x=df_1['purpose'], data=df_1)
#ax = sns.boxplot(...)
ax.get_xticklabels()
sns.despine(offset=5, trim=True)
for item in p1.get_xticklabels():
item.set_rotation(45)
p1.set_ylabel("Number of open Accounts", fontsize=12)
p1.set_title("Distribution of Loan Amount Borrowed \n with Number of Accounts open", fontsize=18)
plt.subplots_adjust(hspace = 1, top = 0.9)
plt.show()
#**************exploring purpose of loan and award according to debt income index************

fig = plt.figure(figsize=(12,12))
sns.set(style="darkgrid")
p9=sns.relplot(x="Loan", y="Purpose", hue="DTI",palette="ch:r=-.5,l=.75", data=x5)
plt.title('Loan Purpose, amount and relationship with Debt Inc. Ratio', fontsize=15)
plt.show()
# Libraries

import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(12,12))
x5_df = x5.set_index('Purpose')
my_palette = dict(zip(x5_df.DTI.unique(), ["orange","yellow","brown"]))
row_colors = x5_df.DTI.map(my_palette)
# plot
sns.clustermap(x5_df, metric="correlation", method="single", cmap="Blues", standard_scale=1, row_colors=row_colors)
layout = {"title": "Correlation Heatmap"}
plt.title('Relationship by correlating Loan Purpose, annual income and relationship with Debt Inc. Ratio', fontsize=10)
plt.show()
### ******************************CREATING A SOLID DATA FRAME WITH THE NAMES OF THE COLUMNS********************

x3=df_1.groupby(['sub_grade']).agg({'sub_grade':['count'], 'annual_inc':['mean'], 'total_acc':['sum', 'mean'],
'revol_bal':['mean']}).reset_index()
x3.columns = ['Sub_Grade', 'Count', 'Aver_AI', 'Sum_ACC', 'Aver_ACC', 'Aver_RB']
x3= x3.sort_values('Sub_Grade', ascending=True)
x3['Aver_AI']=round(x3['Aver_AI'],2)
x3['Aver_ACC']=round(x3['Aver_ACC'],)
x3['Aver_RB']=round(x3['Aver_RB'],2)
x3['Count']=round(x3['Count'],2)
#print(x3)
#specify my data so that i dont have to repeat loc all teh time
mydata=x3.loc[0:14, :]
mydata=mydata.sort_values('Count', ascending=True) #sort data
mydata.reset_index(inplace=True) #reset index
mydata['Aver_ACC']=round(mydata['Aver_ACC'],0)
#print(mydata)
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111)
plt.barh(mydata.Sub_Grade, mydata.Count) # horizontal bars
for i, v in enumerate(mydata.Count):
ax.text(v+1, i-0.1, str(v), color='blue', fontweight='bold') # this is to label the bars
# labels and title
plt.xlabel('Count of loans for each Grade type')
ax.set_ylabel('Loan Grade')
plt.title('Loan by Grade Type Analysis', fontsize=18)
plt.show()
#-----------------------------add a second Y to the basic BAR GRAPH--------------------------------#

ind = np.arange(15)
fig = plt.figure(figsize = (15, 15))
ax = fig.add_subplot(111) # i row, 1 col
ax2 = ax.twinx() # box axes share the same x axis
bar_width = 0.4
opacity = 0.8 # control
style.use('ggplot')
rect1 = ax.bar(ind-(0.5*bar_width), mydata.Count, bar_width, alpha=opacity, color='b', label='Count')
rect2 = ax2.bar(ind+(0.5*bar_width), mydata.Aver_ACC, bar_width, alpha=opacity, color='g', label='Aver_AI')
# labels and titles
plt.xlabel('Loan Grade')
ax.set_ylabel('Count of loans by Grade type')
ax2.set_ylabel('Average # of Revolving Accounts Open')
plt.title('Loan Count by Grade Type versus # of Revolving Accounts of Loan Applicants')
# specify the ticks
ax.set_xticks(ind)
ax.set_xticklabels(mydata.Sub_Grade)
# add legend symbol and labels
lines, labels = ax.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
legend = ax.legend(lines + lines2, labels+labels2, loc="upper left", frameon=True, ncol=1, shadow=True, borderpad=1,
fontsize=12)
# add text on the top of the bars : passing into function seveal things decimal, axis symbols and the rects
def autolabel(rects, p, ax, symbol):
for eachrect in rects:
h = eachrect.get_height()
ax.text(eachrect.get_x()+eachrect.get_width()/2, h*.9, symbol+format(h), fontsize=12, ha='center',
color='black',va='bottom')
autolabel(rect1,'1f', ax, '')
autolabel(rect2, '1f', ax2, '')
####****************Creating a dataframe for evaluating the top 15 states: those having the most loans with a condition****
x4=df_1.groupby(['addr_state']).agg({'addr_state':['count'], 'annual_inc':['mean'], 'total_acc':['sum', 'mean'],
'revol_bal':['mean']}).reset_index()
x4.columns = ['addr_state', 'Count', 'Aver_AI', 'Sum_ACC', 'Aver_ACC', 'Aver_RB']
x4['Aver_AI']=round(x4['Aver_AI'],2)
x4['Aver_ACC']=round(x4['Aver_ACC'],0)
x4['Aver_RB']=round(x4['Aver_RB'],2)
x4['Count']=round(x4['Count'],0)
x4['Aver_ACC']=round(x4['Aver_ACC'],0)
x4= x4.sort_values('Count', ascending=True)
#print(x4)
#*******************creating subset: TOP ADDRESS STATES OF APPLICANTS*******************************
top_states=x4.query('Count >= 837' )
print(top_states)
#x4[(x4.Count > 837 = True)]
#print(x4.Count)
#****************top 15 states with the most loans***************
fig = plt.figure(figsize=(12,12))
ax=fig.add_subplot(111)
plt.bar(top_states.addr_state, top_states.Count)
p10=barlist=plt.bar([1,2,3,4,5,6,7,8,9,10,11,12,13,14], [1,2,3,4,5,6,7,8,9,10,11,12,13,14])
barlist[0].set_color('blue')
plt.title('Top 15 states with the most loans awards', fontsize=15)
#p10.set_xlabel('States')
#p10.set_ylabel('Total number of loans')
def autolabel(rects, p, ax, symbol):
for eachrect in rects:
h = eachrect.get_height()
ax.text(eachrect.get_x()+eachrect.get_width()/2, h*0.0, symbol+format(h), fontweight='bold', fontsize=12, ha='center',
color='black',va='bottom')
autolabel(rect1,'1f', ax, '')
plt.show()

#my_fig.write_html("c:/Users/pptallon/Desktop/go1.html")
