Data Visualization - Python (no Tabs or TOC)

Analysis of Citations Issued by Baltimore City (2013-2020)

Here is some general information on my data. Let’s now look at the results.

First up is a Bar Chart

Here’s what I found in my bar chart.

# Workaround added after errors were seen when knitting this R Markdown
# document: set the Qt platform-plugin search path to the local Anaconda
# install.
# NOTE(review): the path is machine-specific, and presumably it has to be set
# before matplotlib is imported below -- confirm before reordering.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'c:/users/pptallon/Anaconda3/Library/plugins/platforms'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Suppress every warning from this point on. This also hides deprecation
# notices raised by pandas/matplotlib in the code that follows.
warnings.filterwarnings("ignore")

# Folder holding the course data files (absolute, machine-specific path).
path = "C:/Users/pptallon/Dropbox/G/Teaching/Data Visualization Data Files/Python_datafiles/"

filename = path + 'Baltimore_Traffic_Citations.csv'

# Load only the three columns that the charts in this document use.
df = pd.read_csv(filename, usecols = ['Tag', 'ViolFine', 'ViolDate'])

# Assign the filled column back instead of calling fillna(inplace=True) on
# df.Tag: the in-place form acts on an intermediate object and does not update
# df under pandas copy-on-write, which would leave missing tags as NaN and let
# the groupby below silently drop them.
df['Tag'] = df['Tag'].fillna("Not Available")

# Parse the timestamp once, then derive the calendar fields used for grouping.
df['ViolDate'] = pd.to_datetime(df['ViolDate'], format = '%m/%d/%Y %I:%M:%S %p')
df['Hour'] = df.ViolDate.dt.hour
df['Day'] = df.ViolDate.dt.day
df['Month'] = df.ViolDate.dt.month
df['Year'] = df.ViolDate.dt.year
df['WeekDay'] = df.ViolDate.dt.strftime('%a')
df['MonthName'] = df.ViolDate.dt.strftime('%b')

# One row per tag: citation count, total fines and average fine,
# ordered from most to least cited.
x = df.groupby(['Tag']).agg({'Tag':['count'], 'ViolFine':['sum', 'mean']}).reset_index()
x.columns = ['Tag', 'Count', 'TotalFines', 'AverFine']
x = x.sort_values('Count', ascending=False)

# Flag tags that contain "TAG"/"Tag" or the missing-value placeholder as
# possibly bad, then exempt the specific tags listed in KeepRows.
PossibleBadTags = x['Tag'].str.contains('TAG|Not Available|Tag')
PossibleBadRows = x[PossibleBadTags]
KeepRows = 'SCRTAGT|LITAG8R|V1NTAGE|WTTAGW1|DATAGUY|TAG944|TAGDAT|TAG DAT|TAG ART'
# ~ is the logical-not operator for boolean masks (unary minus is arithmetic).
deleteRows = PossibleBadRows[~PossibleBadRows.Tag.str.contains(KeepRows)]

# Totals of the rows that are about to be collapsed into one "Missing" entry.
a = deleteRows.Count.sum()
b = deleteRows.TotalFines.sum()
c = b/a

# Drop the flagged tags, append the combined "Missing" row under a fresh index
# label, and restore the descending-count order with a clean 0..n-1 index.
x = x[~x['Tag'].isin(deleteRows.Tag)]
x.loc[x.index.max()+1] = ['Missing', a, b, c]
x = x.sort_values('Count', ascending=False)
x.reset_index(inplace=True, drop=True)

def pick_colors_according_to_mean_count(this_data):
    """Return one bar colour per row, chosen by comparing Count to its mean.

    Rows whose ``Count`` is more than 1% above the mean of
    ``this_data.Count`` get ``'lightcoral'``, rows more than 1% below it get
    ``'green'``, and everything in between gets ``'black'``.  The returned
    list follows the row order of ``this_data``.
    """
    mean_count = this_data.Count.mean()
    upper_limit = mean_count*1.01
    lower_limit = mean_count*0.99

    def shade_for(value):
        # The "above" test is applied first, then the "below" test.
        if value > upper_limit:
            return 'lightcoral'
        if value < lower_limit:
            return 'green'
        return 'black'

    return [shade_for(value) for value in this_data.Count]
 
import matplotlib.patches as mpatches

# Rows 1..250 of the count-sorted table (label-based slice, both ends
# inclusive); row 0 is deliberately left out of the chart.
bottom1 = 1
top1 = 250
d1 = x.loc[bottom1:top1]
my_colors1 = pick_colors_according_to_mean_count(d1)
mean1 = d1.Count.mean()

# Rows 1..10 of the same table for the zoomed-in panel.
bottom2 = 1
top2 = 10
d2 = x.loc[bottom2:top2]
my_colors2 = pick_colors_according_to_mean_count(d2)
mean2 = d2.Count.mean()

# Legend entries matching the colours assigned by
# pick_colors_according_to_mean_count.
Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 1% of the Average')
Below = mpatches.Patch(color='green', label='Below Average')

fig = plt.figure(figsize=(18, 16))
fig.suptitle('Frequency of Citation Analysis by Registration Tag:\n Top ' + str(top1) + ' and Top ' + str(top2), 
             fontsize=18, fontweight='bold')

# Upper panel: top-250 tags.
ax1 = fig.add_subplot(2, 1, 1)
ax1.bar(d1.Tag, d1.Count, label='Count', color=my_colors1)

ax1.legend(handles=[Above, At, Below], fontsize=14)
# Draw on the axes object explicitly instead of through pyplot's implicit
# "current axes", so the line cannot land on the wrong panel if the code is
# reordered.
ax1.axhline(mean1, color='black', linestyle='dashed')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
# 250 tag labels would be unreadable, so the x axis is hidden on this panel.
ax1.xaxis.set_visible(False)
ax1.set_title('Top ' + str(top1) + ' Citations', size=20)
ax1.text(top1-10, mean1+5, 'Mean = ' + str(mean1), rotation=0, fontsize=14 )

# Lower panel: top-10 tags.
ax2 = fig.add_subplot(2, 1, 2)
ax2.bar(d2.Tag, d2.Count, label='Count', color=my_colors2)

ax2.legend(handles=[Above, At, Below], fontsize=14)
ax2.axhline(mean2, color='black', linestyle='solid')
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.set_title('Top ' + str(top2) + ' Citations', size=20)
ax2.text(top2-1, mean2+5, 'Mean = ' + str(mean2), rotation=0, fontsize=14 )

fig.subplots_adjust(hspace = 0.35)

plt.show()

Next up is a Dual-Axis Bar Chart

Here’s what I found in my next bar chart.

def autolabel(these_bars, this_ax, place_of_decimals, symbol):
    """Write a formatted value label just above each bar.

    these_bars        -- iterable of bar objects exposing get_height(),
                         get_x() and get_width()
    this_ax           -- axes object whose text() method draws the label
    place_of_decimals -- format spec applied to the bar height, e.g. '.2f'
    symbol            -- prefix placed in front of the formatted height
    """
    label_style = dict(fontsize=11, color='black', ha='center', va='bottom')
    for bar in these_bars:
        bar_height = bar.get_height()
        # Horizontal centre of the bar.
        x_center = bar.get_x()+bar.get_width()/2
        caption = symbol+format(bar_height, place_of_decimals)
        # 1% above the bar top, anchored at the bottom of the text.
        this_ax.text(x_center, bar_height*1.01, caption, **label_style)
                    
# Grouped bar chart on two y axes: citation count (left axis) next to the
# average fine (right axis) for each tag in d2.
fig = plt.figure(figsize=(18, 10))
ax1 = fig.add_subplot(1, 1, 1)
ax2 = ax1.twinx()
bar_width = 0.4

# One slot per row of d2. Deriving the positions from the data keeps the bar
# and label arrays the same length even when d2 has fewer than ten rows.
x_pos = np.arange(len(d2))
# Shift each series half a bar width either side of the slot centre so the
# two bars sit side by side.
count_bars = ax1.bar(x_pos-(0.5*bar_width), d2.Count, bar_width, color='gray', edgecolor='black', label='Citation Count')
aver_fine_bars = ax2.bar(x_pos+(0.5*bar_width), d2.AverFine, bar_width, color='green', edgecolor='black', label='Average Fine')

ax1.set_xlabel('Vehicle Tag', fontsize=18)
ax1.set_ylabel('Count of Citations', fontsize=18, labelpad=20)
ax2.set_ylabel('Average Fine', fontsize=18, rotation=270, labelpad=20)
ax1.tick_params(axis='y', labelsize=14)
ax2.tick_params(axis='y', labelsize=14)

plt.title('Citation Count and Average Fine Analysis\n Top 10 Most Frequently Cited Tags', fontsize=18)
ax1.set_xticks(x_pos)

ax1.set_xticklabels(d2.Tag, fontsize=14)

# Merge the legend entries of both axes into a single legend box.
count_color, count_label = ax1.get_legend_handles_labels()
fine_color, fine_label   = ax2.get_legend_handles_labels()
legend = ax1.legend(count_color + fine_color, count_label + fine_label, loc='upper left', frameon=True, ncol=1, shadow=True,
                   borderpad=1, fontsize=14)
# Extend the count axis to 1.5x the tallest bar.
ax1.set_ylim(0, d2.Count.max()*1.50)

autolabel(count_bars, ax1, '.0f', '')
autolabel(aver_fine_bars, ax2, '.2f', '$')

plt.show()

Line Plots

Here is what I found.

# Total fines for every (hour of day, weekday) combination.
fine_df = df.groupby(['Hour', 'WeekDay'])['ViolFine'].sum().reset_index(name='TotalFines')

from matplotlib.ticker import FuncFormatter

fig = plt.figure(figsize = (18, 10))
ax = fig.add_subplot(1, 1, 1)

# Line colour per weekday; the insertion order (Mon..Sun) is also the order
# used for the legend below.
my_colors = {'Mon':'blue',
             'Tue':'red',
             'Wed':'green',
             'Thu':'gray',
             'Fri':'purple',
             'Sat':'gold',
             'Sun':'brown'}

# Group by the bare column name, not a one-element list: with a list, recent
# pandas versions yield 1-tuples such as ('Mon',) as the key, which is not a
# key of my_colors.
for key, grp in fine_df.groupby('WeekDay'):
    grp.plot(ax=ax, kind='line', x='Hour', y ='TotalFines', color=my_colors[key], label=key, marker='8')

ax.set_title('Total Fines by Hour', fontsize=18)
ax.set_xlabel('Hour (24 Hour Interval)', fontsize=18)
ax.set_ylabel('Total Fines ($M)', fontsize=18, labelpad=20)  
ax.tick_params(axis='x', labelsize=14, rotation=0)
ax.tick_params(axis='y', labelsize=14, rotation=0)

ax.set_xticks(np.arange(24))

# groupby iterates weekdays alphabetically, so reorder the legend Mon..Sun.
# Handles are looked up by label rather than by fixed position so a weekday
# with no data cannot shift or break the ordering.
handles, labels = ax.get_legend_handles_labels()
handle_by_day = dict(zip(labels, handles))
ordered_days = [day for day in my_colors if day in handle_by_day]
ax.legend([handle_by_day[day] for day in ordered_days], ordered_days, loc='best', fontsize=14, ncol=1)

# Show y ticks in millions of dollars, e.g. 1500000 -> "$1.5M".
ax.yaxis.set_major_formatter( FuncFormatter( lambda value, pos:('$%1.1fM')%(value*1e-6)))

plt.show()

Gotta Love Pie Charts

I had to shrink both the text sizes and the figure size of this pie chart to get it to look this way.

# Nested pie chart: outer ring = total fines per quarter, inner ring = total
# fines per month, with the grand total printed in the centre hole.
df['Quarter'] = 'Quarter ' + df.ViolDate.dt.quarter.astype('string')

pie_df = df.groupby(['Quarter', 'MonthName', 'Month'])['ViolFine'].sum().reset_index(name='TotalFines')

# Month is only carried along to put the rows in calendar order; it is
# dropped once the sort is done.
pie_df.sort_values(by=['Month'], inplace=True)

pie_df.reset_index(inplace=True, drop=True)

del pie_df['Month']

# Colour indices into the colormap: every 4th index is used for a quarter
# (outer ring), the remaining indices are used for the months (inner ring).
number_outside_colors = len(pie_df.Quarter.unique())
outside_color_ref_number = np.arange(number_outside_colors)*4

number_inside_colors = len(pie_df.MonthName.unique())
all_color_ref_number = np.arange(number_outside_colors + number_inside_colors)

inside_color_ref_number = [each for each in all_color_ref_number
                           if each not in outside_color_ref_number]

fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(1, 1, 1)

colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)

all_fines = pie_df.TotalFines.sum()

# Outer ring. autopct receives the wedge percentage and converts it back to
# dollars (in millions) using the grand total.
# edgecolor uses lowercase 'w': single-letter colour names are case-sensitive
# in current Matplotlib, so 'W' is rejected as an invalid colour.
pie_df.groupby(['Quarter'])['TotalFines'].sum().plot(
       ax=ax,
       kind='pie', radius=1, colors = outer_colors, pctdistance = 0.85, labeldistance = 1.1,
       wedgeprops = dict(edgecolor='w'), textprops= {'fontsize':13},
       autopct = lambda p: '{:.2f}%\n(${:.1f}M)'.format(p,(p/100)*all_fines/1e+6),
       startangle=90)

# Inner ring, drawn on the same axes with a smaller radius.
inner_colors = colormap(inside_color_ref_number)
pie_df.TotalFines.plot(
       ax=ax,
       kind='pie', radius=0.7, colors = inner_colors, pctdistance = 0.55, labeldistance = 0.8,
       wedgeprops = dict(edgecolor='w'), textprops= {'fontsize':11},
       labels = pie_df.MonthName,
       autopct = '%1.2f%%',
       startangle=90)

# White disc over the centre turns the pie into a donut.
hole = plt.Circle((0,0), 0.3, fc='white')
ax.add_artist(hole)

ax.yaxis.set_visible(False)
ax.set_title('Total Fines by Quarter and Month', fontsize=14)

ax.text(0, 0, 'Total Fines\n' + '$' + str(round(all_fines/1e6,2)) + 'M', size=13, ha='center', va='center'   )

ax.axis('equal')

plt.tight_layout()
plt.show()

Plotly is Nice too

To include this plotly chart, I had to output it as an HTML file and then read it immediately back in. I commented out the line for fig.show() and added some extra lines below it to write the chart to an HTML file.

# Waterfall chart of the monthly deviation between actual 2019 fines and a
# flat monthly budget, ending with a bar for the year's total deviation.
wf_df = df[df['Year'] == 2019].groupby(['MonthName'])['ViolFine'].sum().reset_index(name='TotalFines')

# Flat budget applied to every month.
wf_df['Budget'] = 4.6e6

wf_df['Deviation'] = wf_df.TotalFines - wf_df.Budget

# Append a summary row under a fresh index label.
wf_df.loc[wf_df.index.max()+1] = ['Total',
                                  wf_df.TotalFines.sum(),
                                  wf_df.Budget.sum(),
                                  wf_df.TotalFines.sum() - wf_df.Budget.sum()]

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Total']

# An ordered categorical makes the sort follow calendar order (with 'Total'
# last) instead of alphabetical order.
wf_df['MonthName'] = pd.Categorical(wf_df['MonthName'], categories = months, ordered = True)

wf_df.sort_values(by='MonthName', inplace=True)

wf_df.reset_index(inplace = True, drop = True)

import plotly.graph_objects as go

# 'Total' is the last category, so after the sort it is the last row. Reading
# it by position (instead of the fixed label 12) keeps this working when a
# month has no data and the frame has fewer than 13 rows.
total_deviation = wf_df['Deviation'].iloc[-1]
if total_deviation > 0:
    end_color = 'black'
elif total_deviation < 0:
    end_color = 'red'
else:
    end_color = 'blue'

# Every month is a relative step; only the final row is a total bar.
measures = ['relative'] * (len(wf_df) - 1) + ['total']

fig = go.Figure( go.Waterfall( name='', orientation = 'v', x = wf_df['MonthName'], textposition='outside',
                              measure = measures,
                              y = wf_df['Deviation']/1e6,
                              text = ['${:.2f}M'.format(each/1e6) for each in wf_df['TotalFines']],
                              decreasing = {'marker':{'color':'red'}},
                              increasing = {'marker':{'color':'green'}},
                              totals     = {'marker':{'color': end_color}},
                              hovertemplate = 'Cumulative Deviation to Date: ' + '$%{y:,.2f}M' + '<br>' +
                                              'Total Fines in %{x}: %{text}'))

fig.layout = go.Layout(yaxis=dict(tickformat='.1f'))

fig.update_xaxes(title_text='Months', title_font = {'size': 18})

fig.update_yaxes(title_text='Total Fines (Running Total $M)', title_font = {'size':18}, 
                 dtick=0.5, tickprefix = '$', ticksuffix = 'M', zeroline=True  )

fig.update_layout(title = dict( text='Deviation between Actual and Budgeted Monthly Fines in 2019 (Waterfall Diagram)<br>' +
                              'Surpluses appear in Green, Deficits appear in Red', 
                               font = dict( family='Arial', size=18, color='black' )),
                 
                  template='simple_white',
                  title_x = 0.5,
                  showlegend = False,
                  autosize=True,
                  margin=dict(l=30, r=30, t=60, b=30)
                 )

# The figure is written to an HTML file next to the data instead of being
# shown interactively.
#fig.show()

import plotly.io as pio
pio.write_html(fig, path+"plotly_result.html", auto_open=False)

Bump Charts

This looks nice too.

# Bump chart: for each month, rank the years by total fines (1 = highest) and
# follow each year's rank across the twelve months.
bump_df = df.groupby(['Year', 'MonthName'])['ViolFine'].sum().reset_index(name='TotalFines')

# Rows = years, columns = months.
bump_df = bump_df.pivot(index='Year', columns='MonthName', values = 'TotalFines')

month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

bump_df = bump_df.reindex(columns=month_order)

# Keep only the years that have a value for every month.
bump_df = bump_df.dropna()

# Rank down each month column, i.e. compare the years with one another.
bump_df_ranked = bump_df.rank(axis=0, ascending=False, method='min')

# Transpose so months run along the x axis and each year becomes one line.
bump_df_ranked = bump_df_ranked.T

fig = plt.figure(figsize=(20, 12))
ax = fig.add_subplot(1, 1, 1)

# Large hollow markers leave room for the dollar labels written inside them.
bump_df_ranked.plot(kind='line', ax=ax, marker='o', markeredgewidth=1, linewidth=6, 
                   markersize=44, 
                   markerfacecolor='white')

# Rank 1 belongs at the top of the chart.
ax.invert_yaxis()

num_rows = bump_df_ranked.shape[0]
num_cols = bump_df_ranked.shape[1]

plt.ylabel('Monthly Ranking', fontsize=18, labelpad=10)
plt.title('Ranking of Total Fines by Month and by Year \n Bump Chart', fontsize=18, pad=15)
plt.xticks(np.arange(num_rows), month_order, fontsize=14)

plt.yticks(range(1, num_cols+1, 1), fontsize=14)

ax.set_xlabel('Month', fontsize=18)

# List the years newest-first. Reversing the whole list works for any number
# of years, whereas fixed indices 0..6 only work when exactly seven remain.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.01, 1.01), fontsize=14,
         labelspacing = 1,
         markerscale = .4,
         borderpad = 1,
         handletextpad = 0.8)

# Write each month's fines (in $M) inside its marker: x is the month's
# position on the axis, y is that year's rank for the month.
for month_pos, month in enumerate(bump_df_ranked.index):
    for year in bump_df_ranked.columns:
        this_rank = bump_df_ranked.loc[month, year]
        ax.text(month_pos, this_rank, '$' + str(round(bump_df.loc[year, month]/1e6,1)) + 'M',
                ha='center', va='center', fontsize=12)

plt.show()

Here is a caption at the end of my bump chart

Data Visualization - Python (no Tabs or TOC)

P. Tallon

4/19/1775