Data Visualization - Python (Tabs & TOC)

Introduction

Here is an intro section.

Dataset

Here is something about my data.

Findings

This is some general text about my findings before I show the individual charts in tabs. If you add .tabset-pills inside the curly braces, it will generate orange tab buttons.

Maps

Here is what I have under Tab 1

import pandas as pd
import numpy as np
import folium

# Folder that holds the course data files, and the 911-call extract read from it.
path = "C:/Users/pptallon/Dropbox/G/Teaching/Data Visualization Data Files/"
filename = "Baltimore911_Dec2020.csv"

# Only these columns are loaded from the CSV.
import_cols = ['Location', 'Description', 'CallDateTime', 'Neighborhood']

# Read at most the first 100,000 data rows; skiprows=0 skips nothing.
map_df = pd.read_csv(
    path + filename,
    usecols=import_cols,
    skiprows=0,
    nrows=100000,
)

# Split every 'Location' value into 'Lat' and 'Lon' columns.
# The coordinate pair is the text after the last "(" with ")" removed, split on ",".
# Each half is stored as text formatted to six decimal places. A value that cannot
# be parsed (no comma, non-numeric text, missing value) yields NaN in both columns.
lat_values = []
lon_values = []
for location in map_df['Location']:
    try:
        parts = str(location).replace(")", "").split("(")[-1].split(",")
        lat, lon = '%.6f' % float(parts[0]), '%.6f' % float(parts[1])
    except (ValueError, IndexError):
        # float() raises ValueError on bad text; parts[1] raises IndexError when
        # there is no comma. np.nan is used because the np.NaN alias no longer
        # exists in NumPy 2.0.
        lat, lon = np.nan, np.nan
    lat_values.append(lat)
    lon_values.append(lon)

# Assign whole columns at once (in row order) instead of writing one cell at a
# time through a running counter that has to match the index labels.
map_df['Lat'] = lat_values
map_df['Lon'] = lon_values

# Number of 911 calls per neighborhood; these counts shade the choropleth.
neigh_df = map_df.groupby(['Neighborhood']).size().reset_index(name="Count")

# Penn Station Baltimore Lat/Lon
center_of_map = [39.3024273, -76.6195023]

# zoom_start: 0 is furthest out (shows earth), 11 is city level, 18 is closest.
map_options = dict(
    zoom_start=12,
    tiles='cartodbpositron',  # 'OpenStreetMap',
    width='90%',
    height='100%',
    left='5%',
    top='0%',
)
my_map = folium.Map(location=center_of_map, **map_options)

# Shade each region of the boundary file by its call count; regions are matched
# to neigh_df rows through the feature property 'name'.
ch_map = folium.Choropleth(
    geo_data=path + 'baltimore.txt',
    name='choropleth',
    data=neigh_df,
    columns=['Neighborhood', 'Count'],
    key_on='feature.properties.name',
    fill_color='RdPu',
    fill_opacity=0.9,
    line_opacity=0.4,
    legend_name='Neighborhood Based on 911 Call Origination',
    highlight=True,
)
ch_map.add_to(my_map)

# Display Region Label
neighborhood_tooltip = folium.features.GeoJsonTooltip(
    fields=['name'],
    aliases=['Neighborhood: '],
    labels=True,
    style='background-color: black; color: white;',
)
ch_map.geojson.add_child(neighborhood_tooltip)

# Write the finished map to an HTML file next to the data files.
my_map.save(path + 'Chloropleth_911_Baltimore.html')

Repeat Tags

Here’s what I found in my bar chart.

# I added these two lines as I had some errors when knitting my rmarkdown code
# NOTE(review): presumably this points Qt at its platform plugins for the
# matplotlib backend — confirm. The value is a path inside one specific Anaconda
# install, so it has to be changed (or removed) on any other machine.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'c:/users/pptallon/Anaconda3/Library/plugins/platforms'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Suppress every Python warning from here on. Note that this also hides
# deprecation warnings raised by pandas and matplotlib.
warnings.filterwarnings("ignore")

path = "C:/Users/pptallon/Dropbox/G/Teaching/Data Visualization Data Files/Python_datafiles/"

filename = path + 'Baltimore_Traffic_Citations.csv'

# Only the three columns used by the charts are loaded.
df = pd.read_csv(filename, usecols = ['Tag', 'ViolFine', 'ViolDate'])

# Assign the filled column back to the frame. Calling fillna(..., inplace=True)
# on df.Tag works on an intermediate object and leaves df unchanged when pandas
# Copy-on-Write is active.
df['Tag'] = df['Tag'].fillna("Not Available")

# Parse the timestamp (12-hour clock with AM/PM) and derive the calendar parts
# that the later charts group by.
df['ViolDate'] = pd.to_datetime(df['ViolDate'], format = '%m/%d/%Y %I:%M:%S %p')
df['Hour'] = df.ViolDate.dt.hour
df['Day'] = df.ViolDate.dt.day
df['Month'] = df.ViolDate.dt.month
df['Year'] = df.ViolDate.dt.year
df['WeekDay'] = df.ViolDate.dt.strftime('%a')    # abbreviated weekday name, e.g. 'Mon'
df['MonthName'] = df.ViolDate.dt.strftime('%b')  # abbreviated month name, e.g. 'Jan'

# One row per tag: number of citations, total fines and average fine,
# ordered from most to least cited.
x = df.groupby(['Tag']).agg({'Tag': ['count'], 'ViolFine': ['sum', 'mean']}).reset_index()
x.columns = ['Tag', 'Count', 'TotalFines', 'AverFine']
x = x.sort_values('Count', ascending=False)

# Tags containing "TAG", "Tag" or "Not Available" are candidates for removal ...
PossibleBadTags = x['Tag'].str.contains('TAG|Not Available|Tag')
PossibleBadRows = x[PossibleBadTags]

# ... except the ones matching this keep-list pattern.
KeepRows = 'SCRTAGT|LITAG8R|V1NTAGE|WTTAGW1|DATAGUY|TAG944|TAGDAT|TAG DAT|TAG ART'
on_keep_list = PossibleBadRows.Tag.str.contains(KeepRows)
deleteRows = PossibleBadRows[~on_keep_list]

# Totals of the rows being removed: count, total fines, and their average fine.
a = deleteRows.Count.sum()
b = deleteRows.TotalFines.sum()
c = b/a

# Drop the removed tags and add a single 'Missing' row carrying their totals,
# then re-rank so the index runs 0, 1, 2, ... in descending Count order.
x = x[~x['Tag'].isin(deleteRows.Tag)]
x.loc[x.index.max()+1] = ['Missing', a, b, c]
x = x.sort_values('Count', ascending=False).reset_index(drop=True)

def pick_colors_according_to_mean_count(this_data):
    """Return one colour name per value of ``this_data.Count``.

    Each count is compared with the mean of all counts:
    more than 1% above the mean gives 'lightcoral', more than 1% below
    gives 'green', anything else gives 'black'.

    this_data: object with a ``Count`` attribute that can be iterated and
        has a ``mean()`` method (a DataFrame column in this report).
    Returns a list of colour names in the same order as ``this_data.Count``.
    """
    mean_count = this_data.Count.mean()
    upper = mean_count*1.01
    lower = mean_count*0.99
    return [
        'lightcoral' if count > upper else 'green' if count < lower else 'black'
        for count in this_data.Count
    ]
 
import matplotlib.patches as mpatches

# Two label-based slices of the ranked tag table (both ends included).
# Both start at label 1, so the row labelled 0 is not charted.
bottom1, top1 = 1, 250
d1 = x.loc[bottom1:top1]
my_colors1 = pick_colors_according_to_mean_count(d1)

bottom2, top2 = 1, 10
d2 = x.loc[bottom2:top2]
my_colors2 = pick_colors_according_to_mean_count(d2)

# Legend entries explaining the three bar colours.
Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 1% of the Average')
Below = mpatches.Patch(color='green', label='Below Average')
legend_handles = [Above, At, Below]

fig = plt.figure(figsize=(18, 16))
fig.suptitle(f'Frequency of Citation Analysis by Registration Tag:\n Top {top1} and Top {top2}',
             fontsize=18, fontweight='bold')

# Upper panel: the larger slice, with a dashed line at its mean count.
# The x axis is hidden in this panel.
mean1 = d1.Count.mean()
ax1 = fig.add_subplot(2, 1, 1)
ax1.bar(d1.Tag, d1.Count, label='Count', color=my_colors1)
ax1.legend(handles=legend_handles, fontsize=14)
ax1.axhline(mean1, color='black', linestyle='dashed')
for side in ('right', 'top'):
    ax1.spines[side].set_visible(False)
ax1.axes.xaxis.set_visible(False)
ax1.set_title(f'Top {top1} Citations', size=20)
ax1.text(top1-10, mean1+5, 'Mean = ' + str(mean1), rotation=0, fontsize=14)

# Lower panel: the smaller slice, with a solid line at its mean count.
mean2 = d2.Count.mean()
ax2 = fig.add_subplot(2, 1, 2)
ax2.bar(d2.Tag, d2.Count, label='Count', color=my_colors2)
ax2.legend(handles=legend_handles, fontsize=14)
ax2.axhline(mean2, color='black', linestyle='solid')
for side in ('right', 'top'):
    ax2.spines[side].set_visible(False)
ax2.set_title(f'Top {top2} Citations', size=20)
ax2.text(top2-1, mean2+5, 'Mean = ' + str(mean2), rotation=0, fontsize=14)

# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.35)

plt.show()

Aver. Fines

Dual Axis Bar Charts

def autolabel(these_bars, this_ax, place_of_decimals, symbol):
    """Write each bar's height as a text label just above the bar.

    these_bars: iterable of bar objects providing get_height(), get_x()
        and get_width().
    this_ax: axes object whose text() method draws the label.
    place_of_decimals: format spec applied to the height, e.g. '.0f'.
    symbol: text placed in front of the formatted height, e.g. '$' or ''.
    """
    label_style = dict(fontsize=11, color='black', ha='center', va='bottom')
    for bar in these_bars:
        bar_height = bar.get_height()
        center_x = bar.get_x()+bar.get_width()/2
        caption = symbol+format(bar_height, place_of_decimals)
        # Anchor the text at 101% of the bar height, horizontally centred on the bar.
        this_ax.text(center_x, bar_height*1.01, caption, **label_style)
                    
# Dual-axis bar chart: citation count (left axis) beside average fine (right axis)
# for every tag in d2.
fig = plt.figure(figsize=(18, 10))
ax1 = fig.add_subplot(1, 1, 1)
ax2 = ax1.twinx()  # second y axis sharing the same x axis
bar_width = 0.4

# One x slot per row of d2. Deriving this from d2 (instead of a fixed 10) keeps
# the bar positions and heights the same length whatever slice d2 holds.
x_pos = np.arange(len(d2))
# Shift the two bar series half a bar width left/right so they sit side by side.
count_bars = ax1.bar(x_pos-(0.5*bar_width), d2.Count, bar_width, color='gray', edgecolor='black', label='Citation Count')
aver_fine_bars = ax2.bar(x_pos+(0.5*bar_width), d2.AverFine, bar_width, color='green', edgecolor='black', label='Average Fine')

ax1.set_xlabel('Vehicle Tag', fontsize=18)
ax1.set_ylabel('Count of Citations', fontsize=18, labelpad=20)
ax2.set_ylabel('Average Fine', fontsize=18, rotation=270, labelpad=20)
ax1.tick_params(axis='y', labelsize=14)
ax2.tick_params(axis='y', labelsize=14)

# The title reports the number of tags actually plotted.
plt.title('Citation Count and Average Fine Analysis\n Top ' + str(len(d2)) + ' Most Frequently Cited Tags', fontsize=18)
ax1.set_xticks(x_pos)

ax1.set_xticklabels(d2.Tag, fontsize=14)

# Merge the legend entries of both axes into one legend drawn on ax1.
count_color, count_label = ax1.get_legend_handles_labels()
fine_color, fine_label   = ax2.get_legend_handles_labels()
legend = ax1.legend(count_color + fine_color, count_label + fine_label, loc='upper left', frameon=True, ncol=1, shadow=True,
                   borderpad=1, fontsize=14)
# Extend the left axis to 150% of the largest count.
ax1.set_ylim(0, d2.Count.max()*1.50)

# Value labels: whole numbers for counts, dollars with two decimals for fines.
autolabel(count_bars, ax1, '.0f', '')
autolabel(aver_fine_bars, ax2, '.2f', '$')

plt.show()

Hourly Fines

Analysis of line plots.

# Total fines for every (hour of day, weekday) combination.
fine_df = df.groupby(['Hour', 'WeekDay'])['ViolFine'].sum().reset_index(name='TotalFines')

from matplotlib.ticker import FuncFormatter

fig = plt.figure(figsize = (18, 10))
ax = fig.add_subplot(1, 1, 1)

# Line colour per weekday. The insertion order (Mon..Sun) is also used below
# as the legend order.
my_colors = {'Mon':'blue',
             'Tue':'red',
             'Wed':'green',
             'Thu':'gray',
             'Fri':'purple',
             'Sat':'gold',
             'Sun':'brown'}

# Group by the column name itself rather than a one-item list: with a list,
# pandas >= 2.0 yields 1-tuple keys such as ('Mon',), which are not keys of
# my_colors and would also end up as the line label.
for key, grp in fine_df.groupby('WeekDay'):
    grp.plot(ax=ax, kind='line', x='Hour', y ='TotalFines', color=my_colors[key], label=key, marker='8')

plt.title('Total Fines by Hour', fontsize=18)
ax.set_xlabel('Hour (24 Hour Interval)', fontsize=18)
ax.set_ylabel('Total Fines ($M)', fontsize=18, labelpad=20)
ax.tick_params(axis='x', labelsize=14, rotation=0)
ax.tick_params(axis='y', labelsize=14, rotation=0)

# One tick per hour of the day.
ax.set_xticks(np.arange(24))

# Lines are drawn in the groupby's sorted (alphabetical) order; put the legend in
# weekday order by looking handles up by label, so a weekday that is absent
# from the data is simply skipped instead of raising an IndexError.
handles, labels = ax.get_legend_handles_labels()
handle_by_day = dict(zip(labels, handles))
labels = [day for day in my_colors if day in handle_by_day]
handles = [handle_by_day[day] for day in labels]
plt.legend(handles, labels, loc='best', fontsize=14, ncol=1)

# Show y tick values in millions of dollars, e.g. 1500000 -> "$1.5M".
ax.yaxis.set_major_formatter( FuncFormatter( lambda value, pos:('$%1.1fM')%(value*1e-6)))

plt.show()

Quarterly Fines

Pie Charts

I had to shrink the text sizes on this and the pie fig size to get it to look this way.

# Label every citation with its calendar quarter, e.g. "Quarter 1".
df['Quarter'] = 'Quarter ' + df.ViolDate.dt.quarter.astype('string')

# Total fines per month (with its quarter), in calendar order. The numeric
# 'Month' column is only needed for that ordering and is removed afterwards.
pie_df = df.groupby(['Quarter', 'MonthName', 'Month'])['ViolFine'].sum().reset_index(name='TotalFines')
pie_df = pie_df.sort_values(by=['Month']).reset_index(drop=True)
del pie_df['Month']

# Colour indices into the colormap: the outer ring (quarters) takes every 4th
# index, the inner ring (months) takes the remaining indices in between.
number_outside_colors = len(pie_df.Quarter.unique())
outside_color_ref_number = np.arange(number_outside_colors)*4

number_inside_colors = len(pie_df.MonthName.unique())
all_color_ref_number = np.arange(number_outside_colors + number_inside_colors)

inside_color_ref_number = [
    each for each in all_color_ref_number if each not in outside_color_ref_number
]

fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(1, 1, 1)

colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)
inner_colors = colormap(inside_color_ref_number)

all_fines = pie_df.TotalFines.sum()

# Outer ring: one wedge per quarter, labelled with its share and dollar amount.
quarter_totals = pie_df.groupby(['Quarter'])['TotalFines'].sum()
quarter_totals.plot(
    kind='pie', radius=1, colors=outer_colors, pctdistance=0.85, labeldistance=1.1,
    wedgeprops=dict(edgecolor='W'), textprops={'fontsize':13},
    autopct=lambda p: '{:.2f}%\n(${:.1f}M)'.format(p,(p/100)*all_fines/1e+6),
    startangle=90)

# Inner ring: one wedge per month, labelled with the month name and its share.
pie_df.TotalFines.plot(
    kind='pie', radius=0.7, colors=inner_colors, pctdistance=0.55, labeldistance=0.8,
    wedgeprops=dict(edgecolor='W'), textprops={'fontsize':11},
    labels=pie_df.MonthName,
    autopct='%1.2f%%',
    startangle=90)

# White disc in the middle turns the nested pies into a donut.
hole = plt.Circle((0,0), 0.3, fc='white')
plt.gca().add_artist(hole)

ax.yaxis.set_visible(False)
plt.title('Total Fines by Quarter and Month', fontsize=14)

# Grand total printed in the centre of the hole.
ax.text(0, 0, 'Total Fines\n$' + str(round(all_fines/1e6,2)) + 'M', size=13, ha='center', va='center')

ax.axis('equal')

plt.tight_layout()
plt.show()

Budgets

Plotly waterfall chart

To include this plotly chart, I had to output it as an HTML file and then read it immediately back in. I commented out the line for fig.show() and added some extra lines below it to write the chart to an HTML file.

# Monthly total fines for 2019, compared against a flat monthly budget.
wf_df = df[df['Year'] == 2019].groupby(['MonthName'])['ViolFine'].sum().reset_index(name='TotalFines')

wf_df['Budget'] = 4.6e6

wf_df['Deviation'] = wf_df.TotalFines - wf_df.Budget

# Append a 'Total' row holding the sums over all months present.
wf_df.loc[wf_df.index.max()+1] = ['Total',
                                  wf_df.TotalFines.sum(),
                                  wf_df.Budget.sum(),
                                  wf_df.TotalFines.sum() - wf_df.Budget.sum()]

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Total']

# Ordered categorical so sorting gives calendar order with 'Total' last.
wf_df['MonthName'] = pd.Categorical(wf_df['MonthName'], categories = months, ordered = True)

wf_df.sort_values(by='MonthName', inplace=True)

wf_df.reset_index(inplace = True, drop = True)

import plotly.graph_objects as go

# After sorting, the 'Total' row is the last one. Read it by position rather
# than by the label 12, which only exists when all twelve months have data.
total_deviation = wf_df['Deviation'].iloc[-1]
if total_deviation > 0:
    end_color = 'black'
elif total_deviation < 0:
    end_color = 'red'
else:
    end_color = 'blue'

# Every month is a relative step and the final row is the total bar; sized from
# the frame so it matches however many months are present.
measure = ['relative'] * (len(wf_df) - 1) + ['total']

fig = go.Figure( go.Waterfall( name='', orientation = 'v', x = wf_df['MonthName'], textposition='outside',
                              measure = measure,
                              y = wf_df['Deviation']/1e6,
                              text = ['${:.2f}M'.format(each/1e6) for each in wf_df['TotalFines']],
                              decreasing = {'marker':{'color':'red'}},
                              increasing = {'marker':{'color':'green'}},
                              totals     = {'marker':{'color': end_color}},
                              hovertemplate = 'Cumulative Deviation to Date: ' + '$%{y:,.2f}M' + '<br>' +
                                              'Total Fines in %{x}: %{text}'))

fig.layout = go.Layout(yaxis=dict(tickformat='.1f'))

fig.update_xaxes(title_text='Months', title_font = {'size': 18})

fig.update_yaxes(title_text='Total Fines (Running Total $M)', title_font = {'size':18},
                 dtick=0.5, tickprefix = '$', ticksuffix = 'M', zeroline=True  )

fig.update_layout(title = dict( text='Deviation between Actual and Budgeted Monthly Fines in 2019 (Waterfall Diagram)<br>' +
                              'Surpluses appear in Green, Deficits appear in Red',
                               font = dict( family='Arial', size=18, color='black' )),

                  template='simple_white',
                  title_x = 0.5,
                  showlegend = False,
                  autosize=True,
                  margin=dict(l=30, r=30, t=60, b=30)
                 )

# The figure is not shown directly; it is written to an HTML file instead.
#fig.show()

import plotly.io as pio
pio.write_html(fig, path+"plotly_result.html", auto_open=False)

Rankings

Bump chart.

# Total fines per (year, month), reshaped to one row per year and one column
# per month in calendar order. Years with any missing month are dropped.
bump_df = df.groupby(['Year', 'MonthName'])['ViolFine'].sum().reset_index(name='TotalFines')

bump_df = bump_df.pivot(index='Year', columns='MonthName', values = 'TotalFines')

month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

bump_df = bump_df.reindex(columns=month_order)

bump_df = bump_df.dropna()

# Rank the years against each other within every month (1 = highest fines),
# then transpose so months are rows (x axis) and years are columns (lines).
bump_df_ranked = bump_df.rank(axis=0, ascending=False, method='min')

bump_df_ranked = bump_df_ranked.T

fig = plt.figure(figsize=(20, 12))
ax = fig.add_subplot(1, 1, 1)

# Large white-filled markers leave room for the dollar labels written below.
bump_df_ranked.plot(kind='line', ax=ax, marker='o', markeredgewidth=1, linewidth=6,
                   markersize=44,
                   markerfacecolor='white')

# Rank 1 goes at the top.
ax.invert_yaxis()

num_rows = bump_df_ranked.shape[0]
num_cols = bump_df_ranked.shape[1]

plt.ylabel('Monthly Ranking', fontsize=18, labelpad=10)
plt.title('Ranking of Total Fines by Month and by Year \n Bump Chart', fontsize=18, pad=15)
plt.xticks(np.arange(num_rows), month_order, fontsize=14)

plt.yticks(range(1, num_cols+1, 1), fontsize=14)

ax.set_xlabel('Month', fontsize=18)

# List the years in reverse plotting order. Reversing the lists works for any
# number of years instead of indexing exactly seven entries.
handles, labels = ax.get_legend_handles_labels()
handles = handles[::-1]
labels = labels[::-1]
ax.legend(handles, labels, bbox_to_anchor=(1.01, 1.01), fontsize=14,
         labelspacing = 1,
         markerscale = .4,
         borderpad = 1,
         handletextpad = 0.8)

# Write each month's total fines (in $M) inside its marker: x is the month's
# position on the axis, y is that year's rank in that month.
for year in bump_df_ranked.columns:
    for month_pos, month in enumerate(bump_df_ranked.index):
        this_rank = bump_df_ranked.loc[month, year]
        ax.text(month_pos, this_rank, '$' + str(round(bump_df.loc[year, month]/1e6,1)) + 'M',
                ha='center', va='center', fontsize=12)

plt.show()

Here is a caption at the end of my bump chart

Conclusion

We are now done with charts. Here are some general takeaways from my output.

You can add captions at the bottom of images. To add a caption, include fig.cap="blah blah" inside the {...} at the top of the RMarkdown code chunk you are using to include the image.

# Embed the image file at this local path in the knitted document.
knitr::include_graphics("c:/Users/pptallon/Dropbox/G/Personal/Tallon005.jpg")

Courtesy of your favorite IT professor