There are hundreds of different reasons why a flight can be delayed from taking off and/or from arriving at its destination.
The following visualizations were made from a 2015 data set that logged all the flights operated by commercial airlines during that year. The flights took off from and/or landed at major global airports and/or private airports. These visualizations make sense of the number of departing and arriving flights and of the departure and arrival delay times, providing useful information for airports and airlines so they can prioritize which airports and airlines need to run a tighter schedule and operate closer to on time.
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import statistics
import matplotlib.patches as mpatches
import jupyter
from matplotlib.ticker import FuncFormatter
# Folder that holds the course data files, and the flight-delay extract to load.
path = "C:/Users/amb11/OneDrive - Loyola University Maryland/IS 460W/Python/"
filename = "FlightDelays.csv"
# `path` used to be assigned but never applied, so the load only succeeded when
# the working directory happened to contain the CSV.  Prefer the copy under
# `path` when it exists; otherwise fall back to the bare filename, which is
# exactly the lookup the script performed before.
_csv_location = os.path.join(path, filename)
df = pd.read_csv(_csv_location if os.path.isfile(_csv_location) else filename)
def pick_colors_according_to_mean_count(this_data, tolerance=0.10):
    """Return one bar colour per row, based on how ``Count`` compares to its mean.

    Args:
        this_data: Object exposing a ``Count`` attribute that supports
            ``.mean()`` and iteration (e.g. a DataFrame with a ``Count``
            column).
        tolerance: Half-width of the "about average" band, as a fraction of
            the mean.  Defaults to 0.10, i.e. the original +/-10% band.

    Returns:
        A list with one entry per value in ``Count``:
        ``'lightcoral'`` when the value is above ``mean * (1 + tolerance)``,
        ``'green'`` when it is below ``mean * (1 - tolerance)``,
        ``'black'`` otherwise.
    """
    counts = this_data.Count
    avg = counts.mean()
    # Both thresholds are loop-invariant, so compute them once.
    upper = avg * (1 + tolerance)
    lower = avg * (1 - tolerance)

    colors = []
    for each in counts:
        if each > upper:
            colors.append('lightcoral')
        elif each < lower:
            colors.append('green')
        else:
            colors.append('black')
    return colors
def autolabel(these_bars, this_ax, place_of_decimals, symbol):
    """Write each bar's height as a text label just beyond the end of the bar.

    Args:
        these_bars: Iterable of bar objects providing ``get_height()``,
            ``get_x()`` and ``get_width()``.
        this_ax: Axes-like object whose ``text()`` method draws the label.
        place_of_decimals: Format spec passed to ``format()`` for the height
            (for example ``'.2f'``).
        symbol: String prefixed to the formatted height.
    """
    for each_bar in these_bars:
        height = each_bar.get_height()
        x_centre = each_bar.get_x() + each_bar.get_width() / 2
        # height * 1.01 lies just past the bar's end in either direction.  For a
        # negative bar the label must hang below that point; anchoring its
        # bottom there (the old behaviour) drew the text over the bar itself.
        v_align = 'bottom' if height >= 0 else 'top'
        this_ax.text(x_centre, height * 1.01, symbol + format(height, place_of_decimals),
                     fontsize=11, color='black', ha='center', va=v_align)
# Day-of-week tick labels.  The blank first entry occupies position 0, so
# list index 1 is 'Mon' and index 7 is 'Sun'.
dayAbrev = [' ', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Basic profile of the raw table: its (rows, columns) shape, the number of
# missing departure delays, and how often each departure-delay value occurs.
row_num = df.shape
isna = df['DEPARTURE_DELAY'].isna().sum()
delay_time_count = df['DEPARTURE_DELAY'].value_counts()

# Frequency table of departure delays: one row per distinct delay value, with
# the number of flights and the summed delay, most frequent first.
# reset_index() without drop=True keeps the pre-sort positions in an extra
# 'index' column.
x = (
    df.groupby(['DEPARTURE_DELAY'])
    .agg({'ORIGIN_AIRPORT': ['count'], 'DEPARTURE_DELAY': ['sum']})
    .reset_index()
)
x.columns = ['Delay_Time', 'Count', 'Delay_Sum']
x = x.sort_values(by='Count', ascending=False).reset_index()

# Same idea keyed on air time.
# NOTE(review): the column labelled 'Distance_Sum' actually receives the
# DISTANCE *count* (the aggregations are 'count' and 'mean'); the label is
# kept as-is because other cells may rely on it.
x1 = (
    df.groupby(['AIR_TIME'])
    .agg({'AIR_TIME': ['sum'], 'DISTANCE': ['count', 'mean']})
    .reset_index()
)
x1.columns = ['Air_Time', 'Total_Time_In_Air', 'Distance_Sum', 'mean']
x1 = x1.sort_values(by='Total_Time_In_Air', ascending=False).reset_index()

# Number of flights per airline and day of week (non-null YEAR entries).
flight_df = (
    df.groupby(['AIRLINE', 'DAY_OF_WEEK'])['YEAR']
    .count()
    .reset_index(name='TotalFlights')
)

# Flights per origin airport and airline, reshaped to airports x airlines.
# reset_index(drop=True) discards the airport labels from the pivoted index.
stacked_df = (
    df.groupby(['ORIGIN_AIRPORT', 'AIRLINE'])['YEAR']
    .count()
    .reset_index(name='TotalOrigin')
)
stacked_df = stacked_df.pivot(
    index='ORIGIN_AIRPORT', columns='AIRLINE', values='TotalOrigin'
).reset_index(drop=True)

# Mean departure delay per airline (rows) and day of week (columns).
bump_df = (
    df.groupby(['AIRLINE', 'DAY_OF_WEEK'])['DEPARTURE_DELAY']
    .mean()
    .reset_index(name='AvgDelay')
)
bump_df = bump_df.pivot(index='AIRLINE', columns='DAY_OF_WEEK', values='AvgDelay')

# Rank the airlines within each day (largest mean delay gets rank 1, ties share
# the lowest rank), then transpose so days are rows and airlines are columns.
bump_df_ranked = bump_df.rank(axis=0, ascending=False, method='min')
bump_df_ranked = bump_df_ranked.transpose()
Across the 150 most common lengths of time for a flight to be delayed at the origin airport, each delay length accounts for about 34,498.41 flights on average, and the most frequent lengths range from taking off 14 minutes early to taking off 12 minutes late.
Across the 10 most common lengths of time for a flight to be delayed at the origin airport, each delay length accounts for about 305,785.70 flights on average, and the most frequent lengths differ from the planned takeoff time by 2 to 5 minutes late.
import os

# Machine-specific location of the Qt platform plugins; presumably required so
# a Qt-backed plot window can open on this installation -- confirm before reuse.
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/Users/amb11/OneDrive/Documents/Anaconda/Library/plugins/platforms'

# Untitled quick-look bar chart of the first rows of the frequency table
# (the label slice 0:10 is inclusive, so this draws 11 bars).
plt.figure(figsize=(30,16))
plt.bar(x.loc[0:10, 'Delay_Time'], x.loc[0:10, 'Count'], label='Delay Count')
plt.legend(loc='upper right', fontsize=14)

# `x` is sorted by Count (descending) and re-indexed from 0, so the N most
# common delay lengths are rows 0 .. N-1.  Previously the colours and the mean
# were computed from rows 1..N while the bars were drawn from rows 0..N: the
# most common delay was left out of the statistics, N+1 bars were drawn under a
# "Top N" title, and every bar carried its neighbour's colour.  Each panel now
# uses one slice for bars, colours and mean.
# NOTE: this changes the plotted means, so any figures quoted from the old
# chart need to be regenerated.
bottom1 = 0
top1 = 150
d1 = x.iloc[bottom1:top1]
my_colors1 = pick_colors_according_to_mean_count(d1)
mean1 = d1.Count.mean()

bottom2 = 0
top2 = 10
d2 = x.iloc[bottom2:top2]
my_colors2 = pick_colors_according_to_mean_count(d2)
mean2 = d2.Count.mean()

# Legend entries explaining the three bar colours.
Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 10% of the Average')
Below = mpatches.Patch(color='green', label='Below Average')

fig = plt.figure(figsize=(20,18))
fig.suptitle('Length of Time Delays from Origin Airports by Count Frequency:\n Top ' + str(top1) + ' and Top ' + str(top2) +
             ' Delay Lengths',fontsize=18, fontweight='bold')

# Upper panel: the `top1` most common delay lengths.
ax1 = fig.add_subplot(2, 1, 1)
ax1.bar(d1['Delay_Time'], d1['Count'], label='Delay Count', color=my_colors1)
ax1.legend(handles=[Above, At, Below], fontsize=14)
# Draw on the axes explicitly rather than on whichever axes pyplot considers
# current.
ax1.axhline(mean1, color='black', linestyle='dashed')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.set_title('Top ' + str(top1) + ' Delay Lengths', size=15)
ax1.text(top1-40, mean1+1600, 'Mean = ' + str("{:,.2f}".format(mean1)), rotation=0, fontsize=14)
ax1.get_yaxis().set_major_formatter(FuncFormatter(lambda value, pos: format(int(value), ',')))

# Lower panel: the `top2` most common delay lengths.  It carries no legend of
# its own; the colour key is shown once, on the upper panel.
ax2 = fig.add_subplot(2, 1, 2)
ax2.bar(d2['Delay_Time'], d2['Count'], label='Delay Count', color=my_colors2)
ax2.axhline(mean2, color='black', linestyle='dashed')
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.set_title('Top ' + str(top2) + ' Delay Lengths', size=15)
ax2.text(top2-19, mean2+1005, 'Mean = ' + str("{:,.2f}".format(mean2)), rotation=0, fontsize=14)
ax2.get_yaxis().set_major_formatter(FuncFormatter(lambda value, pos: format(int(value), ',')))

fig.subplots_adjust(hspace=0.5)
plt.show()
Of the major airlines, Southwest Airlines has the most flights every day of the week, leading by roughly 50,000 flights. It is important to note that all of the airlines show a significant drop in the number of flights they had in 2015 on Saturdays. It is also important to note that there are three groupings of airlines: Southwest Airlines as a standalone; the other major airlines, which average roughly 100,000 flights per day (except on Saturday, when they average roughly 85,000); and finally the smaller airlines, which average roughly 25,000 flights per day.
from matplotlib.ticker import FuncFormatter
# Keep only the rows whose origin airport code is exactly three characters long.
df1 = df[df['ORIGIN_AIRPORT'].map(str).apply(len) == 3]

# Flights per (airline, origin airport) pair, busiest pair first.
stacked_df1 = df1.groupby(['AIRLINE', 'ORIGIN_AIRPORT'])['AIRLINE'].count().reset_index(name='TotalFlightz')
stacked_df2 = stacked_df1.sort_values(by=['TotalFlightz'], ascending=False).reset_index(drop=True)

# Busiest 305 pairs.  The old slice `[1:305]` started at position 1 and so
# silently dropped row 0 -- the single busiest airline/airport pair -- from the
# stacked chart built from this table.
stacked_df3 = stacked_df2.iloc[:305]

# Every airline in df1 also occurs in df (df1 is a row subset of df), so this
# filter effectively keeps all rows; retained because other cells may use it.
stacked_df4 = df1[df1['AIRLINE'].isin(df['AIRLINE'])]

# Occurrences of each (airline, airport) pair among the selected rows.
stacked_df5 = stacked_df3.groupby(['AIRLINE', 'ORIGIN_AIRPORT'])['AIRLINE'].count().reset_index(name='Count')

# Reshape to airports (rows) x airlines (columns) for the stacked bar chart.
stacked_df3 = stacked_df3.pivot(index='ORIGIN_AIRPORT', columns='AIRLINE', values='TotalFlightz')
# Line chart: total flights per day of week, one line per airline.
fig3 = plt.figure(figsize=(26, 20))
ax5 = fig3.add_subplot(1, 1, 1)

# Fixed colour for each airline code.
my_colors = {'AA':'red',
             'AS':'orange',
             'B6':'yellow',
             'DL':'green',
             'EV':'blue',
             'F9':'purple',
             'HA':'pink',
             'MQ':'gray',
             'NK':'gold',
             'OO':'black',
             'UA':'brown',
             'US':'lime',
             'VX':'deepskyblue',
             'WN':'darkviolet'}

# Group by the column *name*, not a one-element list: with a list key, pandas
# 2.x yields 1-tuples such as ('AA',) when iterating, and the colour lookup
# below would raise KeyError.  A scalar key yields plain codes on every
# pandas version.
for key, grp in flight_df.groupby('AIRLINE'):
    # .get() lets an airline missing from the palette fall back to the
    # default colour cycle instead of aborting the whole chart.
    grp.plot(ax=ax5, kind='line', x='DAY_OF_WEEK', y='TotalFlights',
             color=my_colors.get(key), label=key, marker='8')

ax5.set_title('Total Number of Flights by Day of Week (2015)', fontsize=18)
ax5.set_xlabel('Day of Week', fontsize=18)
ax5.set_ylabel('Total Flights', fontsize=18, labelpad=20)

# Pin the tick positions before naming them.  Calling set_xticklabels alone
# labels whatever ticks the auto-locator happens to choose, which only lines up
# with the day names by accident.  Index 0 of dayAbrev is a blank placeholder,
# so position d maps to dayAbrev[d] (assumes DAY_OF_WEEK runs 1-7).
ax5.set_xticks(list(range(1, len(dayAbrev))))
ax5.set_xticklabels(dayAbrev[1:])
ax5.tick_params(axis='x', labelsize=14, rotation=0)
ax5.tick_params(axis='y', labelsize=14, rotation=0)
# Thousands separators on the y axis (the ' ,' spec also reserves a leading
# space for the sign).
ax5.get_yaxis().set_major_formatter(FuncFormatter(lambda value, pos: format(int(value), ' ,')))
plt.show()
Of the major airports around the world, the busiest for the top 10 major commercial airlines are ORD, DFW, DEN, IAH, and SFO. American, United, and Southwest Airlines have the most flights from the major airports.
# Stacked bar chart: one bar per origin airport, split into one segment per
# airline, using the airport x airline table in stacked_df3.
fig4 = plt.figure(figsize=(26, 20))
ax6 = fig4.add_subplot(1, 1, 1)
stacked_df3.plot.bar(stacked=True, ax=ax6)

# Titles and axis captions.
ax6.set_title('Total Flights per Airline per Airport 2015', fontsize=18)
ax6.set_ylabel('Total Number of Flights', fontsize=18, labelpad=10)
ax6.set_xlabel('Airport Abbreviation', fontsize=18)

# Tick-label styling: slanted airport codes on x, larger numbers on y.
plt.setp(ax6.get_xticklabels(), rotation=70, horizontalalignment='center', fontsize=14)
plt.setp(ax6.get_yticklabels(), fontsize=14)

# Show y values as whole numbers with thousands separators.
ax6.yaxis.set_major_formatter(
    FuncFormatter(lambda value, tick_pos: format(int(value), ' ,'))
)
plt.show()
Over the course of the week, Spirit and American Airlines are the top two airlines with the highest daily average departure delay. JetBlue's rank fluctuates over the week: it starts the week ranked 8th, reaches 6th by Friday, drops to 12th on Saturday, and then moves up to 3rd on Sunday. Virgin Airlines' weekly rank for average departure delay also fluctuates, but not as drastically as JetBlue’s.
# Rank labels '1'..'14' (not referenced in this cell; kept for other cells).
rank_order = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']

# Bump chart: each airline's rank by average departure delay on each day.
# bump_df_ranked has days as rows and airlines as columns.
fig5 = plt.figure(figsize=(26,20))
ax7 = fig5.add_subplot(1,1,1)
bump_df_ranked.plot(kind='line', ax=ax7, marker='o', markeredgewidth=1, linewidth=6,
                    markersize=35,
                    markerfacecolor='white')
# Rank 1 should sit at the top of the chart.
ax7.invert_yaxis()

# Pin the x ticks to the plotted day values before naming them; labelling the
# auto-chosen ticks only matched the day names by accident.  Index 0 of
# dayAbrev is a blank placeholder, so day d maps to dayAbrev[d] (assumes
# DAY_OF_WEEK values run 1-7).
day_values = [int(day) for day in bump_df_ranked.index]
ax7.set_xticks(day_values)
ax7.set_xticklabels([dayAbrev[day] for day in day_values])

num_rows = bump_df_ranked.shape[0]
num_cols = bump_df_ranked.shape[1]

# The x axis shows days; the earlier 'Airline' caption was overwritten a few
# lines later anyway, so only the effective label is set.
ax7.set_xlabel('Day of Week', fontsize=18, labelpad=10)
ax7.set_ylabel('Day Ranking', fontsize=18, labelpad=10)
ax7.set_title('Ranking of Average Time Delay by Day of Week \n and Airline in 2015', fontsize=18, pad=15)
# One y tick per possible rank (one rank per airline column).
ax7.set_yticks(np.arange(1, num_cols+1,1))
ax7.legend(bbox_to_anchor=(1.01,1.01),fontsize=14,
           labelspacing = 1,
           markerscale = .4,
           borderpad = 1,
           handletextpad = 0.8)

# Write the underlying average delay inside each marker.  Lookups are by label
# (bump_df is airline x day, bump_df_ranked is its ranked transpose) and the
# text is placed at the actual day value, replacing manual i/j counters that
# assumed the days were exactly 1, 2, 3, ...
for airline in bump_df_ranked.columns:
    for day in bump_df_ranked.index:
        this_rank = bump_df_ranked.loc[day, airline]
        if pd.isna(this_rank):
            # No average for this airline/day, so there is no marker to label.
            continue
        ax7.text(day, this_rank, str(round(bump_df.loc[airline, day], 2)),
                 ha='center', va='center', fontsize=12)
plt.show()
As stated, in addition to being delayed at takeoff, flights can be delayed on arrival. Of the top 10 lengths of time for a flight to be delayed on arrival, the average of the 10 is 8.6 minutes, with 13 minutes being the first, followed by 12 minutes and then 11 minutes.
# Frequency of each arrival-delay value, most common first.
pie_df = df.groupby(['ARRIVAL_DELAY'])['ARRIVAL_DELAY'].count().reset_index(name='Count')
pie_df1 = pie_df.sort_values(by=['Count'], ascending=False).reset_index(drop=True)
# Drop the sign so early arrivals (negative delays) can be drawn as wedges.
pie_df1['ARRIVAL_DELAY'] = pie_df1['ARRIVAL_DELAY'].abs()

# Ten most common delay lengths.  The old slice `[1:10]` returned only nine
# rows and skipped row 0, the single most common delay, under a "Top 10" title.
# NOTE: this changes the chart and the average, so figures quoted from the old
# output need to be regenerated.
pie_df1 = pie_df1.iloc[:10]

avgarrival = pie_df1['ARRIVAL_DELAY'].mean()
# '.2f' = two decimal places.  The previous '.2' meant two *significant
# digits*, which switches to scientific notation once the value reaches 100.
avgarrival = "{:.2f}".format(avgarrival)

# Colour lookup derived from the number of distinct delay values; computed but
# not passed to the pie call below.
number_outside_colors = len(pie_df1.ARRIVAL_DELAY.unique())
outside_color_ref_number = np.arange(number_outside_colors)*14

fig = plt.figure(figsize=(10,10))
ax8 = fig.add_subplot(1,1,1)
colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)

# NOTE(review): wedge sizes are the delay lengths themselves, not how often
# each delay occurs (the Count column) -- confirm this is the intended measure.
plt.pie(pie_df1['ARRIVAL_DELAY'], labels=pie_df1['ARRIVAL_DELAY'], autopct='%0.2f%%', pctdistance=0.85, startangle=90)

# White disc over the centre turns the pie into a donut.
hole = plt.Circle((0,0), 0.3, fc='white')
fig7 = plt.gcf()
fig7.gca().add_artist(hole)
ax8.yaxis.set_visible(False)
plt.title('Top 10 Arrival Delay Times', fontsize=18, pad=15)
# Average delay printed inside the hole.
ax8.text(-0.1,0, 'Avg Delay\nin Mins: \n' + str(avgarrival), fontsize=15)
ax8.axis('equal')
plt.tight_layout()
plt.show()