This notebook explores a real-world dataset of flight data from the US Department of Transportation.
Loading and viewing the data.
import pandas as pd
import scipy.stats as stats
# Plots will be displayed inline
%matplotlib inline
from matplotlib import pyplot as plt
# Read the flights dataset from a CSV file (path is relative to the working directory)
flights = pd.read_csv('data/flights.csv')
# Preview the first rows of the dataset
flights.head()
# Dimensions of the dataset as (rows, columns)
flights.shape
The dataset contains observations of US domestic flights in 2013, and consists of the following fields:
The flight data is explored to analyze possible factors that affect delays in the departure or arrival of a flight.
# Count the null/missing values in each column
flights.isnull().sum()
There are 2761 null values in the variable DepDel15.
# Show DepDelay and DepDel15 for every row that has a null value in any column
flights[flights.isnull().any(axis=1)][['DepDelay','DepDel15']]
# Summary statistics of DepDelay restricted to those same rows
flights[flights.isnull().any(axis=1)].DepDelay.describe()
The NaN values in DepDel15 all occur in rows where DepDelay is 0. A DepDelay of 0 means the flight was not delayed, so the missing DepDel15 values will be filled with 0.
# Replace the missing DepDel15 entries with 0
flights['DepDel15'] = flights['DepDel15'].fillna(0)
# Re-count the null/missing values per column after the fill
flights.isna().sum()
flights.head()
flights.describe()
# Create a function showing distribution
def show_distribution(var_data):
    """Print summary statistics of a series and plot its distribution.

    Prints the minimum, mean, median, mode and maximum, then draws one
    figure with a histogram (each statistic marked by a dashed vertical
    line) above a horizontal boxplot.

    Args:
        var_data: pandas Series of numeric values. Missing values are
            dropped before the statistics and plots are produced.

    Returns:
        None. The output is the printed statistics and the displayed figure.
    """
    from matplotlib import pyplot as plt

    # The pandas statistics below skip NaN on their own, but hist() cannot
    # bin NaN values, so drop them once to keep plots and numbers consistent.
    var_data = var_data.dropna()

    # Get statistics
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    # mode() can return several values; use the first one
    mod_val = var_data.mode()[0]

    print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val,
                                                                                            mean_val,
                                                                                            med_val,
                                                                                            mod_val,
                                                                                            max_val))

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize=(10, 4))

    # Plot the histogram
    ax[0].hist(var_data)
    ax[0].set_ylabel('Frequency')

    # Add lines for the minimum, mean, median, mode and maximum
    ax[0].axvline(x=min_val, color='gray', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=mean_val, color='cyan', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=med_val, color='red', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=mod_val, color='yellow', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=max_val, color='gray', linestyle='dashed', linewidth=2)

    # Plot the boxplot
    ax[1].boxplot(var_data, vert=False)
    ax[1].set_xlabel('Value')

    # Add a title to the Figure
    fig.suptitle('Data Distribution')

    # Show the figure through pyplot: Figure.show() does not manage a GUI
    # event loop and emits a warning under non-GUI (e.g. inline) backends.
    plt.show()
# Examine the departure delays first, then the arrival delays
for col in (flights['DepDelay'], flights['ArrDelay']):
    show_distribution(col)
# Outlier thresholds for departure delay: 1st and 90th percentiles
q01 = flights['DepDelay'].quantile(0.01)
q90 = flights['DepDelay'].quantile(0.90)
# Outlier thresholds for arrival delay: 1st and 90th percentiles
q01a = flights['ArrDelay'].quantile(0.01)
q90a = flights['ArrDelay'].quantile(0.90)
# Keep only the flights whose delays lie strictly inside both ranges
flights = flights[
    (flights['DepDelay'] > q01) & (flights['DepDelay'] < q90)
    & (flights['ArrDelay'] > q01a) & (flights['ArrDelay'] < q90a)
]
flights.describe()
delayFields = ['DepDelay', 'ArrDelay']
# Re-plot both delay distributions on the trimmed data
for col in delayFields:
    show_distribution(flights[col])
# Flights per day of the week, counted through the Year column of each group
print(flights.groupby('DayOfWeek')['Year'].count())
There are fewer flights on Saturdays than on the other days of the week.
# Cancelled flights: how often each value of the Cancelled column occurs
pass_counts = flights.Cancelled.value_counts()
# Each wedge is labelled with its count; the legend lists the Cancelled values
plt.pie(pass_counts.values, labels=pass_counts.values)
plt.legend(pass_counts.index.tolist())
# Average departure and arrival delay for each day of the week
print(flights.groupby('DayOfWeek')[delayFields].mean())
# One boxplot per delay field, split by day of the week
for col in delayFields:
    flights.boxplot(col, by='DayOfWeek', figsize=(8, 8))
# Average departure and arrival delay for each carrier
print(flights.groupby('Carrier')[delayFields].mean())
# One boxplot per delay field, split by carrier
for col in delayFields:
    flights.boxplot(col, by='Carrier', figsize=(8, 8))
# Group the flights by their origin airport
departure_airport_group = flights.groupby(flights['OriginAirportName'])
# Mean departure delay per origin airport, largest first
mean_departure_delays = (
    departure_airport_group[['DepDelay']]
    .mean()
    .sort_values(by='DepDelay', ascending=False)
)
mean_departure_delays.plot.bar(figsize=(12, 12))
mean_departure_delays
# Arrival delay distribution for each value of DepDel15
flights.boxplot(column='ArrDelay', by='DepDel15', figsize=(12, 12))
# Add a route column of the form "<origin airport> -> <destination airport>".
# assign() replaces an existing 'route' column instead of appending a second
# one, so re-running this step does not create duplicate 'route' columns the
# way pd.concat(..., axis=1) would.
routes = flights['OriginAirportName'] + ' -> ' + flights['DestAirportName']
flights = flights.assign(route=routes)
# Group the flights by route
routearrdel = flights.groupby(flights['route'])
# Sum of ArrDel15 per route, sorted from highest to lowest
pd.DataFrame(routearrdel['ArrDel15'].sum()).sort_values('ArrDel15', ascending=False)
# Mean ArrDelay per route, sorted from highest to lowest
pd.DataFrame(routearrdel['ArrDelay'].mean()).sort_values('ArrDelay', ascending=False)
# Compare the DepDelay distribution across the values of the Cancelled column
flights.boxplot(column='DepDelay', by='Cancelled', figsize=(8,5))