Between 2010 and 2016, oil pipeline accidents followed clear, predictable patterns driven by a small number of recurring failure mechanisms and consistent temporal rhythms. The data shows that accidents are not random events; they cluster around specific sub causes, seasons, months, days, and hours, revealing systemic vulnerabilities and operational risk windows.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/Users/dkwlk/anaconda3/Library/plugins/platforms'
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
file_path = Path("C:/Users/dkwlk/OneDrive/Documents/Nates Docs/Loyola_MSDS/2026_Spring/DS-736_Data-Visualizations/Assigment_2/Oil_Pipeline_Accident/dataset2.csv")
oil_df = pd.read_csv(file_path)
from IPython.display import Image
import matplotlib.image as mpimg
img = mpimg.imread(
"C:/Users/dkwlk/OneDrive/Documents/Nates Docs/Loyola_MSDS/2026_Spring/DS-736_Data-Visualizations/Assigment_2/Oil_Pipeline_Accident/Image_Oil_Pipeline/self-healing-coatings-for-oil-pipelines-Corrosionpedia-2048x1170.jpg"
)
plt.figure(figsize=(14, 14))
plt.imshow(img)
plt.axis('off')
plt.show()
import folium
import pandas as pd
from IPython.display import HTML
map_df = oil_df[['Accident Date/Time','Cause Category','Cause Subcategory',
'Pipeline Location','Accident County','Accident State',
'Accident City','Accident Latitude','Accident Longitude']]
center_of_map = [41.8500, -87.6501]
my_map = folium.Map(location=center_of_map, zoom_start=4)
tiles = ['cartodbpositron','openstreetmap','stamenterrain','stamentoner']
for tile in tiles:
folium.TileLayer(tile).add_to(my_map);
folium.LayerControl().add_to(my_map);
for i in range(len(map_df)):
cat = map_df.loc[i,'Cause Category']
if cat == 'INCORRECT OPERATION':
color = 'green'
elif cat == 'MATERIAL/WELD/EQUIP FAILURE':
color = 'red'
elif cat == 'ALL OTHER CAUSES':
color = 'blue'
else:
color = 'black'
try:
if color != 'black':
folium.Circle(
location=[map_df.loc[i,'Accident Latitude'], map_df.loc[i,'Accident Longitude']],
tooltip=cat,
popup=f"Date: {map_df.loc[i,'Accident Date/Time']}<br>Cause Subcategory: {map_df.loc[i,'Cause Subcategory']}",
radius=50,
color=color,
fill=True,
fill_color=color,
fill_opacity=0.5
).add_to(my_map);
except:
pass
HTML(my_map._repr_html_())
Select the map layer (upper right) to see street level features (select ‘open streetmap’). Use the Zoom button (upper left, (-/+)), to see the causes of the oil pipe line leaks, zoom-in and hover over the colored dot or click on the colored dot.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/Users/dkwlk/anaconda3/Library/plugins/platforms'
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# Load data
file_path = Path("C:/Users/dkwlk/OneDrive/Documents/Nates Docs/Loyola_MSDS/2026_Spring/DS-736_Data-Visualizations/Assigment_2/Oil_Pipeline_Accident/dataset2.csv")
oil_df = pd.read_csv(file_path)
# Extract datetime
df_datetime = oil_df[['Accident Date/Time']].copy()
df_datetime['Accident Date/Time'] = pd.to_datetime(df_datetime['Accident Date/Time'], format='%m/%d/%Y %H:%M')
# Extract components
df_datetime['Year'] = df_datetime['Accident Date/Time'].dt.year
df_datetime['Quarter'] = df_datetime['Accident Date/Time'].dt.quarter
df_datetime['Month'] = df_datetime['Accident Date/Time'].dt.month
df_datetime['Day'] = df_datetime['Accident Date/Time'].dt.day
df_datetime['Hour'] = df_datetime['Accident Date/Time'].dt.hour
# Remove missing and 2017
df_datetime = df_datetime.dropna()
df_datetime = df_datetime[df_datetime['Year'] != 2017]
df_datetime['Year'] = df_datetime['Year'].astype(int)
# Count accidents per year
df_datetime['Accident_Year_Count'] = df_datetime.groupby('Year')['Year'].transform('count')
# Plot
with sns.axes_style('white'):
g = sns.catplot(
x='Year',
data=df_datetime,
aspect=2,
kind='count',
color='steelblue'
);
# Add labels
ax = g.ax
for p in ax.patches:
height = p.get_height()
ax.text(
p.get_x() + p.get_width() / 2,
height,
f'{int(height)}',
ha='center',
va='bottom',
fontsize=11
)
plt.title('Oil Pipeline Accidents by Year', fontsize=14)
plt.xlabel('Year', fontsize=11)
plt.ylabel('Count', fontsize=11)
plt.show()
Yearly oil pipe line failures are not randomly distributed across the
years from 2010 through 2016. In the year 2015, we see a peak in oil
pipeline leaks by an increase of 61 cases from 2013.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# New Data Frame
df_datetime3 = df_datetime.copy()
# Keep valid months only
df_datetime3 = df_datetime3[df_datetime3['Month'].isin(range(1, 13))]
# Ensure numeric types
df_datetime3['Year'] = df_datetime3['Year'].astype(int)
df_datetime3['Month'] = df_datetime3['Month'].astype(int)
# Pivot table for heatmap
hm_df = pd.pivot_table(
df_datetime3,
index='Year',
columns='Month',
values='Accident_Year_Count',
aggfunc='sum'
)
# Convert to numeric matrix
hm_df = hm_df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)
# Plot heatmap
fig = plt.figure(figsize=(18, 10))
ax = fig.add_subplot(1, 1, 1)
comma_fmt = FuncFormatter(lambda x, p: format(int(x), ','))
sns.heatmap(
hm_df,
linewidth=0.2,
annot=True,
cmap='coolwarm',
fmt=',.0f',
square=True,
annot_kws={'size': 9},
cbar_kws={'format': comma_fmt, 'orientation': 'vertical'},
ax=ax
);
plt.title("Heatmap of Oil Pipeline Accidents by Year and Month", fontsize=18, pad=15)
plt.xlabel("Accident Month", fontsize=15)
plt.ylabel("Accident Year", fontsize=15)
plt.xticks(fontsize=10)
plt.yticks(rotation=0, fontsize=10)
ax.invert_yaxis()
# Colorbar
cbar = ax.collections[0].colorbar
cbar.set_label("Number of Accidents", rotation=270, fontsize=14, labelpad=20)
plt.show()
The heatmap reveals that certain months repeatedly experience elevated
accident counts, indicating environmental or operational stress cycles.
Some years show concentrated clusters of high incident months,
suggesting periods of increased activity.
# Second tab: total accidents by hour and weekday
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Ensure weekday column exists
df_datetime['WeekDay'] = df_datetime['Accident Date/Time'].dt.day_name().str[:3]
# Group by hour and weekday
accident_df = (
df_datetime
.groupby(['Hour', 'WeekDay'])['Accident_Year_Count']
.count()
.reset_index(name='Total Accidents')
)
# Pivot for stacked bar plot
pivot_df = accident_df.pivot(index='Hour', columns='WeekDay', values='Total Accidents')
# Reorder weekday columns
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
pivot_df = pivot_df.reindex(columns=day_order)
# Plot
fig = plt.figure(figsize=(18, 10))
ax = fig.add_subplot(1, 1, 1)
pivot_df.plot(kind='bar', stacked=True, ax=ax);
# Labels and formatting
plt.ylabel('Total Accidents', fontsize=18, labelpad=10)
plt.title('Total Accidents by Hour and by Day (2010-2016)\nStacked Bar Plot', fontsize=18)
plt.xticks(rotation=0, fontsize=14)
plt.yticks(fontsize=14)
ax.set_xlabel('Hour (24-Hour Interval)', fontsize=18)
# Reorder legend manually
handles, labels = ax.get_legend_handles_labels()
handles = handles[::-1]
labels = labels[::-1]
ax.legend(handles, labels, title='Day', fontsize=14)
plt.show()
Daily and hourly patterns reinforce the role of human activity.
Accidents peak during weekday working hours — especially late morning
through late afternoon — and drop sharply overnight and on
weekends.
This indicates that operational activity, maintenance work, and
third‑party interactions with pipelines are major drivers of incident
frequency.
# I paste some code in here for my second tab
# we add the word quarter to the number quarter
# Ensure MonthName exists for this chunk
df_datetime['MonthName'] = df_datetime['Accident Date/Time'].dt.month_name().str[:3]
df_datetime['Quarter'] = 'Quarter ' + df_datetime['Accident Date/Time'].dt.quarter.astype('string')
df_datetime2 = df_datetime
df_datetime3 = df_datetime
df_datetime2 = df_datetime2.groupby(['Quarter', 'MonthName', 'Month'])['Accident_Year_Count'].count().reset_index(name='Total_Accidents')
df_datetime2.sort_values(by=['Month'], inplace=True) #sorted by month in ascending order
df_datetime2.reset_index(inplace=True, drop=True) #drop the old index
del df_datetime2['Month']
number_outside_colors = len(df_datetime2.Quarter.unique()) # returns the count of 4 quarters
outside_color_ref_number = np.arange(number_outside_colors) * 4 # retruns 0 ,4,8,12 - range of colors
number_inside_colors = len(df_datetime2.MonthName.unique()) # returns the count of 4 quarters
all_color_ref_number = np.arange(number_outside_colors + number_inside_colors) # retruns 0 ,4,8,12 - range of colors
inside_color_ref_number = []
for each in all_color_ref_number:
if each not in outside_color_ref_number:
inside_color_ref_number.append(each)
inside_color_ref_number = np.array(inside_color_ref_number).astype(np.int64)
# build up the pie chart
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(1,1,1)
# set the colors
colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)
all_accidents = df_datetime2.Total_Accidents.sum() # center value
# OUTER PIE — totals by quarter
df_datetime2.groupby(['Quarter'])['Total_Accidents'].sum().plot(
kind='pie',
radius=1,
colors=outer_colors,
pctdistance=0.85,
labeldistance=1.1,
wedgeprops=dict(edgecolor='white'),
textprops={'fontsize': 18},
autopct=lambda p: '{:.1f}%\n({:.0f})'.format(p, (p/100)*all_accidents),
startangle=90
)
# INNER PIE — totals by month
inner_colors = colormap(inside_color_ref_number)
df_datetime2.Total_Accidents.plot(
kind='pie',
radius=0.7,
colors=inner_colors,
pctdistance=0.55,
labeldistance=0.8,
wedgeprops=dict(edgecolor='white'),
textprops={'fontsize': 13},
labels=df_datetime2.MonthName,
autopct='%1.2f%%',
startangle=90
)
# add a circle in the middle
hole = plt.Circle((0,0), 0.3, fc='white')
fig.gca().add_artist(hole)
# remove y-axis
ax.yaxis.set_visible(False)
plt.title('Total Pipeline Accidents by Quarter and Month (2010-2016)', fontsize=18)
# add text to the center
ax.text(0, 0,
' Total Accidents\n' + str(round(all_accidents, 2)),
size=18, ha='center', va='center')
ax.axis('equal')
plt.tight_layout()
plt.show()
Certain months repeatedly experience more pipeline accidents than
others. • Winter spikes → cold related stress, brittle materials,
pressure changes • Summer spikes → construction activity, excavation
damage, thermal expansion • Spring/Fall dips → transitional periods with
fewer extreme conditions
# I paste some code in here for my second tab
cause_df = oil_df.groupby(['Cause Category']) ['Cause Category'].count().reset_index(name = 'oil_leak_count')
cause_df = pd.DataFrame(cause_df)
cause_df = cause_df.sort_values(by='oil_leak_count',ascending=False)
x=cause_df.oil_leak_count.sum()
with sns.axes_style('white'):
g = sns.catplot(
x="Cause Category",
y="oil_leak_count",
data=cause_df,
kind='bar', # <-- changed from 'count' to 'bar'
estimator=sum, # sum oil_leak_count per category
ci=None,
aspect=2,
height = 8, # increase the figure height
color='steelblue'
);
g.set_xticklabels(rotation=45, ha='right', step=1)
# Add labels on top of bars
ax = g.ax
for p in ax.patches:
height = p.get_height()
ax.text(
p.get_x() + p.get_width() / 2,
height,
f'{int(height)}',
ha='center',
va='bottom',
fontsize=11
)
plt.title('Oil Pipeline Accidents by Cause Type, from 2010–2016', fontsize=18)
plt.xlabel('Cause Category', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, horizontalalignment='center', fontsize=11)
plt.tight_layout()
plt.show()
The plot shows the top causes of the oil pipeline leaks that have
occurred. The top failure cause is “material/weld/equipment” failure.
This includes corrosion failures.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
scause_df = (
oil_df.groupby('Cause Subcategory')['Cause Subcategory']
.count()
.reset_index(name='count')
).sort_values(by='count', ascending=False).reset_index(drop=True)
def pick_colors_by_mean_count(df):
avg = df['count'].mean()
return [
'lightcoral' if x > avg * 1.01 else
'green' if x < avg * 0.99 else
'black'
for x in df['count']
]
d1 = scause_df.iloc[0:25]
d2 = scause_df.iloc[0:10]
my_colors1 = pick_colors_by_mean_count(d1)
my_colors2 = pick_colors_by_mean_count(d2)
mean1 = d1['count'].mean()
mean2 = d2['count'].mean()
Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 1% of Average')
Below = mpatches.Patch(color='green', label='Below Average')
fig = plt.figure(figsize=(18,16))
fig.suptitle('Frequency of Oil Pipeline Leaks by Cause Subcategory:\nTop 25 and Top 10', fontsize=18)
ax1 = fig.add_subplot(2,1,1)
ax1.bar(d1['Cause Subcategory'], d1['count'], color=my_colors1);
ax1.legend(handles=[Above, At, Below], fontsize=14)
ax1.axhline(mean1, color='black', linestyle='dashed')
ax1.set_title('Top 25 Subcauses', size=20)
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.axes.xaxis.set_visible(False)
ax1.text(1, mean1 + 5, f'Mean = {mean1:.2f}', fontsize=14)
ax2 = fig.add_subplot(2,1,2)
ax2.bar(d2['Cause Subcategory'], d2['count'], color=my_colors2);
for bar in ax2.patches:
height = bar.get_height()
ax2.text(
bar.get_x() + bar.get_width()/2,
height,
f'{int(height)}',
ha='center',
va='bottom',
fontsize=12
)
ax2.legend(handles=[Above, At, Below], fontsize=14)
ax2.axhline(mean2, color='black', linestyle='solid')
ax2.set_title('Top 10 Subcauses', size=20)
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
plt.show()
Conclusion: A small set of sub causes accounts for a disproportionate share of all incidents. These high frequency contributors consistently exceed the historical average and reflect persistent structural weaknesses in pipeline systems — including corrosion, material defects, equipment failures, and operational errors. Addressing these top contributors would yield the greatest reduction in total incidents.