Analysis of Oil Pipeline Leaks (2010 - 2016)

Between 2010 and 2016, oil pipeline accidents followed clear, predictable patterns driven by a small number of recurring failure mechanisms and consistent temporal rhythms. The data shows that accidents are not random events; they cluster around specific sub causes, seasons, months, days, and hours, revealing systemic vulnerabilities and operational risk windows.

import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/Users/dkwlk/anaconda3/Library/plugins/platforms'

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
from datetime import datetime, timedelta

file_path = Path("C:/Users/dkwlk/OneDrive/Documents/Nates Docs/Loyola_MSDS/2026_Spring/DS-736_Data-Visualizations/Assigment_2/Oil_Pipeline_Accident/dataset2.csv")
oil_df = pd.read_csv(file_path)

Oil Pipeline Image

from IPython.display import Image
import matplotlib.image as mpimg

img = mpimg.imread(
    "C:/Users/dkwlk/OneDrive/Documents/Nates Docs/Loyola_MSDS/2026_Spring/DS-736_Data-Visualizations/Assigment_2/Oil_Pipeline_Accident/Image_Oil_Pipeline/self-healing-coatings-for-oil-pipelines-Corrosionpedia-2048x1170.jpg"
)

plt.figure(figsize=(14, 14))
plt.imshow(img)
plt.axis('off')
plt.show()

Interactive Map of the Oil Pipeline Leaks

import folium
import pandas as pd
from IPython.display import HTML

map_df = oil_df[['Accident Date/Time','Cause Category','Cause Subcategory',
                 'Pipeline Location','Accident County','Accident State',
                 'Accident City','Accident Latitude','Accident Longitude']]

center_of_map = [41.8500, -87.6501]

my_map = folium.Map(location=center_of_map, zoom_start=4)

tiles = ['cartodbpositron','openstreetmap','stamenterrain','stamentoner']
for tile in tiles:
    folium.TileLayer(tile).add_to(my_map);

folium.LayerControl().add_to(my_map);

for i in range(len(map_df)):
    cat = map_df.loc[i,'Cause Category']
    if cat == 'INCORRECT OPERATION':
        color = 'green'
    elif cat == 'MATERIAL/WELD/EQUIP FAILURE':
        color = 'red'
    elif cat == 'ALL OTHER CAUSES':
        color = 'blue'
    else:
        color = 'black'

    try:
        if color != 'black':
            folium.Circle(
                location=[map_df.loc[i,'Accident Latitude'], map_df.loc[i,'Accident Longitude']],
                tooltip=cat,
                popup=f"Date: {map_df.loc[i,'Accident Date/Time']}<br>Cause Subcategory: {map_df.loc[i,'Cause Subcategory']}",
                radius=50,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=0.5
            ).add_to(my_map);
    except:
        pass
   

HTML(my_map._repr_html_())
Make this Notebook Trusted to load map: File -> Trust Notebook

Select the map layer (upper right) to see street level features (select ‘open streetmap’). Use the Zoom button (upper left, (-/+)), to see the causes of the oil pipe line leaks, zoom-in and hover over the colored dot or click on the colored dot.

Bar Plot - Pipeline Oil Leaks by Year (2010 - 2016)

import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/Users/dkwlk/anaconda3/Library/plugins/platforms'

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Load data
file_path = Path("C:/Users/dkwlk/OneDrive/Documents/Nates Docs/Loyola_MSDS/2026_Spring/DS-736_Data-Visualizations/Assigment_2/Oil_Pipeline_Accident/dataset2.csv")
oil_df = pd.read_csv(file_path)

# Extract datetime
df_datetime = oil_df[['Accident Date/Time']].copy()
df_datetime['Accident Date/Time'] = pd.to_datetime(df_datetime['Accident Date/Time'], format='%m/%d/%Y %H:%M')

# Extract components
df_datetime['Year'] = df_datetime['Accident Date/Time'].dt.year
df_datetime['Quarter'] = df_datetime['Accident Date/Time'].dt.quarter
df_datetime['Month'] = df_datetime['Accident Date/Time'].dt.month
df_datetime['Day'] = df_datetime['Accident Date/Time'].dt.day
df_datetime['Hour'] = df_datetime['Accident Date/Time'].dt.hour

# Remove missing and 2017
df_datetime = df_datetime.dropna()
df_datetime = df_datetime[df_datetime['Year'] != 2017]
df_datetime['Year'] = df_datetime['Year'].astype(int)

# Count accidents per year
df_datetime['Accident_Year_Count'] = df_datetime.groupby('Year')['Year'].transform('count')

# Plot
with sns.axes_style('white'):
    g = sns.catplot(
        x='Year',
        data=df_datetime,
        aspect=2,
        kind='count',
        color='steelblue'
    );

# Add labels
ax = g.ax
for p in ax.patches:
    height = p.get_height()
    ax.text(
        p.get_x() + p.get_width() / 2,
        height,
        f'{int(height)}',
        ha='center',
        va='bottom',
        fontsize=11
    )

plt.title('Oil Pipeline Accidents by Year', fontsize=14)
plt.xlabel('Year', fontsize=11)
plt.ylabel('Count', fontsize=11)

plt.show()

Yearly oil pipe line failures are not randomly distributed across the years from 2010 through 2016. In the year 2015, we see a peak in oil pipeline leaks by an increase of 61 cases from 2013.

Oil Pipeline Leaks Heat Map

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# New Data Frame
df_datetime3 = df_datetime.copy()

# Keep valid months only
df_datetime3 = df_datetime3[df_datetime3['Month'].isin(range(1, 13))]

# Ensure numeric types
df_datetime3['Year'] = df_datetime3['Year'].astype(int)
df_datetime3['Month'] = df_datetime3['Month'].astype(int)

# Pivot table for heatmap
hm_df = pd.pivot_table(
    df_datetime3,
    index='Year',
    columns='Month',
    values='Accident_Year_Count',
    aggfunc='sum'
)

# Convert to numeric matrix
hm_df = hm_df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)

# Plot heatmap
fig = plt.figure(figsize=(18, 10))
ax = fig.add_subplot(1, 1, 1)

comma_fmt = FuncFormatter(lambda x, p: format(int(x), ','))

sns.heatmap(
    hm_df,
    linewidth=0.2,
    annot=True,
    cmap='coolwarm',
    fmt=',.0f',
    square=True,
    annot_kws={'size': 9},
    cbar_kws={'format': comma_fmt, 'orientation': 'vertical'},
    ax=ax
);

plt.title("Heatmap of Oil Pipeline Accidents by Year and Month", fontsize=18, pad=15)
plt.xlabel("Accident Month", fontsize=15)
plt.ylabel("Accident Year", fontsize=15)

plt.xticks(fontsize=10)
plt.yticks(rotation=0, fontsize=10)
ax.invert_yaxis()

# Colorbar
cbar = ax.collections[0].colorbar
cbar.set_label("Number of Accidents", rotation=270, fontsize=14, labelpad=20)

plt.show()

The heatmap reveals that certain months repeatedly experience elevated accident counts, indicating environmental or operational stress cycles. Some years show concentrated clusters of high incident months, suggesting periods of increased activity.

Stacked Bar Chart

# Second tab: total accidents by hour and weekday

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Ensure weekday column exists
df_datetime['WeekDay'] = df_datetime['Accident Date/Time'].dt.day_name().str[:3]

# Group by hour and weekday
accident_df = (
    df_datetime
    .groupby(['Hour', 'WeekDay'])['Accident_Year_Count']
    .count()
    .reset_index(name='Total Accidents')
)

# Pivot for stacked bar plot
pivot_df = accident_df.pivot(index='Hour', columns='WeekDay', values='Total Accidents')

# Reorder weekday columns
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
pivot_df = pivot_df.reindex(columns=day_order)

# Plot
fig = plt.figure(figsize=(18, 10))
ax = fig.add_subplot(1, 1, 1)

pivot_df.plot(kind='bar', stacked=True, ax=ax);

# Labels and formatting
plt.ylabel('Total Accidents', fontsize=18, labelpad=10)
plt.title('Total Accidents by Hour and by Day (2010-2016)\nStacked Bar Plot', fontsize=18)
plt.xticks(rotation=0, fontsize=14)
plt.yticks(fontsize=14)
ax.set_xlabel('Hour (24-Hour Interval)', fontsize=18)

# Reorder legend manually
handles, labels = ax.get_legend_handles_labels()
handles = handles[::-1]
labels = labels[::-1]
ax.legend(handles, labels, title='Day', fontsize=14)

plt.show()

Daily and hourly patterns reinforce the role of human activity.
Accidents peak during weekday working hours — especially late morning through late afternoon — and drop sharply overnight and on weekends.
This indicates that operational activity, maintenance work, and third‑party interactions with pipelines are major drivers of incident frequency.

Nested Pie Chart: Total Pipeline Accidents

# I paste some code in here for my second tab
# we add the word quarter to the number quarter
# Ensure MonthName exists for this chunk
df_datetime['MonthName'] = df_datetime['Accident Date/Time'].dt.month_name().str[:3]
df_datetime['Quarter'] = 'Quarter ' +  df_datetime['Accident Date/Time'].dt.quarter.astype('string')
df_datetime2 = df_datetime
df_datetime3 = df_datetime

df_datetime2 = df_datetime2.groupby(['Quarter', 'MonthName', 'Month'])['Accident_Year_Count'].count().reset_index(name='Total_Accidents')

df_datetime2.sort_values(by=['Month'], inplace=True) #sorted by month in ascending order

df_datetime2.reset_index(inplace=True, drop=True) #drop the old index
del df_datetime2['Month']

number_outside_colors = len(df_datetime2.Quarter.unique()) # returns the count of 4 quarters
outside_color_ref_number = np.arange(number_outside_colors) * 4 # retruns 0 ,4,8,12 - range of colors

number_inside_colors = len(df_datetime2.MonthName.unique()) # returns the count of 4 quarters
all_color_ref_number = np.arange(number_outside_colors + number_inside_colors) # retruns 0 ,4,8,12 - range of colors

inside_color_ref_number = []
for each in all_color_ref_number:
    if each not in outside_color_ref_number:
        inside_color_ref_number.append(each)
        

inside_color_ref_number = np.array(inside_color_ref_number).astype(np.int64)

# build up the pie chart
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(1,1,1)

# set the colors
colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)

all_accidents = df_datetime2.Total_Accidents.sum()  # center value

# OUTER PIE — totals by quarter
df_datetime2.groupby(['Quarter'])['Total_Accidents'].sum().plot(
    kind='pie',
    radius=1,
    colors=outer_colors,
    pctdistance=0.85,
    labeldistance=1.1,
    wedgeprops=dict(edgecolor='white'),
    textprops={'fontsize': 18},
    autopct=lambda p: '{:.1f}%\n({:.0f})'.format(p, (p/100)*all_accidents),
    startangle=90
)

# INNER PIE — totals by month
inner_colors = colormap(inside_color_ref_number)

df_datetime2.Total_Accidents.plot(
    kind='pie',
    radius=0.7,
    colors=inner_colors,
    pctdistance=0.55,
    labeldistance=0.8,
    wedgeprops=dict(edgecolor='white'),
    textprops={'fontsize': 13},
    labels=df_datetime2.MonthName,   
    autopct='%1.2f%%',
    startangle=90
)

# add a circle in the middle
hole = plt.Circle((0,0), 0.3, fc='white')
fig.gca().add_artist(hole)

# remove y-axis
ax.yaxis.set_visible(False)

plt.title('Total Pipeline Accidents by Quarter and Month (2010-2016)', fontsize=18)

# add text to the center
ax.text(0, 0,
        ' Total Accidents\n' + str(round(all_accidents, 2)),
        size=18, ha='center', va='center')

ax.axis('equal')
plt.tight_layout()
plt.show()

Certain months repeatedly experience more pipeline accidents than others. • Winter spikes → cold related stress, brittle materials, pressure changes • Summer spikes → construction activity, excavation damage, thermal expansion • Spring/Fall dips → transitional periods with fewer extreme conditions

Top Causes of Oil Pipeline Leaks

# I paste some code in here for my second tab
cause_df = oil_df.groupby(['Cause Category']) ['Cause Category'].count().reset_index(name = 'oil_leak_count')
cause_df = pd.DataFrame(cause_df)
cause_df = cause_df.sort_values(by='oil_leak_count',ascending=False)
x=cause_df.oil_leak_count.sum()


with sns.axes_style('white'):
    g = sns.catplot(
        x="Cause Category",
        y="oil_leak_count",
        data=cause_df,
        kind='bar',          # <-- changed from 'count' to 'bar'
        estimator=sum,       # sum oil_leak_count per category
        ci=None,
        aspect=2,
        height = 8,          # increase the figure height
        color='steelblue'
    );
    g.set_xticklabels(rotation=45, ha='right', step=1)

# Add labels on top of bars
ax = g.ax
for p in ax.patches:
    height = p.get_height()
    ax.text(
        p.get_x() + p.get_width() / 2,
        height,
        f'{int(height)}',
        ha='center',
        va='bottom',
        fontsize=11
    )

plt.title('Oil Pipeline Accidents by Cause Type, from 2010–2016', fontsize=18)
plt.xlabel('Cause Category', fontsize=14)
plt.ylabel('Count', fontsize=14)

plt.xticks(rotation=45, horizontalalignment='center', fontsize=11)
plt.tight_layout()
plt.show()

The plot shows the top causes of the oil pipeline leaks that have occurred. The top failure cause is “material/weld/equipment” failure. This includes corrosion failures.

Top Sub-Causes of Oil Pipeline Leaks

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

scause_df = (
    oil_df.groupby('Cause Subcategory')['Cause Subcategory']
    .count()
    .reset_index(name='count')
).sort_values(by='count', ascending=False).reset_index(drop=True)

def pick_colors_by_mean_count(df):
    avg = df['count'].mean()
    return [
        'lightcoral' if x > avg * 1.01 else
        'green' if x < avg * 0.99 else
        'black'
        for x in df['count']
    ]

d1 = scause_df.iloc[0:25]
d2 = scause_df.iloc[0:10]

my_colors1 = pick_colors_by_mean_count(d1)
my_colors2 = pick_colors_by_mean_count(d2)

mean1 = d1['count'].mean()
mean2 = d2['count'].mean()

Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 1% of Average')
Below = mpatches.Patch(color='green', label='Below Average')

fig = plt.figure(figsize=(18,16))
fig.suptitle('Frequency of Oil Pipeline Leaks by Cause Subcategory:\nTop 25 and Top 10', fontsize=18)

ax1 = fig.add_subplot(2,1,1)
ax1.bar(d1['Cause Subcategory'], d1['count'], color=my_colors1);
ax1.legend(handles=[Above, At, Below], fontsize=14)
ax1.axhline(mean1, color='black', linestyle='dashed')
ax1.set_title('Top 25 Subcauses', size=20)
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.axes.xaxis.set_visible(False)

ax1.text(1, mean1 + 5, f'Mean = {mean1:.2f}', fontsize=14)

ax2 = fig.add_subplot(2,1,2)
ax2.bar(d2['Cause Subcategory'], d2['count'], color=my_colors2);

for bar in ax2.patches:
    height = bar.get_height()
    ax2.text(
        bar.get_x() + bar.get_width()/2,
        height,
        f'{int(height)}',
        ha='center',
        va='bottom',
        fontsize=12
    )

ax2.legend(handles=[Above, At, Below], fontsize=14)
ax2.axhline(mean2, color='black', linestyle='solid')
ax2.set_title('Top 10 Subcauses', size=20)
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')

plt.show()

Conclusion: A small set of sub causes accounts for a disproportionate share of all incidents. These high frequency contributors consistently exceed the historical average and reflect persistent structural weaknesses in pipeline systems — including corrosion, material defects, equipment failures, and operational errors. Addressing these top contributors would yield the greatest reduction in total incidents.