According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This data set can be used to predict whether a patient is likely to have a stroke based on the input parameters like gender, age, various diseases, and work environment. For the purposes of this project, the data is used to construct several visualizations to display stroke and heart disease statistics and includes necessary metrics to make predictions and draw conclusions.
Each row in the data provides relevant information about the patient.
Link to data set: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset
Using this data set, I’ve used visualizations to show the following:
Formulae and assumptions are stated under each tab for the chart displayed.
I am able to make the following assumptions:
This vertical bar chart displays, for ages 0-82, the number of strokes recorded at each age. The average number of strokes per age is 5.66. What this indicates is that, on average, fewer than 6 people had a stroke at each age recorded.
What we see in the bar chart is that ages below 50 years are below the mean number of strokes, and most ages above 51 years are above the mean number of strokes per age. The closer the age is to 80 years, the further the count rises above the average number of strokes by age. This indicates that the older the age, the greater the number of people having a stroke.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'c:/ProgramData/Anaconda3/Library/plugins/platforms'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Location of the Kaggle stroke-prediction CSV (mapped network drive).
filename = 'U:/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(filename)

# Total number of recorded strokes at each age, ordered by age.
# Ages with no recorded stroke are left out, so every later statistic
# (including the mean line) is computed over ages with at least one stroke.
x = (
    df.groupby('age')['stroke']
    .sum()
    .reset_index()
    .rename(columns={'age': 'Age', 'stroke': 'Stroke'})
    .sort_values('Age')
    .loc[lambda per_age: per_age.Stroke != 0]
    .reset_index(drop=True)
)
def pick_mean_colors(this_data, tolerance=0.01, above='red', below='blue',
                     within='green'):
    """Return one bar colour per row, based on distance from the mean.

    Each value of ``this_data.Stroke`` is compared with the mean of that
    column. Values more than ``tolerance`` (as a fraction of the mean) above
    it get ``above``, values more than ``tolerance`` below it get ``below``,
    and everything in between gets ``within``.

    Args:
        this_data: Object exposing a ``Stroke`` attribute that is iterable
            and has a ``mean()`` method (e.g. a DataFrame with a ``Stroke``
            column).
        tolerance: Half-width of the "at the average" band as a fraction of
            the mean. The default of 0.01 gives a band of +/- 1%.
        above: Colour for values above the band.
        below: Colour for values below the band.
        within: Colour for values inside the band.

    Returns:
        list: Colour names in the same order as ``this_data.Stroke``. Empty
        when there are no rows.
    """
    avg = this_data.Stroke.mean()
    # Compute the band edges once instead of on every iteration.
    upper = avg * (1 + tolerance)
    lower = avg * (1 - tolerance)
    return [
        above if value > upper else below if value < lower else within
        for value in this_data.Stroke
    ]
import matplotlib.patches as mpatches
# --- Bar chart: strokes recorded at each age, coloured relative to the mean ---

# Plot every aggregated age row. The last row index is taken from the data
# instead of being hard-coded, so the chart cannot silently truncate ages.
bottom1 = 0
top1 = len(x) - 1
d1 = x.loc[bottom1:top1]
my_colors1 = pick_mean_colors(d1)

# The mean is used for the reference line and for its text label.
mean_strokes = d1.Stroke.mean()
# Age (x-axis position) at which the "Mean=" label is written.
mean_label_age = 24

# Legend proxies: the bars are coloured individually, so the legend needs
# explicit patches rather than the single 'Stroke' bar container.
Above = mpatches.Patch(color='red', label='Above Average')
At = mpatches.Patch(color='green', label='Within 1% of the Average')
Below = mpatches.Patch(color='blue', label='Below Average')

fig = plt.figure(figsize=(20, 18))
fig.suptitle('Average Number of Stroke Patients by Age from 0-82 Years Recorded', fontsize=24, fontweight='bold')
ax1 = fig.add_subplot(1, 1, 1)
ax1.bar(d1.Age, d1.Stroke, label='Stroke', color=my_colors1)
ax1.legend(handles=[Above, At, Below], fontsize=16)

# Dashed reference line at the mean, drawn on the same axes as the bars.
ax1.axhline(mean_strokes, color='black', linestyle='dashed')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.axes.xaxis.set_visible(True)
ax1.text(mean_label_age, mean_strokes + 0.5, 'Mean=' + str(round(mean_strokes, 2)), rotation=0, fontsize=16)

ax1.set_xlabel('Age', labelpad=20, fontsize=20)
ax1.set_ylabel('Stroke', labelpad=20, fontsize=20)
ax1.set_xticks(np.arange(0, 86, 5))
ax1.set_yticks(np.arange(0, 23, 1))
plt.show()
This horizontal bar chart displays the number of stroke patients based on work environment. The average number of stroke patients is 62.25. People who work in the private sector industry have the highest number of stroke patients recorded, followed by those who are self-employed.
People who categorized themselves within these two work environments are above the average threshold of stroke patients.
# Total number of recorded strokes for each work type, with display names.
y = df.groupby(['work_type']).agg({'stroke': ['sum']}).reset_index()
y.columns = ['Work_Type', 'Stroke']
y['Work_Type'] = y['Work_Type'].replace(
    ['Never_worked', 'Govt_job', 'children', 'Self-employed', 'Private'],
    ['Never Worked', 'Government Job', 'Child/NA', 'Self-Employed', 'Private Sector'])

# Exclude the "Never Worked" category by name. Previously the first row after
# sorting by stroke count was dropped positionally, which only removes this
# category while it happens to have the fewest strokes.
y = y[y.Work_Type != 'Never Worked']

# Fewest strokes first, so the horizontal bars grow from bottom to top.
y = y.sort_values('Stroke', ascending=True)
y.reset_index(inplace=True, drop=True)
def pick_mean_colors_two(this_data2, tolerance=0.01, above='lightcoral',
                         below='purple', within='black'):
    """Return one bar colour per row, based on distance from the mean.

    Each value of ``this_data2.Stroke`` is compared with the mean of that
    column. Values more than ``tolerance`` (as a fraction of the mean) above
    it get ``above``, values more than ``tolerance`` below it get ``below``,
    and everything in between gets ``within``.

    Args:
        this_data2: Object exposing a ``Stroke`` attribute that is iterable
            and has a ``mean()`` method (e.g. a DataFrame with a ``Stroke``
            column).
        tolerance: Half-width of the "at the average" band as a fraction of
            the mean. The default of 0.01 gives a band of +/- 1%.
        above: Colour for values above the band.
        below: Colour for values below the band.
        within: Colour for values inside the band.

    Returns:
        list: Colour names in the same order as ``this_data2.Stroke``. Empty
        when there are no rows.
    """
    avg = this_data2.Stroke.mean()
    # Compute the band edges once instead of on every iteration.
    upper = avg * (1 + tolerance)
    lower = avg * (1 - tolerance)
    return [
        above if value > upper else below if value < lower else within
        for value in this_data2.Stroke
    ]
import matplotlib.patches as mpatches
# --- Horizontal bar chart: strokes per work type, coloured relative to the mean ---

# Label-based slice of the aggregated rows; re-sorted so the bars are drawn
# from the smallest count (bottom) to the largest (top).
bottom2 = 0
top2 = 5
d2 = y.loc[bottom2:top2]
d2 = d2.sort_values('Stroke', ascending=True)
d2.reset_index(inplace=True, drop=True)
my_colors2 = pick_mean_colors_two(d2)

# The mean is used for the reference line and for its text label.
mean_strokes2 = d2.Stroke.mean()

# Legend proxies matching the colours chosen by pick_mean_colors_two.
Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 1% of the Average')
Below = mpatches.Patch(color='purple', label='Below Average')

fig = plt.figure(figsize=(18, 12))
ax1 = fig.add_subplot(1, 1, 1)
ax1.barh(d2.Work_Type, d2.Stroke, color=my_colors2)

# Leave 10% headroom on the right for the value labels. This does not depend
# on the row being labelled, so it is set once, outside the loop.
if not d2.empty:
    ax1.set_xlim(0, d2.Stroke.max() * 1.1)

# Write each bar's count just past its end, in the same colour as the bar.
for row_counter, (value_at_row_counter, color) in enumerate(zip(d2.Stroke, my_colors2)):
    ax1.text(value_at_row_counter + 2, row_counter, str(value_at_row_counter), color=color, size=14, fontweight='bold')

ax1.legend(loc='lower right', handles=[Above, At, Below], fontsize=18)

# Dashed reference line at the mean, with its value written beside it.
ax1.axvline(mean_strokes2, color='black', linestyle='dashed')
ax1.text(mean_strokes2 + 5, 0, 'Mean = ' + str(mean_strokes2), rotation=0, fontsize=16)

ax1.set_title('Average Number of Stroke Patients Based on Work Environment', size=20, fontweight='bold', pad=20)
ax1.set_xlabel('Stroke Count', fontsize=18, labelpad=20)
ax1.set_ylabel('Work Type', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
The pie chart displays the percent of patients who have heart disease based on the work environment. The total number of heart disease patients is 276 and over 50% of patients identify their careers to be within the private sector industry.
# --- Donut chart: heart-disease patients per work type ---

# Total heart-disease cases for each work type, with display names.
pie_df = df.groupby(['work_type'])['heart_disease'].sum().reset_index(name='Total_HeartDisease')
pie_df.columns = ['Work_Type', 'Total_HeartDisease']
pie_df['Work_Type'] = pie_df['Work_Type'].replace(
    ['Govt_job', 'children', 'Self-employed', 'Private'],
    ['Government Job', 'Child/NA', 'Self-Employed', 'Private Sector Employee'])

# Exclude the 'Never_worked' category by name instead of by row position, so
# the result does not depend on the alphabetical order of the groups.
pie_df = pie_df[pie_df.Work_Type != 'Never_worked'].reset_index(drop=True)

# Wedge labels and sizes come straight from the aggregated frame, so the
# chart always matches the data instead of a hand-copied list of counts.
labels = pie_df.Work_Type.tolist()
sizes = pie_df.Total_HeartDisease.tolist()
total_heart_disease = pie_df.Total_HeartDisease.sum()

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(1, 1, 1)

# Each wedge shows its percentage and, in parentheses, the patient count
# recovered from that percentage.
patches, texts, pcts = ax.pie(
    sizes, labels=labels, labeldistance=1.1, wedgeprops=dict(edgecolor='white'),
    textprops={'fontsize': 16}, pctdistance=0.85,
    autopct=lambda p: '{:.2f}%\n({:.0f})'.format(p, (p / 100) * total_heart_disease),
    startangle=90)
plt.setp(pcts, color='white', fontweight='bold')

# White disc over the centre turns the pie into a donut; the overall total
# is written inside it.
hole = plt.Circle((0, 0), 0.3, fc='white')
ax.add_artist(hole)
ax.yaxis.set_visible(False)
plt.title('Patients with Heart Disease by Environment', fontsize=18, fontweight='bold', pad=20)
ax.text(0, 0, 'Total Heart Disease\n ' + str(total_heart_disease), size=18, ha='center', va='center')
ax.axis('equal')
plt.tight_layout()
plt.show()
This nested pie chart shows additional information about patients with heart disease. Within the pie chart that displays the percent of heart disease patients who are men and women, it also shows nested information about the percent of men and women who have heart disease, based on their work environment: 59% of the patients with heart disease are men, and men who work in the private sector industry account for 32.6% of all patients with heart disease.
Based on this nested pie chart, we can see that men have a higher percent of heart disease than women, as do those who work in private sector industries. The nested pie chart displays a multidimensional view of layered statistics.
# --- Nested pie: heart disease by gender (outer ring) and work type (inner ring) ---

# Total heart-disease cases for every (gender, work type) combination.
pie_nest_df = df.groupby(['gender', 'work_type']).agg({'heart_disease': ['sum']}).reset_index()
# NOTE(review): rows are removed by position; this presumably drops the
# combinations that should not appear as wedges (e.g. never-worked rows) --
# confirm against the grouped frame if the data set changes.
pie_nest_df = pie_nest_df.drop(pie_nest_df.index[[1, 4, 6, 10]])
pie_nest_df['work_type'] = pie_nest_df['work_type'].replace(['Govt_job', 'Private', 'Self-employed', 'children'], ['Government Job', 'Private Sector Employee', 'Self-Employed', 'Child/NA'])
pie_nest_df.columns = ['Gender', 'Work_Type', 'Total_HeartDisease']
pie_nest_df.reset_index(inplace=True, drop=True)

# Colour indices into the colormap: every 4th index goes to an outer
# (gender) wedge and the indices in between go to the inner wedges.
number_outside_colors = len(pie_nest_df.Gender.unique())
outside_color_ref_number = np.arange(number_outside_colors) * 4
# One inner colour per inner wedge, i.e. per row. Counting unique totals
# would yield too few colours whenever two rows share the same total.
number_inside_colors = len(pie_nest_df)
all_color_ref_number = np.arange(number_outside_colors + number_inside_colors)
inside_color_ref_number = [
    each for each in all_color_ref_number
    if each not in outside_color_ref_number
]
print(outside_color_ref_number)
print(inside_color_ref_number)

fig = plt.figure(figsize=(14, 14))
ax = fig.add_subplot(1, 1, 1)
colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)
inner_colors = colormap(inside_color_ref_number)
all_hd = pie_nest_df.Total_HeartDisease.sum()

# Outer ring: share of heart-disease patients per gender, with the patient
# count recovered from the percentage.
pie_nest_df.groupby(['Gender'])['Total_HeartDisease'].sum().plot(
    kind='pie', radius=1, colors=outer_colors, pctdistance=0.6, labeldistance=0.8,
    wedgeprops=dict(edgecolor='white'), textprops=dict(color='white', fontsize=26, fontweight='bold', va='center'),
    autopct=lambda p: '{:.2f}%\n({:.0f})'.format(p, (p / 100) * all_hd), startangle=90)

# Inner ring: one wedge per (gender, work type) row. Every wedge except the
# last is offset outwards; the list is sized from the data so it always
# matches the number of wedges.
explode = [0.5] * (len(pie_nest_df) - 1) + [0]
pie_nest_df.Total_HeartDisease.plot(
    kind='pie', radius=0.7, colors=inner_colors,
    pctdistance=0.72, labeldistance=0.9, wedgeprops=dict(edgecolor='White'),
    textprops={'fontsize': 16}, labels=pie_nest_df.Work_Type, explode=explode,
    shadow=True,
    autopct='%1.2f%%', startangle=90)

# White disc in the centre holds the overall total.
hole = plt.Circle((0, 0), 0.3, fc='white')
ax.add_artist(hole)
ax.yaxis.set_visible(False)
plt.title('Patients with Heart Disease:\n Based on Gender & Work Environment', fontsize=20,
          fontweight='bold')
ax.text(0, 0, 'Total Heart\n Diseased\n Patients\n ' + str(all_hd), size=20, ha='center', va='center')
ax.axis('equal')
plt.tight_layout()
plt.show()
The dual axis chart compares the total number of heart disease and stroke patients based on work type. Based on the total count, heart disease exceeds the number of patients who had a stroke for each work type.
Once again, we see that patients who work in private sector industries are recorded to have more heart disease (158) and stroke (149) cases than those in the other work environments.
# Total strokes and total heart-disease cases for each work type.
dual_df = df.groupby(['work_type']).agg({'stroke': ['sum'], 'heart_disease': ['sum']}).reset_index()
dual_df.columns = ['WorkType', 'Stroke', 'Heart_Disease']
dual_df['WorkType'] = dual_df['WorkType'].replace(
    ['Govt_job', 'Private', 'Self-employed', 'children'],
    ['Government Job', 'Private Sector Employee', 'Self-Employed', 'Child/NA'])

# Exclude the 'Never_worked' category by name instead of by row position, so
# the result does not depend on the alphabetical order of the groups.
dual_df2 = dual_df[dual_df.WorkType != 'Never_worked'].reset_index(drop=True)
def barlabel(these_bars, this_ax, y_scale=1.01, **text_kwargs):
    """Write each bar's height as a text label just above the bar.

    Args:
        these_bars: Iterable of bar objects exposing ``get_height()``,
            ``get_x()`` and ``get_width()`` (e.g. the container returned by
            ``Axes.bar``).
        this_ax: Axes-like object whose ``text`` method draws the labels.
            Pass the axes the bars were drawn on so the labels use the same
            y-scale as the bars.
        y_scale: Factor applied to the bar height to get the label's
            y-position. The default of 1.01 puts it 1% above the bar top.
        **text_kwargs: Overrides for the default text style (fontsize 11,
            black, horizontally centred, anchored at the bottom).
    """
    style = {'fontsize': 11, 'color': 'black', 'ha': 'center', 'va': 'bottom'}
    style.update(text_kwargs)
    for each_bar in these_bars:
        height = each_bar.get_height()
        # Centre the label horizontally over the bar.
        x_center = each_bar.get_x() + each_bar.get_width() / 2
        this_ax.text(x_center, height * y_scale, format(height), **style)
# --- Dual-axis bar chart: total strokes vs. total heart disease per work type ---

fig = plt.figure(figsize=(18, 10))
ax1 = fig.add_subplot(1, 1, 1)
# Second y-axis sharing the same x-axis; heart-disease bars are drawn on it.
ax2 = ax1.twinx()

bar_width = 0.2
# One x position per work type, taken from the data so the bar arrays and
# tick labels always line up with the number of categories.
x_pos = np.arange(len(dual_df2))

# The two series sit side by side, each shifted half a bar width from the tick.
sum_stroke_bars = ax1.bar(x_pos - (0.5 * bar_width), dual_df2.Stroke, bar_width, color='yellow', edgecolor='black',
                          label='Sum of Stroke')
sum_hd_bars = ax2.bar(x_pos + (0.5 * bar_width), dual_df2.Heart_Disease, bar_width, color='purple', edgecolor='black',
                      label='Sum of Heart Disease')

ax1.set_xlabel('Work Environment', fontsize=18, labelpad=20)
ax1.set_ylabel('Total Stroke', fontsize=18, labelpad=20)
ax2.set_ylabel('Total Heart Disease', fontsize=18, rotation=270, labelpad=30)
ax1.tick_params(axis='y', labelsize=18)
ax2.tick_params(axis='y', labelsize=18)
plt.title('Total Stroke and Heart Disease Patients\n by Work Environment', fontsize=22, pad=30, fontweight='bold')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(dual_df2.WorkType, fontsize=18)

# Merge the legend entries of both axes into a single legend on ax1.
stroke_color, stroke_label = ax1.get_legend_handles_labels()
hd_color, hd_label = ax2.get_legend_handles_labels()
legend = ax1.legend(stroke_color + hd_color, stroke_label + hd_label, loc='upper left', frameon=True, ncol=1,
                    shadow=True, borderpad=1, fontsize=14)

# Extra headroom on the stroke axis (1.5x the tallest stroke bar).
ax1.set_ylim(0, dual_df2.Stroke.max() * 1.50)

# Value labels are drawn on the axes that owns each set of bars so they are
# placed using that axes' own y-scale.
barlabel(sum_stroke_bars, ax1)
barlabel(sum_hd_bars, ax2)
plt.show()
General takeaways from the data set:
Men who work in private sector industries are most susceptible to heart disease and/or stroke. People over the age of 50 years are in the ‘above average’ category for number of stroke patients by age. This could possibly indicate that those over the age of 50 will see more frequent stroke cases among their peers. Heart disease appears to be less common in children than stroke.
Ages 0-2 include year & months in float/decimal format.