The data set used for this data visualization contains patient data that can be used to predict whether a patient is likely to have a stroke based on the input parameters like gender, age, various diseases, and work environment. For the purposes of this project, the data is used to construct several visualizations to display stroke and heart disease statistics among patients based on age, gender, and work type. The data set also includes necessary metrics to make predictions and draw conclusions.
According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
Link to data set: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset
Using this data set, I’ve used visualizations to show the following:
Formulae and assumptions are stated under each tab for the chart displayed.
I am able to make the following assumptions:
This vertical bar chart displays ages 0-82 and the average number of stroke patients per age. The average number of stroke patients per age is 5.66. What this indicates is that, for each age recorded, there is a chance that fewer than 6 people will have a stroke.
What we see in the bar chart is that patients less than 50 years old are below the mean number of stroke patients by age, and most ages above 51 years are above the mean number of stroke patients per age. The closer the age is to 80 years, the further the count rises above the average number of stroke patients by age. This indicates that the older the age, the greater the number of patients per age who are having a stroke.
import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'c:/ProgramData/Anaconda3/Library/plugins/platforms'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Directory and file name of the stroke-prediction CSV.
path = '//apporto.com/dfs/LOYOLA/Users/jlcaesar_loyola/Desktop/'
filename = path + 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(filename)

# Total number of stroke cases recorded at each age, ordered by age.
x = (
    df.groupby('age')['stroke']
    .sum()
    .reset_index()
    .rename(columns={'age': 'Age', 'stroke': 'Stroke'})
    .sort_values('Age', ascending=True)
)

# Keep only the ages that have at least one stroke, then renumber the rows
# from 0 so that later label-based slices run over consecutive integers.
x = x.loc[x['Stroke'] != 0].reset_index(drop=True)
def pick_mean_colors(this_data, tolerance=0.01, above='red', within='green', below='blue'):
    """Return one colour per value of ``this_data.Stroke`` relative to its mean.

    Args:
        this_data: Object exposing a ``Stroke`` attribute that is iterable
            and has a ``mean()`` method (e.g. a pandas DataFrame column).
        tolerance: Fractional band around the mean that still counts as
            "at the mean". The default of 0.01 is a 1% band.
        above: Colour for values greater than ``mean * (1 + tolerance)``.
        within: Colour for values inside the band.
        below: Colour for values smaller than ``mean * (1 - tolerance)``.

    Returns:
        A list of colour names, in the same order as ``this_data.Stroke``.
    """
    avg = this_data.Stroke.mean()
    # Compute the band edges once instead of on every iteration.
    upper = avg * (1 + tolerance)
    lower = avg * (1 - tolerance)

    colors = []
    for each in this_data.Stroke:
        if each > upper:
            colors.append(above)
        elif each < lower:
            colors.append(below)
        else:
            colors.append(within)
    return colors
import matplotlib.patches as mpatches
# Row-label range of the per-age table to plot (.loc slices include both ends).
bottom1 = 0
top1 = 44
d1 = x.loc[bottom1:top1]
my_colors1 = pick_mean_colors(d1)
# Computed once; reused for the reference line and its text label.
mean_stroke1 = d1.Stroke.mean()

# Legend handles matching the colours returned by pick_mean_colors.
Above = mpatches.Patch(color='red', label='Above Average')
At = mpatches.Patch(color='green', label='Within 1% of the Average')
Below = mpatches.Patch(color='blue', label='Below Average')

fig = plt.figure(figsize=(20, 18))
fig.suptitle('Average Number of Stroke Patients by Age from 0-82 Years Recorded', fontsize=24, fontweight='bold')
ax1 = fig.add_subplot(1, 1, 1)
ax1.bar(d1.Age, d1.Stroke, label='Stroke', color=my_colors1)
ax1.legend(handles=[Above, At, Below], fontsize=16)

# Dashed reference line at the mean, drawn on ax1 explicitly rather than on
# pyplot's implicit "current" axes.
ax1.axhline(mean_stroke1, color='black', linestyle='dashed')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.axes.xaxis.set_visible(True)
ax1.text(top1 - 20, mean_stroke1 + 0.5, 'Mean=' + str(round(mean_stroke1, 2)), rotation=0, fontsize=16)

ax1.set_xlabel('Age', labelpad=20, fontsize=20)
ax1.set_ylabel('Stroke', labelpad=20, fontsize=20)
ax1.set_xticks(np.arange(0, 86, 5))
ax1.set_yticks(np.arange(0, 23, 1))
plt.show()
This horizontal bar chart displays the average number of stroke patients based on work environment. The average number of stroke patients is 62.25. People who work in the private sector industry have the highest number of stroke patients recorded. Next are those who are self-employed.
People who categorized themselves within these work environments are above the average threshold of stroke patients.
# Total stroke cases per work type, ordered from fewest to most.
y = (
    df.groupby('work_type')['stroke']
    .sum()
    .reset_index()
    .rename(columns={'work_type': 'Work_Type', 'stroke': 'Stroke'})
    .sort_values('Stroke', ascending=True)
    .reset_index(drop=True)
)

# Swap the raw category codes for display-friendly labels.
y['Work_Type'] = y['Work_Type'].replace({
    'Never_worked': 'Never Worked',
    'Govt_job': 'Government Job',
    'children': 'Child/NA',
    'Self-employed': 'Self-Employed',
    'Private': 'Private Sector',
})

# Discard the first row (the work type with the fewest strokes after the
# ascending sort) and renumber the remaining rows from 0.
y = y.drop(y.index[[0]]).reset_index(drop=True)
def pick_mean_colors_two(this_data2, tolerance=0.01, above='lightcoral', within='black', below='purple'):
    """Return one colour per value of ``this_data2.Stroke`` relative to its mean.

    Args:
        this_data2: Object exposing a ``Stroke`` attribute that is iterable
            and has a ``mean()`` method (e.g. a pandas DataFrame column).
        tolerance: Fractional band around the mean that still counts as
            "at the mean". The default of 0.01 is a 1% band.
        above: Colour for values greater than ``mean * (1 + tolerance)``.
        within: Colour for values inside the band.
        below: Colour for values smaller than ``mean * (1 - tolerance)``.

    Returns:
        A list of colour names, in the same order as ``this_data2.Stroke``.
    """
    avg = this_data2.Stroke.mean()
    # Compute the band edges once instead of on every iteration.
    upper = avg * (1 + tolerance)
    lower = avg * (1 - tolerance)

    colors = []
    for each in this_data2.Stroke:
        if each > upper:
            colors.append(above)
        elif each < lower:
            colors.append(below)
        else:
            colors.append(within)
    return colors
import matplotlib.patches as mpatches
# Row-label range of the per-work-type table to plot (.loc slices include both ends).
bottom2 = 0
top2 = 5
d2 = y.loc[bottom2:top2]
d2 = d2.sort_values('Stroke', ascending=True)
d2.reset_index(inplace=True, drop=True)
my_colors2 = pick_mean_colors_two(d2)
# Computed once; reused for the reference line and its text label.
mean_stroke2 = d2.Stroke.mean()

# Legend handles matching the colours returned by pick_mean_colors_two.
Above = mpatches.Patch(color='lightcoral', label='Above Average')
At = mpatches.Patch(color='black', label='Within 1% of the Average')
Below = mpatches.Patch(color='purple', label='Below Average')

fig = plt.figure(figsize=(18, 12))
ax1 = fig.add_subplot(1, 1, 1)
ax1.barh(d2.Work_Type, d2.Stroke, color=my_colors2)

# Leave 10% headroom to the right of the longest bar for the value labels.
# This limit does not depend on the row, so it is set once, outside the loop.
ax1.set_xlim(0, d2.Stroke.max() * 1.1)

# Write each bar's value just past its end, in the same colour as the bar.
for row_counter, (value_at_row_counter, color) in enumerate(zip(d2.Stroke, my_colors2)):
    ax1.text(value_at_row_counter + 2, row_counter, str(value_at_row_counter),
             color=color, size=14, fontweight='bold')

ax1.legend(loc='lower right', handles=[Above, At, Below], fontsize=18)

# Dashed reference line at the mean, with its value rounded to 2 decimals.
ax1.axvline(mean_stroke2, color='black', linestyle='dashed')
ax1.text(mean_stroke2 + 5, 0, 'Mean = ' + str(round(mean_stroke2, 2)), rotation=0, fontsize=16)

ax1.set_title('Average Number of Stroke Patients Based on Work Environment', size=20, fontweight='bold', pad=20)
ax1.set_xlabel('Stroke Count', fontsize=18, labelpad=20)
ax1.set_ylabel('Work Type', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
The pie chart displays the percent of patients who have heart disease based on the work environment. The total number of heart disease patients is 276 and over 50% of those patients identify their careers to be within the private sector industry.
# Bare expression: presumably left over from a notebook cell, where it would
# echo the frame; it has no effect when run as a script.
df

# Heart-disease totals per work type.
pie_df = df.groupby(['work_type'])['heart_disease'].sum().reset_index(name='Total_HeartDisease')
# Positional drop of the second group.
# NOTE(review): presumably the 'Never_worked' row (groups sort alphabetically) - confirm against the data.
pie_df = pie_df.drop(pie_df.index[[1]])
pie_df['work_type'] = pie_df['work_type'].replace(['Govt_job', 'children', 'Self-employed', 'Private'],
                                                  ['Government Job', 'Child/NA', 'Self-Employed',
                                                   'Private Sector Employee'])
pie_df.columns = ['Work_Type', 'Total_HeartDisease']
pie_df.reset_index(inplace=True, drop=True)

# Take the wedge labels and sizes from the aggregated frame rather than from
# hand-typed literals, so the wedges, their labels and the centre total
# always describe the same data.
labels = pie_df.Work_Type.tolist()
sizes = pie_df.Total_HeartDisease.tolist()

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(1, 1, 1)
colormap = plt.get_cmap("tab20c")
total_heart_disease = pie_df.Total_HeartDisease.sum()

# autopct receives each wedge's percentage; convert it back to a patient
# count so every wedge shows both the share and the absolute number.
patches, texts, pcts = ax.pie(sizes, labels=labels, labeldistance=1.1, wedgeprops=dict(edgecolor='white'),
                              textprops={'fontsize': 16}, pctdistance=0.85,
                              autopct=lambda p: '{:.2f}%\n({:.0f})'.format(p, (p / 100) * total_heart_disease),
                              startangle=90)
plt.setp(pcts, color='white', fontweight='bold')

# White disc in the centre turns the pie into a donut that holds the total.
hole = plt.Circle((0, 0), 0.3, fc='white')
fig1 = plt.gcf()
fig1.gca().add_artist(hole)
ax.yaxis.set_visible(False)
plt.title('Patients with Heart Disease by Environment', fontsize=18, fontweight='bold', pad=20)
ax.text(0, 0, 'Total Heart Disease\n ' + str(total_heart_disease), size=18, ha='center', va='center')
ax.axis('equal')
plt.tight_layout()
plt.show()
This nested pie chart shows additional information about patients with heart disease. The outer ring displays the percent of heart disease patients who are men and who are women, and the nested inner ring breaks those patients down further by work environment: 59% of the heart disease patients are men, and 32.6% of all heart disease patients are men who work in the private sector industry.
Based on the nested pie chart, we can see that men account for a higher percent of heart disease patients than women, and that the largest share also belongs to those who work in private sector industries. The nested pie chart displays a multidimensional view of layered statistics.
# Heart-disease totals for every (gender, work type) combination.
pie_nest_df = df.groupby(['gender', 'work_type']).agg({'heart_disease': ['sum']}).reset_index()
# Positional drop of four groups.
# NOTE(review): these indices depend on the exact set of groups in the data - confirm which rows they remove.
pie_nest_df = pie_nest_df.drop(pie_nest_df.index[[1, 4, 6, 10]])
pie_nest_df['work_type'] = pie_nest_df['work_type'].replace(['Govt_job', 'Private', 'Self-employed', 'children'],['Government Job', 'Private Sector Employee', 'Self-Employed', 'Child/NA'])
pie_nest_df.columns = ['Gender', 'Work_Type', 'Total_HeartDisease']
pie_nest_df.reset_index(inplace=True, drop=True)

# Colour bookkeeping: the outer ring takes every 4th colormap index, and the
# inner ring takes the remaining indices in order.
number_outside_colors = len(pie_nest_df.Gender.unique())
outside_color_ref_number = np.arange(number_outside_colors) * 4
# One inner colour per wedge (i.e. per row). Counting distinct totals instead
# would allocate too few colours whenever two groups share the same count.
number_inside_colors = len(pie_nest_df)
all_color_ref_number = np.arange(number_outside_colors + number_inside_colors)
inside_color_ref_number = [each for each in all_color_ref_number
                           if each not in outside_color_ref_number]
print(outside_color_ref_number)
print(inside_color_ref_number)

fig = plt.figure(figsize=(14, 14))
ax = fig.add_subplot(1, 1, 1)
colormap = plt.get_cmap("tab20c")
outer_colors = colormap(outside_color_ref_number)
inner_colors = colormap(inside_color_ref_number)
all_hd = pie_nest_df.Total_HeartDisease.sum()

# Outer ring: share of heart-disease patients per gender, with the count.
pie_nest_df.groupby(['Gender'])['Total_HeartDisease'].sum().plot(
    kind='pie', ax=ax, radius=1, colors=outer_colors, pctdistance=0.6, labeldistance=0.8,
    wedgeprops=dict(edgecolor='white'),
    textprops=dict(color='white', fontsize=26, fontweight='bold', va='center'),
    autopct=lambda p: '{:.2f}%\n({:.0f})'.format(p, (p / 100) * all_hd), startangle=90)

# Inner ring: one wedge per (gender, work type) row. Every wedge except the
# last is pulled outwards; the list is sized from the frame so it always
# matches the number of wedges.
explode = [0.5] * (len(pie_nest_df) - 1) + [0]
pie_nest_df.Total_HeartDisease.plot(
    kind='pie', ax=ax, radius=0.7, colors=inner_colors,
    pctdistance=0.72, labeldistance=0.9, wedgeprops=dict(edgecolor='White'),
    textprops={'fontsize': 16}, labels=pie_nest_df.Work_Type, explode=explode,
    shadow=True,
    autopct='%1.2f%%', startangle=90)

# White disc in the centre holds the overall total.
hole = plt.Circle((0, 0), 0.3, fc='white')
fig1 = plt.gcf()
fig1.gca().add_artist(hole)
ax.yaxis.set_visible(False)
plt.title('Patients with Heart Disease:\n Based on Gender & Work Environment', fontsize=20,
          fontweight='bold')
ax.text(0, 0, 'Total Heart\n Diseased\n Patients\n ' + str(all_hd), size=20, ha='center', va='center')
ax.axis('equal')
plt.tight_layout()
plt.show()
The dual axis chart compares the total number of heart disease and stroke patients based on work environment. Based on the total count, heart disease exceeds the number of patients who had a stroke for each work environment.
Once again, we see that patients who work in private sector industries are recorded to have more heart disease (158) and stroke (149) cases than those in the other work environments.
It is also worth mentioning that children, while not employed, are still a part of the statistics, which show that very few patients who are minors have heart disease or a stroke (based on this data set).
# Stroke and heart-disease totals for every work type.
dual_df = df.groupby('work_type')[['stroke', 'heart_disease']].sum().reset_index()
dual_df.columns = ['WorkType', 'Stroke', 'Heart_Disease']

# Swap the raw category codes for display-friendly labels.
dual_df['WorkType'] = dual_df['WorkType'].replace({
    'Govt_job': 'Government Job',
    'Private': 'Private Sector Employee',
    'Self-employed': 'Self-Employed',
    'children': 'Child/NA',
})

# Copy of the table without its second row, renumbered from 0.
dual_df2 = dual_df.drop(dual_df.index[[1]]).reset_index(drop=True)
def barlabel(these_bars, this_ax, fontsize=16, color='black', offset=1.01):
    """Write each bar's height as a centred text label just above the bar.

    Args:
        these_bars: Iterable of bar objects exposing ``get_height()``,
            ``get_x()`` and ``get_width()``.
        this_ax: Axes-like object whose ``text()`` method draws the labels.
        fontsize: Font size passed to ``text()``.
        color: Text colour passed to ``text()``.
        offset: Multiplier applied to the bar height to get the label's
            y position. The default of 1.01 sits 1% above the bar top.
    """
    for each_bar in these_bars:
        height = each_bar.get_height()
        # Horizontal centre of the bar.
        x_center = each_bar.get_x() + each_bar.get_width() / 2
        this_ax.text(x_center, height * offset, format(height),
                     fontsize=fontsize, color=color, ha='center', va='bottom')
fig = plt.figure(figsize=(18, 10))
ax1 = fig.add_subplot(1, 1, 1)
# Second y-axis sharing the same x-axis; it carries the heart-disease bars.
ax2 = ax1.twinx()
bar_width = 0.2
# One x slot per work type, sized from the data rather than hard-coded, so
# the bars and tick labels stay aligned if the number of rows changes.
x_pos = np.arange(len(dual_df2))

# Side-by-side bars: stroke shifted half a bar left, heart disease half a bar right.
sum_stroke_bars = ax1.bar(x_pos - (0.5 * bar_width), dual_df2.Stroke, bar_width, color='yellow', edgecolor='black',
                          label='Sum of Stroke')
sum_hd_bars = ax2.bar(x_pos + (0.5 * bar_width), dual_df2.Heart_Disease, bar_width, color='purple', edgecolor='black',
                      label='Sum of Heart Disease')

ax1.set_xlabel('Work Environment', fontsize=18, labelpad=20)
ax1.set_ylabel('Total Stroke', fontsize=18, labelpad=20)
ax2.set_ylabel('Total Heart Disease', fontsize=18, rotation=270, labelpad=30)
ax1.tick_params(axis='y', labelsize=18)
ax2.tick_params(axis='y', labelsize=18)
plt.title('Total Stroke and Heart Disease Patients\n by Work Environment', fontsize=22, pad=30, fontweight='bold')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(dual_df2.WorkType, fontsize=18)

# Merge the legend entries of both axes into a single legend on ax1.
stroke_color, stroke_label = ax1.get_legend_handles_labels()
hd_color, hd_label = ax2.get_legend_handles_labels()
legend = ax1.legend(stroke_color + hd_color, stroke_label + hd_label, loc='upper left', frameon=True, ncol=1,
                    shadow=True, borderpad=1, fontsize=14)

# Extra headroom on the stroke axis (1.5x the tallest stroke bar).
ax1.set_ylim(0, dual_df2.Stroke.max() * 1.50)
barlabel(sum_stroke_bars, ax1)
barlabel(sum_hd_bars, ax2)
plt.show()
General takeaways from the data set:
Ages 0-2 include year & months in float/decimal format.