import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import kaleido
import plotly
# Load the population-change table (NST-EST2022-POPCHG2020_2022) from a pinned gist revision.
url = 'https://gist.githubusercontent.com/mattkram/d3880a3a23ca36ccf10f22c1f49adb29/raw/f4602d2b9a17eb0d17355897264f4bad80c5528f/NST-EST2022-POPCHG2020_2022.csv'
df = pd.read_csv(url)

# Quick sanity check before renaming: preview the first 10 rows and list the
# raw column names so a schema change in the CSV is noticed immediately.
print(df.head(10))
print(df.columns)

# Short aliases used by the plotting code further down.
# Only the 2020 change columns are aliased; the 2021/2022 ones keep their raw names.
column_aliases = {
    'NAME': 'State',
    'NPOPCHG_2020': 'PopChange',
    'PPOPCHG_2020': 'PopChangePerc',
}
df.rename(columns=column_aliases, inplace=True)
SUMLEV REGION DIVISION STATE NAME ESTIMATESBASE2020 \
0 10 0 0 0 United States 331449520
1 20 1 0 0 Northeast Region 57609156
2 30 1 1 0 New England 15116206
3 30 1 2 0 Middle Atlantic 42492950
4 20 2 0 0 Midwest Region 68985537
5 30 2 3 0 East North Central 47368637
6 30 2 4 0 West North Central 21616900
7 20 3 0 0 South Region 126266262
8 30 3 5 0 South Atlantic 66089861
9 30 3 6 0 East South Central 19402323
POPESTIMATE2020 POPESTIMATE2021 POPESTIMATE2022 NPOPCHG_2020 ... \
0 331511512 332031554 333287557 61992 ...
1 57448898 57259257 57040406 -160258 ...
2 15074473 15121745 15129548 -41733 ...
3 42374425 42137512 41910858 -118525 ...
4 68961043 68836505 68787595 -24494 ...
5 47338744 47181948 47097779 -29893 ...
6 21622299 21654557 21689816 5399 ...
7 126450613 127346029 128716192 184351 ...
8 66164801 66666348 67452940 74940 ...
9 19422567 19474372 19578002 20244 ...
NRANK_ESTBASE2020 NRANK_POPEST2020 NRANK_POPEST2021 NRANK_POPEST2022 \
0 X X X X
1 4 4 4 4
2 9 9 9 9
3 4 4 4 4
4 3 3 3 3
5 3 3 3 3
6 7 7 7 7
7 1 1 1 1
8 1 1 1 1
9 8 8 8 8
NRANK_NPCHG2020 NRANK_NPCHG2021 NRANK_NPCHG2022 NRANK_PPCHG2020 \
0 X X X X
1 4 4 4 4
2 8 5 6 8
3 9 8 9 9
4 3 3 3 3
5 7 7 7 7
6 5 6 5 5
7 1 1 1 1
8 3 1 1 3
9 4 4 4 4
NRANK_PPCHG2021 NRANK_PPCHG2022
0 X X
1 4 4
2 4 6
3 8 9
4 3 3
5 7 8
6 6 5
7 1 1
8 3 1
9 5 4
[10 rows x 25 columns]
Index(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'NAME', 'ESTIMATESBASE2020',
'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'NPOPCHG_2020',
'NPOPCHG_2021', 'NPOPCHG_2022', 'PPOPCHG_2020', 'PPOPCHG_2021',
'PPOPCHG_2022', 'NRANK_ESTBASE2020', 'NRANK_POPEST2020',
'NRANK_POPEST2021', 'NRANK_POPEST2022', 'NRANK_NPCHG2020',
'NRANK_NPCHG2021', 'NRANK_NPCHG2022', 'NRANK_PPCHG2020',
'NRANK_PPCHG2021', 'NRANK_PPCHG2022'],
dtype='object')
# Include only the specified states for barcharts
# (50 states plus the District of Columbia; excludes national/regional aggregate rows)
include_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
df_barchart = df[df['State'].isin(include_states)]

# Define colors based on population change for barcharts:
# blue for growth, red for loss (or no change).
colors_barchart = ['blue' if x > 0 else 'red' for x in df_barchart['PopChange']]

# Barchart: Population Change by State
plt.figure(figsize=(14, 8))
bars = plt.bar(df_barchart['State'], df_barchart['PopChange'], color=colors_barchart)
plt.xticks(rotation=90)
# 'PopChange' is the renamed NPOPCHG_2020 column, i.e. POPESTIMATE2020 minus
# ESTIMATESBASE2020 -- it does not span 2020-2022, so the title says what is
# actually plotted.
plt.title('Population Change by State (2020 Estimates Base to 2020 Estimate)', fontsize=16)
plt.xlabel('State', fontsize=14)
plt.ylabel('Population Change', fontsize=14)
plt.grid(True)

# Add data labels. Labels sit above positive bars and below negative bars so
# they never overlap the bar itself; they are rotated because 51 horizontal
# labels would collide with each other.
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        f'{yval:.0f}',
        ha='center',
        va='bottom' if yval >= 0 else 'top',
        rotation=90,
        fontsize=10,
    )

plt.tight_layout()
plt.savefig('population_change_by_state_colored.png')
plt.show()
png
# Include only the specified regions for scatterplots
# (the four census regions and their nine divisions)
include_regions = [
    'Northeast Region', 'New England', 'Middle Atlantic', 'Midwest Region',
    'East North Central', 'West North Central', 'South Region',
    'South Atlantic', 'East South Central', 'West South Central',
    'West Region', 'Mountain', 'Pacific',
]

# Scale the plotted quantities so they match the units named on the axes.
# The raw columns are person counts (tens of millions for the estimate), so
# the previous "in billions" / "in thousands" labels were wrong for the
# unscaled data. `assign` returns a new frame, leaving `df` untouched.
df_scatterplot = df[df['State'].isin(include_regions)].assign(
    PopEstimate2022Millions=lambda d: d['POPESTIMATE2022'] / 1e6,
    PopChangeThousands=lambda d: d['PopChange'] / 1e3,
)

# Scatterplot: Population Change vs. Population Estimate 2022
plt.figure(figsize=(14, 8))
sns.scatterplot(
    x='PopEstimate2022Millions',
    y='PopChangeThousands',
    hue='REGION',
    data=df_scatterplot,
    s=100,
    palette="viridis",
)
plt.title('Population Change vs. Population Estimate 2022', fontsize=16)
plt.xlabel('Population Estimate 2022 (in millions)', fontsize=14)
plt.ylabel('Population Change (in thousands)', fontsize=14)
plt.grid(True)
# Re-creating the legend drops seaborn's automatic title, so set it explicitly.
plt.legend(loc='upper left', title='REGION')

# Add data labels: annotate every point with its region/division name.
for x_pos, y_pos, area_name in zip(
    df_scatterplot['PopEstimate2022Millions'],
    df_scatterplot['PopChangeThousands'],
    df_scatterplot['State'],
):
    plt.text(x_pos, y_pos, area_name, fontsize=9)

plt.tight_layout()
plt.savefig('population_change_vs_population_estimate_2022.png')
plt.show()
png
# Multiple Line Plots: Population Estimates over Years for Regions
# Only the regional/divisional rows are drawn. Iterating over every name in
# df['State'] would also plot the national total and every individual state,
# which contradicts the chart title and produces an unreadable legend.
years = ['2020', '2021', '2022']
estimate_columns = ['POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022']
df_regions = df[df['State'].isin(include_regions)]

plt.figure(figsize=(16, 10))
for region in df_regions['State'].unique():
    region_data = df_regions[df_regions['State'] == region]
    # First matching row holds the three yearly estimates for this region.
    plt.plot(years, region_data[estimate_columns].values[0], label=region)

plt.title('Population Estimates over Years for Regions')
plt.xlabel('Year')
plt.ylabel('Population Estimate')
plt.legend()
plt.grid(True)
plt.tight_layout(pad=3.0)
plt.savefig('population_estimates_over_years.png')
plt.show()
png
# Define the states to include
# (50 states plus the District of Columbia)
states_to_include = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Filter the DataFrame to include only the specified states
df_filtered = df[df['State'].isin(states_to_include)].copy()

# Year-over-year change derived from consecutive population estimates.
df_filtered['PopChange_2021'] = df_filtered['POPESTIMATE2021'] - df_filtered['POPESTIMATE2020']
df_filtered['PopChange_2022'] = df_filtered['POPESTIMATE2022'] - df_filtered['POPESTIMATE2021']

# Index by state so the bar labels come from the index. Building the frame via
# set_index (which returns a new object) avoids mutating a column slice in place.
df_stacked = df_filtered.set_index('State')[['PopChange_2021', 'PopChange_2022']]

# DataFrame.plot opens its own figure unless it is handed an Axes; a preceding
# bare plt.figure(figsize=...) therefore stays empty and the size is ignored.
# Create the Axes explicitly and draw into it.
fig, ax = plt.subplots(figsize=(14, 8))
df_stacked.plot(kind='bar', stacked=True, ax=ax)
plt.title('Population Change by State (2021-2022)')
plt.xlabel('State')
plt.ylabel('Population Change')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('population_change_by_state.png')
plt.show()
<Figure size 1400x800 with 0 Axes>
png
# Define the regions to include
regions_to_include = [
    'Northeast Region', 'New England', 'Middle Atlantic', 'Midwest Region',
    'East North Central', 'West North Central', 'South Region',
    'South Atlantic', 'East South Central', 'West South Central',
    'West Region', 'Mountain', 'Pacific',
]

# A pie chart needs mutually exclusive slices. The list above mixes regions
# with the divisions that make them up (e.g. Northeast Region = New England +
# Middle Atlantic), so plotting all of them counts everyone twice and halves
# every percentage. Keep only the top-level "... Region" rows.
top_level_regions = [name for name in regions_to_include if name.endswith('Region')]

# Filter the DataFrame to include only the top-level regions
df_filtered = df[df['State'].isin(top_level_regions)].copy()
labels = df_filtered['State']
sizes = df_filtered['POPESTIMATE2022']
# One pastel colour per slice.
colors = sns.color_palette('pastel')[0:len(labels)]

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', startangle=90)

# Draw circle for donut chart: a white disc over the centre of the pie.
centre_circle = plt.Circle((0, 0), 0.70, color='black', fc='white', linewidth=1.25)
ax1.add_artist(centre_circle)

# Equal aspect ratio keeps the pie circular.
ax1.axis('equal')
plt.title('Population Distribution by Region (2022)')
plt.tight_layout()
plt.savefig('population_distribution_donut_chart.png')
plt.show()
png
import plotly.graph_objects as go
import plotly.io as pio

# Waterfall of the national population change across the three periods named
# on the x axis. The steps are derived from the United States row instead of
# being hard-coded: the previous literals (61992, -160258, -41733) were the
# 2020 change of three different rows (United States, Northeast Region,
# New England), not three successive periods of one area.
us_row = df.loc[df['State'] == 'United States'].iloc[0]
milestone_columns = ['ESTIMATESBASE2020', 'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022']
population_levels = [int(us_row[column]) for column in milestone_columns]
# Difference between each consecutive pair of milestones.
period_changes = [
    later - earlier
    for earlier, later in zip(population_levels, population_levels[1:])
]

fig = go.Figure(go.Waterfall(
    name="Population Change",
    orientation="v",
    measure=["relative"] * len(period_changes),
    x=["Base to Estimate (2020)", "Estimate (2020) to Estimate (2021)", "Estimate (2021) to Estimate (2022)"],
    textposition="outside",
    # Signed labels, e.g. "+61992".
    text=[f"{change:+d}" for change in period_changes],
    y=period_changes,
))
fig.update_layout(
    title="Waterfall Diagram: Population Change from Base to Estimate",
    showlegend=True
)

# Convert the figure to an image
img_bytes = pio.to_image(fig, format='png')

# Save the image to a file
with open("waterfall_population_change.png", "wb") as f:
    f.write(img_bytes)

fig.show()