import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import kaleido
import plotly
# Load the population-change CSV from a pinned gist revision.
url = 'https://gist.githubusercontent.com/mattkram/d3880a3a23ca36ccf10f22c1f49adb29/raw/f4602d2b9a17eb0d17355897264f4bad80c5528f/NST-EST2022-POPCHG2020_2022.csv'
df = pd.read_csv(url)

# Preview the first 10 rows, then list the column names so the keys used in
# the rename below can be checked against what the file actually contains.
print(df.head(10))
print(df.columns)

# Shorter names for the columns the charts refer to most often.
# NOTE(review): 'PopChange' is NPOPCHG_2020 only, not a 2020-2022 total.
friendly_names = {
    'NAME': 'State',
    'NPOPCHG_2020': 'PopChange',
    'PPOPCHG_2020': 'PopChangePerc',
}
df = df.rename(columns=friendly_names)
   SUMLEV REGION DIVISION  STATE                NAME  ESTIMATESBASE2020  \
0      10      0        0      0       United States          331449520   
1      20      1        0      0    Northeast Region           57609156   
2      30      1        1      0         New England           15116206   
3      30      1        2      0     Middle Atlantic           42492950   
4      20      2        0      0      Midwest Region           68985537   
5      30      2        3      0  East North Central           47368637   
6      30      2        4      0  West North Central           21616900   
7      20      3        0      0        South Region          126266262   
8      30      3        5      0      South Atlantic           66089861   
9      30      3        6      0  East South Central           19402323   

   POPESTIMATE2020  POPESTIMATE2021  POPESTIMATE2022  NPOPCHG_2020  ...  \
0        331511512        332031554        333287557         61992  ...   
1         57448898         57259257         57040406       -160258  ...   
2         15074473         15121745         15129548        -41733  ...   
3         42374425         42137512         41910858       -118525  ...   
4         68961043         68836505         68787595        -24494  ...   
5         47338744         47181948         47097779        -29893  ...   
6         21622299         21654557         21689816          5399  ...   
7        126450613        127346029        128716192        184351  ...   
8         66164801         66666348         67452940         74940  ...   
9         19422567         19474372         19578002         20244  ...   

   NRANK_ESTBASE2020  NRANK_POPEST2020  NRANK_POPEST2021  NRANK_POPEST2022  \
0                  X                 X                 X                 X   
1                  4                 4                 4                 4   
2                  9                 9                 9                 9   
3                  4                 4                 4                 4   
4                  3                 3                 3                 3   
5                  3                 3                 3                 3   
6                  7                 7                 7                 7   
7                  1                 1                 1                 1   
8                  1                 1                 1                 1   
9                  8                 8                 8                 8   

   NRANK_NPCHG2020 NRANK_NPCHG2021 NRANK_NPCHG2022 NRANK_PPCHG2020  \
0                X               X               X               X   
1                4               4               4               4   
2                8               5               6               8   
3                9               8               9               9   
4                3               3               3               3   
5                7               7               7               7   
6                5               6               5               5   
7                1               1               1               1   
8                3               1               1               3   
9                4               4               4               4   

  NRANK_PPCHG2021 NRANK_PPCHG2022  
0               X               X  
1               4               4  
2               4               6  
3               8               9  
4               3               3  
5               7               8  
6               6               5  
7               1               1  
8               3               1  
9               5               4  

[10 rows x 25 columns]
Index(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'NAME', 'ESTIMATESBASE2020',
       'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'NPOPCHG_2020',
       'NPOPCHG_2021', 'NPOPCHG_2022', 'PPOPCHG_2020', 'PPOPCHG_2021',
       'PPOPCHG_2022', 'NRANK_ESTBASE2020', 'NRANK_POPEST2020',
       'NRANK_POPEST2021', 'NRANK_POPEST2022', 'NRANK_NPCHG2020',
       'NRANK_NPCHG2021', 'NRANK_NPCHG2022', 'NRANK_PPCHG2020',
       'NRANK_PPCHG2021', 'NRANK_PPCHG2022'],
      dtype='object')
# Bar chart of 'PopChange' for the 50 states plus the District of Columbia.
# Rows for the nation, regions, divisions and anything else not listed here
# are excluded.
# NOTE(review): 'PopChange' is the renamed NPOPCHG_2020 column, while the
# title below says "(2020-2022)" -- confirm which period is intended.
include_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
                  'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
                  'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
                  'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
                  'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                  'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
                  'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
                  'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
                  'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
df_barchart = df[df['State'].isin(include_states)]

# Blue for growth, red for zero or negative change.
colors_barchart = ['blue' if x > 0 else 'red' for x in df_barchart['PopChange']]

# Barchart: Population Change by State
plt.figure(figsize=(14, 8))
bars = plt.bar(df_barchart['State'], df_barchart['PopChange'], color=colors_barchart)
plt.xticks(rotation=90)
plt.title('Population Change by State (2020-2022)', fontsize=16)
plt.xlabel('State', fontsize=14)
plt.ylabel('Population Change', fontsize=14)
plt.grid(True)

# Add data labels. A bar's height is negative when the change is negative,
# so the label is anchored below the bar end in that case; otherwise it
# would be drawn back up into the bar itself.
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        round(yval, 2),
        ha='center',
        va='bottom' if yval >= 0 else 'top',
        fontsize=10,
    )

plt.tight_layout()
plt.savefig('population_change_by_state_colored.png')
plt.show()

png
png
# Scatterplot of 'PopChange' against the 2022 population estimate for the
# region and division rows, one labelled point per row.
include_regions = ['Northeast Region', 'New England', 'Middle Atlantic', 'Midwest Region',
                   'East North Central', 'West North Central', 'South Region',
                   'South Atlantic', 'East South Central', 'West South Central',
                   'West Region', 'Mountain', 'Pacific']
# .copy() so the helper columns added below do not write into a view of df.
df_scatterplot = df[df['State'].isin(include_regions)].copy()

# The source columns hold raw head counts. Rescale them so the plotted
# numbers actually match the units stated in the axis labels.
df_scatterplot['PopEstimate2022_M'] = df_scatterplot['POPESTIMATE2022'] / 1e6
df_scatterplot['PopChange_K'] = df_scatterplot['PopChange'] / 1e3

# Scatterplot: Population Change vs. Population Estimate 2022
plt.figure(figsize=(14, 8))
sns.scatterplot(x='PopEstimate2022_M', y='PopChange_K', hue='REGION',
                data=df_scatterplot, s=100, palette="viridis")
plt.title('Population Change vs. Population Estimate 2022', fontsize=16)
plt.xlabel('Population Estimate 2022 (in millions)', fontsize=14)
plt.ylabel('Population Change (in thousands)', fontsize=14)
plt.grid(True)
plt.legend(loc='upper left')

# Add data labels: annotate each point with its region/division name, using
# the same rescaled coordinates as the points themselves.
for x_val, y_val, name in zip(df_scatterplot['PopEstimate2022_M'],
                              df_scatterplot['PopChange_K'],
                              df_scatterplot['State']):
    plt.text(x_val, y_val, name, fontsize=9)

plt.tight_layout()
plt.savefig('population_change_vs_population_estimate_2022.png')
plt.show()
png
png
# Multiple Line Plots: Population Estimates over Years for Regions
#
# Only the region and division rows are drawn. Looping over every name in
# the file would also plot the national total and each individual state,
# which contradicts the title and buries the chart under the legend.
regions_for_lines = ['Northeast Region', 'New England', 'Middle Atlantic', 'Midwest Region',
                     'East North Central', 'West North Central', 'South Region',
                     'South Atlantic', 'East South Central', 'West South Central',
                     'West Region', 'Mountain', 'Pacific']
year_labels = ['2020', '2021', '2022']
estimate_columns = ['POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022']

# Keep the first row per name so each region contributes exactly one line.
df_lines = df[df['State'].isin(regions_for_lines)].drop_duplicates(subset='State')

plt.figure(figsize=(16, 10))
for name, estimates in zip(df_lines['State'], df_lines[estimate_columns].to_numpy()):
    plt.plot(year_labels, estimates, label=name)

plt.title('Population Estimates over Years for Regions')
plt.xlabel('Year')
plt.ylabel('Population Estimate')
plt.legend()
plt.grid(True)
plt.tight_layout(pad=3.0)
plt.savefig('population_estimates_over_years.png')
plt.show()
png
png
# Stacked bar chart of the year-over-year population change per state.
# Define the states to include
states_to_include = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
]

# Filter the DataFrame to include only the specified states. The .copy()
# lets the derived columns below be added without touching df.
df_filtered = df[df['State'].isin(states_to_include)].copy()

# Change between consecutive yearly estimates.
df_filtered['PopChange_2021'] = df_filtered['POPESTIMATE2021'] - df_filtered['POPESTIMATE2020']
df_filtered['PopChange_2022'] = df_filtered['POPESTIMATE2022'] - df_filtered['POPESTIMATE2021']

# Index by state so the names become the x tick labels. set_index returns a
# new frame here; mutating a column slice in place triggers pandas'
# SettingWithCopyWarning.
df_stacked = df_filtered.set_index('State')[['PopChange_2021', 'PopChange_2022']]

# DataFrame.plot opens its own figure unless it is handed an Axes, so a
# bare plt.figure(figsize=...) beforehand only leaves an empty figure
# behind and the size never reaches the chart. Create the Axes explicitly
# and draw onto it.
fig, ax = plt.subplots(figsize=(14, 8))
df_stacked.plot(kind='bar', stacked=True, ax=ax)

plt.title('Population Change by State (2021-2022)')
plt.xlabel('State')
plt.ylabel('Population Change')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('population_change_by_state.png')
plt.show()
<Figure size 1400x800 with 0 Axes>
png
png
# Donut chart of each region's share of the 2022 population estimate.
#
# Define the regions to include. Only the four top-level regions are used:
# the divisions (New England, Middle Atlantic, ...) are subdivisions of
# those regions, so putting both levels in one pie counts everyone twice
# and halves every percentage.
regions_to_include = [
    'Northeast Region', 'Midwest Region', 'South Region', 'West Region'
]

# Filter the DataFrame to include only the specified regions
df_filtered = df[df['State'].isin(regions_to_include)].copy()

labels = df_filtered['State']
sizes = df_filtered['POPESTIMATE2022']
# One distinct colour per slice, however many rows matched.
colors = sns.color_palette('pastel', n_colors=len(labels))

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', startangle=90)

# Draw circle for donut chart: a white disc over the centre of the pie.
centre_circle = plt.Circle((0, 0), 0.70, color='black', fc='white', linewidth=1.25)
fig = plt.gcf()
ax1.add_artist(centre_circle)

# Equal aspect ratio keeps the pie circular.
ax1.axis('equal')
plt.title('Population Distribution by Region (2022)')
plt.tight_layout()
plt.savefig('population_distribution_donut_chart.png')
plt.show()
png
png
import plotly.graph_objects as go
import plotly.io as pio

# Waterfall of the national population change across three periods:
# estimates base -> 2020 estimate -> 2021 estimate -> 2022 estimate.
#
# The steps are computed from the 'United States' row rather than typed in
# by hand, so each bar is guaranteed to belong to the period named in its
# label and to the same geography.
us_rows = df[df['State'] == 'United States']
if us_rows.empty:
    raise ValueError("No 'United States' row found in the population data")
us_row = us_rows.iloc[0]

# (bar label, column at start of period, column at end of period)
waterfall_steps = [
    ("Base to Estimate (2020)", 'ESTIMATESBASE2020', 'POPESTIMATE2020'),
    ("Estimate (2020) to Estimate (2021)", 'POPESTIMATE2020', 'POPESTIMATE2021'),
    ("Estimate (2021) to Estimate (2022)", 'POPESTIMATE2021', 'POPESTIMATE2022'),
]
# Plain ints so the values format with a sign and serialise cleanly.
changes = [int(us_row[end]) - int(us_row[start]) for _, start, end in waterfall_steps]

fig = go.Figure(go.Waterfall(
    name="Population Change",
    orientation="v",
    measure=["relative"] * len(changes),
    x=[label for label, _, _ in waterfall_steps],
    textposition="outside",
    text=[f"{change:+d}" for change in changes],
    y=changes,
))

fig.update_layout(
    title="Waterfall Diagram: Population Change from Base to Estimate",
    showlegend=True
)

# Convert the figure to an image
img_bytes = pio.to_image(fig, format='png')

# Save the image to a file
with open("waterfall_population_change.png", "wb") as f:
    f.write(img_bytes)

fig.show()