Week_1_Assignment-Data

2024-12-21

Load Libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(ggplot2)
library(dplyr)
library(stringr)
library(tools)
library(stringdist)

## Warning: package 'stringdist' was built under R version 4.3.3

## 
## Attaching package: 'stringdist'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

IIJA_funding_data <- read.csv("https://raw.githubusercontent.com/uzmabb182/Data_608/refs/heads/main/Week%201/IIJA%20FUNDING%20AS%20OF%20MARCH%202023.csv")

state_population_data <- read.csv("https://raw.githubusercontent.com/uzmabb182/Data_608/refs/heads/main/Week%201/State_Population_Data.csv")


election_results_2020_data <- read.csv("https://raw.githubusercontent.com/uzmabb182/Data_608/refs/heads/main/Week%201/2020_Presidential_Election_Results.csv")

View the First Few Rows of Data

#print(IIJA_funding_data)

# Check column names
colnames(IIJA_funding_data)

## [1] "State..Teritory.or.Tribal.Nation" "Total..Billions."

# Renaming the columns

IIJA_funding_data <- IIJA_funding_data %>%
  rename(State_Name = State..Teritory.or.Tribal.Nation, Funding_Billions = Total..Billions.)

#print(IIJA_funding_data)

# Convert column names to lowercase
colnames(IIJA_funding_data ) <- tolower(colnames(IIJA_funding_data ))

# View the modified data frame
#print(IIJA_funding_data )

# Sort by population (ascending order)
IIJA_funding_data <- IIJA_funding_data[order(IIJA_funding_data$state_name),]

# View the sorted dataframe
#print(IIJA_funding_data)

# Replace "Deleware" with "Delaware" conditionally
IIJA_funding_data$state_name[IIJA_funding_data$state_name == "Deleware"] <- "Delaware"

# View the updated DataFrame
#print(IIJA_funding_data)

# Add index column using mutate
IIJA_funding_data <- IIJA_funding_data %>% mutate(index = row_number())

# View the updated DataFrame
#print(IIJA_funding_data)

# Convert first letter of each word to uppercase using str_to_title
IIJA_funding_data$state_name <- str_to_title(IIJA_funding_data$state_name)

# View the updated DataFrame
#print(IIJA_funding_data)

# Convert column names to lowercase
colnames(state_population_data ) <- tolower(colnames(state_population_data ))

# View the modified data frame
#print(state_population_data)

# Sort by population (ascending order)
state_population_data <- state_population_data[order(state_population_data$name),]

# View the sorted dataframe
#print(state_population_data)

# Convert first letter of each word to uppercase using str_to_title
state_population_data$name <- str_to_title(state_population_data$name)

# View the updated DataFrame
#print(state_population_data)

# Convert column names to lowercase
colnames(election_results_2020_data ) <- tolower(colnames(election_results_2020_data ))

# View the modified data frame
#print(election_results_2020_data)

# Convert first letter of each word to uppercase using str_to_title
election_results_2020_data$state <- str_to_title(election_results_2020_data$state)

# View the updated DataFrame
#print(election_results_2020_data)

# Checking datatypes
#str(IIJA_funding_data)

#str(election_results_2020_data)

###str(state_population_data)

# Convert 'state' column from integer to character
state_population_data$name <- as.character(state_population_data$name)

###str(state_population_data)

# Merge on 'state_name' from df1 and 'state' from df2
result_df <- merge(IIJA_funding_data, state_population_data, by.x = "state_name", by.y = "name")

# View the merged dataframe
#print(result_df)

# Merge on 'state_name' from df1 and 'state' from df2
result_df <- merge(IIJA_funding_data, state_population_data, by.x = "state_name", by.y = "name")

# View the merged dataframe
#print(result_df)

# Merge on 'state_name' from df1 and 'state' from df2
merged_df <- merge(result_df, election_results_2020_data, by.x = "state_name", by.y = "state")

# View the merged dataframe
#print(merged_df)

# Select specific columns
new_df <- merged_df %>% select(state_name, funding_billions, popestimate2020, trump_win, biden_win)

# View the new data frame
#print(new_df)

Create Key Metrics for Analysis

Funding Per state percentage

# Add a new column
new_df <- new_df %>%
  mutate(funding_per_state_percentage = round((funding_billions / sum(funding_billions)) * 100, 2))

# Print the merged data frame
#print(new_df)

# Add a new column
new_df <- new_df %>%
  mutate(population_per_state_percentage = round((popestimate2020 / sum(popestimate2020)) * 100, 2))

# Print the merged data frame
#print(new_df)

# Add a new columns for testing criteria
new_df <- new_df %>%
  mutate(population_per_state_plus5percent = population_per_state_percentage + (population_per_state_percentage * 0.05))


new_df <- new_df %>%
  mutate(population_per_state_minus5percent = population_per_state_percentage - (population_per_state_percentage * 0.05))

new_df <- new_df %>%
  mutate(Equitable = ifelse(funding_per_state_percentage > population_per_state_minus5percent & 
                            funding_per_state_percentage < population_per_state_plus5percent, 
                            "Yes", "No"))
new_df <- new_df %>%
  mutate(bias = ifelse(biden_win == 1 & 
                       funding_per_state_percentage > population_per_state_plus5percent, 
                       "Yes", 
                       "No"))

# Print the merged data frame
head(new_df)

##   state_name funding_billions popestimate2020 trump_win biden_win
## 1    Alabama              3.0         5033094         1         0
## 2     Alaska              3.7          733017         1         0
## 3    Arizona              3.5         7187135         0         1
## 4   Arkansas              2.8         3014546         1         0
## 5 California             18.4        39521958         0         1
## 6   Colorado              3.2         5787129         0         1
##   funding_per_state_percentage population_per_state_percentage
## 1                         1.57                            1.52
## 2                         1.94                            0.22
## 3                         1.83                            2.17
## 4                         1.47                            0.91
## 5                         9.64                           11.96
## 6                         1.68                            1.75
##   population_per_state_plus5percent population_per_state_minus5percent
## 1                            1.5960                             1.4440
## 2                            0.2310                             0.2090
## 3                            2.2785                             2.0615
## 4                            0.9555                             0.8645
## 5                           12.5580                            11.3620
## 6                            1.8375                             1.6625
##   Equitable bias
## 1       Yes   No
## 2        No   No
## 3        No   No
## 4        No   No
## 5        No   No
## 6       Yes   No

Summary of Why These Fields Are Created

The following new fields are created to analyze whether funding allocation is equitable and to detect potential political bias in how resources are distributed.

population_per_state_plus5percent & population_per_state_minus5percent

Purpose:

These fields define an acceptable funding range (+/- 5%) around each state’s population percentage.

Why?

If funding is within ±5% of the population share, it is considered fair (equitable).

If funding falls outside this range, the state may be overfunded or underfunded.

population_per_state_plus5percent → Upper threshold (+5% of population share).

population_per_state_minus5percent → Lower threshold (-5% of population share).

Equitable (Is the funding fair?)

Purpose:

This field determines whether a state’s funding is proportional to its population based on the ±5% rule.

Why?

If funding falls within the ±5% range, the state is labeled “Yes” (Equitable).

If funding falls outside the range, it is labeled “No” (Inequitable).

This ensures we have an objective way to measure fairness in funding allocation.

bias (Is there political bias in funding?)

Purpose:

This field identifies political bias in funding allocation, specifically if Democratic (Biden-won) states received more than the fair share in funding.

Why?

A state is considered biased (“Yes”) if:

The state voted for Biden (biden_win == 1)

The state received funding more than equitable range

Otherwise, the state is labeled as “No” (No bias detected).

This helps detect whether Democratic states were treated more favorably compared to Republican states.

Is the Allocation Equitable Based on Population?

Funding Allocation vs. Population

This chart compares funding per state percentage to population per state percentage. If funding is fair, bars should align with the population.

ggplot(new_df, aes(x = reorder(state_name, population_per_state_percentage), 
                   y = funding_per_state_percentage, 
                   fill = Equitable)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("Yes" = "green", "No" = "orange")) +
  labs(title = "Funding Allocation vs. Population Percentage",
       x = "State",
       y = "Funding Percentage",
       fill = "Equitable") +
  theme_minimal()

Interpretation:

Green bars (“Yes”) → States where funding aligns with population.

Yellow bars (“No”) → States overfunded or underfunded, indicating inequity.

If many states are yellow, the allocation is not equitable.

Funding vs. Population Percentage

If funding is fair, points should align in a linear trend.

ggplot(new_df, aes(x = population_per_state_percentage, 
                   y = funding_per_state_percentage, 
                   color = Equitable)) +
  geom_point(size = 4, alpha = 0.7) +
  geom_smooth(method = "lm", color = "black", linetype = "dashed") +
  scale_color_manual(values = c("Yes" = "green", "No" = "orange")) +
  labs(title = "Funding Allocation vs. Population Percentage",
       x = "Population Percentage",
       y = "Funding Percentage",
       color = "Equitable") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Interpretation:

A strong trend line suggests fair allocation.

Scattered points with many “No” (yellow) indicate funding was not proportional.

Is the allocation equitable based on the population of each of the States and Territories, or is bias apparent?

According to the chart below, about 80% of the states have inequitable allocation.

Does the allocation favor the political interests of the Biden administration?

No, it doesn’t serve the political interests of the Biden administration

ggplot(new_df, aes(x = reorder(state_name, population_per_state_percentage), 
                   y = funding_per_state_percentage, 
                   fill = bias)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("Yes" = "orange", "No" = "green")) +
  labs(title = "Funding Allocation vs. Population Percentage",
       x = "State",
       y = "Funding Percentage",
       fill = "bias") +
  theme_minimal()

# Define the file path with filename and extension
file_path <- "C:/Users/Uzma/Downloads/new_df.csv"

# Write dataframe to CSV
write.csv(new_df, file = file_path, row.names = FALSE)

# Confirm that the file was saved
print("File saved successfully!")

## [1] "File saved successfully!"

# Select specific columns
group_df <- new_df %>% select(state_name, funding_billions, popestimate2020, trump_win, biden_win)

# View the new data frame
###print(group_df)

# Calculate total funding and population
total_funding <- sum(group_df$funding_billions, na.rm = TRUE)
total_population <- sum(group_df$popestimate2020, na.rm = TRUE)

# Create a new table with grouped states and percentages
grouped_table <- group_df %>%
  group_by(trump_win, biden_win) %>%
  summarize(
    trump_funding_percentage = round(sum(ifelse(trump_win == 1, funding_billions, 0), na.rm = TRUE) / total_funding * 100, 2),
    biden_funding_percentage = round(sum(ifelse(biden_win == 1, funding_billions, 0), na.rm = TRUE) / total_funding * 100, 2),
    trump_population_percentage = round(sum(ifelse(trump_win == 1, popestimate2020, 0), na.rm = TRUE) / total_population * 100, 2),
    biden_population_percentage = round(sum(ifelse(biden_win == 1, popestimate2020, 0), na.rm = TRUE) / total_population * 100, 2)
  ) %>%
  ungroup()

## `summarise()` has grouped output by 'trump_win'. You can override using the
## `.groups` argument.

# Print the new grouped table with rounded percentages
print(grouped_table)

## # A tibble: 2 × 6
##   trump_win biden_win trump_funding_percentage biden_funding_percentage
##       <int>     <int>                    <dbl>                    <dbl>
## 1         0         1                      0                       53.6
## 2         1         0                     46.4                      0  
## # ℹ 2 more variables: trump_population_percentage <dbl>,
## #   biden_population_percentage <dbl>

# Define the file path with filename and extension
file_path <- "C:/Users/Uzma/Downloads/grouped_table.csv"

# Write dataframe to CSV
write.csv(grouped_table, file = file_path, row.names = FALSE)

# Confirm that the file was saved
print("File saved successfully!")

## [1] "File saved successfully!"

Does the allocation favor the political interests of the Biden administration?

No, it does not favor the the political interests of the Biden administration.

Comparing Funding vs. Population Distribution

Bar Chart

# Convert data to long format for easy visualization
grouped_long <- grouped_table %>%
  pivot_longer(cols = c(trump_funding_percentage, biden_funding_percentage, 
                        trump_population_percentage, biden_population_percentage), 
               names_to = "Category", 
               values_to = "Percentage")

# Create labels for clarity
grouped_long$Group <- ifelse(grepl("trump", grouped_long$Category), "Trump-Won States", "Biden-Won States")
grouped_long$Metric <- ifelse(grepl("funding", grouped_long$Category), "Funding Allocation", "Population Percentage")

# Create the bar chart
ggplot(grouped_long, aes(x = Group, y = Percentage, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("Funding Allocation" = "green", "Population Percentage" = "orange")) +
  labs(title = "Funding Allocation vs. Population Share by Political Affiliation",
       x = "Political Affiliation (2020 Election)",
       y = "Percentage of Total",
       fill = "Category") +
  theme_minimal()

Analysis of the Bar Chart

If funding allocation closely matches population share, then the distribution is likely fair.

If Biden states receive significantly more funding than their population share, it suggests possible bias in allocation.

If Trump states receive less funding despite a larger population share, it may indicate under funding relative to need.

Funding vs. Population for Biden vs. Trump States

This scatter plot shows whether Biden states received disproportionately more funding.

Scatter Plot

ggplot(grouped_table, aes(x = trump_population_percentage, y = trump_funding_percentage, color = "Trump States")) +
  geom_point(size = 4) +
  geom_smooth(method = "lm", linetype = "dashed", color = "red") +
  geom_point(aes(x = biden_population_percentage, y = biden_funding_percentage, color = "Biden States"), size = 4) +
  geom_smooth(aes(x = biden_population_percentage, y = biden_funding_percentage), 
              method = "lm", linetype = "dashed", color = "blue") +
  scale_color_manual(values = c("Trump States" = "red", "Biden States" = "blue")) +
  labs(title = "Funding Allocation vs. Population Share for Trump and Biden States",
       x = "Population Share (%)",
       y = "Funding Share (%)",
       color = "Political Group") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning in qt((1 - level)/2, df): NaNs produced

## `geom_smooth()` using formula = 'y ~ x'

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

Storyboard (Sequential List of Phrases)

Introduction – The Funding Question: The U.S. government allocated billions in infrastructure funding—was it distributed fairly across states?

Data Collection & Analysis: We examined funding allocation, state populations, and 2020 election results to assess fairness.

Key Finding – Inequitable Distribution: About 80% of states received funding disproportionate to their population size.

Bias Investigation: Did Biden-won states receive more funding than Trump-won states?

Final Conclusion: The analysis shows no strong bias toward Biden states; inequities exist, but they don’t follow a clear political pattern.

3-Minute Story (One Paragraph)

When the U.S. government passed the Infrastructure Investment and Jobs Act (IIJA), it promised to distribute funding fairly to support all states. However, was this promise kept? By analyzing funding allocations alongside state populations and election results, we discovered that about 80% of states received either more or less than their fair share based on population. To investigate potential bias, we checked whether Biden-won states received disproportionately higher funding. The results? While the distribution was inequitable, there was no clear pattern favoring Democratic states—both Republican and Democratic states experienced funding disparities. This suggests that funding decisions may be influenced by other factors beyond politics, such as infrastructure needs or economic priorities.

Big Idea Summary (One Sentence)

The federal infrastructure funding distribution was inequitable, but no clear political bias was found in favor of Biden-won states.