library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(ggplot2)
library(dplyr)
library(stringr)
library(tools)
library(stringdist)
## Warning: package 'stringdist' was built under R version 4.3.3
##
## Attaching package: 'stringdist'
##
## The following object is masked from 'package:tidyr':
##
## extract
IIJA_funding_data <- read.csv("https://raw.githubusercontent.com/uzmabb182/Data_608/refs/heads/main/Week%201/IIJA%20FUNDING%20AS%20OF%20MARCH%202023.csv")
state_population_data <- read.csv("https://raw.githubusercontent.com/uzmabb182/Data_608/refs/heads/main/Week%201/State_Population_Data.csv")
election_results_2020_data <- read.csv("https://raw.githubusercontent.com/uzmabb182/Data_608/refs/heads/main/Week%201/2020_Presidential_Election_Results.csv")
#print(IIJA_funding_data)
# Check column names
colnames(IIJA_funding_data)
## [1] "State..Teritory.or.Tribal.Nation" "Total..Billions."
# Renaming the columns
IIJA_funding_data <- IIJA_funding_data %>%
rename(State_Name = State..Teritory.or.Tribal.Nation, Funding_Billions = Total..Billions.)
#print(IIJA_funding_data)
# Convert column names to lowercase
colnames(IIJA_funding_data ) <- tolower(colnames(IIJA_funding_data ))
# View the modified data frame
#print(IIJA_funding_data )
# Sort by population (ascending order)
IIJA_funding_data <- IIJA_funding_data[order(IIJA_funding_data$state_name),]
# View the sorted dataframe
#print(IIJA_funding_data)
# Replace "Deleware" with "Delaware" conditionally
IIJA_funding_data$state_name[IIJA_funding_data$state_name == "Deleware"] <- "Delaware"
# View the updated DataFrame
#print(IIJA_funding_data)
# Add index column using mutate
IIJA_funding_data <- IIJA_funding_data %>% mutate(index = row_number())
# View the updated DataFrame
#print(IIJA_funding_data)
# Convert first letter of each word to uppercase using str_to_title
IIJA_funding_data$state_name <- str_to_title(IIJA_funding_data$state_name)
# View the updated DataFrame
#print(IIJA_funding_data)
# Convert column names to lowercase
colnames(state_population_data ) <- tolower(colnames(state_population_data ))
# View the modified data frame
#print(state_population_data)
# Sort by population (ascending order)
state_population_data <- state_population_data[order(state_population_data$name),]
# View the sorted dataframe
#print(state_population_data)
# Convert first letter of each word to uppercase using str_to_title
state_population_data$name <- str_to_title(state_population_data$name)
# View the updated DataFrame
#print(state_population_data)
# Convert column names to lowercase
colnames(election_results_2020_data ) <- tolower(colnames(election_results_2020_data ))
# View the modified data frame
#print(election_results_2020_data)
# Convert first letter of each word to uppercase using str_to_title
election_results_2020_data$state <- str_to_title(election_results_2020_data$state)
# View the updated DataFrame
#print(election_results_2020_data)
# Checking datatypes
#str(IIJA_funding_data)
#str(election_results_2020_data)
###str(state_population_data)
# Convert 'state' column from integer to character
state_population_data$name <- as.character(state_population_data$name)
###str(state_population_data)
# Merge on 'state_name' from df1 and 'state' from df2
result_df <- merge(IIJA_funding_data, state_population_data, by.x = "state_name", by.y = "name")
# View the merged dataframe
#print(result_df)
# Merge on 'state_name' from df1 and 'state' from df2
result_df <- merge(IIJA_funding_data, state_population_data, by.x = "state_name", by.y = "name")
# View the merged dataframe
#print(result_df)
# Merge on 'state_name' from df1 and 'state' from df2
merged_df <- merge(result_df, election_results_2020_data, by.x = "state_name", by.y = "state")
# View the merged dataframe
#print(merged_df)
# Select specific columns
new_df <- merged_df %>% select(state_name, funding_billions, popestimate2020, trump_win, biden_win)
# View the new data frame
#print(new_df)
# Add a new column
new_df <- new_df %>%
mutate(funding_per_state_percentage = round((funding_billions / sum(funding_billions)) * 100, 2))
# Print the merged data frame
#print(new_df)
# Add a new column
new_df <- new_df %>%
mutate(population_per_state_percentage = round((popestimate2020 / sum(popestimate2020)) * 100, 2))
# Print the merged data frame
#print(new_df)
# Add a new columns for testing criteria
new_df <- new_df %>%
mutate(population_per_state_plus5percent = population_per_state_percentage + (population_per_state_percentage * 0.05))
new_df <- new_df %>%
mutate(population_per_state_minus5percent = population_per_state_percentage - (population_per_state_percentage * 0.05))
new_df <- new_df %>%
mutate(Equitable = ifelse(funding_per_state_percentage > population_per_state_minus5percent &
funding_per_state_percentage < population_per_state_plus5percent,
"Yes", "No"))
new_df <- new_df %>%
mutate(bias = ifelse(biden_win == 1 &
funding_per_state_percentage > population_per_state_plus5percent,
"Yes",
"No"))
# Print the merged data frame
head(new_df)
## state_name funding_billions popestimate2020 trump_win biden_win
## 1 Alabama 3.0 5033094 1 0
## 2 Alaska 3.7 733017 1 0
## 3 Arizona 3.5 7187135 0 1
## 4 Arkansas 2.8 3014546 1 0
## 5 California 18.4 39521958 0 1
## 6 Colorado 3.2 5787129 0 1
## funding_per_state_percentage population_per_state_percentage
## 1 1.57 1.52
## 2 1.94 0.22
## 3 1.83 2.17
## 4 1.47 0.91
## 5 9.64 11.96
## 6 1.68 1.75
## population_per_state_plus5percent population_per_state_minus5percent
## 1 1.5960 1.4440
## 2 0.2310 0.2090
## 3 2.2785 2.0615
## 4 0.9555 0.8645
## 5 12.5580 11.3620
## 6 1.8375 1.6625
## Equitable bias
## 1 Yes No
## 2 No No
## 3 No No
## 4 No No
## 5 No No
## 6 Yes No
The following new fields are created to analyze whether funding allocation is equitable and to detect potential political bias in how resources are distributed.
These fields define an acceptable funding range (+/- 5%) around each state’s population percentage.
If funding is within ±5% of the population share, it is considered fair (equitable).
If funding falls outside this range, the state may be overfunded or underfunded.
population_per_state_plus5percent → Upper threshold (+5% of population share).
population_per_state_minus5percent → Lower threshold (-5% of population share).
This field determines whether a state’s funding is proportional to its population based on the ±5% rule.
If funding falls within the ±5% range, the state is labeled “Yes” (Equitable).
If funding falls outside the range, it is labeled “No” (Inequitable).
This ensures we have an objective way to measure fairness in funding allocation.
This field identifies political bias in funding allocation, specifically if Democratic (Biden-won) states received more than the fair share in funding.
Why?
A state is considered biased (“Yes”) if:
The state voted for Biden (biden_win == 1)
The state received funding more than equitable range
Otherwise, the state is labeled as “No” (No bias detected).
This helps detect whether Democratic states were treated more favorably compared to Republican states.
This chart compares funding per state percentage to population per state percentage. If funding is fair, bars should align with the population.
ggplot(new_df, aes(x = reorder(state_name, population_per_state_percentage),
y = funding_per_state_percentage,
fill = Equitable)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("Yes" = "green", "No" = "orange")) +
labs(title = "Funding Allocation vs. Population Percentage",
x = "State",
y = "Funding Percentage",
fill = "Equitable") +
theme_minimal()
Interpretation:
Green bars (“Yes”) → States where funding aligns with population.
Yellow bars (“No”) → States overfunded or underfunded, indicating inequity.
If many states are yellow, the allocation is not equitable.
If funding is fair, points should align in a linear trend.
ggplot(new_df, aes(x = population_per_state_percentage,
y = funding_per_state_percentage,
color = Equitable)) +
geom_point(size = 4, alpha = 0.7) +
geom_smooth(method = "lm", color = "black", linetype = "dashed") +
scale_color_manual(values = c("Yes" = "green", "No" = "orange")) +
labs(title = "Funding Allocation vs. Population Percentage",
x = "Population Percentage",
y = "Funding Percentage",
color = "Equitable") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Interpretation:
A strong trend line suggests fair allocation.
Scattered points with many “No” (yellow) indicate funding was not proportional.
According to the chart below, about 80% of the states have inequitable allocation.
Does the allocation favor the political interests of the Biden administration?
No, it doesn’t serve the political interests of the Biden administration
ggplot(new_df, aes(x = reorder(state_name, population_per_state_percentage),
y = funding_per_state_percentage,
fill = bias)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("Yes" = "orange", "No" = "green")) +
labs(title = "Funding Allocation vs. Population Percentage",
x = "State",
y = "Funding Percentage",
fill = "bias") +
theme_minimal()
# Define the file path with filename and extension
file_path <- "C:/Users/Uzma/Downloads/new_df.csv"
# Write dataframe to CSV
write.csv(new_df, file = file_path, row.names = FALSE)
# Confirm that the file was saved
print("File saved successfully!")
## [1] "File saved successfully!"
# Select specific columns
group_df <- new_df %>% select(state_name, funding_billions, popestimate2020, trump_win, biden_win)
# View the new data frame
###print(group_df)
# Calculate total funding and population
total_funding <- sum(group_df$funding_billions, na.rm = TRUE)
total_population <- sum(group_df$popestimate2020, na.rm = TRUE)
# Create a new table with grouped states and percentages
grouped_table <- group_df %>%
group_by(trump_win, biden_win) %>%
summarize(
trump_funding_percentage = round(sum(ifelse(trump_win == 1, funding_billions, 0), na.rm = TRUE) / total_funding * 100, 2),
biden_funding_percentage = round(sum(ifelse(biden_win == 1, funding_billions, 0), na.rm = TRUE) / total_funding * 100, 2),
trump_population_percentage = round(sum(ifelse(trump_win == 1, popestimate2020, 0), na.rm = TRUE) / total_population * 100, 2),
biden_population_percentage = round(sum(ifelse(biden_win == 1, popestimate2020, 0), na.rm = TRUE) / total_population * 100, 2)
) %>%
ungroup()
## `summarise()` has grouped output by 'trump_win'. You can override using the
## `.groups` argument.
# Print the new grouped table with rounded percentages
print(grouped_table)
## # A tibble: 2 × 6
## trump_win biden_win trump_funding_percentage biden_funding_percentage
## <int> <int> <dbl> <dbl>
## 1 0 1 0 53.6
## 2 1 0 46.4 0
## # ℹ 2 more variables: trump_population_percentage <dbl>,
## # biden_population_percentage <dbl>
# Define the file path with filename and extension
file_path <- "C:/Users/Uzma/Downloads/grouped_table.csv"
# Write dataframe to CSV
write.csv(grouped_table, file = file_path, row.names = FALSE)
# Confirm that the file was saved
print("File saved successfully!")
## [1] "File saved successfully!"
No, it does not favor the the political interests of the Biden administration.
# Convert data to long format for easy visualization
grouped_long <- grouped_table %>%
pivot_longer(cols = c(trump_funding_percentage, biden_funding_percentage,
trump_population_percentage, biden_population_percentage),
names_to = "Category",
values_to = "Percentage")
# Create labels for clarity
grouped_long$Group <- ifelse(grepl("trump", grouped_long$Category), "Trump-Won States", "Biden-Won States")
grouped_long$Metric <- ifelse(grepl("funding", grouped_long$Category), "Funding Allocation", "Population Percentage")
# Create the bar chart
ggplot(grouped_long, aes(x = Group, y = Percentage, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c("Funding Allocation" = "green", "Population Percentage" = "orange")) +
labs(title = "Funding Allocation vs. Population Share by Political Affiliation",
x = "Political Affiliation (2020 Election)",
y = "Percentage of Total",
fill = "Category") +
theme_minimal()
Analysis of the Bar Chart
If funding allocation closely matches population share, then the distribution is likely fair.
If Biden states receive significantly more funding than their population share, it suggests possible bias in allocation.
If Trump states receive less funding despite a larger population share, it may indicate under funding relative to need.
This scatter plot shows whether Biden states received disproportionately more funding.
ggplot(grouped_table, aes(x = trump_population_percentage, y = trump_funding_percentage, color = "Trump States")) +
geom_point(size = 4) +
geom_smooth(method = "lm", linetype = "dashed", color = "red") +
geom_point(aes(x = biden_population_percentage, y = biden_funding_percentage, color = "Biden States"), size = 4) +
geom_smooth(aes(x = biden_population_percentage, y = biden_funding_percentage),
method = "lm", linetype = "dashed", color = "blue") +
scale_color_manual(values = c("Trump States" = "red", "Biden States" = "blue")) +
labs(title = "Funding Allocation vs. Population Share for Trump and Biden States",
x = "Population Share (%)",
y = "Funding Share (%)",
color = "Political Group") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf