1. Data Source & Variables:
    1. US Low Birth Weight Babies dataset; data source: Centers for Disease Control and Prevention, National Center for Health Statistics: https://datacenter.aecf.org/
      1. Location (state), Race Group, TimeFrame (Year), Data (Low birth weight totals)
      2. 16383 observations
    2. 2019 Census dataset: https://data.census.gov. Note: I “customized” both datasets from each of these data repositories to show the variables that I was interested in seeing.
      1. State, Total Population, Multiracial, White, Black, American Indian/Alaska Native, Asian, Latinx
      2. 4368 observations

Load Packages and Datasets

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr) #to help identify hidden characters
setwd("/Users/smhenderson/Desktop/DATA101/Project")
#Low Birth Weight Dataset
lowbirthweight <- read.csv("Low birth-weight babies by race and ethnicity.csv")

#Census 2019 Dataset
census_data <- read.csv("census2019.csv")

Clean 2019 Census Data

names(census_data) <- tolower(names(census_data)) #create lowercase
names(census_data) <- gsub(" ","",names(census_data)) #removes unwanted spaces
sum(is.na(census_data)) #NAs/blanks did not return any results; however, there where missing (hidden characters) when looking at the dataset. I handled these in subsequent codes.
## [1] 0
#colnames(census_data)

#Rename and filter variables; make the dataset long form vs wide form by gathering the racial groups
census_data2 <- census_data%>%
  rename(location = x) %>%
  filter(!is.na(location)) %>%
  filter(!str_detect(location, "^\\s*$")) %>% #identify hidden characters
  tidyr::gather("race.group", "race_perc", 3:8)
#str(census_data2)

#Rename variables and recode racial groups so that they match low birth weight dataset
census_data3 <- census_data2 %>%
  rename(state_population = total.population) %>%
  mutate(race.group = recode(race.group, "asian" = "Asian/Pacific Islander", "black" = "Black", "latinx" = "Latinx","white" = "White", "multiracial" = "Multiracial", "american.indian.alaska.native" = "American Indian/Alaska Native")) #rename the different groups

#unique(census_data3$race.group)
#str(census_data3)

Clean 2019 Low Birth Weight Data

names(lowbirthweight) <- tolower(names(lowbirthweight)) #create lowercase
names(lowbirthweight) <- gsub(" ","",names(lowbirthweight)) #removes unwanted spaces
sum(is.na(lowbirthweight))
## [1] 0
#str(lowbirthweight)
#unique(lowbirthweight$race.group)
#unique(lowbirthweight$location)

#removes unwanted values in the location, dataformat, and racegroup variables
lowbirthweight2 <- lowbirthweight %>%
  filter(location != "United States") %>%
  filter(location != "District of Columbia") %>%
  filter(dataformat != "Percent") %>%
  filter(race.group != "Total") %>%
  filter(timeframe == 2019) %>%
  mutate(race.group = recode(race.group, "Asian and Pacific Islander" = "American/Pacific Islander", "Black or African American" = "Black", "Hispanic or Latino" = "Latinx","Non-Hispanic White" = "White", "Two or more races" = "Multiracial", "American Indian" = "American Indian/Alaska Native")) #rename the different groups
#unique(lowbirthweight2$location)
#unique(lowbirthweight2$race.group)
#unique(lowbirthweight3$race.group)

#Select the variables that I want to work with and rename variable
lowbirthweight3 <- lowbirthweight2 %>%
  select(location, race.group, data) %>%
  rename(lbw = data)

lowbirthweight3$lbw <- as.numeric(lowbirthweight3$lbw) #convert to numeric
## Warning: NAs introduced by coercion
sum(is.na(lowbirthweight3))
## [1] 12
lbw_missing <- sum(is.na(lowbirthweight3))
total_obs <- nrow(lowbirthweight3)
percentage_missing <- (lbw_missing / total_obs) * 100

lowbirthweight3 <- lowbirthweight3 %>%
  filter(!is.na(lbw)) #removes blanks

More Data Prepping: Convert Census Percentages to Absolute Counts

#str(census_data3)
census_data3$state_population <- as.numeric(gsub("[^0-9]", "", census_data3$state_population)) #convert to numeric, accounting for numbers being stored as text in csv file
census_data3$race_perc_numeric <- as.numeric(gsub("%", "", census_data3$race_perc)) / 100

#calculating the racial population for each state, using the total population of the state and the percentage of each racial group
census_data3$racial_population <- census_data3$state_population * (census_data3$race_perc_numeric / 100)
census_data3$racial_population <- round(census_data3$racial_population)

Merge Census & Low Birth Weight Datasets

census_lbw <- merge(census_data3, lowbirthweight3, by = c("location", "race.group"))

More Data Prepping: Calculate Low Birth Weight Rates

#Calculate the low birth weight rate for each racial group in each state by dividing the total number of low birth weight cases for that racial group by the total population of that racial group in the state.
census_lbw$lbw_rate <- census_lbw$lbw / census_lbw$racial_population
census_lbw$lbw_rate <- round(census_lbw$lbw_rate, digits = 2)

Question 1: Which Racial Group had the Highest Low Birth Weight Rates in 2019?

race_data <- census_lbw %>%
  group_by(race.group) %>%
  summarize(total_lbw_rate = mean(lbw_rate))

#Create bar graph
barplot(race_data$total_lbw_rate, names.arg = race_data$race.group,
        main = "Average Low Birth Weight Rates by Racial Groups in 2019",
        xlab = "Racial Groups", ylab = "Average Low Birth Weight Rates",
        col = "darkblue", cex.names = 0.6, ylim = c(0,0.25))

Summary:

Incorporating census data helped take into account the different population sizes for the different racial groups in each states. It helped identify if there are any potential disparities as it relates to different racial groups, and avoid potential biases arising from unequal sample sizes. Black woman in 2019 had the highest low birth weight rates (20%) compared to other racial groups with White woman having the lowest rate (6%). This is consistent with literature that express that historically marginalized racial groups tend have the worst maternal and child health outcomes.

Question 2: What is the Distribution of Average Low Birth Weight Rates among Black Babies Across all States in 2019?

# Filter the data for black women
black_women_data <- census_lbw %>%
  filter(race.group == "Black")

# Group by state and calculate the average low birth weight rate for black women
state_grouped <- black_women_data %>%
  group_by(location) %>%
  summarize(avg_lbw_rate = mean(lbw_rate))%>%
  arrange(desc(avg_lbw_rate))

hist(state_grouped$avg_lbw_rate, main = "Distribution of Average Low Birth Weight Rates Among Black Babies in 2019",
     xlab = "Average Low Birth Weight Rate", ylab = "# of States",
     col = "violet", breaks = 15,  # Adjust the number of bins (breaks)
    cex.main = 0.9)  # Adjust the title font size

## Summary: The histogram suggests that a significant number of states have average low birth weight rates around 21%. It also suggests that the majority of states have low birth weight rates clustered around a specific range, with fewer states having rates that deviate significantly from the mean.

Question 3: What are the Top 5 States with the Highest Low Birth Weight Rates among Black Babies in 2019?

top5<- head(state_grouped, 5)

barplot(top5$avg_lbw_rate, names.arg = top5$location,
        main = "Top 5 States with Highest Average Low Birth Weight Rates (Black Babies)",
        xlab = "State", ylab = "Average Low Birth Weight Rate",
        col = "darkred", cex.names = 0.8, ylim = c(0, max(top5$avg_lbw_rate) + 0.01),
        cex.main = 0.9)  # Adjust the title font size

## Summary: The top 5 US states with the highest low birth weight rates among Black babies includes all midwest states - North Dakota, Wisconsin, Iowa, Nebraska, and Utah.

Question 4: Is there a statistically significant difference in the average low birth weight rates among different racial groups?

anova_result <- aov(lbw_rate ~ as.factor(race.group), data = census_lbw)
print(summary(anova_result))
##                        Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(race.group)   4 0.5625 0.14063   147.4 <2e-16 ***
## Residuals             234 0.2233 0.00095                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Summary:

Null Hypothesis (H0): There is no significant difference in the mean low birth weight rates among different racial groups.

Alternative Hypothesis (Ha): There is a significant difference in the mean low birth weight rates among the racial groups, with historically marginalized racial groups having higher average low birth weight rates compared to the White race group.

The results indicate that there is a statistically significant difference in the average low birth weight rates among the racial groups. This conclusion is based on the p-value being very small (less than 2e-16) and suggests strong evidence against the null hypothesis (no difference between group means) and supports the alternative hypothesis that at least one group mean is different from the others. This is consistent with literature that express that historically marginalized racial groups tend have the worst maternal and child health outcomes. It also highlights the need for intervention to address racial disparities related to low birth weight rates in the US.

Create a Box Plot

boxplot(census_lbw$lbw_rate ~ census_lbw$race.group,
        main = "Low Birth Weight Rate by Racial Group in 2019",
        xlab = "Race Groups",
        ylab = "Low Birth Weight Rate",
        col = "blue",
        cex.names = 0.4,
        cex.axis = 0.6)

## Summary: The box plot shows that Black babies had the highest low birth weight rate. American Indian/Alaska Native and Black babies are the only ones with noted outliers. There are less variation in the data descibing White babies and more in Black babies.

Question 5: How does state population size impact low birth weight rates?

# Create scatterplot
plot(census_lbw$state_population, census_lbw$lbw_rate,
     xlab = "State Population",
     ylab = "Low Birth Weight Rate",
     main = "State Population vs. Low Birth Weight Rate",
     col = "blue", pch = 16)

# Add best fit line to the scatterplot
abline(lm(lbw_rate ~ state_population, data = census_lbw), col = "red")

## Summary: Given that we are seeing a negative correlation, this could mean that as the racial population increases, the low birth weight rates tends to decrease. However, in the context of your scatterplot of racial population and low birth weight rates, it’s possible that there are more states with smaller racial populations and fewer states with larger racial populations. This can result in a concentration of data points towards the left side of the graph (lower racial populations) and a tapering off towards the right side (higher racial populations).

Perform Linear Regression

regression_model <- lm(lbw_rate ~ state_population, data = census_lbw)
summary(regression_model)
## 
## Call:
## lm(formula = lbw_rate ~ state_population, data = census_lbw)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.093970 -0.044060 -0.008164  0.028586  0.214252 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.268e-01  4.976e-03  25.474  < 2e-16 ***
## state_population -1.332e-09  4.961e-10  -2.684  0.00779 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05672 on 237 degrees of freedom
## Multiple R-squared:  0.0295, Adjusted R-squared:  0.0254 
## F-statistic: 7.204 on 1 and 237 DF,  p-value: 0.007788

Summary:

Based on these results, there seems to be no meaningful linear relationship between state population and low birth weight rates. For state population, the p-value (0.95) is high, indicating that the coefficient is not statistically significant. This suggests that changes in state population are not associated with changes in low birth weight rates in a meaningful way. The Multiple R-squared (3.591e-06) is extremely close to zero, indicating that the model explains an almost negligible amount of the variability in low birth weight rates.

Potential ethical concerns: I attempted to account for potentially biases such as weighing the low birth weight dataset to account for population size and this can skew results. However, some concerns may come into play such as during the data collection process (e.g., it is uncertain how racial groups were recorded from the population - either self-reported or provider reported; also, the racial group is based on the mother’s race and not the baby) or even during the categorization of racial groups that I performed.