library(tidyverse)
library(dplyr)
library(ggplot2)
library(treemap)Project 1: Exploring Low Birth Weight in the USA
#Summary: The Low Birth Weight Babies in the United States dataset was obtained from the Annie E. Casey Foundation website. The official data source is the Centers for Disease Control and Prevention, National Center for Health Statistics. The foundation has a data repository called, Kids Count Data Center, where you can find copious datasets about youth wellbeing. The low birth weight dataset includes information on live births weighing less than 5.5 pounds. It also includes the mother’s place of residence (not where the birth was given).
The dataset included the mother’s residence (location), reported race/ethnicity (Race Group), year of births (TimeFrame), and the count of low birth weight (Data). The dataset includes three variables that primarily served as categorical variables (location, Race Group, and TimeFrame), one variable that served as the quantitative variable (Data).
The dataset was cleaned using “tolower”, “gsub”, and “!is.na”. These were elected given that the variable names were in upper and lower case, and there were unwanted spaces and blanks that needed to be removed from the dataset to complete the analysis. “Filter (X !=…” was also used to remove unwanted values. For example, the dataset included both the count and the percentage of low birth weights. Yet, only the count was needed for the analysis. A couple of variables were also created in order to assist with the data analysis. For example, when looking at low birth weight by region, the states needed to be assigned their respective region.
#Load Packages
#Working Directory & Dataset
setwd("/Users/smhenderson/Desktop/DATA110/R/Datasets")
lowbirthweight <- read_csv("Low birth-weight babies by race and ethnicity.csv")#Data Cleaning
names(lowbirthweight) <- tolower(names(lowbirthweight)) #create lowercase
names(lowbirthweight) <- gsub(" ","",names(lowbirthweight)) #removes unwanted spaces
#removes unwanted values in the location, dataformat, and racegroup variables
lowbirthweight2 <- lowbirthweight %>%
filter(location != "United States") %>%
filter(dataformat != "Percent") %>%
filter(racegroup != "Total") %>%
mutate(racegroup = recode(racegroup, "Asian and Pacific Islander" = "American/Pacific Islander", "Black or African American" = "Black", "Hispanic or Latino" = "Latinx","Non-Hispanic White" = "White", "Two or more races" = "Multiracial" )) #rename the different groups
lowbirthweight2$data <- as.numeric(lowbirthweight2$data) #convert data variable so you can easily sum
lowbirthweight3 <- lowbirthweight2 %>%
filter(!is.na(data)) #removes blanks#Assign Regions (West, Midwest, South, Northeast) to States
regions <- lowbirthweight3 %>%
mutate(region = ifelse(location %in% c("Alaska", "California", "Colorado", "Hawaii", "Idaho", "Montana", "Nevada", "Oregon",
"Utah", "Washington", "Wyoming"), "West",
ifelse(location %in% c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia", "Kentucky", "Louisiana",
"Maryland", "Mississippi", "North Carolina", "Oklahoma", "South Carolina",
"Tennessee", "Texas", "Virginia", "West Virginia"), "South",
ifelse(location %in% c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan",
"Minnesota", "Missouri", "Nebraska", "North Dakota",
"Ohio", "South Dakota", "Wisconsin"), "Midwest",
ifelse(location %in% c("Connecticut", "Maine", "Massachusetts",
"New Hampshire", "New Jersey", "New York",
"Pennsylvania","Rhode Island", "Vermont"), "Northeast", NA))))) %>%
filter(!is.na(region)) #removes blanks#Filter Low Birth Weights by Year
lbw_year <- lowbirthweight3 %>%
group_by(timeframe) %>%
summarize(total = sum(data))
#Convert "Timeframe" as factor because it needs to be read as a chartegorical variable in order to produce the bar chart
lbw_year$timeframe <- as.factor(lbw_year$timeframe)#Create a Bar Chart showing Low Birth Weight by Years from 2016 - 2021
ggplot(data = lbw_year, aes(x = timeframe, y = total, fill = timeframe)) +
geom_bar(stat = "identity") +
labs(x = "Year", y = "Frequency", title = "Low Birth Weight in the USA from 2016 - 2021") +
labs(caption = "Source: Centers for Disease Control and Prevention, National Center for Health Statistics") +
theme_minimal() +
scale_fill_discrete(name = "Year")+
guides(fill = "none") +
theme(plot.title = element_text(hjust = 0.5), panel.background = element_rect(fill = "white"),
panel.grid = element_blank())options(scipen = 999)#Filter Low Birth Weight by Race/Ethnicity
lbw_sum_race <- lowbirthweight3 %>%
group_by(timeframe, racegroup) %>%
summarize(total = sum(data), .groups = "drop")
str(lbw_sum_race)tibble [36 × 3] (S3: tbl_df/tbl/data.frame)
$ timeframe: num [1:36] 2016 2016 2016 2016 2016 ...
$ racegroup: chr [1:36] "American Indian" "American/Pacific Islander" "Black" "Latinx" ...
$ total : num [1:36] 2829 23942 82148 67210 8389 ...
#Convert "Timeframe" as factor because it needs to be read as a chartegorical variable in order to produce the bar chart. Also, convert "Data" as numeric so that it can be summed.
lbw_sum_race$timeframe <- as.factor(lbw_sum_race$timeframe)
lbw_sum_race$total <- as.numeric(lbw_sum_race$total)#Calculate Low Birth Weight by Race/Ethnicty
freq_race <- lbw_sum_race %>%
group_by(racegroup) %>%
summarize(freq = sum(total)) %>%
arrange(desc(freq))
#Reorder Frequencies from Highest to Lowest
freq_race <- freq_race[order(freq_race$freq, decreasing = TRUE), ]#Create a Bar Chart Showing Low Birth Weight by Race/Ethncity
ggplot(data = freq_race, aes(x = reorder(racegroup, freq), y = freq, fill = racegroup)) +
geom_bar(stat = "identity") + guides(fill = "none") +
labs(x = "Race/Ethnicity", y = "Frequency", title = "Low Birth Weight by Race/Ethnicity from 2016 - 2021") +
labs(caption = "Source: Centers for Disease Control and Prevention, National Center for Health Statistics") +
theme_minimal() +
scale_fill_discrete(name = "Race/Ethnicity")+
coord_flip() #makes the bar graph horizontal + theme(plot.title = element_text(hjust = 0.5), panel.background = element_rect(fill = "white"),
panel.grid = element_blank())List of 3
$ panel.background:List of 5
..$ fill : chr "white"
..$ colour : NULL
..$ linewidth : NULL
..$ linetype : NULL
..$ inherit.blank: logi FALSE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ panel.grid : list()
..- attr(*, "class")= chr [1:2] "element_blank" "element"
$ plot.title :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 0.5
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi FALSE
..- attr(*, "class")= chr [1:2] "element_text" "element"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
options(scipen = 999)
ggplotfunction (data = NULL, mapping = aes(), ..., environment = parent.frame())
{
UseMethod("ggplot")
}
<bytecode: 0x1073d91f8>
<environment: namespace:ggplot2>
#Calculate Proportions of Low Birth Weight by Race/Ethnicity
prop_race <- lbw_sum_race %>%
group_by(timeframe) %>%
mutate(prop = total / sum(total))#Stacked Bar Chart by Race/Ethnicity & Year
ggplot(data = prop_race, aes(x = timeframe, y = prop, fill = racegroup)) +
geom_bar(stat = "identity", position = "stack") +
labs(x = "Year", y = "Proportion", title = "Proportion of Low Birth Weight by Race/Ethnicity and Year") +
labs(caption = "Source: Centers for Disease Control and Prevention, National Center for Health Statistics") +
theme_minimal() +
scale_fill_discrete(name = "Race/Ethnicity")#Calculate Low Birth Weight by Regions
lbw_sum_region <- regions %>%
group_by(region) %>%
summarize(total = sum(data)) %>%
arrange(desc(total))
#Reorder Frequencies from Highest to Lowest
lbw_sum_region <- lbw_sum_region[order(lbw_sum_region$total, decreasing = TRUE), ]#Create a Bar Chart showing Low Birth Weight by Regions
ggplot(data = lbw_sum_region, aes(x = reorder(region, total), y = total, fill = region)) +
geom_bar(stat = "identity") +
labs(x = "Regions", y = "Frequency", title = "Low Birth Weight by Regions from 2016 - 2021") +
labs(caption = "Source: Centers for Disease Control and Prevention, National Center for Health Statistics") +
theme_minimal() +
scale_fill_discrete(name = "Race/Ethnicity")+
guides(fill = "none") +
theme(plot.title = element_text(hjust = 0.5), panel.background = element_rect(fill = "white"),
panel.grid = element_blank())options(scipen = 999)
ggplotfunction (data = NULL, mapping = aes(), ..., environment = parent.frame())
{
UseMethod("ggplot")
}
<bytecode: 0x1073d91f8>
<environment: namespace:ggplot2>
#Calculate Low Birth Weight by States in the South Region
south2 <- regions %>%
filter(region == "South") %>%
group_by(location) %>%
summarize(total = sum(data)) %>%
arrange(desc(total))#Create Treemap of Low Birth Weights by Southern States
south_treemap <- treemap(south2, index="location", vSize="total", type="index",
title = "Low Birth Weight by Southern States from 2016 - 2021", fontsize.labels = 11, palette="Set3")#Summary: I created several data visualizations based on my low birth weight dataset analysis. Firstly, I looked at low birth weight in the USA from 2016 – 2021. There was an observed decrease from 2016 – 2020. However, there an increase in 2021. It would be interesting to know if the changes in low birth weight by year hold any statistical significance. Also, it would be interesting to know some of the potential factors that caused the increase from 2020 – 2021 (e.g., maybe related to the pandemic). Next, I looked at low birth weight by Race/Ethnicity. White babies were more likely to weigh less than 5.5 pounds compared to other babies from different race/ethnicity groups. I also looked at the proportion of low birth weight by Race/Ethnicity and Year. It follows a similar trend where white babies had the highest numbers compared to other groups. However, there is a noticeable ongoing decrease in white babies from 2016 – 2021 whereas others aren’t as apparent (e.g., black babies). Lastly, I looked at low birth weight by region from 2016 – 2021. Those categorized in southern states, respectfully, reported more low birth weights than any other region (Northeast, West, and Midwest). When looking at low birth weights in the South, Texas, and Florida reported the highest low birth weights in this region with West Virginia, Delaware, and the District of Columbia reporting the lowest low birth weights. Note, that the missing block is Delaware. It would be interesting to see a similar breakdown by all regions, and tabulated by race/ethnicity.