library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
I chose to work with the Maternal Mortality dataset.
# Load the maternal mortality data into `mortality`.
# The file is addressed by its full path rather than via setwd(), so reading
# it does not change the session's working directory as a side effect.
# NOTE(review): the directory is still machine-specific; a project-relative
# path would make the script portable -- confirm where the CSV should live.
mortality <- read_csv(
  file.path(
    "C:/Users/munis/Documents/Comm in Data Science",
    "maternalmortality.csv"
  )
)
## Parsed with column specification:
## cols(
## State = col_character(),
## MMR = col_double(),
## Prenatal = col_double(),
## Csection = col_double(),
## Underserved = col_double(),
## Uninsured = col_double(),
## Population_18 = col_double()
## )
I used dplyr to select the variables I would work with from the data set.
# Keep only the five columns used in the analysis; every other column of the
# loaded table is dropped.
mortality <- select(
  mortality,
  MMR, Csection, Underserved, Uninsured, Population_18
)
# Preview the first six rows of the trimmed table.
head(mortality, n = 6)
## # A tibble: 6 x 5
## MMR Csection Underserved Uninsured Population_18
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.6 33.8 55 18.1 4887871
## 2 5 22.6 50 19.8 737438
## 3 7.2 26.2 51 22.3 7171646
## 4 14.6 34.8 34 23.3 3013825
## 5 11.3 32.1 49 20.9 39557045
## 6 11 25.8 42 18 5695564
I graph the effect of the percentage of C-Sections on the Maternal Mortality rate in each state. I also create a regression line to determine a trend.
# Scatter plot of MMR (y) against Csection (x), overlaid with the
# linear-model trend line that geom_smooth() fits to the same data.
ggplot(mortality, aes(Csection, MMR)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "C-Section Percentage vs. Maternal Mortality Rate")
To check the correlation of the two variables, I find the summary of my regression line and I look at the adjusted R-Squared. Unfortunately, the value is extremely small. Nonetheless, I continue on.
# Fit the model that geom_smooth(method = "lm") draws in the plot of MMR
# against Csection: MMR is the response and Csection the predictor.
# With the two swapped, the coefficients describe a different line than the
# one plotted. R-squared and the overall F-test are symmetric in a
# one-predictor regression, so those values do not depend on the direction.
line1 <- lm(MMR ~ Csection, data = mortality)
summary(line1)
##
## Call:
## lm(formula = Csection ~ MMR, data = mortality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8555 -2.1984 -0.0618 2.5403 7.4323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.26924 1.11055 25.455 <2e-16 ***
## MMR 0.22995 0.09651 2.383 0.0211 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.667 on 49 degrees of freedom
## Multiple R-squared: 0.1038, Adjusted R-squared: 0.08555
## F-statistic: 5.678 on 1 and 49 DF, p-value: 0.02111
Now, I graph the percentage of births in medically underserved areas against each state's maternal mortality rate. There doesn’t seem to be much correlation.
# Scatter plot of MMR (y) against Underserved (x), overlaid with the
# linear-model trend line that geom_smooth() fits to the same data.
ggplot(mortality, aes(Underserved, MMR)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Underserved Percentage vs. Maternal Mortality Rate")
Indeed, there was no correlation.
# Fit the model that geom_smooth(method = "lm") draws in the plot of MMR
# against Underserved: MMR is the response and Underserved the predictor.
# With the two swapped, the coefficients describe a different line than the
# one plotted. R-squared and the overall F-test are symmetric in a
# one-predictor regression, so those values do not depend on the direction.
line2 <- lm(MMR ~ Underserved, data = mortality)
summary(line2)
##
## Call:
## lm(formula = Underserved ~ MMR, data = mortality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.222 -4.803 1.838 6.723 16.617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.1787 2.4562 17.172 <2e-16 ***
## MMR 0.1305 0.2134 0.611 0.544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.109 on 49 degrees of freedom
## Multiple R-squared: 0.007566, Adjusted R-squared: -0.01269
## F-statistic: 0.3736 on 1 and 49 DF, p-value: 0.5439
I did the same thing as above but with the percentage of women who were uninsured.
# Scatter plot of MMR (y) against Uninsured (x), overlaid with the
# linear-model trend line that geom_smooth() fits to the same data.
ggplot(mortality, aes(Uninsured, MMR)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Uninsured Percentage vs. Maternal Mortality Rate")
Again, there was very little correlation.
# Fit the model that geom_smooth(method = "lm") draws in the plot of MMR
# against Uninsured: MMR is the response and Uninsured the predictor.
# With the two swapped, the coefficients describe a different line than the
# one plotted. R-squared and the overall F-test are symmetric in a
# one-predictor regression, so those values do not depend on the direction.
line3 <- lm(MMR ~ Uninsured, data = mortality)
summary(line3)
##
## Call:
## lm(formula = Uninsured ~ MMR, data = mortality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5322 -3.8249 -0.4974 3.3754 11.7157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.1394 1.3870 10.194 1.06e-13 ***
## MMR 0.2262 0.1205 1.876 0.0666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.579 on 49 degrees of freedom
## Multiple R-squared: 0.06703, Adjusted R-squared: 0.04799
## F-statistic: 3.52 on 1 and 49 DF, p-value: 0.06658
Nonetheless, I decide to graph the percentage of women who have C-sections against the maternal mortality rate for each state. I add a regression line and scale the size of each point by the state's population: the bigger the population, the larger the point. I also use plotly to make the graph interactive; if you hover over a point, you can read its actual values.
# Interactive bubble chart of MMR (y) against Csection (x); point size encodes
# Population_18 and a red linear-model trend line is overlaid.
# - size is mapped inside geom_point() only, so the trend-line layer does not
#   inherit an aesthetic it cannot use.
# - coord_cartesian() zooms the y axis to 0-22 without discarding rows.
#   ylim() removes out-of-range rows before the fit (the source of the
#   "Removed 1 rows containing non-finite values (stat_smooth)" warning), so
#   the line would be fitted on fewer rows than the table contains.
# - theme_light() is a complete theme and overwrites any theme() call added
#   before it, so the legend setting is applied after it.
mortality_graph <- ggplot(mortality, aes(x = Csection, y = MMR)) +
  geom_point(aes(size = Population_18), alpha = 0.5, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  coord_cartesian(ylim = c(0, 22)) +
  ggtitle("C-Section Percentage vs. Maternal Mortality Rate") +
  theme_light(base_size = 11) +
  theme(legend.position = "right")
# Convert the static ggplot into an interactive plotly widget, then display it.
mortality_graph <- ggplotly(mortality_graph)
mortality_graph