Does higher units of alcohol conscoumption lead to a higher number of sexual transmission disease?
mydata <- read.csv("abortion.csv")
head(mydata, n = 10)
tail(mydata, n = 10)
summary(mydata)
## ID fip age race year
## Min. : 1 Min. : 1.00 Min. :15.0 Min. :1.0 Min. :1985
## 1st Qu.: 4897 1st Qu.:16.00 1st Qu.:20.0 1st Qu.:1.0 1st Qu.:1989
## Median : 9792 Median :29.00 Median :27.5 Median :1.5 Median :1992
## Mean : 9792 Mean :28.96 Mean :27.5 Mean :1.5 Mean :1992
## 3rd Qu.:14688 3rd Qu.:42.00 3rd Qu.:35.0 3rd Qu.:2.0 3rd Qu.:1996
## Max. :19584 Max. :56.00 Max. :40.0 Max. :2.0 Max. :2000
##
## sex totpop ir crack
## Min. :1.0 Min. : 0 Min. : 0.0 Min. :-1.166
## 1st Qu.:1.0 1st Qu.: 18245 1st Qu.: 100.5 1st Qu.: 0.718
## Median :1.5 Median : 87564 Median : 186.8 Median : 1.369
## Mean :1.5 Mean : 163414 Mean : 416.7 Mean : 1.601
## 3rd Qu.:2.0 3rd Qu.: 207858 3rd Qu.: 645.0 3rd Qu.: 2.212
## Max. :2.0 Max. :1606397 Max. :5542.9 Max. : 7.313
##
## alcohol income ur poverty
## Min. :1.200 Min. : 9892 Min. : 2.258 Min. : 2.90
## 1st Qu.:2.040 1st Qu.:16613 1st Qu.: 4.317 1st Qu.:10.10
## Median :2.300 Median :20060 Median : 5.312 Median :12.40
## Mean :2.388 Mean :20626 Mean : 5.546 Mean :13.06
## 3rd Qu.:2.560 3rd Qu.:24065 3rd Qu.: 6.575 3rd Qu.:15.43
## Max. :5.050 Max. :41489 Max. :13.442 Max. :27.20
##
## repeal acc wht male
## Min. :0.00000 Min. : 0.000 Min. :0.0 Min. :0.0
## 1st Qu.:0.00000 1st Qu.: 6.616 1st Qu.:0.0 1st Qu.:0.0
## Median :0.00000 Median : 17.060 Median :0.5 Median :0.5
## Mean :0.09804 Mean : 30.299 Mean :0.5 Mean :0.5
## 3rd Qu.:0.00000 3rd Qu.: 35.711 3rd Qu.:1.0 3rd Qu.:1.0
## Max. :1.00000 Max. :454.303 Max. :1.0 Max. :1.0
##
## lnr t younger fa
## Min. :-1.792 Min. : 1.00 Min. :0.0000 Min. : 1.0
## 1st Qu.: 3.538 1st Qu.: 4.75 1st Qu.:0.0000 1st Qu.: 26.0
## Median : 5.240 Median : 8.50 Median :0.0000 Median : 51.5
## Mean : 5.285 Mean : 8.50 Mean :0.1667 Mean : 51.5
## 3rd Qu.: 7.125 3rd Qu.:12.25 3rd Qu.:0.0000 3rd Qu.: 77.0
## Max. :10.007 Max. :16.00 Max. :1.0000 Max. :102.0
## NA's :1663 NA's :13056
## pi bf15
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000
## Mean :0.07026 Mean :0.04167
## 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000
##
display the structure of dataframe
str(mydata)
## 'data.frame': 19584 obs. of 23 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fip : int 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 30 15 20 20 20 40 35 40 20 35 ...
## $ race : int 2 1 2 1 2 2 2 1 1 1 ...
## $ year : int 1985 1985 1985 1985 1985 1985 1985 1985 1985 1985 ...
## $ sex : int 1 2 2 1 1 1 2 2 2 2 ...
## $ totpop : int 78805 224003 94113 252076 94113 48527 68459 177798 252076 203621 ...
## $ ir : num 371.5 51.4 390.9 100.8 390.9 ...
## $ crack : num 0.217 0.217 0.217 0.217 0.217 ...
## $ alcohol: num 1.9 1.9 1.9 1.9 1.9 ...
## $ income : int 11566 11566 11566 11566 11566 11566 11566 11566 11566 11566 ...
## $ ur : num 8.62 8.62 8.62 8.62 8.62 ...
## $ poverty: num 20.6 20.6 20.6 20.6 20.6 ...
## $ repeal : int 0 0 0 0 0 0 0 0 0 0 ...
## $ acc : num 0.68 0.68 0.68 0.68 0.68 ...
## $ wht : int 0 1 0 1 0 0 0 1 1 1 ...
## $ male : int 1 0 0 1 1 1 0 0 0 0 ...
## $ lnr : num 7.88 6.36 8.92 5.91 9.07 ...
## $ t : int 1 1 1 1 1 1 1 1 1 1 ...
## $ younger: int 0 1 0 0 0 0 0 0 0 0 ...
## $ fa : int NA 1 NA NA NA NA NA NA NA NA ...
## $ pi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bf15 : int 0 0 0 0 0 0 0 0 0 0 ...
We have 100,000 STD cases per capita and the mean value is 5.28. On the other hand, we have 2.38 of mean value for alchohol consumption (per capita). # 2. perform some basic transformations. They will need to make sense but could include column renaming, creating a subset of the data, replacing values, or creating new columns with derived data
cleaned_data <- subset(mydata, (lnr != "NA" & alcohol != "NA"), select = c("lnr", "alcohol", "totpop", "sex","year", "race") )
cleaned_data
cleaned_data$sex[cleaned_data$sex == 1] <- "Male"
cleaned_data$sex[cleaned_data$sex == 2] <- "Female"
cleaned_data$race[cleaned_data$race == 1] <- "White"
cleaned_data$race[cleaned_data$race == 2] <- "Black"
tail(cleaned_data)
# add a new column called total_alcohol using mutate()
# total unit of alcohol consumption
recleaned <- dplyr::mutate(cleaned_data, total_alcohol = alcohol * totpop)
recleaned
given lnr, (sexual transmission disease)value is per 100,000. So lets find out the total gonorrhea cases and add a new column called total_std
# Total gonorrhea disease
recleaned_data <- dplyr::mutate(recleaned, total_std = ((lnr*totpop)/100000))
recleaned_data
mean value of total population and total units of alcohol consumption
mean_total_alcohol <- mean(recleaned_data[["total_alcohol"]])
mean_totpop <- mean(recleaned_data[["totpop"]])
mean_total_alcohol
## [1] 403763.2
mean_totpop
## [1] 176075.9
median value of total population and total units of alcohol consumption
median_total_alcohol <- median(recleaned_data[["total_alcohol"]])
median_totpop <- median(recleaned_data[["totpop"]])
median_total_alcohol
## [1] 217542.5
median_totpop
## [1] 97923
head(recleaned_data, n = 25)
# check if there is any NA values
any(is.na(cleaned_data))
## [1] FALSE
3.Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.
library(ggplot2)
# compare the total number of gonorrhea disease with total unit of alcohol #consumed
totAlcohol_VSTot_std <- ggplot(recleaned_data, aes(x = total_alcohol, y = total_std)) + geom_point()+ stat_smooth(se = FALSE)+ labs(x = "Total Units of alcohol consumption", y = "Total Gonorrhea cases")
totAlcohol_VSTot_std
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# avoid overplotting
library(ggplot2)
totAlcohol_VSTot_std <- ggplot(recleaned_data, aes(x = total_alcohol, y = total_std)) + geom_point(alpha = 0.1, size = 0.01)+ stat_smooth(se = FALSE)+ labs(x = "Total Units of alcohol consumption", y = "Total Gonorrhea cases")
totAlcohol_VSTot_std
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# check the Gonorrhea cases in total population # total std vs total population
total_pop_vs_total_std <- ggplot(recleaned_data, aes(x = total_alcohol, y = totpop)) + geom_point(alpha = 0.1, size = 0.01) + stat_smooth(se = FALSE) + facet_grid(rows = "sex") + labs(x = "Total Units of Alcohol consumed", y = "Gonorrhea cases", title = "Number of Gonorrhea cases in Sex Group")
total_pop_vs_total_std
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
histogram
histogram_std_cases <- ggplot(recleaned_data, aes(x = total_alcohol)) + geom_histogram(fill = "lightblue", color = "white", bins = 30) + labs(x = "Total alcohol consumption", title = "alcohol consumption")+ theme_minimal()
histogram_std_cases
## boxplot
boxPlot_maleFemale <- ggplot(recleaned_data, aes(x = total_alcohol,y = total_std, group = sex, fill = sex)) +
geom_boxplot( fill = "lightgrey", outlier.shape = NA) +
labs(x = "Total units of alcohol consumed",
y = "Total gonorrhea cases",
title = "Boxplot of Total Gonorrhea cases") +
theme_minimal()
boxPlot_maleFemale
violin plot
violinplot_std <- ggplot(recleaned_data, aes(x = total_alcohol, y = total_std, group =sex,fill = sex)) + geom_violin(trim = FALSE, scale = "width", draw_quantiles = c(0.25, 0.5, 0.75)) + labs(x = "Total units of alcohol consumption", y = "Total Gonorrhea cases", title = "Total Gonorrhea Cases VS Alcohol Consumption by Sex Group") + theme_minimal()
violinplot_std
## Conclusion:- It can be concluded from the graphs that as number of alcoholconsumption increases the cases of gonorrhea (Sexual Transmission Disease) is decreased. First, It shows that the disease rate is not following the rate of alcohol consumption. Second, the gonorrhea case rate is dispersed to the bigger number of population as it grown to bigger. However, we can see some room to doubt in data because the source shows that the number of gonorrhea cases is logged and collected from the age group of 15 to 19. This age group is less likely to consume the alcohol. Data source: https://vincentarelbundock.github.io/Rdatasets/doc/causaldata/abortion.html