I tried to make the barplots on my own, so they are much more basic than the ones in the lecture. I also used facet_wrap instead of facet_grid for the barplot to understand the difference between them. With the default parameters, facet_grid seemed more useful because it places all of the age groups side by side, which makes them easier to compare. I had never used the pivot_longer or uncount functions before, and found them very useful for preparing and cleaning the data. I definitely need to review and practice making more graphs for future labs/assignments.
library(readr)
# Read the contraceptive-use data, spelling out each column's type and the
# level order of every factor.
cuse <- read_table(
  "~/Desktop/BIOS 621/Lab 2/cuse.dat",
  col_types = cols(
    age = col_factor(levels = c("<25", "25-29", "30-39", "40-49")),
    education = col_factor(levels = c("low", "high")),
    wantsMore = col_factor(levels = c("no", "yes")),
    notUsing = col_integer(),
    using = col_integer(),
    # readr fills in the name X6 for an unnamed sixth column; skip it.
    X6 = col_skip()
  )
)
## Warning: Missing column names filled in: 'X6' [6]
## Warning: 16 parsing failures.
## row col expected actual file
## 1 -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
## 2 -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
## 3 -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
## 4 -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
## 5 -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
## ... ... ......... ......... ...................................
## See problems(...) for more details.
# View() opens an interactive data viewer, so it is only called when a user is
# actually at the console; non-interactive runs (Rscript, knitting) skip it.
if (interactive()) {
  View(cuse)
}
# Print the tibble so the data also appear in the rendered output.
cuse
## # A tibble: 16 × 5
## age education wantsMore notUsing using
## <fct> <fct> <fct> <int> <int>
## 1 <25 low yes 53 6
## 2 <25 low no 10 4
## 3 <25 high yes 212 52
## 4 <25 high no 50 10
## 5 25-29 low yes 60 14
## 6 25-29 low no 19 10
## 7 25-29 high yes 155 54
## 8 25-29 high no 65 27
## 9 30-39 low yes 112 33
## 10 30-39 low no 77 80
## 11 30-39 high yes 118 46
## 12 30-39 high no 68 78
## 13 40-49 low yes 35 6
## 14 40-49 low no 46 48
## 15 40-49 high yes 8 8
## 16 40-49 high no 12 31
levels(cuse[["education"]]) # "low" is already the first level, so relevel() is not needed
## [1] "low" "high"
# Check the level order of wantsMore as well.
levels(cuse[["wantsMore"]])
## [1] "no" "yes"
# Per-column summary: level counts for the factors, and quartiles plus the mean
# for the two integer count columns.
summary(cuse)
## age education wantsMore notUsing using
## <25 :4 low :8 no :8 Min. : 8.00 Min. : 4.00
## 25-29:4 high:8 yes:8 1st Qu.: 31.00 1st Qu.: 9.50
## 30-39:4 Median : 56.50 Median :29.00
## 40-49:4 Mean : 68.75 Mean :31.69
## 3rd Qu.: 85.75 3rd Qu.:49.00
## Max. :212.00 Max. :80.00
# List the parsing failures recorded when cuse was read. The data frame is
# passed explicitly: with no argument, problems() would not be inspecting cuse
# here, because the previous statement was summary(cuse).
problems(cuse)
# Naive Table 1 built directly from the aggregated data: every row of cuse is
# treated as one observation, so the reported n is the number of rows.
tableone::CreateTableOne(data = cuse)
##
## Overall
## n 16
## age (%)
## <25 4 (25.0)
## 25-29 4 (25.0)
## 30-39 4 (25.0)
## 40-49 4 (25.0)
## education = high (%) 8 (50.0)
## wantsMore = yes (%) 8 (50.0)
## notUsing (mean (SD)) 68.75 (56.28)
## using (mean (SD)) 31.69 (25.54)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Same naive summary with table1(): all columns, one observation per row of
# the aggregated data.
table1(~ ., data = cuse)
| | Overall (N=16) |
|---|---|
| age | |
| <25 | 4 (25.0%) |
| 25-29 | 4 (25.0%) |
| 30-39 | 4 (25.0%) |
| 40-49 | 4 (25.0%) |
| education | |
| low | 8 (50.0%) |
| high | 8 (50.0%) |
| wantsMore | |
| no | 8 (50.0%) |
| yes | 8 (50.0%) |
| notUsing | |
| Mean (SD) | 68.8 (56.3) |
| Median [Min, Max] | 56.5 [8.00, 212] |
| using | |
| Mean (SD) | 31.7 (25.5) |
| Median [Min, Max] | 29.0 [4.00, 80.0] |
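Note: N = 16 here is NOT the overall number of participants; it is the number of rows in the aggregated data (4 rows per age group, one for each combination of education and wantsMore).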
Creating correct Table 1
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Total the users and non-users within each age x education x wantsMore cell.
# `.groups = "drop"` returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer emits its regrouping message.
cusebyage <- cuse %>%
  group_by(age, education, wantsMore) %>%
  summarize(
    using = sum(using),
    "not using" = sum(notUsing),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'age', 'education'. You can override using
## the `.groups` argument.
# Show the summarized counts (one row per age x education x wantsMore cell).
print(cusebyage)
## # A tibble: 16 × 5
## # Groups: age, education [8]
## age education wantsMore using `not using`
## <fct> <fct> <fct> <int> <int>
## 1 <25 low no 4 10
## 2 <25 low yes 6 53
## 3 <25 high no 10 50
## 4 <25 high yes 52 212
## 5 25-29 low no 10 19
## 6 25-29 low yes 14 60
## 7 25-29 high no 27 65
## 8 25-29 high yes 54 155
## 9 30-39 low no 80 77
## 10 30-39 low yes 33 112
## 11 30-39 high no 78 68
## 12 30-39 high yes 46 118
## 13 40-49 low no 48 46
## 14 40-49 low yes 6 35
## 15 40-49 high no 31 12
## 16 40-49 high yes 8 8
Combine the using and notUsing columns into one Contraceptive column
# Reshape to long format: the two count columns are stacked into a single
# count column `n`, with `contraceptive` recording which column each count
# came from.
cusebyage <- cusebyage %>%
  pivot_longer(
    cols = c(using, `not using`),
    names_to = "contraceptive",
    values_to = "n"
  )
cusebyage
## # A tibble: 32 × 5
## # Groups: age, education [8]
## age education wantsMore contraceptive n
## <fct> <fct> <fct> <chr> <int>
## 1 <25 low no using 4
## 2 <25 low no not using 10
## 3 <25 low yes using 6
## 4 <25 low yes not using 53
## 5 <25 high no using 10
## 6 <25 high no not using 50
## 7 <25 high yes using 52
## 8 <25 high yes not using 212
## 9 25-29 low no using 10
## 10 25-29 low no not using 19
## # ℹ 22 more rows
Uncounting the n column and making more informative labels
# Repeat each row n times (tidyr::uncount) so the data have one row per
# participant instead of one row per cell.
cuseverylong <- cusebyage %>%
  uncount(n)

# Attach descriptive labels for table1() to display instead of column names.
label(cuseverylong$contraceptive) <- "Using contraceptives?"
label(cuseverylong$wantsMore) <- "Wants more children?"
label(cuseverylong$education) <- "Education level"
label(cuseverylong$age) <- "Age (years)"

# Table 1 on the participant-level data, stratified by wantsMore.
table1(
  ~ . | wantsMore,
  data = cuseverylong,
  rowlabelhead = "Wants more children?",
  caption = "Table 1: Participants in the contraceptive use study"
)
| Wants more children? | no (N=635) | yes (N=972) | Overall (N=1607) |
|---|---|---|---|
| Age (years) | |||
| <25 | 74 (11.7%) | 323 (33.2%) | 397 (24.7%) |
| 25-29 | 121 (19.1%) | 283 (29.1%) | 404 (25.1%) |
| 30-39 | 303 (47.7%) | 309 (31.8%) | 612 (38.1%) |
| 40-49 | 137 (21.6%) | 57 (5.9%) | 194 (12.1%) |
| Education level | |||
| low | 294 (46.3%) | 319 (32.8%) | 613 (38.1%) |
| high | 341 (53.7%) | 653 (67.2%) | 994 (61.9%) |
| Using contraceptives? | |||
| not using | 347 (54.6%) | 753 (77.5%) | 1100 (68.5%) |
| using | 288 (45.4%) | 219 (22.5%) | 507 (31.5%) |
#Create a barplot stratified by age and showing the relative proportions of participants using contraceptives among those who do and do not want more children.
# Basic stacked barplot of contraceptive use by desire for more children.
# `weight = n` makes each bar height the sum of the cell counts rather than
# the number of rows. facet_wrap() lays out one panel per age group.
ggplot(
  data = cusebyage,
  mapping = aes(x = wantsMore, fill = contraceptive, weight = n)
) +
  geom_bar() +
  facet_wrap(vars(age))

# Same plot with facet_grid(), which puts the age groups in columns.
ggplot(
  data = cusebyage,
  mapping = aes(x = wantsMore, fill = contraceptive, weight = n)
) +
  geom_bar() +
  facet_grid(cols = vars(age))

# Polished version: outlined stacked bars, labelled facets, titles, axis
# labels, and a manual black-and-white fill scheme.
ggplot(
  data = cusebyage,
  mapping = aes(x = wantsMore, fill = contraceptive, weight = n)
) +
  # Stacked bars whose heights are the weighted counts, outlined in black.
  geom_bar(stat = "count", position = "stack", colour = "black") +
  # One column of panels per age group; label_both prints "age: <level>".
  facet_grid(cols = vars(age), labeller = label_both) +
  labs(
    title = "Contraceptive usage counts",
    subtitle = "in Contraceptive Use dataset",
    caption = "Contraceptive use in study sample",
    x = "Wants more children?",
    y = "Number of Participants"
  ) +
  # Fill colors are set by hand, in the order of the fill values
  # ("not using", then "using").
  scale_fill_manual(values = c("white", "grey")) +
  theme_bw()
# Repeat the barplot showing percentages instead of counts. First, add a percent column.
# Express each count as a percentage of its age x wantsMore total, so the
# rows belonging to one bar of the plot sum to 100. The grouping is only
# needed for this calculation, so it is removed again with ungroup().
cusebyage2 <- cusebyage %>%
  group_by(age, wantsMore) %>%
  mutate(percent = n / sum(n) * 100) %>%
  ungroup()
cusebyage2
## # A tibble: 32 × 6
## # Groups: age, wantsMore [8]
## age education wantsMore contraceptive n percent
## <fct> <fct> <fct> <chr> <int> <dbl>
## 1 <25 low no using 4 5.41
## 2 <25 low no not using 10 13.5
## 3 <25 low yes using 6 1.86
## 4 <25 low yes not using 53 16.4
## 5 <25 high no using 10 13.5
## 6 <25 high no not using 50 67.6
## 7 <25 high yes using 52 16.1
## 8 <25 high yes not using 212 65.6
## 9 25-29 low no using 10 8.26
## 10 25-29 low no not using 19 15.7
## # ℹ 22 more rows
Make the barplot
# Basic percentage barplot: `weight = percent` makes each bar height the sum
# of the percentages computed within age x wantsMore.
ggplot(
  data = cusebyage2,
  mapping = aes(x = wantsMore, fill = contraceptive, weight = percent)
) +
  geom_bar() +
  facet_grid(cols = vars(age))

# Polished version with outlined bars, labelled facets, titles, axis labels,
# and a manual black-and-white fill scheme.
ggplot(
  data = cusebyage2,
  mapping = aes(x = wantsMore, fill = contraceptive, weight = percent)
) +
  # Stacked bars whose heights are the summed percentages (not raw counts),
  # outlined in black.
  geom_bar(stat = "count", position = "stack", colour = "black") +
  # One column of panels per age group; label_both prints "age: <level>".
  facet_grid(cols = vars(age), labeller = label_both) +
  labs(
    title = "Contraceptive usage proportion",
    subtitle = "in Contraceptive Use dataset",
    caption = "Contraceptive use in study sample",
    x = "Wants more children?",
    y = "Percent of Participants"
  ) +
  # Fill colors are set by hand, in the order of the fill values
  # ("not using", then "using").
  scale_fill_manual(values = c("white", "grey")) +
  theme_bw()