I tried to make the barplots on my own, so they are much more basic than the ones in the lecture. I also used facet_wrap instead of facet_grid for the first barplot to understand the difference between them. Without specifying any further parameters, facet_grid seemed more useful because it places all of the age groups side by side, which makes them easier to compare. I had never used the pivot_longer or uncount functions before, and I found them very useful for preparing/cleaning the data. I definitely need to review and practice making more graphs for future labs/assignments.

Load the contraceptive use data

library(readr)
# Read the whitespace-delimited contraceptive-use table, declaring every
# column type up front so the three categorical variables become factors
# with an explicit level order and the two count columns become integers.
cuse <- read_table(
  file = "~/Desktop/BIOS 621/Lab 2/cuse.dat",
  col_types = cols(
    age = col_factor(levels = c("<25", "25-29", "30-39", "40-49")),
    education = col_factor(levels = c("low", "high")),
    wantsMore = col_factor(levels = c("no", "yes")),
    notUsing = col_integer(),
    using = col_integer(),
    # X6 is the auto-named sixth column; it is dropped on import.
    X6 = col_skip()
  )
)
## Warning: Missing column names filled in: 'X6' [6]
## Warning: 16 parsing failures.
## row col  expected    actual                                file
##   1  -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
##   2  -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
##   3  -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
##   4  -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
##   5  -- 6 columns 5 columns '~/Desktop/BIOS 621/Lab 2/cuse.dat'
## ... ... ......... ......... ...................................
## See problems(...) for more details.
# View() opens a spreadsheet-style viewer, which only makes sense in an
# interactive session; skip it when the document is knitted or run as a script.
if (interactive()) {
  View(cuse)
}
# Print the imported tibble so it appears in the rendered output.
cuse
## # A tibble: 16 × 5
##    age   education wantsMore notUsing using
##    <fct> <fct>     <fct>        <int> <int>
##  1 <25   low       yes             53     6
##  2 <25   low       no              10     4
##  3 <25   high      yes            212    52
##  4 <25   high      no              50    10
##  5 25-29 low       yes             60    14
##  6 25-29 low       no              19    10
##  7 25-29 high      yes            155    54
##  8 25-29 high      no              65    27
##  9 30-39 low       yes            112    33
## 10 30-39 low       no              77    80
## 11 30-39 high      yes            118    46
## 12 30-39 high      no              68    78
## 13 40-49 low       yes             35     6
## 14 40-49 low       no              46    48
## 15 40-49 high      yes              8     8
## 16 40-49 high      no              12    31
# Confirm the factor level order chosen at import; "low" and "no" are already
# the first levels, so relevel() is not needed.
levels(cuse[["education"]])
## [1] "low"  "high"
levels(cuse[["wantsMore"]])
## [1] "no"  "yes"
# Column-by-column summary of the aggregated table.
summary(cuse)
##     age    education wantsMore    notUsing          using      
##  <25  :4   low :8    no :8     Min.   :  8.00   Min.   : 4.00  
##  25-29:4   high:8    yes:8     1st Qu.: 31.00   1st Qu.: 9.50  
##  30-39:4                       Median : 56.50   Median :29.00  
##  40-49:4                       Mean   : 68.75   Mean   :31.69  
##                                3rd Qu.: 85.75   3rd Qu.:49.00  
##                                Max.   :212.00   Max.   :80.00
# Show the parsing failures recorded while reading cuse. The data frame is
# passed explicitly: called with no argument, problems() does not look at cuse.
problems(cuse)

Create an epi “Table 1”

# Naive "Table 1" computed directly on the aggregated data: each of the 16 rows
# is one age x education x wantsMore stratum, so n = 16 below counts rows, not
# participants, and notUsing/using are summarized as if they were ordinary
# numeric measurements.
tableone::CreateTableOne(data = cuse)
##                       
##                        Overall      
##   n                       16        
##   age (%)                           
##      <25                   4 (25.0) 
##      25-29                 4 (25.0) 
##      30-39                 4 (25.0) 
##      40-49                 4 (25.0) 
##   education = high (%)     8 (50.0) 
##   wantsMore = yes (%)      8 (50.0) 
##   notUsing (mean (SD)) 68.75 (56.28)
##   using (mean (SD))    31.69 (25.54)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# `~ .` summarizes every column of cuse. Because cuse holds one row per
# stratum, N = 16 in the output is the number of rows, not of participants.
table1(~ ., data = cuse)
Overall
(N=16)
age
<25 4 (25.0%)
25-29 4 (25.0%)
30-39 4 (25.0%)
40-49 4 (25.0%)
education
low 8 (50.0%)
high 8 (50.0%)
wantsMore
no 8 (50.0%)
yes 8 (50.0%)
notUsing
Mean (SD) 68.8 (56.3)
Median [Min, Max] 56.5 [8.00, 212]
using
Mean (SD) 31.7 (25.5)
Median [Min, Max] 29.0 [4.00, 80.0]

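Note: the counts above are NOT numbers of participants. Both tables treat each of the 16 rows of the aggregated data as one observation, so they report the number of rows (covariate combinations) per level, e.g. 4 per age group.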

Creating correct Table 1

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Total users and non-users for every age x education x wantsMore combination.
# The non-user column is renamed to "not using" so that it reads well as a
# category label once the table is reshaped.
cusebyage <- cuse %>%
  group_by(age, education, wantsMore) %>%
  summarize(
    using = sum(using),
    `not using` = sum(notUsing)
  )
## `summarise()` has grouped output by 'age', 'education'. You can override using
## the `.groups` argument.
print(cusebyage)
## # A tibble: 16 × 5
## # Groups:   age, education [8]
##    age   education wantsMore using `not using`
##    <fct> <fct>     <fct>     <int>       <int>
##  1 <25   low       no            4          10
##  2 <25   low       yes           6          53
##  3 <25   high      no           10          50
##  4 <25   high      yes          52         212
##  5 25-29 low       no           10          19
##  6 25-29 low       yes          14          60
##  7 25-29 high      no           27          65
##  8 25-29 high      yes          54         155
##  9 30-39 low       no           80          77
## 10 30-39 low       yes          33         112
## 11 30-39 high      no           78          68
## 12 30-39 high      yes          46         118
## 13 40-49 low       no           48          46
## 14 40-49 low       yes           6          35
## 15 40-49 high      no           31          12
## 16 40-49 high      yes           8           8

Combine the using and notUsing columns into one Contraceptive column

# Reshape to long format: the two count columns collapse into a single
# `contraceptive` label column plus an `n` column holding the count.
cusebyage <- cusebyage %>%
  pivot_longer(
    cols = c(using, `not using`),
    names_to = "contraceptive",
    values_to = "n"
  )
cusebyage
## # A tibble: 32 × 5
## # Groups:   age, education [8]
##    age   education wantsMore contraceptive     n
##    <fct> <fct>     <fct>     <chr>         <int>
##  1 <25   low       no        using             4
##  2 <25   low       no        not using        10
##  3 <25   low       yes       using             6
##  4 <25   low       yes       not using        53
##  5 <25   high      no        using            10
##  6 <25   high      no        not using        50
##  7 <25   high      yes       using            52
##  8 <25   high      yes       not using       212
##  9 25-29 low       no        using            10
## 10 25-29 low       no        not using        19
## # ℹ 22 more rows

Uncounting the n column and making more informative labels

# Expand the counts so that every participant gets a row of their own
# (uncount() comes from tidyr and repeats each row n times).
cuseverylong <- cusebyage %>%
  uncount(weights = n)

# Attach human-readable variable labels for table1 to display.
label(cuseverylong$age) <- "Age (years)"
label(cuseverylong$education) <- "Education level"
label(cuseverylong$wantsMore) <- "Wants more children?"
label(cuseverylong$contraceptive) <- "Using contraceptives?"

# Participant-level Table 1, with one column per answer to wantsMore.
table1(
  ~ . | wantsMore,
  data = cuseverylong,
  rowlabelhead = "Wants more children?",
  caption = "Table 1: Participants in the contraceptive use study"
)
Table 1: Participants in the contraceptive use study
Wants more children? no
(N=635)
yes
(N=972)
Overall
(N=1607)
Age (years)
<25 74 (11.7%) 323 (33.2%) 397 (24.7%)
25-29 121 (19.1%) 283 (29.1%) 404 (25.1%)
30-39 303 (47.7%) 309 (31.8%) 612 (38.1%)
40-49 137 (21.6%) 57 (5.9%) 194 (12.1%)
Education level
low 294 (46.3%) 319 (32.8%) 613 (38.1%)
high 341 (53.7%) 653 (67.2%) 994 (61.9%)
Using contraceptives?
not using 347 (54.6%) 753 (77.5%) 1100 (68.5%)
using 288 (45.4%) 219 (22.5%) 507 (31.5%)

Create a barplot stratified by age and showing the relative proportions of participants using contraceptives among those who do and do not want more children.

# Basic stacked barplot of participant counts, one panel per age group.
# weight = n makes each bar's height the sum of n rather than the row count.
# facet_wrap() arranges the panels in a wrapped layout.
cusebyage %>%
  ggplot(mapping = aes(x = wantsMore, weight = n, fill = contraceptive)) +
  geom_bar() +
  facet_wrap(vars(age))

# Same count barplot, but facet_grid() puts the age groups in a single row of
# panels, which makes them easier to compare side by side.
cusebyage %>%
  ggplot(mapping = aes(x = wantsMore, weight = n, fill = contraceptive)) +
  geom_bar() +
  facet_grid(cols = vars(age))

# Annotated count barplot: stacked bars of participants by contraceptive use,
# split by whether they want more children, with one panel per age group.
cusebyage %>%
  ggplot(mapping = aes(x = wantsMore, weight = n, fill = contraceptive)) +
  # Stack the weighted counts and outline every bar segment in black.
  geom_bar(stat = "count", position = "stack", colour = "black") +
  # One column of panels per age group; label_both prints "age: <level>".
  facet_grid(cols = vars(age), labeller = label_both) +
  labs(
    title = "Contraceptive usage counts",
    subtitle = "in Contraceptive Use dataset",
    caption = "Contraceptive use in study sample",
    x = "Wants more children?",
    y = "Number of Participants"
  ) +
  # Manual fill colours instead of one of the automatic scale_fill_* schemes.
  scale_fill_manual(values = c("white", "grey")) +
  theme_bw()

Repeat the barplot showing percentages instead of counts. First, add a percent column.

# Express each count as a percentage of all participants in the same
# age x wantsMore cell, so the percentages within one bar sum to 100.
cusebyage2 <- cusebyage %>%
  group_by(age, wantsMore) %>%
  mutate(percent = n / sum(n) * 100)
cusebyage2
## # A tibble: 32 × 6
## # Groups:   age, wantsMore [8]
##    age   education wantsMore contraceptive     n percent
##    <fct> <fct>     <fct>     <chr>         <int>   <dbl>
##  1 <25   low       no        using             4    5.41
##  2 <25   low       no        not using        10   13.5 
##  3 <25   low       yes       using             6    1.86
##  4 <25   low       yes       not using        53   16.4 
##  5 <25   high      no        using            10   13.5 
##  6 <25   high      no        not using        50   67.6 
##  7 <25   high      yes       using            52   16.1 
##  8 <25   high      yes       not using       212   65.6 
##  9 25-29 low       no        using            10    8.26
## 10 25-29 low       no        not using        19   15.7 
## # ℹ 22 more rows

Make the barplot

# Basic stacked barplot of percentages: weighting by percent makes every bar
# add up to 100 within its age group panel.
cusebyage2 %>%
  ggplot(mapping = aes(x = wantsMore, weight = percent, fill = contraceptive)) +
  geom_bar() +
  facet_grid(cols = vars(age))

# Annotated percentage barplot: same layout as the count version, but the bar
# heights are the within-cell percentages stored in `percent`.
cusebyage2 %>%
  ggplot(mapping = aes(x = wantsMore, weight = percent, fill = contraceptive)) +
  # Stack the weighted values (here percentages) and outline segments in black.
  geom_bar(stat = "count", position = "stack", colour = "black") +
  # One column of panels per age group; label_both prints "age: <level>".
  facet_grid(cols = vars(age), labeller = label_both) +
  labs(
    title = "Contraceptive usage proportion",
    subtitle = "in Contraceptive Use dataset",
    caption = "Contraceptive use in study sample",
    x = "Wants more children?",
    y = "Percent of Participants"
  ) +
  # Manual fill colours instead of one of the automatic scale_fill_* schemes.
  scale_fill_manual(values = c("white", "grey")) +
  theme_bw()