Chapter_3: Data wrangling, summarizing, plotting, and statistical testing with Peter Higgins

Installed the following packages

# install.packages("janitor")
# install.packages("rstatix")
# install.packages("remotes")
# tidyverse and medicaldata are already installed.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(rstatix)

## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:janitor':
## 
##     make_clean_names
## 
## The following object is masked from 'package:stats':
## 
##     filter

library(remotes)

Viewing contents of medicaldata

# List the datasets bundled with the medicaldata package.
data(package = "medicaldata")

Wrangling the “blood_storage” dataset

# Take the blood_storage dataset from medicaldata and clean its column
# names with clean_names(); the result is stored as `prostate`.
prostate <- clean_names(medicaldata::blood_storage)

# The columns aa and fam_hx are currently stored as numbers, but they are
# actually categorical variables, so we convert them to factors with mutate().

# Recode the 0/1 columns `aa` and `fam_hx` as labelled factors so that they
# are treated as categorical variables downstream. The result is stored in
# `prostate_factors`; `prostate` itself is left unchanged.
# The assignment target is written first (`<-`) so the object being created
# is visible at the start of the pipeline rather than hidden at its end.
prostate_factors <- prostate |>
  mutate(
    aa = factor(
      aa,
      levels = c(0, 1),
      labels = c("White", "African-American")
    ),
    fam_hx = factor(
      fam_hx,
      levels = c(0, 1),
      labels = c("No family history", "Fhx of Prostate Cancer")
    )
  )

Summarise the data

# Mean age, prostate volume and preoperative PSA for every combination of
# `aa` and `fam_hx`; missing values are dropped before averaging.
prostate_factors |>
  group_by(aa, fam_hx) |>
  summarise(
    across(c(age, p_vol, preop_psa), function(v) mean(v, na.rm = TRUE))
  )

## `summarise()` has grouped output by 'aa'. You can override using the `.groups`
## argument.

## # A tibble: 4 × 5
## # Groups:   aa [2]
##   aa               fam_hx                   age p_vol preop_psa
##   <fct>            <fct>                  <dbl> <dbl>     <dbl>
## 1 White            No family history       61.8  56.9      8.06
## 2 White            Fhx of Prostate Cancer  59.5  57.3      7.22
## 3 African-American No family history       60.7  54.3      9.90
## 4 African-American Fhx of Prostate Cancer  60.1  51.4      8.71

Visualize the data

# Scatter plot of preoperative PSA against prostate volume, coloured by `aa`,
# with a linear-model smoother per panel. Panels are laid out with `aa` in
# rows and `fam_hx` in columns; the legend sits below the plot.
ggplot(prostate_factors, aes(x = p_vol, y = preop_psa, colour = aa)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(rows = vars(aa), cols = vars(fam_hx)) +
  labs(
    title = "Relationship Between Prostate Volume and Preop PSA,\nSubdivided by Family History and Race",
    x = "Prostate Volume",
    y = "Preoperative PSA"
  ) +
  theme(legend.position = "bottom")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

Statistical testing of data

# Two-sample t-test of preoperative PSA between the two `aa` groups.
# detailed = TRUE requests the extended output (group estimates, confidence
# limits, method and alternative) in addition to the statistic and p-value.
t_test(prostate_factors, preop_psa ~ aa, detailed = TRUE)

## # A tibble: 1 × 15
##   estimate estimate1 estimate2 .y.    group1 group2    n1    n2 statistic      p
## *    <dbl>     <dbl>     <dbl> <chr>  <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1    -1.89      7.86      9.75 preop… White  Afric…   259    54     -1.96 0.0534
## # ℹ 5 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## #   alternative <chr>