Content produced here is for R data wrangling/analysis purposes only. Views expressed are my own. The opinions expressed here belong solely to me and do not reflect the views of my employer.

Cleaning Input Data

.csv is always preferred over .xlsx
When read in .xlsx data, remember to clean up the headers & footers & irregular column names.
|> pipes are useful for a series of operations.

raw_xlsx <- read_excel(path = table_path,
                        sheet = 1,
                        skip = 3,
                        n_max = 51)
raw_xlsx <- raw_xlsx |>
  rename(s = 1, 
         p = 2, 
         a = 3, 
         c = 4)
total_ <- tail(raw_xlsx, n = 1) # last row as total
raw_xlsx <- raw_xlsx |>
  filter(row_number() <= n() - 1) # exclude the last row

# confirm whether two values match
sum(raw_xlsx$p) == total_$p

## [1] TRUE

Nested Loops

Avoid nested loop syntax but achieve the same thing
Note how to index list items

options(scipen = 999)
options(digits=20)
numerator <- raw_xlsx$p
n <- seq(1, 30, by = 1) # a reasonable range of numbers to show how iterations proceed in a nested loop
value_l <- vector(mode = "list", length = dim(raw_xlsx)[1])

# loop
for (i in 1:dim(raw_xlsx)[1]) {
      value_l[[i]]  <- numerator[i] / sqrt(n * (n - 1))
}
names(value_l) <- raw_xlsx$s

value_unlist <- unlist(value_l)

df_value <- data.frame(s = names(value_unlist),
                                p= unlist(value_l),
                                row.names = NULL) |>
  arrange(desc(p))

Parsing Numbers

"[0-9]+"

# parse number
df_value$s <- readr::parse_number(df_value$s) + 1
# remove 0-9 from labels
df_value$s <- gsub(pattern = "[0-9]+",replacement = "", df_value$s)

readxl-dplyr

Tiangeng Lu

2025-03-14

Cleaning Input Data

Nested Loops

Parsing Numbers