knitr::opts_knit$set(root.dir = 'C:/Users/Cyrus Elahi/Documents/R/input')

Set up your R environment

library(ggplot2)
library(stringr)
library(forcats)
library(lubridate)
library(likert)
library(dplyr)
library(tidyr)
library(kableExtra)
library(knitr)
library(scales)

Load in your data file

Included at this step is cleaning the data if needed

#Set your working directory to the folder you have saved the test data
setwd <- ("C:/Users/Cyrus Elahi/Documents/R/input")

#read in data file
data_file <- read.csv("tab1_test_data.csv")

###clean data file however needed -
#clean upper/lower case problems
str_to_upper(data_file$col_2_gender)

#convert column to numeric structure (ex. age, bmi)
data_file$col_name <- as.numeric("data_file$col_1_age")

#or sometimes if column is as.factor and converting to as.numeric
data_file$col_name <- as.numeric(as.character("data_file$BMI"))


#recoding column entries that are slightly different that should be the same
data_file$Chief.Complaints <- car::recode(data_file$Chief.Complaints, 
                                          "'HEADAHCE' = 'HEADACHE';
                                           'PHYSICAL' = 'PHYSICAL EXAM';
                                           'RESPRIATORY COMPLAINT' = 'RESPIRATORY COMPLAINT';
                                           'HEARING PROBLEMS' = 'ENT COMPLAINT';
                                           'ALLERGIES' = 'ENT COMPLAINT'") 

Table Construction

Subgroups of data

Now you are ready to build your table one. We will build a table one with three columns: one including the overall summary data and the next two representing different population subsets.

This first block of code is for building the different population subset columns

#This code chunk will help reorganize the row in the order you desire. 
#Otherwise the rows will be printed in alaphabetical order
desired_col_order <- c("Total", "Male", "Female", "All (m/sd)", "Males (m/sd)", 
                       "Females (m/sd)", "Hispanic", "White", "Black", "Asian", 
                       "Native American", "Other", "Not Available", "Spanish", 
                       "English", "Both", " Other", " Not Available")

#This code chunk manipulates and summarized your data_file. select()- place the columns 
#of interest. mutate()- create your desired population subgroups, group_by()- enter in the 
#col_name your created in the mutate() step, summarise()- summarize the columns how you please, 
#unite()- combine two columns (ex mean and sd), gather spread rotates your table into a classic 
#table 1 format, slice()- uses the desired_col_order you defined above to reorder the rows
diff_pop_subsets <- data_file %>%
  select(col_1_age, col_2_gender, col_3_race, col_4_lang) %>%
  mutate(pop_subset = case_when(col_1_age <= 17 ~ "Pediatric pts",
                                col_1_age >= 18 ~ "Adults"
                              )) %>%
  group_by(pop_subset) %>%
  summarise(AGE = round(mean(col_1_age, na.rm=TRUE), 0),
            asd = round(sd(col_1_age, na.rm = TRUE), 1),
            Males = round(mean(col_1_age[col_2_gender == "M"], na.rm = TRUE), 0), 
            Msd = round(sd(col_1_age[col_2_gender == "M"], na.rm = TRUE), 1),
            Females = round(mean(col_1_age[col_2_gender == "F"], na.rm = TRUE), 0),
            Fsd = round(sd(col_1_age[col_2_gender =="F"], na.rm = TRUE), 1),
            Male = sum(col_2_gender == "M"),
            Female = sum(col_2_gender == "F"),
            Total = sum(col_2_gender == "F" | col_2_gender == "M"),
            Hispanic = sum(col_3_race == "Hispanic"),
            White = sum(col_3_race == "White"),
            Black = sum(col_3_race == "Black"),
            Asian = sum(col_3_race == "Asian"),
            "Native American" = sum(col_3_race == "Native American"),
            Other = sum(Race = sum(col_3_race == "Other")),
            "Not Available" = sum(col_3_race == "N/A"),
            Spanish = sum(col_4_lang == "Spanish"),
            English = sum(col_4_lang == "English"),
            Both = sum(col_4_lang == "Spanish or English"),
            " Other" = sum(col_4_lang == "sign language" | col_4_lang == "Other"),
            " Not Available" = sum(col_4_lang == "n/a" | col_4_lang == "N/A")
            
        )  %>%
  unite(col = "All (m/sd)", AGE, asd, sep= " / ") %>%
  unite(col = "Males (m/sd)", Males, Msd, sep= " / ") %>%
  unite(col = "Females (m/sd)", Females, Fsd, sep= " / ") %>%
  gather (., key = "Variable", value = "value", -pop_subset) %>%
  spread(., key = pop_subset, value = value) %>%
  slice(match(desired_col_order, Variable)) #%>%

#print your table if you want to preview 
 diff_pop_subsets
## # A tibble: 18 x 3
##    Variable         Adults    `Pediatric pts`
##    <chr>            <chr>     <chr>          
##  1 Total            638       122            
##  2 Male             100       54             
##  3 Female           538       68             
##  4 All (m/sd)       44 / 12.5 10 / 4.8       
##  5 Males (m/sd)     43 / 15.2 9 / 5.2        
##  6 Females (m/sd)   44 / 11.9 11 / 4.2       
##  7 Hispanic         582       108            
##  8 White            19        8              
##  9 Black            1         0              
## 10 Asian            1         0              
## 11 Native American  1         0              
## 12 Other            2         0              
## 13 Not Available    32        6              
## 14 Spanish          445       62             
## 15 English          76        25             
## 16 Both             19        17             
## 17 " Other"         8         0              
## 18 " Not Available" 89        18

Overall data

This block of code is for building the overall column for your table 1 (including both population subsets from above)

#This code chunk will help reorganize the row in the order you desire. Otherwise the rows 
#will be printed in alaphabetical order
desired_col_order <- c("Total", "Male", "Female", "All (m/sd)", "Males (m/sd)", 
                       "Females (m/sd)", "Hispanic", "White", "Black", "Asian", 
                       "Native American", "Other", "Not Available", "Spanish", 
                       "English", "Both", " Other", " Not Available")


#This code chunk manipulates and summarized your data_file. Note you will not need to use 
#mutate(), group_by(), or spread(). select()- place the columns of interest, summarise()- summarize 
#the columns how you please, unite()- combine two columns (ex mean and sd), gather rotates your 
#table into a classic table 1 format, slice()- uses the desired_col_order you defined above to 
#reorder the rows
overall_summary <- data_file %>%
  select(col_1_age, col_2_gender, col_3_race, col_4_lang) %>%
#  mutate(pop_subset = case_when(col_1_age <= 17 ~ "Pediatric pts",
#                                col_1_age >= 18 ~ "Adults"
#                              )) %>%
#  group_by(pop_subset) %>%
  summarise(AGE = round(mean(col_1_age, na.rm=TRUE), 0),
            asd = round(sd(col_1_age, na.rm = TRUE), 1),
            Males = round(mean(col_1_age[col_2_gender == "M"], na.rm = TRUE), 0), 
            Msd = round(sd(col_1_age[col_2_gender == "M"], na.rm = TRUE), 1),
            Females = round(mean(col_1_age[col_2_gender == "F"], na.rm = TRUE), 0),
            Fsd = round(sd(col_1_age[col_2_gender =="F"], na.rm = TRUE), 1),
            Male = sum(col_2_gender == "M"),
            Female = sum(col_2_gender == "F"),
            Total = sum(col_2_gender == "F" | col_2_gender == "M"),
            Hispanic = sum(col_3_race == "Hispanic"),
            White = sum(col_3_race == "White"),
            Black = sum(col_3_race == "Black"),
            Asian = sum(col_3_race == "Asian"),
            "Native American" = sum(col_3_race == "Native American"),
            Other = sum(Race = sum(col_3_race == "Other")),
            "Not Available" = sum(col_3_race == "N/A"),
            Spanish = sum(col_4_lang == "Spanish"),
            English = sum(col_4_lang == "English"),
            Both = sum(col_4_lang == "Spanish or English"),
            " Other" = sum(col_4_lang == "sign language" | col_4_lang == "Other"),
            " Not Available" = sum(col_4_lang == "n/a" | col_4_lang == "N/A")
            
        )  %>%
  unite(col = "All (m/sd)", AGE, asd, sep= " / ") %>%
  unite(col = "Males (m/sd)", Males, Msd, sep= " / ") %>%
  unite(col = "Females (m/sd)", Females, Fsd, sep= " / ") %>%
  gather (., key = "Variable", value = "All Patients") %>%
#  spread(., key = pop_subset, value = value) %>%
  slice(match(desired_col_order, Variable)) #%>%


#print your table if you want to preview 
overall_summary
## # A tibble: 18 x 2
##    Variable         `All Patients`
##    <chr>            <chr>         
##  1 Total            760           
##  2 Male             154           
##  3 Female           606           
##  4 All (m/sd)       38 / 16.9     
##  5 Males (m/sd)     31 / 20.3     
##  6 Females (m/sd)   40 / 15.3     
##  7 Hispanic         690           
##  8 White            27            
##  9 Black            1             
## 10 Asian            1             
## 11 Native American  1             
## 12 Other            2             
## 13 Not Available    38            
## 14 Spanish          507           
## 15 English          101           
## 16 Both             36            
## 17 " Other"         8             
## 18 " Not Available" 107

Combine data frames

Here you will combine the two data frames you have made (the diff_pop_subsets data frame and the overall data frame)

tab1 <- merge(diff_pop_subsets, overall_summary, by = "Variable", sort = FALSE)

#this step reorganizes you columns
tab1Reorder <- tab1[, c(1,4,2, 3)]
write.csv(tab1Reorder, "tab1.csv")

#When you knit to PDF, this step makes a publication like table one. It looks flashy 
#and can be used for internal use. However, many journals prefer an editable version. As result, 
#you will skip this step, proceed below, and knit to word
 tab1Reorder %>%
  kable(format= "latex", caption = "Clinic Demographics", booktabs = T) %>%
  kable_styling(latex_options = "striped", full_width = F) %>%
  column_spec(1, width = "5cm") %>%
  column_spec(2, width = "2cm") %>%
  column_spec(3, width = "2cm") %>%
  group_rows("Counts", 1, 3) %>%
  group_rows("Ages", 4, 6) %>%
  group_rows("Race/Ethnicity", 7, 13) %>%
  group_rows("Language", 14, 18) %>%
  add_footnote(c("Adults is 18+", "Peds is 17 & under",
                 "(m/sd) is mean and standard deviation"), 
               notation = "symbol") %>%
  landscape()
#this step creates an editable table. Knit this to word.
tab1Reorder %>% kable()
Variable All Patients Adults Pediatric pts
Total 760 638 122
Male 154 100 54
Female 606 538 68
All (m/sd) 38 / 16.9 44 / 12.5 10 / 4.8
Males (m/sd) 31 / 20.3 43 / 15.2 9 / 5.2
Females (m/sd) 40 / 15.3 44 / 11.9 11 / 4.2
Hispanic 690 582 108
White 27 19 8
Black 1 1 0
Asian 1 1 0
Native American 1 1 0
Other 2 2 0
Not Available 38 32 6
Spanish 507 445 62
English 101 76 25
Both 36 19 17
Other 8 8 0
Not Available 107 89 18

Appendix

This step is to make organize the data in the columns as mean (SD) rather than mean / SD. This is not the most efficient way to achieve this goal. Please feel free to improve code

reordertab1 <- tab1[, c(1, 4, 2, 3)]

#Now i need to add parenthesis around the SD or %. For this is must seperate, add, parenth, and then re-unite
reorg_reorder <- reordertab1 %>%
  separate(col = "All Patients", into= c("all", "all_sd"), sep = "/") %>%
  separate(col = "Adults", into= c("adults", "adults_sd"), sep = "/") %>%
  separate(col = "Pediatric pts", into= c("ped", "ped_sd"), sep = "/") 

reorg_reorder[c(4:6),3] <- paste0("(", format(unlist(reorg_reorder[c(4:6),3])),")")
reorg_reorder[c(4:6),5] <- paste0("(", format(unlist(reorg_reorder[c(4:6),5])),")")
reorg_reorder[c(4:6),7] <- paste0("(", format(unlist(reorg_reorder[c(4:6),7])),")")

reorg_reorder2 <- reorg_reorder %>%
  replace_na(list(all_sd = "", adults_sd = "", ped_sd = "")) %>%
  unite(col = "All Patients", all, all_sd , sep = "") %>%
  unite(col = "Adults", adults, adults_sd , sep = "") %>%
  unite(col = "Pediatric pts", ped, ped_sd , sep = "")