1 Overall description

The following code/output/plots were served as examples of specific/general R functions/operations that I found helpful in analyzing data with R.

2 Pseudo Data

A pseudo dataset was created to illustrate functions/codes. The data was created using random sampling completely, and the topic of the data is mimicking a class of college students.

rm(list=ls())
library(tidyverse)
library(lubridate)
library(janeaustenr)

set.seed(20120116)

PseudoData <- data.frame(StudentID = c(1:100)) %>%
  mutate(
    
    #### Demographics
    DoB = sample(seq(as.Date('1990/01/01'), as.Date('1995/01/01'), by="day"), 100),
    Age = floor(decimal_date(as.Date('2020/01/01')) - decimal_date(DoB)),
    Gender = sample(c("Female","Male"),size=100, replace=T),
    Race = sample(c("White","Black","Asian","Multiple","Unknown",NA), size=100, replace=T),
    Ethnicity = sample(c("Non-Hispanic","Hispanic"),size=100,replace=T)) %>%
  
  #### Numerical grade from year 1 to year 4
  mutate(
    Grade_Math_Y1 = round(rnorm(100,mean=70,sd=10)),
    Grade_Stats_Y1 = round(rnorm(100,mean=75,sd=10)),
    Grade_Engl_Y1 = round(rnorm(100,mean=75,sd=10)),
    Grade_Art_Y1 = round(rnorm(100,mean=75,sd=10)),
    Grade_Sports_Y1 = round(rnorm(100,mean=75,sd=10)),
    
    Grade_Math_Y2 = Grade_Math_Y1+round(rnorm(100,mean=2,sd=5)),
    Grade_Stats_Y2 = Grade_Stats_Y1+round(rnorm(100,mean=2,sd=5)),
    Grade_Engl_Y2 = Grade_Engl_Y1+round(rnorm(100,mean=2,sd=5)),
    Grade_Art_Y2 = Grade_Art_Y1+round(rnorm(100,mean=2,sd=5)),
    Grade_Sports_Y2 = Grade_Sports_Y1+round(rnorm(100,mean=2,sd=5)),
    
    Grade_Math_Y3 = Grade_Math_Y2+round(rnorm(100,mean=2,sd=5)),
    Grade_Stats_Y3 = Grade_Stats_Y2+round(rnorm(100,mean=2,sd=5)),
    Grade_Engl_Y3 = Grade_Engl_Y2+round(rnorm(100,mean=2,sd=5)),
    Grade_Art_Y3 = Grade_Art_Y2+round(rnorm(100,mean=2,sd=5)),
    Grade_Sports_Y3 = Grade_Sports_Y2+round(rnorm(100,mean=2,sd=5)),
    
    Grade_Math_Y4 = Grade_Math_Y3+round(rnorm(100,mean=2,sd=5)),
    Grade_Stats_Y4 = Grade_Stats_Y3+round(rnorm(100,mean=2,sd=5)),
    Grade_Engl_Y4 = Grade_Engl_Y3+round(rnorm(100,mean=2,sd=5)),
    Grade_Art_Y4 = Grade_Art_Y3+round(rnorm(100,mean=2,sd=5)),
    Grade_Sports_Y4 = Grade_Sports_Y3+round(rnorm(100,mean=2,sd=5))) %>%
  
  mutate_at(vars(starts_with("Grade")), ~ ifelse(. >100, 100, .)) %>%
  
  #### Create some random NA's in the data
  mutate_at(vars(starts_with("Grade")), ~ ifelse(. == sample(seq_len(100), 10),NA,.)) %>%
  
  #### Create letter-grades based on number-grades
  mutate_at(vars(starts_with('Grade')), .funs = list(Letter = ~ ifelse(         
    . >= 90, "A",ifelse( . >= 80, "B", ifelse(. >= 70, "C", ifelse(. >= 60, "D","F")))))) %>%
  
  #### Add other aspects of the data
  mutate(Major = sample(c("Communications", "Business", "Unknown","Statistics", "Public Health",NA,
                          "Nursing","Sustainability","Biology", "English","History", "Computer Science"),
                        size=100, replace=T),
         Scholarship = sample(c("No","Yes"), size=100, prob=c(0.6,0.4), replace=T),
         Schol_Amount = ifelse(Scholarship %in% "Yes", round(rnorm(100,mean=80, sd = 20))*100, NA),
         ProjectedEarning = sample(seq(35000, 110000, by=500),100)) %>%
  
  #### Add employment for potential survival analysis 
  mutate(JobHuntDate = sample(seq(as.Date('2020/01/01'), as.Date('2020/04/01'), by="day"), 100, replace = T),
         Employed= sample(c("No","Yes"), size=100, prob=c(0.7,0.3), replace=T),
         EmployDate = as.Date(ifelse(Employed %in% "Yes",
                                     as.Date( JobHuntDate + sample(seq(30,180,1), 100, replace=T),
                                              origin="1970-01-01"), 
                                     as.Date('2021/01/01')), 
                              origin="1970-01-01"),
         Day2Employ = as.numeric(difftime(EmployDate, JobHuntDate, unit="days"))) %>% 
  
  #### For text analysis, use random data from janeaustenr package. 
  mutate(Fav_Jane = sample(as.character(austen_books()$text[!austen_books()$text %in% ""]),
                           100, replace = T)) %>%               
  
  #### Add US state longitude and latitude data for map plot purpose
  ## Randomly select 100 location in US
  bind_cols(sample_n(map_data("state"),100)) %>%                                
  select(-c(group, order, subregion))        %>% 
  dplyr::rename(Longitude = long, Latitude = lat, State = region) %>% 
  
  #### Transfer data.frame as a subclass that behave with different default behavior
  tbl_df()

3 Workspace, Packages and Setting (Work Pack)

3.1 Work Pack remove current workspace

# rm(list=ls())

3.2 Work Pack detach all packages

# invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE))

3.3 Work Pack install specific version of package

Often need to install rtools.exe and devtools package.

# require(devtools)
# install_version("ggplot2", version = "0.9.1", repos = "http://cran.us.r-project.org")

3.4 Work Pack install and load packages under a specific library path

#### install packages under a specific library path
# project_lib <- "D:/YOUR OWN PATH/"
# install.packages("countrycode", lib = project_lib)

#### load packages from a specific library path 
# project_lib <- "D:/YOUR OWN PATH/"
# library(countrycode, lib.loc = project_lib)

3.5 Work Pack report all packages installed

# ip = as.data.frame(installed.packages(lib.loc = project_lib)[,c(1,3:4)])
# write.csv(ip, file = paste0("list of packages ", Sys.Date(), ".csv"), row.names = F)

3.6 Work Pack install and load if not exist

# if(!require(somepackage)){
#     install.packages("somepackage")
#     library(somepackage)
# }

3.7 Work Pack install binary version of package

# install.packages("packagename", type = "binary")

4 Reporting and Some Tricks (Repo Tric)

4.1 Repo Trick run a quick summarizing statistics

#### Method 1. use glimpse from dplyr 
glimpse(PseudoData)

## Rows: 100
## Columns: 58
## $ StudentID              <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ DoB                    <date> 1993-01-31, 1991-01-04, 1993-02-28, 1990-11-28…
## $ Age                    <dbl> 26, 28, 26, 29, 29, 25, 28, 27, 28, 27, 25, 28,…
## $ Gender                 <chr> "Female", "Female", "Female", "Male", "Male", "…
## $ Race                   <chr> "Asian", "Unknown", NA, "Unknown", "Multiple", …
## $ Ethnicity              <chr> "Non-Hispanic", "Hispanic", "Hispanic", "Non-Hi…
## $ Grade_Math_Y1          <dbl> 56, 65, 46, 69, 84, 81, 62, 75, 56, 90, 66, 93,…
## $ Grade_Stats_Y1         <dbl> 78, 61, 46, 79, 80, 80, 70, 83, 75, 80, 63, 82,…
## $ Grade_Engl_Y1          <dbl> 80, 92, 83, 74, 87, 56, 69, 60, 82, 77, 82, 77,…
## $ Grade_Art_Y1           <dbl> 80, 87, 70, 62, NA, 63, 81, 63, 69, 69, 67, NA,…
## $ Grade_Sports_Y1        <dbl> 75, 79, 90, 64, 82, 72, 81, 71, 90, 81, 79, 62,…
## $ Grade_Math_Y2          <dbl> 60, 63, 45, 66, 90, 85, 64, 76, 61, 91, 61, 97,…
## $ Grade_Stats_Y2         <dbl> 76, 62, 45, 79, 79, 88, 67, 80, 72, 76, 69, 89,…
## $ Grade_Engl_Y2          <dbl> 85, 95, 88, 80, 95, NA, 75, NA, 80, 76, 81, 87,…
## $ Grade_Art_Y2           <dbl> 88, 86, 77, 71, 58, 69, 87, 76, 64, 56, 67, 75,…
## $ Grade_Sports_Y2        <dbl> 77, 81, 100, 57, 87, 72, 85, 74, 100, 85, 81, 6…
## $ Grade_Math_Y3          <dbl> 68, 67, 56, 76, 96, 87, 67, 75, 59, 91, 64, 99,…
## $ Grade_Stats_Y3         <dbl> 77, 55, 47, 77, 91, 97, 64, 79, 79, 74, 80, 92,…
## $ Grade_Engl_Y3          <dbl> 87, 96, 82, 79, 93, 79, 71, 57, 80, 80, 86, 87,…
## $ Grade_Art_Y3           <dbl> 87, 84, 82, 64, 52, 69, 86, 74, 64, 54, 69, 73,…
## $ Grade_Sports_Y3        <dbl> 77, 91, 98, 62, 85, 75, 86, 69, 100, 94, 90, 61…
## $ Grade_Math_Y4          <dbl> 65, 73, 61, 78, 100, 91, 68, 75, 61, 100, 74, 1…
## $ Grade_Stats_Y4         <dbl> 78, 53, 39, 77, 100, 95, 71, 80, 77, 75, 72, 94…
## $ Grade_Engl_Y4          <dbl> 92, 98, 83, 77, 91, 85, 69, 55, 91, 87, 91, 78,…
## $ Grade_Art_Y4           <dbl> 100, 77, 93, 67, 65, 69, 93, 71, 64, 54, 68, 74…
## $ Grade_Sports_Y4        <dbl> 80, 85, 88, 61, 89, 67, 83, 56, 100, 98, 88, 62…
## $ Grade_Math_Y1_Letter   <chr> "F", "D", "F", "D", "B", "B", "D", "C", "F", "A…
## $ Grade_Stats_Y1_Letter  <chr> "C", "D", "F", "C", "B", "B", "C", "B", "C", "B…
## $ Grade_Engl_Y1_Letter   <chr> "B", "A", "B", "C", "B", "F", "D", "D", "B", "C…
## $ Grade_Art_Y1_Letter    <chr> "B", "B", "C", "D", NA, "D", "B", "D", "D", "D"…
## $ Grade_Sports_Y1_Letter <chr> "C", "C", "A", "D", "B", "C", "B", "C", "A", "B…
## $ Grade_Math_Y2_Letter   <chr> "D", "D", "F", "D", "A", "B", "D", "C", "D", "A…
## $ Grade_Stats_Y2_Letter  <chr> "C", "D", "F", "C", "C", "B", "D", "B", "C", "C…
## $ Grade_Engl_Y2_Letter   <chr> "B", "A", "B", "B", "A", NA, "C", NA, "B", "C",…
## $ Grade_Art_Y2_Letter    <chr> "B", "B", "C", "C", "F", "D", "B", "C", "D", "F…
## $ Grade_Sports_Y2_Letter <chr> "C", "B", "A", "F", "B", "C", "B", "C", "A", "B…
## $ Grade_Math_Y3_Letter   <chr> "D", "D", "F", "C", "A", "B", "D", "C", "F", "A…
## $ Grade_Stats_Y3_Letter  <chr> "C", "F", "F", "C", "A", "A", "D", "C", "C", "C…
## $ Grade_Engl_Y3_Letter   <chr> "B", "A", "B", "C", "A", "C", "C", "F", "B", "B…
## $ Grade_Art_Y3_Letter    <chr> "B", "B", "B", "D", "F", "D", "B", "C", "D", "F…
## $ Grade_Sports_Y3_Letter <chr> "C", "A", "A", "D", "B", "C", "B", "D", "A", "A…
## $ Grade_Math_Y4_Letter   <chr> "D", "C", "D", "C", "A", "A", "D", "C", "D", "A…
## $ Grade_Stats_Y4_Letter  <chr> "C", "F", "F", "C", "A", "A", "C", "B", "C", "C…
## $ Grade_Engl_Y4_Letter   <chr> "A", "A", "B", "C", "A", "B", "D", "F", "A", "B…
## $ Grade_Art_Y4_Letter    <chr> "A", "C", "A", "D", "D", "D", "A", "C", "D", "F…
## $ Grade_Sports_Y4_Letter <chr> "B", "B", "B", "D", "B", "D", "B", "F", "A", "A…
## $ Major                  <chr> "Nursing", "Unknown", "Unknown", "Public Health…
## $ Scholarship            <chr> "No", "No", "No", "No", "No", "No", "No", "No",…
## $ Schol_Amount           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ProjectedEarning       <dbl> 46000, 91500, 48500, 98500, 102000, 108500, 735…
## $ JobHuntDate            <date> 2020-01-10, 2020-01-10, 2020-01-17, 2020-03-12…
## $ Employed               <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes…
## $ EmployDate             <date> 2021-01-01, 2021-01-01, 2021-01-01, 2021-01-01…
## $ Day2Employ             <dbl> 357, 357, 350, 295, 63, 283, 355, 43, 349, 287,…
## $ Fav_Jane               <chr> "answer a note, instead of waiting for me, you …
## $ Longitude              <dbl> -75.79086, -71.41919, -89.39861, -114.72906, -8…
## $ Latitude               <dbl> 44.45007, 41.60247, 30.29801, 33.40344, 38.4397…
## $ State                  <chr> "new york", "rhode island", "mississippi", "ari…

#### Method 2. use describe() from Hmisc
# This method outputs a very detailed report, thus result not shown
# Hmisc::describe(PseudoData)


## see also https://dabblingwithdata.wordpress.com/2018/01/02/my-favourite-r-package-for-summarising-data/
## for other packages/functions of summarizing data

#### Method 3. use skim from skimr
library(skimr)
skim(PseudoData)

Data summary
Name	PseudoData
Number of rows	100
Number of columns	58
_______________________
Column type frequency:
character	28
Date	3
numeric	27
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
Gender	0	1.00	4	6	2
Race	22	0.78	5	8	5
Ethnicity	0	1.00	8	12	2
Grade_Math_Y1_Letter	2	0.98	1	1	5
Grade_Stats_Y1_Letter	1	0.99	1	1	5
Grade_Engl_Y1_Letter	0	1.00	1	1	5
Grade_Art_Y1_Letter	4	0.96	1	1	5
Grade_Sports_Y1_Letter	1	0.99	1	1	5
Grade_Math_Y2_Letter	1	0.99	1	1	5
Grade_Stats_Y2_Letter	0	1.00	1	1	5
Grade_Engl_Y2_Letter	3	0.97	1	1	5
Grade_Art_Y2_Letter	0	1.00	1	1	5
Grade_Sports_Y2_Letter	0	1.00	1	1	5
Grade_Math_Y3_Letter	2	0.98	1	1	5
Grade_Stats_Y3_Letter	0	1.00	1	1	5
Grade_Engl_Y3_Letter	0	1.00	1	1	5
Grade_Art_Y3_Letter	2	0.98	1	1	5
Grade_Sports_Y3_Letter	1	0.99	1	1	5
Grade_Math_Y4_Letter	0	1.00	1	1	5
Grade_Stats_Y4_Letter	0	1.00	1	1	5
Grade_Engl_Y4_Letter	2	0.98	1	1	5
Grade_Art_Y4_Letter	0	1.00	1	1	5
Grade_Sports_Y4_Letter	1	0.99	1	1	5
Major	13	0.87	7	16	11
Scholarship	0	1.00	2	3	2
Employed	0	1.00	2	3	2
Fav_Jane	0	1.00	10	72	100
State	0	1.00	4	14	35

Variable type: Date

skim_variable	complete_rate	min	max	median	n_unique
DoB	1	1990-02-27	1994-12-26	1992-11-05	100
JobHuntDate	1	2020-01-01	2020-03-31	2020-02-14	57
EmployDate	1	2020-03-03	2021-01-01	2021-01-01	23

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
StudentID	0	1.00	50.50	29.01	1.00	25.75	50.50	75.25	100.00	▇▇▇▇▇
Age	0	1.00	26.79	1.29	25.00	26.00	27.00	28.00	29.00	▆▇▆▆▃
Grade_Math_Y1	2	0.98	70.06	10.18	42.00	62.25	70.50	76.00	93.00	▁▅▇▆▃
Grade_Stats_Y1	1	0.99	75.22	9.05	46.00	70.00	75.00	82.00	97.00	▁▃▇▇▂
Grade_Engl_Y1	0	1.00	75.77	10.98	41.00	68.00	74.50	83.00	100.00	▁▂▇▇▂
Grade_Art_Y1	4	0.96	73.84	8.26	56.00	67.75	73.50	78.00	92.00	▂▆▇▃▃
Grade_Sports_Y1	1	0.99	74.92	9.62	52.00	68.50	76.00	81.50	96.00	▂▅▇▇▂
Grade_Math_Y2	1	0.99	72.60	11.13	44.00	64.00	73.00	82.00	97.00	▁▅▇▆▂
Grade_Stats_Y2	0	1.00	77.16	9.94	45.00	71.00	77.00	83.00	100.00	▁▂▇▇▂
Grade_Engl_Y2	3	0.97	77.84	11.61	42.00	70.00	78.00	86.00	100.00	▁▃▇▇▃
Grade_Art_Y2	0	1.00	75.77	9.49	54.00	69.00	75.50	82.25	98.00	▂▅▇▆▁
Grade_Sports_Y2	0	1.00	77.14	10.66	46.00	71.75	77.50	84.00	100.00	▁▃▇▇▂
Grade_Math_Y3	2	0.98	75.01	11.98	46.00	67.00	76.00	83.75	99.00	▂▆▇▇▃
Grade_Stats_Y3	0	1.00	79.10	11.34	47.00	72.75	79.00	88.00	100.00	▁▃▇▇▅
Grade_Engl_Y3	0	1.00	79.83	11.58	43.00	72.75	81.00	87.25	100.00	▁▂▅▇▅
Grade_Art_Y3	2	0.98	78.11	10.71	49.00	71.00	79.00	85.00	100.00	▁▃▇▇▃
Grade_Sports_Y3	1	0.99	79.43	11.48	50.00	71.50	81.00	87.00	100.00	▁▅▆▇▅
Grade_Math_Y4	0	1.00	76.76	13.89	41.00	67.75	77.50	87.25	100.00	▂▃▇▇▅
Grade_Stats_Y4	0	1.00	81.70	12.82	39.00	72.00	82.50	92.00	100.00	▁▂▅▇▇
Grade_Engl_Y4	2	0.98	81.51	12.39	37.00	72.25	82.50	91.00	100.00	▁▁▅▇▇
Grade_Art_Y4	0	1.00	79.70	10.89	49.00	73.00	79.00	87.00	100.00	▁▃▇▇▅
Grade_Sports_Y4	1	0.99	81.49	12.35	52.00	73.50	82.00	90.00	100.00	▂▃▅▇▆
Schol_Amount	77	0.23	7869.57	2557.95	3300.00	6150.00	8100.00	9350.00	14500.00	▃▇▇▃▁
ProjectedEarning	0	1.00	72320.00	23408.70	35500.00	51875.00	71750.00	92750.00	110000.00	▇▇▅▇▇
Day2Employ	0	1.00	264.03	107.50	31.00	175.75	304.00	345.25	366.00	▂▁▁▃▇
Longitude	0	1.00	-90.48	14.03	-124.06	-96.81	-88.17	-80.25	-70.03	▂▁▃▇▆
Latitude	0	1.00	37.31	5.64	26.04	33.54	36.47	41.61	48.29	▃▇▇▆▅

#### Method 4. Ssummary stats for continuous as table
library(rstatix)
PseudoData %>% 
  group_by(Major) %>% 
  select(Grade_Art_Y1, Grade_Math_Y1) %>% 
  get_summary_stats()

## # A tibble: 24 × 14
##    Major    varia…¹     n   min   max median    q1    q3   iqr   mad  mean    sd
##    <chr>    <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 Biology  Grade_…    10    64    92   74    69.5  81.8  12.2  8.90  75.7  9.12
##  2 Biology  Grade_…    10    62    90   75.5  67    80.8  13.8 10.4   74.8  9.45
##  3 Business Grade_…     7    60    85   77    75    80.5   5.5  4.45  76.1  8.11
##  4 Business Grade_…     8    69    93   72.5  71.5  77     5.5  2.96  76.1  8.61
##  5 Communi… Grade_…     7    67    80   70    67.5  73.5   6    4.45  71.3  4.75
##  6 Communi… Grade_…     8    54    74   68    60.5  72.5  12    8.90  66    7.86
##  7 Compute… Grade_…     9    68    89   77    77    78     1    1.48  76.8  6.04
##  8 Compute… Grade_…     9    42    84   74    62    81    19   11.9   70.2 13.4 
##  9 English  Grade_…     7    63    82   73    69.5  76.5   7    5.93  72.9  6.36
## 10 English  Grade_…     8    57    82   69    58.8  72    13.2  9.64  67.2  8.75
## # … with 14 more rows, 2 more variables: se <dbl>, ci <dbl>, and abbreviated
## #   variable name ¹variable
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

4.2 Repo Tric read and run code that was in string format

#### Option 1. use eval with parse. However, this method does not work with dplyr pipes
x <- "Math"
eval(parse(text=paste0("PseudoData$Grade_", x, "_Y1")))

##   [1] 56 65 46 69 84 81 62 75 56 90 66 93 62 42 72 71 69 82 76 64 72 59 85 54 55
##  [26] NA 84 84 56 82 82 85 56 71 82 68 62 84 53 NA 73 63 83 62 74 53 76 57 62 74
##  [51] 60 66 72 58 75 68 71 61 69 68 75 84 56 61 65 68 86 70 64 71 72 85 62 70 74
##  [76] 50 68 70 77 72 80 74 67 70 80 75 70 74 84 79 81 64 72 71 74 66 68 61 75 81

#### Option 2. use !!sym, this method works well with dplyr pipes
Y <- paste0("Grade_",x,"_Y1")
PseudoData %>% mutate(new_col = !!sym(Y)) %>% pull(new_col)

##   [1] 56 65 46 69 84 81 62 75 56 90 66 93 62 42 72 71 69 82 76 64 72 59 85 54 55
##  [26] NA 84 84 56 82 82 85 56 71 82 68 62 84 53 NA 73 63 83 62 74 53 76 57 62 74
##  [51] 60 66 72 58 75 68 71 61 69 68 75 84 56 61 65 68 86 70 64 71 72 85 62 70 74
##  [76] 50 68 70 77 72 80 74 67 70 80 75 70 74 84 79 81 64 72 71 74 66 68 61 75 81

#### Option 3. use {{ x }}, this method also works well with dplyr pipes
## check https://dplyr.tidyverse.org/articles/programming.html for more details
example_func <- function(data, col, ...) {
  data %>% 
    count({{ col }}) %>% 
    ggplot(aes(x = n, y = {{ col }}, ...)) + # ... passes the additional terms as aesthetics
    geom_col()
}

PseudoData %>% 
  example_func(Major, fill = Major)

#### Other situation, need to copy a console output that can be directly ran
PseudoData %>% select(Grade_Math_Y1) %>% head(10) %>% dput()

## structure(list(Grade_Math_Y1 = c(56, 65, 46, 69, 84, 81, 62, 
## 75, 56, 90)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", 
## "data.frame"))

PseudoData %>% pull(Grade_Math_Y1) %>% head(10) %>% dput()

## c(56, 65, 46, 69, 84, 81, 62, 75, 56, 90)

4.3 Repo Tric report regular number of scientific notation

## Some numeric value will be automatically presented in scientific notation
1/987654

## [1] 1.0125e-06

## If you want to prevent that from happening, try set the scipen in the option
options(scipen = 999)
1/987654

## [1] 0.0000010125

## Also, if you want to reset the setting, you can do (0 is the default for scipen)
options(scipen = 0)
1/987654

## [1] 1.0125e-06

4.4 Repo Tric report n and percentage

#### Situation 1. single column n and percentage - using prop.table
## note, if need to include NA in calculating percentage, add in table(, useNA = "always)
tb <- table(PseudoData$Race)
paste0(tb, " (",format(round(prop.table(tb) * 100, 2), nsmall = 2), ")")

## [1] "19 (24.36)" "20 (25.64)" "15 (19.23)" "11 (14.10)" "13 (16.67)"

#### Situation 2. single column n and percentage - using dplyr version
## note, this version, NA was included by default if exist
np <- 
  PseudoData %>% 
  count(Race, .drop = F) %>%   # .drop = F forbids the function to drop empty categories
  mutate(p = format(round(n/sum(n)*100, 2), nsmall = 2)) %>% 
  mutate(n_p = paste0(n, " (", p, ")"))
np

## # A tibble: 6 × 4
##   Race         n p     n_p       
##   <chr>    <int> <chr> <chr>     
## 1 Asian       19 19.00 19 (19.00)
## 2 Black       20 20.00 20 (20.00)
## 3 Multiple    15 15.00 15 (15.00)
## 4 Unknown     11 11.00 11 (11.00)
## 5 White       13 13.00 13 (13.00)
## 6 <NA>        22 22.00 22 (22.00)

#### Situation 3. cross-tab - using prop.table 
tb2 <-  table(PseudoData$Race, PseudoData$Ethnicity)
## row percentage
paste0(tb2," (",format(round(prop.table(tb2, 1) * 100, 2), nsmall = 2), ")")

##  [1] "9 (47.37)"  "9 (45.00)"  "7 (46.67)"  "7 (63.64)"  "7 (53.85)" 
##  [6] "10 (52.63)" "11 (55.00)" "8 (53.33)"  "4 (36.36)"  "6 (46.15)"

## column percentage 
paste0(tb2," (",format(round(prop.table(tb2, 2) * 100, 2), nsmall = 2), ")")

##  [1] "9 (23.08)"  "9 (23.08)"  "7 (17.95)"  "7 (17.95)"  "7 (17.95)" 
##  [6] "10 (25.64)" "11 (28.21)" "8 (20.51)"  "4 (10.26)"  "6 (15.38)"

#### Of course, a dplyr version also exist but, as I know, is more complex than what was described using prop.table

4.5 Repo Tric report p-value

#### Method 1. report p-value from regression/test summary
## using wilcoxon test as an example 
wilc_example <- wilcox.test(PseudoData$Grade_Math_Y1~PseudoData$Gender)

p_val_fun <- function(test){
 p_raw <- test$p.value 
 p_format <- ifelse(p_raw < 0.001, paste0("p<0.001"), paste0("p=", round(p_raw, 3)))
 p_format
}
p_val_fun(wilc_example)

## [1] "p=0.162"

#### Method 2. use format.pval from base
format.pval(wilc_example$p.value, 3)

## [1] "0.162"

4.6 Repo Tric report a formatted table (kable)

PseudoData %>% select(1:5) %>% slice(1:5) %>% knitr::kable()

StudentID	DoB	Age	Gender	Race
1	1993-01-31	26	Female	Asian
2	1991-01-04	28	Female	Unknown
3	1993-02-28	26	Female	NA
4	1990-11-28	29	Male	Unknown
5	1990-07-05	29	Male	Multiple

4.7 Repo Tric report a formatted frequency/contingency table (tabyl from janitor)

#### Situation 1. One-way frequency 
library(janitor)
PseudoData %>% 
  tabyl(Grade_Math_Y1_Letter) %>% 
  adorn_totals("row") %>% 
  adorn_pct_formatting(digits = 2)

##  Grade_Math_Y1_Letter   n percent valid_percent
##                     A   2   2.00%         2.04%
##                     B  20  20.00%        20.41%
##                     C  32  32.00%        32.65%
##                     D  29  29.00%        29.59%
##                     F  15  15.00%        15.31%
##                  <NA>   2   2.00%             -
##                 Total 100 100.00%       100.00%

#### Situation 2. Two-way frequency
## just the frequency
PseudoData %>% 
  tabyl(Employed, Grade_Math_Y1_Letter) %>% 
  adorn_totals(c("row", "col"))

##  Employed A  B  C  D  F NA_ Total
##        No 2 11 22 27 11   1    74
##       Yes 0  9 10  2  4   1    26
##     Total 2 20 32 29 15   2   100

## percentage (frequency)
PseudoData %>% 
  tabyl(Employed, Grade_Math_Y1_Letter) %>% 
  adorn_totals("col") %>% 
  adorn_percentages("all") %>% 
  adorn_pct_formatting(digits = 2) %>% 
  adorn_ns() %>% 
  reactable::reactable()

4.8 Repo Tric report a characteristics summary table (CreateTableOne from tableone)

library(tableone)
col_list <- c("Age", "Gender", "Race", "Ethnicity", "Employed", "State")
fac_list <- col_list[-1]

report_table <- print(
  CreateTableOne(
    vars = col_list,
    strata = "Employed",
    data = PseudoData,
    factorVars = fac_list,
    includeNA = T,
    test = F),
  showAllLevels = T,
  catDigits = 1,
  contDigits = 1,
  pDigits = 4
)

##                  Stratified by Employed
##                   level          No            Yes          
##   n                                74            26         
##   Age (mean (SD))                26.6 (1.2)    27.3 (1.3)   
##   Gender (%)      Female           48 ( 64.9)    17 ( 65.4) 
##                   Male             26 ( 35.1)     9 ( 34.6) 
##   Race (%)        Asian            12 ( 16.2)     7 ( 26.9) 
##                   Black            15 ( 20.3)     5 ( 19.2) 
##                   Multiple         10 ( 13.5)     5 ( 19.2) 
##                   Unknown          10 ( 13.5)     1 (  3.8) 
##                   White            11 ( 14.9)     2 (  7.7) 
##                   <NA>             16 ( 21.6)     6 ( 23.1) 
##   Ethnicity (%)   Hispanic         42 ( 56.8)     9 ( 34.6) 
##                   Non-Hispanic     32 ( 43.2)    17 ( 65.4) 
##   Employed (%)    No               74 (100.0)     0 (  0.0) 
##                   Yes               0 (  0.0)    26 (100.0) 
##   State (%)       arizona           3 (  4.1)     1 (  3.8) 
##                   arkansas          2 (  2.7)     1 (  3.8) 
##                   california        3 (  4.1)     1 (  3.8) 
##                   florida           5 (  6.8)     0 (  0.0) 
##                   georgia           0 (  0.0)     1 (  3.8) 
##                   illinois          0 (  0.0)     2 (  7.7) 
##                   indiana           2 (  2.7)     0 (  0.0) 
##                   iowa              0 (  0.0)     1 (  3.8) 
##                   kansas            0 (  0.0)     1 (  3.8) 
##                   kentucky          1 (  1.4)     0 (  0.0) 
##                   louisiana         2 (  2.7)     2 (  7.7) 
##                   maine             1 (  1.4)     0 (  0.0) 
##                   maryland          2 (  2.7)     1 (  3.8) 
##                   massachusetts     1 (  1.4)     1 (  3.8) 
##                   michigan          4 (  5.4)     1 (  3.8) 
##                   minnesota         3 (  4.1)     0 (  0.0) 
##                   mississippi       2 (  2.7)     1 (  3.8) 
##                   missouri          1 (  1.4)     0 (  0.0) 
##                   nebraska          1 (  1.4)     0 (  0.0) 
##                   nevada            1 (  1.4)     0 (  0.0) 
##                   new hampshire     2 (  2.7)     0 (  0.0) 
##                   new mexico        2 (  2.7)     0 (  0.0) 
##                   new york          4 (  5.4)     1 (  3.8) 
##                   north carolina    6 (  8.1)     2 (  7.7) 
##                   ohio              3 (  4.1)     0 (  0.0) 
##                   oregon            1 (  1.4)     0 (  0.0) 
##                   rhode island      1 (  1.4)     0 (  0.0) 
##                   south carolina    3 (  4.1)     1 (  3.8) 
##                   tennessee         1 (  1.4)     0 (  0.0) 
##                   texas             9 ( 12.2)     3 ( 11.5) 
##                   virginia          0 (  0.0)     1 (  3.8) 
##                   washington        1 (  1.4)     2 (  7.7) 
##                   west virginia     4 (  5.4)     1 (  3.8) 
##                   wisconsin         1 (  1.4)     1 (  3.8) 
##                   wyoming           2 (  2.7)     0 (  0.0)

4.9 Repo Tric report a characteristics summary table (tbl_summary from ggsummary)

library(gtsummary)

## The tbl_summary outputs a nicely formatted table that can be directly reported in markdowns
PseudoData %>%
  select(Employed, Grade_Math_Y1, Gender, Age, Race) %>% 
  tbl_summary(by = Employed) %>% # Use missing = "no" to exclude NA from the table
  # add_p(pvalue_fun = ~style_pvalue(.x, digits = 2)) %>%
  add_overall() %>%
  add_n() %>%
  modify_header(label ~ "**Variable**") %>%
  modify_footnote(
    all_stat_cols() ~ "Median (IQR) or Frequency (%)"
  ) %>%
  bold_labels()

Variable	N	Overall, N = 100¹	No, N = 74¹	Yes, N = 26¹
Grade_Math_Y1	98	70 (62, 76)	69 (62, 74)	75 (70, 82)
Unknown		2	1	1
Gender	100
Female		65 (65%)	48 (65%)	17 (65%)
Male		35 (35%)	26 (35%)	9 (35%)
Age	100
25		19 (19%)	16 (22%)	3 (12%)
26		27 (27%)	22 (30%)	5 (19%)
27		21 (21%)	16 (22%)	5 (19%)
28		22 (22%)	14 (19%)	8 (31%)
29		11 (11%)	6 (8.1%)	5 (19%)
Race	78
Asian		19 (24%)	12 (21%)	7 (35%)
Black		20 (26%)	15 (26%)	5 (25%)
Multiple		15 (19%)	10 (17%)	5 (25%)
Unknown		11 (14%)	10 (17%)	1 (5.0%)
White		13 (17%)	11 (19%)	2 (10%)
Unknown		22	16	6
¹ Median (IQR) or Frequency (%)

4.10 Repo Tric report a comparison table (createTable from compareGroups)

Tables created by compareGroups can be easily be exported to CSV, LaTeX, HTML, PDF, Word or Excel, or inserted in R-markdown files to generate reports automatically.

More details, please see this site.

library(compareGroups)
res <- compareGroups(Employed ~ Age + Gender + Grade_Math_Y1 + Grade_Engl_Y1_Letter + Major, 
    data = PseudoData)
res

## 
## 
## -------- Summary of results by groups of 'Employed'---------
## 
## 
##   var                  N   p.value method            selection
## 1 Age                  100 0.034** continuous normal ALL      
## 2 Gender               100 1.000   categorical       ALL      
## 3 Grade_Math_Y1         98 0.177   continuous normal ALL      
## 4 Grade_Engl_Y1_Letter 100 0.879   categorical       ALL      
## -----
## Signif. codes:  0 '**' 0.05 '*' 0.1 ' ' 1

createTable(res)

## 
## --------Summary descriptives table by 'Employed'---------
## 
## _______________________________________________________ 
##                           No          Yes     p.overall 
##                          N=74        N=26               
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Age                   26.6 (1.25) 27.3 (1.31)   0.034   
## Gender:                                         1.000   
##     Female            48 (64.9%)  17 (65.4%)            
##     Male              26 (35.1%)   9 (34.6%)            
## Grade_Math_Y1         69.2 (9.73) 72.6 (11.2)   0.177   
## Grade_Engl_Y1_Letter:                           0.879   
##     A                  9 (12.2%)   2 (7.69%)            
##     B                 20 (27.0%)   7 (26.9%)            
##     C                 24 (32.4%)   7 (26.9%)            
##     D                 17 (23.0%)   8 (30.8%)            
##     F                  4 (5.41%)   2 (7.69%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

4.11 Repo Tric report coefficient and confidence interval from regression models

It is often needed to extract regression coefficients and 95% confidence intervals from regression model and report just numbers instead of the summary table. Here are some quick functions to be used in that situation.

#### Situation 1. Cox regression, hazard ratio and 95% confidence interval
library(survival)
CoxReg <- coxph(Surv(Day2Employ, Employed == "Yes") ~ Gender, data = PseudoData)

HazardRatio <- function(x){
  paste(format(round(summary(x)$conf.int[, 1], 2), nsmall = 2),
        " (", format(round(summary(x)$conf.int[, 3], 2),nsmall = 2),
        "-", format(round(summary(x)$conf.int[, 4], 2), nsmall = 2), ")", sep = "")}
HazardRatio(CoxReg)

## [1] "1.03 (0.46-2.31)"

#### Situation 2.a Logistic regression, odds ratio and 95% confidence interval (using base)
LogiReg <- glm(Employed == "Yes" ~ Age + Gender, data = PseudoData, family = "binomial")

OddsRatio <- function(x){
  ## note that confnt calculates confidence interval using profiling method 
  ## while confint.default calculates confidence interval using Wald method, change if needed
  temp <- exp(cbind(OR = coef(x), confint(x))) 
  output <- paste(format(round(temp[, 1], 2), nsmall = 2), "(",
                  format(round(temp[, 2], 2), nsmall = 2), "-",
                  format(round(temp[, 3], 2), nsmall = 2), ")")
  return(temp)
}
OddsRatio(LogiReg)

##                       OR        2.5 %    97.5 %
## (Intercept) 6.859014e-06 2.242221e-10 0.1018555
## Age         1.496210e+00 1.048201e+00 2.1872222
## GenderMale  9.629803e-01 3.550460e-01 2.4941384

#### Situation 2.b Logistic regression, odds ratio and 95% confidence interval (using broom package)
broom::tidy(LogiReg, conf.int = T, exponentiate = T)

## # A tibble: 3 × 7
##   term          estimate std.error statistic p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept) 0.00000686     5.04    -2.36    0.0183 2.24e-10     0.102
## 2 Age         1.50           0.186    2.17    0.0303 1.05e+ 0     2.19 
## 3 GenderMale  0.963          0.493   -0.0765  0.939  3.55e- 1     2.49

4.12 Repo Tric report different tables in APA format (apaTables)

Functions from this package can format a variety types of tables to APA format, such as correlation table, regression table, ANOVA, and repeated measures. The output tables are saved as doc files by default. For details, see this post.

library(apaTables)

#### Example 1. Correlation table
PseudoData %>% 
  select(Grade_Art_Y1, Grade_Math_Y1, Grade_Stats_Y1, Grade_Sports_Y1) %>% 
  apa.cor.table(., filename="Table1_APA.doc", table.number=1)

## 
## 
## Table 1 
## 
## Means, standard deviations, and correlations with confidence intervals
##  
## 
##   Variable           M     SD    1           2           3          
##   1. Grade_Art_Y1    73.84 8.26                                     
##                                                                     
##   2. Grade_Math_Y1   70.06 10.18 .08                                
##                                  [-.13, .28]                        
##                                                                     
##   3. Grade_Stats_Y1  75.22 9.05  -.02        .09                    
##                                  [-.22, .18] [-.11, .28]            
##                                                                     
##   4. Grade_Sports_Y1 74.92 9.62  .18         -.10        -.04       
##                                  [-.02, .37] [-.30, .10] [-.24, .16]
##                                                                     
## 
## Note. M and SD are used to represent mean and standard deviation, respectively.
## Values in square brackets indicate the 95% confidence interval.
## The confidence interval is a plausible range of population correlations 
## that could have caused the sample correlation (Cumming, 2014).
##  * indicates p < .05. ** indicates p < .01.
##

#### Example 2. Regression table
reg <- lm(Grade_Math_Y1 ~ Grade_Stats_Y1 + Grade_Sports_Y1 + Age, data = PseudoData)
apa.reg.table(reg, filename = "Table2_APA.doc", table.number = 2)

## 
## 
## Table 2 
## 
## Regression results using Grade_Math_Y1 as the criterion
##  
## 
##        Predictor      b       b_95%_CI  beta   beta_95%_CI sr2  sr2_95%_CI    r
##      (Intercept) 55.37* [5.55, 105.18]                                         
##   Grade_Stats_Y1   0.10  [-0.14, 0.33]  0.08 [-0.12, 0.29] .01 [-.03, .04]  .09
##  Grade_Sports_Y1  -0.11  [-0.33, 0.12] -0.10 [-0.30, 0.11] .01 [-.03, .05] -.10
##              Age   0.58  [-1.04, 2.19]  0.07 [-0.13, 0.28] .01 [-.02, .03]  .08
##                                                                                
##                                                                                
##                                                                                
##              Fit
##                 
##                 
##                 
##                 
##        R2 = .024
##  95% CI[.00,.08]
##                 
## 
## Note. A significant b-weight indicates the beta-weight and semi-partial correlation are also significant.
## b represents unstandardized regression weights. beta indicates the standardized regression weights. 
## sr2 represents the semi-partial correlation squared. r represents the zero-order correlation.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
##

#### Example 3. 2-way ANOVA table
options(contrasts = c("contr.sum", "contr.poly"))
lm_output <- lm(Grade_Math_Y1 ~ Employed * Scholarship, data = PseudoData)
apa.aov.table(lm_output, filename = "Table3_APA.doc", table.number = 7)

## 
## 
## Table 7 
## 
## ANOVA results using Grade_Math_Y1 as the dependent variable
##  
## 
##               Predictor        SS df        MS       F    p partial_eta2
##             (Intercept) 268270.63  1 268270.63 2576.00 .000             
##                Employed    134.24  1    134.24    1.29 .259          .01
##             Scholarship     42.09  1     42.09    0.40 .526          .00
##  Employed x Scholarship      5.07  1      5.07    0.05 .826          .00
##                   Error   9789.36 94    104.14                          
##  CI_90_partial_eta2
##                    
##          [.00, .07]
##          [.00, .05]
##          [.00, .03]
##                    
## 
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared

4.13 Repo Tric running multiple tests at simultaneously

#### use wilcoxon as an example
result <- list()
for(i in c(7:26)){                                                       
  fmla <- paste0("PseudoData$",names(PseudoData)[i], "~ PseudoData$Gender")
  result[[i]]<- eval(parse(text=paste0("wilcox.test(",
                                       fmla, ", paired = FALSE)")))
}
result

4.14 Repo Tric running multiple regression with the same predicting/outcome variable

#### use logistic regression as an example
fun1<-function(x){
  res<-c(paste(as.character(summary(x)$call),collapse = " "),
         ## note, only the gender coefficient (3rd row in the coefficient table) was reported
         summary(x)$coefficients[3,4],
         exp(coef(x))[3],
         exp(confint(x))[3,1:2],"\n")
  
  names(res)<-c("call","p-value","OR","LCI","UCI","")
  return(res)}

res2=NULL
lms=list()
for(i in 27:30)                                                                 
{
  lms[[i]]=glm(PseudoData[,i] == "A"
               ~ Age +
                 as.factor(Gender),
               family = "binomial", data = PseudoData)       
  res2<-rbind(res2,fun1(lms[[i]]))
}

res2

##      call                                                                        
## [1,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## [2,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## [3,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## [4,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
##      p-value             OR                  LCI                
## [1,] "0.694933206189563" "0.754169390410814" "0.147688465301043"
## [2,] "0.651868303069197" "1.30416846723969"  "0.455591375491405"
## [3,] "0.221655500497209" "1.64805194858341"  "0.802210520880066"
## [4,] "0.47828655276198"  "0.695304803476652" "0.236079607524523"
##      UCI                    
## [1,] "3.85904159901459" "\n"
## [2,] "5.93665240232086" "\n"
## [3,] "4.32316360162714" "\n"
## [4,] "2.04728081413273" "\n"

4.15 Repo Tric highlight difference in two items

The compare function from package waldo returns a color-coded output indicating what was different from the two/multiple objects. It works with vector, list, data frame or tibble.

waldo::compare(c(1, 1, 2, 3), c(2, 2, 2, 3))

## `old`: 1 1 2 3
## `new`: 2 2 2 3

5 Data Operations and Formatting (Data Ops)

5.1 Data Ops run ifelse across multiple columns (any of the columns, all of the columns)

#### Situation 1. Any of the columns 
Any_Math_A <- apply(PseudoData[,str_detect(colnames(PseudoData),"Grade_Math") & 
                                 str_detect(colnames(PseudoData),"Letter")],1,  
                    function(x){ifelse(any(x %in% "A"),1,0)})


#### Situation 2. All of the columns
All_Math_A <- apply(PseudoData[,str_detect(colnames(PseudoData),"Grade_Math") & 
                                 str_detect(colnames(PseudoData),"Letter")],1,  
                    function(x){ifelse(all(x %in% "A"),1,0)})

5.2 Data Ops create and merge data frames

#### Situation 1. Create empty data frame or tibble
Empty_DF <- data.frame(matrix(nrow = 100, ncol = 0))
Empty_1Tibble <- as_tibble(matrix(nrow = 100, ncol = 0))


#### Situation 2. Create data frame or tibble based on existing column
Temp_DF <- data.frame(Col_A = PseudoData$StudentID)
Temp_tibble <- tibble(Col_A = PseudoData$StudentID)


#### Situation 3. Create data frame or tibble by entering data 
Temp_DF <- data.frame("Col A" = c(0,1,2),
                      "Col B" = c("A", "B", "C"),
                      "Col C" = factor(c("A", "B", "C")),
                      stringsAsFactors = F)
Test_Tibble <- tibble(Col_A = c(0,1,2),
                      Col_B = c("A","B","C"),
                      Col_C = factor(c("A","B", "D")),
                      NEW_COL = Col_A*Col_A) 


#### Situation 4. Create a tibble that is  based on combinations of multiple categorical variable 
Temp_DF <- crossing(ID = 1:5, Year = 1:5, class = c("A", "B", "C"))


#### Situation 5. Transform a data frame to a tibble-df object
Temp_TibbleDF <- PseudoData %>% tbl_df()


#### Situation 6. Merge multiple data frames at once
Reduce(function(x, y) merge(x, y, by = "",all = TRUE), list(df1, df2, df3))

5.3 Data Ops work with NA

#### Situation 1. Identify complete rows in a data frame 
complete.cases(PseudoData)

##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
##  [61]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [73] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [97] FALSE  TRUE FALSE FALSE

#### Situation 2. Replace certain entry as NA
## example of replacing single column
PseudoData %>% mutate(Race_na = na_if(Race, "Unknown")) %>% count(Race_na, Race)

## # A tibble: 6 × 3
##   Race_na  Race         n
##   <chr>    <chr>    <int>
## 1 Asian    Asian       19
## 2 Black    Black       20
## 3 Multiple Multiple    15
## 4 White    White       13
## 5 <NA>     Unknown     11
## 6 <NA>     <NA>        22

## can also work with multiple columns 
PseudoData %>% 
  mutate_at(vars(contains("Letter")), list(~ na_if(., "F"))) %>% 
  count(Grade_Math_Y1_Letter)

## # A tibble: 5 × 2
##   Grade_Math_Y1_Letter     n
##   <chr>                <int>
## 1 A                        2
## 2 B                       20
## 3 C                       32
## 4 D                       29
## 5 <NA>                    17

#### Situation 3. Replace NA as certain entry 
PseudoData %>% 
  replace_na(list(Race = "Missing",                               
                  Grade_Art_Y1_Letter = "Other")) %>% 
  count(Race, Grade_Art_Y1_Letter)

## # A tibble: 25 × 3
##    Race    Grade_Art_Y1_Letter     n
##    <chr>   <chr>               <int>
##  1 Asian   A                       1
##  2 Asian   B                       4
##  3 Asian   C                      10
##  4 Asian   D                       4
##  5 Black   B                       2
##  6 Black   C                       8
##  7 Black   D                       8
##  8 Black   Other                   2
##  9 Missing A                       1
## 10 Missing B                       6
## # … with 15 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 4. fill NA of one column with values from a specified column
PseudoData %>% 
  select(Grade_Math_Y1, Grade_Math_Y2) %>%
  mutate(Grade_Math_Y2_New = coalesce(Grade_Math_Y2, Grade_Math_Y1)) %>%       
  filter(is.na(Grade_Math_Y2))

## # A tibble: 1 × 3
##   Grade_Math_Y1 Grade_Math_Y2 Grade_Math_Y2_New
##           <dbl>         <dbl>             <dbl>
## 1            79            NA                79

#### Situation 5. fill NA of one column with value of the next or the prior entry
PseudoData %>% 
  select(Grade_Math_Y1_Letter) %>%
  fill(Grade_Math_Y1_Letter, .direction = "down")

## # A tibble: 100 × 1
##    Grade_Math_Y1_Letter
##    <chr>               
##  1 F                   
##  2 D                   
##  3 F                   
##  4 D                   
##  5 B                   
##  6 B                   
##  7 D                   
##  8 C                   
##  9 F                   
## 10 A                   
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 6. remove rows with NA in any column
PseudoData %>% drop_na() %>% dim()

## [1] 12 58

5.4 Data Ops count data

#### Situation 1. Count a single column
## generate a frequency table
PseudoData %>% count(Major, name = "Major_n", sort = T)

## # A tibble: 12 × 2
##    Major            Major_n
##    <chr>              <int>
##  1 <NA>                  13
##  2 Unknown               12
##  3 Nursing               11
##  4 Biology               10
##  5 Computer Science       9
##  6 Business               8
##  7 Communications         8
##  8 English                8
##  9 Public Health          6
## 10 Statistics             6
## 11 History                5
## 12 Sustainability         4

## or add a column counting that variable
PseudoData %>% add_count(Race, name = "Race_count") %>% select(Race, Race_count)

## # A tibble: 100 × 2
##    Race     Race_count
##    <chr>         <int>
##  1 Asian            19
##  2 Unknown          11
##  3 <NA>             22
##  4 Unknown          11
##  5 Multiple         15
##  6 White            13
##  7 Asian            19
##  8 <NA>             22
##  9 White            13
## 10 <NA>             22
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 2. Count multiple columns
## if the no observation falls in the combination, n is NA
PseudoData %>% count(Race, Major) %>% complete(Race, Major)

## # A tibble: 72 × 3
##    Race  Major                n
##    <chr> <chr>            <int>
##  1 Asian Biology              3
##  2 Asian Business             1
##  3 Asian Communications       4
##  4 Asian Computer Science     1
##  5 Asian English              1
##  6 Asian History             NA
##  7 Asian Nursing              4
##  8 Asian Public Health        2
##  9 Asian Statistics          NA
## 10 Asian Sustainability      NA
## # … with 62 more rows
## # ℹ Use `print(n = ...)` to see more rows

## can also change that NA to 0 by adding fill = list(n = 0)
PseudoData %>% count(Race, Major) %>% complete(Race, Major, fill = list(n = 0))

## # A tibble: 72 × 3
##    Race  Major                n
##    <chr> <chr>            <int>
##  1 Asian Biology              3
##  2 Asian Business             1
##  3 Asian Communications       4
##  4 Asian Computer Science     1
##  5 Asian English              1
##  6 Asian History              0
##  7 Asian Nursing              4
##  8 Asian Public Health        2
##  9 Asian Statistics           0
## 10 Asian Sustainability       0
## # … with 62 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 3. count the time each entry appeared (sequence)
## often useful when having repeated data. 
PseudoData %>%  
  # give the data an order to count sequence
  arrange(DoB) %>% 
  select(DoB, Major) %>% 
  # code  occurrence of each Major (when did each major had its first/2nd/3rd students)
  group_by(Major) %>%   
  mutate(occurrance = sequence(n()))

## # A tibble: 100 × 3
## # Groups:   Major [12]
##    DoB        Major            occurrance
##    <date>     <chr>                 <int>
##  1 1990-02-27 Computer Science          1
##  2 1990-03-18 Computer Science          2
##  3 1990-07-05 <NA>                      1
##  4 1990-08-19 <NA>                      2
##  5 1990-09-22 History                   1
##  6 1990-09-27 Unknown                   1
##  7 1990-09-30 <NA>                      3
##  8 1990-10-16 Public Health             1
##  9 1990-11-19 <NA>                      4
## 10 1990-11-28 Public Health             2
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

5.5 Data Ops arrange data

#### Situation 1. Arrange the data based on a given variable
## remove desc then arrange the data in increasing order 
PseudoData %>% arrange(desc(DoB)) %>% head(10) %>% select(StudentID, DoB)

## # A tibble: 10 × 2
##    StudentID DoB       
##        <int> <date>    
##  1        53 1994-12-26
##  2        87 1994-12-16
##  3        98 1994-11-18
##  4        37 1994-11-07
##  5        11 1994-10-02
##  6        14 1994-09-27
##  7        23 1994-09-23
##  8        77 1994-09-20
##  9        20 1994-06-28
## 10        45 1994-06-18

#### Situation 2. After arranging the data, keep only one observation for each category of certain variable
PseudoData %>% arrange(desc(Grade_Math_Y1)) %>% distinct(Major, .keep_all = T) %>% dim()

## [1] 12 58

#### Situation 3. Create a rank column based on order of a variable
## works with both categorical variable and continuous variable 
# rank method will give ties a gap
PseudoData %>% select(Grade_Stats_Y1_Letter) %>%
  mutate(rank = rank(Grade_Stats_Y1_Letter, ties.method = "min")) %>% 
  count(Grade_Stats_Y1_Letter, rank)

## # A tibble: 6 × 3
##   Grade_Stats_Y1_Letter  rank     n
##   <chr>                 <int> <int>
## 1 A                         1     4
## 2 B                         5    28
## 3 C                        33    46
## 4 D                        79    17
## 5 F                        96     4
## 6 <NA>                    100     1

# dense_rank will not give ties a gap
PseudoData %>% select(Grade_Stats_Y1_Letter) %>%
  mutate(rank = dense_rank(Grade_Stats_Y1_Letter)) %>% 
  count(Grade_Stats_Y1_Letter, rank)

## # A tibble: 6 × 3
##   Grade_Stats_Y1_Letter  rank     n
##   <chr>                 <int> <int>
## 1 A                         1     4
## 2 B                         2    28
## 3 C                         3    46
## 4 D                         4    17
## 5 F                         5     4
## 6 <NA>                     NA     1

5.6 Data Ops column operations

#### Situation 1. Rename columns
## can use both rename
temp <- PseudoData %>% rename(NewName4Major = Major,          
                              # or use column index (column number)
                              NewName4Age = 3)  
## or rename within select
temp <- PseudoData %>% select(NewName4Major = Major,                      
                              NewName4Age = Age)

## or rename all if applies
temp <- PseudoData %>% 
  rename_all(str_replace, "Grade", "G")


#### Situation 2. Select columns
## select the columns starts_with certain string (does not work with |)
PseudoData %>% select(StudentID, starts_with('Grade',ignore.case=TRUE)) %>% colnames()

##  [1] "StudentID"              "Grade_Math_Y1"          "Grade_Stats_Y1"        
##  [4] "Grade_Engl_Y1"          "Grade_Art_Y1"           "Grade_Sports_Y1"       
##  [7] "Grade_Math_Y2"          "Grade_Stats_Y2"         "Grade_Engl_Y2"         
## [10] "Grade_Art_Y2"           "Grade_Sports_Y2"        "Grade_Math_Y3"         
## [13] "Grade_Stats_Y3"         "Grade_Engl_Y3"          "Grade_Art_Y3"          
## [16] "Grade_Sports_Y3"        "Grade_Math_Y4"          "Grade_Stats_Y4"        
## [19] "Grade_Engl_Y4"          "Grade_Art_Y4"           "Grade_Sports_Y4"       
## [22] "Grade_Math_Y1_Letter"   "Grade_Stats_Y1_Letter"  "Grade_Engl_Y1_Letter"  
## [25] "Grade_Art_Y1_Letter"    "Grade_Sports_Y1_Letter" "Grade_Math_Y2_Letter"  
## [28] "Grade_Stats_Y2_Letter"  "Grade_Engl_Y2_Letter"   "Grade_Art_Y2_Letter"   
## [31] "Grade_Sports_Y2_Letter" "Grade_Math_Y3_Letter"   "Grade_Stats_Y3_Letter" 
## [34] "Grade_Engl_Y3_Letter"   "Grade_Art_Y3_Letter"    "Grade_Sports_Y3_Letter"
## [37] "Grade_Math_Y4_Letter"   "Grade_Stats_Y4_Letter"  "Grade_Engl_Y4_Letter"  
## [40] "Grade_Art_Y4_Letter"    "Grade_Sports_Y4_Letter"

## or contains certain string (does not work with |)
PseudoData %>% select(contains("Math")) %>% colnames()

## [1] "Grade_Math_Y1"        "Grade_Math_Y2"        "Grade_Math_Y3"       
## [4] "Grade_Math_Y4"        "Grade_Math_Y1_Letter" "Grade_Math_Y2_Letter"
## [7] "Grade_Math_Y3_Letter" "Grade_Math_Y4_Letter"

## or match certain strings (works well with |)
PseudoData %>% select(matches("Math|Age")) %>% colnames()

## [1] "Age"                  "Grade_Math_Y1"        "Grade_Math_Y2"       
## [4] "Grade_Math_Y3"        "Grade_Math_Y4"        "Grade_Math_Y1_Letter"
## [7] "Grade_Math_Y2_Letter" "Grade_Math_Y3_Letter" "Grade_Math_Y4_Letter"

PseudoData %>% select(matches("Math.+Letter")) %>% colnames()

## [1] "Grade_Math_Y1_Letter" "Grade_Math_Y2_Letter" "Grade_Math_Y3_Letter"
## [4] "Grade_Math_Y4_Letter"

PseudoData %>% select(matches("Math|Stats"), matches("Art|Sports")) %>% colnames()

##  [1] "Grade_Math_Y1"          "Grade_Stats_Y1"         "Grade_Math_Y2"         
##  [4] "Grade_Stats_Y2"         "Grade_Math_Y3"          "Grade_Stats_Y3"        
##  [7] "Grade_Math_Y4"          "Grade_Stats_Y4"         "Grade_Math_Y1_Letter"  
## [10] "Grade_Stats_Y1_Letter"  "Grade_Math_Y2_Letter"   "Grade_Stats_Y2_Letter" 
## [13] "Grade_Math_Y3_Letter"   "Grade_Stats_Y3_Letter"  "Grade_Math_Y4_Letter"  
## [16] "Grade_Stats_Y4_Letter"  "Grade_Art_Y1"           "Grade_Sports_Y1"       
## [19] "Grade_Art_Y2"           "Grade_Sports_Y2"        "Grade_Art_Y3"          
## [22] "Grade_Sports_Y3"        "Grade_Art_Y4"           "Grade_Sports_Y4"       
## [25] "Grade_Art_Y1_Letter"    "Grade_Sports_Y1_Letter" "Grade_Art_Y2_Letter"   
## [28] "Grade_Sports_Y2_Letter" "Grade_Art_Y3_Letter"    "Grade_Sports_Y3_Letter"
## [31] "Grade_Art_Y4_Letter"    "Grade_Sports_Y4_Letter"

#### Situation 3. Use select to reorder the columns
## say you want to have Age and Gender to be the first few columns 
PseudoData %>% select(Age, Gender, everything()) %>% colnames()

##  [1] "Age"                    "Gender"                 "StudentID"             
##  [4] "DoB"                    "Race"                   "Ethnicity"             
##  [7] "Grade_Math_Y1"          "Grade_Stats_Y1"         "Grade_Engl_Y1"         
## [10] "Grade_Art_Y1"           "Grade_Sports_Y1"        "Grade_Math_Y2"         
## [13] "Grade_Stats_Y2"         "Grade_Engl_Y2"          "Grade_Art_Y2"          
## [16] "Grade_Sports_Y2"        "Grade_Math_Y3"          "Grade_Stats_Y3"        
## [19] "Grade_Engl_Y3"          "Grade_Art_Y3"           "Grade_Sports_Y3"       
## [22] "Grade_Math_Y4"          "Grade_Stats_Y4"         "Grade_Engl_Y4"         
## [25] "Grade_Art_Y4"           "Grade_Sports_Y4"        "Grade_Math_Y1_Letter"  
## [28] "Grade_Stats_Y1_Letter"  "Grade_Engl_Y1_Letter"   "Grade_Art_Y1_Letter"   
## [31] "Grade_Sports_Y1_Letter" "Grade_Math_Y2_Letter"   "Grade_Stats_Y2_Letter" 
## [34] "Grade_Engl_Y2_Letter"   "Grade_Art_Y2_Letter"    "Grade_Sports_Y2_Letter"
## [37] "Grade_Math_Y3_Letter"   "Grade_Stats_Y3_Letter"  "Grade_Engl_Y3_Letter"  
## [40] "Grade_Art_Y3_Letter"    "Grade_Sports_Y3_Letter" "Grade_Math_Y4_Letter"  
## [43] "Grade_Stats_Y4_Letter"  "Grade_Engl_Y4_Letter"   "Grade_Art_Y4_Letter"   
## [46] "Grade_Sports_Y4_Letter" "Major"                  "Scholarship"           
## [49] "Schol_Amount"           "ProjectedEarning"       "JobHuntDate"           
## [52] "Employed"               "EmployDate"             "Day2Employ"            
## [55] "Fav_Jane"               "Longitude"              "Latitude"              
## [58] "State"

## or you want to order the columns alphabetically 
PseudoData %>% select(sort(colnames(PseudoData))) %>% colnames()

##  [1] "Age"                    "Day2Employ"             "DoB"                   
##  [4] "EmployDate"             "Employed"               "Ethnicity"             
##  [7] "Fav_Jane"               "Gender"                 "Grade_Art_Y1"          
## [10] "Grade_Art_Y1_Letter"    "Grade_Art_Y2"           "Grade_Art_Y2_Letter"   
## [13] "Grade_Art_Y3"           "Grade_Art_Y3_Letter"    "Grade_Art_Y4"          
## [16] "Grade_Art_Y4_Letter"    "Grade_Engl_Y1"          "Grade_Engl_Y1_Letter"  
## [19] "Grade_Engl_Y2"          "Grade_Engl_Y2_Letter"   "Grade_Engl_Y3"         
## [22] "Grade_Engl_Y3_Letter"   "Grade_Engl_Y4"          "Grade_Engl_Y4_Letter"  
## [25] "Grade_Math_Y1"          "Grade_Math_Y1_Letter"   "Grade_Math_Y2"         
## [28] "Grade_Math_Y2_Letter"   "Grade_Math_Y3"          "Grade_Math_Y3_Letter"  
## [31] "Grade_Math_Y4"          "Grade_Math_Y4_Letter"   "Grade_Sports_Y1"       
## [34] "Grade_Sports_Y1_Letter" "Grade_Sports_Y2"        "Grade_Sports_Y2_Letter"
## [37] "Grade_Sports_Y3"        "Grade_Sports_Y3_Letter" "Grade_Sports_Y4"       
## [40] "Grade_Sports_Y4_Letter" "Grade_Stats_Y1"         "Grade_Stats_Y1_Letter" 
## [43] "Grade_Stats_Y2"         "Grade_Stats_Y2_Letter"  "Grade_Stats_Y3"        
## [46] "Grade_Stats_Y3_Letter"  "Grade_Stats_Y4"         "Grade_Stats_Y4_Letter" 
## [49] "JobHuntDate"            "Latitude"               "Longitude"             
## [52] "Major"                  "ProjectedEarning"       "Race"                  
## [55] "Schol_Amount"           "Scholarship"            "State"                 
## [58] "StudentID"

#### Situation 4. Clean the column names
## try also case = c("title", "snake", "all_caps“)
PseudoData %>% janitor::clean_names(case = "snake") %>% colnames()

##  [1] "student_id"             "do_b"                   "age"                   
##  [4] "gender"                 "race"                   "ethnicity"             
##  [7] "grade_math_y1"          "grade_stats_y1"         "grade_engl_y1"         
## [10] "grade_art_y1"           "grade_sports_y1"        "grade_math_y2"         
## [13] "grade_stats_y2"         "grade_engl_y2"          "grade_art_y2"          
## [16] "grade_sports_y2"        "grade_math_y3"          "grade_stats_y3"        
## [19] "grade_engl_y3"          "grade_art_y3"           "grade_sports_y3"       
## [22] "grade_math_y4"          "grade_stats_y4"         "grade_engl_y4"         
## [25] "grade_art_y4"           "grade_sports_y4"        "grade_math_y1_letter"  
## [28] "grade_stats_y1_letter"  "grade_engl_y1_letter"   "grade_art_y1_letter"   
## [31] "grade_sports_y1_letter" "grade_math_y2_letter"   "grade_stats_y2_letter" 
## [34] "grade_engl_y2_letter"   "grade_art_y2_letter"    "grade_sports_y2_letter"
## [37] "grade_math_y3_letter"   "grade_stats_y3_letter"  "grade_engl_y3_letter"  
## [40] "grade_art_y3_letter"    "grade_sports_y3_letter" "grade_math_y4_letter"  
## [43] "grade_stats_y4_letter"  "grade_engl_y4_letter"   "grade_art_y4_letter"   
## [46] "grade_sports_y4_letter" "major"                  "scholarship"           
## [49] "schol_amount"           "projected_earning"      "job_hunt_date"         
## [52] "employed"               "employ_date"            "day2employ"            
## [55] "fav_jane"               "longitude"              "latitude"              
## [58] "state"

#### Situation 5. Conditional mutate 
## mutate the column if the column meets certain format
PseudoData %>% 
  mutate_if(is.numeric, ~ ifelse(. %in% NA, 0,.)) %>% 
  pull(Grade_Math_Y1)

##   [1] 56 65 46 69 84 81 62 75 56 90 66 93 62 42 72 71 69 82 76 64 72 59 85 54 55
##  [26]  0 84 84 56 82 82 85 56 71 82 68 62 84 53  0 73 63 83 62 74 53 76 57 62 74
##  [51] 60 66 72 58 75 68 71 61 69 68 75 84 56 61 65 68 86 70 64 71 72 85 62 70 74
##  [76] 50 68 70 77 72 80 74 67 70 80 75 70 74 84 79 81 64 72 71 74 66 68 61 75 81

## mutate the column if the column name meets certain string
PseudoData %>% 
  mutate_at(vars(ends_with("Letter")), ~ifelse(. %in% NA, "U", .)) %>%
  count(Grade_Math_Y1_Letter)

## # A tibble: 6 × 2
##   Grade_Math_Y1_Letter     n
##   <chr>                <int>
## 1 A                        2
## 2 B                       20
## 3 C                       32
## 4 D                       29
## 5 F                       15
## 6 U                        2

## mutate columns from a to b
PseudoData %>% 
  mutate_at(vars(Grade_Math_Y1_Letter : Grade_Sports_Y4_Letter), ~ifelse(. %in% NA, "U", .)) %>% 
  select(ends_with("Letter")) %>% 
  complete.cases()

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#### Situation 6. Transmute
## Only output the data that were mutated
PseudoData %>% transmute(test = Age) %>% colnames()

## [1] "test"

#### Situation 7. separate columns into multiple columns/rows based on separator
## into multiple columns
PseudoData %>% 
  select(StudentID, DoB) %>% 
  separate(DoB, 
           c("Y_DoB", "M_Dob", "D_DoB"), 
           sep="-",    
           extra = "merge",
           fill = "right")

## # A tibble: 100 × 4
##    StudentID Y_DoB M_Dob D_DoB
##        <int> <chr> <chr> <chr>
##  1         1 1993  01    31   
##  2         2 1991  01    04   
##  3         3 1993  02    28   
##  4         4 1990  11    28   
##  5         5 1990  07    05   
##  6         6 1994  02    04   
##  7         7 1991  09    07   
##  8         8 1992  07    26   
##  9         9 1991  11    04   
## 10        10 1992  10    10   
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

5.7 Data Ops row operations

#### Situation 1. slice rows
## show rows with lowest/highest n rows a certain variable 
PseudoData %>% slice_min(Grade_Math_Y1, n = 5) %>% select(StudentID, Grade_Math_Y1)

## # A tibble: 5 × 2
##   StudentID Grade_Math_Y1
##       <int>         <dbl>
## 1        14            42
## 2         3            46
## 3        76            50
## 4        39            53
## 5        46            53

PseudoData %>% slice_max(Grade_Math_Y1, n = 5) %>% select(StudentID, Grade_Math_Y1)

## # A tibble: 6 × 2
##   StudentID Grade_Math_Y1
##       <int>         <dbl>
## 1        12            93
## 2        10            90
## 3        67            86
## 4        23            85
## 5        32            85
## 6        72            85

## random select rows
# use slice_sample
PseudoData %>% slice_sample(n = 5) %>% select(StudentID)

## # A tibble: 5 × 1
##   StudentID
##       <int>
## 1        10
## 2        99
## 3        52
## 4        93
## 5        79

# use sample_n, allows for replaceable 
PseudoData %>% sample_n(size = 5, replace = F) %>% select(StudentID)

## # A tibble: 5 × 1
##   StudentID
##       <int>
## 1        75
## 2        32
## 3        29
## 4        73
## 5        45

## slice based on current row order (row number)
# first and last 10 rows
PseudoData %>% slice(c(1:10, seq(n()-9, n()))) %>% nrow()

## [1] 20

# first n rows
PseudoData %>% head(n = 5) %>% nrow()

## [1] 5

# last n rows
PseudoData %>% tail(n = 5) %>% nrow()

## [1] 5

#### Situation 2. quick row operations (all should work with group_by)
## add row number as identifier
PseudoData %>% mutate(row_id = row_number()) %>% pull(row_id)

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100

## report the nth observation downwards from the current
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Lead = lead(Grade_Math_Y1, n = 1, default = NA))

## # A tibble: 100 × 2
##    Grade_Math_Y1  Lead
##            <dbl> <dbl>
##  1            56    65
##  2            65    46
##  3            46    69
##  4            69    84
##  5            84    81
##  6            81    62
##  7            62    75
##  8            75    56
##  9            56    90
## 10            90    66
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

##  report the nth observation upwards from the current
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Lag = lag(Grade_Math_Y1, n = 1, default = NA))

## # A tibble: 100 × 2
##    Grade_Math_Y1   Lag
##            <dbl> <dbl>
##  1            56    NA
##  2            65    56
##  3            46    65
##  4            69    46
##  5            84    69
##  6            81    84
##  7            62    81
##  8            75    62
##  9            56    75
## 10            90    56
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## report cumulative sum
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cumsum = cumsum(Grade_Math_Y1))

## # A tibble: 100 × 2
##    Grade_Math_Y1 Cumsum
##            <dbl>  <dbl>
##  1            56     56
##  2            65    121
##  3            46    167
##  4            69    236
##  5            84    320
##  6            81    401
##  7            62    463
##  8            75    538
##  9            56    594
## 10            90    684
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

# can also be used to count occurrence
PseudoData %>% transmute(unit = 1) %>% mutate(occurance = cumsum(unit))

## # A tibble: 100 × 2
##     unit occurance
##    <dbl>     <dbl>
##  1     1         1
##  2     1         2
##  3     1         3
##  4     1         4
##  5     1         5
##  6     1         6
##  7     1         7
##  8     1         8
##  9     1         9
## 10     1        10
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## report cumulative probability 
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cumprob = cumprod(Grade_Math_Y1))

## # A tibble: 100 × 2
##    Grade_Math_Y1 Cumprob
##            <dbl>   <dbl>
##  1            56 5.6 e 1
##  2            65 3.64e 3
##  3            46 1.67e 5
##  4            69 1.16e 7
##  5            84 9.70e 8
##  6            81 7.86e10
##  7            62 4.87e12
##  8            75 3.66e14
##  9            56 2.05e16
## 10            90 1.84e18
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## report cumulative minimum
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cummin = cummin(Grade_Math_Y1))

## # A tibble: 100 × 2
##    Grade_Math_Y1 Cummin
##            <dbl>  <dbl>
##  1            56     56
##  2            65     56
##  3            46     46
##  4            69     46
##  5            84     46
##  6            81     46
##  7            62     46
##  8            75     46
##  9            56     46
## 10            90     46
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## report cumulative maximum 
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cummax = cummax(Grade_Math_Y1))

## # A tibble: 100 × 2
##    Grade_Math_Y1 Cummax
##            <dbl>  <dbl>
##  1            56     56
##  2            65     65
##  3            46     65
##  4            69     69
##  5            84     84
##  6            81     84
##  7            62     84
##  8            75     84
##  9            56     84
## 10            90     90
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 3. report the highest or lowest number across multiple columns for each row
PseudoData %>%
  select(Grade_Math_Y1, Grade_Math_Y2, Grade_Math_Y3, Grade_Math_Y4) %>% 
  mutate(min = pmin(Grade_Math_Y1, Grade_Math_Y2, Grade_Math_Y3, Grade_Math_Y4),
         max = pmax(Grade_Math_Y1, Grade_Math_Y2, Grade_Math_Y3, Grade_Math_Y4, na.rm = T)) %>% 
  filter(Grade_Math_Y1 %in% NA | Grade_Math_Y2 %in% NA)

## # A tibble: 3 × 6
##   Grade_Math_Y1 Grade_Math_Y2 Grade_Math_Y3 Grade_Math_Y4   min   max
##           <dbl>         <dbl>         <dbl>         <dbl> <dbl> <dbl>
## 1            NA            83            82            88    NA    88
## 2            NA            86            92            98    NA    98
## 3            79            NA            79            90    NA    90

#### Situation 4. separate data into multiple rows based on one column 
PseudoData %>% select(StudentID, DoB) %>% separate_rows(DoB, sep="-", convert = T)

## # A tibble: 300 × 2
##    StudentID   DoB
##        <int> <int>
##  1         1  1993
##  2         1     1
##  3         1    NA
##  4         2  1991
##  5         2     1
##  6         2    NA
##  7         3  1993
##  8         3     2
##  9         3    NA
## 10         4  1990
## # … with 290 more rows
## # ℹ Use `print(n = ...)` to see more rows

5.8 Data Ops group_by then summarize

#### Situation 1. Report summarized data after grouping by certain variable
## summarize single variable 
PseudoData %>%
  group_by(Major) %>%
  summarize(Med_Math = median(Grade_Math_Y1, na.rm = T),   
            Min_Math = min(Grade_Math_Y1, na.rm = T),  
            First_Math = first(Grade_Math_Y1),  
            Count = n(),                                                        
            Uniq_Race = n_distinct(Race))

## # A tibble: 12 × 6
##    Major            Med_Math Min_Math First_Math Count Uniq_Race
##    <chr>               <dbl>    <dbl>      <dbl> <int>     <int>
##  1 Biology              75.5       62         90    10         4
##  2 Business             72.5       69         93     8         4
##  3 Communications       68         54         72     8         4
##  4 Computer Science     74         42         42     9         5
##  5 English              69         57         82     8         4
##  6 History              73         56         71     5         3
##  7 Nursing              64         50         56    11         6
##  8 Public Health        72         53         69     6         5
##  9 Statistics           70         56         81     6         2
## 10 Sustainability       66.5       56         56     4         4
## 11 Unknown              69.5       46         65    12         6
## 12 <NA>                 75         64         84    13         6

## summarize multiple selected variables using the same metrics
PseudoData %>%
  group_by(Major) %>%
  summarize_at(vars(Grade_Math_Y1, Grade_Math_Y2,                                
                    Grade_Math_Y3, Grade_Math_Y4), mean, na.rm=T)

## # A tibble: 12 × 5
##    Major            Grade_Math_Y1 Grade_Math_Y2 Grade_Math_Y3 Grade_Math_Y4
##    <chr>                    <dbl>         <dbl>         <dbl>         <dbl>
##  1 Biology                   74.8          78.6          80.2          86.3
##  2 Business                  76.1          78.1          80.4          81.8
##  3 Communications            66            65.1          69.1          68.9
##  4 Computer Science          70.2          72.1          70            73.4
##  5 English                   67.2          70.4          74.1          74.5
##  6 History                   71.8          75.2          76.2          78.8
##  7 Nursing                   63.2          67.7          73.7          74.4
##  8 Public Health             71            71.7          75.3          74.5
##  9 Statistics                70.7          71.5          71.8          69.5
## 10 Sustainability            68.2          72            73.8          75.2
## 11 Unknown                   67.8          70.4          72.2          75.8
## 12 <NA>                      73.5          77.5          79.7          81.9

## summarize multiple variables using different metrics
PseudoData %>% 
  select(Major, Age, Grade_Math_Y1: Grade_Math_Y4) %>% 
  group_by(Major) %>% 
  summarize(across(Grade_Math_Y1: Grade_Math_Y4, 
                   list(missing = ~ mean(is.na(.)),
                        avg = ~ mean(., na.rm=T),          
                        total = ~ sum(., na.rm=T),          
                        ave_among_age25 = ~ mean(.[Age > 25], na.rm=T))))

## # A tibble: 12 × 65
##    Major Grade…¹ Grade…² Grade…³ Grade…⁴ Grade…⁵ Grade…⁶ Grade…⁷ Grade…⁸ Grade…⁹
##    <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Biol…  0         74.8     748    75.8     0.1    74.4     670    75.9       0
##  2 Busi…  0         76.1     609    77.1     0      77.9     623    78.1       0
##  3 Comm…  0         66       528    64.9     0      75.8     606    76.6       0
##  4 Comp…  0         70.2     632    75.6     0      76.9     692    77.7       0
##  5 Engl…  0         67.2     538    66.5     0      75.1     601    77.3       0
##  6 Hist…  0.2       71.8     287    72       0      81.6     408    81.5       0
##  7 Nurs…  0.0909    63.2     632    61.1     0      77.9     857    74.4       0
##  8 Publ…  0         71       426    68.2     0      72.2     433    74.8       0
##  9 Stat…  0         70.7     424    69       0      75.7     454    74.7       0
## 10 Sust…  0         68.2     273    63       0      79       316    79         0
## 11 Unkn…  0         67.8     814    66.5     0      69.7     836    69.5       0
## 12 <NA>   0         73.5     955    74.8     0      73.2     951    72.2       0
## # … with 55 more variables: Grade_Engl_Y1_avg <dbl>, Grade_Engl_Y1_total <dbl>,
## #   Grade_Engl_Y1_ave_among_age25 <dbl>, Grade_Art_Y1_missing <dbl>,
## #   Grade_Art_Y1_avg <dbl>, Grade_Art_Y1_total <dbl>,
## #   Grade_Art_Y1_ave_among_age25 <dbl>, Grade_Sports_Y1_missing <dbl>,
## #   Grade_Sports_Y1_avg <dbl>, Grade_Sports_Y1_total <dbl>,
## #   Grade_Sports_Y1_ave_among_age25 <dbl>, Grade_Math_Y2_missing <dbl>,
## #   Grade_Math_Y2_avg <dbl>, Grade_Math_Y2_total <dbl>, …
## # ℹ Use `colnames()` to see all variable names

#### Situation 2. Summarize over combination of multiple variables 
## summarize 
PseudoData %>% 
  group_by(Employed, Gender) %>% 
  summarize(Med_Math = median(Grade_Math_Y1, na.rm=T))

## # A tibble: 4 × 3
## # Groups:   Employed [2]
##   Employed Gender Med_Math
##   <chr>    <chr>     <dbl>
## 1 No       Female       68
## 2 No       Male         70
## 3 Yes      Female       75
## 4 Yes      Male         74

#### Situation 3. Summarize data by two opponent categories
## this is helpful when trying to calculate the difference between the two opponent groups
PseudoData %>% select(Gender, Grade_Art_Y1) %>%             
  group_by(Gender) %>%                                            
  summarize(Mean_Art = mean(Grade_Art_Y1, na.rm = T)) %>%                      
  mutate(Opposing_Mean_Art = rev(Mean_Art)) %>% 
  mutate(Diff_Mean_Art = Mean_Art - Opposing_Mean_Art)

## # A tibble: 2 × 4
##   Gender Mean_Art Opposing_Mean_Art Diff_Mean_Art
##   <chr>     <dbl>             <dbl>         <dbl>
## 1 Female     73.5              74.4        -0.891
## 2 Male       74.4              73.5         0.891

5.9 Data Ops work with nested data or list of data frames

#### Situation 1. Break the data frame or tibble into multiple sub-tibbles using nest
PseudoData_nest <- PseudoData %>% nest(-Major) 
PseudoData_nest

## # A tibble: 12 × 2
##    Major            data              
##    <chr>            <list>            
##  1 Nursing          <tibble [11 × 57]>
##  2 Unknown          <tibble [12 × 57]>
##  3 Public Health    <tibble [6 × 57]> 
##  4 <NA>             <tibble [13 × 57]>
##  5 Statistics       <tibble [6 × 57]> 
##  6 Biology          <tibble [10 × 57]>
##  7 Business         <tibble [8 × 57]> 
##  8 Computer Science <tibble [9 × 57]> 
##  9 Communications   <tibble [8 × 57]> 
## 10 History          <tibble [5 × 57]> 
## 11 English          <tibble [8 × 57]> 
## 12 Sustainability   <tibble [4 × 57]>

#### Situation 2. Change the nested sub-tibbles to sub-lists
PseudoData_list <- as.list(PseudoData_nest)
PseudoData_list[[1]]

##  [1] "Nursing"          "Unknown"          "Public Health"    NA                
##  [5] "Statistics"       "Biology"          "Business"         "Computer Science"
##  [9] "Communications"   "History"          "English"          "Sustainability"

#### Situation 3. Change the sub-list object back to sub-tibble
PseudoData_tibble <- as_tibble(PseudoData_list)
PseudoData_tibble

## # A tibble: 12 × 2
##    Major            data              
##    <chr>            <list>            
##  1 Nursing          <tibble [11 × 57]>
##  2 Unknown          <tibble [12 × 57]>
##  3 Public Health    <tibble [6 × 57]> 
##  4 <NA>             <tibble [13 × 57]>
##  5 Statistics       <tibble [6 × 57]> 
##  6 Biology          <tibble [10 × 57]>
##  7 Business         <tibble [8 × 57]> 
##  8 Computer Science <tibble [9 × 57]> 
##  9 Communications   <tibble [8 × 57]> 
## 10 History          <tibble [5 × 57]> 
## 11 English          <tibble [8 × 57]> 
## 12 Sustainability   <tibble [4 × 57]>

#### Situation 4. Extract data from sub-tibble to main work tibble
PseudoData %>% nest(-StudentID) %>% hoist(data, Sex = "Gender", Age = "Age")

## # A tibble: 100 × 4
##    StudentID Sex      Age data             
##        <int> <chr>  <dbl> <list>           
##  1         1 Female    26 <tibble [1 × 55]>
##  2         2 Female    28 <tibble [1 × 55]>
##  3         3 Female    26 <tibble [1 × 55]>
##  4         4 Male      29 <tibble [1 × 55]>
##  5         5 Male      29 <tibble [1 × 55]>
##  6         6 Female    25 <tibble [1 × 55]>
##  7         7 Female    28 <tibble [1 × 55]>
##  8         8 Female    27 <tibble [1 × 55]>
##  9         9 Female    28 <tibble [1 × 55]>
## 10        10 Female    27 <tibble [1 × 55]>
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 5. Extract data from sub-tibbles to a wider format
PseudoData_nest %>% unnest_wider(data)

## # A tibble: 12 × 58
##    Major        Stude…¹   DoB   Age Gender  Race Ethni…² Grade…³ Grade…⁴ Grade…⁵
##    <chr>        <list<> <lis> <lis> <list> <lis> <list<> <list<> <list<> <list<>
##  1 Nursing         [11]  [11]  [11]   [11]  [11]    [11]    [11]    [11]    [11]
##  2 Unknown         [12]  [12]  [12]   [12]  [12]    [12]    [12]    [12]    [12]
##  3 Public Heal…     [6]   [6]   [6]    [6]   [6]     [6]     [6]     [6]     [6]
##  4 <NA>            [13]  [13]  [13]   [13]  [13]    [13]    [13]    [13]    [13]
##  5 Statistics       [6]   [6]   [6]    [6]   [6]     [6]     [6]     [6]     [6]
##  6 Biology         [10]  [10]  [10]   [10]  [10]    [10]    [10]    [10]    [10]
##  7 Business         [8]   [8]   [8]    [8]   [8]     [8]     [8]     [8]     [8]
##  8 Computer Sc…     [9]   [9]   [9]    [9]   [9]     [9]     [9]     [9]     [9]
##  9 Communicati…     [8]   [8]   [8]    [8]   [8]     [8]     [8]     [8]     [8]
## 10 History          [5]   [5]   [5]    [5]   [5]     [5]     [5]     [5]     [5]
## 11 English          [8]   [8]   [8]    [8]   [8]     [8]     [8]     [8]     [8]
## 12 Sustainabil…     [4]   [4]   [4]    [4]   [4]     [4]     [4]     [4]     [4]
## # … with 48 more variables: Grade_Art_Y1 <list<dbl>>,
## #   Grade_Sports_Y1 <list<dbl>>, Grade_Math_Y2 <list<dbl>>,
## #   Grade_Stats_Y2 <list<dbl>>, Grade_Engl_Y2 <list<dbl>>,
## #   Grade_Art_Y2 <list<dbl>>, Grade_Sports_Y2 <list<dbl>>,
## #   Grade_Math_Y3 <list<dbl>>, Grade_Stats_Y3 <list<dbl>>,
## #   Grade_Engl_Y3 <list<dbl>>, Grade_Art_Y3 <list<dbl>>,
## #   Grade_Sports_Y3 <list<dbl>>, Grade_Math_Y4 <list<dbl>>, …
## # ℹ Use `colnames()` to see all variable names

#### Situation 6. Extract data from sub-tibbles to a longer format
PseudoData_nest %>% unnest_longer(data)

## # A tibble: 100 × 2
##    Major   data$StudentID $DoB        $Age $Gender $Race $Ethn…¹ $Grad…² $Grad…³
##    <chr>            <int> <date>     <dbl> <chr>   <chr> <chr>     <dbl>   <dbl>
##  1 Nursing              1 1993-01-31    26 Female  Asian Non-Hi…      56      78
##  2 Nursing              7 1991-09-07    28 Female  Asian Non-Hi…      62      70
##  3 Nursing             40 1991-07-16    28 Female  Black Non-Hi…      NA      75
##  4 Nursing             46 1991-08-30    28 Male    Black Non-Hi…      53      77
##  5 Nursing             47 1993-03-20    26 Female  White Non-Hi…      76      76
##  6 Nursing             51 1992-01-13    27 Female  Unkn… Hispan…      60      77
##  7 Nursing             52 1994-03-30    25 Male    Asian Hispan…      66      91
##  8 Nursing             70 1991-10-27    28 Female  Asian Hispan…      71      83
##  9 Nursing             76 1993-08-19    26 Female  White Hispan…      50      59
## 10 Nursing             77 1994-09-20    25 Female  <NA>  Hispan…      68      86
## # … with 90 more rows, 49 more variables: data$Grade_Engl_Y1 <dbl>,
## #   $Grade_Art_Y1 <dbl>, $Grade_Sports_Y1 <dbl>, $Grade_Math_Y2 <dbl>,
## #   $Grade_Stats_Y2 <dbl>, $Grade_Engl_Y2 <dbl>, $Grade_Art_Y2 <dbl>,
## #   $Grade_Sports_Y2 <dbl>, $Grade_Math_Y3 <dbl>, $Grade_Stats_Y3 <dbl>,
## #   $Grade_Engl_Y3 <dbl>, $Grade_Art_Y3 <dbl>, $Grade_Sports_Y3 <dbl>,
## #   $Grade_Math_Y4 <dbl>, $Grade_Stats_Y4 <dbl>, $Grade_Engl_Y4 <dbl>,
## #   $Grade_Art_Y4 <dbl>, $Grade_Sports_Y4 <dbl>, $Grade_Math_Y1_Letter <chr>, …
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

#### Situation 7. Nest by factor, then model within each category
## use logistic regression as an example
library(broom)
PseudoData %>% 
  nest(-Major)  %>% 
  ## run model within each category (Major) and output as a sub-tibble object
  mutate(model = map(data,
                     ~ glm(Employed %in% "Yes" ~ Grade_Math_Y1,
                           data=.,
                           family="binomial")),
         ## tidy the model output as a sub-tibble object
         tidied = map(model, tidy)) %>%   
  ## unnest the tidy model output
  unnest(tidied) %>%
  ## report coefficient for specific term
  filter(term == "Grade_Math_Y1")           %>%
  mutate(fdr = p.adjust(p.value, method="fdr"))

## # A tibble: 12 × 9
##    Major          data     model term   estimate std.e…¹ statistic p.value   fdr
##    <chr>          <list>   <lis> <chr>     <dbl>   <dbl>     <dbl>   <dbl> <dbl>
##  1 Nursing        <tibble> <glm> Grad… -2.18e-16 5.16e+3 -4.22e-20   1     1    
##  2 Unknown        <tibble> <glm> Grad…  7.45e+ 0 1.02e+4  7.27e- 4   0.999 1    
##  3 Public Health  <tibble> <glm> Grad… -9.44e- 2 9.37e-2 -1.01e+ 0   0.314 0.915
##  4 <NA>           <tibble> <glm> Grad…  4.54e- 2 7.78e-2  5.84e- 1   0.559 1    
##  5 Statistics     <tibble> <glm> Grad… -4.12e-16 5.20e+3 -7.92e-20   1     1    
##  6 Biology        <tibble> <glm> Grad…  1.53e- 1 1.11e-1  1.38e+ 0   0.167 0.915
##  7 Business       <tibble> <glm> Grad… -2.09e- 1 2.39e-1 -8.76e- 1   0.381 0.915
##  8 Computer Scie… <tibble> <glm> Grad… -8.01e- 2 6.78e-2 -1.18e+ 0   0.238 0.915
##  9 Communications <tibble> <glm> Grad… -2.43e- 2 1.10e-1 -2.22e- 1   0.825 1    
## 10 History        <tibble> <glm> Grad…  4.68e+ 0 1.05e+4  4.47e- 4   1.00  1    
## 11 English        <tibble> <glm> Grad…  4.49e- 2 1.01e-1  4.44e- 1   0.657 1    
## 12 Sustainability <tibble> <glm> Grad…  1.20e- 1 1.33e-1  9.03e- 1   0.366 0.915
## # … with abbreviated variable name ¹std.error

## use linear regression as an example
PseudoData %>% 
  select(Major, Grade_Art_Y1, Grade_Math_Y1) %>%                                
  nest(-Major)  %>%                                                            
  mutate(model = map(data,~ lm(Grade_Art_Y1 ~ Grade_Math_Y1, data=.)),         
         ## glance report evaluation of each regression model, can also try tidy
         glance = map(model, glance)) %>%                               
  unnest(glance)

## # A tibble: 12 × 15
##    Major data     model r.squ…¹ adj.r…² sigma stati…³ p.value    df logLik   AIC
##    <chr> <list>   <lis>   <dbl>   <dbl> <dbl>   <dbl>   <dbl> <dbl>  <dbl> <dbl>
##  1 Nurs… <tibble> <lm>  1.14e-3 -0.124   8.67 0.00912  0.926      1  -34.7  75.3
##  2 Unkn… <tibble> <lm>  8.88e-3 -0.0902  8.98 0.0896   0.771      1  -42.3  90.6
##  3 Publ… <tibble> <lm>  4.01e-2 -0.200   6.37 0.167    0.704      1  -18.4  42.8
##  4 <NA>  <tibble> <lm>  4.75e-1  0.422   6.56 9.03     0.0132     1  -38.5  83.0
##  5 Stat… <tibble> <lm>  1.46e-1 -0.0675  5.40 0.684    0.455      1  -17.4  40.8
##  6 Biol… <tibble> <lm>  8.50e-4 -0.124   9.67 0.00680  0.936      1  -35.8  77.5
##  7 Busi… <tibble> <lm>  9.34e-3 -0.189   8.84 0.0472   0.837      1  -24.0  54.0
##  8 Comp… <tibble> <lm>  8.24e-2 -0.0487  6.18 0.629    0.454      1  -28.0  62.1
##  9 Comm… <tibble> <lm>  9.33e-2 -0.0880  4.96 0.515    0.505      1  -20.0  45.9
## 10 Hist… <tibble> <lm>  5.90e-1  0.385   4.83 2.88     0.232      1  -10.6  27.2
## 11 Engl… <tibble> <lm>  5.00e-2 -0.140   6.79 0.263    0.630      1  -22.2  50.3
## 12 Sust… <tibble> <lm>  4.37e-1  0.155   6.65 1.55     0.339      1  -11.9  29.7
## # … with 4 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>,
## #   nobs <int>, and abbreviated variable names ¹r.squared, ²adj.r.squared,
## #   ³statistic
## # ℹ Use `colnames()` to see all variable names

#### Situation 8. map over list of dataframes or tibbles
PseudoData %>% 
  nest(-Major) %>% 
  ## single variable
  mutate(nrow = unlist(map(data, ~ nrow(.x))),
         ncol = unlist(map(data, ~ ncol(.x))),
         dim = unlist(map2(Major, data, ~ paste0((.x), " data has a dimension of", nrow(.y), " * ", ncol(.y), "."))))

## # A tibble: 12 × 5
##    Major            data                nrow  ncol dim                          
##    <chr>            <list>             <int> <int> <chr>                        
##  1 Nursing          <tibble [11 × 57]>    11    57 Nursing data has a dimension…
##  2 Unknown          <tibble [12 × 57]>    12    57 Unknown data has a dimension…
##  3 Public Health    <tibble [6 × 57]>      6    57 Public Health data has a dim…
##  4 <NA>             <tibble [13 × 57]>    13    57 NA data has a dimension of13…
##  5 Statistics       <tibble [6 × 57]>      6    57 Statistics data has a dimens…
##  6 Biology          <tibble [10 × 57]>    10    57 Biology data has a dimension…
##  7 Business         <tibble [8 × 57]>      8    57 Business data has a dimensio…
##  8 Computer Science <tibble [9 × 57]>      9    57 Computer Science data has a …
##  9 Communications   <tibble [8 × 57]>      8    57 Communications data has a di…
## 10 History          <tibble [5 × 57]>      5    57 History data has a dimension…
## 11 English          <tibble [8 × 57]>      8    57 English data has a dimension…
## 12 Sustainability   <tibble [4 × 57]>      4    57 Sustainability data has a di…

5.10 Data Ops transform data from wide to long

PseudoData_Grade <- PseudoData[,str_detect(colnames(PseudoData), "ID|Letter")]

#### Method 1. Use melt() from data.table package
TempData_Long1 <- as.data.frame(
  data.table::melt(data.table::setDT(PseudoData_Grade), 
                   measure = patterns("Grade_Math_",
                                      "Grade_Stats_",
                                      "Grade_Engl_",
                                      "Grade_Art_",
                                      "Grade_Sports_"),
                   variable.name = 'Term',
                   value.name = c("Grade_Math",
                                  "Grade_Stats",
                                  "Grade_Engl",
                                  "Grade_Art",
                                  "Grade_Sports")))


#### Method 2. Use reshape() from base
## note, reshape only works with data frames 
PseudoData_Grade_DF <- as.data.frame(PseudoData_Grade)
TempData_long2 <- reshape(PseudoData_Grade_DF,
                          varying=colnames(PseudoData_Grade)[-1], 
                          idvar="StudentID", direction="long", sep="Y")


#### Method 3. Use pivot_longer from dplyr package
TempData_Long3 <- PseudoData_Grade %>%                                        
  select(StudentID,starts_with("Grade_Math")) %>%                                
  pivot_longer(-StudentID, names_to = "Class", values_to = "Grade") 


####  Method 4. Use gather 
TempData_Grade_Gathered <- PseudoData_Grade %>% 
  gather(key = "Term", value = "Grade",                                         
         contains("Letter"))

5.11 Data Ops transform data from long to wide

#### Method 1. Use reshape() from base
## first, sort the data (often repeated data), use TempData_Long1 as an example, which is already sorted 
head(TempData_Long1)

##   StudentID Term Grade_Math Grade_Stats Grade_Engl Grade_Art Grade_Sports
## 1         1    1          F           C          B         B            C
## 2         2    1          D           D          A         B            C
## 3         3    1          F           F          B         C            A
## 4         4    1          D           C          C         D            D
## 5         5    1          B           B          B      <NA>            B
## 6         6    1          B           B          F         D            C

## second, for each unique ID, give an order variable, like occurrence 
TempData_Long1$Order <- with(TempData_Long1, ave(StudentID, StudentID, FUN = seq_along))
## lastly, reshape it to wide 
PseudoData_Wide1 <- reshape(TempData_Long1, 
                            timevar = "Order",
                            idvar = "StudentID",
                            direction = "wide",
                            sep = "_")


#### Method 2. Use pivot_wider from dplyr
TempData_Wide2 <- 
  TempData_Long3 %>% 
  pivot_wider(names_from = "Class", values_from = "Grade", 
              values_fill = list(n = 0))


#### Method 3. Use spread 
TempData_Wide3 <- 
  TempData_Grade_Gathered %>% 
  spread(key = "Term",
         value = "Grade",
         # fill in the missing data to a specific string of your choice (can also be NA) 
         fill = "Missing",
         # automatically convert the data type
         convert = T)

5.12 Data Ops cut continuous data

#### Situation 1. Cut based on specific cutpoints
PseudoData %>% 
  mutate(Grade_Math_Y1_level = cut(Grade_Math_Y1,  
                                   breaks = c(-Inf,70,80,90,Inf),
                                   include.lowest = T, 
                                   right=T)) %>% 
  count(Grade_Math_Y1_level)

## # A tibble: 5 × 2
##   Grade_Math_Y1_level     n
##   <fct>               <int>
## 1 [-Inf,70]              49
## 2 (70,80]                29
## 3 (80,90]                19
## 4 (90, Inf]               1
## 5 <NA>                    2

#### Situation 2. Cut based on proportion (quantile)
PseudoData %>% 
  mutate(Grade_Math_Y1_Tertile = 
           cut(Grade_Math_Y1,  
               breaks = quantile(Grade_Math_Y1, 
                                 probs = seq(0, 1, 1/3), 
                                 na.rm = T),
               labels=c("T1","T2","T3"),
               include.lowest = T, right=T)) %>% 
  count(Grade_Math_Y1_Tertile)

## # A tibble: 4 × 2
##   Grade_Math_Y1_Tertile     n
##   <fct>                 <int>
## 1 T1                       34
## 2 T2                       33
## 3 T3                       31
## 4 <NA>                      2

5.13 Data Ops work with factor data

#### Situation 1. reorder factor by frequency
## reorder by frequency
PseudoData %>% 
  mutate(Major = fct_infreq(Major)) %>% 
  count(Major) %>% 
  arrange(Major)

## # A tibble: 12 × 2
##    Major                n
##    <fct>            <int>
##  1 Unknown             12
##  2 Nursing             11
##  3 Biology             10
##  4 Computer Science     9
##  5 Business             8
##  6 Communications       8
##  7 English              8
##  8 Public Health        6
##  9 Statistics           6
## 10 History              5
## 11 Sustainability       4
## 12 <NA>                13

## reorder by frequency after count
PseudoData %>% 
  count(Major) %>% 
  mutate(Major = fct_reorder(Major, -n)) %>% 
  arrange(Major)

## # A tibble: 12 × 2
##    Major                n
##    <fct>            <int>
##  1 Unknown             12
##  2 Nursing             11
##  3 Biology             10
##  4 Computer Science     9
##  5 Business             8
##  6 Communications       8
##  7 English              8
##  8 Public Health        6
##  9 Statistics           6
## 10 History              5
## 11 Sustainability       4
## 12 <NA>                13

#### Situation 2. reorder by specific column
## order identifier
# decreasing order, try -Grade_Math_Y1
PseudoData %>% 
  mutate(StudentID = fct_reorder(factor(StudentID), Grade_Math_Y1)) %>% 
  arrange(StudentID) %>% 
  select(StudentID, Grade_Math_Y1)

## # A tibble: 100 × 2
##    StudentID Grade_Math_Y1
##    <fct>             <dbl>
##  1 14                   42
##  2 3                    46
##  3 76                   50
##  4 39                   53
##  5 46                   53
##  6 24                   54
##  7 25                   55
##  8 1                    56
##  9 9                    56
## 10 29                   56
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## order categories (by median, also try mean, min, max, first, last, last2)
# decreasing order, try -Grade_Math_Y1
PseudoData %>% 
  mutate(Major = fct_reorder(factor(Major), Grade_Math_Y1, median, na.rm = T)) %>% 
  group_by(Major) %>% 
  summarise(med = median(Grade_Math_Y1, na.rm = T)) %>% 
  arrange(Major)

## # A tibble: 12 × 2
##    Major              med
##    <fct>            <dbl>
##  1 Nursing           64  
##  2 Sustainability    66.5
##  3 Communications    68  
##  4 English           69  
##  5 Unknown           69.5
##  6 Statistics        70  
##  7 Public Health     72  
##  8 Business          72.5
##  9 History           73  
## 10 Computer Science  74  
## 11 Biology           75.5
## 12 <NA>              75

#### Situation 3. reverse factor order
PseudoData %>% 
  transmute(Major = fct_infreq(Major)) %>%  
  mutate(Major = fct_rev(Major)) %>% 
  count(Major)%>% 
  arrange(Major)

## # A tibble: 12 × 2
##    Major                n
##    <fct>            <int>
##  1 Sustainability       4
##  2 History              5
##  3 Statistics           6
##  4 Public Health        6
##  5 English              8
##  6 Communications       8
##  7 Business             8
##  8 Computer Science     9
##  9 Biology             10
## 10 Nursing             11
## 11 Unknown             12
## 12 <NA>                13

#### Situation 4. lump factor
## lump based on number of levels
# lump non-NA categories that are not the top 4 frequent
PseudoData %>% 
  mutate(Major = fct_lump(Major, n = 4)) %>% 
  count(Major)

## # A tibble: 6 × 2
##   Major                n
##   <fct>            <int>
## 1 Biology             10
## 2 Computer Science     9
## 3 Nursing             11
## 4 Unknown             12
## 5 Other               45
## 6 <NA>                13

## lump based on percentage 
# lump non-NA categories that are less than or equal to 8%
PseudoData %>% 
  mutate(Major = fct_lump(Major, prop = 0.08)) %>% 
  count(Major)

## # A tibble: 6 × 2
##   Major                n
##   <fct>            <int>
## 1 Biology             10
## 2 Computer Science     9
## 3 Nursing             11
## 4 Unknown             12
## 5 Other               45
## 6 <NA>                13

## lump with specific rule
PseudoData %>% 
  select(Grade_Art_Y1_Letter) %>% 
  mutate(Grade_Art_Y1_Letter2 = fct_collapse(                               
    Grade_Art_Y1_Letter,
    "Grade A" = "A",
    "Grade B" = "B",
    "Grade C, D, F" = c("C","D","F"))) %>% 
  count(Grade_Art_Y1_Letter, Grade_Art_Y1_Letter2)

## # A tibble: 6 × 3
##   Grade_Art_Y1_Letter Grade_Art_Y1_Letter2     n
##   <chr>               <fct>                <int>
## 1 A                   Grade A                  4
## 2 B                   Grade B                 19
## 3 C                   Grade C, D, F           40
## 4 D                   Grade C, D, F           31
## 5 F                   Grade C, D, F            2
## 6 <NA>                <NA>                     4

#### Situation 5. relevel - specify reference or highest level
## specify reference
PseudoData %>% 
  transmute(Race = factor(Race)) %>% 
  mutate(Race = fct_relevel(Race, "Multiple")) %>%
  count(Race) %>% 
  arrange(Race)

## # A tibble: 6 × 2
##   Race         n
##   <fct>    <int>
## 1 Multiple    15
## 2 Asian       19
## 3 Black       20
## 4 Unknown     11
## 5 White       13
## 6 <NA>        22

## specify first few levels 
PseudoData %>% 
  transmute(Race = factor(Race)) %>% 
  mutate(Race = fct_relevel(Race, c("Multiple", "Unknown"))) %>%
  count(Race) %>% 
  arrange(Race)

## # A tibble: 6 × 2
##   Race         n
##   <fct>    <int>
## 1 Multiple    15
## 2 Unknown     11
## 3 Asian       19
## 4 Black       20
## 5 White       13
## 6 <NA>        22

## specify highest level
PseudoData %>% 
  transmute(Race = factor(Race)) %>% 
  mutate(Race = fct_relevel(Race, "Multiple", after = Inf)) %>% 
  count(Race) %>% 
  arrange(Race)

## # A tibble: 6 × 2
##   Race         n
##   <fct>    <int>
## 1 Asian       19
## 2 Black       20
## 3 Unknown     11
## 4 White       13
## 5 Multiple    15
## 6 <NA>        22

#### Situation 6. rename specific level
PseudoData %>%
  transmute(Race = fct_recode(Race,   "Unknown_newname" = "Unknown")) %>% 
  count(Race)

## # A tibble: 6 × 2
##   Race                n
##   <fct>           <int>
## 1 Asian              19
## 2 Black              20
## 3 Multiple           15
## 4 Unknown_newname    11
## 5 White              13
## 6 <NA>               22

5.14 Data Ops work with non-standard numeric data

Sometimes continuous data like currency is not stored in a numeric format but often in a character format and requires quick cleaning to get it ready

#### Method 1. parse_number()
readr::parse_number("$1000,000")

## [1] 1e+06

readr::parse_number("$10,000,000")

## [1] 1e+07

readr::parse_number("1.050$")

## [1] 1.05

6 Text Operations (Text Ops)

Many operations can be found in the stirngr package and this document.

6.1 Text Ops regular expression - some examples

#### . - anything
str_detect("Apple", ".")

## [1] TRUE

#### ^ - start of the string
str_detect("Apple", "^A")

## [1] TRUE

#### $ - end of the string
str_detect("Apple", "e$")

## [1] TRUE

#### \w - any letter (also works with +, any or more letter)
str_detect("Apple", "\\w")

## [1] TRUE

#### \d - any digit (also works with +, any or more digit)
str_detect("Apple", "\\d")

## [1] FALSE

#### \s - any white space 
str_detect("Apple ", "\\s")

## [1] TRUE

#### [abcABC] - any abc, or capital ABC
str_detect("Apple", "[abcABC]")

## [1] TRUE

#### [a-zA-Z] - any lower or upper case English character
str_detect("5", "[a-zA-Z]")

## [1] FALSE

#### ab|ac - either ab or ac
str_detect("Apple", "ab|ac")

## [1] FALSE

#### App or ppp 
str_detect("Apple", "(A|p)pp")

## [1] TRUE

#### one or more p
str_detect("Apple", "p+")

## [1] TRUE

#### zero or more p
str_detect("Apple", "p*")

## [1] TRUE

#### anything starting with p
str_extract("Apple", "p.*")

## [1] "pple"

#### anything ending with p
str_extract("Apple", ".*p")

## [1] "App"

6.2 Text Ops regular expression - some helpful package or sites

# Try regexplain by Garrick Aiden-Buie
# install.packages("remotes")
# remotes::install_github("gadenbuie/regexplain")
# regexplain::regexplain_cheatsheet()
# regexplain::regexplain_gadget()
# regexplain::regexplain_web()
# PseudoData %>% select(Fav_Jane) %>% regexplain::regexplain_gadget()

6.3 Text Ops text operations

#### Operation 1. case change 
Hmisc:: capitalize(c("zip code", "state", "final count"))

## [1] "Zip code"    "State"       "Final count"

stringr:: str_to_title(c("zip code", "state", "final count"))

## [1] "Zip Code"    "State"       "Final Count"

tolower("Zip Code")

## [1] "zip code"

toupper("Zip Code")

## [1] "ZIP CODE"

#### Operation 2. detect
stringr::str_detect("Stringtest", "^String")

## [1] TRUE

#### Operation 3. remove
stringr::str_remove("Stringtest", "test")

## [1] "String"

str_remove(names(PseudoData), "\\d+")

##  [1] "StudentID"             "DoB"                   "Age"                  
##  [4] "Gender"                "Race"                  "Ethnicity"            
##  [7] "Grade_Math_Y"          "Grade_Stats_Y"         "Grade_Engl_Y"         
## [10] "Grade_Art_Y"           "Grade_Sports_Y"        "Grade_Math_Y"         
## [13] "Grade_Stats_Y"         "Grade_Engl_Y"          "Grade_Art_Y"          
## [16] "Grade_Sports_Y"        "Grade_Math_Y"          "Grade_Stats_Y"        
## [19] "Grade_Engl_Y"          "Grade_Art_Y"           "Grade_Sports_Y"       
## [22] "Grade_Math_Y"          "Grade_Stats_Y"         "Grade_Engl_Y"         
## [25] "Grade_Art_Y"           "Grade_Sports_Y"        "Grade_Math_Y_Letter"  
## [28] "Grade_Stats_Y_Letter"  "Grade_Engl_Y_Letter"   "Grade_Art_Y_Letter"   
## [31] "Grade_Sports_Y_Letter" "Grade_Math_Y_Letter"   "Grade_Stats_Y_Letter" 
## [34] "Grade_Engl_Y_Letter"   "Grade_Art_Y_Letter"    "Grade_Sports_Y_Letter"
## [37] "Grade_Math_Y_Letter"   "Grade_Stats_Y_Letter"  "Grade_Engl_Y_Letter"  
## [40] "Grade_Art_Y_Letter"    "Grade_Sports_Y_Letter" "Grade_Math_Y_Letter"  
## [43] "Grade_Stats_Y_Letter"  "Grade_Engl_Y_Letter"   "Grade_Art_Y_Letter"   
## [46] "Grade_Sports_Y_Letter" "Major"                 "Scholarship"          
## [49] "Schol_Amount"          "ProjectedEarning"      "JobHuntDate"          
## [52] "Employed"              "EmployDate"            "DayEmploy"            
## [55] "Fav_Jane"              "Longitude"             "Latitude"             
## [58] "State"

#### Operation 4. substitute 
gsub("@gmail.com", "@hotmail","SomeEmail@gmail.com")

## [1] "SomeEmail@hotmail"

#### Operation 5. concatenation 
## use str_c - works well with both string entries and/or string columns
PseudoData %>% 
  mutate(test = str_c("prefix", 
                      PseudoData$Grade_Art_Y1_Letter,
                      PseudoData$Grade_Art_Y2_Letter, 
                      "suffix", 
                      sep = "-")) %>%
  select(test)

## # A tibble: 100 × 1
##    test             
##    <chr>            
##  1 prefix-B-B-suffix
##  2 prefix-B-B-suffix
##  3 prefix-C-C-suffix
##  4 prefix-D-C-suffix
##  5 <NA>             
##  6 prefix-D-D-suffix
##  7 prefix-B-B-suffix
##  8 prefix-D-C-suffix
##  9 prefix-D-D-suffix
## 10 prefix-D-F-suffix
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## use unite - only works with columns operations, does not work with string entries
PseudoData %>% 
  unite(united, 
        Grade_Art_Y1_Letter, Grade_Art_Y2_Letter,  
        sep="_") %>% 
  select(united)

## # A tibble: 100 × 1
##    united
##    <chr> 
##  1 B_B   
##  2 B_B   
##  3 C_C   
##  4 D_C   
##  5 NA_F  
##  6 D_D   
##  7 B_B   
##  8 D_C   
##  9 D_D   
## 10 D_F   
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

## use glue - a flexible way of concatenate strings/texts
PseudoData %>% 
  # filter(!Grade_Art_Y1_Letter %in% NA, !Grade_Art_Y2_Letter %in% NA) %>% 
  mutate(Art = glue::glue("Year 1: { Grade_Art_Y1_Letter } Year 2: { Grade_Art_Y2_Letter }  Year 3: { Grade_Art_Y3_Letter } Year 4: { Grade_Art_Y4_Letter }")) %>% 
  count(Art)

## # A tibble: 45 × 2
##    Art                                          n
##    <glue>                                   <int>
##  1 Year 1: A Year 2: A  Year 3: A Year 4: A     3
##  2 Year 1: A Year 2: A  Year 3: B Year 4: C     1
##  3 Year 1: B Year 2: A  Year 3: A Year 4: A     2
##  4 Year 1: B Year 2: B  Year 3: A Year 4: A     3
##  5 Year 1: B Year 2: B  Year 3: A Year 4: B     1
##  6 Year 1: B Year 2: B  Year 3: B Year 4: A     4
##  7 Year 1: B Year 2: B  Year 3: B Year 4: B     2
##  8 Year 1: B Year 2: B  Year 3: B Year 4: C     3
##  9 Year 1: B Year 2: B  Year 3: C Year 4: B     1
## 10 Year 1: B Year 2: B  Year 3: C Year 4: C     1
## # … with 35 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Operation 6. subset string
## based on locations
str_sub("This is a long sentence as an example.", 5, 20)

## [1] " is a long sente"

## based on pattern
# example 1 - lower vs upper cases
data.frame(x=c("aaaAaa","bbAbb")) %>% 
  extract(x,
          c("lower","Upperlower"),    
          "^([a-z]+)([A-Z].*)")

##   lower Upperlower
## 1   aaa        Aaa
## 2    bb        Abb

# example 2 - Text + number 
data.frame(x=c("HPV5_,HPyV6_","HPV5_,HPyV9_", "HPV99_,HPyV100_")) %>% 
  extract(x, 
          c("HPV","HPyV"),     
          "HPV(\\d+)_,HPyV(\\d+)_")

##   HPV HPyV
## 1   5    6
## 2   5    9
## 3  99  100

#### Operation 7. trim and add white space
str_trim("   xxx   ", side="both")

## [1] "xxx"

str_pad("xxx", width = 20, side="both")

## [1] "        xxx         "

#### Operation 8. use snakecase to make sense of string
library(snakecase)
strings <- c("this Is a Strange_string", "AND THIS ANOTHER_One", NA)

to_snake_case(strings)

## [1] "this_is_a_strange_string" "and_this_another_one"    
## [3] NA

to_lower_camel_case(strings)

## [1] "thisIsAStrangeString" "andThisAnotherOne"    NA

to_upper_camel_case(strings)

## [1] "ThisIsAStrangeString" "AndThisAnotherOne"    NA

to_screaming_snake_case(strings)

## [1] "THIS_IS_A_STRANGE_STRING" "AND_THIS_ANOTHER_ONE"    
## [3] NA

to_lower_upper_case(strings)

## [1] "thisISaSTRANGEstring" "andTHISanotherONE"    NA

to_upper_lower_case(strings)

## [1] "THISisAstrangeSTRING" "ANDthisANOTHERone"    NA

to_parsed_case(strings)

## [1] "this_Is_a_Strange_string" "AND_THIS_ANOTHER_One"    
## [3] NA

to_mixed_case(strings)

## [1] "this_Is_a_Strange_string" "And_This_Another_One"    
## [3] NA

to_swap_case(strings)

## [1] "THIS iS A sTRANGE_STRING" "and this another_oNE"    
## [3] NA

to_sentence_case(strings)

## [1] "This is a strange string" "And this another one"    
## [3] NA

to_random_case(strings)

## [1] "thiS Is a sTRANge_sTRinG" "And THis AnOTheR_ONE"    
## [3] "NA"

to_title_case(strings)

## [1] "This is a Strange String" "And this Another One"    
## [3] NA

6.4 Text Ops duplicated strings in multiple columns

library(tuple)

#### Situation 1. Check if substring in both columns
## base version
# Data$Output <- 
#  ifelse(Data$A_Column %in% c(NA, "") | B_Column %in% c(NA, ""), NA, 
#         ifelse(unlist(lapply(lapply(lapply(strsplit(paste(sapply(str_extract_all(Data$A_Column,Specific_Strings),paste,collapse=","),
#                                                           sapply(str_extract_all(Data$B_Column,Specific_Strings),paste,collapse=","),sep=","),",",fixed=TRUE),
#                                            function(x){x[!x==""]}), function(x){
#                                             tuplicated(x,2)
#                                            }),any)), 1, NA))

## function version 
Concor_F <- function(Col_1,Col_2,Specific_Strings){     
  attach(data)
  DupColumn <- gsub("NA","",paste(                                              
    sapply(str_extract_all(Col_1,Specific_Strings),paste,collapse=","),
    sapply(str_extract_all(Col_2,Specific_Strings),paste,collapse=","), sep=","))
  Split <-strsplit(DupColumn,",",fixed=TRUE)
  NoneEmptySplit <- lapply(Split,function(x){x[!x==""]})
  Any_Duplicated <- unlist(lapply(lapply(NoneEmptySplit, function(x){
    tuplicated(x,2)
  }),any))
  
  Concor <- ifelse(Any_Duplicated %in% T, "Concordant","Non-concordant")
  Concor <- factor(Concor, levels=c("Non-concordant", "Concordant"))
  
  detach(data)
  
  return(Concor)
}


#### Situtaion 2. Count duplicates across three columns
A_B_C <- function(A_Column, B_Column, C_Column,Specific_Strings){               
  #Specific_String could be a single name or multiple names separated by '|'
  #e.g. "Apple|Banana|Orange"
  DupColumn <- gsub("NA","",paste(                                              
    sapply(str_extract_all(A_Column,Specific_Strings),paste,collapse=","),
    sapply(str_extract_all(B_Column,Specific_Strings),paste,collapse=","),
    sapply(str_extract_all(C_Column,Specific_Strings),paste,collapse=","), sep=","))
  Split <-strsplit(DupColumn,",",fixed=TRUE)
  NoneEmptySplit <- lapply(Split,function(x){x[!x==""]})
  Any_Duplicated <- unlist(lapply(lapply(NoneEmptySplit, function(x){
    tuplicated(x,2)
  }),any))
  Any_Triplicated <- unlist(lapply(lapply(NoneEmptySplit, function(x){
    tuplicated(x,3)
  }),any))
  ifelse(Any_Triplicated %in% T,3,
         ifelse(Any_Duplicated %in% T,2,
                ifelse(str_detect(A_Column,Specific_Strings)| 
                         str_detect(B_Column,Specific_Strings)|
                         str_detect(C_Column,Specific_Strings),1,0)))
}

6.5 Text Ops tokenization and pairwise correlation

More details about tokenization, pairwise correlation, topic modeling, and other text analysis topics can be found in Text Mining with R- a Tidy Approach by Julia Silge and David Robinson.

Here we show a quick example of how to tokenize text data and run a quick pairwise correlation analysis.

library(tidytext)
library(widyr)
library(ggraph)
library(igraph)

#### Step 1. tokenization 
TokenData <- 
  PseudoData %>% select(StudentID, Grade_Math_Y1, Fav_Jane) %>%
  #Tokenize string column Fav_Jane
  unnest_tokens(word, Fav_Jane) %>%    
  #Remove stop words by anti join the original word column with a stop words data
  anti_join(stop_words, by = "word") %>%        
  #Manually remove Spanish stop words
  #and keep words with at least 1 English character
  filter(!(word %in% c("de", "en", "la", "para")),                              
         str_detect(word, "[a-z]"))   

#### Step 2. summarize one covariable by each word
Summa_Token <- TokenData %>%                                                    
  group_by(word) %>%                                                            
  #Calculate summarized data corresponding to each word as covariable
  summarize(med_math = median(Grade_Math_Y1, na.rm = T),
            occurences = n()) 

#### Step 3. calculate pairwise correlation based on how often they appear together
TopWordCor <- TokenData %>% 
  select(StudentID, word) %>% 
  #Analyze correlation between words, what words always appear together
  widyr::pairwise_cor(word, StudentID, sort=T) %>%                              
  head(100) 

#### Step 4. create a vertices data for graphing word correlations
vertices <- 
  Summa_Token %>% 
  filter(word %in% TopWordCor$item1 |                 
           word %in% TopWordCor$item2)

#### Step 5. plot correlation between words
TopWordCor %>%                                                                 
  graph_from_data_frame(vertices = vertices) %>% 
  ggraph() + 
  geom_edge_link() +
  geom_node_point(aes(size = occurences, color = med_math)) +                   
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_color_gradient2(low = "blue", high = "red", midpoint = 70,  
                        labels = scales::comma_format())

#### Step 6. cast the tokenized data into a sparse data matrix for topic modeling
# TokenData %>% cast_sparse(StudentID, word)

#### Step 7. TF-IDF 
## how often a word appears in one set as compared to whether the word appears 
## in other documents. 
tf_idf_example <- 
  PseudoData %>% 
  select(Gender, Fav_Jane) %>% 
  #Tokenize string column Fav_Jane
  unnest_tokens(word, Fav_Jane) %>%    
  #Remove stop words by anti join the original word column with a stop words data
  anti_join(stop_words, by = "word") %>% 
  count(Gender, word) %>% 
  bind_tf_idf(word, Gender, n) %>% 
  arrange(desc(tf_idf))
  

#### Step 8. log-odds 
## how often a word appears in one set as compared to how often it appears in all 
## other documents.
## The main difference between log-odds and TF-IDF is that the log-odds evaluate 
## the frequency it appeared in other documents.  
log_odds_example <- 
  PseudoData %>% 
  select(Gender, Fav_Jane) %>% 
  #Tokenize string column Fav_Jane
  unnest_tokens(word, Fav_Jane) %>%    
  #Remove stop words by anti join the original word column with a stop words data
  anti_join(stop_words, by = "word") %>% 
  count(Gender, word) %>% 
  ## notice the order is different
  tidylo::bind_log_odds(Gender, word, n) %>% 
  arrange(desc(log_odds_weighted))

7 Date and Time Operations (Date Time)

7.1 Date Time lubridate package and related functions

#### Situation 1. character to date 
"1990-02-02" %>% str()

##  chr "1990-02-02"

ymd("1990-02-02") %>% str()

##  Date[1:1], format: "1990-02-02"

#### Situation 2. get day, month, year from a Date variable 
PseudoData %>% 
  transmute(DoB=DoB,
            year = year(DoB), month = month(DoB), day = day(DoB)) %>%
  ## form a Date variable from columns of year, month and day
  ## if time is needed, use make_datetime
  mutate(DoB2 = make_date(year = year, month = month, day = day))

## # A tibble: 100 × 5
##    DoB         year month   day DoB2      
##    <date>     <dbl> <dbl> <int> <date>    
##  1 1993-01-31  1993     1    31 1993-01-31
##  2 1991-01-04  1991     1     4 1991-01-04
##  3 1993-02-28  1993     2    28 1993-02-28
##  4 1990-11-28  1990    11    28 1990-11-28
##  5 1990-07-05  1990     7     5 1990-07-05
##  6 1994-02-04  1994     2     4 1994-02-04
##  7 1991-09-07  1991     9     7 1991-09-07
##  8 1992-07-26  1992     7    26 1992-07-26
##  9 1991-11-04  1991    11     4 1991-11-04
## 10 1992-10-10  1992    10    10 1992-10-10
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows

#### Situation 3. get week day
PseudoData %>% mutate(weekday = wday(DoB, label = TRUE)) %>% count(weekday)

## # A tibble: 7 × 2
##   weekday     n
##   <ord>   <int>
## 1 Sun        17
## 2 Mon        14
## 3 Tue        18
## 4 Wed         7
## 5 Thu        16
## 6 Fri        20
## 7 Sat         8

#### Situation 4. get first day of week, month, quarter, year and decades
## this is particularly useful when trying to plot weekly, monthly, quarterly, yearly or decades data
PseudoData %>% mutate(weekly = floor_date(DoB, "week", week_start = 1)) %>% count(weekly)

## # A tibble: 83 × 2
##    weekly         n
##    <date>     <int>
##  1 1990-02-26     1
##  2 1990-03-12     1
##  3 1990-07-02     1
##  4 1990-08-13     1
##  5 1990-09-17     1
##  6 1990-09-24     2
##  7 1990-10-15     1
##  8 1990-11-19     1
##  9 1990-11-26     1
## 10 1990-12-24     1
## # … with 73 more rows
## # ℹ Use `print(n = ...)` to see more rows

PseudoData %>% mutate(monthly = floor_date(DoB, "month")) %>% count(monthly)

## # A tibble: 48 × 2
##    monthly        n
##    <date>     <int>
##  1 1990-02-01     1
##  2 1990-03-01     1
##  3 1990-07-01     1
##  4 1990-08-01     1
##  5 1990-09-01     3
##  6 1990-10-01     1
##  7 1990-11-01     2
##  8 1990-12-01     1
##  9 1991-01-01     3
## 10 1991-05-01     2
## # … with 38 more rows
## # ℹ Use `print(n = ...)` to see more rows

PseudoData %>% mutate(quarterly = floor_date(DoB, "quarter", week_start = 1)) %>% count(quarterly)

## # A tibble: 19 × 2
##    quarterly      n
##    <date>     <int>
##  1 1990-01-01     2
##  2 1990-07-01     5
##  3 1990-10-01     4
##  4 1991-01-01     3
##  5 1991-04-01     4
##  6 1991-07-01     9
##  7 1991-10-01     6
##  8 1992-01-01     9
##  9 1992-04-01     1
## 10 1992-07-01     5
## 11 1992-10-01     6
## 12 1993-01-01     8
## 13 1993-04-01     5
## 14 1993-07-01     7
## 15 1993-10-01     7
## 16 1994-01-01     7
## 17 1994-04-01     4
## 18 1994-07-01     3
## 19 1994-10-01     5

PseudoData %>% mutate(yearly = floor_date(DoB, "year", week_start = 1)) %>% count(yearly)

## # A tibble: 5 × 2
##   yearly         n
##   <date>     <int>
## 1 1990-01-01    11
## 2 1991-01-01    22
## 3 1992-01-01    21
## 4 1993-01-01    27
## 5 1994-01-01    19

PseudoData %>% mutate(year = year(DoB), decade = (year(DoB)%/%10)*10) %>% count(decade)

## # A tibble: 1 × 2
##   decade     n
##    <dbl> <int>
## 1   1990   100

#### Situation 5. character and numeric months
PseudoData %>% 
  transmute(month_chr = month(DoB, label = T, abbr = F),
            month_num = as.integer(factor(month_chr, levels = month.name))) %>% 
  count(month_chr, month_num)

## # A tibble: 12 × 3
##    month_chr month_num     n
##    <ord>         <int> <int>
##  1 January           1    10
##  2 February          2    11
##  3 March             3     8
##  4 April             4     2
##  5 May               5     6
##  6 June              6     6
##  7 July              7     8
##  8 August            8     7
##  9 September         9    14
## 10 October          10     8
## 11 November         11    10
## 12 December         12    10

8 Plotting Operations (Plot Ops)

It is crucial to use the most appropriate plot to present the data of interest. If unsure about what plot schema to choose, see this site for ideas.

8.1 Plot Ops basics of ggplot

ggplot requires a data object and a set of base aesthetics to construct the plot background through ggplot function. Then you can add one or more plotting schema as layers onto the background through geom function. Each layer of plotting schema can inherit aesthetics from the base layer, or its own set of aesthetics.

Each aesthetic of each layer can be scaled separately through scale function. Labels, limits and legends can be set in each of the scale function.

Theme of the figure is another big topic, while font, size, color, panel grid line, legend can be specified in theme function.

8.2 Plot Ops scatter plot

#### Situation 1. Usually used to plot association between two continuous variable 
ggplot(data=PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1)) + geom_point()

#### Situation 2. Stratified scatter plot
Scatter_plot <- 
  ggplot(data=PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1, color = Gender)) +
  geom_point() +
  ## often helpful to have a smooth line
  ## if not need to stratify the line by color, try adding aes(group=1)
  geom_smooth(method = "loess", se = T)
Scatter_plot

#### Situation 3. Scatter plot with text labels
Scatter_plot + geom_text(aes(label = StudentID), check_overlap = TRUE, vjust = 1, hjust = 1)

#### Situation 4. Scatter plot with text boxes (geom_label_repel, geom_text_repel)
Scatter_plot+
  ggrepel::geom_label_repel(                                             
    aes(x = Grade_Math_Y1, y = Grade_Art_Y1,
        ## give a condition, only show labels for those meet the condition
        label = ifelse(Grade_Math_Y1 > 80, paste0(StudentID, ": Math > 80"), "")),   
    box.padding = 0.15, point.padding = 0.15, segment.color = "grey50")

8.3 Plot Ops line plot

#### Situation 1. Usually used with respect to date/time
Line_plot <- 
  PseudoData %>% 
  ## created pseudo data to showcase confidence interval with line plot
  mutate(any_low_bound = Grade_Art_Y1 - abs(round(rnorm(100, mean = 20, sd = 2))),     
         any_high_bound = Grade_Art_Y1 + abs(round(rnorm(100, mean=20, sd = 2)))) %>%
  ggplot(aes(x = DoB , y = Grade_Art_Y1)) +                              
  geom_line()  


#### Situation 2. Line plot with confidence interval using geom_ribbon
Line_plot + geom_ribbon(aes(ymin = any_low_bound, ymax = any_high_bound), alpha = .2)

#### Situation 3. Path plot (sometimes used to track location change)
Path_plot <- 
  PseudoData %>% 
  ## order the data in certain way
  arrange(EmployDate) %>% 
  ## plot location change
  ggplot(aes(x = Longitude, y = Latitude)) + 
  geom_path()
Path_plot

8.4 Plot Ops residual plot

#### Method 1. using geom_segment
## geom_segment allows to create a line between any two points of a ggplot 
## which can also be used to plot residual
PseudoData %>% 
  group_by(Major) %>% 
  summarise(mean_stats = mean(Grade_Stats_Y1, na.rm = T),
            mean_math = mean(Grade_Math_Y1, na.rm = T)) %>% 
  ggplot(aes(x = mean_stats, y = mean_math, color = Major)) + 
  geom_point() + 
  geom_segment(aes(xend = mean_stats, yend = mean_stats, x = mean_stats, y = mean_math)) + 
  geom_abline(slope = 1)

8.5 Plot Ops histogram plot

#### Situation 1. Usually used to plot distribution 
Hist_base <- ggplot(data = PseudoData, aes(x = Grade_Math_Y1)) 
Hist_base + geom_histogram()

## this is the same as geom_histogram(aes(y = ..count..))
Hist_base + 
  geom_histogram(aes(y = ..count..),
                 ## specify bin width
                 binwidth = 2, 
                 ## specify color of the outter box
                 colour = "red",
                 ## specify color of the fill
                 fill = "blue")

#### Situation 2. Plot density instead of frequency
ggplot(data = PseudoData, aes(x = Grade_Math_Y1)) +
  geom_histogram(aes(y = ..density..)) +
  geom_density()

#### Situation 3. Plot stratified density 
ggplot(data = PseudoData, aes(x = Grade_Math_Y1, y = Major)) +
  ## geom_density_ridges2 adds a solid black line under the grey area
  ggridges::geom_density_ridges(scale = 4) +
  ggridges::theme_ridges()

8.6 Plot Ops bar plot

#### Situation 1. Used to plot distribution of categorical variable
PseudoData %>% 
  count(Grade_Math_Y1_Letter) %>% 
  ggplot(aes(x = Grade_Math_Y1_Letter, y = n)) +
  geom_col()

#### Situation 2. Stratified bar plot
Bar_base <- 
  PseudoData %>% 
  count(Grade_Math_Y1_Letter, Gender) %>%  
  mutate(perc = n / sum(n) * 100) %>% 
  ggplot(aes(x = Grade_Math_Y1_Letter, y = n, fill = Gender))
## stacked bar
Bar_base + geom_col() + 
  geom_text(aes(label = paste0(perc, "%")),
            position = position_stack(vjust=0.5))

## stacked bar that got connected
Bar_base + geom_col(position = "stack", width = 1)

## stacked bar with percentage insead of frequency 
PseudoData %>% 
  count(Grade_Math_Y1_Letter, Gender) %>% 
  group_by(Grade_Math_Y1_Letter) %>% 
  mutate(pct = prop.table(n) * 100,
         fct_n = sum(n)) %>% 
  ggplot(aes(x = Grade_Math_Y1_Letter, y = pct, fill = Gender)) +
  geom_bar(stat = "identity") + 
  geom_text(aes(label = paste0(sprintf("%1.1f", pct), "%")),
            position = position_stack(vjust = 0.5), size = 5) +
  geom_text(aes(label = paste0("n=", formatC(fct_n, format = "d", big.mark = ",")),
                x = Grade_Math_Y1_Letter, y = 0), size = 5, vjust = 1.5)

## side by side bar 
Bar_base + geom_col(position = "dodge")

8.7 Plot Ops box plot

#### Situation 1. Used to compare distribution of continuous variable of multiple groups
Box_base <- ggplot(data = PseudoData, aes(x = Grade_Art_Y1_Letter, y = Grade_Math_Y1))
Box_base + geom_boxplot(aes(fill = Grade_Art_Y1_Letter))

#### Situation 2. Add more data to the box plot
## specify outlier color, shape and size
Box_base + geom_boxplot(outlier.colour = "red",                                
                        outlier.shape = 8,
                        outliersize=3)

## indicate mean as symbol
Box_base + geom_boxplot() + stat_summary(fun.y = mean, geom = "point", shape = 23, size = 4)

## add median as text
Box_base + geom_boxplot() + stat_summary(fun.y = median, geom = "text", vjust = -0.7,
                                         aes(label = round(..y.., digits = 1)))

## add dots
Box_base + geom_boxplot() + geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = 0.5)

#### Situation 3. Double stratified box plot
Box_base + geom_boxplot(aes(fill = Gender, 
                            group = interaction(Grade_Art_Y1_Letter, Gender)))

8.8 Plot Ops 2D density plot

#### Method 1. Used to plot density of two continuous variable
ggplot(PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1, colour = Gender)) + 
  geom_point() + 
  stat_density2d(aes(alpha = stat(level) ^ 2, fill = Gender, geom = "polygon"))

#### Method 2. polygon with stat_chull
library(ggpubr)
ggplot(PseudoData, aes(x=Grade_Math_Y1, y=Grade_Art_Y1)) +
  geom_point(aes(color = Grade_Math_Y1_Letter, shape = Grade_Math_Y1_Letter)) +
  stat_chull(aes(color = Grade_Math_Y1_Letter, fill = Grade_Math_Y1_Letter), 
             alpha = 0.1, 
             geom = "polygon")

#### Method 3. ellipse plot
library(ggpubr)
PseudoData %>% 
  filter(!Grade_Math_Y2_Letter %in% NA) %>% 
  ggplot(., aes(x = Grade_Math_Y1, y = Grade_Art_Y1)) + 
  geom_point(aes(color = Grade_Math_Y2_Letter, shape = Grade_Math_Y2_Letter)) +
  stat_conf_ellipse(aes(color = Grade_Math_Y2_Letter, fill = Grade_Math_Y2_Letter), 
                    alpha = 0.1,
                    geom = "polygon",
                    na.rm = T) +
  stat_mean(aes(color = Grade_Math_Y2_Letter, shape = Grade_Math_Y2_Letter), 
            size = 2, 
            na.rm = T)

#### Method 4. ellipse using ggscatter 
ggscatter(PseudoData, 
          x = "Grade_Math_Y1", y = "Grade_Art_Y1",
          color = "Grade_Math_Y1_Letter", 
          palette = "npg",
          shape = "Grade_Math_Y1_Letter", 
          ellipse = TRUE, 
          mean.point = TRUE, 
          star.plot = TRUE,
          ggtheme = theme_minimal())

#### Method 5. 2D bin plot (like heatmap)
## continuous vs. continuous
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +  
  geom_bin2d()

## continuous vs. factor
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1_Letter)) +  
  geom_bin2d()

## factor vs. factor
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1_Letter, y = Grade_Stats_Y1_Letter)) +  
  geom_bin2d()

#### Method 6. heatmap using geom_raster
PseudoData %>% 
  select(StudentID, starts_with("Grade")) %>% 
  select(StudentID, ends_with("Y1")) %>% 
  head(20) %>% 
  gather(., Term, Grade, -1) %>%                                               
  ggplot(., aes(x = Term, y = StudentID)) +                                         
  geom_raster(aes(fill = Grade)) +                                                 
  scale_fill_gradient(low = "white", high = "#F8766D") +                             
  theme_bw() +                                                                   
  theme(panel.grid = element_blank())

#### Method 7. geom_tile as an alternative
PseudoData %>% 
  count(Grade_Math_Y1_Letter, Grade_Stats_Y1_Letter) %>% 
  ggplot(aes(x = Grade_Math_Y1_Letter, y = Grade_Stats_Y1_Letter, fill = n)) +
  geom_tile() +
  geom_text(aes(label = n))

#### Method 8. geom_rect is the best option so far
temp_data <-   
  PseudoData %>%
  head(20) %>% 
  select(StudentID, Grade_Math_Y1_Letter, Grade_Art_Y1_Letter, Grade_Math_Y1_Letter) %>%
  pivot_longer(-StudentID, 
               names_to = "class",
               values_to = "grade") %>% 
  mutate(StudentID_loc = as.numeric(StudentID),
         class_loc = as.numeric(as.factor(class)))
  
temp_data %>% 
  ggplot() +
  geom_rect(aes(xmin = class_loc, xmax = class_loc + 1, ymin = StudentID_loc, ymax = StudentID_loc + 1, fill = grade)) + 
  theme_bw() + 
  scale_x_continuous(breaks = seq(1.5, length(unique(temp_data$class)) + 0.5, 1), labels = c("art", "math"), 
                     expand = c(0, 0)) + 
  scale_y_continuous(breaks = seq(1.5, length(unique(temp_data$StudentID)) + 0.5, 1), 
                     labels = unique(temp_data$StudentID), 
                     expand = c(0, 0)) + 
  theme(panel.grid.major = element_blank(),
        panel.ontop = T,
        legend.key = element_rect(color = "grey38"),
        panel.background = element_rect(fill = "transparent"))

8.9 Plot Ops 3D density plots

library(rayshader)
ggdiamonds = ggplot(diamonds) +
  stat_density_2d(aes(x = x, y = depth, fill = stat(nlevel)), 
                  geom = "polygon", n = 200, bins = 50,contour = TRUE) +
  facet_wrap(clarity~.) +
  scale_fill_viridis_c(option = "A")

par(mfrow = c(1, 2))

plot_gg(ggdiamonds, width = 5, height = 5, raytrace = FALSE, preview = TRUE)
plot_gg(ggdiamonds, width = 5, height = 5, multicore = TRUE, scale = 250, 
        zoom = 0.7, theta = 10, phi = 30, windowsize = c(800, 800))
# Sys.sleep(0.2)
render_snapshot(clear = TRUE)

8.10 Plot Ops error bar plot

#### Situation 1. Used to plot confidence interval of all kinds  (forest plot)
## like estimate (95% CI) of regression, or incidence rate
PseudoData %>%
  select(StudentID, starts_with("Grade_Math_Y")) %>% select(1:5) %>%            
  gather(., Term, Grade, -1) %>%  
  group_by(Term) %>% 
  summarize(Med = median(Grade, na.rm = T),
            P25th = quantile(Grade, 0.25, na.rm = T),
            P75th = quantile(Grade, 0.75, na.rm = T)) %>%
  ggplot(aes(x = Med, y = Term, color = Term))+                                     
  geom_point()+                                                                 
  geom_errorbarh(aes(xmin = P25th, xmax = P75th), height = 0.2, cex = 1)

#### Situation 2. lollipop plot
## Combining a horizontal error bar and dot plot can create a lollipop plot
## can be used to present two or even three different continuous measures by each observations
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = StudentID)) + 
  geom_errorbarh(aes(xmin = 0, xmax = Grade_Math_Y1), height = 0) + 
  geom_point(aes(size = Age, color = Gender))

8.11 Plot Ops moon chart

library(gggibbous)
## see an example here: https://cran.r-project.org/web/packages/gggibbous/vignettes/gggibbous.html#:~:text=gggibbous%20and%20its%20usage,require%20any%20special%20coordinate%20system.

## Example 1: point 
tidymoons <- data.frame(
  x = rep(1:3, 6),
  y = rep(rep(3:1, each = 3), 2),
  ratio = c(1:9 / 10, 9:1 / 10),
  right = rep(c(TRUE, FALSE), each = 9)
)

ggplot(tidymoons, aes(x, y, ratio = ratio, right = right, size = 2^x)) +
  geom_moon(data = subset(tidymoons, right), fill = "violetred") +
  geom_moon(
    data = subset(tidymoons, !right), fill = "turquoise3",
    key_glyph = draw_key_moon_left
  ) +
  lims(x = c(0.5, 3.5), y = c(0.5, 3.5)) +
  scale_size("size", range = c(5, 10), breaks = 2^(1:3))

## Example 2 map plot
dmeladh_adj <- dmeladh
dmeladh_adj$long <- dmeladh$Longitude + c(
  -2, 0, -2, 2, -3, 3, 3, 2, 3, 4, -2.5, -2.5, -1, -2, -2.5, -4, 2.5,
  5, 6, 7, 2, -7, -5.5, -3, 0, -7, -2, 3, 5.5, 0.5, -1, -1.5, -3, 2)
dmeladh_adj$lat <- dmeladh$Latitude + c(
  -2, 2, 0, 1, 0, 0, 0, 2, 0.5, -1, 1, -1.5, 2, 4, 1.5, 0, 2,
  1, -1, -3, -2, 1, -1, -2, -3, -2, -4, -3, -1, 1.5, 2, 2, -2, 0)

moonmap <- ggplot(dmeladh_adj, aes(long, lat)) +
  geom_polygon(
    data = map_data(
      "world", region = "(Australia)|(Indonesia)|(Papua New Guinea)"),
    aes(group = group),
    fill = "gray80"
  ) +
  geom_segment(aes(xend = Longitude, yend = Latitude), color = "gray20") +
  geom_point(aes(Longitude, Latitude), size = 0.75, color = "gray20") +
  scale_size(range = c(4, 10)) +
  coord_map(xlim = c(110, 160), ylim = c(-45, -5)) +
  theme_void() +
  theme(
    legend.position = c(0.05, 0.05),
    legend.direction = "horizontal",
    legend.justification = c(0, 0)
  )

moonmap +
  geom_moon(
    aes(ratio = AdhS / 100, size = N),
    right = FALSE, fill = "gold", color = "gold",
    key_glyph = draw_key_moon_left
  ) +
  geom_moon(
    aes(ratio = AdhF / 100, size = N),
    fill = "forestgreen", color = "forestgreen"
  )

8.12 Plot Ops map plot

#### Method 1. plot dot on map
PseudoData %>%
  ggplot(aes(x = Longitude, y = Latitude, size = Grade_Math_Y1, color = Age)) + 
  #Use state border in map plot, try "world", or adding region = "florida"
  borders("state") +                                                            
  geom_point() +
  coord_map() +                                                                 
  #Can also use ggthemes::theme_map 
  theme_void()

#### Method 2. plot region on map
PseudoData %>% 
  select(State, Grade_Stats_Y1) %>%                                              
  group_by(State) %>% 
  summarise(mean = mean(Grade_Stats_Y1, na.rm = T)) %>%    
  #Merge with map data to obtain state data
  right_join(map_data("state"), by = c("State" = "region")) %>%                       
  ggplot(aes(x = long, y = lat, group = group, fill = mean)) + 
  geom_polygon()+                                                                
  coord_map()+
  ggthemes::theme_map()

# use map_data("world") for word data


#### Method 3. interactive map with leaflet package
## Interactive map with leaflet package - see tidytuesday script:: "volcano eruption" for detailed walk through
library(leaflet)
library(glue)
library(htmlwidgets)
library(DT)
PseudoData %>% 
  select(StudentID,                                                             
         Gender, Age, Grade_Stats_Y1, Grade_Stats_Y1_Letter,                    
         Latitude, Longitude) %>%               
  # translate a continuous variable to a color variable
  mutate(age_col = colorNumeric(c("blue", "red"), domain = Age)(Age)) %>%       
  gather(key, value, 
         StudentID, Gender, Age, Grade_Stats_Y1, Grade_Stats_Y1_Letter) %>%     
  # optional:: replace na to unknown
  replace_na(list(value = "Unknown")) %>%                                       
  mutate(key = str_to_title(str_replace_all(key, "_", " ")),                    
         # bold the key variable in the plot pop-up
         key = paste0("<b>", key, "</b>")) %>%    
  # nest all descriptive data into a small tibble
  nest(data = c(key, value)) %>%     
  # make a html table from the nested descriptive tibble
  mutate(html = map(data,                                                       
                    knitr::kable,
                    format = "html",
                    escape = F,   
                    # remove the column name of the html
                    col.names = c("", ""))) %>%   
  # make a leaflet object
  leaflet() %>%     
  # add graphic layer
  addTiles() %>%  
  # add circle marker (points on the map); also try addMarkers()
  addCircleMarkers(lat = ~ Latitude,                                            
                   lng = ~ Longitude, 
                   col = ~ age_col, 
                   # use the html object as popup object
                   popup = ~ html, 
                   # specify the radius of the circle 
                   radius = 1) %>%                                             
  addMeasure()

#### Method 4. Stamen map
library(ggmap)
## get map using longitude and latitude
tampa_borders <- c(bottom  = 27.964157  - 0.2, 
                   top     = 27.964157  + 0.2,
                   left    = -82.452606 - 0.2,
                   right   = -82.452606 + 0.2)
## create pseudo data within the map range
set.seed(20120116)
tampa_pseudo_data <- 
 tbl_df(cbind(lon = runif(100, min = tampa_borders["left"], max = tampa_borders["right"]),
              lat = runif(100, min = tampa_borders["bottom"], max = tampa_borders["top"]),
              random_grade = PseudoData$Grade_Math_Y1))
## obtain map
map <- get_stamenmap(tampa_borders, zoom = 11, maptype = "watercolor")
## plot on map
ggmap(map) + 
 geom_point(data = tampa_pseudo_data,
            mapping = aes(x = lon, y = lat, 
                          col = random_grade, size = random_grade)) +
 scale_color_distiller(palette = "YlOrRd", direction = 1)

#### Method 5. 3D map 
## example from https://www.rayshader.com/
library(rayshader)

#Here, I load a map with the raster package.
loadzip = tempfile() 
download.file("https://tylermw.com/data/dem_01.tif.zip", loadzip)
localtif = raster::raster(unzip(loadzip, "dem_01.tif"))
unlink(loadzip)

#And convert it to a matrix:
elmat = raster_to_matrix(localtif)

elmat %>%
  sphere_shade(texture = "desert") %>%
  add_water(detect_water(elmat), color = "lightblue") %>%
  add_shadow(cloud_shade(elmat, zscale = 10, start_altitude = 500, end_altitude = 1000,), 0) %>%
  plot_3d(elmat, zscale = 10, fov = 0, theta = 135, zoom = 0.75, phi = 45, windowsize = c(1000, 800),
          background="darkred")
# render_camera(theta = 20, phi=40,zoom= 0.64, fov= 56 ) # this allows interactive rotation
render_snapshot(theta = 20, phi=40,zoom= 0.64, fov= 56, clear = TRUE)

8.13 Plot Ops Kaplan-Meier and cumulative incidence

library(survival)
library(survminer)

#### Situation 1. used in survival/time-to-event analysis

Sample_Fit <- survfit(Surv(Day2Employ, Employed == "Yes") ~ Grade_Stats_Y1_Letter,
                     data=PseudoData)
tb <- table(PseudoData$Grade_Stats_Y1_Letter)

ggsurvplot(Sample_Fit,
           data = PseudoData,
           #event = cumulative event, cumhaz = cumulative hazard, pct = survival probability
           fun="event",    
           #Scale time by year
           break.time.by = 365.25, xscale = "d_y",  
           #Calculate log-rank p-value
           pval = F,           
           #Output risk table
           risk.table = F,        
           #Show censor point
           censor=F, 
           #Show confidence interval
           conf.int = F,                                                        
           title="Time to employment by first year stats grade",
           legend="bottom",
           legend.title="",
           legend.labs=c(paste0("A n=",tb[1]),
                         paste0("B n=",tb[2]),
                         paste0("C n=",tb[3]),
                         paste0("D n=",tb[4]),
                         paste0("E n=",tb[5])),
           xlab="Years",
           ylab="Cumulative probability of employment",
           ylim=c(0,0.5),
           ggtheme = theme_bw(), 
           size=1.3)

#### Situation 2. Kaplan-Meier plot with risk table
Surv_plot_example <- ggsurvplot(Sample_Fit,
                                data = PseudoData,
                                fun="event",    
                                break.time.by = 60, xscale = "d_y",  
                                pval = F,      
                                ## show risk table
                                risk.table = T,
                                ## risk table font size
                                risk.table.fontsize = 6, 
                                ## risk table theme 
                                tables.theme = theme_cleantable(), 
                                ## risk table height
                                risk.table.height = 0.5,
                                censor=F, 
                                conf.int = F,                                                        
                                title="Time to employment by first year stats grade",
                                legend="bottom",
                                legend.title="",
                                legend.labs=c(paste0("A n=",tb[1]),
                                              paste0("B n=",tb[2]),
                                              paste0("C n=",tb[3]),
                                              paste0("D n=",tb[4]),
                                              paste0("E n=",tb[5])),
                                xlab="Years",
                                ylab="Cumulative probability of employment",
                                ylim=c(0,0.5),
                                ggtheme = theme_bw(), 
                                size=1.3)
Surv_plot_example

## additional edits on the table can be done by calling the table object from the plot
Surv_plot_example$table <-
  Surv_plot_example$table + 
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())


#### Situation 3. stratified KM plot
ggsurvplot_facet(Sample_Fit,                                                    
                 data = PseudoData,
                 fun="event",
                 facet.by = "Gender")

8.14 Plot Ops ggforest plot

#### Situation 1. used to plot HR and 95% confidence interval from Cox regression model
library(survival)
library(survminer)

model <- coxph( Surv(time, status) ~ sex + rx + adhere,
                data = colon )
ggforest(model)

8.15 Plot Ops Venn diagram

library(ggVennDiagram)

Venn_data <- list('Group A'= as.character(sample(c(1: 1000), 300)),
                  "Group B"= as.character(sample(c(1: 1000), 450)))

ggVennDiagram(Venn_data,
              category.names = c(" ", "  "),
              label = "both")+
  scale_fill_gradient(low = "white", high = "cornflowerblue")+
  theme_void()+
  annotate("text",x = -2, y = 4, label = "Customer label A", size = 8)  +
  annotate("text",x = 6, y = 4, label = "Customer label B", size = 8)

8.16 Plot Ops plot intersection of categories

This site contains a bit more details, but you’ll have to rely on R doc for actual examples.

library(VennDetail)
A <- sample(1:1000, 400, replace = FALSE)
B <- sample(1:1000, 600, replace = FALSE)
C <- sample(1:1000, 350, replace = FALSE)
D <- sample(1:1000, 550, replace = FALSE)
res <- venndetail(list(A = A, B = B, C = C, D = D))
# result <- result(res)
# head(result)


# getSet(res, "A") # get unique elements in A
summary(res) #show overlap 'details' of all subsets

## Input groups are:  A B C D 
## Total unique elements are:  921 
## Total subsets are:  15 
## ====== Subsets detail: ======

## Shared  B_C_D  A_C_D    C_D  A_B_D    B_D    A_D      D  A_B_C    B_C    A_C 
##     52     69     31     43     86    130     62     77     26     66     31 
##      C    A_B      B      A 
##     32     67    104     45

do <- dplot(res) #make a bargraph for 'details'
pie <- vennpie(res) # make a pie chart

## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <https://github.com/tidyverse/dplyr/issues>.

## Warning: `filter_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `filter()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <https://github.com/tidyverse/dplyr/issues>.

ggiraph::girafe(ggobj = cowplot::plot_grid(do, pie), width_svg = 8, height_svg = 4)

8.17 Plot Ops chord diagram

Chord diagram could be helpful in visualizing concordance between two measures of the same multi-level categorical variable. See this site for more details.

library(chorddiag)

# Create dummy data
m <- matrix(c(11975,  5871, 8916, 2868,
              1951, 10048, 2060, 6171,
              8010, 16145, 8090, 8045,
              1013,   990,  940, 6907),
            byrow = TRUE,
            nrow = 4, ncol = 4)

# A vector of 4 colors for 4 groups
haircolors <- c("black", "blonde", "brown", "red")
dimnames(m) <- list(have = haircolors,
                    prefer = haircolors)
groupColors <- c("#000000", "#FFDD89", "#957244", "#F26223")

# Build the chord diagram:
p <- chorddiag(m, groupColors = groupColors, groupnamePadding = 20)
p

8.18 Plot Ops ggplot specifics - facet

Faceting is particularly helpful when trying to stratify the data in someway.

#### Situation 1. facet by one stratifying variable
## add labels of the facets
Facet_Label <-  c("Cus Label F", "Cus Label M")
names(Facet_Label) <- c("Female", "Male")
## use scatter plot as an example
PseudoData %>% ggplot(aes(x=Grade_Math_Y1,y=Grade_Stats_Y1)) + 
  geom_point() +                          
  facet_wrap(~Gender, nrow=1,                                                   
             labeller = labeller(Gender = Facet_Label))

#### Situation 2. facet plot allows for different order within each facet
## use bar plot as an example (also works when facetting box plot and other continuous vs. categorical plot)
PseudoData %>% 
  count(Race, Gender) %>% 
  ## order the race by frequency, separately within each gender category
  mutate(Race = tidytext::reorder_within(Race, n, Gender)) %>%                   
  ggplot(aes(Race, n, fill = Gender)) + 
  geom_col(show.legend = F)+
  facet_wrap(~Gender, scales = "free_y")+
  tidytext::scale_x_reordered()+                                                 
  coord_flip()

#### Situation 3. facet_wrap by two stratifying variable
##add labels of the facets
facet_labeller_left <- function(variable, value) {
  c("A1", "", "B1", "", "M1", "",
    "U1", "", "W1", "", "NA1", "")
}

facet_labeller_top <- function(variable, value) {
  c("female", "male","", "","", "","", "", "", "","", "")
}

PseudoData %>% ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) + 
  geom_point() +  
  ## facet by race (in rows) and gender (in columns), 
  ## note, this will automatically remove empty combinations
  facet_wrap(Race ~ Gender, 
             ## allow the x scale to be free, also try "free_y", and "free"
             scales = "free_x",
             ncol = 2,
             labeller = labeller(Race = as_labeller(facet_labeller_left),
                                 Gender = as_labeller(facet_labeller_top)))

#### Situation 4. facet_grid by two stratifying variable
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +                 
  geom_point() +            
  ## facet by gender (in rows) and race (in columns), 
  ## note, this will NOT automatically remove empty combinations
  facet_grid(rows = vars(Gender), cols = vars(Race))

#### Situation 5. geofacet - facet by locations
library(geofacet)
## facet by US state as an example
PseudoData %>% 
 mutate(State = snakecase::to_title_case(State)) %>% 
ggplot(aes(Grade_Math_Y1, Grade_Stats_Y1)) +
 geom_point() +
 theme_bw() +
 facet_geo(~ State, grid = "us_state_grid1", label = "name")

## One can choose to facet by any of the 141 pre-set grids 
get_grid_names() %>% head(n = 10)

##  [1] "us_state_grid1"          "us_state_grid2"         
##  [3] "eu_grid1"                "aus_grid1"              
##  [5] "sa_prov_grid1"           "gb_london_boroughs_grid"
##  [7] "nhs_scot_grid"           "india_grid1"            
##  [9] "india_grid2"             "argentina_grid1"

## Or, to create your own geo grid by design the grid data.

8.19 Plot Ops ggplot specifics - scales

library(scales)
#### Situation 1. log scale x or y axis
PseudoData %>%
  filter(! Schol_Amount %in% NA) %>% 
  ggplot(aes(x = Schol_Amount, y = Schol_Amount)) + 
  geom_point() +
  ## log10 is a built-in scale
  scale_x_log10() + 
  ## other log transformation can be done like the following
  scale_y_continuous(trans = log2_trans(),
                     breaks = trans_breaks("log2", function(x) 2^x),
                     labels = trans_format("log2", math_format(2^.x)))

#### Situation 2. specify limit, breaks and name
PseudoData %>%
  filter(! Schol_Amount %in% NA) %>% 
  ggplot(aes(x = Schol_Amount, y = Schol_Amount)) + 
  geom_point() +
  ## manually specify name, limit, breaks and labels of each tick
  scale_x_continuous(name = "amount of scholarship", 
                     limit = c(0, 20000), 
                     labels = c("lower end", "somehwere in the middle", "upper end"),
                     breaks = c(0, 10000, 20000)) +
  ## put breaks as a function of the limit
  scale_y_continuous(name = "same as x-axis",
                     breaks = function(lim) (seq(floor(lim[1]),
                                                 ceiling(lim[2]),
                                                 5000))) +
  ## one quick way to expand the limits is by adding expand_limits
  expand_limits(y = 0)

#### Situation 3. scale size, color, or fill
Scatter_base <- 
  PseudoData %>%
  filter(! Schol_Amount %in% NA) %>% 
  ggplot(aes(x = Schol_Amount,
             y = Schol_Amount, 
             size = Schol_Amount, 
             color = Schol_Amount)) + 
  geom_point()

## size: specify range of the size
Scatter_base + scale_size_continuous(range = c(5, 10))

## color/fill: specify range of the color using scale_color_gradient
Scatter_base + scale_color_gradient(low = "yellow", high = "red", na.value = NA)

## color/fill: specify range of color and midpoint using scale_color_gradient2
Scatter_base + scale_color_gradient2(low = "blue", high = "red", midpoint = 75, na.value = NA)

## color/fill: can also work with transformed scale
Scatter_base + scale_color_continuous(trans = "log10")

#### Situation 4. reverse order of discrete scales
Temp_data <- 
  PseudoData %>% 
  count(Major) %>% 
  mutate(Major = fct_reorder(Major, n))

Temp_data %>% 
  ggplot(aes(x = n, y = Major, fill = n)) + 
  geom_col()

Temp_data %>% 
  ggplot(aes(x = n, y = Major, fill = n)) + 
  geom_col() +
  scale_y_discrete(limits = rev(levels(Temp_data$Major)))

#### Situation 5. change x or y axis label format 
#### these labels should work with any type of scaling function
## comma format
PseudoData %>% 
  filter(! Schol_Amount %in% NA) %>% 
  ggplot(aes(x = Schol_Amount)) +
  geom_histogram() + 
  scale_x_continuous(labels = comma_format())  # labels = comma works the same

## dollar format
PseudoData %>% 
  filter(! Schol_Amount %in% NA) %>% 
  ggplot(aes(x = Schol_Amount)) +
  geom_histogram() + 
  scale_x_continuous(labels = dollar_format())

## percent format
PseudoData %>% 
  count(Major) %>% 
  mutate(p = n/sum(n)) %>% 
  ggplot(aes(x = p, y = Major)) +
  geom_col() + 
  scale_x_continuous(labels = percent_format())  # labels = percent works the same

8.20 Plot Ops ggplot specifics - legends and labels

Temp_data <- 
  PseudoData %>% 
  count(Major) %>% 
  mutate(Major = fct_reorder(Major, n))

#### Situation 1. Labels
## Axis label or legend label can be edited in scale function as described above
## In addition, these labels as well as title, subtitle and caption can be added by labs function
Temp_plot <- 
  Temp_data %>% 
  ggplot(aes(x = n, y = Major, fill = Major)) + 
  geom_col() + 
  labs(title = "title",
       subtitle = "subtitle",
       x = "x axis",
       y = "y axis",
       caption = "caption",
       fill = "fill",
       color = "color")
Temp_plot

#### Situation 2. legends
## position
Temp_plot + theme(legend.position = c(1,1))

Temp_plot + theme(legend.position = "bottom")

Temp_plot + theme(legend.justification = c(1,1))

## override legend
# you can override the aes in the legend to show fixed alpha, or dot size
# this is particularly useful when you have alpha and size in the aes, but you want to show the original color
Temp_plot + 
  guides(fill = guide_legend(override.aes = list(alpha = 0.5, size = 1)))

## other specification of legends
# break legends into multiple rows
Temp_plot + guides(fill = guide_legend(nrow = 3, byrow = F))

# reverse the order of the legend
Temp_plot + guides(fill = guide_legend(reverse = T))

# specify the size of the legend example
Temp_plot + guides(fill = guide_legend(keywidth = 5))

## hide legend
# hide legend in geom function
Temp_data %>% ggplot(aes(x = n, y = Major, fill = Major)) + geom_col(show.legend = F)

# hide legend in scale function
Temp_plot + scale_fill_discrete(guide = F)

# hide legend in theme function
Temp_plot + theme(legend.position = "none")

# hide legend in guides function
Temp_plot + guides(fill = "none")

8.21 Plot Ops ggplot specifics - coordinates

#### Situation 1. Cropping plot
Temp_plot + coord_cartesian(xlim = c(0, 8))

#### Situation 2. Specify the coordinate and aspect ratio
## fixed equal aspect radio by coord_equal
Temp_plot + coord_equal()

## specify aspect ratio in theme with aspect.ratio
Temp_plot + theme(aspect.ratio = 1)

8.22 Plot Ops ggplot specifics - themes

themes function allows you customize almost every aspect of the plot. No example will be shown in this section.

#### Situation 1. Some neat built-in themes
theme_bw()
theme_void()
theme_light()
theme_classic()

#### Situation 2. Set universal theme of the work space
theme_set(
  ## use a built-in theme as the base of your custom theme
  theme_light() + 
    ## add in other specifications of your custom theme
    theme(...))


#### Situation 3. Some examples of custom specification
theme(plot.title = element_text(size=20,hjust=0.5),                             
      axis.title = element_text(size=15,face="bold"),                           
      axis.text.x = element_text(face='bold',size=15,angle=30),               
      axis.text.y = element_text(face='bold',size=15),                          
      plot.caption = element_text(size=15,hjust=0.5),                           
      legend.title =  element_text(size=15),
      legend.text = element_text(size=15),
      strip.text = element_text(size=15),
      legend.position = "bottom") 


#### Situation 4. Revome grid line of the plot 
theme(panel.grid = element_blank(),                                            
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank())  

#### Situation 5. Use additional theme from ggthemes package
ggthemes::theme_wsj()     # wall streat journal theme
ggthemes::theme_tufte()   # Tufte maximal data
ggthemes::theme_stata()   # Stata graph schemes
ggthemes::theme_excel()   # old excel plot 
ggthemes::theme_gdocs()   # google doc chart
ggthemes::theme_fivethirtyeight()  # fivethirtyeight.com
ggthemes::theme_economist()    # Economist
ggthemes::theme_map()     # for maps

8.23 Plot Ops ggplot specifics - adding annotations, segment lines, and brackets

#### Situation 1. adding text annotations 
## using annotate function (only be added inside the plot)
Temp_plot + 
  annotate("text", x = 10, y = 5, label = "Texts here",                             
           size = 8, color = "red", fontface = "bold.italic")+
  annotate("text", x = 10, y = 4,
           label = expression(italic("Everthing italic")), size = 6)

## using tag inside labs function (can be added outside of the plot area)
Temp_plot + 
  labs(tag = "tag A") +
  theme(plot.tag = element_text(size = 20, face = "bold"),
        plot.tag.position = c(0.05, 0.98))

#### Situation 2. add segment line 
## using geom_segment
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) + 
  geom_point() +
  geom_segment(aes(x = -Inf, y = 60, xend = Inf, yend = 60), colour = "darkred", size = 1.5)+
  geom_segment(aes(x = 60, y = -Inf, xend = 60, yend = Inf), colour = "darkred", size = 1.5)

## using geom_hline
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) + 
  geom_point() +
  geom_hline(yintercept = 70)

## using geom_vline
PseudoData %>% 
  ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) + 
  geom_point() +
  geom_vline(xintercept = 70)

#### Situation 3. add brackets (often works with boxplot)
## the old fashion curly bracket
library(pBrackets)
library(grid)

bracketsGrob <- function(...){
  l <- list(...)
  e <- new.env()
  e$l <- l
  grid:::recordGrob(  {
    do.call(grid.brackets, l)
  }, e)
}

b1 <- bracketsGrob(1/6, 0.8, 1/2, 0.8, h = 0.05, lwd = 2, col = "red")
b2 <- bracketsGrob(5/6, 0.3, 1/2, 0.3, h = 0.05,  lwd = 2, col = "red")

PseudoData %>% 
  ggplot(aes(x = Employed, y = Grade_Stats_Y1)) + 
  geom_boxplot() +  
 annotation_custom(b1) +  
 annotation_custom(b2)

## the more convenient square bracket with multiple ticks
PseudoData %>% 
  ggplot(aes(x = Employed, y = Grade_Stats_Y1)) + 
  geom_boxplot() +
  geom_bracket(xmin = "No", xmax = "Yes", y.position = 100, label = "TEXT", size = 1, tip.length = 0.03)+
  geom_bracket(xmin = "No", xmax = "Yes", y.position = 50, label = "text", size = 1, tip.length = -0.03)

8.24 Plot Ops ggplot specifics - merging multiple plot into one

#### Method 1. This is from Davis McCarthy

# By Davis McCarthy
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
 library(grid)
 
 # Make a list from the ... arguments and plotlist
 plots <- c(list(...), plotlist)
 
 numPlots = length(plots)
 
 # If layout is NULL, then use 'cols' to determine layout
 if (is.null(layout)) {
  # Make the panel
  # ncol: Number of columns of plots
  # nrow: Number of rows needed, calculated from # of cols
  layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                   ncol = cols, nrow = ceiling(numPlots/cols))
 }
 
 if (numPlots==1) {
  print(plots[[1]])
  
 } else {
  # Set up the page
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
  
  # Make each plot, in the correct location
  for (i in 1:numPlots) {
   # Get the i,j matrix positions of the regions that contain this subplot
   matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
   
   print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                   layout.pos.col = matchidx$col))
  }
 }
}

multiplot(Temp_ggplot1, Temp_ggplot2, cols=2)

#### Method 2. using patchwork package
library(patchwork)
## this allows for shared titles and format
Temp_ggplot1 +
 labs(title = "shared title whatever") + 
   Temp_ggplot2

#### Method 3. working with ggsurvplot (containing risk tables)
ggsurvplot_list <- list()
ggsurvplot_list[[1]] <- Temp_ggsurvplot1
ggsurvplot_list[[2]] <- Temp_ggsurvplot2
arrange_ggsurvplots(ggsurvplot_list, print = T, ncol = 2, nrow = 1, risk.table.height = 0.5)

## if no risk table is included, multiplot function works as well 
multiplot(Temp_ggsurvplot1$plot,Temp_ggsurvplot1$plot, cols=2)

#### Method 4. Run the same ggplot function on a list of data frames
# Plot_List_Shell <- c()                                                          
# DataList <- list(Data_Frame1, Data_Frame2, Data_Frame3)                        
# names(DataList) <- c("Data_Name1","Data_Name2","Data_Name3")                   
# Yourggplot <- ggplot()                                                             
# 
# Plot_List_Shell[["Data_Name1"]] <- Yourggplot(data= Data_Frame1)                                 
# Plot_List_Shell[["Data_Name2"]] <- Yourggplot(data= Data_Frame2)       


#### Method 5. merging base plot
par(mfrow=c(1,3))
hist(mtcars$wt)
hist(mtcars$mpg)
hist(mtcars$disp)

#### Method 6. merging multiple existing images
# library(gird)
# library(gridExtra)
# library(ggplot)
# jpeg1 <- readJPEG("image1.jpeg")
# jpeg2 <- readJPEG("image2.jpeg")
# jpeg3 <- readJPEG("image3.jpeg")
# jpeg4 <- readJPEG("image4.jpeg")
# jpeg5 <- readJPEG("image5.jpeg")
# jpeg6 <- readJPEG("image6.jpeg")
# 
# arranged_fig <- 
#   grid.arrange(rasterGrob(jpeg1), rasterGrob(jpeg2), rasterGrob(jpeg3),
#                rasterGrob(jpeg4), rasterGrob(jpeg5), rasterGrob(jpeg6), 
#                ncol = 3)
# 
# output_fig <- 
#   qplot(1:10, 1:10, geom = "blank") + 
#   annotation_custom(arranged_fig, xlim = 0, xmax = 10, ymin = 0, ymax = 10) +
#   theme_bw() + 
#   theme(panel.grid.major = element_blank(),
#         panel.grid.minor = element_blank(),
#         axis.ticks = element_blank(),
#         panel.border = element_blank())




#### Method 7. merging multiple plots with a specific layout
# library(grid)
# library(gridExtra)
# 
# lay <- rbind(c(1, 1, 1, 1, 2, 2),
#              c(1, 1, 1, 1, 2, 2),
#              c(1, 1, 1, 1, 3, 3))
# 
# grid.arrange(plot1, plot2, plot3, layout_matrix = lay)

8.25 Plot Ops ggplot specifics - add a table to ggplot

library(gridExtra)
data_table <- tibble(course = c("Math", "Art"),
                     Year = c("1", "1"),
                     mean = c(mean(PseudoData$Grade_Math_Y1, na.rm = T),
                              mean(PseudoData$Grade_Art_Y1, na.rm = T)))

ggplot(data=PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1)) + 
  geom_point() + 
  annotation_custom(tableGrob(data_table, rows = NULL), xmin = 40, xmax = 60, ymin = 50, ymax = 70)

8.26 Plot Ops ggplot transform to plotly

library(plotly)
#### Situation 1. regular ggplot object
ggplotly(Temp_ggplot1)

#### Situation 2. ggplot object from ggsurvplot
ggplotly(Temp_ggsurvplot1$plot)

8.27 Plot Ops Exporting plots

#### Situation 1. export plot into jpeg
jpeg("PlotName.jpeg", width = 15, height = 9, units = 'in', res = 300)
Plot1
dev.off()


#### Situation 2. export plot into pdf
pdf("PlotName.pdf",width = 15, height = 9)
Plot2
dev.off()


#### Situation 3. exporting image using ragg package
## the agg_xxxx function from ragg package outputs a slightly nicer image as it uses a better engine. What's inside the function should be exact the same as the orginal png() or jpeg() function. 
ragg::agg_jpeg("PlotName.jpeg", width = 15, height = 9, units = 'in', res = 300)
Plot1
dev.off()

9 Helpful Packages and Sources (Help Pack)

9.1 Help Pack pak - instal packages

A faster and convenient way to install packages

library(pak)
# pak::pkg_install("Waldo")

9.2 Help Pack maps - data for creating maps

Contianing several datasets with world country names and abbreviations, as well as latitude and longitude of world/US cities.

maps::iso3166 %>% head()

##   a2  a3       ISOname               mapname sovereignty
## 1 AW ABW         Aruba                 Aruba Netherlands
## 2 AF AFG   Afghanistan           Afghanistan Afghanistan
## 3 AO AGO        Angola                Angola      Angola
## 4 AI AIA      Anguilla              Anguilla    Anguilla
## 5 AX ALA Aland Islands Finland:Aland Islands     Finland
## 6 AL ALB       Albania               Albania     Albania

maps::us.cities %>% head()

##         name country.etc    pop   lat    long capital
## 1 Abilene TX          TX 113888 32.45  -99.74       0
## 2   Akron OH          OH 206634 41.08  -81.52       0
## 3 Alameda CA          CA  70069 37.77 -122.26       0
## 4  Albany GA          GA  75510 31.58  -84.18       0
## 5  Albany NY          NY  93576 42.67  -73.80       2
## 6  Albany OR          OR  45535 44.62 -123.09       0

9.3 Help Pack WDI - word development indicators

A very helpful data source for nation-level income, economic and population data

#### Example of pulling GDP per capita in 2015 from WDI data
library(WDI)
Example_gdp_capita <- WDI(indicator = "NY.GDP.PCAP.CD",
                          start = 2015, end = 2015, extra = T) %>%
  tbl_df()
Example_gdp_capita

## # A tibble: 266 × 13
##    iso2c country       NY.GD…¹  year status lastu…² iso3c region capital longi…³
##    <chr> <chr>           <dbl> <int> <chr>  <chr>   <chr> <chr>  <chr>   <chr>  
##  1 1A    Arab World      6246.  2015 ""     2022-1… ARB   Aggre… ""      ""     
##  2 1W    World          10154.  2015 ""     2022-1… WLD   Aggre… ""      ""     
##  3 4E    East Asia & …   6502.  2015 ""     2022-1… EAP   Aggre… ""      ""     
##  4 7E    Europe & Cen…   7458.  2015 ""     2022-1… ECA   Aggre… ""      ""     
##  5 8S    South Asia      1524.  2015 ""     2022-1… SAS   Aggre… ""      ""     
##  6 AD    Andorra        38885.  2015 ""     2022-1… AND   Europ… "Andor… "1.521…
##  7 AE    United Arab …  41525.  2015 ""     2022-1… ARE   Middl… "Abu D… "54.37…
##  8 AF    Afghanistan      592.  2015 ""     2022-1… AFG   South… "Kabul" "69.17…
##  9 AG    Antigua and …  14862.  2015 ""     2022-1… ATG   Latin… "Saint… "-61.8…
## 10 AL    Albania         3953.  2015 ""     2022-1… ALB   Europ… "Tiran… "19.81…
## # … with 256 more rows, 3 more variables: latitude <chr>, income <chr>,
## #   lending <chr>, and abbreviated variable names ¹NY.GDP.PCAP.CD,
## #   ²lastupdated, ³longitude
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

9.4 Help Pack countrycode: standardized country name and country code

library(countrycode)
countrycode(c("china","us"), "country.name", "iso2c")

## [1] "CN" "US"

countrycode(c("china","us"), "country.name", "iso3c")

## [1] "CHN" "USA"

9.5 Help Pack ggsci: color palettes used in scientific publications

library(ggsci)

9.6 Help Pack ggtech: use pre-set theme from common tech companies (Facebook, Airbnb, Google,…)

It also allows user to design his/her own pre-set theme. See example in this page.

library(ggtech)
d2 <- data.frame(x = c(1:4, 3:1), y=1:7)
ggplot(aes(x,y), data=d2) + 
  geom_tech(size=2, theme="twitter") + 
  theme_tech("twitter") +
  ggtitle("Twitter geom")

9.7 Help Pack ggtext: formatting texts in ggplot in a stylish way

library(ggtext)

9.8 Help Pack ggiragh:: a better way of transforming ggplot to interactive plotly

library(ggiraph)
#### just an example
gg1 <- 
  ggplot(PseudoData) +
  geom_point_interactive(aes(x = Grade_Stats_Y1, y = Grade_Math_Y1, color = Employed,
                             tooltip = StudentID, data_id = StudentID), size = 3)
gg2 <- 
  ggplot(PseudoData) +
  geom_point_interactive(aes(x = Grade_Art_Y1, y = Grade_Engl_Y1, color = Employed,
                             tooltip = StudentID, data_id = StudentID), size = 3)

girafe(ggobj = cowplot::plot_grid(gg1, gg2), width_svg = 8, height_svg = 4)

9.9 Help Pack magick: working with figuer/gif

library(magick)

9.10 Help Pack tidymetrics, shinymetrics, shinydashboard and shinybones

These four packages can be used to create shinymetrics for easy plotting, particularly useful in presenting count data across cateogires and time.

In addition, utilizing both shinydashboard and shinybones, a interactive dashboard can be easily created.

An speech focusing on the workflow can be found on this site. David Robinson has also given several examples in his Tidy Tuesday screen casts.

An example created by myself using data of Spotify songs is also available.

# devtools::install_github("ramnathv/tidymetrics")
# library(tidymetrics)

9.11 Help Pack GGally/ggcorr: variable correlation plots in one simple ste

More details about ggcorr correlation plots can be seen in this site.

library(GGally)
PseudoData %>% 
 select(Race, Age,Grade_Math_Y1,Grade_Stats_Y1,Grade_Math_Y1_Letter,Grade_Stats_Y1_Letter) %>%
 ggpairs()

9.12 Help Pack ggstatsplot: a quick function for a cool plot with summary statistics and tests

This is a helpful package when you want to generate a quick plot with some summary statistics and quick tests. For more details, see this site.

library(ggstatsplot)

ggbetweenstats(
  data  = PseudoData,
  x     = Major,
  y     = Grade_Math_Y1,
  title = "Distribution of first year math grade across majors"
)

9.13 Help Pack calendR: plot calendar using R

See this page for more details: https://r-coder.com/calendar-plot-r/

library(calendR)
# Vector of NA of the same length of the number of days of the year
events <- rep(NA, 365)

# Set the corresponding events
events[40:45] <- "Trip"
events[213:240] <- "Holidays"
events[252] <- "Birthday"
events[359] <- "Christmas" 

# Creating the calendar with a legend
calendR(year = 2025,
        special.days = events,
        special.col = c("pink", "lightblue", # Colors
                        "lightgreen", "lightsalmon"),
        legend.pos = "right") # Legend to the right

# example two
calendR(start_date = "2020-09-01", # Custom start date
        end_date = "2021-05-31",   # Custom end date
        start = "M",               # Start the weeks on Monday
        mbg.col = 4,               # Color of the background of the names of the months
        months.col = "white",      # Color text of the names of the months
        special.days = "weekend",  # Color the weekends
        special.col = "lightblue", # Color of the special.days
        lty = 0,                   # Line type
        bg.col = "#f4f4f4",        # Background color
        title = "Academic calendar 2020-2021", # Title
        title.size = 30,                       # Title size
        orientation = "p")         # Vertical orientation

9.14 Help Pack tidymodels: running modeling in a tidy way

One quick example of how classification, regression and other applications work in tidymodel format can be found in David Robinson’s tidytuesday screen case: penguins.

9.15 Help Pack datapasta: copy table/data and directly paste into r as tibble

Try copying a table/table-like object from this wikipedia site. Then click on “Addins” and click “Paste as tribble” (or other format of your choice), and it shall return something like this, which can be directly ran as an r object.

tibble::tribble(
 ~Position, ~Tonic,          ~Heptatonic.mode,           ~Pentatonic.scales,           ~Name,
        0L,    "F",                  "Lydian",                      "Major",              NA,
        1L,    "C",          "Ionian (major)",             "Major, ritusen", "Straight harp",
        2L,    "G",              "Mixolydian",  "Major, ritusen, suspended",     "Crossharp",
        3L,    "D",                  "Dorian",  "Minor, ritusen, suspended",    "Slant harp",
        4L,    "A", "Aeolian (natural minor)", "Minor, man gong, suspended",              NA,
        5L,    "E",                "Phrygian",            "Minor, man gong",              NA,
        6L,    "B",                 "Locrian",            "Man gong, blues",              NA
 )

## # A tibble: 7 × 5
##   Position Tonic Heptatonic.mode         Pentatonic.scales          Name        
##      <int> <chr> <chr>                   <chr>                      <chr>       
## 1        0 F     Lydian                  Major                      <NA>        
## 2        1 C     Ionian (major)          Major, ritusen             Straight ha…
## 3        2 G     Mixolydian              Major, ritusen, suspended  Crossharp   
## 4        3 D     Dorian                  Minor, ritusen, suspended  Slant harp  
## 5        4 A     Aeolian (natural minor) Minor, man gong, suspended <NA>        
## 6        5 E     Phrygian                Minor, man gong            <NA>        
## 7        6 B     Locrian                 Man gong, blues            <NA>

9.16 Help Pack factoextra: a package to visualize multivariate analysis

This is package that aimed at consolidate visualization tools for multivariate analyses such as: PCA, correspondence analysis, clustering, particitioning. More details to be found in this site.

9.17 Help Pack raddle: a newbie friendly click-based data exploration tool

raddle creates an interactive and click-based user interface that allow the user to explore/mine the data without having to type any code. In addition, it generates the code behind each click and make the output more streamlined. For details see this post.

# library(rattle)
# rattle()

9.18 Help Pack esquisse: another more powerful newbie friendly click-based plotting tool

For details, see this post.

# library(esquisse)
# esquisser()

9.19 Help Pack easyPubMed: extract data from PubMed and analyze PubMed data

This site contains some examples.

library("easyPubMed")
dami_query_string <- "yayi zhao[AU]"
dami_on_pubmed <- get_pubmed_ids(dami_query_string)
dami_papers <- fetch_pubmed_data(dami_on_pubmed)
titles <- sapply(dami_papers, custom_grep, tag = "ArticleTitle", format = "char", USE.NAMES = FALSE)
## The following articles are under my name, but some of them aren't really my work... blame PubMed for its search engine.
print(titles)

##       [,1]                                                                                                                                                      
##  [1,] "Feasibility of a Culturally Specific DEmentia Competence Education for Nursing Home Taskforce (DECENT) Programme: A Mixed-Method Approach."              
##  [2,] "History of keratinocyte carcinoma and survival after a second primary malignancy: the Moffitt Cancer Center patient experience."                         
##  [3,] "Ethical challenges experienced by care home staff during COVID-19 pandemic."                                                                             
##  [4,] "Effect of a culturally sensitive DEmentia Competence Education for Nursing home Taskforce (DECENT) programme in China: A quasi-experimental study."      
##  [5,] "Translation and validation of Chinese version of sense of competence in dementia care staff scale in healthcare providers: a cross-sectional study."     
##  [6,] "Effect of prior antibiotic or chemotherapy treatment on immunotherapy response in non-small cell lung cancer."                                           
##  [7,] "Natural History of Incident and Persistent Cutaneous Human Papillomavirus and Human Polyomavirus Infections."                                            
##  [8,] "Circulating Immunosuppressive Regulatory T Cells Predict Risk of Incident Cutaneous Squamous Cell Carcinoma."                                            
##  [9,] "Dementia care education interventions on healthcare providers' outcomes in the nursing home setting: A systematic review."                               
## [10,] "Staff's Psychological Well-Being and Coping Mechanisms During COVID-19 Lockdown in Care Homes for Older Adults: A Structural Equation Modeling Analysis."
## [11,] "Cutaneous Human Papillomaviruses and the Risk of Keratinocyte Carcinomas."                                                                               
## [12,] "Association between Human Polyomaviruses and Keratinocyte Carcinomas: A Prospective Cohort Study."                                                       
## [13,] "Gamification for promoting advance care planning: A mixed-method systematic review and meta-analysis."                                                   
## [14,] "Understanding dementia care in care home setting in China: An exploratory qualitative study."                                                            
## [15,] "Validation of a Chinese version of the dementia knowledge assessment scale in healthcare providers in China."                                            
## [16,] "Cutaneous viral infections associated with ultraviolet radiation exposure."                                                                              
## [17,] "Measuring Self-Efficacy and Readiness for Advance Care Planning in Chinese Older Adults."                                                                
## [18,] "Viruses in Skin Cancer (VIRUSCAN): Study Design and Baseline Characteristics of a Prospective Clinic-Based Cohort Study."                                
## [19,] "T Regulatory Cell Subpopulations Associated with Recent Ultraviolet Radiation Exposure in a Skin Cancer Screening Cohort."                               
## [20,] "Cutaneous Viral Infections Across 2 Anatomic Sites Among a Cohort of Patients Undergoing Skin Cancer Screening."

9.20 Help Pack installr: updating R from R

This is something that I am yet to try. This package is supposed to update R, not the packages, using r itself. Check out this post for more details.

A personal summary of R code for data analysis

eli_3357

8/5/2022