The following code/output/plots were served as examples of specific/general R functions/operations that I found helpful in analyzing data with R.
A pseudo dataset was created to illustrate functions/codes. The data was created using random sampling completely, and the topic of the data is mimicking a class of college students.
rm(list=ls())
library(tidyverse)
library(lubridate)
library(janeaustenr)
set.seed(20120116)
PseudoData <- data.frame(StudentID = c(1:100)) %>%
mutate(
#### Demographics
DoB = sample(seq(as.Date('1990/01/01'), as.Date('1995/01/01'), by="day"), 100),
Age = floor(decimal_date(as.Date('2020/01/01')) - decimal_date(DoB)),
Gender = sample(c("Female","Male"),size=100, replace=T),
Race = sample(c("White","Black","Asian","Multiple","Unknown",NA), size=100, replace=T),
Ethnicity = sample(c("Non-Hispanic","Hispanic"),size=100,replace=T)) %>%
#### Numerical grade from year 1 to year 4
mutate(
Grade_Math_Y1 = round(rnorm(100,mean=70,sd=10)),
Grade_Stats_Y1 = round(rnorm(100,mean=75,sd=10)),
Grade_Engl_Y1 = round(rnorm(100,mean=75,sd=10)),
Grade_Art_Y1 = round(rnorm(100,mean=75,sd=10)),
Grade_Sports_Y1 = round(rnorm(100,mean=75,sd=10)),
Grade_Math_Y2 = Grade_Math_Y1+round(rnorm(100,mean=2,sd=5)),
Grade_Stats_Y2 = Grade_Stats_Y1+round(rnorm(100,mean=2,sd=5)),
Grade_Engl_Y2 = Grade_Engl_Y1+round(rnorm(100,mean=2,sd=5)),
Grade_Art_Y2 = Grade_Art_Y1+round(rnorm(100,mean=2,sd=5)),
Grade_Sports_Y2 = Grade_Sports_Y1+round(rnorm(100,mean=2,sd=5)),
Grade_Math_Y3 = Grade_Math_Y2+round(rnorm(100,mean=2,sd=5)),
Grade_Stats_Y3 = Grade_Stats_Y2+round(rnorm(100,mean=2,sd=5)),
Grade_Engl_Y3 = Grade_Engl_Y2+round(rnorm(100,mean=2,sd=5)),
Grade_Art_Y3 = Grade_Art_Y2+round(rnorm(100,mean=2,sd=5)),
Grade_Sports_Y3 = Grade_Sports_Y2+round(rnorm(100,mean=2,sd=5)),
Grade_Math_Y4 = Grade_Math_Y3+round(rnorm(100,mean=2,sd=5)),
Grade_Stats_Y4 = Grade_Stats_Y3+round(rnorm(100,mean=2,sd=5)),
Grade_Engl_Y4 = Grade_Engl_Y3+round(rnorm(100,mean=2,sd=5)),
Grade_Art_Y4 = Grade_Art_Y3+round(rnorm(100,mean=2,sd=5)),
Grade_Sports_Y4 = Grade_Sports_Y3+round(rnorm(100,mean=2,sd=5))) %>%
mutate_at(vars(starts_with("Grade")), ~ ifelse(. >100, 100, .)) %>%
#### Create some random NA's in the data
mutate_at(vars(starts_with("Grade")), ~ ifelse(. == sample(seq_len(100), 10),NA,.)) %>%
#### Create letter-grades based on number-grades
mutate_at(vars(starts_with('Grade')), .funs = list(Letter = ~ ifelse(
. >= 90, "A",ifelse( . >= 80, "B", ifelse(. >= 70, "C", ifelse(. >= 60, "D","F")))))) %>%
#### Add other aspects of the data
mutate(Major = sample(c("Communications", "Business", "Unknown","Statistics", "Public Health",NA,
"Nursing","Sustainability","Biology", "English","History", "Computer Science"),
size=100, replace=T),
Scholarship = sample(c("No","Yes"), size=100, prob=c(0.6,0.4), replace=T),
Schol_Amount = ifelse(Scholarship %in% "Yes", round(rnorm(100,mean=80, sd = 20))*100, NA),
ProjectedEarning = sample(seq(35000, 110000, by=500),100)) %>%
#### Add employment for potential survival analysis
mutate(JobHuntDate = sample(seq(as.Date('2020/01/01'), as.Date('2020/04/01'), by="day"), 100, replace = T),
Employed= sample(c("No","Yes"), size=100, prob=c(0.7,0.3), replace=T),
EmployDate = as.Date(ifelse(Employed %in% "Yes",
as.Date( JobHuntDate + sample(seq(30,180,1), 100, replace=T),
origin="1970-01-01"),
as.Date('2021/01/01')),
origin="1970-01-01"),
Day2Employ = as.numeric(difftime(EmployDate, JobHuntDate, unit="days"))) %>%
#### For text analysis, use random data from janeaustenr package.
mutate(Fav_Jane = sample(as.character(austen_books()$text[!austen_books()$text %in% ""]),
100, replace = T)) %>%
#### Add US state longitude and latitude data for map plot purpose
## Randomly select 100 location in US
bind_cols(sample_n(map_data("state"),100)) %>%
select(-c(group, order, subregion)) %>%
dplyr::rename(Longitude = long, Latitude = lat, State = region) %>%
#### Transfer data.frame as a subclass that behave with different default behavior
tbl_df()
# rm(list=ls())
# invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE))
Often need to install rtools.exe and devtools package.
# require(devtools)
# install_version("ggplot2", version = "0.9.1", repos = "http://cran.us.r-project.org")
#### install packages under a specific library path
# project_lib <- "D:/YOUR OWN PATH/"
# install.packages("countrycode", lib = project_lib)
#### load packages from a specific library path
# project_lib <- "D:/YOUR OWN PATH/"
# library(countrycode, lib.loc = project_lib)
# ip = as.data.frame(installed.packages(lib.loc = project_lib)[,c(1,3:4)])
# write.csv(ip, file = paste0("list of packages ", Sys.Date(), ".csv"), row.names = F)
# if(!require(somepackage)){
# install.packages("somepackage")
# library(somepackage)
# }
# install.packages("packagename", type = "binary")
#### Method 1. use glimpse from dplyr
glimpse(PseudoData)
## Rows: 100
## Columns: 58
## $ StudentID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ DoB <date> 1993-01-31, 1991-01-04, 1993-02-28, 1990-11-28…
## $ Age <dbl> 26, 28, 26, 29, 29, 25, 28, 27, 28, 27, 25, 28,…
## $ Gender <chr> "Female", "Female", "Female", "Male", "Male", "…
## $ Race <chr> "Asian", "Unknown", NA, "Unknown", "Multiple", …
## $ Ethnicity <chr> "Non-Hispanic", "Hispanic", "Hispanic", "Non-Hi…
## $ Grade_Math_Y1 <dbl> 56, 65, 46, 69, 84, 81, 62, 75, 56, 90, 66, 93,…
## $ Grade_Stats_Y1 <dbl> 78, 61, 46, 79, 80, 80, 70, 83, 75, 80, 63, 82,…
## $ Grade_Engl_Y1 <dbl> 80, 92, 83, 74, 87, 56, 69, 60, 82, 77, 82, 77,…
## $ Grade_Art_Y1 <dbl> 80, 87, 70, 62, NA, 63, 81, 63, 69, 69, 67, NA,…
## $ Grade_Sports_Y1 <dbl> 75, 79, 90, 64, 82, 72, 81, 71, 90, 81, 79, 62,…
## $ Grade_Math_Y2 <dbl> 60, 63, 45, 66, 90, 85, 64, 76, 61, 91, 61, 97,…
## $ Grade_Stats_Y2 <dbl> 76, 62, 45, 79, 79, 88, 67, 80, 72, 76, 69, 89,…
## $ Grade_Engl_Y2 <dbl> 85, 95, 88, 80, 95, NA, 75, NA, 80, 76, 81, 87,…
## $ Grade_Art_Y2 <dbl> 88, 86, 77, 71, 58, 69, 87, 76, 64, 56, 67, 75,…
## $ Grade_Sports_Y2 <dbl> 77, 81, 100, 57, 87, 72, 85, 74, 100, 85, 81, 6…
## $ Grade_Math_Y3 <dbl> 68, 67, 56, 76, 96, 87, 67, 75, 59, 91, 64, 99,…
## $ Grade_Stats_Y3 <dbl> 77, 55, 47, 77, 91, 97, 64, 79, 79, 74, 80, 92,…
## $ Grade_Engl_Y3 <dbl> 87, 96, 82, 79, 93, 79, 71, 57, 80, 80, 86, 87,…
## $ Grade_Art_Y3 <dbl> 87, 84, 82, 64, 52, 69, 86, 74, 64, 54, 69, 73,…
## $ Grade_Sports_Y3 <dbl> 77, 91, 98, 62, 85, 75, 86, 69, 100, 94, 90, 61…
## $ Grade_Math_Y4 <dbl> 65, 73, 61, 78, 100, 91, 68, 75, 61, 100, 74, 1…
## $ Grade_Stats_Y4 <dbl> 78, 53, 39, 77, 100, 95, 71, 80, 77, 75, 72, 94…
## $ Grade_Engl_Y4 <dbl> 92, 98, 83, 77, 91, 85, 69, 55, 91, 87, 91, 78,…
## $ Grade_Art_Y4 <dbl> 100, 77, 93, 67, 65, 69, 93, 71, 64, 54, 68, 74…
## $ Grade_Sports_Y4 <dbl> 80, 85, 88, 61, 89, 67, 83, 56, 100, 98, 88, 62…
## $ Grade_Math_Y1_Letter <chr> "F", "D", "F", "D", "B", "B", "D", "C", "F", "A…
## $ Grade_Stats_Y1_Letter <chr> "C", "D", "F", "C", "B", "B", "C", "B", "C", "B…
## $ Grade_Engl_Y1_Letter <chr> "B", "A", "B", "C", "B", "F", "D", "D", "B", "C…
## $ Grade_Art_Y1_Letter <chr> "B", "B", "C", "D", NA, "D", "B", "D", "D", "D"…
## $ Grade_Sports_Y1_Letter <chr> "C", "C", "A", "D", "B", "C", "B", "C", "A", "B…
## $ Grade_Math_Y2_Letter <chr> "D", "D", "F", "D", "A", "B", "D", "C", "D", "A…
## $ Grade_Stats_Y2_Letter <chr> "C", "D", "F", "C", "C", "B", "D", "B", "C", "C…
## $ Grade_Engl_Y2_Letter <chr> "B", "A", "B", "B", "A", NA, "C", NA, "B", "C",…
## $ Grade_Art_Y2_Letter <chr> "B", "B", "C", "C", "F", "D", "B", "C", "D", "F…
## $ Grade_Sports_Y2_Letter <chr> "C", "B", "A", "F", "B", "C", "B", "C", "A", "B…
## $ Grade_Math_Y3_Letter <chr> "D", "D", "F", "C", "A", "B", "D", "C", "F", "A…
## $ Grade_Stats_Y3_Letter <chr> "C", "F", "F", "C", "A", "A", "D", "C", "C", "C…
## $ Grade_Engl_Y3_Letter <chr> "B", "A", "B", "C", "A", "C", "C", "F", "B", "B…
## $ Grade_Art_Y3_Letter <chr> "B", "B", "B", "D", "F", "D", "B", "C", "D", "F…
## $ Grade_Sports_Y3_Letter <chr> "C", "A", "A", "D", "B", "C", "B", "D", "A", "A…
## $ Grade_Math_Y4_Letter <chr> "D", "C", "D", "C", "A", "A", "D", "C", "D", "A…
## $ Grade_Stats_Y4_Letter <chr> "C", "F", "F", "C", "A", "A", "C", "B", "C", "C…
## $ Grade_Engl_Y4_Letter <chr> "A", "A", "B", "C", "A", "B", "D", "F", "A", "B…
## $ Grade_Art_Y4_Letter <chr> "A", "C", "A", "D", "D", "D", "A", "C", "D", "F…
## $ Grade_Sports_Y4_Letter <chr> "B", "B", "B", "D", "B", "D", "B", "F", "A", "A…
## $ Major <chr> "Nursing", "Unknown", "Unknown", "Public Health…
## $ Scholarship <chr> "No", "No", "No", "No", "No", "No", "No", "No",…
## $ Schol_Amount <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ ProjectedEarning <dbl> 46000, 91500, 48500, 98500, 102000, 108500, 735…
## $ JobHuntDate <date> 2020-01-10, 2020-01-10, 2020-01-17, 2020-03-12…
## $ Employed <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes…
## $ EmployDate <date> 2021-01-01, 2021-01-01, 2021-01-01, 2021-01-01…
## $ Day2Employ <dbl> 357, 357, 350, 295, 63, 283, 355, 43, 349, 287,…
## $ Fav_Jane <chr> "answer a note, instead of waiting for me, you …
## $ Longitude <dbl> -75.79086, -71.41919, -89.39861, -114.72906, -8…
## $ Latitude <dbl> 44.45007, 41.60247, 30.29801, 33.40344, 38.4397…
## $ State <chr> "new york", "rhode island", "mississippi", "ari…
#### Method 2. use describe() from Hmisc
# This method outputs a very detailed report, thus result not shown
# Hmisc::describe(PseudoData)
## see also https://dabblingwithdata.wordpress.com/2018/01/02/my-favourite-r-package-for-summarising-data/
## for other packages/functions of summarizing data
#### Method 3. use skim from skimr
library(skimr)
skim(PseudoData)
| Name | PseudoData |
| Number of rows | 100 |
| Number of columns | 58 |
| _______________________ | |
| Column type frequency: | |
| character | 28 |
| Date | 3 |
| numeric | 27 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Gender | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| Race | 22 | 0.78 | 5 | 8 | 0 | 5 | 0 |
| Ethnicity | 0 | 1.00 | 8 | 12 | 0 | 2 | 0 |
| Grade_Math_Y1_Letter | 2 | 0.98 | 1 | 1 | 0 | 5 | 0 |
| Grade_Stats_Y1_Letter | 1 | 0.99 | 1 | 1 | 0 | 5 | 0 |
| Grade_Engl_Y1_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Art_Y1_Letter | 4 | 0.96 | 1 | 1 | 0 | 5 | 0 |
| Grade_Sports_Y1_Letter | 1 | 0.99 | 1 | 1 | 0 | 5 | 0 |
| Grade_Math_Y2_Letter | 1 | 0.99 | 1 | 1 | 0 | 5 | 0 |
| Grade_Stats_Y2_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Engl_Y2_Letter | 3 | 0.97 | 1 | 1 | 0 | 5 | 0 |
| Grade_Art_Y2_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Sports_Y2_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Math_Y3_Letter | 2 | 0.98 | 1 | 1 | 0 | 5 | 0 |
| Grade_Stats_Y3_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Engl_Y3_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Art_Y3_Letter | 2 | 0.98 | 1 | 1 | 0 | 5 | 0 |
| Grade_Sports_Y3_Letter | 1 | 0.99 | 1 | 1 | 0 | 5 | 0 |
| Grade_Math_Y4_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Stats_Y4_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Engl_Y4_Letter | 2 | 0.98 | 1 | 1 | 0 | 5 | 0 |
| Grade_Art_Y4_Letter | 0 | 1.00 | 1 | 1 | 0 | 5 | 0 |
| Grade_Sports_Y4_Letter | 1 | 0.99 | 1 | 1 | 0 | 5 | 0 |
| Major | 13 | 0.87 | 7 | 16 | 0 | 11 | 0 |
| Scholarship | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| Employed | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| Fav_Jane | 0 | 1.00 | 10 | 72 | 0 | 100 | 0 |
| State | 0 | 1.00 | 4 | 14 | 0 | 35 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DoB | 0 | 1 | 1990-02-27 | 1994-12-26 | 1992-11-05 | 100 |
| JobHuntDate | 0 | 1 | 2020-01-01 | 2020-03-31 | 2020-02-14 | 57 |
| EmployDate | 0 | 1 | 2020-03-03 | 2021-01-01 | 2021-01-01 | 23 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| StudentID | 0 | 1.00 | 50.50 | 29.01 | 1.00 | 25.75 | 50.50 | 75.25 | 100.00 | ▇▇▇▇▇ |
| Age | 0 | 1.00 | 26.79 | 1.29 | 25.00 | 26.00 | 27.00 | 28.00 | 29.00 | ▆▇▆▆▃ |
| Grade_Math_Y1 | 2 | 0.98 | 70.06 | 10.18 | 42.00 | 62.25 | 70.50 | 76.00 | 93.00 | ▁▅▇▆▃ |
| Grade_Stats_Y1 | 1 | 0.99 | 75.22 | 9.05 | 46.00 | 70.00 | 75.00 | 82.00 | 97.00 | ▁▃▇▇▂ |
| Grade_Engl_Y1 | 0 | 1.00 | 75.77 | 10.98 | 41.00 | 68.00 | 74.50 | 83.00 | 100.00 | ▁▂▇▇▂ |
| Grade_Art_Y1 | 4 | 0.96 | 73.84 | 8.26 | 56.00 | 67.75 | 73.50 | 78.00 | 92.00 | ▂▆▇▃▃ |
| Grade_Sports_Y1 | 1 | 0.99 | 74.92 | 9.62 | 52.00 | 68.50 | 76.00 | 81.50 | 96.00 | ▂▅▇▇▂ |
| Grade_Math_Y2 | 1 | 0.99 | 72.60 | 11.13 | 44.00 | 64.00 | 73.00 | 82.00 | 97.00 | ▁▅▇▆▂ |
| Grade_Stats_Y2 | 0 | 1.00 | 77.16 | 9.94 | 45.00 | 71.00 | 77.00 | 83.00 | 100.00 | ▁▂▇▇▂ |
| Grade_Engl_Y2 | 3 | 0.97 | 77.84 | 11.61 | 42.00 | 70.00 | 78.00 | 86.00 | 100.00 | ▁▃▇▇▃ |
| Grade_Art_Y2 | 0 | 1.00 | 75.77 | 9.49 | 54.00 | 69.00 | 75.50 | 82.25 | 98.00 | ▂▅▇▆▁ |
| Grade_Sports_Y2 | 0 | 1.00 | 77.14 | 10.66 | 46.00 | 71.75 | 77.50 | 84.00 | 100.00 | ▁▃▇▇▂ |
| Grade_Math_Y3 | 2 | 0.98 | 75.01 | 11.98 | 46.00 | 67.00 | 76.00 | 83.75 | 99.00 | ▂▆▇▇▃ |
| Grade_Stats_Y3 | 0 | 1.00 | 79.10 | 11.34 | 47.00 | 72.75 | 79.00 | 88.00 | 100.00 | ▁▃▇▇▅ |
| Grade_Engl_Y3 | 0 | 1.00 | 79.83 | 11.58 | 43.00 | 72.75 | 81.00 | 87.25 | 100.00 | ▁▂▅▇▅ |
| Grade_Art_Y3 | 2 | 0.98 | 78.11 | 10.71 | 49.00 | 71.00 | 79.00 | 85.00 | 100.00 | ▁▃▇▇▃ |
| Grade_Sports_Y3 | 1 | 0.99 | 79.43 | 11.48 | 50.00 | 71.50 | 81.00 | 87.00 | 100.00 | ▁▅▆▇▅ |
| Grade_Math_Y4 | 0 | 1.00 | 76.76 | 13.89 | 41.00 | 67.75 | 77.50 | 87.25 | 100.00 | ▂▃▇▇▅ |
| Grade_Stats_Y4 | 0 | 1.00 | 81.70 | 12.82 | 39.00 | 72.00 | 82.50 | 92.00 | 100.00 | ▁▂▅▇▇ |
| Grade_Engl_Y4 | 2 | 0.98 | 81.51 | 12.39 | 37.00 | 72.25 | 82.50 | 91.00 | 100.00 | ▁▁▅▇▇ |
| Grade_Art_Y4 | 0 | 1.00 | 79.70 | 10.89 | 49.00 | 73.00 | 79.00 | 87.00 | 100.00 | ▁▃▇▇▅ |
| Grade_Sports_Y4 | 1 | 0.99 | 81.49 | 12.35 | 52.00 | 73.50 | 82.00 | 90.00 | 100.00 | ▂▃▅▇▆ |
| Schol_Amount | 77 | 0.23 | 7869.57 | 2557.95 | 3300.00 | 6150.00 | 8100.00 | 9350.00 | 14500.00 | ▃▇▇▃▁ |
| ProjectedEarning | 0 | 1.00 | 72320.00 | 23408.70 | 35500.00 | 51875.00 | 71750.00 | 92750.00 | 110000.00 | ▇▇▅▇▇ |
| Day2Employ | 0 | 1.00 | 264.03 | 107.50 | 31.00 | 175.75 | 304.00 | 345.25 | 366.00 | ▂▁▁▃▇ |
| Longitude | 0 | 1.00 | -90.48 | 14.03 | -124.06 | -96.81 | -88.17 | -80.25 | -70.03 | ▂▁▃▇▆ |
| Latitude | 0 | 1.00 | 37.31 | 5.64 | 26.04 | 33.54 | 36.47 | 41.61 | 48.29 | ▃▇▇▆▅ |
#### Method 4. Ssummary stats for continuous as table
library(rstatix)
PseudoData %>%
group_by(Major) %>%
select(Grade_Art_Y1, Grade_Math_Y1) %>%
get_summary_stats()
## # A tibble: 24 × 14
## Major varia…¹ n min max median q1 q3 iqr mad mean sd
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Biology Grade_… 10 64 92 74 69.5 81.8 12.2 8.90 75.7 9.12
## 2 Biology Grade_… 10 62 90 75.5 67 80.8 13.8 10.4 74.8 9.45
## 3 Business Grade_… 7 60 85 77 75 80.5 5.5 4.45 76.1 8.11
## 4 Business Grade_… 8 69 93 72.5 71.5 77 5.5 2.96 76.1 8.61
## 5 Communi… Grade_… 7 67 80 70 67.5 73.5 6 4.45 71.3 4.75
## 6 Communi… Grade_… 8 54 74 68 60.5 72.5 12 8.90 66 7.86
## 7 Compute… Grade_… 9 68 89 77 77 78 1 1.48 76.8 6.04
## 8 Compute… Grade_… 9 42 84 74 62 81 19 11.9 70.2 13.4
## 9 English Grade_… 7 63 82 73 69.5 76.5 7 5.93 72.9 6.36
## 10 English Grade_… 8 57 82 69 58.8 72 13.2 9.64 67.2 8.75
## # … with 14 more rows, 2 more variables: se <dbl>, ci <dbl>, and abbreviated
## # variable name ¹variable
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
#### Option 1. use eval with parse. However, this method does not work with dplyr pipes
x <- "Math"
eval(parse(text=paste0("PseudoData$Grade_", x, "_Y1")))
## [1] 56 65 46 69 84 81 62 75 56 90 66 93 62 42 72 71 69 82 76 64 72 59 85 54 55
## [26] NA 84 84 56 82 82 85 56 71 82 68 62 84 53 NA 73 63 83 62 74 53 76 57 62 74
## [51] 60 66 72 58 75 68 71 61 69 68 75 84 56 61 65 68 86 70 64 71 72 85 62 70 74
## [76] 50 68 70 77 72 80 74 67 70 80 75 70 74 84 79 81 64 72 71 74 66 68 61 75 81
#### Option 2. use !!sym, this method works well with dplyr pipes
Y <- paste0("Grade_",x,"_Y1")
PseudoData %>% mutate(new_col = !!sym(Y)) %>% pull(new_col)
## [1] 56 65 46 69 84 81 62 75 56 90 66 93 62 42 72 71 69 82 76 64 72 59 85 54 55
## [26] NA 84 84 56 82 82 85 56 71 82 68 62 84 53 NA 73 63 83 62 74 53 76 57 62 74
## [51] 60 66 72 58 75 68 71 61 69 68 75 84 56 61 65 68 86 70 64 71 72 85 62 70 74
## [76] 50 68 70 77 72 80 74 67 70 80 75 70 74 84 79 81 64 72 71 74 66 68 61 75 81
#### Option 3. use {{ x }}, this method also works well with dplyr pipes
## check https://dplyr.tidyverse.org/articles/programming.html for more details
example_func <- function(data, col, ...) {
data %>%
count({{ col }}) %>%
ggplot(aes(x = n, y = {{ col }}, ...)) + # ... passes the additional terms as aesthetics
geom_col()
}
PseudoData %>%
example_func(Major, fill = Major)
#### Other situation, need to copy a console output that can be directly ran
PseudoData %>% select(Grade_Math_Y1) %>% head(10) %>% dput()
## structure(list(Grade_Math_Y1 = c(56, 65, 46, 69, 84, 81, 62,
## 75, 56, 90)), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
## "data.frame"))
PseudoData %>% pull(Grade_Math_Y1) %>% head(10) %>% dput()
## c(56, 65, 46, 69, 84, 81, 62, 75, 56, 90)
## Some numeric value will be automatically presented in scientific notation
1/987654
## [1] 1.0125e-06
## If you want to prevent that from happening, try set the scipen in the option
options(scipen = 999)
1/987654
## [1] 0.0000010125
## Also, if you want to reset the setting, you can do (0 is the default for scipen)
options(scipen = 0)
1/987654
## [1] 1.0125e-06
#### Situation 1. single column n and percentage - using prop.table
## note, if need to include NA in calculating percentage, add in table(, useNA = "always)
tb <- table(PseudoData$Race)
paste0(tb, " (",format(round(prop.table(tb) * 100, 2), nsmall = 2), ")")
## [1] "19 (24.36)" "20 (25.64)" "15 (19.23)" "11 (14.10)" "13 (16.67)"
#### Situation 2. single column n and percentage - using dplyr version
## note, this version, NA was included by default if exist
np <-
PseudoData %>%
count(Race, .drop = F) %>% # .drop = F forbids the function to drop empty categories
mutate(p = format(round(n/sum(n)*100, 2), nsmall = 2)) %>%
mutate(n_p = paste0(n, " (", p, ")"))
np
## # A tibble: 6 × 4
## Race n p n_p
## <chr> <int> <chr> <chr>
## 1 Asian 19 19.00 19 (19.00)
## 2 Black 20 20.00 20 (20.00)
## 3 Multiple 15 15.00 15 (15.00)
## 4 Unknown 11 11.00 11 (11.00)
## 5 White 13 13.00 13 (13.00)
## 6 <NA> 22 22.00 22 (22.00)
#### Situation 3. cross-tab - using prop.table
tb2 <- table(PseudoData$Race, PseudoData$Ethnicity)
## row percentage
paste0(tb2," (",format(round(prop.table(tb2, 1) * 100, 2), nsmall = 2), ")")
## [1] "9 (47.37)" "9 (45.00)" "7 (46.67)" "7 (63.64)" "7 (53.85)"
## [6] "10 (52.63)" "11 (55.00)" "8 (53.33)" "4 (36.36)" "6 (46.15)"
## column percentage
paste0(tb2," (",format(round(prop.table(tb2, 2) * 100, 2), nsmall = 2), ")")
## [1] "9 (23.08)" "9 (23.08)" "7 (17.95)" "7 (17.95)" "7 (17.95)"
## [6] "10 (25.64)" "11 (28.21)" "8 (20.51)" "4 (10.26)" "6 (15.38)"
#### Of course, a dplyr version also exist but, as I know, is more complex than what was described using prop.table
#### Method 1. report p-value from regression/test summary
## using wilcoxon test as an example
wilc_example <- wilcox.test(PseudoData$Grade_Math_Y1~PseudoData$Gender)
p_val_fun <- function(test){
p_raw <- test$p.value
p_format <- ifelse(p_raw < 0.001, paste0("p<0.001"), paste0("p=", round(p_raw, 3)))
p_format
}
p_val_fun(wilc_example)
## [1] "p=0.162"
#### Method 2. use format.pval from base
format.pval(wilc_example$p.value, 3)
## [1] "0.162"
PseudoData %>% select(1:5) %>% slice(1:5) %>% knitr::kable()
| StudentID | DoB | Age | Gender | Race |
|---|---|---|---|---|
| 1 | 1993-01-31 | 26 | Female | Asian |
| 2 | 1991-01-04 | 28 | Female | Unknown |
| 3 | 1993-02-28 | 26 | Female | NA |
| 4 | 1990-11-28 | 29 | Male | Unknown |
| 5 | 1990-07-05 | 29 | Male | Multiple |
#### Situation 1. One-way frequency
library(janitor)
PseudoData %>%
tabyl(Grade_Math_Y1_Letter) %>%
adorn_totals("row") %>%
adorn_pct_formatting(digits = 2)
## Grade_Math_Y1_Letter n percent valid_percent
## A 2 2.00% 2.04%
## B 20 20.00% 20.41%
## C 32 32.00% 32.65%
## D 29 29.00% 29.59%
## F 15 15.00% 15.31%
## <NA> 2 2.00% -
## Total 100 100.00% 100.00%
#### Situation 2. Two-way frequency
## just the frequency
PseudoData %>%
tabyl(Employed, Grade_Math_Y1_Letter) %>%
adorn_totals(c("row", "col"))
## Employed A B C D F NA_ Total
## No 2 11 22 27 11 1 74
## Yes 0 9 10 2 4 1 26
## Total 2 20 32 29 15 2 100
## percentage (frequency)
PseudoData %>%
tabyl(Employed, Grade_Math_Y1_Letter) %>%
adorn_totals("col") %>%
adorn_percentages("all") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_ns() %>%
reactable::reactable()
library(tableone)
col_list <- c("Age", "Gender", "Race", "Ethnicity", "Employed", "State")
fac_list <- col_list[-1]
report_table <- print(
CreateTableOne(
vars = col_list,
strata = "Employed",
data = PseudoData,
factorVars = fac_list,
includeNA = T,
test = F),
showAllLevels = T,
catDigits = 1,
contDigits = 1,
pDigits = 4
)
## Stratified by Employed
## level No Yes
## n 74 26
## Age (mean (SD)) 26.6 (1.2) 27.3 (1.3)
## Gender (%) Female 48 ( 64.9) 17 ( 65.4)
## Male 26 ( 35.1) 9 ( 34.6)
## Race (%) Asian 12 ( 16.2) 7 ( 26.9)
## Black 15 ( 20.3) 5 ( 19.2)
## Multiple 10 ( 13.5) 5 ( 19.2)
## Unknown 10 ( 13.5) 1 ( 3.8)
## White 11 ( 14.9) 2 ( 7.7)
## <NA> 16 ( 21.6) 6 ( 23.1)
## Ethnicity (%) Hispanic 42 ( 56.8) 9 ( 34.6)
## Non-Hispanic 32 ( 43.2) 17 ( 65.4)
## Employed (%) No 74 (100.0) 0 ( 0.0)
## Yes 0 ( 0.0) 26 (100.0)
## State (%) arizona 3 ( 4.1) 1 ( 3.8)
## arkansas 2 ( 2.7) 1 ( 3.8)
## california 3 ( 4.1) 1 ( 3.8)
## florida 5 ( 6.8) 0 ( 0.0)
## georgia 0 ( 0.0) 1 ( 3.8)
## illinois 0 ( 0.0) 2 ( 7.7)
## indiana 2 ( 2.7) 0 ( 0.0)
## iowa 0 ( 0.0) 1 ( 3.8)
## kansas 0 ( 0.0) 1 ( 3.8)
## kentucky 1 ( 1.4) 0 ( 0.0)
## louisiana 2 ( 2.7) 2 ( 7.7)
## maine 1 ( 1.4) 0 ( 0.0)
## maryland 2 ( 2.7) 1 ( 3.8)
## massachusetts 1 ( 1.4) 1 ( 3.8)
## michigan 4 ( 5.4) 1 ( 3.8)
## minnesota 3 ( 4.1) 0 ( 0.0)
## mississippi 2 ( 2.7) 1 ( 3.8)
## missouri 1 ( 1.4) 0 ( 0.0)
## nebraska 1 ( 1.4) 0 ( 0.0)
## nevada 1 ( 1.4) 0 ( 0.0)
## new hampshire 2 ( 2.7) 0 ( 0.0)
## new mexico 2 ( 2.7) 0 ( 0.0)
## new york 4 ( 5.4) 1 ( 3.8)
## north carolina 6 ( 8.1) 2 ( 7.7)
## ohio 3 ( 4.1) 0 ( 0.0)
## oregon 1 ( 1.4) 0 ( 0.0)
## rhode island 1 ( 1.4) 0 ( 0.0)
## south carolina 3 ( 4.1) 1 ( 3.8)
## tennessee 1 ( 1.4) 0 ( 0.0)
## texas 9 ( 12.2) 3 ( 11.5)
## virginia 0 ( 0.0) 1 ( 3.8)
## washington 1 ( 1.4) 2 ( 7.7)
## west virginia 4 ( 5.4) 1 ( 3.8)
## wisconsin 1 ( 1.4) 1 ( 3.8)
## wyoming 2 ( 2.7) 0 ( 0.0)
library(gtsummary)
## The tbl_summary outputs a nicely formatted table that can be directly reported in markdowns
PseudoData %>%
select(Employed, Grade_Math_Y1, Gender, Age, Race) %>%
tbl_summary(by = Employed) %>% # Use missing = "no" to exclude NA from the table
# add_p(pvalue_fun = ~style_pvalue(.x, digits = 2)) %>%
add_overall() %>%
add_n() %>%
modify_header(label ~ "**Variable**") %>%
modify_footnote(
all_stat_cols() ~ "Median (IQR) or Frequency (%)"
) %>%
bold_labels()
| Variable | N | Overall, N = 1001 | No, N = 741 | Yes, N = 261 |
|---|---|---|---|---|
| Grade_Math_Y1 | 98 | 70 (62, 76) | 69 (62, 74) | 75 (70, 82) |
| Unknown | 2 | 1 | 1 | |
| Gender | 100 | |||
| Female | 65 (65%) | 48 (65%) | 17 (65%) | |
| Male | 35 (35%) | 26 (35%) | 9 (35%) | |
| Age | 100 | |||
| 25 | 19 (19%) | 16 (22%) | 3 (12%) | |
| 26 | 27 (27%) | 22 (30%) | 5 (19%) | |
| 27 | 21 (21%) | 16 (22%) | 5 (19%) | |
| 28 | 22 (22%) | 14 (19%) | 8 (31%) | |
| 29 | 11 (11%) | 6 (8.1%) | 5 (19%) | |
| Race | 78 | |||
| Asian | 19 (24%) | 12 (21%) | 7 (35%) | |
| Black | 20 (26%) | 15 (26%) | 5 (25%) | |
| Multiple | 15 (19%) | 10 (17%) | 5 (25%) | |
| Unknown | 11 (14%) | 10 (17%) | 1 (5.0%) | |
| White | 13 (17%) | 11 (19%) | 2 (10%) | |
| Unknown | 22 | 16 | 6 | |
| 1 Median (IQR) or Frequency (%) | ||||
Tables created by compareGroups can be easily be exported to CSV, LaTeX, HTML, PDF, Word or Excel, or inserted in R-markdown files to generate reports automatically.
More details, please see this site.
library(compareGroups)
res <- compareGroups(Employed ~ Age + Gender + Grade_Math_Y1 + Grade_Engl_Y1_Letter + Major,
data = PseudoData)
res
##
##
## -------- Summary of results by groups of 'Employed'---------
##
##
## var N p.value method selection
## 1 Age 100 0.034** continuous normal ALL
## 2 Gender 100 1.000 categorical ALL
## 3 Grade_Math_Y1 98 0.177 continuous normal ALL
## 4 Grade_Engl_Y1_Letter 100 0.879 categorical ALL
## -----
## Signif. codes: 0 '**' 0.05 '*' 0.1 ' ' 1
createTable(res)
##
## --------Summary descriptives table by 'Employed'---------
##
## _______________________________________________________
## No Yes p.overall
## N=74 N=26
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Age 26.6 (1.25) 27.3 (1.31) 0.034
## Gender: 1.000
## Female 48 (64.9%) 17 (65.4%)
## Male 26 (35.1%) 9 (34.6%)
## Grade_Math_Y1 69.2 (9.73) 72.6 (11.2) 0.177
## Grade_Engl_Y1_Letter: 0.879
## A 9 (12.2%) 2 (7.69%)
## B 20 (27.0%) 7 (26.9%)
## C 24 (32.4%) 7 (26.9%)
## D 17 (23.0%) 8 (30.8%)
## F 4 (5.41%) 2 (7.69%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
It is often needed to extract regression coefficients and 95% confidence intervals from regression model and report just numbers instead of the summary table. Here are some quick functions to be used in that situation.
#### Situation 1. Cox regression, hazard ratio and 95% confidence interval
library(survival)
CoxReg <- coxph(Surv(Day2Employ, Employed == "Yes") ~ Gender, data = PseudoData)
HazardRatio <- function(x){
paste(format(round(summary(x)$conf.int[, 1], 2), nsmall = 2),
" (", format(round(summary(x)$conf.int[, 3], 2),nsmall = 2),
"-", format(round(summary(x)$conf.int[, 4], 2), nsmall = 2), ")", sep = "")}
HazardRatio(CoxReg)
## [1] "1.03 (0.46-2.31)"
#### Situation 2.a Logistic regression, odds ratio and 95% confidence interval (using base)
LogiReg <- glm(Employed == "Yes" ~ Age + Gender, data = PseudoData, family = "binomial")
OddsRatio <- function(x){
## note that confnt calculates confidence interval using profiling method
## while confint.default calculates confidence interval using Wald method, change if needed
temp <- exp(cbind(OR = coef(x), confint(x)))
output <- paste(format(round(temp[, 1], 2), nsmall = 2), "(",
format(round(temp[, 2], 2), nsmall = 2), "-",
format(round(temp[, 3], 2), nsmall = 2), ")")
return(temp)
}
OddsRatio(LogiReg)
## OR 2.5 % 97.5 %
## (Intercept) 6.859014e-06 2.242221e-10 0.1018555
## Age 1.496210e+00 1.048201e+00 2.1872222
## GenderMale 9.629803e-01 3.550460e-01 2.4941384
#### Situation 2.b Logistic regression, odds ratio and 95% confidence interval (using broom package)
broom::tidy(LogiReg, conf.int = T, exponentiate = T)
## # A tibble: 3 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.00000686 5.04 -2.36 0.0183 2.24e-10 0.102
## 2 Age 1.50 0.186 2.17 0.0303 1.05e+ 0 2.19
## 3 GenderMale 0.963 0.493 -0.0765 0.939 3.55e- 1 2.49
Functions from this package can format a variety types of tables to APA format, such as correlation table, regression table, ANOVA, and repeated measures. The output tables are saved as doc files by default. For details, see this post.
library(apaTables)
#### Example 1. Correlation table
PseudoData %>%
select(Grade_Art_Y1, Grade_Math_Y1, Grade_Stats_Y1, Grade_Sports_Y1) %>%
apa.cor.table(., filename="Table1_APA.doc", table.number=1)
##
##
## Table 1
##
## Means, standard deviations, and correlations with confidence intervals
##
##
## Variable M SD 1 2 3
## 1. Grade_Art_Y1 73.84 8.26
##
## 2. Grade_Math_Y1 70.06 10.18 .08
## [-.13, .28]
##
## 3. Grade_Stats_Y1 75.22 9.05 -.02 .09
## [-.22, .18] [-.11, .28]
##
## 4. Grade_Sports_Y1 74.92 9.62 .18 -.10 -.04
## [-.02, .37] [-.30, .10] [-.24, .16]
##
##
## Note. M and SD are used to represent mean and standard deviation, respectively.
## Values in square brackets indicate the 95% confidence interval.
## The confidence interval is a plausible range of population correlations
## that could have caused the sample correlation (Cumming, 2014).
## * indicates p < .05. ** indicates p < .01.
##
#### Example 2. Regression table
reg <- lm(Grade_Math_Y1 ~ Grade_Stats_Y1 + Grade_Sports_Y1 + Age, data = PseudoData)
apa.reg.table(reg, filename = "Table2_APA.doc", table.number = 2)
##
##
## Table 2
##
## Regression results using Grade_Math_Y1 as the criterion
##
##
## Predictor b b_95%_CI beta beta_95%_CI sr2 sr2_95%_CI r
## (Intercept) 55.37* [5.55, 105.18]
## Grade_Stats_Y1 0.10 [-0.14, 0.33] 0.08 [-0.12, 0.29] .01 [-.03, .04] .09
## Grade_Sports_Y1 -0.11 [-0.33, 0.12] -0.10 [-0.30, 0.11] .01 [-.03, .05] -.10
## Age 0.58 [-1.04, 2.19] 0.07 [-0.13, 0.28] .01 [-.02, .03] .08
##
##
##
## Fit
##
##
##
##
## R2 = .024
## 95% CI[.00,.08]
##
##
## Note. A significant b-weight indicates the beta-weight and semi-partial correlation are also significant.
## b represents unstandardized regression weights. beta indicates the standardized regression weights.
## sr2 represents the semi-partial correlation squared. r represents the zero-order correlation.
## Square brackets are used to enclose the lower and upper limits of a confidence interval.
## * indicates p < .05. ** indicates p < .01.
##
#### Example 3. 2-way ANOVA table
options(contrasts = c("contr.sum", "contr.poly"))
lm_output <- lm(Grade_Math_Y1 ~ Employed * Scholarship, data = PseudoData)
apa.aov.table(lm_output, filename = "Table3_APA.doc", table.number = 7)
##
##
## Table 7
##
## ANOVA results using Grade_Math_Y1 as the dependent variable
##
##
## Predictor SS df MS F p partial_eta2
## (Intercept) 268270.63 1 268270.63 2576.00 .000
## Employed 134.24 1 134.24 1.29 .259 .01
## Scholarship 42.09 1 42.09 0.40 .526 .00
## Employed x Scholarship 5.07 1 5.07 0.05 .826 .00
## Error 9789.36 94 104.14
## CI_90_partial_eta2
##
## [.00, .07]
## [.00, .05]
## [.00, .03]
##
##
## Note: Values in square brackets indicate the bounds of the 90% confidence interval for partial eta-squared
#### use wilcoxon as an example
result <- list()
for(i in c(7:26)){
fmla <- paste0("PseudoData$",names(PseudoData)[i], "~ PseudoData$Gender")
result[[i]]<- eval(parse(text=paste0("wilcox.test(",
fmla, ", paired = FALSE)")))
}
result
#### use logistic regression as an example
fun1<-function(x){
res<-c(paste(as.character(summary(x)$call),collapse = " "),
## note, only the gender coefficient (3rd row in the coefficient table) was reported
summary(x)$coefficients[3,4],
exp(coef(x))[3],
exp(confint(x))[3,1:2],"\n")
names(res)<-c("call","p-value","OR","LCI","UCI","")
return(res)}
res2=NULL
lms=list()
for(i in 27:30)
{
lms[[i]]=glm(PseudoData[,i] == "A"
~ Age +
as.factor(Gender),
family = "binomial", data = PseudoData)
res2<-rbind(res2,fun1(lms[[i]]))
}
res2
## call
## [1,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## [2,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## [3,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## [4,] "glm PseudoData[, i] == \"A\" ~ Age + as.factor(Gender) binomial PseudoData"
## p-value OR LCI
## [1,] "0.694933206189563" "0.754169390410814" "0.147688465301043"
## [2,] "0.651868303069197" "1.30416846723969" "0.455591375491405"
## [3,] "0.221655500497209" "1.64805194858341" "0.802210520880066"
## [4,] "0.47828655276198" "0.695304803476652" "0.236079607524523"
## UCI
## [1,] "3.85904159901459" "\n"
## [2,] "5.93665240232086" "\n"
## [3,] "4.32316360162714" "\n"
## [4,] "2.04728081413273" "\n"
The compare function from package waldo returns a color-coded output indicating what was different from the two/multiple objects. It works with vector, list, data frame or tibble.
waldo::compare(c(1, 1, 2, 3), c(2, 2, 2, 3))
## `old`: 1 1 2 3
## `new`: 2 2 2 3
#### Situation 1. Any of the columns
Any_Math_A <- apply(PseudoData[,str_detect(colnames(PseudoData),"Grade_Math") &
str_detect(colnames(PseudoData),"Letter")],1,
function(x){ifelse(any(x %in% "A"),1,0)})
#### Situation 2. All of the columns
All_Math_A <- apply(PseudoData[,str_detect(colnames(PseudoData),"Grade_Math") &
str_detect(colnames(PseudoData),"Letter")],1,
function(x){ifelse(all(x %in% "A"),1,0)})
#### Situation 1. Create empty data frame or tibble
Empty_DF <- data.frame(matrix(nrow = 100, ncol = 0))
Empty_1Tibble <- as_tibble(matrix(nrow = 100, ncol = 0))
#### Situation 2. Create data frame or tibble based on existing column
Temp_DF <- data.frame(Col_A = PseudoData$StudentID)
Temp_tibble <- tibble(Col_A = PseudoData$StudentID)
#### Situation 3. Create data frame or tibble by entering data
Temp_DF <- data.frame("Col A" = c(0,1,2),
"Col B" = c("A", "B", "C"),
"Col C" = factor(c("A", "B", "C")),
stringsAsFactors = F)
Test_Tibble <- tibble(Col_A = c(0,1,2),
Col_B = c("A","B","C"),
Col_C = factor(c("A","B", "D")),
NEW_COL = Col_A*Col_A)
#### Situation 4. Create a tibble that is based on combinations of multiple categorical variable
Temp_DF <- crossing(ID = 1:5, Year = 1:5, class = c("A", "B", "C"))
#### Situation 5. Transform a data frame to a tibble-df object
Temp_TibbleDF <- PseudoData %>% tbl_df()
#### Situation 6. Merge multiple data frames at once
Reduce(function(x, y) merge(x, y, by = "",all = TRUE), list(df1, df2, df3))
#### Situation 1. Identify complete rows in a data frame
complete.cases(PseudoData)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [61] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [73] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [97] FALSE TRUE FALSE FALSE
#### Situation 2. Replace certain entry as NA
## example of replacing single column
PseudoData %>% mutate(Race_na = na_if(Race, "Unknown")) %>% count(Race_na, Race)
## # A tibble: 6 × 3
## Race_na Race n
## <chr> <chr> <int>
## 1 Asian Asian 19
## 2 Black Black 20
## 3 Multiple Multiple 15
## 4 White White 13
## 5 <NA> Unknown 11
## 6 <NA> <NA> 22
## can also work with multiple columns
PseudoData %>%
mutate_at(vars(contains("Letter")), list(~ na_if(., "F"))) %>%
count(Grade_Math_Y1_Letter)
## # A tibble: 5 × 2
## Grade_Math_Y1_Letter n
## <chr> <int>
## 1 A 2
## 2 B 20
## 3 C 32
## 4 D 29
## 5 <NA> 17
#### Situation 3. Replace NA as certain entry
PseudoData %>%
replace_na(list(Race = "Missing",
Grade_Art_Y1_Letter = "Other")) %>%
count(Race, Grade_Art_Y1_Letter)
## # A tibble: 25 × 3
## Race Grade_Art_Y1_Letter n
## <chr> <chr> <int>
## 1 Asian A 1
## 2 Asian B 4
## 3 Asian C 10
## 4 Asian D 4
## 5 Black B 2
## 6 Black C 8
## 7 Black D 8
## 8 Black Other 2
## 9 Missing A 1
## 10 Missing B 6
## # … with 15 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 4. fill NA of one column with values from a specified column
PseudoData %>%
select(Grade_Math_Y1, Grade_Math_Y2) %>%
mutate(Grade_Math_Y2_New = coalesce(Grade_Math_Y2, Grade_Math_Y1)) %>%
filter(is.na(Grade_Math_Y2))
## # A tibble: 1 × 3
## Grade_Math_Y1 Grade_Math_Y2 Grade_Math_Y2_New
## <dbl> <dbl> <dbl>
## 1 79 NA 79
#### Situation 5. fill NA of one column with value of the next or the prior entry
PseudoData %>%
select(Grade_Math_Y1_Letter) %>%
fill(Grade_Math_Y1_Letter, .direction = "down")
## # A tibble: 100 × 1
## Grade_Math_Y1_Letter
## <chr>
## 1 F
## 2 D
## 3 F
## 4 D
## 5 B
## 6 B
## 7 D
## 8 C
## 9 F
## 10 A
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 6. remove rows with NA in any column
PseudoData %>% drop_na() %>% dim()
## [1] 12 58
#### Situation 1. Count a single column
## generate a frequency table
PseudoData %>% count(Major, name = "Major_n", sort = T)
## # A tibble: 12 × 2
## Major Major_n
## <chr> <int>
## 1 <NA> 13
## 2 Unknown 12
## 3 Nursing 11
## 4 Biology 10
## 5 Computer Science 9
## 6 Business 8
## 7 Communications 8
## 8 English 8
## 9 Public Health 6
## 10 Statistics 6
## 11 History 5
## 12 Sustainability 4
## or add a column counting that variable
PseudoData %>% add_count(Race, name = "Race_count") %>% select(Race, Race_count)
## # A tibble: 100 × 2
## Race Race_count
## <chr> <int>
## 1 Asian 19
## 2 Unknown 11
## 3 <NA> 22
## 4 Unknown 11
## 5 Multiple 15
## 6 White 13
## 7 Asian 19
## 8 <NA> 22
## 9 White 13
## 10 <NA> 22
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 2. Count multiple columns
## if the no observation falls in the combination, n is NA
PseudoData %>% count(Race, Major) %>% complete(Race, Major)
## # A tibble: 72 × 3
## Race Major n
## <chr> <chr> <int>
## 1 Asian Biology 3
## 2 Asian Business 1
## 3 Asian Communications 4
## 4 Asian Computer Science 1
## 5 Asian English 1
## 6 Asian History NA
## 7 Asian Nursing 4
## 8 Asian Public Health 2
## 9 Asian Statistics NA
## 10 Asian Sustainability NA
## # … with 62 more rows
## # ℹ Use `print(n = ...)` to see more rows
## can also change that NA to 0 by adding fill = list(n = 0)
PseudoData %>% count(Race, Major) %>% complete(Race, Major, fill = list(n = 0))
## # A tibble: 72 × 3
## Race Major n
## <chr> <chr> <int>
## 1 Asian Biology 3
## 2 Asian Business 1
## 3 Asian Communications 4
## 4 Asian Computer Science 1
## 5 Asian English 1
## 6 Asian History 0
## 7 Asian Nursing 4
## 8 Asian Public Health 2
## 9 Asian Statistics 0
## 10 Asian Sustainability 0
## # … with 62 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 3. count the time each entry appeared (sequence)
## often useful when having repeated data.
PseudoData %>%
# give the data an order to count sequence
arrange(DoB) %>%
select(DoB, Major) %>%
# code occurrence of each Major (when did each major had its first/2nd/3rd students)
group_by(Major) %>%
mutate(occurrance = sequence(n()))
## # A tibble: 100 × 3
## # Groups: Major [12]
## DoB Major occurrance
## <date> <chr> <int>
## 1 1990-02-27 Computer Science 1
## 2 1990-03-18 Computer Science 2
## 3 1990-07-05 <NA> 1
## 4 1990-08-19 <NA> 2
## 5 1990-09-22 History 1
## 6 1990-09-27 Unknown 1
## 7 1990-09-30 <NA> 3
## 8 1990-10-16 Public Health 1
## 9 1990-11-19 <NA> 4
## 10 1990-11-28 Public Health 2
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 1. Arrange the data based on a given variable
## remove desc then arrange the data in increasing order
PseudoData %>% arrange(desc(DoB)) %>% head(10) %>% select(StudentID, DoB)
## # A tibble: 10 × 2
## StudentID DoB
## <int> <date>
## 1 53 1994-12-26
## 2 87 1994-12-16
## 3 98 1994-11-18
## 4 37 1994-11-07
## 5 11 1994-10-02
## 6 14 1994-09-27
## 7 23 1994-09-23
## 8 77 1994-09-20
## 9 20 1994-06-28
## 10 45 1994-06-18
#### Situation 2. After arranging the data, keep only one observation for each category of certain variable
PseudoData %>% arrange(desc(Grade_Math_Y1)) %>% distinct(Major, .keep_all = T) %>% dim()
## [1] 12 58
#### Situation 3. Create a rank column based on order of a variable
## works with both categorical variable and continuous variable
# rank method will give ties a gap
PseudoData %>% select(Grade_Stats_Y1_Letter) %>%
mutate(rank = rank(Grade_Stats_Y1_Letter, ties.method = "min")) %>%
count(Grade_Stats_Y1_Letter, rank)
## # A tibble: 6 × 3
## Grade_Stats_Y1_Letter rank n
## <chr> <int> <int>
## 1 A 1 4
## 2 B 5 28
## 3 C 33 46
## 4 D 79 17
## 5 F 96 4
## 6 <NA> 100 1
# dense_rank will not give ties a gap
PseudoData %>% select(Grade_Stats_Y1_Letter) %>%
mutate(rank = dense_rank(Grade_Stats_Y1_Letter)) %>%
count(Grade_Stats_Y1_Letter, rank)
## # A tibble: 6 × 3
## Grade_Stats_Y1_Letter rank n
## <chr> <int> <int>
## 1 A 1 4
## 2 B 2 28
## 3 C 3 46
## 4 D 4 17
## 5 F 5 4
## 6 <NA> NA 1
#### Situation 1. Rename columns
## can use both rename
temp <- PseudoData %>% rename(NewName4Major = Major,
# or use column index (column number)
NewName4Age = 3)
## or rename within select
temp <- PseudoData %>% select(NewName4Major = Major,
NewName4Age = Age)
## or rename all if applies
temp <- PseudoData %>%
rename_all(str_replace, "Grade", "G")
#### Situation 2. Select columns
## select the columns starts_with certain string (does not work with |)
PseudoData %>% select(StudentID, starts_with('Grade',ignore.case=TRUE)) %>% colnames()
## [1] "StudentID" "Grade_Math_Y1" "Grade_Stats_Y1"
## [4] "Grade_Engl_Y1" "Grade_Art_Y1" "Grade_Sports_Y1"
## [7] "Grade_Math_Y2" "Grade_Stats_Y2" "Grade_Engl_Y2"
## [10] "Grade_Art_Y2" "Grade_Sports_Y2" "Grade_Math_Y3"
## [13] "Grade_Stats_Y3" "Grade_Engl_Y3" "Grade_Art_Y3"
## [16] "Grade_Sports_Y3" "Grade_Math_Y4" "Grade_Stats_Y4"
## [19] "Grade_Engl_Y4" "Grade_Art_Y4" "Grade_Sports_Y4"
## [22] "Grade_Math_Y1_Letter" "Grade_Stats_Y1_Letter" "Grade_Engl_Y1_Letter"
## [25] "Grade_Art_Y1_Letter" "Grade_Sports_Y1_Letter" "Grade_Math_Y2_Letter"
## [28] "Grade_Stats_Y2_Letter" "Grade_Engl_Y2_Letter" "Grade_Art_Y2_Letter"
## [31] "Grade_Sports_Y2_Letter" "Grade_Math_Y3_Letter" "Grade_Stats_Y3_Letter"
## [34] "Grade_Engl_Y3_Letter" "Grade_Art_Y3_Letter" "Grade_Sports_Y3_Letter"
## [37] "Grade_Math_Y4_Letter" "Grade_Stats_Y4_Letter" "Grade_Engl_Y4_Letter"
## [40] "Grade_Art_Y4_Letter" "Grade_Sports_Y4_Letter"
## or contains certain string (does not work with |)
PseudoData %>% select(contains("Math")) %>% colnames()
## [1] "Grade_Math_Y1" "Grade_Math_Y2" "Grade_Math_Y3"
## [4] "Grade_Math_Y4" "Grade_Math_Y1_Letter" "Grade_Math_Y2_Letter"
## [7] "Grade_Math_Y3_Letter" "Grade_Math_Y4_Letter"
## or match certain strings (works well with |)
PseudoData %>% select(matches("Math|Age")) %>% colnames()
## [1] "Age" "Grade_Math_Y1" "Grade_Math_Y2"
## [4] "Grade_Math_Y3" "Grade_Math_Y4" "Grade_Math_Y1_Letter"
## [7] "Grade_Math_Y2_Letter" "Grade_Math_Y3_Letter" "Grade_Math_Y4_Letter"
PseudoData %>% select(matches("Math.+Letter")) %>% colnames()
## [1] "Grade_Math_Y1_Letter" "Grade_Math_Y2_Letter" "Grade_Math_Y3_Letter"
## [4] "Grade_Math_Y4_Letter"
PseudoData %>% select(matches("Math|Stats"), matches("Art|Sports")) %>% colnames()
## [1] "Grade_Math_Y1" "Grade_Stats_Y1" "Grade_Math_Y2"
## [4] "Grade_Stats_Y2" "Grade_Math_Y3" "Grade_Stats_Y3"
## [7] "Grade_Math_Y4" "Grade_Stats_Y4" "Grade_Math_Y1_Letter"
## [10] "Grade_Stats_Y1_Letter" "Grade_Math_Y2_Letter" "Grade_Stats_Y2_Letter"
## [13] "Grade_Math_Y3_Letter" "Grade_Stats_Y3_Letter" "Grade_Math_Y4_Letter"
## [16] "Grade_Stats_Y4_Letter" "Grade_Art_Y1" "Grade_Sports_Y1"
## [19] "Grade_Art_Y2" "Grade_Sports_Y2" "Grade_Art_Y3"
## [22] "Grade_Sports_Y3" "Grade_Art_Y4" "Grade_Sports_Y4"
## [25] "Grade_Art_Y1_Letter" "Grade_Sports_Y1_Letter" "Grade_Art_Y2_Letter"
## [28] "Grade_Sports_Y2_Letter" "Grade_Art_Y3_Letter" "Grade_Sports_Y3_Letter"
## [31] "Grade_Art_Y4_Letter" "Grade_Sports_Y4_Letter"
#### Situation 3. Use select to reorder the columns
## say you want to have Age and Gender to be the first few columns
PseudoData %>% select(Age, Gender, everything()) %>% colnames()
## [1] "Age" "Gender" "StudentID"
## [4] "DoB" "Race" "Ethnicity"
## [7] "Grade_Math_Y1" "Grade_Stats_Y1" "Grade_Engl_Y1"
## [10] "Grade_Art_Y1" "Grade_Sports_Y1" "Grade_Math_Y2"
## [13] "Grade_Stats_Y2" "Grade_Engl_Y2" "Grade_Art_Y2"
## [16] "Grade_Sports_Y2" "Grade_Math_Y3" "Grade_Stats_Y3"
## [19] "Grade_Engl_Y3" "Grade_Art_Y3" "Grade_Sports_Y3"
## [22] "Grade_Math_Y4" "Grade_Stats_Y4" "Grade_Engl_Y4"
## [25] "Grade_Art_Y4" "Grade_Sports_Y4" "Grade_Math_Y1_Letter"
## [28] "Grade_Stats_Y1_Letter" "Grade_Engl_Y1_Letter" "Grade_Art_Y1_Letter"
## [31] "Grade_Sports_Y1_Letter" "Grade_Math_Y2_Letter" "Grade_Stats_Y2_Letter"
## [34] "Grade_Engl_Y2_Letter" "Grade_Art_Y2_Letter" "Grade_Sports_Y2_Letter"
## [37] "Grade_Math_Y3_Letter" "Grade_Stats_Y3_Letter" "Grade_Engl_Y3_Letter"
## [40] "Grade_Art_Y3_Letter" "Grade_Sports_Y3_Letter" "Grade_Math_Y4_Letter"
## [43] "Grade_Stats_Y4_Letter" "Grade_Engl_Y4_Letter" "Grade_Art_Y4_Letter"
## [46] "Grade_Sports_Y4_Letter" "Major" "Scholarship"
## [49] "Schol_Amount" "ProjectedEarning" "JobHuntDate"
## [52] "Employed" "EmployDate" "Day2Employ"
## [55] "Fav_Jane" "Longitude" "Latitude"
## [58] "State"
## or you want to order the columns alphabetically
PseudoData %>% select(sort(colnames(PseudoData))) %>% colnames()
## [1] "Age" "Day2Employ" "DoB"
## [4] "EmployDate" "Employed" "Ethnicity"
## [7] "Fav_Jane" "Gender" "Grade_Art_Y1"
## [10] "Grade_Art_Y1_Letter" "Grade_Art_Y2" "Grade_Art_Y2_Letter"
## [13] "Grade_Art_Y3" "Grade_Art_Y3_Letter" "Grade_Art_Y4"
## [16] "Grade_Art_Y4_Letter" "Grade_Engl_Y1" "Grade_Engl_Y1_Letter"
## [19] "Grade_Engl_Y2" "Grade_Engl_Y2_Letter" "Grade_Engl_Y3"
## [22] "Grade_Engl_Y3_Letter" "Grade_Engl_Y4" "Grade_Engl_Y4_Letter"
## [25] "Grade_Math_Y1" "Grade_Math_Y1_Letter" "Grade_Math_Y2"
## [28] "Grade_Math_Y2_Letter" "Grade_Math_Y3" "Grade_Math_Y3_Letter"
## [31] "Grade_Math_Y4" "Grade_Math_Y4_Letter" "Grade_Sports_Y1"
## [34] "Grade_Sports_Y1_Letter" "Grade_Sports_Y2" "Grade_Sports_Y2_Letter"
## [37] "Grade_Sports_Y3" "Grade_Sports_Y3_Letter" "Grade_Sports_Y4"
## [40] "Grade_Sports_Y4_Letter" "Grade_Stats_Y1" "Grade_Stats_Y1_Letter"
## [43] "Grade_Stats_Y2" "Grade_Stats_Y2_Letter" "Grade_Stats_Y3"
## [46] "Grade_Stats_Y3_Letter" "Grade_Stats_Y4" "Grade_Stats_Y4_Letter"
## [49] "JobHuntDate" "Latitude" "Longitude"
## [52] "Major" "ProjectedEarning" "Race"
## [55] "Schol_Amount" "Scholarship" "State"
## [58] "StudentID"
#### Situation 4. Clean the column names
## try also case = c("title", "snake", "all_caps“)
PseudoData %>% janitor::clean_names(case = "snake") %>% colnames()
## [1] "student_id" "do_b" "age"
## [4] "gender" "race" "ethnicity"
## [7] "grade_math_y1" "grade_stats_y1" "grade_engl_y1"
## [10] "grade_art_y1" "grade_sports_y1" "grade_math_y2"
## [13] "grade_stats_y2" "grade_engl_y2" "grade_art_y2"
## [16] "grade_sports_y2" "grade_math_y3" "grade_stats_y3"
## [19] "grade_engl_y3" "grade_art_y3" "grade_sports_y3"
## [22] "grade_math_y4" "grade_stats_y4" "grade_engl_y4"
## [25] "grade_art_y4" "grade_sports_y4" "grade_math_y1_letter"
## [28] "grade_stats_y1_letter" "grade_engl_y1_letter" "grade_art_y1_letter"
## [31] "grade_sports_y1_letter" "grade_math_y2_letter" "grade_stats_y2_letter"
## [34] "grade_engl_y2_letter" "grade_art_y2_letter" "grade_sports_y2_letter"
## [37] "grade_math_y3_letter" "grade_stats_y3_letter" "grade_engl_y3_letter"
## [40] "grade_art_y3_letter" "grade_sports_y3_letter" "grade_math_y4_letter"
## [43] "grade_stats_y4_letter" "grade_engl_y4_letter" "grade_art_y4_letter"
## [46] "grade_sports_y4_letter" "major" "scholarship"
## [49] "schol_amount" "projected_earning" "job_hunt_date"
## [52] "employed" "employ_date" "day2employ"
## [55] "fav_jane" "longitude" "latitude"
## [58] "state"
#### Situation 5. Conditional mutate
## mutate the column if the column meets certain format
PseudoData %>%
mutate_if(is.numeric, ~ ifelse(. %in% NA, 0,.)) %>%
pull(Grade_Math_Y1)
## [1] 56 65 46 69 84 81 62 75 56 90 66 93 62 42 72 71 69 82 76 64 72 59 85 54 55
## [26] 0 84 84 56 82 82 85 56 71 82 68 62 84 53 0 73 63 83 62 74 53 76 57 62 74
## [51] 60 66 72 58 75 68 71 61 69 68 75 84 56 61 65 68 86 70 64 71 72 85 62 70 74
## [76] 50 68 70 77 72 80 74 67 70 80 75 70 74 84 79 81 64 72 71 74 66 68 61 75 81
## mutate the column if the column name meets certain string
PseudoData %>%
mutate_at(vars(ends_with("Letter")), ~ifelse(. %in% NA, "U", .)) %>%
count(Grade_Math_Y1_Letter)
## # A tibble: 6 × 2
## Grade_Math_Y1_Letter n
## <chr> <int>
## 1 A 2
## 2 B 20
## 3 C 32
## 4 D 29
## 5 F 15
## 6 U 2
## mutate columns from a to b
PseudoData %>%
mutate_at(vars(Grade_Math_Y1_Letter : Grade_Sports_Y4_Letter), ~ifelse(. %in% NA, "U", .)) %>%
select(ends_with("Letter")) %>%
complete.cases()
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#### Situation 6. Transmute
## Only output the data that were mutated
PseudoData %>% transmute(test = Age) %>% colnames()
## [1] "test"
#### Situation 7. separate columns into multiple columns/rows based on separator
## into multiple columns
PseudoData %>%
select(StudentID, DoB) %>%
separate(DoB,
c("Y_DoB", "M_Dob", "D_DoB"),
sep="-",
extra = "merge",
fill = "right")
## # A tibble: 100 × 4
## StudentID Y_DoB M_Dob D_DoB
## <int> <chr> <chr> <chr>
## 1 1 1993 01 31
## 2 2 1991 01 04
## 3 3 1993 02 28
## 4 4 1990 11 28
## 5 5 1990 07 05
## 6 6 1994 02 04
## 7 7 1991 09 07
## 8 8 1992 07 26
## 9 9 1991 11 04
## 10 10 1992 10 10
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 1. slice rows
## show rows with lowest/highest n rows a certain variable
PseudoData %>% slice_min(Grade_Math_Y1, n = 5) %>% select(StudentID, Grade_Math_Y1)
## # A tibble: 5 × 2
## StudentID Grade_Math_Y1
## <int> <dbl>
## 1 14 42
## 2 3 46
## 3 76 50
## 4 39 53
## 5 46 53
PseudoData %>% slice_max(Grade_Math_Y1, n = 5) %>% select(StudentID, Grade_Math_Y1)
## # A tibble: 6 × 2
## StudentID Grade_Math_Y1
## <int> <dbl>
## 1 12 93
## 2 10 90
## 3 67 86
## 4 23 85
## 5 32 85
## 6 72 85
## random select rows
# use slice_sample
PseudoData %>% slice_sample(n = 5) %>% select(StudentID)
## # A tibble: 5 × 1
## StudentID
## <int>
## 1 10
## 2 99
## 3 52
## 4 93
## 5 79
# use sample_n, allows for replaceable
PseudoData %>% sample_n(size = 5, replace = F) %>% select(StudentID)
## # A tibble: 5 × 1
## StudentID
## <int>
## 1 75
## 2 32
## 3 29
## 4 73
## 5 45
## slice based on current row order (row number)
# first and last 10 rows
PseudoData %>% slice(c(1:10, seq(n()-9, n()))) %>% nrow()
## [1] 20
# first n rows
PseudoData %>% head(n = 5) %>% nrow()
## [1] 5
# last n rows
PseudoData %>% tail(n = 5) %>% nrow()
## [1] 5
#### Situation 2. quick row operations (all should work with group_by)
## add row number as identifier
PseudoData %>% mutate(row_id = row_number()) %>% pull(row_id)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100
## report the nth observation downwards from the current
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Lead = lead(Grade_Math_Y1, n = 1, default = NA))
## # A tibble: 100 × 2
## Grade_Math_Y1 Lead
## <dbl> <dbl>
## 1 56 65
## 2 65 46
## 3 46 69
## 4 69 84
## 5 84 81
## 6 81 62
## 7 62 75
## 8 75 56
## 9 56 90
## 10 90 66
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## report the nth observation upwards from the current
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Lag = lag(Grade_Math_Y1, n = 1, default = NA))
## # A tibble: 100 × 2
## Grade_Math_Y1 Lag
## <dbl> <dbl>
## 1 56 NA
## 2 65 56
## 3 46 65
## 4 69 46
## 5 84 69
## 6 81 84
## 7 62 81
## 8 75 62
## 9 56 75
## 10 90 56
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## report cumulative sum
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cumsum = cumsum(Grade_Math_Y1))
## # A tibble: 100 × 2
## Grade_Math_Y1 Cumsum
## <dbl> <dbl>
## 1 56 56
## 2 65 121
## 3 46 167
## 4 69 236
## 5 84 320
## 6 81 401
## 7 62 463
## 8 75 538
## 9 56 594
## 10 90 684
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
# can also be used to count occurrence
PseudoData %>% transmute(unit = 1) %>% mutate(occurance = cumsum(unit))
## # A tibble: 100 × 2
## unit occurance
## <dbl> <dbl>
## 1 1 1
## 2 1 2
## 3 1 3
## 4 1 4
## 5 1 5
## 6 1 6
## 7 1 7
## 8 1 8
## 9 1 9
## 10 1 10
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## report cumulative probability
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cumprob = cumprod(Grade_Math_Y1))
## # A tibble: 100 × 2
## Grade_Math_Y1 Cumprob
## <dbl> <dbl>
## 1 56 5.6 e 1
## 2 65 3.64e 3
## 3 46 1.67e 5
## 4 69 1.16e 7
## 5 84 9.70e 8
## 6 81 7.86e10
## 7 62 4.87e12
## 8 75 3.66e14
## 9 56 2.05e16
## 10 90 1.84e18
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## report cumulative minimum
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cummin = cummin(Grade_Math_Y1))
## # A tibble: 100 × 2
## Grade_Math_Y1 Cummin
## <dbl> <dbl>
## 1 56 56
## 2 65 56
## 3 46 46
## 4 69 46
## 5 84 46
## 6 81 46
## 7 62 46
## 8 75 46
## 9 56 46
## 10 90 46
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## report cumulative maximum
PseudoData %>% select(Grade_Math_Y1) %>% mutate(Cummax = cummax(Grade_Math_Y1))
## # A tibble: 100 × 2
## Grade_Math_Y1 Cummax
## <dbl> <dbl>
## 1 56 56
## 2 65 65
## 3 46 65
## 4 69 69
## 5 84 84
## 6 81 84
## 7 62 84
## 8 75 84
## 9 56 84
## 10 90 90
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 3. report the highest or lowest number across multiple columns for each row
PseudoData %>%
select(Grade_Math_Y1, Grade_Math_Y2, Grade_Math_Y3, Grade_Math_Y4) %>%
mutate(min = pmin(Grade_Math_Y1, Grade_Math_Y2, Grade_Math_Y3, Grade_Math_Y4),
max = pmax(Grade_Math_Y1, Grade_Math_Y2, Grade_Math_Y3, Grade_Math_Y4, na.rm = T)) %>%
filter(Grade_Math_Y1 %in% NA | Grade_Math_Y2 %in% NA)
## # A tibble: 3 × 6
## Grade_Math_Y1 Grade_Math_Y2 Grade_Math_Y3 Grade_Math_Y4 min max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA 83 82 88 NA 88
## 2 NA 86 92 98 NA 98
## 3 79 NA 79 90 NA 90
#### Situation 4. separate data into multiple rows based on one column
PseudoData %>% select(StudentID, DoB) %>% separate_rows(DoB, sep="-", convert = T)
## # A tibble: 300 × 2
## StudentID DoB
## <int> <int>
## 1 1 1993
## 2 1 1
## 3 1 NA
## 4 2 1991
## 5 2 1
## 6 2 NA
## 7 3 1993
## 8 3 2
## 9 3 NA
## 10 4 1990
## # … with 290 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 1. Report summarized data after grouping by certain variable
## summarize single variable
PseudoData %>%
group_by(Major) %>%
summarize(Med_Math = median(Grade_Math_Y1, na.rm = T),
Min_Math = min(Grade_Math_Y1, na.rm = T),
First_Math = first(Grade_Math_Y1),
Count = n(),
Uniq_Race = n_distinct(Race))
## # A tibble: 12 × 6
## Major Med_Math Min_Math First_Math Count Uniq_Race
## <chr> <dbl> <dbl> <dbl> <int> <int>
## 1 Biology 75.5 62 90 10 4
## 2 Business 72.5 69 93 8 4
## 3 Communications 68 54 72 8 4
## 4 Computer Science 74 42 42 9 5
## 5 English 69 57 82 8 4
## 6 History 73 56 71 5 3
## 7 Nursing 64 50 56 11 6
## 8 Public Health 72 53 69 6 5
## 9 Statistics 70 56 81 6 2
## 10 Sustainability 66.5 56 56 4 4
## 11 Unknown 69.5 46 65 12 6
## 12 <NA> 75 64 84 13 6
## summarize multiple selected variables using the same metrics
PseudoData %>%
group_by(Major) %>%
summarize_at(vars(Grade_Math_Y1, Grade_Math_Y2,
Grade_Math_Y3, Grade_Math_Y4), mean, na.rm=T)
## # A tibble: 12 × 5
## Major Grade_Math_Y1 Grade_Math_Y2 Grade_Math_Y3 Grade_Math_Y4
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Biology 74.8 78.6 80.2 86.3
## 2 Business 76.1 78.1 80.4 81.8
## 3 Communications 66 65.1 69.1 68.9
## 4 Computer Science 70.2 72.1 70 73.4
## 5 English 67.2 70.4 74.1 74.5
## 6 History 71.8 75.2 76.2 78.8
## 7 Nursing 63.2 67.7 73.7 74.4
## 8 Public Health 71 71.7 75.3 74.5
## 9 Statistics 70.7 71.5 71.8 69.5
## 10 Sustainability 68.2 72 73.8 75.2
## 11 Unknown 67.8 70.4 72.2 75.8
## 12 <NA> 73.5 77.5 79.7 81.9
## summarize multiple variables using different metrics
PseudoData %>%
select(Major, Age, Grade_Math_Y1: Grade_Math_Y4) %>%
group_by(Major) %>%
summarize(across(Grade_Math_Y1: Grade_Math_Y4,
list(missing = ~ mean(is.na(.)),
avg = ~ mean(., na.rm=T),
total = ~ sum(., na.rm=T),
ave_among_age25 = ~ mean(.[Age > 25], na.rm=T))))
## # A tibble: 12 × 65
## Major Grade…¹ Grade…² Grade…³ Grade…⁴ Grade…⁵ Grade…⁶ Grade…⁷ Grade…⁸ Grade…⁹
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Biol… 0 74.8 748 75.8 0.1 74.4 670 75.9 0
## 2 Busi… 0 76.1 609 77.1 0 77.9 623 78.1 0
## 3 Comm… 0 66 528 64.9 0 75.8 606 76.6 0
## 4 Comp… 0 70.2 632 75.6 0 76.9 692 77.7 0
## 5 Engl… 0 67.2 538 66.5 0 75.1 601 77.3 0
## 6 Hist… 0.2 71.8 287 72 0 81.6 408 81.5 0
## 7 Nurs… 0.0909 63.2 632 61.1 0 77.9 857 74.4 0
## 8 Publ… 0 71 426 68.2 0 72.2 433 74.8 0
## 9 Stat… 0 70.7 424 69 0 75.7 454 74.7 0
## 10 Sust… 0 68.2 273 63 0 79 316 79 0
## 11 Unkn… 0 67.8 814 66.5 0 69.7 836 69.5 0
## 12 <NA> 0 73.5 955 74.8 0 73.2 951 72.2 0
## # … with 55 more variables: Grade_Engl_Y1_avg <dbl>, Grade_Engl_Y1_total <dbl>,
## # Grade_Engl_Y1_ave_among_age25 <dbl>, Grade_Art_Y1_missing <dbl>,
## # Grade_Art_Y1_avg <dbl>, Grade_Art_Y1_total <dbl>,
## # Grade_Art_Y1_ave_among_age25 <dbl>, Grade_Sports_Y1_missing <dbl>,
## # Grade_Sports_Y1_avg <dbl>, Grade_Sports_Y1_total <dbl>,
## # Grade_Sports_Y1_ave_among_age25 <dbl>, Grade_Math_Y2_missing <dbl>,
## # Grade_Math_Y2_avg <dbl>, Grade_Math_Y2_total <dbl>, …
## # ℹ Use `colnames()` to see all variable names
#### Situation 2. Summarize over combination of multiple variables
## summarize
PseudoData %>%
group_by(Employed, Gender) %>%
summarize(Med_Math = median(Grade_Math_Y1, na.rm=T))
## # A tibble: 4 × 3
## # Groups: Employed [2]
## Employed Gender Med_Math
## <chr> <chr> <dbl>
## 1 No Female 68
## 2 No Male 70
## 3 Yes Female 75
## 4 Yes Male 74
#### Situation 3. Summarize data by two opponent categories
## this is helpful when trying to calculate the difference between the two opponent groups
PseudoData %>% select(Gender, Grade_Art_Y1) %>%
group_by(Gender) %>%
summarize(Mean_Art = mean(Grade_Art_Y1, na.rm = T)) %>%
mutate(Opposing_Mean_Art = rev(Mean_Art)) %>%
mutate(Diff_Mean_Art = Mean_Art - Opposing_Mean_Art)
## # A tibble: 2 × 4
## Gender Mean_Art Opposing_Mean_Art Diff_Mean_Art
## <chr> <dbl> <dbl> <dbl>
## 1 Female 73.5 74.4 -0.891
## 2 Male 74.4 73.5 0.891
#### Situation 1. Break the data frame or tibble into multiple sub-tibbles using nest
PseudoData_nest <- PseudoData %>% nest(-Major)
PseudoData_nest
## # A tibble: 12 × 2
## Major data
## <chr> <list>
## 1 Nursing <tibble [11 × 57]>
## 2 Unknown <tibble [12 × 57]>
## 3 Public Health <tibble [6 × 57]>
## 4 <NA> <tibble [13 × 57]>
## 5 Statistics <tibble [6 × 57]>
## 6 Biology <tibble [10 × 57]>
## 7 Business <tibble [8 × 57]>
## 8 Computer Science <tibble [9 × 57]>
## 9 Communications <tibble [8 × 57]>
## 10 History <tibble [5 × 57]>
## 11 English <tibble [8 × 57]>
## 12 Sustainability <tibble [4 × 57]>
#### Situation 2. Change the nested sub-tibbles to sub-lists
PseudoData_list <- as.list(PseudoData_nest)
PseudoData_list[[1]]
## [1] "Nursing" "Unknown" "Public Health" NA
## [5] "Statistics" "Biology" "Business" "Computer Science"
## [9] "Communications" "History" "English" "Sustainability"
#### Situation 3. Change the sub-list object back to sub-tibble
PseudoData_tibble <- as_tibble(PseudoData_list)
PseudoData_tibble
## # A tibble: 12 × 2
## Major data
## <chr> <list>
## 1 Nursing <tibble [11 × 57]>
## 2 Unknown <tibble [12 × 57]>
## 3 Public Health <tibble [6 × 57]>
## 4 <NA> <tibble [13 × 57]>
## 5 Statistics <tibble [6 × 57]>
## 6 Biology <tibble [10 × 57]>
## 7 Business <tibble [8 × 57]>
## 8 Computer Science <tibble [9 × 57]>
## 9 Communications <tibble [8 × 57]>
## 10 History <tibble [5 × 57]>
## 11 English <tibble [8 × 57]>
## 12 Sustainability <tibble [4 × 57]>
#### Situation 4. Extract data from sub-tibble to main work tibble
PseudoData %>% nest(-StudentID) %>% hoist(data, Sex = "Gender", Age = "Age")
## # A tibble: 100 × 4
## StudentID Sex Age data
## <int> <chr> <dbl> <list>
## 1 1 Female 26 <tibble [1 × 55]>
## 2 2 Female 28 <tibble [1 × 55]>
## 3 3 Female 26 <tibble [1 × 55]>
## 4 4 Male 29 <tibble [1 × 55]>
## 5 5 Male 29 <tibble [1 × 55]>
## 6 6 Female 25 <tibble [1 × 55]>
## 7 7 Female 28 <tibble [1 × 55]>
## 8 8 Female 27 <tibble [1 × 55]>
## 9 9 Female 28 <tibble [1 × 55]>
## 10 10 Female 27 <tibble [1 × 55]>
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Situation 5. Extract data from sub-tibbles to a wider format
PseudoData_nest %>% unnest_wider(data)
## # A tibble: 12 × 58
## Major Stude…¹ DoB Age Gender Race Ethni…² Grade…³ Grade…⁴ Grade…⁵
## <chr> <list<> <lis> <lis> <list> <lis> <list<> <list<> <list<> <list<>
## 1 Nursing [11] [11] [11] [11] [11] [11] [11] [11] [11]
## 2 Unknown [12] [12] [12] [12] [12] [12] [12] [12] [12]
## 3 Public Heal… [6] [6] [6] [6] [6] [6] [6] [6] [6]
## 4 <NA> [13] [13] [13] [13] [13] [13] [13] [13] [13]
## 5 Statistics [6] [6] [6] [6] [6] [6] [6] [6] [6]
## 6 Biology [10] [10] [10] [10] [10] [10] [10] [10] [10]
## 7 Business [8] [8] [8] [8] [8] [8] [8] [8] [8]
## 8 Computer Sc… [9] [9] [9] [9] [9] [9] [9] [9] [9]
## 9 Communicati… [8] [8] [8] [8] [8] [8] [8] [8] [8]
## 10 History [5] [5] [5] [5] [5] [5] [5] [5] [5]
## 11 English [8] [8] [8] [8] [8] [8] [8] [8] [8]
## 12 Sustainabil… [4] [4] [4] [4] [4] [4] [4] [4] [4]
## # … with 48 more variables: Grade_Art_Y1 <list<dbl>>,
## # Grade_Sports_Y1 <list<dbl>>, Grade_Math_Y2 <list<dbl>>,
## # Grade_Stats_Y2 <list<dbl>>, Grade_Engl_Y2 <list<dbl>>,
## # Grade_Art_Y2 <list<dbl>>, Grade_Sports_Y2 <list<dbl>>,
## # Grade_Math_Y3 <list<dbl>>, Grade_Stats_Y3 <list<dbl>>,
## # Grade_Engl_Y3 <list<dbl>>, Grade_Art_Y3 <list<dbl>>,
## # Grade_Sports_Y3 <list<dbl>>, Grade_Math_Y4 <list<dbl>>, …
## # ℹ Use `colnames()` to see all variable names
#### Situation 6. Extract data from sub-tibbles to a longer format
PseudoData_nest %>% unnest_longer(data)
## # A tibble: 100 × 2
## Major data$StudentID $DoB $Age $Gender $Race $Ethn…¹ $Grad…² $Grad…³
## <chr> <int> <date> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 Nursing 1 1993-01-31 26 Female Asian Non-Hi… 56 78
## 2 Nursing 7 1991-09-07 28 Female Asian Non-Hi… 62 70
## 3 Nursing 40 1991-07-16 28 Female Black Non-Hi… NA 75
## 4 Nursing 46 1991-08-30 28 Male Black Non-Hi… 53 77
## 5 Nursing 47 1993-03-20 26 Female White Non-Hi… 76 76
## 6 Nursing 51 1992-01-13 27 Female Unkn… Hispan… 60 77
## 7 Nursing 52 1994-03-30 25 Male Asian Hispan… 66 91
## 8 Nursing 70 1991-10-27 28 Female Asian Hispan… 71 83
## 9 Nursing 76 1993-08-19 26 Female White Hispan… 50 59
## 10 Nursing 77 1994-09-20 25 Female <NA> Hispan… 68 86
## # … with 90 more rows, 49 more variables: data$Grade_Engl_Y1 <dbl>,
## # $Grade_Art_Y1 <dbl>, $Grade_Sports_Y1 <dbl>, $Grade_Math_Y2 <dbl>,
## # $Grade_Stats_Y2 <dbl>, $Grade_Engl_Y2 <dbl>, $Grade_Art_Y2 <dbl>,
## # $Grade_Sports_Y2 <dbl>, $Grade_Math_Y3 <dbl>, $Grade_Stats_Y3 <dbl>,
## # $Grade_Engl_Y3 <dbl>, $Grade_Art_Y3 <dbl>, $Grade_Sports_Y3 <dbl>,
## # $Grade_Math_Y4 <dbl>, $Grade_Stats_Y4 <dbl>, $Grade_Engl_Y4 <dbl>,
## # $Grade_Art_Y4 <dbl>, $Grade_Sports_Y4 <dbl>, $Grade_Math_Y1_Letter <chr>, …
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
#### Situation 7. Nest by factor, then model within each category
## use logistic regression as an example
library(broom)
PseudoData %>%
nest(-Major) %>%
## run model within each category (Major) and output as a sub-tibble object
mutate(model = map(data,
~ glm(Employed %in% "Yes" ~ Grade_Math_Y1,
data=.,
family="binomial")),
## tidy the model output as a sub-tibble object
tidied = map(model, tidy)) %>%
## unnest the tidy model output
unnest(tidied) %>%
## report coefficient for specific term
filter(term == "Grade_Math_Y1") %>%
mutate(fdr = p.adjust(p.value, method="fdr"))
## # A tibble: 12 × 9
## Major data model term estimate std.e…¹ statistic p.value fdr
## <chr> <list> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Nursing <tibble> <glm> Grad… -2.18e-16 5.16e+3 -4.22e-20 1 1
## 2 Unknown <tibble> <glm> Grad… 7.45e+ 0 1.02e+4 7.27e- 4 0.999 1
## 3 Public Health <tibble> <glm> Grad… -9.44e- 2 9.37e-2 -1.01e+ 0 0.314 0.915
## 4 <NA> <tibble> <glm> Grad… 4.54e- 2 7.78e-2 5.84e- 1 0.559 1
## 5 Statistics <tibble> <glm> Grad… -4.12e-16 5.20e+3 -7.92e-20 1 1
## 6 Biology <tibble> <glm> Grad… 1.53e- 1 1.11e-1 1.38e+ 0 0.167 0.915
## 7 Business <tibble> <glm> Grad… -2.09e- 1 2.39e-1 -8.76e- 1 0.381 0.915
## 8 Computer Scie… <tibble> <glm> Grad… -8.01e- 2 6.78e-2 -1.18e+ 0 0.238 0.915
## 9 Communications <tibble> <glm> Grad… -2.43e- 2 1.10e-1 -2.22e- 1 0.825 1
## 10 History <tibble> <glm> Grad… 4.68e+ 0 1.05e+4 4.47e- 4 1.00 1
## 11 English <tibble> <glm> Grad… 4.49e- 2 1.01e-1 4.44e- 1 0.657 1
## 12 Sustainability <tibble> <glm> Grad… 1.20e- 1 1.33e-1 9.03e- 1 0.366 0.915
## # … with abbreviated variable name ¹std.error
## use linear regression as an example
PseudoData %>%
select(Major, Grade_Art_Y1, Grade_Math_Y1) %>%
nest(-Major) %>%
mutate(model = map(data,~ lm(Grade_Art_Y1 ~ Grade_Math_Y1, data=.)),
## glance report evaluation of each regression model, can also try tidy
glance = map(model, glance)) %>%
unnest(glance)
## # A tibble: 12 × 15
## Major data model r.squ…¹ adj.r…² sigma stati…³ p.value df logLik AIC
## <chr> <list> <lis> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Nurs… <tibble> <lm> 1.14e-3 -0.124 8.67 0.00912 0.926 1 -34.7 75.3
## 2 Unkn… <tibble> <lm> 8.88e-3 -0.0902 8.98 0.0896 0.771 1 -42.3 90.6
## 3 Publ… <tibble> <lm> 4.01e-2 -0.200 6.37 0.167 0.704 1 -18.4 42.8
## 4 <NA> <tibble> <lm> 4.75e-1 0.422 6.56 9.03 0.0132 1 -38.5 83.0
## 5 Stat… <tibble> <lm> 1.46e-1 -0.0675 5.40 0.684 0.455 1 -17.4 40.8
## 6 Biol… <tibble> <lm> 8.50e-4 -0.124 9.67 0.00680 0.936 1 -35.8 77.5
## 7 Busi… <tibble> <lm> 9.34e-3 -0.189 8.84 0.0472 0.837 1 -24.0 54.0
## 8 Comp… <tibble> <lm> 8.24e-2 -0.0487 6.18 0.629 0.454 1 -28.0 62.1
## 9 Comm… <tibble> <lm> 9.33e-2 -0.0880 4.96 0.515 0.505 1 -20.0 45.9
## 10 Hist… <tibble> <lm> 5.90e-1 0.385 4.83 2.88 0.232 1 -10.6 27.2
## 11 Engl… <tibble> <lm> 5.00e-2 -0.140 6.79 0.263 0.630 1 -22.2 50.3
## 12 Sust… <tibble> <lm> 4.37e-1 0.155 6.65 1.55 0.339 1 -11.9 29.7
## # … with 4 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>, and abbreviated variable names ¹r.squared, ²adj.r.squared,
## # ³statistic
## # ℹ Use `colnames()` to see all variable names
#### Situation 8. map over list of dataframes or tibbles
PseudoData %>%
nest(-Major) %>%
## single variable
mutate(nrow = unlist(map(data, ~ nrow(.x))),
ncol = unlist(map(data, ~ ncol(.x))),
dim = unlist(map2(Major, data, ~ paste0((.x), " data has a dimension of", nrow(.y), " * ", ncol(.y), "."))))
## # A tibble: 12 × 5
## Major data nrow ncol dim
## <chr> <list> <int> <int> <chr>
## 1 Nursing <tibble [11 × 57]> 11 57 Nursing data has a dimension…
## 2 Unknown <tibble [12 × 57]> 12 57 Unknown data has a dimension…
## 3 Public Health <tibble [6 × 57]> 6 57 Public Health data has a dim…
## 4 <NA> <tibble [13 × 57]> 13 57 NA data has a dimension of13…
## 5 Statistics <tibble [6 × 57]> 6 57 Statistics data has a dimens…
## 6 Biology <tibble [10 × 57]> 10 57 Biology data has a dimension…
## 7 Business <tibble [8 × 57]> 8 57 Business data has a dimensio…
## 8 Computer Science <tibble [9 × 57]> 9 57 Computer Science data has a …
## 9 Communications <tibble [8 × 57]> 8 57 Communications data has a di…
## 10 History <tibble [5 × 57]> 5 57 History data has a dimension…
## 11 English <tibble [8 × 57]> 8 57 English data has a dimension…
## 12 Sustainability <tibble [4 × 57]> 4 57 Sustainability data has a di…
PseudoData_Grade <- PseudoData[,str_detect(colnames(PseudoData), "ID|Letter")]
#### Method 1. Use melt() from data.table package
TempData_Long1 <- as.data.frame(
data.table::melt(data.table::setDT(PseudoData_Grade),
measure = patterns("Grade_Math_",
"Grade_Stats_",
"Grade_Engl_",
"Grade_Art_",
"Grade_Sports_"),
variable.name = 'Term',
value.name = c("Grade_Math",
"Grade_Stats",
"Grade_Engl",
"Grade_Art",
"Grade_Sports")))
#### Method 2. Use reshape() from base
## note, reshape only works with data frames
PseudoData_Grade_DF <- as.data.frame(PseudoData_Grade)
TempData_long2 <- reshape(PseudoData_Grade_DF,
varying=colnames(PseudoData_Grade)[-1],
idvar="StudentID", direction="long", sep="Y")
#### Method 3. Use pivot_longer from dplyr package
TempData_Long3 <- PseudoData_Grade %>%
select(StudentID,starts_with("Grade_Math")) %>%
pivot_longer(-StudentID, names_to = "Class", values_to = "Grade")
#### Method 4. Use gather
TempData_Grade_Gathered <- PseudoData_Grade %>%
gather(key = "Term", value = "Grade",
contains("Letter"))
#### Method 1. Use reshape() from base
## first, sort the data (often repeated data), use TempData_Long1 as an example, which is already sorted
head(TempData_Long1)
## StudentID Term Grade_Math Grade_Stats Grade_Engl Grade_Art Grade_Sports
## 1 1 1 F C B B C
## 2 2 1 D D A B C
## 3 3 1 F F B C A
## 4 4 1 D C C D D
## 5 5 1 B B B <NA> B
## 6 6 1 B B F D C
## second, for each unique ID, give an order variable, like occurrence
TempData_Long1$Order <- with(TempData_Long1, ave(StudentID, StudentID, FUN = seq_along))
## lastly, reshape it to wide
PseudoData_Wide1 <- reshape(TempData_Long1,
timevar = "Order",
idvar = "StudentID",
direction = "wide",
sep = "_")
#### Method 2. Use pivot_wider from dplyr
TempData_Wide2 <-
TempData_Long3 %>%
pivot_wider(names_from = "Class", values_from = "Grade",
values_fill = list(n = 0))
#### Method 3. Use spread
TempData_Wide3 <-
TempData_Grade_Gathered %>%
spread(key = "Term",
value = "Grade",
# fill in the missing data to a specific string of your choice (can also be NA)
fill = "Missing",
# automatically convert the data type
convert = T)
#### Situation 1. Cut based on specific cutpoints
PseudoData %>%
mutate(Grade_Math_Y1_level = cut(Grade_Math_Y1,
breaks = c(-Inf,70,80,90,Inf),
include.lowest = T,
right=T)) %>%
count(Grade_Math_Y1_level)
## # A tibble: 5 × 2
## Grade_Math_Y1_level n
## <fct> <int>
## 1 [-Inf,70] 49
## 2 (70,80] 29
## 3 (80,90] 19
## 4 (90, Inf] 1
## 5 <NA> 2
#### Situation 2. Cut based on proportion (quantile)
PseudoData %>%
mutate(Grade_Math_Y1_Tertile =
cut(Grade_Math_Y1,
breaks = quantile(Grade_Math_Y1,
probs = seq(0, 1, 1/3),
na.rm = T),
labels=c("T1","T2","T3"),
include.lowest = T, right=T)) %>%
count(Grade_Math_Y1_Tertile)
## # A tibble: 4 × 2
## Grade_Math_Y1_Tertile n
## <fct> <int>
## 1 T1 34
## 2 T2 33
## 3 T3 31
## 4 <NA> 2
#### Situation 1. reorder factor by frequency
## reorder by frequency
PseudoData %>%
mutate(Major = fct_infreq(Major)) %>%
count(Major) %>%
arrange(Major)
## # A tibble: 12 × 2
## Major n
## <fct> <int>
## 1 Unknown 12
## 2 Nursing 11
## 3 Biology 10
## 4 Computer Science 9
## 5 Business 8
## 6 Communications 8
## 7 English 8
## 8 Public Health 6
## 9 Statistics 6
## 10 History 5
## 11 Sustainability 4
## 12 <NA> 13
## reorder by frequency after count
PseudoData %>%
count(Major) %>%
mutate(Major = fct_reorder(Major, -n)) %>%
arrange(Major)
## # A tibble: 12 × 2
## Major n
## <fct> <int>
## 1 Unknown 12
## 2 Nursing 11
## 3 Biology 10
## 4 Computer Science 9
## 5 Business 8
## 6 Communications 8
## 7 English 8
## 8 Public Health 6
## 9 Statistics 6
## 10 History 5
## 11 Sustainability 4
## 12 <NA> 13
#### Situation 2. reorder by specific column
## order identifier
# decreasing order, try -Grade_Math_Y1
PseudoData %>%
mutate(StudentID = fct_reorder(factor(StudentID), Grade_Math_Y1)) %>%
arrange(StudentID) %>%
select(StudentID, Grade_Math_Y1)
## # A tibble: 100 × 2
## StudentID Grade_Math_Y1
## <fct> <dbl>
## 1 14 42
## 2 3 46
## 3 76 50
## 4 39 53
## 5 46 53
## 6 24 54
## 7 25 55
## 8 1 56
## 9 9 56
## 10 29 56
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## order categories (by median, also try mean, min, max, first, last, last2)
# decreasing order, try -Grade_Math_Y1
PseudoData %>%
mutate(Major = fct_reorder(factor(Major), Grade_Math_Y1, median, na.rm = T)) %>%
group_by(Major) %>%
summarise(med = median(Grade_Math_Y1, na.rm = T)) %>%
arrange(Major)
## # A tibble: 12 × 2
## Major med
## <fct> <dbl>
## 1 Nursing 64
## 2 Sustainability 66.5
## 3 Communications 68
## 4 English 69
## 5 Unknown 69.5
## 6 Statistics 70
## 7 Public Health 72
## 8 Business 72.5
## 9 History 73
## 10 Computer Science 74
## 11 Biology 75.5
## 12 <NA> 75
#### Situation 3. reverse factor order
PseudoData %>%
transmute(Major = fct_infreq(Major)) %>%
mutate(Major = fct_rev(Major)) %>%
count(Major)%>%
arrange(Major)
## # A tibble: 12 × 2
## Major n
## <fct> <int>
## 1 Sustainability 4
## 2 History 5
## 3 Statistics 6
## 4 Public Health 6
## 5 English 8
## 6 Communications 8
## 7 Business 8
## 8 Computer Science 9
## 9 Biology 10
## 10 Nursing 11
## 11 Unknown 12
## 12 <NA> 13
#### Situation 4. lump factor
## lump based on number of levels
# lump non-NA categories that are not the top 4 frequent
PseudoData %>%
mutate(Major = fct_lump(Major, n = 4)) %>%
count(Major)
## # A tibble: 6 × 2
## Major n
## <fct> <int>
## 1 Biology 10
## 2 Computer Science 9
## 3 Nursing 11
## 4 Unknown 12
## 5 Other 45
## 6 <NA> 13
## lump based on percentage
# lump non-NA categories that are less than or equal to 8%
PseudoData %>%
mutate(Major = fct_lump(Major, prop = 0.08)) %>%
count(Major)
## # A tibble: 6 × 2
## Major n
## <fct> <int>
## 1 Biology 10
## 2 Computer Science 9
## 3 Nursing 11
## 4 Unknown 12
## 5 Other 45
## 6 <NA> 13
## lump with specific rule
PseudoData %>%
select(Grade_Art_Y1_Letter) %>%
mutate(Grade_Art_Y1_Letter2 = fct_collapse(
Grade_Art_Y1_Letter,
"Grade A" = "A",
"Grade B" = "B",
"Grade C, D, F" = c("C","D","F"))) %>%
count(Grade_Art_Y1_Letter, Grade_Art_Y1_Letter2)
## # A tibble: 6 × 3
## Grade_Art_Y1_Letter Grade_Art_Y1_Letter2 n
## <chr> <fct> <int>
## 1 A Grade A 4
## 2 B Grade B 19
## 3 C Grade C, D, F 40
## 4 D Grade C, D, F 31
## 5 F Grade C, D, F 2
## 6 <NA> <NA> 4
#### Situation 5. relevel - specify reference or highest level
## specify reference
PseudoData %>%
transmute(Race = factor(Race)) %>%
mutate(Race = fct_relevel(Race, "Multiple")) %>%
count(Race) %>%
arrange(Race)
## # A tibble: 6 × 2
## Race n
## <fct> <int>
## 1 Multiple 15
## 2 Asian 19
## 3 Black 20
## 4 Unknown 11
## 5 White 13
## 6 <NA> 22
## specify first few levels
PseudoData %>%
transmute(Race = factor(Race)) %>%
mutate(Race = fct_relevel(Race, c("Multiple", "Unknown"))) %>%
count(Race) %>%
arrange(Race)
## # A tibble: 6 × 2
## Race n
## <fct> <int>
## 1 Multiple 15
## 2 Unknown 11
## 3 Asian 19
## 4 Black 20
## 5 White 13
## 6 <NA> 22
## specify highest level
PseudoData %>%
transmute(Race = factor(Race)) %>%
mutate(Race = fct_relevel(Race, "Multiple", after = Inf)) %>%
count(Race) %>%
arrange(Race)
## # A tibble: 6 × 2
## Race n
## <fct> <int>
## 1 Asian 19
## 2 Black 20
## 3 Unknown 11
## 4 White 13
## 5 Multiple 15
## 6 <NA> 22
#### Situation 6. rename specific level
PseudoData %>%
transmute(Race = fct_recode(Race, "Unknown_newname" = "Unknown")) %>%
count(Race)
## # A tibble: 6 × 2
## Race n
## <fct> <int>
## 1 Asian 19
## 2 Black 20
## 3 Multiple 15
## 4 Unknown_newname 11
## 5 White 13
## 6 <NA> 22
Sometimes continuous data like currency is not stored in a numeric format but often in a character format and requires quick cleaning to get it ready
#### Method 1. parse_number()
readr::parse_number("$1000,000")
## [1] 1e+06
readr::parse_number("$10,000,000")
## [1] 1e+07
readr::parse_number("1.050$")
## [1] 1.05
Many operations can be found in the stirngr package and this document.
#### . - anything
str_detect("Apple", ".")
## [1] TRUE
#### ^ - start of the string
str_detect("Apple", "^A")
## [1] TRUE
#### $ - end of the string
str_detect("Apple", "e$")
## [1] TRUE
#### \w - any letter (also works with +, any or more letter)
str_detect("Apple", "\\w")
## [1] TRUE
#### \d - any digit (also works with +, any or more digit)
str_detect("Apple", "\\d")
## [1] FALSE
#### \s - any white space
str_detect("Apple ", "\\s")
## [1] TRUE
#### [abcABC] - any abc, or capital ABC
str_detect("Apple", "[abcABC]")
## [1] TRUE
#### [a-zA-Z] - any lower or upper case English character
str_detect("5", "[a-zA-Z]")
## [1] FALSE
#### ab|ac - either ab or ac
str_detect("Apple", "ab|ac")
## [1] FALSE
#### App or ppp
str_detect("Apple", "(A|p)pp")
## [1] TRUE
#### one or more p
str_detect("Apple", "p+")
## [1] TRUE
#### zero or more p
str_detect("Apple", "p*")
## [1] TRUE
#### anything starting with p
str_extract("Apple", "p.*")
## [1] "pple"
#### anything ending with p
str_extract("Apple", ".*p")
## [1] "App"
# Try regexplain by Garrick Aiden-Buie
# install.packages("remotes")
# remotes::install_github("gadenbuie/regexplain")
# regexplain::regexplain_cheatsheet()
# regexplain::regexplain_gadget()
# regexplain::regexplain_web()
# PseudoData %>% select(Fav_Jane) %>% regexplain::regexplain_gadget()
#### Operation 1. case change
Hmisc:: capitalize(c("zip code", "state", "final count"))
## [1] "Zip code" "State" "Final count"
stringr:: str_to_title(c("zip code", "state", "final count"))
## [1] "Zip Code" "State" "Final Count"
tolower("Zip Code")
## [1] "zip code"
toupper("Zip Code")
## [1] "ZIP CODE"
#### Operation 2. detect
stringr::str_detect("Stringtest", "^String")
## [1] TRUE
#### Operation 3. remove
stringr::str_remove("Stringtest", "test")
## [1] "String"
str_remove(names(PseudoData), "\\d+")
## [1] "StudentID" "DoB" "Age"
## [4] "Gender" "Race" "Ethnicity"
## [7] "Grade_Math_Y" "Grade_Stats_Y" "Grade_Engl_Y"
## [10] "Grade_Art_Y" "Grade_Sports_Y" "Grade_Math_Y"
## [13] "Grade_Stats_Y" "Grade_Engl_Y" "Grade_Art_Y"
## [16] "Grade_Sports_Y" "Grade_Math_Y" "Grade_Stats_Y"
## [19] "Grade_Engl_Y" "Grade_Art_Y" "Grade_Sports_Y"
## [22] "Grade_Math_Y" "Grade_Stats_Y" "Grade_Engl_Y"
## [25] "Grade_Art_Y" "Grade_Sports_Y" "Grade_Math_Y_Letter"
## [28] "Grade_Stats_Y_Letter" "Grade_Engl_Y_Letter" "Grade_Art_Y_Letter"
## [31] "Grade_Sports_Y_Letter" "Grade_Math_Y_Letter" "Grade_Stats_Y_Letter"
## [34] "Grade_Engl_Y_Letter" "Grade_Art_Y_Letter" "Grade_Sports_Y_Letter"
## [37] "Grade_Math_Y_Letter" "Grade_Stats_Y_Letter" "Grade_Engl_Y_Letter"
## [40] "Grade_Art_Y_Letter" "Grade_Sports_Y_Letter" "Grade_Math_Y_Letter"
## [43] "Grade_Stats_Y_Letter" "Grade_Engl_Y_Letter" "Grade_Art_Y_Letter"
## [46] "Grade_Sports_Y_Letter" "Major" "Scholarship"
## [49] "Schol_Amount" "ProjectedEarning" "JobHuntDate"
## [52] "Employed" "EmployDate" "DayEmploy"
## [55] "Fav_Jane" "Longitude" "Latitude"
## [58] "State"
#### Operation 4. substitute
gsub("@gmail.com", "@hotmail","SomeEmail@gmail.com")
## [1] "SomeEmail@hotmail"
#### Operation 5. concatenation
## use str_c - works well with both string entries and/or string columns
PseudoData %>%
mutate(test = str_c("prefix",
PseudoData$Grade_Art_Y1_Letter,
PseudoData$Grade_Art_Y2_Letter,
"suffix",
sep = "-")) %>%
select(test)
## # A tibble: 100 × 1
## test
## <chr>
## 1 prefix-B-B-suffix
## 2 prefix-B-B-suffix
## 3 prefix-C-C-suffix
## 4 prefix-D-C-suffix
## 5 <NA>
## 6 prefix-D-D-suffix
## 7 prefix-B-B-suffix
## 8 prefix-D-C-suffix
## 9 prefix-D-D-suffix
## 10 prefix-D-F-suffix
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## use unite - only works with columns operations, does not work with string entries
PseudoData %>%
unite(united,
Grade_Art_Y1_Letter, Grade_Art_Y2_Letter,
sep="_") %>%
select(united)
## # A tibble: 100 × 1
## united
## <chr>
## 1 B_B
## 2 B_B
## 3 C_C
## 4 D_C
## 5 NA_F
## 6 D_D
## 7 B_B
## 8 D_C
## 9 D_D
## 10 D_F
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
## use glue - a flexible way of concatenate strings/texts
PseudoData %>%
# filter(!Grade_Art_Y1_Letter %in% NA, !Grade_Art_Y2_Letter %in% NA) %>%
mutate(Art = glue::glue("Year 1: { Grade_Art_Y1_Letter } Year 2: { Grade_Art_Y2_Letter } Year 3: { Grade_Art_Y3_Letter } Year 4: { Grade_Art_Y4_Letter }")) %>%
count(Art)
## # A tibble: 45 × 2
## Art n
## <glue> <int>
## 1 Year 1: A Year 2: A Year 3: A Year 4: A 3
## 2 Year 1: A Year 2: A Year 3: B Year 4: C 1
## 3 Year 1: B Year 2: A Year 3: A Year 4: A 2
## 4 Year 1: B Year 2: B Year 3: A Year 4: A 3
## 5 Year 1: B Year 2: B Year 3: A Year 4: B 1
## 6 Year 1: B Year 2: B Year 3: B Year 4: A 4
## 7 Year 1: B Year 2: B Year 3: B Year 4: B 2
## 8 Year 1: B Year 2: B Year 3: B Year 4: C 3
## 9 Year 1: B Year 2: B Year 3: C Year 4: B 1
## 10 Year 1: B Year 2: B Year 3: C Year 4: C 1
## # … with 35 more rows
## # ℹ Use `print(n = ...)` to see more rows
#### Operation 6. subset string
## based on locations
str_sub("This is a long sentence as an example.", 5, 20)
## [1] " is a long sente"
## based on pattern
# example 1 - lower vs upper cases
data.frame(x=c("aaaAaa","bbAbb")) %>%
extract(x,
c("lower","Upperlower"),
"^([a-z]+)([A-Z].*)")
## lower Upperlower
## 1 aaa Aaa
## 2 bb Abb
# example 2 - Text + number
data.frame(x=c("HPV5_,HPyV6_","HPV5_,HPyV9_", "HPV99_,HPyV100_")) %>%
extract(x,
c("HPV","HPyV"),
"HPV(\\d+)_,HPyV(\\d+)_")
## HPV HPyV
## 1 5 6
## 2 5 9
## 3 99 100
#### Operation 7. trim and add white space
str_trim(" xxx ", side="both")
## [1] "xxx"
str_pad("xxx", width = 20, side="both")
## [1] " xxx "
#### Operation 8. use snakecase to make sense of string
library(snakecase)
strings <- c("this Is a Strange_string", "AND THIS ANOTHER_One", NA)
to_snake_case(strings)
## [1] "this_is_a_strange_string" "and_this_another_one"
## [3] NA
to_lower_camel_case(strings)
## [1] "thisIsAStrangeString" "andThisAnotherOne" NA
to_upper_camel_case(strings)
## [1] "ThisIsAStrangeString" "AndThisAnotherOne" NA
to_screaming_snake_case(strings)
## [1] "THIS_IS_A_STRANGE_STRING" "AND_THIS_ANOTHER_ONE"
## [3] NA
to_lower_upper_case(strings)
## [1] "thisISaSTRANGEstring" "andTHISanotherONE" NA
to_upper_lower_case(strings)
## [1] "THISisAstrangeSTRING" "ANDthisANOTHERone" NA
to_parsed_case(strings)
## [1] "this_Is_a_Strange_string" "AND_THIS_ANOTHER_One"
## [3] NA
to_mixed_case(strings)
## [1] "this_Is_a_Strange_string" "And_This_Another_One"
## [3] NA
to_swap_case(strings)
## [1] "THIS iS A sTRANGE_STRING" "and this another_oNE"
## [3] NA
to_sentence_case(strings)
## [1] "This is a strange string" "And this another one"
## [3] NA
to_random_case(strings)
## [1] "thiS Is a sTRANge_sTRinG" "And THis AnOTheR_ONE"
## [3] "NA"
to_title_case(strings)
## [1] "This is a Strange String" "And this Another One"
## [3] NA
library(tuple)
#### Situation 1. Check if substring in both columns
## base version
# Data$Output <-
# ifelse(Data$A_Column %in% c(NA, "") | B_Column %in% c(NA, ""), NA,
# ifelse(unlist(lapply(lapply(lapply(strsplit(paste(sapply(str_extract_all(Data$A_Column,Specific_Strings),paste,collapse=","),
# sapply(str_extract_all(Data$B_Column,Specific_Strings),paste,collapse=","),sep=","),",",fixed=TRUE),
# function(x){x[!x==""]}), function(x){
# tuplicated(x,2)
# }),any)), 1, NA))
## function version
Concor_F <- function(Col_1,Col_2,Specific_Strings){
attach(data)
DupColumn <- gsub("NA","",paste(
sapply(str_extract_all(Col_1,Specific_Strings),paste,collapse=","),
sapply(str_extract_all(Col_2,Specific_Strings),paste,collapse=","), sep=","))
Split <-strsplit(DupColumn,",",fixed=TRUE)
NoneEmptySplit <- lapply(Split,function(x){x[!x==""]})
Any_Duplicated <- unlist(lapply(lapply(NoneEmptySplit, function(x){
tuplicated(x,2)
}),any))
Concor <- ifelse(Any_Duplicated %in% T, "Concordant","Non-concordant")
Concor <- factor(Concor, levels=c("Non-concordant", "Concordant"))
detach(data)
return(Concor)
}
#### Situtaion 2. Count duplicates across three columns
A_B_C <- function(A_Column, B_Column, C_Column,Specific_Strings){
#Specific_String could be a single name or multiple names separated by '|'
#e.g. "Apple|Banana|Orange"
DupColumn <- gsub("NA","",paste(
sapply(str_extract_all(A_Column,Specific_Strings),paste,collapse=","),
sapply(str_extract_all(B_Column,Specific_Strings),paste,collapse=","),
sapply(str_extract_all(C_Column,Specific_Strings),paste,collapse=","), sep=","))
Split <-strsplit(DupColumn,",",fixed=TRUE)
NoneEmptySplit <- lapply(Split,function(x){x[!x==""]})
Any_Duplicated <- unlist(lapply(lapply(NoneEmptySplit, function(x){
tuplicated(x,2)
}),any))
Any_Triplicated <- unlist(lapply(lapply(NoneEmptySplit, function(x){
tuplicated(x,3)
}),any))
ifelse(Any_Triplicated %in% T,3,
ifelse(Any_Duplicated %in% T,2,
ifelse(str_detect(A_Column,Specific_Strings)|
str_detect(B_Column,Specific_Strings)|
str_detect(C_Column,Specific_Strings),1,0)))
}
More details about tokenization, pairwise correlation, topic modeling, and other text analysis topics can be found in Text Mining with R- a Tidy Approach by Julia Silge and David Robinson.
Here we show a quick example of how to tokenize text data and run a quick pairwise correlation analysis.
library(tidytext)
library(widyr)
library(ggraph)
library(igraph)
#### Step 1. tokenization
TokenData <-
PseudoData %>% select(StudentID, Grade_Math_Y1, Fav_Jane) %>%
#Tokenize string column Fav_Jane
unnest_tokens(word, Fav_Jane) %>%
#Remove stop words by anti join the original word column with a stop words data
anti_join(stop_words, by = "word") %>%
#Manually remove Spanish stop words
#and keep words with at least 1 English character
filter(!(word %in% c("de", "en", "la", "para")),
str_detect(word, "[a-z]"))
#### Step 2. summarize one covariable by each word
Summa_Token <- TokenData %>%
group_by(word) %>%
#Calculate summarized data corresponding to each word as covariable
summarize(med_math = median(Grade_Math_Y1, na.rm = T),
occurences = n())
#### Step 3. calculate pairwise correlation based on how often they appear together
TopWordCor <- TokenData %>%
select(StudentID, word) %>%
#Analyze correlation between words, what words always appear together
widyr::pairwise_cor(word, StudentID, sort=T) %>%
head(100)
#### Step 4. create a vertices data for graphing word correlations
vertices <-
Summa_Token %>%
filter(word %in% TopWordCor$item1 |
word %in% TopWordCor$item2)
#### Step 5. plot correlation between words
TopWordCor %>%
graph_from_data_frame(vertices = vertices) %>%
ggraph() +
geom_edge_link() +
geom_node_point(aes(size = occurences, color = med_math)) +
geom_node_text(aes(label = name), repel = TRUE) +
scale_color_gradient2(low = "blue", high = "red", midpoint = 70,
labels = scales::comma_format())
#### Step 6. cast the tokenized data into a sparse data matrix for topic modeling
# TokenData %>% cast_sparse(StudentID, word)
#### Step 7. TF-IDF
## how often a word appears in one set as compared to whether the word appears
## in other documents.
tf_idf_example <-
PseudoData %>%
select(Gender, Fav_Jane) %>%
#Tokenize string column Fav_Jane
unnest_tokens(word, Fav_Jane) %>%
#Remove stop words by anti join the original word column with a stop words data
anti_join(stop_words, by = "word") %>%
count(Gender, word) %>%
bind_tf_idf(word, Gender, n) %>%
arrange(desc(tf_idf))
#### Step 8. log-odds
## how often a word appears in one set as compared to how often it appears in all
## other documents.
## The main difference between log-odds and TF-IDF is that the log-odds evaluate
## the frequency it appeared in other documents.
log_odds_example <-
PseudoData %>%
select(Gender, Fav_Jane) %>%
#Tokenize string column Fav_Jane
unnest_tokens(word, Fav_Jane) %>%
#Remove stop words by anti join the original word column with a stop words data
anti_join(stop_words, by = "word") %>%
count(Gender, word) %>%
## notice the order is different
tidylo::bind_log_odds(Gender, word, n) %>%
arrange(desc(log_odds_weighted))
It is crucial to use the most appropriate plot to present the data of interest. If unsure about what plot schema to choose, see this site for ideas.
ggplot requires a data object and a set of base aesthetics to construct the plot background through ggplot function. Then you can add one or more plotting schema as layers onto the background through geom function. Each layer of plotting schema can inherit aesthetics from the base layer, or its own set of aesthetics.
Each aesthetic of each layer can be scaled separately through scale function. Labels, limits and legends can be set in each of the scale function.
Theme of the figure is another big topic, while font, size, color, panel grid line, legend can be specified in theme function.
#### Situation 1. Usually used to plot association between two continuous variable
ggplot(data=PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1)) + geom_point()
#### Situation 2. Stratified scatter plot
Scatter_plot <-
ggplot(data=PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1, color = Gender)) +
geom_point() +
## often helpful to have a smooth line
## if not need to stratify the line by color, try adding aes(group=1)
geom_smooth(method = "loess", se = T)
Scatter_plot
#### Situation 3. Scatter plot with text labels
Scatter_plot + geom_text(aes(label = StudentID), check_overlap = TRUE, vjust = 1, hjust = 1)
#### Situation 4. Scatter plot with text boxes (geom_label_repel, geom_text_repel)
Scatter_plot+
ggrepel::geom_label_repel(
aes(x = Grade_Math_Y1, y = Grade_Art_Y1,
## give a condition, only show labels for those meet the condition
label = ifelse(Grade_Math_Y1 > 80, paste0(StudentID, ": Math > 80"), "")),
box.padding = 0.15, point.padding = 0.15, segment.color = "grey50")
#### Situation 1. Usually used with respect to date/time
Line_plot <-
PseudoData %>%
## created pseudo data to showcase confidence interval with line plot
mutate(any_low_bound = Grade_Art_Y1 - abs(round(rnorm(100, mean = 20, sd = 2))),
any_high_bound = Grade_Art_Y1 + abs(round(rnorm(100, mean=20, sd = 2)))) %>%
ggplot(aes(x = DoB , y = Grade_Art_Y1)) +
geom_line()
#### Situation 2. Line plot with confidence interval using geom_ribbon
Line_plot + geom_ribbon(aes(ymin = any_low_bound, ymax = any_high_bound), alpha = .2)
#### Situation 3. Path plot (sometimes used to track location change)
Path_plot <-
PseudoData %>%
## order the data in certain way
arrange(EmployDate) %>%
## plot location change
ggplot(aes(x = Longitude, y = Latitude)) +
geom_path()
Path_plot
#### Method 1. using geom_segment
## geom_segment allows to create a line between any two points of a ggplot
## which can also be used to plot residual
PseudoData %>%
group_by(Major) %>%
summarise(mean_stats = mean(Grade_Stats_Y1, na.rm = T),
mean_math = mean(Grade_Math_Y1, na.rm = T)) %>%
ggplot(aes(x = mean_stats, y = mean_math, color = Major)) +
geom_point() +
geom_segment(aes(xend = mean_stats, yend = mean_stats, x = mean_stats, y = mean_math)) +
geom_abline(slope = 1)
#### Situation 1. Usually used to plot distribution
Hist_base <- ggplot(data = PseudoData, aes(x = Grade_Math_Y1))
Hist_base + geom_histogram()
## this is the same as geom_histogram(aes(y = ..count..))
Hist_base +
geom_histogram(aes(y = ..count..),
## specify bin width
binwidth = 2,
## specify color of the outter box
colour = "red",
## specify color of the fill
fill = "blue")
#### Situation 2. Plot density instead of frequency
ggplot(data = PseudoData, aes(x = Grade_Math_Y1)) +
geom_histogram(aes(y = ..density..)) +
geom_density()
#### Situation 3. Plot stratified density
ggplot(data = PseudoData, aes(x = Grade_Math_Y1, y = Major)) +
## geom_density_ridges2 adds a solid black line under the grey area
ggridges::geom_density_ridges(scale = 4) +
ggridges::theme_ridges()
#### Situation 1. Used to plot distribution of categorical variable
PseudoData %>%
count(Grade_Math_Y1_Letter) %>%
ggplot(aes(x = Grade_Math_Y1_Letter, y = n)) +
geom_col()
#### Situation 2. Stratified bar plot
Bar_base <-
PseudoData %>%
count(Grade_Math_Y1_Letter, Gender) %>%
mutate(perc = n / sum(n) * 100) %>%
ggplot(aes(x = Grade_Math_Y1_Letter, y = n, fill = Gender))
## stacked bar
Bar_base + geom_col() +
geom_text(aes(label = paste0(perc, "%")),
position = position_stack(vjust=0.5))
## stacked bar that got connected
Bar_base + geom_col(position = "stack", width = 1)
## stacked bar with percentage insead of frequency
PseudoData %>%
count(Grade_Math_Y1_Letter, Gender) %>%
group_by(Grade_Math_Y1_Letter) %>%
mutate(pct = prop.table(n) * 100,
fct_n = sum(n)) %>%
ggplot(aes(x = Grade_Math_Y1_Letter, y = pct, fill = Gender)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(sprintf("%1.1f", pct), "%")),
position = position_stack(vjust = 0.5), size = 5) +
geom_text(aes(label = paste0("n=", formatC(fct_n, format = "d", big.mark = ",")),
x = Grade_Math_Y1_Letter, y = 0), size = 5, vjust = 1.5)
## side by side bar
Bar_base + geom_col(position = "dodge")
#### Situation 1. Used to compare distribution of continuous variable of multiple groups
Box_base <- ggplot(data = PseudoData, aes(x = Grade_Art_Y1_Letter, y = Grade_Math_Y1))
Box_base + geom_boxplot(aes(fill = Grade_Art_Y1_Letter))
#### Situation 2. Add more data to the box plot
## specify outlier color, shape and size
Box_base + geom_boxplot(outlier.colour = "red",
outlier.shape = 8,
outliersize=3)
## indicate mean as symbol
Box_base + geom_boxplot() + stat_summary(fun.y = mean, geom = "point", shape = 23, size = 4)
## add median as text
Box_base + geom_boxplot() + stat_summary(fun.y = median, geom = "text", vjust = -0.7,
aes(label = round(..y.., digits = 1)))
## add dots
Box_base + geom_boxplot() + geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = 0.5)
#### Situation 3. Double stratified box plot
Box_base + geom_boxplot(aes(fill = Gender,
group = interaction(Grade_Art_Y1_Letter, Gender)))
#### Method 1. Used to plot density of two continuous variable
ggplot(PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1, colour = Gender)) +
geom_point() +
stat_density2d(aes(alpha = stat(level) ^ 2, fill = Gender, geom = "polygon"))
#### Method 2. polygon with stat_chull
library(ggpubr)
ggplot(PseudoData, aes(x=Grade_Math_Y1, y=Grade_Art_Y1)) +
geom_point(aes(color = Grade_Math_Y1_Letter, shape = Grade_Math_Y1_Letter)) +
stat_chull(aes(color = Grade_Math_Y1_Letter, fill = Grade_Math_Y1_Letter),
alpha = 0.1,
geom = "polygon")
#### Method 3. ellipse plot
library(ggpubr)
PseudoData %>%
filter(!Grade_Math_Y2_Letter %in% NA) %>%
ggplot(., aes(x = Grade_Math_Y1, y = Grade_Art_Y1)) +
geom_point(aes(color = Grade_Math_Y2_Letter, shape = Grade_Math_Y2_Letter)) +
stat_conf_ellipse(aes(color = Grade_Math_Y2_Letter, fill = Grade_Math_Y2_Letter),
alpha = 0.1,
geom = "polygon",
na.rm = T) +
stat_mean(aes(color = Grade_Math_Y2_Letter, shape = Grade_Math_Y2_Letter),
size = 2,
na.rm = T)
#### Method 4. ellipse using ggscatter
ggscatter(PseudoData,
x = "Grade_Math_Y1", y = "Grade_Art_Y1",
color = "Grade_Math_Y1_Letter",
palette = "npg",
shape = "Grade_Math_Y1_Letter",
ellipse = TRUE,
mean.point = TRUE,
star.plot = TRUE,
ggtheme = theme_minimal())
#### Method 5. 2D bin plot (like heatmap)
## continuous vs. continuous
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +
geom_bin2d()
## continuous vs. factor
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1_Letter)) +
geom_bin2d()
## factor vs. factor
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1_Letter, y = Grade_Stats_Y1_Letter)) +
geom_bin2d()
#### Method 6. heatmap using geom_raster
PseudoData %>%
select(StudentID, starts_with("Grade")) %>%
select(StudentID, ends_with("Y1")) %>%
head(20) %>%
gather(., Term, Grade, -1) %>%
ggplot(., aes(x = Term, y = StudentID)) +
geom_raster(aes(fill = Grade)) +
scale_fill_gradient(low = "white", high = "#F8766D") +
theme_bw() +
theme(panel.grid = element_blank())
#### Method 7. geom_tile as an alternative
PseudoData %>%
count(Grade_Math_Y1_Letter, Grade_Stats_Y1_Letter) %>%
ggplot(aes(x = Grade_Math_Y1_Letter, y = Grade_Stats_Y1_Letter, fill = n)) +
geom_tile() +
geom_text(aes(label = n))
#### Method 8. geom_rect is the best option so far
temp_data <-
PseudoData %>%
head(20) %>%
select(StudentID, Grade_Math_Y1_Letter, Grade_Art_Y1_Letter, Grade_Math_Y1_Letter) %>%
pivot_longer(-StudentID,
names_to = "class",
values_to = "grade") %>%
mutate(StudentID_loc = as.numeric(StudentID),
class_loc = as.numeric(as.factor(class)))
temp_data %>%
ggplot() +
geom_rect(aes(xmin = class_loc, xmax = class_loc + 1, ymin = StudentID_loc, ymax = StudentID_loc + 1, fill = grade)) +
theme_bw() +
scale_x_continuous(breaks = seq(1.5, length(unique(temp_data$class)) + 0.5, 1), labels = c("art", "math"),
expand = c(0, 0)) +
scale_y_continuous(breaks = seq(1.5, length(unique(temp_data$StudentID)) + 0.5, 1),
labels = unique(temp_data$StudentID),
expand = c(0, 0)) +
theme(panel.grid.major = element_blank(),
panel.ontop = T,
legend.key = element_rect(color = "grey38"),
panel.background = element_rect(fill = "transparent"))
library(rayshader)
ggdiamonds = ggplot(diamonds) +
stat_density_2d(aes(x = x, y = depth, fill = stat(nlevel)),
geom = "polygon", n = 200, bins = 50,contour = TRUE) +
facet_wrap(clarity~.) +
scale_fill_viridis_c(option = "A")
par(mfrow = c(1, 2))
plot_gg(ggdiamonds, width = 5, height = 5, raytrace = FALSE, preview = TRUE)
plot_gg(ggdiamonds, width = 5, height = 5, multicore = TRUE, scale = 250,
zoom = 0.7, theta = 10, phi = 30, windowsize = c(800, 800))
# Sys.sleep(0.2)
render_snapshot(clear = TRUE)
#### Situation 1. Used to plot confidence interval of all kinds (forest plot)
## like estimate (95% CI) of regression, or incidence rate
PseudoData %>%
select(StudentID, starts_with("Grade_Math_Y")) %>% select(1:5) %>%
gather(., Term, Grade, -1) %>%
group_by(Term) %>%
summarize(Med = median(Grade, na.rm = T),
P25th = quantile(Grade, 0.25, na.rm = T),
P75th = quantile(Grade, 0.75, na.rm = T)) %>%
ggplot(aes(x = Med, y = Term, color = Term))+
geom_point()+
geom_errorbarh(aes(xmin = P25th, xmax = P75th), height = 0.2, cex = 1)
#### Situation 2. lollipop plot
## Combining a horizontal error bar and dot plot can create a lollipop plot
## can be used to present two or even three different continuous measures by each observations
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = StudentID)) +
geom_errorbarh(aes(xmin = 0, xmax = Grade_Math_Y1), height = 0) +
geom_point(aes(size = Age, color = Gender))
library(gggibbous)
## see an example here: https://cran.r-project.org/web/packages/gggibbous/vignettes/gggibbous.html#:~:text=gggibbous%20and%20its%20usage,require%20any%20special%20coordinate%20system.
## Example 1: point
tidymoons <- data.frame(
x = rep(1:3, 6),
y = rep(rep(3:1, each = 3), 2),
ratio = c(1:9 / 10, 9:1 / 10),
right = rep(c(TRUE, FALSE), each = 9)
)
ggplot(tidymoons, aes(x, y, ratio = ratio, right = right, size = 2^x)) +
geom_moon(data = subset(tidymoons, right), fill = "violetred") +
geom_moon(
data = subset(tidymoons, !right), fill = "turquoise3",
key_glyph = draw_key_moon_left
) +
lims(x = c(0.5, 3.5), y = c(0.5, 3.5)) +
scale_size("size", range = c(5, 10), breaks = 2^(1:3))
## Example 2 map plot
dmeladh_adj <- dmeladh
dmeladh_adj$long <- dmeladh$Longitude + c(
-2, 0, -2, 2, -3, 3, 3, 2, 3, 4, -2.5, -2.5, -1, -2, -2.5, -4, 2.5,
5, 6, 7, 2, -7, -5.5, -3, 0, -7, -2, 3, 5.5, 0.5, -1, -1.5, -3, 2)
dmeladh_adj$lat <- dmeladh$Latitude + c(
-2, 2, 0, 1, 0, 0, 0, 2, 0.5, -1, 1, -1.5, 2, 4, 1.5, 0, 2,
1, -1, -3, -2, 1, -1, -2, -3, -2, -4, -3, -1, 1.5, 2, 2, -2, 0)
moonmap <- ggplot(dmeladh_adj, aes(long, lat)) +
geom_polygon(
data = map_data(
"world", region = "(Australia)|(Indonesia)|(Papua New Guinea)"),
aes(group = group),
fill = "gray80"
) +
geom_segment(aes(xend = Longitude, yend = Latitude), color = "gray20") +
geom_point(aes(Longitude, Latitude), size = 0.75, color = "gray20") +
scale_size(range = c(4, 10)) +
coord_map(xlim = c(110, 160), ylim = c(-45, -5)) +
theme_void() +
theme(
legend.position = c(0.05, 0.05),
legend.direction = "horizontal",
legend.justification = c(0, 0)
)
moonmap +
geom_moon(
aes(ratio = AdhS / 100, size = N),
right = FALSE, fill = "gold", color = "gold",
key_glyph = draw_key_moon_left
) +
geom_moon(
aes(ratio = AdhF / 100, size = N),
fill = "forestgreen", color = "forestgreen"
)
#### Method 1. plot dot on map
PseudoData %>%
ggplot(aes(x = Longitude, y = Latitude, size = Grade_Math_Y1, color = Age)) +
#Use state border in map plot, try "world", or adding region = "florida"
borders("state") +
geom_point() +
coord_map() +
#Can also use ggthemes::theme_map
theme_void()
#### Method 2. plot region on map
PseudoData %>%
select(State, Grade_Stats_Y1) %>%
group_by(State) %>%
summarise(mean = mean(Grade_Stats_Y1, na.rm = T)) %>%
#Merge with map data to obtain state data
right_join(map_data("state"), by = c("State" = "region")) %>%
ggplot(aes(x = long, y = lat, group = group, fill = mean)) +
geom_polygon()+
coord_map()+
ggthemes::theme_map()
# use map_data("world") for word data
#### Method 3. interactive map with leaflet package
## Interactive map with leaflet package - see tidytuesday script:: "volcano eruption" for detailed walk through
library(leaflet)
library(glue)
library(htmlwidgets)
library(DT)
PseudoData %>%
select(StudentID,
Gender, Age, Grade_Stats_Y1, Grade_Stats_Y1_Letter,
Latitude, Longitude) %>%
# translate a continuous variable to a color variable
mutate(age_col = colorNumeric(c("blue", "red"), domain = Age)(Age)) %>%
gather(key, value,
StudentID, Gender, Age, Grade_Stats_Y1, Grade_Stats_Y1_Letter) %>%
# optional:: replace na to unknown
replace_na(list(value = "Unknown")) %>%
mutate(key = str_to_title(str_replace_all(key, "_", " ")),
# bold the key variable in the plot pop-up
key = paste0("<b>", key, "</b>")) %>%
# nest all descriptive data into a small tibble
nest(data = c(key, value)) %>%
# make a html table from the nested descriptive tibble
mutate(html = map(data,
knitr::kable,
format = "html",
escape = F,
# remove the column name of the html
col.names = c("", ""))) %>%
# make a leaflet object
leaflet() %>%
# add graphic layer
addTiles() %>%
# add circle marker (points on the map); also try addMarkers()
addCircleMarkers(lat = ~ Latitude,
lng = ~ Longitude,
col = ~ age_col,
# use the html object as popup object
popup = ~ html,
# specify the radius of the circle
radius = 1) %>%
addMeasure()
#### Method 4. Stamen map
library(ggmap)
## get map using longitude and latitude
tampa_borders <- c(bottom = 27.964157 - 0.2,
top = 27.964157 + 0.2,
left = -82.452606 - 0.2,
right = -82.452606 + 0.2)
## create pseudo data within the map range
set.seed(20120116)
tampa_pseudo_data <-
tbl_df(cbind(lon = runif(100, min = tampa_borders["left"], max = tampa_borders["right"]),
lat = runif(100, min = tampa_borders["bottom"], max = tampa_borders["top"]),
random_grade = PseudoData$Grade_Math_Y1))
## obtain map
map <- get_stamenmap(tampa_borders, zoom = 11, maptype = "watercolor")
## plot on map
ggmap(map) +
geom_point(data = tampa_pseudo_data,
mapping = aes(x = lon, y = lat,
col = random_grade, size = random_grade)) +
scale_color_distiller(palette = "YlOrRd", direction = 1)
#### Method 5. 3D map
## example from https://www.rayshader.com/
library(rayshader)
#Here, I load a map with the raster package.
loadzip = tempfile()
download.file("https://tylermw.com/data/dem_01.tif.zip", loadzip)
localtif = raster::raster(unzip(loadzip, "dem_01.tif"))
unlink(loadzip)
#And convert it to a matrix:
elmat = raster_to_matrix(localtif)
elmat %>%
sphere_shade(texture = "desert") %>%
add_water(detect_water(elmat), color = "lightblue") %>%
add_shadow(cloud_shade(elmat, zscale = 10, start_altitude = 500, end_altitude = 1000,), 0) %>%
plot_3d(elmat, zscale = 10, fov = 0, theta = 135, zoom = 0.75, phi = 45, windowsize = c(1000, 800),
background="darkred")
# render_camera(theta = 20, phi=40,zoom= 0.64, fov= 56 ) # this allows interactive rotation
render_snapshot(theta = 20, phi=40,zoom= 0.64, fov= 56, clear = TRUE)
library(survival)
library(survminer)
#### Situation 1. used in survival/time-to-event analysis
Sample_Fit <- survfit(Surv(Day2Employ, Employed == "Yes") ~ Grade_Stats_Y1_Letter,
data=PseudoData)
tb <- table(PseudoData$Grade_Stats_Y1_Letter)
ggsurvplot(Sample_Fit,
data = PseudoData,
#event = cumulative event, cumhaz = cumulative hazard, pct = survival probability
fun="event",
#Scale time by year
break.time.by = 365.25, xscale = "d_y",
#Calculate log-rank p-value
pval = F,
#Output risk table
risk.table = F,
#Show censor point
censor=F,
#Show confidence interval
conf.int = F,
title="Time to employment by first year stats grade",
legend="bottom",
legend.title="",
legend.labs=c(paste0("A n=",tb[1]),
paste0("B n=",tb[2]),
paste0("C n=",tb[3]),
paste0("D n=",tb[4]),
paste0("E n=",tb[5])),
xlab="Years",
ylab="Cumulative probability of employment",
ylim=c(0,0.5),
ggtheme = theme_bw(),
size=1.3)
#### Situation 2. Kaplan-Meier plot with risk table
Surv_plot_example <- ggsurvplot(Sample_Fit,
data = PseudoData,
fun="event",
break.time.by = 60, xscale = "d_y",
pval = F,
## show risk table
risk.table = T,
## risk table font size
risk.table.fontsize = 6,
## risk table theme
tables.theme = theme_cleantable(),
## risk table height
risk.table.height = 0.5,
censor=F,
conf.int = F,
title="Time to employment by first year stats grade",
legend="bottom",
legend.title="",
legend.labs=c(paste0("A n=",tb[1]),
paste0("B n=",tb[2]),
paste0("C n=",tb[3]),
paste0("D n=",tb[4]),
paste0("E n=",tb[5])),
xlab="Years",
ylab="Cumulative probability of employment",
ylim=c(0,0.5),
ggtheme = theme_bw(),
size=1.3)
Surv_plot_example
## additional edits on the table can be done by calling the table object from the plot
Surv_plot_example$table <-
Surv_plot_example$table +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())
#### Situation 3. stratified KM plot
ggsurvplot_facet(Sample_Fit,
data = PseudoData,
fun="event",
facet.by = "Gender")
#### Situation 1. used to plot HR and 95% confidence interval from Cox regression model
library(survival)
library(survminer)
model <- coxph( Surv(time, status) ~ sex + rx + adhere,
data = colon )
ggforest(model)
library(ggVennDiagram)
Venn_data <- list('Group A'= as.character(sample(c(1: 1000), 300)),
"Group B"= as.character(sample(c(1: 1000), 450)))
ggVennDiagram(Venn_data,
category.names = c(" ", " "),
label = "both")+
scale_fill_gradient(low = "white", high = "cornflowerblue")+
theme_void()+
annotate("text",x = -2, y = 4, label = "Customer label A", size = 8) +
annotate("text",x = 6, y = 4, label = "Customer label B", size = 8)
This site contains a bit more details, but you’ll have to rely on R doc for actual examples.
library(VennDetail)
A <- sample(1:1000, 400, replace = FALSE)
B <- sample(1:1000, 600, replace = FALSE)
C <- sample(1:1000, 350, replace = FALSE)
D <- sample(1:1000, 550, replace = FALSE)
res <- venndetail(list(A = A, B = B, C = C, D = D))
# result <- result(res)
# head(result)
# getSet(res, "A") # get unique elements in A
summary(res) #show overlap 'details' of all subsets
## Input groups are: A B C D
## Total unique elements are: 921
## Total subsets are: 15
## ====== Subsets detail: ======
## Shared B_C_D A_C_D C_D A_B_D B_D A_D D A_B_C B_C A_C
## 52 69 31 43 86 130 62 77 26 66 31
## C A_B B A
## 32 67 104 45
do <- dplot(res) #make a bargraph for 'details'
pie <- vennpie(res) # make a pie chart
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the dplyr package.
## Please report the issue at <https://github.com/tidyverse/dplyr/issues>.
## Warning: `filter_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `filter()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the dplyr package.
## Please report the issue at <https://github.com/tidyverse/dplyr/issues>.
ggiraph::girafe(ggobj = cowplot::plot_grid(do, pie), width_svg = 8, height_svg = 4)
Chord diagram could be helpful in visualizing concordance between two measures of the same multi-level categorical variable. See this site for more details.
library(chorddiag)
# Create dummy data
m <- matrix(c(11975, 5871, 8916, 2868,
1951, 10048, 2060, 6171,
8010, 16145, 8090, 8045,
1013, 990, 940, 6907),
byrow = TRUE,
nrow = 4, ncol = 4)
# A vector of 4 colors for 4 groups
haircolors <- c("black", "blonde", "brown", "red")
dimnames(m) <- list(have = haircolors,
prefer = haircolors)
groupColors <- c("#000000", "#FFDD89", "#957244", "#F26223")
# Build the chord diagram:
p <- chorddiag(m, groupColors = groupColors, groupnamePadding = 20)
p
Faceting is particularly helpful when trying to stratify the data in someway.
#### Situation 1. facet by one stratifying variable
## add labels of the facets
Facet_Label <- c("Cus Label F", "Cus Label M")
names(Facet_Label) <- c("Female", "Male")
## use scatter plot as an example
PseudoData %>% ggplot(aes(x=Grade_Math_Y1,y=Grade_Stats_Y1)) +
geom_point() +
facet_wrap(~Gender, nrow=1,
labeller = labeller(Gender = Facet_Label))
#### Situation 2. facet plot allows for different order within each facet
## use bar plot as an example (also works when facetting box plot and other continuous vs. categorical plot)
PseudoData %>%
count(Race, Gender) %>%
## order the race by frequency, separately within each gender category
mutate(Race = tidytext::reorder_within(Race, n, Gender)) %>%
ggplot(aes(Race, n, fill = Gender)) +
geom_col(show.legend = F)+
facet_wrap(~Gender, scales = "free_y")+
tidytext::scale_x_reordered()+
coord_flip()
#### Situation 3. facet_wrap by two stratifying variable
##add labels of the facets
facet_labeller_left <- function(variable, value) {
c("A1", "", "B1", "", "M1", "",
"U1", "", "W1", "", "NA1", "")
}
facet_labeller_top <- function(variable, value) {
c("female", "male","", "","", "","", "", "", "","", "")
}
PseudoData %>% ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +
geom_point() +
## facet by race (in rows) and gender (in columns),
## note, this will automatically remove empty combinations
facet_wrap(Race ~ Gender,
## allow the x scale to be free, also try "free_y", and "free"
scales = "free_x",
ncol = 2,
labeller = labeller(Race = as_labeller(facet_labeller_left),
Gender = as_labeller(facet_labeller_top)))
#### Situation 4. facet_grid by two stratifying variable
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +
geom_point() +
## facet by gender (in rows) and race (in columns),
## note, this will NOT automatically remove empty combinations
facet_grid(rows = vars(Gender), cols = vars(Race))
#### Situation 5. geofacet - facet by locations
library(geofacet)
## facet by US state as an example
PseudoData %>%
mutate(State = snakecase::to_title_case(State)) %>%
ggplot(aes(Grade_Math_Y1, Grade_Stats_Y1)) +
geom_point() +
theme_bw() +
facet_geo(~ State, grid = "us_state_grid1", label = "name")
## One can choose to facet by any of the 141 pre-set grids
get_grid_names() %>% head(n = 10)
## [1] "us_state_grid1" "us_state_grid2"
## [3] "eu_grid1" "aus_grid1"
## [5] "sa_prov_grid1" "gb_london_boroughs_grid"
## [7] "nhs_scot_grid" "india_grid1"
## [9] "india_grid2" "argentina_grid1"
## Or, to create your own geo grid by design the grid data.
library(scales)
#### Situation 1. log scale x or y axis
PseudoData %>%
filter(! Schol_Amount %in% NA) %>%
ggplot(aes(x = Schol_Amount, y = Schol_Amount)) +
geom_point() +
## log10 is a built-in scale
scale_x_log10() +
## other log transformation can be done like the following
scale_y_continuous(trans = log2_trans(),
breaks = trans_breaks("log2", function(x) 2^x),
labels = trans_format("log2", math_format(2^.x)))
#### Situation 2. specify limit, breaks and name
PseudoData %>%
filter(! Schol_Amount %in% NA) %>%
ggplot(aes(x = Schol_Amount, y = Schol_Amount)) +
geom_point() +
## manually specify name, limit, breaks and labels of each tick
scale_x_continuous(name = "amount of scholarship",
limit = c(0, 20000),
labels = c("lower end", "somehwere in the middle", "upper end"),
breaks = c(0, 10000, 20000)) +
## put breaks as a function of the limit
scale_y_continuous(name = "same as x-axis",
breaks = function(lim) (seq(floor(lim[1]),
ceiling(lim[2]),
5000))) +
## one quick way to expand the limits is by adding expand_limits
expand_limits(y = 0)
#### Situation 3. scale size, color, or fill
Scatter_base <-
PseudoData %>%
filter(! Schol_Amount %in% NA) %>%
ggplot(aes(x = Schol_Amount,
y = Schol_Amount,
size = Schol_Amount,
color = Schol_Amount)) +
geom_point()
## size: specify range of the size
Scatter_base + scale_size_continuous(range = c(5, 10))
## color/fill: specify range of the color using scale_color_gradient
Scatter_base + scale_color_gradient(low = "yellow", high = "red", na.value = NA)
## color/fill: specify range of color and midpoint using scale_color_gradient2
Scatter_base + scale_color_gradient2(low = "blue", high = "red", midpoint = 75, na.value = NA)
## color/fill: can also work with transformed scale
Scatter_base + scale_color_continuous(trans = "log10")
#### Situation 4. reverse order of discrete scales
Temp_data <-
PseudoData %>%
count(Major) %>%
mutate(Major = fct_reorder(Major, n))
Temp_data %>%
ggplot(aes(x = n, y = Major, fill = n)) +
geom_col()
Temp_data %>%
ggplot(aes(x = n, y = Major, fill = n)) +
geom_col() +
scale_y_discrete(limits = rev(levels(Temp_data$Major)))
#### Situation 5. change x or y axis label format
#### these labels should work with any type of scaling function
## comma format
PseudoData %>%
filter(! Schol_Amount %in% NA) %>%
ggplot(aes(x = Schol_Amount)) +
geom_histogram() +
scale_x_continuous(labels = comma_format()) # labels = comma works the same
## dollar format
PseudoData %>%
filter(! Schol_Amount %in% NA) %>%
ggplot(aes(x = Schol_Amount)) +
geom_histogram() +
scale_x_continuous(labels = dollar_format())
## percent format
PseudoData %>%
count(Major) %>%
mutate(p = n/sum(n)) %>%
ggplot(aes(x = p, y = Major)) +
geom_col() +
scale_x_continuous(labels = percent_format()) # labels = percent works the same
Temp_data <-
PseudoData %>%
count(Major) %>%
mutate(Major = fct_reorder(Major, n))
#### Situation 1. Labels
## Axis label or legend label can be edited in scale function as described above
## In addition, these labels as well as title, subtitle and caption can be added by labs function
Temp_plot <-
Temp_data %>%
ggplot(aes(x = n, y = Major, fill = Major)) +
geom_col() +
labs(title = "title",
subtitle = "subtitle",
x = "x axis",
y = "y axis",
caption = "caption",
fill = "fill",
color = "color")
Temp_plot
#### Situation 2. legends
## position
Temp_plot + theme(legend.position = c(1,1))
Temp_plot + theme(legend.position = "bottom")
Temp_plot + theme(legend.justification = c(1,1))
## override legend
# you can override the aes in the legend to show fixed alpha, or dot size
# this is particularly useful when you have alpha and size in the aes, but you want to show the original color
Temp_plot +
guides(fill = guide_legend(override.aes = list(alpha = 0.5, size = 1)))
## other specification of legends
# break legends into multiple rows
Temp_plot + guides(fill = guide_legend(nrow = 3, byrow = F))
# reverse the order of the legend
Temp_plot + guides(fill = guide_legend(reverse = T))
# specify the size of the legend example
Temp_plot + guides(fill = guide_legend(keywidth = 5))
## hide legend
# hide legend in geom function
Temp_data %>% ggplot(aes(x = n, y = Major, fill = Major)) + geom_col(show.legend = F)
# hide legend in scale function
Temp_plot + scale_fill_discrete(guide = F)
# hide legend in theme function
Temp_plot + theme(legend.position = "none")
# hide legend in guides function
Temp_plot + guides(fill = "none")
#### Situation 1. Cropping plot
Temp_plot + coord_cartesian(xlim = c(0, 8))
#### Situation 2. Specify the coordinate and aspect ratio
## fixed equal aspect radio by coord_equal
Temp_plot + coord_equal()
## specify aspect ratio in theme with aspect.ratio
Temp_plot + theme(aspect.ratio = 1)
themes function allows you customize almost every aspect of the plot. No example will be shown in this section.
#### Situation 1. Some neat built-in themes
theme_bw()
theme_void()
theme_light()
theme_classic()
#### Situation 2. Set universal theme of the work space
theme_set(
## use a built-in theme as the base of your custom theme
theme_light() +
## add in other specifications of your custom theme
theme(...))
#### Situation 3. Some examples of custom specification
theme(plot.title = element_text(size=20,hjust=0.5),
axis.title = element_text(size=15,face="bold"),
axis.text.x = element_text(face='bold',size=15,angle=30),
axis.text.y = element_text(face='bold',size=15),
plot.caption = element_text(size=15,hjust=0.5),
legend.title = element_text(size=15),
legend.text = element_text(size=15),
strip.text = element_text(size=15),
legend.position = "bottom")
#### Situation 4. Revome grid line of the plot
theme(panel.grid = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
#### Situation 5. Use additional theme from ggthemes package
ggthemes::theme_wsj() # wall streat journal theme
ggthemes::theme_tufte() # Tufte maximal data
ggthemes::theme_stata() # Stata graph schemes
ggthemes::theme_excel() # old excel plot
ggthemes::theme_gdocs() # google doc chart
ggthemes::theme_fivethirtyeight() # fivethirtyeight.com
ggthemes::theme_economist() # Economist
ggthemes::theme_map() # for maps
#### Situation 1. adding text annotations
## using annotate function (only be added inside the plot)
Temp_plot +
annotate("text", x = 10, y = 5, label = "Texts here",
size = 8, color = "red", fontface = "bold.italic")+
annotate("text", x = 10, y = 4,
label = expression(italic("Everthing italic")), size = 6)
## using tag inside labs function (can be added outside of the plot area)
Temp_plot +
labs(tag = "tag A") +
theme(plot.tag = element_text(size = 20, face = "bold"),
plot.tag.position = c(0.05, 0.98))
#### Situation 2. add segment line
## using geom_segment
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +
geom_point() +
geom_segment(aes(x = -Inf, y = 60, xend = Inf, yend = 60), colour = "darkred", size = 1.5)+
geom_segment(aes(x = 60, y = -Inf, xend = 60, yend = Inf), colour = "darkred", size = 1.5)
## using geom_hline
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +
geom_point() +
geom_hline(yintercept = 70)
## using geom_vline
PseudoData %>%
ggplot(aes(x = Grade_Math_Y1, y = Grade_Stats_Y1)) +
geom_point() +
geom_vline(xintercept = 70)
#### Situation 3. add brackets (often works with boxplot)
## the old fashion curly bracket
library(pBrackets)
library(grid)
bracketsGrob <- function(...){
l <- list(...)
e <- new.env()
e$l <- l
grid:::recordGrob( {
do.call(grid.brackets, l)
}, e)
}
b1 <- bracketsGrob(1/6, 0.8, 1/2, 0.8, h = 0.05, lwd = 2, col = "red")
b2 <- bracketsGrob(5/6, 0.3, 1/2, 0.3, h = 0.05, lwd = 2, col = "red")
PseudoData %>%
ggplot(aes(x = Employed, y = Grade_Stats_Y1)) +
geom_boxplot() +
annotation_custom(b1) +
annotation_custom(b2)
## the more convenient square bracket with multiple ticks
PseudoData %>%
ggplot(aes(x = Employed, y = Grade_Stats_Y1)) +
geom_boxplot() +
geom_bracket(xmin = "No", xmax = "Yes", y.position = 100, label = "TEXT", size = 1, tip.length = 0.03)+
geom_bracket(xmin = "No", xmax = "Yes", y.position = 50, label = "text", size = 1, tip.length = -0.03)
#### Method 1. This is from Davis McCarthy
# By Davis McCarthy
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
multiplot(Temp_ggplot1, Temp_ggplot2, cols=2)
#### Method 2. using patchwork package
library(patchwork)
## this allows for shared titles and format
Temp_ggplot1 +
labs(title = "shared title whatever") +
Temp_ggplot2
#### Method 3. working with ggsurvplot (containing risk tables)
ggsurvplot_list <- list()
ggsurvplot_list[[1]] <- Temp_ggsurvplot1
ggsurvplot_list[[2]] <- Temp_ggsurvplot2
arrange_ggsurvplots(ggsurvplot_list, print = T, ncol = 2, nrow = 1, risk.table.height = 0.5)
## if no risk table is included, multiplot function works as well
multiplot(Temp_ggsurvplot1$plot,Temp_ggsurvplot1$plot, cols=2)
#### Method 4. Run the same ggplot function on a list of data frames
# Plot_List_Shell <- c()
# DataList <- list(Data_Frame1, Data_Frame2, Data_Frame3)
# names(DataList) <- c("Data_Name1","Data_Name2","Data_Name3")
# Yourggplot <- ggplot()
#
# Plot_List_Shell[["Data_Name1"]] <- Yourggplot(data= Data_Frame1)
# Plot_List_Shell[["Data_Name2"]] <- Yourggplot(data= Data_Frame2)
#### Method 5. merging base plot
par(mfrow=c(1,3))
hist(mtcars$wt)
hist(mtcars$mpg)
hist(mtcars$disp)
#### Method 6. merging multiple existing images
# library(gird)
# library(gridExtra)
# library(ggplot)
# jpeg1 <- readJPEG("image1.jpeg")
# jpeg2 <- readJPEG("image2.jpeg")
# jpeg3 <- readJPEG("image3.jpeg")
# jpeg4 <- readJPEG("image4.jpeg")
# jpeg5 <- readJPEG("image5.jpeg")
# jpeg6 <- readJPEG("image6.jpeg")
#
# arranged_fig <-
# grid.arrange(rasterGrob(jpeg1), rasterGrob(jpeg2), rasterGrob(jpeg3),
# rasterGrob(jpeg4), rasterGrob(jpeg5), rasterGrob(jpeg6),
# ncol = 3)
#
# output_fig <-
# qplot(1:10, 1:10, geom = "blank") +
# annotation_custom(arranged_fig, xlim = 0, xmax = 10, ymin = 0, ymax = 10) +
# theme_bw() +
# theme(panel.grid.major = element_blank(),
# panel.grid.minor = element_blank(),
# axis.ticks = element_blank(),
# panel.border = element_blank())
#### Method 7. merging multiple plots with a specific layout
# library(grid)
# library(gridExtra)
#
# lay <- rbind(c(1, 1, 1, 1, 2, 2),
# c(1, 1, 1, 1, 2, 2),
# c(1, 1, 1, 1, 3, 3))
#
# grid.arrange(plot1, plot2, plot3, layout_matrix = lay)
library(gridExtra)
data_table <- tibble(course = c("Math", "Art"),
Year = c("1", "1"),
mean = c(mean(PseudoData$Grade_Math_Y1, na.rm = T),
mean(PseudoData$Grade_Art_Y1, na.rm = T)))
ggplot(data=PseudoData, aes(x = Grade_Math_Y1, y = Grade_Art_Y1)) +
geom_point() +
annotation_custom(tableGrob(data_table, rows = NULL), xmin = 40, xmax = 60, ymin = 50, ymax = 70)
library(plotly)
#### Situation 1. regular ggplot object
ggplotly(Temp_ggplot1)
#### Situation 2. ggplot object from ggsurvplot
ggplotly(Temp_ggsurvplot1$plot)
#### Situation 1. export plot into jpeg
jpeg("PlotName.jpeg", width = 15, height = 9, units = 'in', res = 300)
Plot1
dev.off()
#### Situation 2. export plot into pdf
pdf("PlotName.pdf",width = 15, height = 9)
Plot2
dev.off()
#### Situation 3. exporting image using ragg package
## the agg_xxxx function from ragg package outputs a slightly nicer image as it uses a better engine. What's inside the function should be exact the same as the orginal png() or jpeg() function.
ragg::agg_jpeg("PlotName.jpeg", width = 15, height = 9, units = 'in', res = 300)
Plot1
dev.off()
A faster and convenient way to install packages
library(pak)
# pak::pkg_install("Waldo")
Contianing several datasets with world country names and abbreviations, as well as latitude and longitude of world/US cities.
maps::iso3166 %>% head()
## a2 a3 ISOname mapname sovereignty
## 1 AW ABW Aruba Aruba Netherlands
## 2 AF AFG Afghanistan Afghanistan Afghanistan
## 3 AO AGO Angola Angola Angola
## 4 AI AIA Anguilla Anguilla Anguilla
## 5 AX ALA Aland Islands Finland:Aland Islands Finland
## 6 AL ALB Albania Albania Albania
maps::us.cities %>% head()
## name country.etc pop lat long capital
## 1 Abilene TX TX 113888 32.45 -99.74 0
## 2 Akron OH OH 206634 41.08 -81.52 0
## 3 Alameda CA CA 70069 37.77 -122.26 0
## 4 Albany GA GA 75510 31.58 -84.18 0
## 5 Albany NY NY 93576 42.67 -73.80 2
## 6 Albany OR OR 45535 44.62 -123.09 0
A very helpful data source for nation-level income, economic and population data
#### Example of pulling GDP per capita in 2015 from WDI data
library(WDI)
Example_gdp_capita <- WDI(indicator = "NY.GDP.PCAP.CD",
start = 2015, end = 2015, extra = T) %>%
tbl_df()
Example_gdp_capita
## # A tibble: 266 × 13
## iso2c country NY.GD…¹ year status lastu…² iso3c region capital longi…³
## <chr> <chr> <dbl> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1A Arab World 6246. 2015 "" 2022-1… ARB Aggre… "" ""
## 2 1W World 10154. 2015 "" 2022-1… WLD Aggre… "" ""
## 3 4E East Asia & … 6502. 2015 "" 2022-1… EAP Aggre… "" ""
## 4 7E Europe & Cen… 7458. 2015 "" 2022-1… ECA Aggre… "" ""
## 5 8S South Asia 1524. 2015 "" 2022-1… SAS Aggre… "" ""
## 6 AD Andorra 38885. 2015 "" 2022-1… AND Europ… "Andor… "1.521…
## 7 AE United Arab … 41525. 2015 "" 2022-1… ARE Middl… "Abu D… "54.37…
## 8 AF Afghanistan 592. 2015 "" 2022-1… AFG South… "Kabul" "69.17…
## 9 AG Antigua and … 14862. 2015 "" 2022-1… ATG Latin… "Saint… "-61.8…
## 10 AL Albania 3953. 2015 "" 2022-1… ALB Europ… "Tiran… "19.81…
## # … with 256 more rows, 3 more variables: latitude <chr>, income <chr>,
## # lending <chr>, and abbreviated variable names ¹NY.GDP.PCAP.CD,
## # ²lastupdated, ³longitude
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
library(countrycode)
countrycode(c("china","us"), "country.name", "iso2c")
## [1] "CN" "US"
countrycode(c("china","us"), "country.name", "iso3c")
## [1] "CHN" "USA"
library(ggsci)
It also allows user to design his/her own pre-set theme. See example in this page.
library(ggtech)
d2 <- data.frame(x = c(1:4, 3:1), y=1:7)
ggplot(aes(x,y), data=d2) +
geom_tech(size=2, theme="twitter") +
theme_tech("twitter") +
ggtitle("Twitter geom")
library(ggtext)
library(ggiraph)
#### just an example
gg1 <-
ggplot(PseudoData) +
geom_point_interactive(aes(x = Grade_Stats_Y1, y = Grade_Math_Y1, color = Employed,
tooltip = StudentID, data_id = StudentID), size = 3)
gg2 <-
ggplot(PseudoData) +
geom_point_interactive(aes(x = Grade_Art_Y1, y = Grade_Engl_Y1, color = Employed,
tooltip = StudentID, data_id = StudentID), size = 3)
girafe(ggobj = cowplot::plot_grid(gg1, gg2), width_svg = 8, height_svg = 4)
library(magick)
These four packages can be used to create shinymetrics for easy plotting, particularly useful in presenting count data across cateogires and time.
In addition, utilizing both shinydashboard and shinybones, a interactive dashboard can be easily created.
An speech focusing on the workflow can be found on this site. David Robinson has also given several examples in his Tidy Tuesday screen casts.
An example created by myself using data of Spotify songs is also available.
# devtools::install_github("ramnathv/tidymetrics")
# library(tidymetrics)
More details about ggcorr correlation plots can be seen in this site.
library(GGally)
PseudoData %>%
select(Race, Age,Grade_Math_Y1,Grade_Stats_Y1,Grade_Math_Y1_Letter,Grade_Stats_Y1_Letter) %>%
ggpairs()
This is a helpful package when you want to generate a quick plot with some summary statistics and quick tests. For more details, see this site.
library(ggstatsplot)
ggbetweenstats(
data = PseudoData,
x = Major,
y = Grade_Math_Y1,
title = "Distribution of first year math grade across majors"
)
See this page for more details: https://r-coder.com/calendar-plot-r/
library(calendR)
# Vector of NA of the same length of the number of days of the year
events <- rep(NA, 365)
# Set the corresponding events
events[40:45] <- "Trip"
events[213:240] <- "Holidays"
events[252] <- "Birthday"
events[359] <- "Christmas"
# Creating the calendar with a legend
calendR(year = 2025,
special.days = events,
special.col = c("pink", "lightblue", # Colors
"lightgreen", "lightsalmon"),
legend.pos = "right") # Legend to the right
# example two
calendR(start_date = "2020-09-01", # Custom start date
end_date = "2021-05-31", # Custom end date
start = "M", # Start the weeks on Monday
mbg.col = 4, # Color of the background of the names of the months
months.col = "white", # Color text of the names of the months
special.days = "weekend", # Color the weekends
special.col = "lightblue", # Color of the special.days
lty = 0, # Line type
bg.col = "#f4f4f4", # Background color
title = "Academic calendar 2020-2021", # Title
title.size = 30, # Title size
orientation = "p") # Vertical orientation
One quick example of how classification, regression and other applications work in tidymodel format can be found in David Robinson’s tidytuesday screen case: penguins.
Try copying a table/table-like object from this wikipedia site. Then click on “Addins” and click “Paste as tribble” (or other format of your choice), and it shall return something like this, which can be directly ran as an r object.
tibble::tribble(
~Position, ~Tonic, ~Heptatonic.mode, ~Pentatonic.scales, ~Name,
0L, "F", "Lydian", "Major", NA,
1L, "C", "Ionian (major)", "Major, ritusen", "Straight harp",
2L, "G", "Mixolydian", "Major, ritusen, suspended", "Crossharp",
3L, "D", "Dorian", "Minor, ritusen, suspended", "Slant harp",
4L, "A", "Aeolian (natural minor)", "Minor, man gong, suspended", NA,
5L, "E", "Phrygian", "Minor, man gong", NA,
6L, "B", "Locrian", "Man gong, blues", NA
)
## # A tibble: 7 × 5
## Position Tonic Heptatonic.mode Pentatonic.scales Name
## <int> <chr> <chr> <chr> <chr>
## 1 0 F Lydian Major <NA>
## 2 1 C Ionian (major) Major, ritusen Straight ha…
## 3 2 G Mixolydian Major, ritusen, suspended Crossharp
## 4 3 D Dorian Minor, ritusen, suspended Slant harp
## 5 4 A Aeolian (natural minor) Minor, man gong, suspended <NA>
## 6 5 E Phrygian Minor, man gong <NA>
## 7 6 B Locrian Man gong, blues <NA>
This is package that aimed at consolidate visualization tools for multivariate analyses such as: PCA, correspondence analysis, clustering, particitioning. More details to be found in this site.
raddle creates an interactive and click-based user interface that allow the user to explore/mine the data without having to type any code. In addition, it generates the code behind each click and make the output more streamlined. For details see this post.
# library(rattle)
# rattle()
For details, see this post.
# library(esquisse)
# esquisser()
This site contains some examples.
library("easyPubMed")
dami_query_string <- "yayi zhao[AU]"
dami_on_pubmed <- get_pubmed_ids(dami_query_string)
dami_papers <- fetch_pubmed_data(dami_on_pubmed)
titles <- sapply(dami_papers, custom_grep, tag = "ArticleTitle", format = "char", USE.NAMES = FALSE)
## The following articles are under my name, but some of them aren't really my work... blame PubMed for its search engine.
print(titles)
## [,1]
## [1,] "Feasibility of a Culturally Specific DEmentia Competence Education for Nursing Home Taskforce (DECENT) Programme: A Mixed-Method Approach."
## [2,] "History of keratinocyte carcinoma and survival after a second primary malignancy: the Moffitt Cancer Center patient experience."
## [3,] "Ethical challenges experienced by care home staff during COVID-19 pandemic."
## [4,] "Effect of a culturally sensitive DEmentia Competence Education for Nursing home Taskforce (DECENT) programme in China: A quasi-experimental study."
## [5,] "Translation and validation of Chinese version of sense of competence in dementia care staff scale in healthcare providers: a cross-sectional study."
## [6,] "Effect of prior antibiotic or chemotherapy treatment on immunotherapy response in non-small cell lung cancer."
## [7,] "Natural History of Incident and Persistent Cutaneous Human Papillomavirus and Human Polyomavirus Infections."
## [8,] "Circulating Immunosuppressive Regulatory T Cells Predict Risk of Incident Cutaneous Squamous Cell Carcinoma."
## [9,] "Dementia care education interventions on healthcare providers' outcomes in the nursing home setting: A systematic review."
## [10,] "Staff's Psychological Well-Being and Coping Mechanisms During COVID-19 Lockdown in Care Homes for Older Adults: A Structural Equation Modeling Analysis."
## [11,] "Cutaneous Human Papillomaviruses and the Risk of Keratinocyte Carcinomas."
## [12,] "Association between Human Polyomaviruses and Keratinocyte Carcinomas: A Prospective Cohort Study."
## [13,] "Gamification for promoting advance care planning: A mixed-method systematic review and meta-analysis."
## [14,] "Understanding dementia care in care home setting in China: An exploratory qualitative study."
## [15,] "Validation of a Chinese version of the dementia knowledge assessment scale in healthcare providers in China."
## [16,] "Cutaneous viral infections associated with ultraviolet radiation exposure."
## [17,] "Measuring Self-Efficacy and Readiness for Advance Care Planning in Chinese Older Adults."
## [18,] "Viruses in Skin Cancer (VIRUSCAN): Study Design and Baseline Characteristics of a Prospective Clinic-Based Cohort Study."
## [19,] "T Regulatory Cell Subpopulations Associated with Recent Ultraviolet Radiation Exposure in a Skin Cancer Screening Cohort."
## [20,] "Cutaneous Viral Infections Across 2 Anatomic Sites Among a Cohort of Patients Undergoing Skin Cancer Screening."
This is something that I am yet to try. This package is supposed to update R, not the packages, using r itself. Check out this post for more details.