References

https://cran.r-project.org/

https://r4ds.had.co.nz/

https://adv-r.hadley.nz/

https://stats.idre.ucla.edu/r/

Objectives

-Analyze various variables in the birthwt dataset (from the MASS package).

-Evaluate bwt, a continuous variable, in relation to other variables.

-Evaluate low, a categorical variable, in relation to other variables.

Libraries

library(tidyverse)
## -- Attaching packages ------------
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(doBy)
library(haven)

Structure of the dataset

data("birthwt")

str(birthwt)
## 'data.frame':    189 obs. of  10 variables:
##  $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Describe the variables

# variable name variable label coded levels
1 low indicator of birth weight less than 2.5 kg 0, 1
2 age mother’s age in years continuous variable
3 lwt mother’s weight in pounds at last menstrual period continuous variable
4 race mother’s race (1 = white, 2 = black, 3 = other) 1, 2, 3
5 smoke smoking status during pregnancy 0, 1
6 ptl number of previous premature labours 0, 1, 2, 3
7 ht history of hypertension 0, 1
8 ui presence of uterine irritability 0, 1
9 ftv number of physician visits during the first trimester 0, 1, 2, 3, 4, 6
10 bwt birth weight in grams continuous variable

Transform the data.frame

-Factorize -Collapse levels -Create indicator or dummy variables -Label the levels of a factor

# make a copy of the original data.frame

bwt.df <- birthwt  

names(bwt.df)
##  [1] "low"   "age"   "lwt"   "race"  "smoke" "ptl"   "ht"    "ui"   
##  [9] "ftv"   "bwt"

Make grouping and/or indicator variables for variable race

# Check the levels of the variable

table(bwt.df$race)
## 
##  1  2  3 
## 96 26 67
# Number of distinct race codes (missing values are not counted)

nlevels(factor(bwt.df$race))
## [1] 3
# Levels

table(bwt.df$race) %>% as.data.frame() %>% dplyr::select(Var1) %>% unlist() 
## Var11 Var12 Var13 
##     1     2     3 
## Levels: 1 2 3
# regroup the levels without collapsing

# Map race codes 1/2/3 onto group codes 0/1/2 (no levels are merged).
# Any row whose race is not 1, 2 or 3 (including NA) gets the sentinel 99.

bwt.df$racegr <- local({
  race_pos <- match(bwt.df$race, c(1, 2, 3))
  ifelse(is.na(race_pos), 99, c(0, 1, 2)[race_pos])
})

table(bwt.df$racegr)
## 
##  0  1  2 
## 96 26 67
table(bwt.df$racegr, bwt.df$race)
##    
##      1  2  3
##   0 96  0  0
##   1  0 26  0
##   2  0  0 67
# create two indicator/dummy variables

# 0/1 indicator columns for racegr == 1 (black) and racegr == 2 (other);
# rows with racegr == 0 are 0 in both columns and act as the reference group.

bwt.df$black <- as.numeric(bwt.df$racegr %in% 1)
bwt.df$other <- as.numeric(bwt.df$racegr %in% 2)

table(bwt.df$black, bwt.df$other)
##    
##      0  1
##   0 96 67
##   1 26  0

Make grouping and/or indicator variables for variable ptl

# Check levels of the variables

table(bwt.df$ptl)
## 
##   0   1   2   3 
## 159  24   5   1
nlevels(factor(bwt.df$ptl)) # No. of levels: 4
## [1] 4
table(bwt.df$ptl) %>% as.data.frame() %>% dplyr::select(Var1) %>% unlist() # Levels: 0 1 2 3
## Var11 Var12 Var13 Var14 
##     0     1     2     3 
## Levels: 0 1 2 3
# collapse into 3 levels


# Collapse ptl into three groups: 0 -> 0, 1 -> 1, and 2 or 3 -> 2.
# Any other value (including NA) gets the sentinel 99.

bwt.df$ptlgr <- local({
  ptl_pos <- match(bwt.df$ptl, c(0, 1, 2, 3))
  ifelse(is.na(ptl_pos), 99, c(0, 1, 2, 2)[ptl_pos])
})


table(bwt.df$ptlgr)
## 
##   0   1   2 
## 159  24   6
# create two indicator/dummy variables

# 0/1 indicator columns for ptlgr == 1 and ptlgr == 2;
# rows with ptlgr == 0 are 0 in both columns and act as the reference group.

bwt.df$one_preterm_labor <- as.numeric(bwt.df$ptlgr %in% 1)
bwt.df$twoOrMore_preterm_labor <- as.numeric(bwt.df$ptlgr %in% 2)

table(bwt.df$one_preterm_labor)
## 
##   0   1 
## 165  24
table(bwt.df$twoOrMore_preterm_labor)
## 
##   0   1 
## 183   6

Make grouping and/or indicator variables for variable ftv

# check levels

table(bwt.df$ftv)
## 
##   0   1   2   3   4   6 
## 100  47  30   7   4   1
nlevels(factor(bwt.df$ftv)) # No. of levels: 6
## [1] 6
table(bwt.df$ftv) %>% as.data.frame() %>% dplyr::select(Var1) %>% unlist() # Levels: 0 1 2 3 4 6
## Var11 Var12 Var13 Var14 Var15 Var16 
##     0     1     2     3     4     6 
## Levels: 0 1 2 3 4 6
# collapse into 3 levels

# Collapse ftv into three groups: 0 -> 0, 1 -> 1, and 2, 3, 4 or 6 -> 2.
# Any other value (including NA) gets the sentinel 99.

bwt.df$ftvgr <- local({
  ftv_pos <- match(bwt.df$ftv, c(0, 1, 2, 3, 4, 6))
  ifelse(is.na(ftv_pos), 99, c(0, 1, 2, 2, 2, 2)[ftv_pos])
})


table(bwt.df$ftvgr)
## 
##   0   1   2 
## 100  47  42
# create indicator/dummy variables

# 0/1 indicator columns for ftvgr == 1 and ftvgr == 2;
# rows with ftvgr == 0 are 0 in both columns and act as the reference group.

bwt.df$one_firstTimester_drVisit <- as.numeric(bwt.df$ftvgr %in% 1)
bwt.df$twoOrMore_firstTimester_drVisit <- as.numeric(bwt.df$ftvgr %in% 2)


table(bwt.df$one_firstTimester_drVisit)
## 
##   0   1 
## 142  47
table(bwt.df$twoOrMore_firstTimester_drVisit)
## 
##   0   1 
## 147  42

Factorize variables

# Add a plain factor copy of each coded column (levels are the codes).
# The i-th new column on the left is built from the i-th source column on the
# right; factor() is applied to each source column in turn.

bwt.df[c("lowf", "racegf", "smokef", "ptlgf", "htf", "uif", "ftvgf")] <-
  lapply(
    bwt.df[c("low", "racegr", "smoke", "ptlgr", "ht", "ui", "ftvgr")],
    factor
  )

Create factors with labelled levels

# Factors whose levels carry descriptive labels instead of numeric codes.

# Two-level (0/1) variables
bwt.df$lowfl <- factor(
  bwt.df$low,
  levels = c(0, 1),
  labels = c("High", "Low")
)

bwt.df$smokefl <- factor(
  bwt.df$smoke,
  levels = c(0, 1),
  labels = c("Non-smoker", "Smoker")
)

bwt.df$htfl <- factor(
  bwt.df$ht,
  levels = c(0, 1),
  labels = c("No_hypertension", "Hypertension")
)

bwt.df$uifl <- factor(
  bwt.df$ui,
  levels = c(0, 1),
  labels = c("No_uterine_irritability", "Uterine_irritability")
)

# Three-level (0/1/2) grouped variables
bwt.df$racegfl <- factor(
  bwt.df$racegr,
  levels = c(0, 1, 2),
  labels = c("White", "Black", "Other")
)

bwt.df$ptlgfl <- factor(
  bwt.df$ptlgr,
  levels = c(0, 1, 2),
  labels = c(
    "No_preterm_labor",
    "One_preterm_labor",
    "TwoOrMore_preterm_labor"
  )
)

bwt.df$ftvgfl <- factor(
  bwt.df$ftvgr,
  levels = c(0, 1, 2),
  labels = c(
    "No_physician_visit",
    "One_firstTimester_drVisit",
    "twoOrMore_firstTimester_drVisit"
  )
)

Rearrange the variables

# Reorder the columns so that each original variable is followed directly by
# the columns derived from it; bwt stays last.
bwt.df <- bwt.df[c(
  "low", "lowf", "lowfl",
  "age", "lwt",
  "race", "racegr", "racegf", "racegfl", "black", "other",
  "smoke", "smokef", "smokefl",
  "ptl", "ptlgr", "ptlgf", "ptlgfl",
  "one_preterm_labor", "twoOrMore_preterm_labor",
  "ht", "htf", "htfl",
  "ui", "uif", "uifl",
  "ftv", "ftvgr", "ftvgf", "ftvgfl",
  "one_firstTimester_drVisit", "twoOrMore_firstTimester_drVisit",
  "bwt"
)]

Structure of the data.frame after manipulations/transformations

str(bwt.df)
## 'data.frame':    189 obs. of  33 variables:
##  $ low                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lowf                           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ lowfl                          : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age                            : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt                            : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race                           : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ racegr                         : num  1 2 0 0 0 2 0 2 0 0 ...
##  $ racegf                         : Factor w/ 3 levels "0","1","2": 2 3 1 1 1 3 1 3 1 1 ...
##  $ racegfl                        : Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
##  $ black                          : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ other                          : num  0 1 0 0 0 1 0 1 0 0 ...
##  $ smoke                          : int  0 0 1 1 1 0 0 0 1 1 ...
##  $ smokef                         : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
##  $ smokefl                        : Factor w/ 2 levels "Non-smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
##  $ ptl                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ptlgr                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ptlgf                          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ptlgfl                         : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ one_preterm_labor              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ twoOrMore_preterm_labor        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht                             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ htf                            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ htfl                           : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ui                             : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ uif                            : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
##  $ uifl                           : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
##  $ ftv                            : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ ftvgr                          : num  0 2 1 2 0 0 1 1 1 0 ...
##  $ ftvgf                          : Factor w/ 3 levels "0","1","2": 1 3 2 3 1 1 2 2 2 1 ...
##  $ ftvgfl                         : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
##  $ one_firstTimester_drVisit      : num  0 0 1 0 0 0 1 1 1 0 ...
##  $ twoOrMore_firstTimester_drVisit: num  0 1 0 1 0 0 0 0 0 0 ...
##  $ bwt                            : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Variable names

names(bwt.df)
##  [1] "low"                             "lowf"                           
##  [3] "lowfl"                           "age"                            
##  [5] "lwt"                             "race"                           
##  [7] "racegr"                          "racegf"                         
##  [9] "racegfl"                         "black"                          
## [11] "other"                           "smoke"                          
## [13] "smokef"                          "smokefl"                        
## [15] "ptl"                             "ptlgr"                          
## [17] "ptlgf"                           "ptlgfl"                         
## [19] "one_preterm_labor"               "twoOrMore_preterm_labor"        
## [21] "ht"                              "htf"                            
## [23] "htfl"                            "ui"                             
## [25] "uif"                             "uifl"                           
## [27] "ftv"                             "ftvgr"                          
## [29] "ftvgf"                           "ftvgfl"                         
## [31] "one_firstTimester_drVisit"       "twoOrMore_firstTimester_drVisit"
## [33] "bwt"

Total number of variables

ncol(bwt.df) # Total number of variables: 33
## [1] 33

Make dummies using fastDummies library

library(fastDummies)
# select variables to keep in the data frame and also the variables for which dummies are to be created

# it keeps the variables transformed into dummies

# Keep the labelled factors plus the numeric columns, then append dummy columns
# for the three-level factors; the first level of each is dropped.
bwt.fd <- dummy_cols(
  dplyr::select(
    bwt.df,
    lowfl, age, lwt, racegfl, smokefl, ptlgfl, htfl, uifl, ftvgfl, bwt
  ),
  select_columns = c("racegfl", "ptlgfl", "ftvgfl"),
  remove_first_dummy = TRUE
)
str(bwt.fd)
## 'data.frame':    189 obs. of  16 variables:
##  $ lowfl                                 : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age                                   : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt                                   : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ racegfl                               : Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
##  $ smokefl                               : Factor w/ 2 levels "Non-smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
##  $ ptlgfl                                : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ htfl                                  : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ uifl                                  : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
##  $ ftvgfl                                : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
##  $ bwt                                   : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
##  $ racegfl_Black                         : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ racegfl_Other                         : int  0 1 0 0 0 1 0 1 0 0 ...
##  $ ptlgfl_One_preterm_labor              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ptlgfl_TwoOrMore_preterm_labor        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ftvgfl_One_firstTimester_drVisit      : int  0 0 1 0 0 0 1 1 1 0 ...
##  $ ftvgfl_twoOrMore_firstTimester_drVisit: int  0 1 0 1 0 0 0 0 0 0 ...

Make dummies using recipes library

# Preprocessing Tools to Create Design Matrices

library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
# select variables to keep in the data frame and also the variables for which dummies are to be created

# it won't keep the variables transformed into dummies

# Declare the predictors, mark the three-level factors for dummy encoding,
# estimate the recipe on bwt.df and apply it to the same data.
bwt.dm <-
  recipe(
    ~ lowfl + age + lwt + racegfl + smokefl + ptlgfl + htfl + uifl + ftvgfl + bwt,
    data = bwt.df
  ) %>%
  step_dummy(racegfl, ptlgfl, ftvgfl) %>%
  prep(training = bwt.df) %>%
  bake(new_data = bwt.df)
str(bwt.dm)
## Classes 'tbl_df', 'tbl' and 'data.frame':    189 obs. of  13 variables:
##  $ lowfl                                 : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age                                   : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt                                   : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ smokefl                               : Factor w/ 2 levels "Non-smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
##  $ htfl                                  : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ uifl                                  : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
##  $ bwt                                   : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
##  $ racegfl_Black                         : num  1 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ racegfl: chr "contr.treatment"
##  $ racegfl_Other                         : num  0 1 0 0 0 1 0 1 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ racegfl: chr "contr.treatment"
##  $ ptlgfl_One_preterm_labor              : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ptlgfl: chr "contr.treatment"
##  $ ptlgfl_TwoOrMore_preterm_labor        : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ptlgfl: chr "contr.treatment"
##  $ ftvgfl_One_firstTimester_drVisit      : num  0 0 1 0 0 0 1 1 1 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ftvgfl: chr "contr.treatment"
##  $ ftvgfl_twoOrMore_firstTimester_drVisit: num  0 1 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ftvgfl: chr "contr.treatment"

Save the birthwt dataset as a csv file

write_csv(birthwt, "birthwt.csv")

Import the birthwt.csv file as a tibble

bwt_t <- read_csv("birthwt.csv")
## Parsed with column specification:
## cols(
##   low = col_double(),
##   age = col_double(),
##   lwt = col_double(),
##   race = col_double(),
##   smoke = col_double(),
##   ptl = col_double(),
##   ht = col_double(),
##   ui = col_double(),
##   ftv = col_double(),
##   bwt = col_double()
## )

Structure of the imported tibble

str(bwt_t)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 189 obs. of  10 variables:
##  $ low  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : num  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : num  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : num  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: num  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : num  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : num  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : num  2523 2551 2557 2594 2600 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   low = col_double(),
##   ..   age = col_double(),
##   ..   lwt = col_double(),
##   ..   race = col_double(),
##   ..   smoke = col_double(),
##   ..   ptl = col_double(),
##   ..   ht = col_double(),
##   ..   ui = col_double(),
##   ..   ftv = col_double(),
##   ..   bwt = col_double()
##   .. )
glimpse(bwt_t)
## Observations: 189
## Variables: 10
## $ low   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ age   <dbl> 19, 33, 20, 21, 18, 21, 22, 17, 29, 26, 19, 19, 22, 30, ...
## $ lwt   <dbl> 182, 155, 105, 108, 107, 124, 118, 103, 123, 113, 95, 15...
## $ race  <dbl> 2, 3, 1, 1, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1, 1, 2, 1, 3,...
## $ smoke <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ ptl   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ ht    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ui    <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,...
## $ ftv   <dbl> 0, 3, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0,...
## $ bwt   <dbl> 2523, 2551, 2557, 2594, 2600, 2622, 2637, 2637, 2663, 26...

Transform the tibble

-Factorize

-Recode

-Regroup

-Collapse levels and label the levels

Factorize, recode, collapse the levels of the variables and/or label the levels of the variables

# Add factor versions of every coded column of the imported tibble.
# Suffix convention used below:
#   f   factor whose levels are the raw codes
#   gf  factor with regrouped / collapsed codes
#   fl, gfl  same factor with descriptive level labels
bwt_tcf <- mutate(
  bwt_t,

  # low birth weight indicator
  lowf = as_factor(low),
  lowfl = fct_recode(lowf, High = "0", Low = "1"),

  # smoking status
  smokef = as_factor(smoke),
  smokefl = fct_recode(smokef, Non_smoker = "0", Smoker = "1"),

  # race: codes shifted from 1/2/3 to 0/1/2, then labelled
  racef = as_factor(race),
  racegf = fct_recode(racef, "0" = "1", "1" = "2", "2" = "3"),
  racegfl = fct_recode(racegf, "white" = "0", "black" = "1", "other" = "2"),

  # hypertension
  htf = as_factor(ht),
  htfl = fct_recode(htf, No_hypertension = "0", Hypertension = "1"),

  # uterine irritability
  uif = as_factor(ui),
  uifl = fct_recode(
    uif,
    No_uterine_irritability = "0",
    Uterine_irritability = "1"
  ),

  # previous premature labours: levels 2 and 3 merged
  ptlf = as_factor(ptl),
  ptlgf = fct_collapse(ptlf, "0" = "0", "1" = "1", "2" = c("2", "3")),
  ptlgfl = fct_collapse(
    ptlf,
    No_preterm_labor = "0",
    One_preterm_labor = "1",
    Two_plus_preterm_labor = c("2", "3")
  ),

  # first-trimester physician visits: levels 2, 3, 4 and 6 merged
  ftvf = as_factor(ftv),
  ftvgf = fct_collapse(ftvf, "2" = c("2", "3", "4", "6")),
  ftvgfl = fct_collapse(
    ftvf,
    No_physician_visit = "0",
    One_physician_visit = "1",
    Two_plus_physician_visit = c("2", "3", "4", "6")
  )
)

Rearrange the variables of the tibble

# Reorder the columns so that each original variable is followed directly by
# the factor columns derived from it; bwt stays last.
bwt_tcf <- dplyr::select(
  bwt_tcf,
  low, lowf, lowfl,
  age, lwt,
  race, racef, racegf, racegfl,
  smoke, smokef, smokefl,
  ptl, ptlf, ptlgf, ptlgfl,
  ht, htf, htfl,
  ui, uif, uifl,
  ftv, ftvf, ftvgf, ftvgfl,
  bwt
)
str(bwt_tcf)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 189 obs. of  27 variables:
##  $ low    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lowf   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ lowfl  : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age    : num  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt    : num  182 155 105 108 107 124 118 103 123 113 ...
##  $ race   : num  2 3 1 1 1 3 1 3 1 1 ...
##  $ racef  : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
##  $ racegf : Factor w/ 3 levels "0","1","2": 2 3 1 1 1 3 1 3 1 1 ...
##  $ racegfl: Factor w/ 3 levels "white","black",..: 2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke  : num  0 0 1 1 1 0 0 0 1 1 ...
##  $ smokef : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
##  $ smokefl: Factor w/ 2 levels "Non_smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
##  $ ptl    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ptlf   : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ptlgf  : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ptlgfl : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ht     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ htf    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ htfl   : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ui     : num  1 0 0 1 1 0 0 0 0 0 ...
##  $ uif    : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
##  $ uifl   : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
##  $ ftv    : num  0 3 1 2 0 0 1 1 1 0 ...
##  $ ftvf   : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
##  $ ftvgf  : Factor w/ 3 levels "0","1","2": 1 3 2 3 1 1 2 2 2 1 ...
##  $ ftvgfl : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
##  $ bwt    : num  2523 2551 2557 2594 2600 ...
names(bwt_tcf)
##  [1] "low"     "lowf"    "lowfl"   "age"     "lwt"     "race"    "racef"  
##  [8] "racegf"  "racegfl" "smoke"   "smokef"  "smokefl" "ptl"     "ptlf"   
## [15] "ptlgf"   "ptlgfl"  "ht"      "htf"     "htfl"    "ui"      "uif"    
## [22] "uifl"    "ftv"     "ftvf"    "ftvgf"   "ftvgfl"  "bwt"

Make dummies using fastDummies library

# outputs all variables plus dummy variables

# Append dummy columns for the three labelled multi-level factors, dropping
# the first level of each; all existing columns are kept.
bwt_fd <- dummy_cols(
  bwt_tcf,
  select_columns = c("racegfl", "ptlgfl", "ftvgfl"),
  remove_first_dummy = TRUE
)
str(bwt_fd)
## Classes 'tbl_df', 'tbl' and 'data.frame':    189 obs. of  33 variables:
##  $ low                            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lowf                           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ lowfl                          : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age                            : num  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt                            : num  182 155 105 108 107 124 118 103 123 113 ...
##  $ race                           : num  2 3 1 1 1 3 1 3 1 1 ...
##  $ racef                          : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
##  $ racegf                         : Factor w/ 3 levels "0","1","2": 2 3 1 1 1 3 1 3 1 1 ...
##  $ racegfl                        : Factor w/ 3 levels "white","black",..: 2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke                          : num  0 0 1 1 1 0 0 0 1 1 ...
##  $ smokef                         : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
##  $ smokefl                        : Factor w/ 2 levels "Non_smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
##  $ ptl                            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ptlf                           : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ptlgf                          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ptlgfl                         : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ht                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ htf                            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ htfl                           : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ui                             : num  1 0 0 1 1 0 0 0 0 0 ...
##  $ uif                            : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
##  $ uifl                           : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
##  $ ftv                            : num  0 3 1 2 0 0 1 1 1 0 ...
##  $ ftvf                           : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
##  $ ftvgf                          : Factor w/ 3 levels "0","1","2": 1 3 2 3 1 1 2 2 2 1 ...
##  $ ftvgfl                         : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
##  $ bwt                            : num  2523 2551 2557 2594 2600 ...
##  $ racegfl_black                  : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ racegfl_other                  : int  0 1 0 0 0 1 0 1 0 0 ...
##  $ ptlgfl_One_preterm_labor       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ptlgfl_Two_plus_preterm_labor  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ftvgfl_One_physician_visit     : int  0 0 1 0 0 0 1 1 1 0 ...
##  $ ftvgfl_Two_plus_physician_visit: int  0 1 0 1 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Total number of variables: 33 (27 existing columns + 6 dummy columns)

ncol(bwt_fd)
## [1] 33

Make dummies using recipes library

# recipe selects some of the variables in the tibble

# won't keep the original variables transformed into dummies

# Declare the predictors, mark the three-level factors for dummy encoding,
# estimate the recipe on bwt_tcf and apply it to the same data.
bwt_dm <-
  recipe(
    ~ lowfl + age + lwt + racegfl + smokefl + ptlgfl + htfl + uifl + ftvgfl + bwt,
    data = bwt_tcf
  ) %>%
  step_dummy(racegfl, ptlgfl, ftvgfl) %>%
  prep(training = bwt_tcf) %>%
  bake(new_data = bwt_tcf)
str(bwt_dm)
## Classes 'tbl_df', 'tbl' and 'data.frame':    189 obs. of  13 variables:
##  $ lowfl                          : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age                            : num  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt                            : num  182 155 105 108 107 124 118 103 123 113 ...
##  $ smokefl                        : Factor w/ 2 levels "Non_smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
##  $ htfl                           : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ uifl                           : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
##  $ bwt                            : num  2523 2551 2557 2594 2600 ...
##  $ racegfl_black                  : num  1 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ racegfl: chr "contr.treatment"
##  $ racegfl_other                  : num  0 1 0 0 0 1 0 1 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ racegfl: chr "contr.treatment"
##  $ ptlgfl_One_preterm_labor       : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ptlgfl: chr "contr.treatment"
##  $ ptlgfl_Two_plus_preterm_labor  : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ptlgfl: chr "contr.treatment"
##  $ ftvgfl_One_physician_visit     : num  0 0 1 0 0 0 1 1 1 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ftvgfl: chr "contr.treatment"
##  $ ftvgfl_Two_plus_physician_visit: num  0 1 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int  0 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ ftvgfl: chr "contr.treatment"

Sample observations

head(birthwt, n=3)
low age lwt race smoke ptl ht ui ftv bwt
85 0 19 182 2 0 0 0 1 0 2523
86 0 33 155 3 0 0 0 0 3 2551
87 0 20 105 1 1 0 0 0 1 2557
head(bwt.df, n=3)
low lowf lowfl age lwt race racegr racegf racegfl black other smoke smokef smokefl ptl ptlgr ptlgf ptlgfl one_preterm_labor twoOrMore_preterm_labor ht htf htfl ui uif uifl ftv ftvgr ftvgf ftvgfl one_firstTimester_drVisit twoOrMore_firstTimester_drVisit bwt
85 0 0 High 19 182 2 1 1 Black 1 0 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit 0 0 2523
86 0 0 High 33 155 3 2 2 Other 0 1 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 3 2 2 twoOrMore_firstTimester_drVisit 0 1 2551
87 0 0 High 20 105 1 0 0 White 0 0 1 1 Smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 2557
birthwt %>% sample_n(2, replace = TRUE)
low age lwt race smoke ptl ht ui ftv bwt
1 25 92 1 1 0 0 0 0 1928
1 20 150 1 1 0 0 0 2 1928
# glimpse the 1st 3 rows of the tibble
# Note there are no row or observation numbers.

head(bwt_t, n=3)
low age lwt race smoke ptl ht ui ftv bwt
0 19 182 2 0 0 0 1 0 2523
0 33 155 3 0 0 0 0 3 2551
0 20 105 1 1 0 0 0 1 2557
# sample two observations

bwt_t %>% sample_n(2)
low age lwt race smoke ptl ht ui ftv bwt
0 28 130 3 0 0 0 0 0 3969
0 29 150 1 0 0 0 0 2 2920
# first 3 rows

bwt_tcf[1:3,]
low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl bwt
0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit 2523
0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit 2551
0 0 High 20 105 1 1 0 white 1 1 Smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_physician_visit 2557
# sample 1% of the tibble

bwt_tcf %>% sample_frac(.01)
low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl bwt
1 1 Low 21 100 3 3 2 other 0 0 Non_smoker 1 1 1 One_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 4 4 2 Two_plus_physician_visit 2301
0 0 High 35 170 1 1 0 white 0 0 Non_smoker 1 1 1 One_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_physician_visit 4174
# rows 1, 10 and 100

bwt_tcf[c(1, 10, 100),]
low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl bwt
0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit 2523
0 0 High 26 113 1 1 0 white 1 1 Smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 0 0 0 No_physician_visit 2665
0 0 High 30 137 1 1 0 white 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_physician_visit 3699

Sample variables

# output: a vector of length 189

bwt_tcf[["bwt"]] %>% length()          
## [1] 189
# output: a vector of length 189

bwt_tcf$bwt %>% length()       
## [1] 189
# output: a vector of 3 observations from variable "bwt"

bwt_tcf$bwt[c(1, 3, 6)]    
## [1] 2523 2557 2622

Select variables of your choice

## select race, smoke and bwt variables

birthwt[c(4, 5, 10)] %>% head(n=2)
race smoke bwt
85 2 0 2523
86 3 0 2551
bwt.df[c(6, 12, 33)] %>% head(n=2)
race smoke bwt
85 2 0 2523
86 3 0 2551
## select race, smoke and bwt variables

bwt_tcf %>% dplyr::select(race, smoke, bwt) %>% head(n=2)
race smoke bwt
2 0 2523
3 0 2551

Select continuous variables

bwt_tcf %>% dplyr::select(age, lwt, bwt) %>% head(n=2)
age lwt bwt
19 182 2523
33 155 2551

Select variables whose names end with “t” (ends_with)

bwt_tcf %>% dplyr::select(ends_with("t")) %>% head(n=2)
lwt ht bwt
182 0 2523
155 0 2551

Drop a few variables

bwt_tcf %>% dplyr::select(-c(low, race, smoke, ptl, ht, ui, ftv)) %>% head(n=2)
lowf lowfl age lwt racef racegf racegfl smokef smokefl ptlf ptlgf ptlgfl htf htfl uif uifl ftvf ftvgf ftvgfl bwt
0 High 19 182 2 1 black 0 Non_smoker 0 0 No_preterm_labor 0 No_hypertension 1 Uterine_irritability 0 0 No_physician_visit 2523
0 High 33 155 3 2 other 0 Non_smoker 0 0 No_preterm_labor 0 No_hypertension 0 No_uterine_irritability 3 2 Two_plus_physician_visit 2551

Obtain or filter or subset rows/observations from a dataset based on certain conditions

# output: a vector of the 96 "bwt" observations for which "racegfl" is "white"

bwt_tcf$bwt[bwt_tcf$racegfl=="white"] %>% length()   
## [1] 96
# output: "bwt" as a list, data.frame

bwt_tcf["bwt"] %>% dim()                
## [1] 189   1
# output: 3 observations/rows of the "bwt" variable

bwt_tcf[c(1,3,6), "bwt"]             
bwt
2523
2557
2622
# output: a data.frame of the rows where racegfl == "white"

bwt_tcf[bwt_tcf$racegfl=="white", ]   %>% dim()   
## [1] 96 27

Identify the observation or row number for a particular value (or values) of a variable

which(bwt.df$bwt==2557)
## [1] 3
bwt.df[bwt.df$bwt==2557,]
low lowf lowfl age lwt race racegr racegf racegfl black other smoke smokef smokefl ptl ptlgr ptlgf ptlgfl one_preterm_labor twoOrMore_preterm_labor ht htf htfl ui uif uifl ftv ftvgr ftvgf ftvgfl one_firstTimester_drVisit twoOrMore_firstTimester_drVisit bwt
87 0 0 High 20 105 1 0 0 White 0 0 1 1 Smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 2557
which(bwt_tcf$bwt==2557)
## [1] 3
bwt_tcf[bwt_tcf$bwt==2557,]
low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl bwt
0 0 High 20 105 1 1 0 white 1 1 Smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_physician_visit 2557

Identify observations where a variable has missing value

bwt_tcf[is.na(bwt_tcf$age),] %>% count()
n
0
filter(bwt_tcf, is.na(age)) %>% count()
n
0
bwt_tcf %>% filter(is.na(age)) %>% count()
n
0

Identify observations for non-missing values of a variable

bwt_tcf[!is.na(bwt_tcf$age), ] %>% count()
n
189
bwt_tcf %>% filter(!is.na(age)) %>% count()
n
189
sum(!is.na(bwt_tcf$age))
## [1] 189

Identify observations where the values of a variable are compared to certain statistical measures

-Suitable for continuous variables. For example, when age is less than or equal to the mean age.

# observations where age is less than or equal to the mean age 


bwt.df[bwt.df$age <= mean(bwt.df$age), ] %>% count()
n
107
bwt_tcf[bwt_tcf$age <= mean(bwt_tcf$age), ] %>% count()
n
107
bwt_tcf %>% filter(age<=mean(age)) %>% count()
n
107
# observations where age is greater than the mean age

bwt_tcf %>% filter(age > mean(age)) %>%  count() 
n
82
# observations where age is between 30 and 40 (must use "&")


bwt.df %>% subset(age >= 30 & age <= 40) %>% count()
n
26
# observations where age is between 30 and 40, counted by the levels of a categorical variable


bwt_tcf %>% filter(between(age, 30, 40)) %>% count(lowfl)
lowfl n
High 22
Low 4
bwt_tcf %>% filter(age >= 30, age <= 40) %>% count(racegfl)
racegfl n
white 18
black 2
other 6

Identify observations for various levels of a categorical variable

bwt.df[bwt.df$race==2, ] %>% count(race)
race n
2 26
bwt.df[bwt.df$racegr==1, ] %>% count(race)
race n
2 26
bwt.df[bwt.df$racegf==1, ] %>% count(race)
race n
2 26
bwt.df[bwt.df$racegfl=="Black", ] %>% count(race)
race n
2 26
bwt.df %>% subset(racegfl=="Black") %>% count(race)
race n
2 26
bwt_tcf[bwt_tcf$racegfl=="white", ] %>% count(race)
race n
1 96
bwt_tcf %>% subset(racegfl=="white") %>% count(race)
race n
1 96
bwt_tcf %>% filter(racegfl=="white") %>% count(race)
race n
1 96

Identify observations that meet more than one subsetting condition

# observations where race is white or black


bwt_tcf %>% filter(racegfl=="white" | racegfl=="black") %>% count() # 122
n
122
bwt_tcf %>% filter(racegfl=="white" | racegfl=="black") %>% count(race) # 96+26
race n
1 96
2 26
bwt_tcf %>% filter(racegfl %in% c("white", "black")) %>% count() #122
n
122
bwt_tcf %>% filter(racegfl %in% c("white", "black")) %>% count(race) #96+26
race n
1 96
2 26
# observations where race is neither white nor black


bwt_tcf[bwt_tcf$racegfl!="white" & bwt_tcf$racegfl!="black",] %>% count() # 67
n
67
bwt_tcf %>% subset(!(racegfl %in% c("white", "black")) ) %>%  count()#67
n
67
bwt_tcf %>% filter(racegfl!="white" & racegfl!="black") %>% count()  #67
n
67
bwt_tcf %>% filter(!(racegfl %in% c("white", "black"))) %>% count()  #67
n
67
# observations where race is black and the mother is a smoker


bwt_tcf %>% filter(racegfl=="black", smokefl=="Smoker") %>% count(race, smoke) #10
race smoke n
2 1 10
# observations where race is neither 1 nor 2

bwt_tcf[bwt_tcf$race!=1 & bwt_tcf$race!=2,] %>% count() # 67
n
67
subset(bwt_tcf, !(race %in% c(1, 2)) ) %>%  count() # 67
n
67
bwt_tcf %>% filter(race!=1 & race!=2) %>% count()  #67
n
67
bwt_tcf %>% filter(!(race==1 | race==2)) %>% count()  #67
n
67
bwt_tcf %>% filter(!(race %in% c(1, 2))) %>% count()  #67
n
67

Arrange (sort descending/ascending) variables of interest

-NA are always sorted to the end for local data

-Default is ascending

-desc denotes descending

names(bwt.df)
##  [1] "low"                             "lowf"                           
##  [3] "lowfl"                           "age"                            
##  [5] "lwt"                             "race"                           
##  [7] "racegr"                          "racegf"                         
##  [9] "racegfl"                         "black"                          
## [11] "other"                           "smoke"                          
## [13] "smokef"                          "smokefl"                        
## [15] "ptl"                             "ptlgr"                          
## [17] "ptlgf"                           "ptlgfl"                         
## [19] "one_preterm_labor"               "twoOrMore_preterm_labor"        
## [21] "ht"                              "htf"                            
## [23] "htfl"                            "ui"                             
## [25] "uif"                             "uifl"                           
## [27] "ftv"                             "ftvgr"                          
## [29] "ftvgf"                           "ftvgfl"                         
## [31] "one_firstTimester_drVisit"       "twoOrMore_firstTimester_drVisit"
## [33] "bwt"
# sort by decreasing age

bwt.df[order(bwt.df$age, decreasing = TRUE), ] %>% head(n=5)
low lowf lowfl age lwt race racegr racegf racegfl black other smoke smokef smokefl ptl ptlgr ptlgf ptlgfl one_preterm_labor twoOrMore_preterm_labor ht htf htfl ui uif uifl ftv ftvgr ftvgf ftvgfl one_firstTimester_drVisit twoOrMore_firstTimester_drVisit bwt
226 0 0 High 45 123 1 0 0 White 0 0 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 4990
108 0 0 High 36 202 1 0 0 White 0 0 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 2836
183 0 0 High 36 175 1 0 0 White 0 0 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 0 0 0 No_physician_visit 0 0 3600
119 0 0 High 35 121 2 1 1 Black 1 0 1 1 Smoker 1 1 1 One_preterm_labor 1 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 2948
223 0 0 High 35 170 1 0 0 White 0 0 0 0 Non-smoker 1 1 1 One_preterm_labor 1 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 4174
# sort by decreasing age and show only age and lwt

bwt.df[order(bwt.df$age, decreasing = TRUE), ][c(4, 5)] %>% head(n=5)  # columns 4 and 5 of bwt.df are age and lwt
age lwt
226 45 123
108 36 202
183 36 175
119 35 121
223 35 170
# sort by age, lwt and bwt


bwt_tcf[order(bwt_tcf$age, bwt_tcf$lwt, bwt_tcf$bwt), ][c(4, 5, 27)] %>% head(n=5)
age lwt bwt
14 100 2495
14 101 2466
14 135 3941
15 98 2778
15 110 2353
# sort by  bwt

bwt_tcf %>% dplyr::select(bwt) %>% arrange(bwt) %>% head(n=5)
bwt
709
1021
1135
1330
1474
bwt_tcf %>% dplyr::select(bwt) %>% arrange(desc(bwt)) %>% head(n=5)
bwt
4990
4593
4238
4174
4167
# sort by age, lwt and bwt


bwt_tcf %>% dplyr::select(age, lwt, bwt) %>% arrange(age, lwt, bwt) %>% head(n=5)
age lwt bwt
14 100 2495
14 101 2466
14 135 3941
15 98 2778
15 110 2353
bwt_tcf %>% dplyr::select(age, lwt, bwt) %>% arrange(desc(age), desc(lwt), desc(bwt)) %>% head(n=5)
age lwt bwt
45 123 4990
36 202 2836
36 175 3600
35 170 4174
35 121 2948

Reorder the position of the variables

bwt.df %>% dplyr::select(bwt, low, lowf, lowfl, everything()) %>% head(n=2)
bwt low lowf lowfl age lwt race racegr racegf racegfl black other smoke smokef smokefl ptl ptlgr ptlgf ptlgfl one_preterm_labor twoOrMore_preterm_labor ht htf htfl ui uif uifl ftv ftvgr ftvgf ftvgfl one_firstTimester_drVisit twoOrMore_firstTimester_drVisit
85 2523 0 0 High 19 182 2 1 1 Black 1 0 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit 0 0
86 2551 0 0 High 33 155 3 2 2 Other 0 1 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 3 2 2 twoOrMore_firstTimester_drVisit 0 1
bwt_tcf %>% dplyr::select(bwt, low, lowf, lowfl, everything()) %>% head(n=2)
bwt low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl
2523 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit
2551 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit

Rename a variable

bwt_tcf %>% rename(body_weight=bwt) %>% dplyr::select(body_weight, everything()) %>% head(n=2)
body_weight low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl
2523 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit
2551 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit

Mutate

quantile(bwt_tcf$bwt)
##   0%  25%  50%  75% 100% 
##  709 2414 2977 3487 4990
range(bwt_tcf$bwt)
## [1]  709 4990

Dichotomize or categorize a continuous variable using the if_else function

# outcome is a "numeric" variable

bwt_tcf %>% 
  mutate(bwt_category =if_else(bwt <= 2500, 1, 0)) %>%
  dplyr::select(bwt, bwt_category, everything()) %>% 
  head(n=2)
bwt bwt_category low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl
2523 0 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit
2551 0 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit
# outcome is a "factor" variable with numeric labels

bwt_tcf %>% 
  mutate(bwt_category =as_factor(if_else(bwt <= 2500, 1, 0))) %>%
  dplyr::select(bwt, bwt_category, everything()) %>% 
  head(n=2)
bwt bwt_category low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl
2523 0 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit
2551 0 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit
# outcome is a "character" variable

bwt_tcf %>% 
  mutate(bwt_category = if_else(bwt <= 2500, "low", "high")) %>%
  dplyr::select(bwt, bwt_category, everything()) %>% 
  head(n=2)
bwt bwt_category low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl
2523 high 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit
2551 high 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit
# outcome is a character "factor" variable

bwt_tcf %>% 
  mutate(bwt_category = as_factor (if_else(bwt <= 2500, "low", "high"))) %>%
  dplyr::select(bwt, bwt_category, everything()) %>% 
  head(n=2)
bwt bwt_category low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl
2523 high 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit
2551 high 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit

Dichotomize a continuous variable using cut function

# dichotomize bwt
# cut() closes intervals on the right by default (right = TRUE), so the bins are
# (-Inf, 2500] -> "low" and (2500, Inf) -> "high"; a weight of exactly 2500 g is "low"

bwt.df$bwt_category <- cut(bwt.df$bwt, breaks=c(-Inf, 2500, Inf), labels=c("low","high"))

bwt.df %>% count(bwt_category)
bwt_category n
low 59
high 130
table(bwt.df$lowfl, bwt.df$bwt_category)
##       
##        low high
##   High   0  130
##   Low   59    0
boxplot(bwt.df$bwt ~ bwt.df$bwt_category, col=2:3, horizontal = TRUE)

bwt_tcf <- bwt_tcf %>% mutate(bwt_category=cut(bwt, breaks=c(-Inf, 2500, Inf), labels=c("low","high")))

boxplot(bwt_tcf$bwt ~ bwt_tcf$bwt_category, col=2:3, horizontal = TRUE)

Multi-categorize a continuous variable using cut function

bwt.df$bwt %>% quantile()
##   0%  25%  50%  75% 100% 
##  709 2414 2977 3487 4990
# categorize using `cut` function into 4 equal-width intervals spanning the range of bwt

bwt.df$bwt_cut <- cut(bwt.df$bwt, 4, labels=c("wt1", "wt2", "wt3", "wt4"))


bwt.df %>% count(bwt_cut)
bwt_cut n
wt1 9
wt2 73
wt3 91
wt4 16
# Cut into 4 groups (using quantile values)

bwt.df$bwt_category <- cut(bwt.df$bwt, 
                   breaks=c(-Inf, 2414, 2977, 3487, Inf), 
                   labels=c("wt1", "wt2", "wt3", "wt4"))


bwt.df %>% count(bwt_category)
bwt_category n
wt1 48
wt2 49
wt3 45
wt4 47
boxplot(bwt.df$bwt ~ bwt.df$bwt_category, col=5:8, horizontal = TRUE)

# use quantile probabilities instead of calculated values to cut into 4 groups
# the lowest break is the minimum of bwt itself; because cut() intervals are
# right-closed, include.lowest = TRUE is needed so that minimum is not turned into NA

bwt.df$bwt_quantiles <- cut(bwt.df$bwt, 
                   breaks=quantile(bwt.df$bwt, probs = seq(0, 1, 0.25)), 
                   include.lowest = TRUE,
                   labels=c(1:4))

bwt.df %>% count(bwt_quantiles)
bwt_quantiles n
1 48
2 49
3 45
4 47
bwt.df$bwt_quantiles <- cut(bwt.df$bwt, 
                   breaks=quantile(bwt.df$bwt, probs = c(0, 0.25, 0.5, 0.75, 1)), 
                   include.lowest = TRUE,
                   labels=c("wt1", "wt2", "wt3", "wt4"))


bwt.df %>% count(bwt_quantiles)
bwt_quantiles n
wt1 48
wt2 49
wt3 45
wt4 47

Create ordered factors from a continuous variable

# cut bwt into three groups at its 33.7th and 66.3rd percentiles
# NOTE(review): 0.337/0.663 are used rather than exact thirds -- presumably to balance the group sizes; confirm
# labels are assigned to the intervals in ascending order of bwt, so with labels = 3:1
# level "3" is the lightest group and level "1" the heaviest
bwt.df <- bwt.df %>% mutate(bwt_tertile=cut(bwt, 
                                  breaks=quantile(bwt, probs = c(0, 0.337, 0.663, 1)), 
                                  include.lowest = TRUE,
                                  labels=c(3:1)))

bwt.df %>% count(bwt_tertile)
bwt_tertile n
3 64
2 62
1 63
boxplot(bwt.df$bwt ~ bwt.df$bwt_tertile, col=5:7, horizontal = TRUE )

# Turn bwt_tertile into an ordered factor low < medium < high.
# bwt_tertile was created with labels = 3:1, i.e. level "3" is the LIGHTEST
# tertile and level "1" the HEAVIEST. The levels are therefore listed as
# "3", "2", "1" so that "low" is attached to the lightest babies and "high"
# to the heaviest (listing them as "1", "2", "3" would invert the labels).
bwt.df <- bwt.df %>% 
  mutate(
    # equivalent alternatives:
    #bwt_tertile_recode=fct_recode(bwt_tertile, Low="3", Medium="2", High="1"),
    #bwt_tertile_order=factor(bwt_tertile_recode, ordered=TRUE),
    #bwt_tertile_order=ordered(bwt_tertile_recode)
    bwt_tertile_order = factor(bwt_tertile,
                               levels = c("3", "2", "1"),
                               labels = c("low", "medium", "high"),
                               ordered = TRUE)
  )

bwt.df %>% count(bwt_tertile_order)
bwt_tertile_order n
low 64
medium 62
high 63
boxplot(bwt.df$bwt ~ bwt.df$bwt_tertile_order, col=5:7, horizontal = TRUE )

Data transformation

# create numeric, logical and factor variable

bwt_tcf %>% 
  mutate(bwt_in_kg=bwt/1000, 
         low_bwt=bwt_in_kg <= 2.5,
         low_wt_class = as_factor(if_else(low_bwt, 1, 0))) %>% 
  dplyr::select(bwt, bwt_in_kg, low_bwt, low_wt_class) %>% 
  head(n=2)
bwt bwt_in_kg low_bwt low_wt_class
2523 2.523 FALSE 0
2551 2.551 FALSE 0

Introduce NA

bwt_tcf %>% 
  mutate(bwt_value = ifelse(bwt <= 2500, NA, bwt)) %>%
  arrange(bwt_value) %>% 
  dplyr::select(bwt, bwt_value, everything()) %>% tail(n=2)
bwt bwt_value low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl bwt_category
2495 NA 1 1 Low 17 142 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 1 1 Hypertension 0 0 No_uterine_irritability 0 0 0 No_physician_visit low
2495 NA 1 1 Low 21 130 1 1 0 white 1 1 Smoker 0 0 0 No_preterm_labor 1 1 Hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit low
bwt_tcf %>% 
  mutate(bwt_new = ifelse(bwt < 2600 | bwt > 4000, NA, bwt)) %>% 
  dplyr::select(bwt, bwt_new, everything()) %>% 
  head(n=2)
bwt bwt_new low lowf lowfl age lwt race racef racegf racegfl smoke smokef smokefl ptl ptlf ptlgf ptlgfl ht htf htfl ui uif uifl ftv ftvf ftvgf ftvgfl bwt_category
2523 NA 0 0 High 19 182 2 2 1 black 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit high
2551 NA 0 0 High 33 155 3 3 2 other 0 0 Non_smoker 0 0 0 No_preterm_labor 0 0 No_hypertension 0 0 No_uterine_irritability 3 3 2 Two_plus_physician_visit high

Transmute

# keep only the new variables


bwt_tcf %>% 
  transmute(bwt_kg=bwt/1000, 
                   low_wt=bwt_kg <=2.5,
                   low_wt_class = as_factor(if_else(low_wt, 1, 0))) %>% 
  head(n=2)
bwt_kg low_wt low_wt_class
2.523 FALSE 0
2.551 FALSE 0

Summarise

Categorical variables: counts and frequencies

# output "tbl_df"     "tbl"        "data.frame"

bwt_tcf %>% count(race)
race n
1 96
2 26
3 67
bwt_tcf %>% count(low, race)
low race n
0 1 73
0 2 15
0 3 42
1 1 23
1 2 11
1 3 25
# output table

table(bwt_tcf$race)
## 
##  1  2  3 
## 96 26 67
table(bwt_tcf$low, bwt_tcf$race)
##    
##      1  2  3
##   0 73 15 42
##   1 23 11 25

Frequencies of the factor variables in the tibble

table(bwt_tcf$ptl)
## 
##   0   1   2   3 
## 159  24   5   1
table(bwt_tcf$ptlf)
## 
##   0   1   2   3 
## 159  24   5   1
table(bwt_tcf$ptlgf)
## 
##   0   1   2 
## 159  24   6
table(bwt_tcf$ptlgfl)
## 
##       No_preterm_labor      One_preterm_labor Two_plus_preterm_labor 
##                    159                     24                      6
table(bwt_tcf$ftv)
## 
##   0   1   2   3   4   6 
## 100  47  30   7   4   1
table(bwt_tcf$ftvf)
## 
##   0   1   2   3   4   6 
## 100  47  30   7   4   1
table(bwt_tcf$ftvgf)
## 
##   0   1   2 
## 100  47  42
table(bwt_tcf$ftvgfl)
## 
##       No_physician_visit      One_physician_visit Two_plus_physician_visit 
##                      100                       47                       42
table(bwt_tcf$race)
## 
##  1  2  3 
## 96 26 67
table(bwt_tcf$racef)
## 
##  1  2  3 
## 96 26 67
table(bwt_tcf$racegf)
## 
##  0  1  2 
## 96 26 67
table(bwt_tcf$racegfl)
## 
## white black other 
##    96    26    67

Continuous variables (without any grouping)

# individual stats

nrow(table(bwt_tcf$bwt)) 
## [1] 131
min(bwt_tcf$bwt)
## [1] 709
max(bwt_tcf$bwt)       
## [1] 4990
range(bwt_tcf$bwt)             
## [1]  709 4990
mean(bwt_tcf$bwt)
## [1] 2944.587
median(bwt_tcf$bwt)
## [1] 2977
sd(bwt_tcf$bwt)
## [1] 729.2143
mad(bwt_tcf$bwt)                  # median absolute deviation
## [1] 834.7038
var(bwt_tcf$bwt)
## [1] 531753.5
IQR(bwt_tcf$bwt)                  # interquartile range
## [1] 1073
quantile(bwt_tcf$bwt)             # quartiles (by default)
##   0%  25%  50%  75% 100% 
##  709 2414 2977 3487 4990
quantile(bwt_tcf$bwt, c(1, 3)/4)  # specific percentiles (25% & 75% in this case)
##  25%  75% 
## 2414 3487
kurtosi(bwt_tcf$bwt)             # psych package
## [1] -0.1436834
skew(bwt_tcf$bwt)                # psych package
## [1] -0.205337
# use summarise function

# One-row summary of bwt, transposed with t() so that every statistic is
# printed on its own row. na.rm = TRUE is passed wherever the function accepts it.
bwt_tcf %>% 
  summarise(no_of_total_obs=n(),
            no_of_distinct_bwt_obs=n_distinct(bwt),
            
            # location of the extremes and the middle of the distribution
            bwt_min=min(bwt, na.rm = TRUE),
            bwt_max=max(bwt, na.rm = TRUE),
            bwt_50th_quantile=quantile(bwt, 0.5, na.rm = TRUE),
            
            # central tendency
            bwt_avg=mean(bwt, na.rm = TRUE), 
            bwt_median=median(bwt, na.rm = TRUE),
            
            # spread
            bwt_sd=sd(bwt, na.rm = TRUE),
            bwt_IQR=IQR(bwt, na.rm = TRUE),
            bwt_mad=mad(bwt, na.rm = TRUE),
            
            # values picked by position (row order of bwt_tcf)
            bwt_first=first(bwt),
            bwt_last=last(bwt),
            bwt_4th=nth(bwt, 4)) %>%
  t()
##                             [,1]
## no_of_total_obs         189.0000
## no_of_distinct_bwt_obs  131.0000
## bwt_min                 709.0000
## bwt_max                4990.0000
## bwt_50th_quantile      2977.0000
## bwt_avg                2944.5873
## bwt_median             2977.0000
## bwt_sd                  729.2143
## bwt_IQR                1073.0000
## bwt_mad                 834.7038
## bwt_first              2523.0000
## bwt_last               2495.0000
## bwt_4th                2594.0000
# frequency

bwt_tcf %>% count(bwt) %>% arrange(desc(n)) %>% head()
bwt n
3062 5
2495 4
2920 4
2977 4
3651 4
1928 3
# use summary function

summary(bwt_tcf$bwt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     709    2414    2977    2945    3487    4990
# use describe function

describe(bwt_tcf$bwt) %>% t()
##                    X1
## vars        1.0000000
## n         189.0000000
## mean     2944.5873016
## sd        729.2142952
## median   2977.0000000
## trimmed  2961.7581699
## mad       834.7038000
## min       709.0000000
## max      4990.0000000
## range    4281.0000000
## skew       -0.2053370
## kurtosis   -0.1436834
## se         53.0425350
fivenum(bwt_tcf$bwt) #minimum, lower-hinge, median, upper-hinge, maximum
## [1]  709 2414 2977 3487 4990
boxplot.stats(bwt_tcf$bwt)
## $stats
## [1] 1021 2414 2977 3487 4990
## 
## $n
## [1] 189
## 
## $conf
## [1] 2853.682 3100.318
## 
## $out
## [1] 709

Whole data.frame

summary(bwt_tcf)
##       low         lowf     lowfl          age             lwt       
##  Min.   :0.0000   0:130   High:130   Min.   :14.00   Min.   : 80.0  
##  1st Qu.:0.0000   1: 59   Low : 59   1st Qu.:19.00   1st Qu.:110.0  
##  Median :0.0000                      Median :23.00   Median :121.0  
##  Mean   :0.3122                      Mean   :23.24   Mean   :129.8  
##  3rd Qu.:1.0000                      3rd Qu.:26.00   3rd Qu.:140.0  
##  Max.   :1.0000                      Max.   :45.00   Max.   :250.0  
##       race       racef  racegf  racegfl       smoke        smokef 
##  Min.   :1.000   1:96   0:96   white:96   Min.   :0.0000   0:115  
##  1st Qu.:1.000   2:26   1:26   black:26   1st Qu.:0.0000   1: 74  
##  Median :1.000   3:67   2:67   other:67   Median :0.0000          
##  Mean   :1.847                            Mean   :0.3915          
##  3rd Qu.:3.000                            3rd Qu.:1.0000          
##  Max.   :3.000                            Max.   :1.0000          
##        smokefl         ptl         ptlf    ptlgf  
##  Non_smoker:115   Min.   :0.0000   0:159   0:159  
##  Smoker    : 74   1st Qu.:0.0000   1: 24   1: 24  
##                   Median :0.0000   2:  5   2:  6  
##                   Mean   :0.1958   3:  1          
##                   3rd Qu.:0.0000                  
##                   Max.   :3.0000                  
##                     ptlgfl          ht          htf    
##  No_preterm_labor      :159   Min.   :0.00000   0:177  
##  One_preterm_labor     : 24   1st Qu.:0.00000   1: 12  
##  Two_plus_preterm_labor:  6   Median :0.00000          
##                               Mean   :0.06349          
##                               3rd Qu.:0.00000          
##                               Max.   :1.00000          
##               htfl           ui         uif    
##  No_hypertension:177   Min.   :0.0000   0:161  
##  Hypertension   : 12   1st Qu.:0.0000   1: 28  
##                        Median :0.0000          
##                        Mean   :0.1481          
##                        3rd Qu.:0.0000          
##                        Max.   :1.0000          
##                       uifl          ftv         ftvf    ftvgf  
##  No_uterine_irritability:161   Min.   :0.0000   0:100   0:100  
##  Uterine_irritability   : 28   1st Qu.:0.0000   1: 47   1: 47  
##                                Median :0.0000   2: 30   2: 42  
##                                Mean   :0.7937   3:  7          
##                                3rd Qu.:1.0000   4:  4          
##                                Max.   :6.0000   6:  1          
##                       ftvgfl         bwt       bwt_category
##  No_physician_visit      :100   Min.   : 709   low : 59    
##  One_physician_visit     : 47   1st Qu.:2414   high:130    
##  Two_plus_physician_visit: 42   Median :2977               
##                                 Mean   :2945               
##                                 3rd Qu.:3487               
##                                 Max.   :4990

lapply: Apply a Function over a List or Vector

#outputs a list

lapply(list(bwt_tcf$bwt, bwt_tcf$age, bwt_tcf$lwt), mean) 
## [[1]]
## [1] 2944.587
## 
## [[2]]
## [1] 23.2381
## 
## [[3]]
## [1] 129.8148

sapply: Apply a Function over a List or Vector

#outputs a vector

sapply(list(bwt_tcf$bwt, bwt_tcf$age, bwt_tcf$lwt), mean) 
## [1] 2944.5873   23.2381  129.8148

Grouping and summarizing

-for counting

-summarizing

Using tapply function

-Apply a Function Over a Ragged Array

# 1st dim(rows)=race, 2nd dim(cols)=smoke

tapply(bwt_tcf$bwt, list(bwt_tcf$race, bwt_tcf$smoke), mean)
##          0        1
## 1 3428.750 2826.846
## 2 2854.500 2504.000
## 3 2815.782 2757.167
#1st dim(rows)=race, 2nd dim(cols)=smoke, 3rd dim=ptl

tapply(bwt_tcf$bwt, list(bwt_tcf$race, bwt_tcf$smoke, bwt_tcf$ptl), mean) 
## , , 0
## 
##          0        1
## 1 3413.667 2902.225
## 2 2903.857 2474.500
## 3 2866.085 3065.750
## 
## , , 1
## 
##          0        1
## 1 4174.000 2452.000
## 2 2509.000 2622.000
## 3 2586.714 1766.667
## 
## , , 2
## 
##      0    1
## 1 3317 2601
## 2   NA   NA
## 3 2055 3260
## 
## , , 3
## 
##    0    1
## 1 NA 3637
## 2 NA   NA
## 3 NA   NA

Using aggregate function

# aggregate one or more continuous variables by one factor


aggregate(bwt ~ race, data=bwt_tcf, mean)
race bwt
1 3102.719
2 2719.692
3 2805.284
aggregate(cbind(bwt, lwt) ~ race, data=bwt_tcf, mean)
race bwt lwt
1 3102.719 132.0521
2 2719.692 146.8077
3 2805.284 120.0149
# aggregate one or more continuous variables by two factors


aggregate(bwt ~ race+smoke, data=bwt_tcf, mean)
race smoke bwt
1 0 3428.750
2 0 2854.500
3 0 2815.782
1 1 2826.846
2 1 2504.000
3 1 2757.167
aggregate(cbind(bwt, lwt) ~ race+smoke, data=bwt_tcf, mean)
race smoke bwt lwt
1 0 3428.750 138.8409
2 0 2854.500 149.4375
3 0 2815.782 119.1455
1 1 2826.846 126.3077
2 1 2504.000 142.6000
3 1 2757.167 124.0000
# aggregate a number of continuous variables by one or more factors

aggregate(. ~ race, data=bwt_tcf[c("bwt", "lwt", "age", "race")], mean) 
race bwt lwt age
1 3102.719 132.0521 24.29167
2 2719.692 146.8077 21.53846
3 2805.284 120.0149 22.38806
aggregate(. ~ race+smoke, data=bwt_tcf[c("bwt", "lwt", "age", "race", "smoke")], mean)  
race smoke bwt lwt age
1 0 3428.750 138.8409 26.02273
2 0 2854.500 149.4375 19.93750
3 0 2815.782 119.1455 22.36364
1 1 2826.846 126.3077 22.82692
2 1 2504.000 142.6000 24.10000
3 1 2757.167 124.0000 22.50000
# aggregate a continuous variable by many grouping variables

aggregate(bwt ~ ., data = bwt_tcf[c("race","smoke", "ht", "bwt")], mean) 
race smoke ht bwt
1 0 0 3436.395
2 0 0 2813.357
3 0 0 2874.824
1 1 0 2819.292
2 1 0 2656.111
3 1 0 2757.167
1 0 1 3100.000
2 0 1 3142.500
3 0 1 2063.000
1 1 1 2917.500
2 1 1 1135.000
# aggregate two continuous variables by many grouping variables

aggregate(cbind(bwt, lwt) ~ ., data = bwt_tcf[c("race","smoke", "ht", "bwt", "lwt")], mean) 
race smoke ht bwt lwt
1 0 0 3436.395 139.2791
2 0 0 2813.357 143.4286
3 0 0 2874.824 118.9608
1 1 0 2819.292 121.9583
2 1 0 2656.111 137.6667
3 1 0 2757.167 124.0000
1 0 1 3100.000 120.0000
2 0 1 3142.500 191.5000
3 0 1 2063.000 121.5000
1 1 1 2917.500 178.5000
2 1 1 1135.000 187.0000

Using summaryBy function

-Opportunity to calculate multiple summary statistics for multiple variables

-Need to convert the tibble to data.frame for summaryBy analysis

library(doBy)
# summary_fun: descriptive statistics of a numeric vector, returned as one
# named numeric vector so it can be used as FUN in summaryBy().
#
# Arguments:
#   x    numeric vector; NA values are dropped by every statistic except nobs
#   ...  extra arguments for quantile() only (e.g. probs, type). They are
#        deliberately NOT forwarded to the other statistics: min()/max() would
#        treat them as additional data values, and IQR()/sd() would reject them.
#
# Value: named numeric vector with elements
#   nobs (length of x, NAs included), minm, maxm, qntl.<p> (one per quantile),
#   iqr, MAD, avg, med, stdev, variance
summary_fun <- function(x, ...) {
  c(
    nobs = length(x),
    minm = min(x, na.rm = TRUE),
    maxm = max(x, na.rm = TRUE),
    qntl = quantile(x, na.rm = TRUE, ...),
    iqr = IQR(x, na.rm = TRUE),
    MAD = mad(x, na.rm = TRUE),
    avg = mean(x, na.rm = TRUE),
    med = median(x, na.rm = TRUE),
    stdev = sd(x, na.rm = TRUE),
    variance = var(x, na.rm = TRUE)
  )
}
bwt_tcf_df <- as.data.frame(bwt_tcf) # tibble needs to be converted to a data.frame to work with the "doBy" package
# default function is 'mean'

summaryBy(bwt ~ race, data=bwt_tcf_df) 
race bwt.mean
1 3102.719
2 2719.692
3 2805.284
summaryBy(bwt ~ race+smoke, data=bwt_tcf_df)
race smoke bwt.mean
1 0 3428.750
1 1 2826.846
2 0 2854.500
2 1 2504.000
3 0 2815.782
3 1 2757.167
summaryBy(cbind(bwt,age) ~ race+smoke, data=bwt_tcf_df) 
race smoke bwt.mean age.mean
1 0 3428.750 26.02273
1 1 2826.846 22.82692
2 0 2854.500 19.93750
2 1 2504.000 24.10000
3 0 2815.782 22.36364
3 1 2757.167 22.50000
# default function is 'mean'

summaryBy(list(c("bwt", "age"), c("race", "smoke")), data=bwt_tcf_df )
race smoke bwt.mean age.mean
1 0 3428.750 26.02273
1 1 2826.846 22.82692
2 0 2854.500 19.93750
2 1 2504.000 24.10000
3 0 2815.782 22.36364
3 1 2757.167 22.50000
# use "FUN" to define the functions 
# output according to the order of the functions listed
# the functions are passed unnamed, so the result columns are numbered FUN1..FUN12;
# quantile returns five values and therefore occupies FUN3-FUN7
# (FUN1 = min, FUN2 = max, FUN8 = IQR, FUN9 = mean, FUN10 = median, FUN11 = sd, FUN12 = var)

summaryBy(bwt ~ race + smoke, data=bwt_tcf_df, FUN=c(min, max, quantile, IQR, mean, median, sd, var))
race smoke bwt.FUN1 bwt.FUN2 bwt.FUN3 bwt.FUN4 bwt.FUN5 bwt.FUN6 bwt.FUN7 bwt.FUN8 bwt.FUN9 bwt.FUN10 bwt.FUN11 bwt.FUN12
1 0 1021 4990 1021 3062.00 3593.0 3873.00 4990 811.00 3428.750 3593.0 710.0989 504240.5
1 1 1790 4238 1790 2410.00 2775.5 3189.50 4238 779.50 2826.846 2775.5 626.4725 392467.8
2 0 1701 3860 1701 2480.75 2920.0 3331.25 3860 850.50 2854.500 2920.0 621.2543 385956.9
2 1 1135 3444 1135 2313.75 2381.0 2941.00 3444 627.25 2504.000 2381.0 637.0568 405841.3
3 0 1330 4054 1330 2313.00 2807.0 3253.00 4054 940.00 2815.782 2807.0 709.3493 503176.5
3 1 709 3572 709 2402.25 3146.5 3307.50 3572 905.25 2757.167 3146.5 810.0446 656172.3
# NOTE(review): in the printed result the values appear ordered function-by-function
# (min of bwt, min of age, max of bwt, max of age, ...) while the header groups the
# names by variable (bwt.FUN1..bwt.FUN12, age.FUN1..age.FUN12) -- verify which value
# belongs to which column before relying on these labels
summaryBy(cbind(bwt, age) ~ race + smoke, data=bwt_tcf_df, FUN=c(min, max, quantile, IQR, mean, median, sd, var))
race smoke bwt.FUN1 bwt.FUN2 bwt.FUN3 bwt.FUN4 bwt.FUN5 bwt.FUN6 bwt.FUN7 bwt.FUN8 bwt.FUN9 bwt.FUN10 bwt.FUN11 bwt.FUN12 age.FUN1 age.FUN2 age.FUN3 age.FUN4 age.FUN5 age.FUN6 age.FUN7 age.FUN8 age.FUN9 age.FUN10 age.FUN11 age.FUN12
1 0 1021 14 4990 45 1021 3062.00 3593.0 3873.00 4990 14 22.00 24.5 30.00 45 811.00 8.00 3428.750 26.02273 3593.0 24.5 710.0989 6.017373 504240.5 36.20877
1 1 1790 16 4238 33 1790 2410.00 2775.5 3189.50 4238 16 19.00 21.5 26.25 33 779.50 7.25 2826.846 22.82692 2775.5 21.5 626.4725 4.925807 392467.8 24.26357
2 0 1701 15 3860 27 1701 2480.75 2920.0 3331.25 3860 15 17.00 18.5 23.25 27 850.50 6.25 2854.500 19.93750 2920.0 18.5 621.2543 3.889623 385956.9 15.12917
2 1 1135 18 3444 35 1135 2313.75 2381.0 2941.00 3444 18 20.00 22.0 25.50 35 627.25 5.50 2504.000 24.10000 2381.0 22.0 637.0568 5.952591 405841.3 35.43333
3 0 1330 14 4054 33 1330 2313.00 2807.0 3253.00 4054 14 19.00 22.0 25.00 33 940.00 6.00 2815.782 22.36364 2807.0 22.0 709.3493 4.452896 503176.5 19.82828
3 1 709 14 3572 31 709 2402.25 3146.5 3307.50 3572 14 19.25 22.5 26.50 31 905.25 7.25 2757.167 22.50000 3146.5 22.5 810.0446 5.107926 656172.3 26.09091
# using custom function

summaryBy(bwt ~ race, data=bwt_tcf_df, FUN=summary_fun) 
race bwt.nobs bwt.minm bwt.maxm bwt.qntl.0% bwt.qntl.25% bwt.qntl.50% bwt.qntl.75% bwt.qntl.100% bwt.iqr bwt.MAD bwt.avg bwt.med bwt.stdev bwt.variance
1 96 1021 4990 1021 2584.75 3062 3651 4990 1066.25 867.3210 3102.719 3062 727.8861 529818.2
2 26 1135 3860 1135 2370.50 2849 3057 3860 686.50 693.8568 2719.692 2849 638.6839 407917.1
3 67 709 4054 709 2313.00 2835 3274 4054 961.00 693.8568 2805.284 2835 722.1944 521564.7
# summary_fun applied to bwt within each race x smoke combination
summaryBy(
  bwt ~ race + smoke,
  data = bwt_tcf_df,
  FUN = summary_fun
)
race smoke bwt.nobs bwt.minm bwt.maxm bwt.qntl.0% bwt.qntl.25% bwt.qntl.50% bwt.qntl.75% bwt.qntl.100% bwt.iqr bwt.MAD bwt.avg bwt.med bwt.stdev bwt.variance
1 0 44 1021 4990 1021 3062.00 3593.0 3873.00 4990 811.00 641.2245 3428.750 3593.0 710.0989 504240.5
1 1 52 1790 4238 1790 2410.00 2775.5 3189.50 4238 779.50 546.3381 2826.846 2775.5 626.4725 392467.8
2 0 16 1701 3860 1701 2480.75 2920.0 3331.25 3860 850.50 651.6027 2854.500 2920.0 621.2543 385956.9
2 1 10 1135 3444 1135 2313.75 2381.0 2941.00 3444 627.25 588.5922 2504.000 2381.0 637.0568 405841.3
3 0 55 1330 4054 1330 2313.00 2807.0 3253.00 4054 940.00 692.3742 2815.782 2807.0 709.3493 503176.5
3 1 12 709 3572 709 2402.25 3146.5 3307.50 3572 905.25 452.1930 2757.167 3146.5 810.0446 656172.3
# transpose the race x smoke summary so that each statistic becomes a row and
# each group a column, then print it without quotes using digits = 5

print(
  t(summaryBy(bwt ~ race + smoke, data = bwt_tcf_df, FUN = summary_fun)),
  quote = FALSE,
  digits = 5
)
##                       1         2         3         4         5         6
## race               1.00      1.00      2.00      2.00      3.00      3.00
## smoke              0.00      1.00      0.00      1.00      0.00      1.00
## bwt.nobs          44.00     52.00     16.00     10.00     55.00     12.00
## bwt.minm        1021.00   1790.00   1701.00   1135.00   1330.00    709.00
## bwt.maxm        4990.00   4238.00   3860.00   3444.00   4054.00   3572.00
## bwt.qntl.0%     1021.00   1790.00   1701.00   1135.00   1330.00    709.00
## bwt.qntl.25%    3062.00   2410.00   2480.75   2313.75   2313.00   2402.25
## bwt.qntl.50%    3593.00   2775.50   2920.00   2381.00   2807.00   3146.50
## bwt.qntl.75%    3873.00   3189.50   3331.25   2941.00   3253.00   3307.50
## bwt.qntl.100%   4990.00   4238.00   3860.00   3444.00   4054.00   3572.00
## bwt.iqr          811.00    779.50    850.50    627.25    940.00    905.25
## bwt.MAD          641.22    546.34    651.60    588.59    692.37    452.19
## bwt.avg         3428.75   2826.85   2854.50   2504.00   2815.78   2757.17
## bwt.med         3593.00   2775.50   2920.00   2381.00   2807.00   3146.50
## bwt.stdev        710.10    626.47    621.25    637.06    709.35    810.04
## bwt.variance  504240.47 392467.78 385956.93 405841.33 503176.47 656172.33

Using group_by function

# number of observations in each low x race group

bwt_tcf %>%
  group_by(low, race) %>%
  count()
low race n
0 1 73
0 2 15
0 3 42
1 1 23
1 2 11
1 3 25
# number of observations by group characteristics: group by low only and let
# count() add race as the second counting variable

bwt_tcf %>%
  group_by(low) %>%
  count(race)
low race n
0 1 73
0 2 15
0 3 42
1 1 23
1 2 11
1 3 25
# one grouping variable

# Descriptive statistics of birth weight (bwt) within each level of race.
bwt_tcf %>%
  group_by(race) %>%
  summarise(
    # group size and number of distinct bwt values in the group
    no_of_obs = n(),
    no_of_distinct_obs = n_distinct(bwt),

    # centre
    bwt_avg = mean(bwt, na.rm = TRUE),
    bwt_median = median(bwt, na.rm = TRUE),

    # spread
    bwt_sd = sd(bwt, na.rm = TRUE),
    bwt_IQR = IQR(bwt, na.rm = TRUE),
    bwt_mad = mad(bwt, na.rm = TRUE),

    # range and median expressed as a quantile
    bwt_min = min(bwt, na.rm = TRUE),
    bwt_max = max(bwt, na.rm = TRUE),
    bwt_50th_quantile = quantile(bwt, probs = 0.5, na.rm = TRUE),

    # positional picks: these depend on the current row order of bwt_tcf
    bwt_first = first(bwt),
    bwt_last = last(bwt),
    bwt_4th = nth(bwt, 4)
  )
race no_of_obs no_of_distinct_obs bwt_avg bwt_median bwt_sd bwt_IQR bwt_mad bwt_min bwt_max bwt_50th_quantile bwt_tirst bwt_last bwt_4th
1 96 80 3102.719 3062 727.8861 1066.25 867.3210 1021 4990 3062 2557 2495 2637
2 26 23 2719.692 2849 638.6839 686.50 693.8568 1135 3860 2849 2523 2495 2920
3 67 53 2805.284 2835 722.1944 961.00 693.8568 709 4054 2835 2551 2495 2722
# two grouping variables

# Descriptive statistics of birth weight (bwt) within each race x smoke cell.
bwt_tcf %>%
  group_by(race, smoke) %>%
  summarise(
    # group size and number of distinct bwt values in the group
    no_of_obs = n(),
    no_of_distinct_obs = n_distinct(bwt),

    # centre
    bwt_avg = mean(bwt, na.rm = TRUE),
    bwt_median = median(bwt, na.rm = TRUE),

    # spread
    bwt_sd = sd(bwt, na.rm = TRUE),
    bwt_IQR = IQR(bwt, na.rm = TRUE),
    bwt_mad = mad(bwt, na.rm = TRUE),

    # range and median expressed as a quantile
    bwt_min = min(bwt, na.rm = TRUE),
    bwt_max = max(bwt, na.rm = TRUE),
    bwt_50th_quantile = quantile(bwt, probs = 0.5, na.rm = TRUE),

    # positional picks: these depend on the current row order of bwt_tcf
    bwt_first = first(bwt),
    bwt_last = last(bwt),
    bwt_4th = nth(bwt, 4)
  ) %>%
  # summarise() only peels off the last grouping level (smoke); drop the
  # remaining race grouping so the result is a plain, ungrouped table
  ungroup()
race smoke no_of_obs no_of_distinct_obs bwt_avg bwt_median bwt_sd bwt_IQR bwt_mad bwt_min bwt_max bwt_50th_quantile bwt_tirst bwt_last bwt_4th
1 0 44 41 3428.750 3593.0 710.0989 811.00 641.2245 1021 4990 3593.0 2637 2353 2877
1 1 52 44 2826.846 2775.5 626.4725 779.50 546.3381 1790 4238 2775.5 2557 2495 2663
2 0 16 15 2854.500 2920.0 621.2543 850.50 651.6027 1701 3860 2920.0 2523 2495 2920
2 1 10 9 2504.000 2381.0 637.0568 627.25 588.5922 1135 3444 2381.0 2920 2381 3444
3 0 55 45 2815.782 2807.0 709.3493 940.00 692.3742 1330 4054 2807.0 2551 2495 2722
3 1 12 12 2757.167 3146.5 810.0446 905.25 452.1930 709 3572 3146.5 3090 2495 3303

Basic graphics and additional descriptive statistics

# move bwt, age and lwt to the front and keep every other column after them;
# select() is namespaced because MASS masks dplyr::select
bwt.df <- dplyr::select(bwt.df, bwt, age, lwt, everything())

# first three rows of the reordered data
head(bwt.df, 3)
bwt age lwt low lowf lowfl race racegr racegf racegfl black other smoke smokef smokefl ptl ptlgr ptlgf ptlgfl one_preterm_labor twoOrMore_preterm_labor ht htf htfl ui uif uifl ftv ftvgr ftvgf ftvgfl one_firstTimester_drVisit twoOrMore_firstTimester_drVisit bwt_category bwt_cut bwt_quantiles bwt_tertile bwt_tertile_order
2523 19 182 0 0 High 2 1 1 Black 1 0 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 1 1 Uterine_irritability 0 0 0 No_physician_visit 0 0 wt2 wt2 wt2 3 high
2551 33 155 0 0 High 3 2 2 Other 0 1 0 0 Non-smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 3 2 2 twoOrMore_firstTimester_drVisit 0 1 wt2 wt2 wt2 3 high
2557 20 105 0 0 High 1 0 0 White 0 0 1 1 Smoker 0 0 0 No_preterm_labor 0 0 0 0 No_hypertension 0 0 No_uterine_irritability 1 1 1 One_firstTimester_drVisit 1 0 wt2 wt2 wt2 3 high

Histograms, normal density curves and kernel density curves

# One panel per variable: a histogram on the density scale, overlaid with a
# normal curve (sample mean and sd), a kernel density estimate and a rug of
# the raw values.
par(mfrow = c(1, 3))

# the first three columns of bwt.df are bwt, age and lwt
for (var_name in names(bwt.df)[1:3]) {
  # [[ ]] always returns the column as a plain vector
  values <- bwt.df[[var_name]]
  value_mean <- mean(values)
  value_sd <- sd(values)

  # freq = FALSE puts the bars on the density scale so the curves are
  # comparable; label the x axis with the variable name instead of the
  # default deparsed expression
  hist(values, freq = FALSE, main = var_name, xlab = var_name)

  # normal density with the sample mean and standard deviation
  curve(
    dnorm(x, mean = value_mean, sd = value_sd),
    col = "darkred",
    lwd = 2,
    add = TRUE
  )

  # kernel density estimate
  lines(density(values), col = "blue", lwd = 2)

  # tick marks for the individual observations
  rug(values, col = "red", lwd = 2)
}

Boxplots

# three boxplots side by side, one for each of the first three columns of
# bwt.df, titled with the column name
par(mfrow = c(1, 3))

for (var_name in names(bwt.df)[1:3]) {
  boxplot(bwt.df[[var_name]], main = var_name)
}