Contents: data.frame | race | ptl | ftv | fastDummies library | recipes library | birthwt dataset as a csv file | birthwt.csv file as a tibble
-Analyze various variables in the birthwt dataset (from the MASS package).
-Evaluate bwt, a continuous variable, in relation to other variables.
-Evaluate low, a categorical variable, in relation to other variables.
library(tidyverse)
## -- Attaching packages ------------
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(doBy)
library(haven)
data("birthwt")
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
| # | variable name | variable label | coded levels |
|---|---|---|---|
| 1 | low | indicator of birth weight less than 2.5 kg | 0, 1 |
| 2 | age | mother’s age in years | continuous variable |
| 3 | lwt | mother’s weight in pounds at last menstrual period | continuous variable |
| 4 | race | mother’s race (1 = white, 2 = black, 3 = other) | 1, 2, 3 |
| 5 | smoke | smoking status during pregnancy | 0, 1 |
| 6 | ptl | number of previous premature labours | 0, 1, 2, 3 |
| 7 | ht | history of hypertension | 0, 1 |
| 8 | ui | presence of uterine irritability | 0, 1 |
| 9 | ftv | number of physician visits during the first trimester | 0, 1, 2, 3, 4, 6 |
| 10 | bwt | birth weight in grams | continuous variable |
data.frame
-Factorize
-Collapse levels
-Create indicator or dummy variables
-Label the levels of a factor
# make a copy of the original data.frame
bwt.df <- birthwt
names(bwt.df)
## [1] "low" "age" "lwt" "race" "smoke" "ptl" "ht" "ui"
## [9] "ftv" "bwt"
race
# Check the levels of the variable
table(bwt.df$race)
##
## 1 2 3
## 96 26 67
# No. of levels: number of distinct (non-missing) values of race.
# factor() builds the same level set that table() tabulates, so the count is
# identical to the length of the table without the data.frame round-trip.
nlevels(factor(bwt.df$race))
## [1] 3
# Levels
table(bwt.df$race) %>% as.data.frame() %>% dplyr::select(Var1) %>% unlist()
## Var11 Var12 Var13
## 1 2 3
## Levels: 1 2 3
# Regroup race one-to-one (no collapsing): codes 1, 2, 3 become 0, 1, 2.
# Any value that is not 1, 2 or 3 (including NA) receives the sentinel 99 so
# that an unhandled code is visible in the table() check that follows.
bwt.df$racegr <- ifelse(
  bwt.df$race %in% c(1, 2, 3),
  c(0, 1, 2)[match(bwt.df$race, c(1, 2, 3))],
  99
)
table(bwt.df$racegr)
##
## 0 1 2
## 96 26 67
table(bwt.df$racegr, bwt.df$race)
##
## 1 2 3
## 0 96 0 0
## 1 0 26 0
## 2 0 0 67
# Two 0/1 indicator (dummy) variables for the three race groups.
# racegr == 0 is the reference group: both indicators are 0 for those rows.
# %in% never yields NA, so every row gets a definite 0 or 1.
bwt.df$black <- as.numeric(bwt.df$racegr %in% 1)
bwt.df$other <- as.numeric(bwt.df$racegr %in% 2)
table(bwt.df$black, bwt.df$other)
##
## 0 1
## 0 96 67
## 1 26 0
ptl
# Check levels of the variable
table(bwt.df$ptl)
##
## 0 1 2 3
## 159 24 5 1
nlevels(factor(bwt.df$ptl)) # No. of levels: 4 (distinct non-missing values)
## [1] 4
table(bwt.df$ptl) %>% as.data.frame() %>% dplyr::select(Var1) %>% unlist() # Levels: 0 1 2 3
## Var11 Var12 Var13 Var14
## 0 1 2 3
## Levels: 0 1 2 3
# Collapse ptl (number of previous premature labours) into 3 groups:
#   0 = none, 1 = one, 2 = two or more.
# The top group uses `>= 2` rather than listing 2 and 3 explicitly, so a count
# above 3 would still land in "two or more" instead of the sentinel.
# Anything else (negative values, NA) gets 99 so it stands out in table().
bwt.df$ptlgr <- dplyr::case_when(
  bwt.df$ptl == 0 ~ 0,
  bwt.df$ptl == 1 ~ 1,
  bwt.df$ptl >= 2 ~ 2,
  TRUE ~ 99
)
table(bwt.df$ptlgr)
##
## 0 1 2
## 159 24 6
# Two 0/1 indicator (dummy) variables for the collapsed ptl groups.
# ptlgr == 0 (no previous premature labour) is the reference group.
# %in% never yields NA, so every row gets a definite 0 or 1.
bwt.df$one_preterm_labor <- as.numeric(bwt.df$ptlgr %in% 1)
bwt.df$twoOrMore_preterm_labor <- as.numeric(bwt.df$ptlgr %in% 2)
table(bwt.df$one_preterm_labor)
##
## 0 1
## 165 24
table(bwt.df$twoOrMore_preterm_labor)
##
## 0 1
## 183 6
ftv
# check levels
table(bwt.df$ftv)
##
## 0 1 2 3 4 6
## 100 47 30 7 4 1
nlevels(factor(bwt.df$ftv)) # No. of levels: 6 (distinct non-missing values)
## [1] 6
table(bwt.df$ftv) %>% as.data.frame() %>% dplyr::select(Var1) %>% unlist() # Levels: 0 1 2 3 4 6
## Var11 Var12 Var13 Var14 Var15 Var16
## 0 1 2 3 4 6
## Levels: 0 1 2 3 4 6
# Collapse ftv (physician visits in the first trimester) into 3 groups:
#   0 = none, 1 = one, 2 = two or more.
# The top group uses `>= 2` rather than listing the observed values
# (2, 3, 4, 6), so an unobserved count such as 5 is still classified as
# "two or more" instead of being left at the sentinel.
# Anything else (negative values, NA) gets 99 so it stands out in table().
bwt.df$ftvgr <- dplyr::case_when(
  bwt.df$ftv == 0 ~ 0,
  bwt.df$ftv == 1 ~ 1,
  bwt.df$ftv >= 2 ~ 2,
  TRUE ~ 99
)
table(bwt.df$ftvgr)
##
## 0 1 2
## 100 47 42
# Two 0/1 indicator (dummy) variables for the collapsed ftv groups.
# ftvgr == 0 (no physician visit) is the reference group.
# NOTE: the "Timester" spelling in the column names is kept on purpose because
# the later dplyr::select() call refers to these columns by exactly these names.
bwt.df$one_firstTimester_drVisit <- as.numeric(bwt.df$ftvgr %in% 1)
bwt.df$twoOrMore_firstTimester_drVisit <- as.numeric(bwt.df$ftvgr %in% 2)
table(bwt.df$one_firstTimester_drVisit)
##
## 0 1
## 142 47
table(bwt.df$twoOrMore_firstTimester_drVisit)
##
## 0 1
## 147 42
# Plain factors: one factor per coded variable; the levels are the numeric
# codes themselves ("0", "1", ...). as.factor would also work.
bwt.df[c("lowf", "racegf", "smokef", "ptlgf", "htf", "uif", "ftvgf")] <-
  lapply(
    bwt.df[c("low", "racegr", "smoke", "ptlgr", "ht", "ui", "ftvgr")],
    factor
  )

# Labelled factors: every coded variable starts at 0, so the i-th label
# belongs to code i - 1 (levels 0, 1 for two labels; 0, 1, 2 for three).
bwt.df[c("lowfl", "smokefl", "htfl", "uifl", "racegfl", "ptlgfl", "ftvgfl")] <-
  Map(
    function(codes, labels) {
      factor(codes, levels = seq_along(labels) - 1, labels = labels)
    },
    bwt.df[c("low", "smoke", "ht", "ui", "racegr", "ptlgr", "ftvgr")],
    list(
      c("High", "Low"),
      c("Non-smoker", "Smoker"),
      c("No_hypertension", "Hypertension"),
      c("No_uterine_irritability", "Uterine_irritability"),
      c("White", "Black", "Other"),
      c("No_preterm_labor", "One_preterm_labor", "TwoOrMore_preterm_labor"),
      c(
        "No_physician_visit",
        "One_firstTimester_drVisit",
        "twoOrMore_firstTimester_drVisit"
      )
    )
  )

# Reorder the columns so each original variable is followed by its derived
# versions (grouped code, plain factor, labelled factor, indicators).
# dplyr::select is spelled out because MASS::select masks it.
bwt.df <- dplyr::select(
  bwt.df,
  low, lowf, lowfl,
  age, lwt,
  race, racegr, racegf, racegfl, black, other,
  smoke, smokef, smokefl,
  ptl, ptlgr, ptlgf, ptlgfl, one_preterm_labor, twoOrMore_preterm_labor,
  ht, htf, htfl,
  ui, uif, uifl,
  ftv, ftvgr, ftvgf, ftvgfl,
  one_firstTimester_drVisit, twoOrMore_firstTimester_drVisit,
  bwt
)
str(bwt.df)
## 'data.frame': 189 obs. of 33 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lowf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ lowfl : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ racegr : num 1 2 0 0 0 2 0 2 0 0 ...
## $ racegf : Factor w/ 3 levels "0","1","2": 2 3 1 1 1 3 1 3 1 1 ...
## $ racegfl : Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
## $ black : num 1 0 0 0 0 0 0 0 0 0 ...
## $ other : num 0 1 0 0 0 1 0 1 0 0 ...
## $ smoke : int 0 0 1 1 1 0 0 0 1 1 ...
## $ smokef : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
## $ smokefl : Factor w/ 2 levels "Non-smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ptlgr : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ptlgf : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ptlgfl : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ one_preterm_labor : num 0 0 0 0 0 0 0 0 0 0 ...
## $ twoOrMore_preterm_labor : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ htf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ htfl : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ uif : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
## $ uifl : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ ftvgr : num 0 2 1 2 0 0 1 1 1 0 ...
## $ ftvgf : Factor w/ 3 levels "0","1","2": 1 3 2 3 1 1 2 2 2 1 ...
## $ ftvgfl : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
## $ one_firstTimester_drVisit : num 0 0 1 0 0 0 1 1 1 0 ...
## $ twoOrMore_firstTimester_drVisit: num 0 1 0 1 0 0 0 0 0 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
names(bwt.df)
## [1] "low" "lowf"
## [3] "lowfl" "age"
## [5] "lwt" "race"
## [7] "racegr" "racegf"
## [9] "racegfl" "black"
## [11] "other" "smoke"
## [13] "smokef" "smokefl"
## [15] "ptl" "ptlgr"
## [17] "ptlgf" "ptlgfl"
## [19] "one_preterm_labor" "twoOrMore_preterm_labor"
## [21] "ht" "htf"
## [23] "htfl" "ui"
## [25] "uif" "uifl"
## [27] "ftv" "ftvgr"
## [29] "ftvgf" "ftvgfl"
## [31] "one_firstTimester_drVisit" "twoOrMore_firstTimester_drVisit"
## [33] "bwt"
ncol(bwt.df) # Total number of variables: 33
## [1] 33
fastDummies library
library(fastDummies)
# Keep the analysis variables, then append dummy columns for the three
# multi-level factors. dummy_cols() leaves the source factor columns in place;
# remove_first_dummy = TRUE drops the dummy for each factor's first level,
# which then acts as the reference level.
bwt.fd <- dummy_cols(
  dplyr::select(
    bwt.df,
    lowfl, age, lwt, racegfl, smokefl, ptlgfl, htfl, uifl, ftvgfl, bwt
  ),
  select_columns = c("racegfl", "ptlgfl", "ftvgfl"),
  remove_first_dummy = TRUE
)
str(bwt.fd)
## 'data.frame': 189 obs. of 16 variables:
## $ lowfl : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ racegfl : Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
## $ smokefl : Factor w/ 2 levels "Non-smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptlgfl : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ htfl : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ uifl : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ ftvgfl : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
## $ racegfl_Black : int 1 0 0 0 0 0 0 0 0 0 ...
## $ racegfl_Other : int 0 1 0 0 0 1 0 1 0 0 ...
## $ ptlgfl_One_preterm_labor : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ptlgfl_TwoOrMore_preterm_labor : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ftvgfl_One_firstTimester_drVisit : int 0 0 1 0 0 0 1 1 1 0 ...
## $ ftvgfl_twoOrMore_firstTimester_drVisit: int 0 1 0 1 0 0 0 0 0 0 ...
recipes library
# Preprocessing Tools to Create Design Matrices
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
# Build the design data with recipes:
#   1. recipe()     - the one-sided formula lists the variables to keep
#   2. step_dummy() - replaces the three multi-level factors with dummy columns
#                     (unlike dummy_cols(), the source factors are not kept)
#   3. prep()       - estimates the step on bwt.df
#   4. bake()       - applies the prepared recipe to bwt.df
bwt.dm <- bake(
  prep(
    step_dummy(
      recipe(
        bwt.df,
        ~ lowfl + age + lwt + racegfl + smokefl + ptlgfl + htfl + uifl + ftvgfl + bwt
      ),
      racegfl, ptlgfl, ftvgfl
    ),
    training = bwt.df
  ),
  new_data = bwt.df
)
str(bwt.dm)
## Classes 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 13 variables:
## $ lowfl : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ smokefl : Factor w/ 2 levels "Non-smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ htfl : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ uifl : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
## $ racegfl_Black : num 1 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ racegfl: chr "contr.treatment"
## $ racegfl_Other : num 0 1 0 0 0 1 0 1 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ racegfl: chr "contr.treatment"
## $ ptlgfl_One_preterm_labor : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ptlgfl: chr "contr.treatment"
## $ ptlgfl_TwoOrMore_preterm_labor : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ptlgfl: chr "contr.treatment"
## $ ftvgfl_One_firstTimester_drVisit : num 0 0 1 0 0 0 1 1 1 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ftvgfl: chr "contr.treatment"
## $ ftvgfl_twoOrMore_firstTimester_drVisit: num 0 1 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ftvgfl: chr "contr.treatment"
birthwt dataset as a csv file
write_csv(birthwt, "birthwt.csv")
birthwt.csv file as a tibble
bwt_t <- read_csv("birthwt.csv")
## Parsed with column specification:
## cols(
## low = col_double(),
## age = col_double(),
## lwt = col_double(),
## race = col_double(),
## smoke = col_double(),
## ptl = col_double(),
## ht = col_double(),
## ui = col_double(),
## ftv = col_double(),
## bwt = col_double()
## )
str(bwt_t)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 10 variables:
## $ low : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age : num 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : num 182 155 105 108 107 124 118 103 123 113 ...
## $ race : num 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: num 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : num 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : num 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : num 2523 2551 2557 2594 2600 ...
## - attr(*, "spec")=
## .. cols(
## .. low = col_double(),
## .. age = col_double(),
## .. lwt = col_double(),
## .. race = col_double(),
## .. smoke = col_double(),
## .. ptl = col_double(),
## .. ht = col_double(),
## .. ui = col_double(),
## .. ftv = col_double(),
## .. bwt = col_double()
## .. )
glimpse(bwt_t)
## Observations: 189
## Variables: 10
## $ low <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ age <dbl> 19, 33, 20, 21, 18, 21, 22, 17, 29, 26, 19, 19, 22, 30, ...
## $ lwt <dbl> 182, 155, 105, 108, 107, 124, 118, 103, 123, 113, 95, 15...
## $ race <dbl> 2, 3, 1, 1, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1, 1, 2, 1, 3,...
## $ smoke <dbl> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ ptl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ ht <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ui <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,...
## $ ftv <dbl> 0, 3, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0,...
## $ bwt <dbl> 2523, 2551, 2557, 2594, 2600, 2622, 2637, 2637, 2663, 26...
-Factorize
-Recode
-Regroup
-Collapse levels and label the levels
# Derive factor versions of every coded variable in the tibble with forcats:
#   *f   - plain factor of the raw codes (as_factor)
#   *gf  - regrouped / collapsed codes, still numbered from 0
#   *fl  - factor with descriptive level labels
# The final select() puts each raw variable next to its derived versions.
bwt_tcf <- bwt_t %>%
  mutate(
    # low birth weight indicator
    lowf = as_factor(low),
    lowfl = fct_recode(lowf, High = "0", Low = "1"),
    # race: renumber 1/2/3 as 0/1/2, then label
    racef = as_factor(race),
    racegf = fct_recode(racef, "0" = "1", "1" = "2", "2" = "3"),
    racegfl = fct_recode(racegf, "white" = "0", "black" = "1", "other" = "2"),
    # smoking status
    smokef = as_factor(smoke),
    smokefl = fct_recode(smokef, Non_smoker = "0", Smoker = "1"),
    # previous premature labours: 2 and 3 collapsed into one level
    ptlf = as_factor(ptl),
    ptlgf = fct_collapse(ptlf, "0" = "0", "1" = "1", "2" = c("2", "3")),
    ptlgfl = fct_collapse(
      ptlf,
      No_preterm_labor = "0",
      One_preterm_labor = "1",
      Two_plus_preterm_labor = c("2", "3")
    ),
    # hypertension
    htf = as_factor(ht),
    htfl = fct_recode(htf, No_hypertension = "0", Hypertension = "1"),
    # uterine irritability
    uif = as_factor(ui),
    uifl = fct_recode(
      uif,
      No_uterine_irritability = "0",
      Uterine_irritability = "1"
    ),
    # first-trimester physician visits: 2, 3, 4 and 6 collapsed into one level
    ftvf = as_factor(ftv),
    ftvgf = fct_collapse(ftvf, "2" = c("2", "3", "4", "6")),
    ftvgfl = fct_collapse(
      ftvf,
      No_physician_visit = "0",
      One_physician_visit = "1",
      Two_plus_physician_visit = c("2", "3", "4", "6")
    )
  ) %>%
  dplyr::select(
    low, lowf, lowfl,
    age, lwt,
    race, racef, racegf, racegfl,
    smoke, smokef, smokefl,
    ptl, ptlf, ptlgf, ptlgfl,
    ht, htf, htfl,
    ui, uif, uifl,
    ftv, ftvf, ftvgf, ftvgfl,
    bwt
  )
str(bwt_tcf)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 27 variables:
## $ low : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lowf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ lowfl : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : num 182 155 105 108 107 124 118 103 123 113 ...
## $ race : num 2 3 1 1 1 3 1 3 1 1 ...
## $ racef : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
## $ racegf : Factor w/ 3 levels "0","1","2": 2 3 1 1 1 3 1 3 1 1 ...
## $ racegfl: Factor w/ 3 levels "white","black",..: 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke : num 0 0 1 1 1 0 0 0 1 1 ...
## $ smokef : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
## $ smokefl: Factor w/ 2 levels "Non_smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ptlf : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ ptlgf : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ptlgfl : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ht : num 0 0 0 0 0 0 0 0 0 0 ...
## $ htf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ htfl : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ui : num 1 0 0 1 1 0 0 0 0 0 ...
## $ uif : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
## $ uifl : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ ftv : num 0 3 1 2 0 0 1 1 1 0 ...
## $ ftvf : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
## $ ftvgf : Factor w/ 3 levels "0","1","2": 1 3 2 3 1 1 2 2 2 1 ...
## $ ftvgfl : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
## $ bwt : num 2523 2551 2557 2594 2600 ...
names(bwt_tcf)
## [1] "low" "lowf" "lowfl" "age" "lwt" "race" "racef"
## [8] "racegf" "racegfl" "smoke" "smokef" "smokefl" "ptl" "ptlf"
## [15] "ptlgf" "ptlgfl" "ht" "htf" "htfl" "ui" "uif"
## [22] "uifl" "ftv" "ftvf" "ftvgf" "ftvgfl" "bwt"
fastDummies library
# outputs all variables plus dummy variables
# Append dummy columns for the three labelled multi-level factors; every
# existing column of bwt_tcf is kept. remove_first_dummy = TRUE drops the
# dummy for each factor's first level (the reference level).
bwt_fd <- dummy_cols(
  bwt_tcf,
  select_columns = c("racegfl", "ptlgfl", "ftvgfl"),
  remove_first_dummy = TRUE
)
str(bwt_fd)
## Classes 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 33 variables:
## $ low : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lowf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ lowfl : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : num 182 155 105 108 107 124 118 103 123 113 ...
## $ race : num 2 3 1 1 1 3 1 3 1 1 ...
## $ racef : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
## $ racegf : Factor w/ 3 levels "0","1","2": 2 3 1 1 1 3 1 3 1 1 ...
## $ racegfl : Factor w/ 3 levels "white","black",..: 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke : num 0 0 1 1 1 0 0 0 1 1 ...
## $ smokef : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
## $ smokefl : Factor w/ 2 levels "Non_smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ptlf : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ ptlgf : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ptlgfl : Factor w/ 3 levels "No_preterm_labor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ht : num 0 0 0 0 0 0 0 0 0 0 ...
## $ htf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ htfl : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ui : num 1 0 0 1 1 0 0 0 0 0 ...
## $ uif : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
## $ uifl : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ ftv : num 0 3 1 2 0 0 1 1 1 0 ...
## $ ftvf : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
## $ ftvgf : Factor w/ 3 levels "0","1","2": 1 3 2 3 1 1 2 2 2 1 ...
## $ ftvgfl : Factor w/ 3 levels "No_physician_visit",..: 1 3 2 3 1 1 2 2 2 1 ...
## $ bwt : num 2523 2551 2557 2594 2600 ...
## $ racegfl_black : int 1 0 0 0 0 0 0 0 0 0 ...
## $ racegfl_other : int 0 1 0 0 0 1 0 1 0 0 ...
## $ ptlgfl_One_preterm_labor : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ptlgfl_Two_plus_preterm_labor : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ftvgfl_One_physician_visit : int 0 0 1 0 0 0 1 1 1 0 ...
## $ ftvgfl_Two_plus_physician_visit: int 0 1 0 1 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Total number of variables: 33 (the 27 columns of bwt_tcf + 6 dummy columns)
ncol(bwt_fd)
## [1] 33
recipes library
# recipe selects some of the variables in the tibble
# won't keep the original variables transformed into dummies
# Same recipe as for the data.frame, applied to the tibble:
# keep the variables named in the formula, replace the three multi-level
# factors with dummy columns, estimate the step on bwt_tcf, then apply it
# back to bwt_tcf.
bwt_dm <- bake(
  prep(
    step_dummy(
      recipe(
        bwt_tcf,
        ~ lowfl + age + lwt + racegfl + smokefl + ptlgfl + htfl + uifl + ftvgfl + bwt
      ),
      racegfl, ptlgfl, ftvgfl
    ),
    training = bwt_tcf
  ),
  new_data = bwt_tcf
)
str(bwt_dm)
## Classes 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 13 variables:
## $ lowfl : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : num 182 155 105 108 107 124 118 103 123 113 ...
## $ smokefl : Factor w/ 2 levels "Non_smoker","Smoker": 1 1 2 2 2 1 1 1 2 2 ...
## $ htfl : Factor w/ 2 levels "No_hypertension",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ uifl : Factor w/ 2 levels "No_uterine_irritability",..: 2 1 1 2 2 1 1 1 1 1 ...
## $ bwt : num 2523 2551 2557 2594 2600 ...
## $ racegfl_black : num 1 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ racegfl: chr "contr.treatment"
## $ racegfl_other : num 0 1 0 0 0 1 0 1 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ racegfl: chr "contr.treatment"
## $ ptlgfl_One_preterm_labor : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ptlgfl: chr "contr.treatment"
## $ ptlgfl_Two_plus_preterm_labor : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ptlgfl: chr "contr.treatment"
## $ ftvgfl_One_physician_visit : num 0 0 1 0 0 0 1 1 1 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ftvgfl: chr "contr.treatment"
## $ ftvgfl_Two_plus_physician_visit: num 0 1 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int 0 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ ftvgfl: chr "contr.treatment"
head(birthwt, n=3)
| low | age | lwt | race | smoke | ptl | ht | ui | ftv | bwt | |
|---|---|---|---|---|---|---|---|---|---|---|
| 85 | 0 | 19 | 182 | 2 | 0 | 0 | 0 | 1 | 0 | 2523 |
| 86 | 0 | 33 | 155 | 3 | 0 | 0 | 0 | 0 | 3 | 2551 |
| 87 | 0 | 20 | 105 | 1 | 1 | 0 | 0 | 0 | 1 | 2557 |
head(bwt.df, n=3)
| low | lowf | lowfl | age | lwt | race | racegr | racegf | racegfl | black | other | smoke | smokef | smokefl | ptl | ptlgr | ptlgf | ptlgfl | one_preterm_labor | twoOrMore_preterm_labor | ht | htf | htfl | ui | uif | uifl | ftv | ftvgr | ftvgf | ftvgfl | one_firstTimester_drVisit | twoOrMore_firstTimester_drVisit | bwt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 85 | 0 | 0 | High | 19 | 182 | 2 | 1 | 1 | Black | 1 | 0 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit | 0 | 0 | 2523 |
| 86 | 0 | 0 | High | 33 | 155 | 3 | 2 | 2 | Other | 0 | 1 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 2 | 2 | twoOrMore_firstTimester_drVisit | 0 | 1 | 2551 |
| 87 | 0 | 0 | High | 20 | 105 | 1 | 0 | 0 | White | 0 | 0 | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | 2557 |
birthwt %>% sample_n(2, replace = TRUE)
| low | age | lwt | race | smoke | ptl | ht | ui | ftv | bwt |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | 92 | 1 | 1 | 0 | 0 | 0 | 0 | 1928 |
| 1 | 20 | 150 | 1 | 1 | 0 | 0 | 0 | 2 | 1928 |
# glimpse the 1st 3 rows of the tibble
# Note there are no row or observation numbers.
head(bwt_t, n=3)
| low | age | lwt | race | smoke | ptl | ht | ui | ftv | bwt |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 182 | 2 | 0 | 0 | 0 | 1 | 0 | 2523 |
| 0 | 33 | 155 | 3 | 0 | 0 | 0 | 0 | 3 | 2551 |
| 0 | 20 | 105 | 1 | 1 | 0 | 0 | 0 | 1 | 2557 |
# sample two observations
bwt_t %>% sample_n(2)
| low | age | lwt | race | smoke | ptl | ht | ui | ftv | bwt |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 28 | 130 | 3 | 0 | 0 | 0 | 0 | 0 | 3969 |
| 0 | 29 | 150 | 1 | 0 | 0 | 0 | 0 | 2 | 2920 |
# first 3 rows
bwt_tcf[1:3,]
| low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl | bwt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit | 2523 |
| 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit | 2551 |
| 0 | 0 | High | 20 | 105 | 1 | 1 | 0 | white | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_physician_visit | 2557 |
# sample 1% of the tibble
bwt_tcf %>% sample_frac(.01)
| low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl | bwt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | Low | 21 | 100 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 1 | 1 | 1 | One_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 4 | 4 | 2 | Two_plus_physician_visit | 2301 |
| 0 | 0 | High | 35 | 170 | 1 | 1 | 0 | white | 0 | 0 | Non_smoker | 1 | 1 | 1 | One_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_physician_visit | 4174 |
# rows 1, 10 and 100
bwt_tcf[c(1, 10, 100),]
| low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl | bwt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit | 2523 |
| 0 | 0 | High | 26 | 113 | 1 | 1 | 0 | white | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 0 | 0 | 0 | No_physician_visit | 2665 |
| 0 | 0 | High | 30 | 137 | 1 | 1 | 0 | white | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_physician_visit | 3699 |
# output: a vector of length 189
bwt_tcf[["bwt"]] %>% length()
## [1] 189
# output: a vector of length 189
bwt_tcf$bwt %>% length()
## [1] 189
# output: a vector of 3 observations from variable "bwt"
bwt_tcf$bwt[c(1, 3, 6)]
## [1] 2523 2557 2622
## select race, smoke and bwt variables
birthwt[c(4, 5, 10)] %>% head(n=2)
| race | smoke | bwt | |
|---|---|---|---|
| 85 | 2 | 0 | 2523 |
| 86 | 3 | 0 | 2551 |
bwt.df[c(6, 12, 33)] %>% head(n=2)
| race | smoke | bwt | |
|---|---|---|---|
| 85 | 2 | 0 | 2523 |
| 86 | 3 | 0 | 2551 |
## select race, smoke and bwt variables
bwt_tcf %>% dplyr::select(race, smoke, bwt) %>% head(n=2)
| race | smoke | bwt |
|---|---|---|
| 2 | 0 | 2523 |
| 3 | 0 | 2551 |
bwt_tcf %>% dplyr::select(age, lwt, bwt) %>% head(n=2)
| age | lwt | bwt |
|---|---|---|
| 19 | 182 | 2523 |
| 33 | 155 | 2551 |
bwt_tcf %>% dplyr::select(ends_with("t")) %>% head(n=2)
| lwt | ht | bwt |
|---|---|---|
| 182 | 0 | 2523 |
| 155 | 0 | 2551 |
bwt_tcf %>% dplyr::select(-c(low, race, smoke, ptl, ht, ui, ftv)) %>% head(n=2)
| lowf | lowfl | age | lwt | racef | racegf | racegfl | smokef | smokefl | ptlf | ptlgf | ptlgfl | htf | htfl | uif | uifl | ftvf | ftvgf | ftvgfl | bwt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | High | 19 | 182 | 2 | 1 | black | 0 | Non_smoker | 0 | 0 | No_preterm_labor | 0 | No_hypertension | 1 | Uterine_irritability | 0 | 0 | No_physician_visit | 2523 |
| 0 | High | 33 | 155 | 3 | 2 | other | 0 | Non_smoker | 0 | 0 | No_preterm_labor | 0 | No_hypertension | 0 | No_uterine_irritability | 3 | 2 | Two_plus_physician_visit | 2551 |
# output: a vector of the 96 "bwt" observations where "racegfl" is "white"
bwt_tcf$bwt[bwt_tcf$racegfl=="white"] %>% length()
## [1] 96
# output: "bwt" as a list, data.frame
bwt_tcf["bwt"] %>% dim()
## [1] 189 1
# output: 3 observations/rows (1, 3 and 6) of the "bwt" variable
bwt_tcf[c(1,3,6), "bwt"]
| bwt |
|---|
| 2523 |
| 2557 |
| 2622 |
# output: a data.frame (tibble) of the rows where racegfl == "white"
bwt_tcf[bwt_tcf$racegfl=="white", ] %>% dim()
## [1] 96 27
which(bwt.df$bwt==2557)
## [1] 3
bwt.df[bwt.df$bwt==2557,]
| low | lowf | lowfl | age | lwt | race | racegr | racegf | racegfl | black | other | smoke | smokef | smokefl | ptl | ptlgr | ptlgf | ptlgfl | one_preterm_labor | twoOrMore_preterm_labor | ht | htf | htfl | ui | uif | uifl | ftv | ftvgr | ftvgf | ftvgfl | one_firstTimester_drVisit | twoOrMore_firstTimester_drVisit | bwt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 87 | 0 | 0 | High | 20 | 105 | 1 | 0 | 0 | White | 0 | 0 | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | 2557 |
which(bwt_tcf$bwt==2557)
## [1] 3
bwt_tcf[bwt_tcf$bwt==2557,]
| low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl | bwt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | High | 20 | 105 | 1 | 1 | 0 | white | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_physician_visit | 2557 |
bwt_tcf[is.na(bwt_tcf$age),] %>% count()
| n |
|---|
| 0 |
filter(bwt_tcf, is.na(age)) %>% count()
| n |
|---|
| 0 |
bwt_tcf %>% filter(is.na(age)) %>% count()
| n |
|---|
| 0 |
bwt_tcf[!is.na(bwt_tcf$age), ] %>% count()
| n |
|---|
| 189 |
bwt_tcf %>% filter(!is.na(age)) %>% count()
| n |
|---|
| 189 |
sum(!is.na(bwt_tcf$age))
## [1] 189
-Suitable for continuous variables. For example, when age is less than or equal to the mean age.
# observations where age is less than or equal to the mean age
bwt.df[bwt.df$age <= mean(bwt.df$age), ] %>% count()
| n |
|---|
| 107 |
bwt_tcf[bwt_tcf$age <= mean(bwt_tcf$age), ] %>% count()
| n |
|---|
| 107 |
bwt_tcf %>% filter(age<=mean(age)) %>% count()
| n |
|---|
| 107 |
# observations where age is greater than the mean age
bwt_tcf %>% filter(age > mean(age)) %>% count()
| n |
|---|
| 82 |
# observations where age is between 30 and 40 (must use "&")
bwt.df %>% subset(age >= 30 & age <= 40) %>% count()
| n |
|---|
| 26 |
# observations where age is between 30 and 40, counted by level of a categorical variable
bwt_tcf %>% filter(between(age, 30, 40)) %>% count(lowfl)
| lowfl | n |
|---|---|
| High | 22 |
| Low | 4 |
bwt_tcf %>% filter(age >= 30, age <= 40) %>% count(racegfl)
| racegfl | n |
|---|---|
| white | 18 |
| black | 2 |
| other | 6 |
bwt.df[bwt.df$race==2, ] %>% count(race)
| race | n |
|---|---|
| 2 | 26 |
bwt.df[bwt.df$racegr==1, ] %>% count(race)
| race | n |
|---|---|
| 2 | 26 |
bwt.df[bwt.df$racegf==1, ] %>% count(race)
| race | n |
|---|---|
| 2 | 26 |
bwt.df[bwt.df$racegfl=="Black", ] %>% count(race)
| race | n |
|---|---|
| 2 | 26 |
bwt.df %>% subset(racegfl=="Black") %>% count(race)
| race | n |
|---|---|
| 2 | 26 |
bwt_tcf[bwt_tcf$racegfl=="white", ] %>% count(race)
| race | n |
|---|---|
| 1 | 96 |
bwt_tcf %>% subset(racegfl=="white") %>% count(race)
| race | n |
|---|---|
| 1 | 96 |
bwt_tcf %>% filter(racegfl=="white") %>% count(race)
| race | n |
|---|---|
| 1 | 96 |
# observations where race is white or black
bwt_tcf %>% filter(racegfl=="white" | racegfl=="black") %>% count() # 122
| n |
|---|
| 122 |
bwt_tcf %>% filter(racegfl=="white" | racegfl=="black") %>% count(race) # 96+26
| race | n |
|---|---|
| 1 | 96 |
| 2 | 26 |
bwt_tcf %>% filter(racegfl %in% c("white", "black")) %>% count() #122
| n |
|---|
| 122 |
bwt_tcf %>% filter(racegfl %in% c("white", "black")) %>% count(race) #96+26
| race | n |
|---|---|
| 1 | 96 |
| 2 | 26 |
# observations where race is neither white nor black
bwt_tcf[bwt_tcf$racegfl!="white" & bwt_tcf$racegfl!="black",] %>% count() # 67
| n |
|---|
| 67 |
bwt_tcf %>% subset(!(racegfl %in% c("white", "black")) ) %>% count()#67
| n |
|---|
| 67 |
bwt_tcf %>% filter(racegfl!="white" & racegfl!="black") %>% count() #67
| n |
|---|
| 67 |
bwt_tcf %>% filter(!(racegfl %in% c("white", "black"))) %>% count() #67
| n |
|---|
| 67 |
# observations where black folks also smoke
bwt_tcf %>% filter(racegfl=="black", smokefl=="Smoker") %>% count(race, smoke) #10
| race | smoke | n |
|---|---|---|
| 2 | 1 | 10 |
# observations where race is not either 1 or 2
bwt_tcf[bwt_tcf$race!=1 & bwt_tcf$race!=2,] %>% count() # 67
| n |
|---|
| 67 |
subset(bwt_tcf, !(race %in% c(1, 2)) ) %>% count() # 67
| n |
|---|
| 67 |
bwt_tcf %>% filter(race!=1 & race!=2) %>% count() #67
| n |
|---|
| 67 |
bwt_tcf %>% filter(!(race==1 | race==2)) %>% count() #67
| n |
|---|
| 67 |
bwt_tcf %>% filter(!(race %in% c(1, 2))) %>% count() #67
| n |
|---|
| 67 |
-NA are always sorted to the end for local data
-Default is ascending
-desc denotes descending
names(bwt.df)
## [1] "low" "lowf"
## [3] "lowfl" "age"
## [5] "lwt" "race"
## [7] "racegr" "racegf"
## [9] "racegfl" "black"
## [11] "other" "smoke"
## [13] "smokef" "smokefl"
## [15] "ptl" "ptlgr"
## [17] "ptlgf" "ptlgfl"
## [19] "one_preterm_labor" "twoOrMore_preterm_labor"
## [21] "ht" "htf"
## [23] "htfl" "ui"
## [25] "uif" "uifl"
## [27] "ftv" "ftvgr"
## [29] "ftvgf" "ftvgfl"
## [31] "one_firstTimester_drVisit" "twoOrMore_firstTimester_drVisit"
## [33] "bwt"
# sort by decreasing age
bwt.df[order(bwt.df$age, decreasing = TRUE), ] %>% head(n=5)
| low | lowf | lowfl | age | lwt | race | racegr | racegf | racegfl | black | other | smoke | smokef | smokefl | ptl | ptlgr | ptlgf | ptlgfl | one_preterm_labor | twoOrMore_preterm_labor | ht | htf | htfl | ui | uif | uifl | ftv | ftvgr | ftvgf | ftvgfl | one_firstTimester_drVisit | twoOrMore_firstTimester_drVisit | bwt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 226 | 0 | 0 | High | 45 | 123 | 1 | 0 | 0 | White | 0 | 0 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | 4990 |
| 108 | 0 | 0 | High | 36 | 202 | 1 | 0 | 0 | White | 0 | 0 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | 2836 |
| 183 | 0 | 0 | High | 36 | 175 | 1 | 0 | 0 | White | 0 | 0 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 0 | 0 | 0 | No_physician_visit | 0 | 0 | 3600 |
| 119 | 0 | 0 | High | 35 | 121 | 2 | 1 | 1 | Black | 1 | 0 | 1 | 1 | Smoker | 1 | 1 | 1 | One_preterm_labor | 1 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | 2948 |
| 223 | 0 | 0 | High | 35 | 170 | 1 | 0 | 0 | White | 0 | 0 | 0 | 0 | Non-smoker | 1 | 1 | 1 | One_preterm_labor | 1 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | 4174 |
# sort by decreasing age and show only age and lwt
bwt.df[order(bwt.df$age, decreasing = TRUE), ][c(4, 5)] %>% head(n=5) # columns 4 and 5 of bwt.df are age and lwt
| age | lwt | |
|---|---|---|
| 226 | 45 | 123 |
| 108 | 36 | 202 |
| 183 | 36 | 175 |
| 119 | 35 | 121 |
| 223 | 35 | 170 |
# sort by age, lwt and bwt
bwt_tcf[order(bwt_tcf$age, bwt_tcf$lwt, bwt_tcf$bwt), ][c(4, 5, 27)] %>% head(n=5)
| age | lwt | bwt |
|---|---|---|
| 14 | 100 | 2495 |
| 14 | 101 | 2466 |
| 14 | 135 | 3941 |
| 15 | 98 | 2778 |
| 15 | 110 | 2353 |
# sort by bwt
bwt_tcf %>% dplyr::select(bwt) %>% arrange(bwt) %>% head(n=5)
| bwt |
|---|
| 709 |
| 1021 |
| 1135 |
| 1330 |
| 1474 |
bwt_tcf %>% dplyr::select(bwt) %>% arrange(desc(bwt)) %>% head(n=5)
| bwt |
|---|
| 4990 |
| 4593 |
| 4238 |
| 4174 |
| 4167 |
# sort by age, lwt and bwt
bwt_tcf %>% dplyr::select(age, lwt, bwt) %>% arrange(age, lwt, bwt) %>% head(n=5)
| age | lwt | bwt |
|---|---|---|
| 14 | 100 | 2495 |
| 14 | 101 | 2466 |
| 14 | 135 | 3941 |
| 15 | 98 | 2778 |
| 15 | 110 | 2353 |
bwt_tcf %>% dplyr::select(age, lwt, bwt) %>% arrange(desc(age), desc(lwt), desc(bwt)) %>% head(n=5)
| age | lwt | bwt |
|---|---|---|
| 45 | 123 | 4990 |
| 36 | 202 | 2836 |
| 36 | 175 | 3600 |
| 35 | 170 | 4174 |
| 35 | 121 | 2948 |
bwt.df %>% dplyr::select(bwt, low, lowf, lowfl, everything()) %>% head(n=2)
| bwt | low | lowf | lowfl | age | lwt | race | racegr | racegf | racegfl | black | other | smoke | smokef | smokefl | ptl | ptlgr | ptlgf | ptlgfl | one_preterm_labor | twoOrMore_preterm_labor | ht | htf | htfl | ui | uif | uifl | ftv | ftvgr | ftvgf | ftvgfl | one_firstTimester_drVisit | twoOrMore_firstTimester_drVisit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 85 | 2523 | 0 | 0 | High | 19 | 182 | 2 | 1 | 1 | Black | 1 | 0 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit | 0 | 0 |
| 86 | 2551 | 0 | 0 | High | 33 | 155 | 3 | 2 | 2 | Other | 0 | 1 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 2 | 2 | twoOrMore_firstTimester_drVisit | 0 | 1 |
bwt_tcf %>% dplyr::select(bwt, low, lowf, lowfl, everything()) %>% head(n=2)
| bwt | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit |
| 2551 | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit |
bwt_tcf %>% rename(body_weight=bwt) %>% dplyr::select(body_weight, everything()) %>% head(n=2)
| body_weight | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit |
| 2551 | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit |
quantile(bwt_tcf$bwt)
## 0% 25% 50% 75% 100%
## 709 2414 2977 3487 4990
range(bwt_tcf$bwt)
## [1] 709 4990
if_else function
# outcome is a "numeric" variable
bwt_tcf %>%
mutate(bwt_category =if_else(bwt <= 2500, 1, 0)) %>%
dplyr::select(bwt, bwt_category, everything()) %>%
head(n=2)
| bwt | bwt_category | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | 0 | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit |
| 2551 | 0 | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit |
# outcome is numeral "factor" variable
bwt_tcf %>%
mutate(bwt_category =as_factor(if_else(bwt <= 2500, 1, 0))) %>%
dplyr::select(bwt, bwt_category, everything()) %>%
head(n=2)
| bwt | bwt_category | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | 0 | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit |
| 2551 | 0 | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit |
# outcome is a "character" variable
bwt_tcf %>%
mutate(bwt_category = if_else(bwt <= 2500, "low", "high")) %>%
dplyr::select(bwt, bwt_category, everything()) %>%
head(n=2)
| bwt | bwt_category | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | high | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit |
| 2551 | high | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit |
# outcome is a character "factor" variable
bwt_tcf %>%
mutate(bwt_category = as_factor (if_else(bwt <= 2500, "low", "high"))) %>%
dplyr::select(bwt, bwt_category, everything()) %>%
head(n=2)
| bwt | bwt_category | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | high | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit |
| 2551 | high | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit |
cut function
# dichotomize bwt
bwt.df$bwt_category <- cut(bwt.df$bwt, breaks=c(-Inf, 2500, Inf), labels=c("low","high"))
bwt.df %>% count(bwt_category)
| bwt_category | n |
|---|---|
| low | 59 |
| high | 130 |
table(bwt.df$lowfl, bwt.df$bwt_category)
##
## low high
## High 0 130
## Low 59 0
boxplot(bwt.df$bwt ~ bwt.df$bwt_category, col=2:3, horizontal = TRUE)
bwt_tcf <- bwt_tcf %>% mutate(bwt_category=cut(bwt, breaks=c(-Inf, 2500, Inf), labels=c("low","high")))
boxplot(bwt_tcf$bwt ~ bwt_tcf$bwt_category, col=2:3, horizontal = TRUE)
cut function
bwt.df$bwt %>% quantile()
## 0% 25% 50% 75% 100%
## 709 2414 2977 3487 4990
# categorize using `cut` function into 4 arbitrary levels
bwt.df$bwt_cut <- cut(bwt.df$bwt, 4, labels=c("wt1", "wt2", "wt3", "wt4"))
bwt.df %>% count(bwt_cut)
| bwt_cut | n |
|---|---|
| wt1 | 9 |
| wt2 | 73 |
| wt3 | 91 |
| wt4 | 16 |
# Cut into 4 groups (using quantile values)
bwt.df$bwt_category <- cut(bwt.df$bwt,
breaks=c(-Inf, 2414, 2977, 3487, Inf),
labels=c("wt1", "wt2", "wt3", "wt4"))
bwt.df %>% count(bwt_category)
| bwt_category | n |
|---|---|
| wt1 | 48 |
| wt2 | 49 |
| wt3 | 45 |
| wt4 | 47 |
boxplot(bwt.df$bwt ~ bwt.df$bwt_category, col=5:8, horizontal = TRUE)
# use quantile probabilities instead of calculated values to cut into 4 groups
bwt.df$bwt_quantiles <- cut(bwt.df$bwt,
breaks=quantile(bwt.df$bwt, probs = seq(0, 1, 0.25)),
include.lowest = TRUE,
labels=c(1:4))
bwt.df %>% count(bwt_quantiles)
| bwt_quantiles | n |
|---|---|
| 1 | 48 |
| 2 | 49 |
| 3 | 45 |
| 4 | 47 |
bwt.df$bwt_quantiles <- cut(bwt.df$bwt,
breaks=quantile(bwt.df$bwt, probs = c(0, 0.25, 0.5, 0.75, 1)),
include.lowest = TRUE,
labels=c("wt1", "wt2", "wt3", "wt4"))
bwt.df %>% count(bwt_quantiles)
| bwt_quantiles | n |
|---|---|
| wt1 | 48 |
| wt2 | 49 |
| wt3 | 45 |
| wt4 | 47 |
bwt.df <- bwt.df %>% mutate(bwt_tertile=cut(bwt,
breaks=quantile(bwt, probs = c(0, 0.337, 0.663, 1)),
include.lowest = TRUE,
labels=c(3:1)))
bwt.df %>% count(bwt_tertile)
| bwt_tertile | n |
|---|---|
| 3 | 64 |
| 2 | 62 |
| 1 | 63 |
boxplot(bwt.df$bwt ~ bwt.df$bwt_tertile, col=5:7, horizontal = TRUE )
# Relabel the tertiles as an ordered factor running low < medium < high.
# bwt_tertile was cut over ascending breaks with labels 3:1, so level "3" is
# the lightest third and level "1" the heaviest. The levels are therefore
# listed as "3", "2", "1" so that "low" really names the lightest babies.
bwt.df <- bwt.df %>%
  mutate(
    #bwt_tertile_recode=fct_recode(bwt_tertile, Low="3", Medium="2", High="1"),
    #bwt_tertile_order=factor(bwt_tertile_recode, order=TRUE),
    #bwt_tertile_order=ordered(bwt_tertile_recode)
    bwt_tertile_order = factor(
      bwt_tertile,
      levels = c("3", "2", "1"),
      labels = c("low", "medium", "high"),
      ordered = TRUE
    )
  )
bwt.df %>% count(bwt_tertile_order)
| bwt_tertile_order | n |
|---|---|
| low | 64 |
| medium | 62 |
| high | 63 |
boxplot(bwt.df$bwt ~ bwt.df$bwt_tertile_order, col=5:7, horizontal = TRUE )
# create numeric, logical and factor variable
bwt_tcf %>%
mutate(bwt_in_kg=bwt/1000,
low_bwt=bwt_in_kg <= 2.5,
low_wt_class = as_factor(if_else(low_bwt, 1, 0))) %>%
dplyr::select(bwt, bwt_in_kg, low_bwt, low_wt_class) %>%
head(n=2)
| bwt | bwt_in_kg | low_bwt | low_wt_class |
|---|---|---|---|
| 2523 | 2.523 | FALSE | 0 |
| 2551 | 2.551 | FALSE | 0 |
NA
bwt_tcf %>%
mutate(bwt_value = ifelse(bwt <= 2500, NA, bwt)) %>%
arrange(bwt_value) %>%
dplyr::select(bwt, bwt_value, everything()) %>% tail(n=2)
| bwt | bwt_value | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl | bwt_category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2495 | NA | 1 | 1 | Low | 17 | 142 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 1 | 1 | Hypertension | 0 | 0 | No_uterine_irritability | 0 | 0 | 0 | No_physician_visit | low |
| 2495 | NA | 1 | 1 | Low | 21 | 130 | 1 | 1 | 0 | white | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 1 | 1 | Hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit | low |
bwt_tcf %>%
mutate(bwt_new = ifelse(bwt < 2600 | bwt > 4000, NA, bwt)) %>%
dplyr::select(bwt, bwt_new, everything()) %>%
head(n=2)
| bwt | bwt_new | low | lowf | lowfl | age | lwt | race | racef | racegf | racegfl | smoke | smokef | smokefl | ptl | ptlf | ptlgf | ptlgfl | ht | htf | htfl | ui | uif | uifl | ftv | ftvf | ftvgf | ftvgfl | bwt_category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | NA | 0 | 0 | High | 19 | 182 | 2 | 2 | 1 | black | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit | high |
| 2551 | NA | 0 | 0 | High | 33 | 155 | 3 | 3 | 2 | other | 0 | 0 | Non_smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 3 | 2 | Two_plus_physician_visit | high |
# keep only the new variables
bwt_tcf %>%
transmute(bwt_kg=bwt/1000,
low_wt=bwt_kg <=2.5,
low_wt_class = as_factor(if_else(low_wt, 1, 0))) %>%
head(n=2)
| bwt_kg | low_wt | low_wt_class |
|---|---|---|
| 2.523 | FALSE | 0 |
| 2.551 | FALSE | 0 |
# output "tbl_df" "tbl" "data.frame"
bwt_tcf %>% count(race)
| race | n |
|---|---|
| 1 | 96 |
| 2 | 26 |
| 3 | 67 |
bwt_tcf %>% count(low, race)
| low | race | n |
|---|---|---|
| 0 | 1 | 73 |
| 0 | 2 | 15 |
| 0 | 3 | 42 |
| 1 | 1 | 23 |
| 1 | 2 | 11 |
| 1 | 3 | 25 |
# output table
table(bwt_tcf$race)
##
## 1 2 3
## 96 26 67
table(bwt_tcf$low, bwt_tcf$race)
##
## 1 2 3
## 0 73 15 42
## 1 23 11 25
table(bwt_tcf$ptl)
##
## 0 1 2 3
## 159 24 5 1
table(bwt_tcf$ptlf)
##
## 0 1 2 3
## 159 24 5 1
table(bwt_tcf$ptlgf)
##
## 0 1 2
## 159 24 6
table(bwt_tcf$ptlgfl)
##
## No_preterm_labor One_preterm_labor Two_plus_preterm_labor
## 159 24 6
table(bwt_tcf$ftv)
##
## 0 1 2 3 4 6
## 100 47 30 7 4 1
table(bwt_tcf$ftvf)
##
## 0 1 2 3 4 6
## 100 47 30 7 4 1
table(bwt_tcf$ftvgf)
##
## 0 1 2
## 100 47 42
table(bwt_tcf$ftvgfl)
##
## No_physician_visit One_physician_visit Two_plus_physician_visit
## 100 47 42
table(bwt_tcf$race)
##
## 1 2 3
## 96 26 67
table(bwt_tcf$racef)
##
## 1 2 3
## 96 26 67
table(bwt_tcf$racegf)
##
## 0 1 2
## 96 26 67
table(bwt_tcf$racegfl)
##
## white black other
## 96 26 67
# individual stats
nrow(table(bwt_tcf$bwt))
## [1] 131
min(bwt_tcf$bwt)
## [1] 709
max(bwt_tcf$bwt)
## [1] 4990
range(bwt_tcf$bwt)
## [1] 709 4990
mean(bwt_tcf$bwt)
## [1] 2944.587
median(bwt_tcf$bwt)
## [1] 2977
sd(bwt_tcf$bwt)
## [1] 729.2143
mad(bwt_tcf$bwt) # median absolute deviation
## [1] 834.7038
var(bwt_tcf$bwt)
## [1] 531753.5
IQR(bwt_tcf$bwt) # interquartile range
## [1] 1073
quantile(bwt_tcf$bwt) # quartiles (by default)
## 0% 25% 50% 75% 100%
## 709 2414 2977 3487 4990
quantile(bwt_tcf$bwt, c(1, 3)/4) # specific percentiles (25% & 75% in this case)
## 25% 75%
## 2414 3487
kurtosi(bwt_tcf$bwt) # psych package
## [1] -0.1436834
skew(bwt_tcf$bwt) # psych package
## [1] -0.205337
# use summarise function
# One-row summary of bwt across the whole table, transposed with t() so that
# each statistic prints on its own row.
# bwt_50th_quantile and bwt_median are the same quantity computed two ways.
bwt_tcf %>%
  summarise(
    no_of_total_obs = n(),
    no_of_distinct_bwt_obs = n_distinct(bwt),
    bwt_min = min(bwt, na.rm = TRUE),
    bwt_max = max(bwt, na.rm = TRUE),
    bwt_50th_quantile = quantile(bwt, 0.5, na.rm = TRUE),
    bwt_avg = mean(bwt, na.rm = TRUE),
    bwt_median = median(bwt, na.rm = TRUE),
    bwt_sd = sd(bwt, na.rm = TRUE),
    bwt_IQR = IQR(bwt, na.rm = TRUE),
    bwt_mad = mad(bwt, na.rm = TRUE),
    bwt_first = first(bwt),  # positional picks: row order of bwt_tcf matters
    bwt_last = last(bwt),
    bwt_4th = nth(bwt, 4)
  ) %>%
  t()
## [,1]
## no_of_total_obs 189.0000
## no_of_distinct_bwt_obs 131.0000
## bwt_min 709.0000
## bwt_max 4990.0000
## bwt_50th_quantile 2977.0000
## bwt_avg 2944.5873
## bwt_median 2977.0000
## bwt_sd 729.2143
## bwt_IQR 1073.0000
## bwt_mad 834.7038
## bwt_first 2523.0000
## bwt_last 2495.0000
## bwt_4th 2594.0000
# frequency
bwt_tcf %>% count(bwt) %>% arrange(desc(n)) %>% head()
| bwt | n |
|---|---|
| 3062 | 5 |
| 2495 | 4 |
| 2920 | 4 |
| 2977 | 4 |
| 3651 | 4 |
| 1928 | 3 |
# use summary function
summary(bwt_tcf$bwt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 709 2414 2977 2945 3487 4990
# use describe function
describe(bwt_tcf$bwt) %>% t()
## X1
## vars 1.0000000
## n 189.0000000
## mean 2944.5873016
## sd 729.2142952
## median 2977.0000000
## trimmed 2961.7581699
## mad 834.7038000
## min 709.0000000
## max 4990.0000000
## range 4281.0000000
## skew -0.2053370
## kurtosis -0.1436834
## se 53.0425350
fivenum(bwt_tcf$bwt) #minimum, lower-hinge, median, upper-hinge, maximum
## [1] 709 2414 2977 3487 4990
boxplot.stats(bwt_tcf$bwt)
## $stats
## [1] 1021 2414 2977 3487 4990
##
## $n
## [1] 189
##
## $conf
## [1] 2853.682 3100.318
##
## $out
## [1] 709
summary(bwt_tcf)
## low lowf lowfl age lwt
## Min. :0.0000 0:130 High:130 Min. :14.00 Min. : 80.0
## 1st Qu.:0.0000 1: 59 Low : 59 1st Qu.:19.00 1st Qu.:110.0
## Median :0.0000 Median :23.00 Median :121.0
## Mean :0.3122 Mean :23.24 Mean :129.8
## 3rd Qu.:1.0000 3rd Qu.:26.00 3rd Qu.:140.0
## Max. :1.0000 Max. :45.00 Max. :250.0
## race racef racegf racegfl smoke smokef
## Min. :1.000 1:96 0:96 white:96 Min. :0.0000 0:115
## 1st Qu.:1.000 2:26 1:26 black:26 1st Qu.:0.0000 1: 74
## Median :1.000 3:67 2:67 other:67 Median :0.0000
## Mean :1.847 Mean :0.3915
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
## smokefl ptl ptlf ptlgf
## Non_smoker:115 Min. :0.0000 0:159 0:159
## Smoker : 74 1st Qu.:0.0000 1: 24 1: 24
## Median :0.0000 2: 5 2: 6
## Mean :0.1958 3: 1
## 3rd Qu.:0.0000
## Max. :3.0000
## ptlgfl ht htf
## No_preterm_labor :159 Min. :0.00000 0:177
## One_preterm_labor : 24 1st Qu.:0.00000 1: 12
## Two_plus_preterm_labor: 6 Median :0.00000
## Mean :0.06349
## 3rd Qu.:0.00000
## Max. :1.00000
## htfl ui uif
## No_hypertension:177 Min. :0.0000 0:161
## Hypertension : 12 1st Qu.:0.0000 1: 28
## Median :0.0000
## Mean :0.1481
## 3rd Qu.:0.0000
## Max. :1.0000
## uifl ftv ftvf ftvgf
## No_uterine_irritability:161 Min. :0.0000 0:100 0:100
## Uterine_irritability : 28 1st Qu.:0.0000 1: 47 1: 47
## Median :0.0000 2: 30 2: 42
## Mean :0.7937 3: 7
## 3rd Qu.:1.0000 4: 4
## Max. :6.0000 6: 1
## ftvgfl bwt bwt_category
## No_physician_visit :100 Min. : 709 low : 59
## One_physician_visit : 47 1st Qu.:2414 high:130
## Two_plus_physician_visit: 42 Median :2977
## Mean :2945
## 3rd Qu.:3487
## Max. :4990
lapply: Apply a Function over a List or Vector
# outputs a list
lapply(list(bwt_tcf$bwt, bwt_tcf$age, bwt_tcf$lwt), mean)
## [[1]]
## [1] 2944.587
##
## [[2]]
## [1] 23.2381
##
## [[3]]
## [1] 129.8148
sapply: Apply a Function over a List or Vector
# outputs a vector
sapply(list(bwt_tcf$bwt, bwt_tcf$age, bwt_tcf$lwt), mean)
## [1] 2944.5873 23.2381 129.8148
-for counting
-summarizing
tapply function
-Apply a Function Over a Ragged Array
# 1st dim(rows)=race, 2nd dim(cols)=smoke
tapply(bwt_tcf$bwt, list(bwt_tcf$race, bwt_tcf$smoke), mean)
## 0 1
## 1 3428.750 2826.846
## 2 2854.500 2504.000
## 3 2815.782 2757.167
#1st dim(rows)=race, 2nd dim(cols)=smoke, 3rd dim=ptl
tapply(bwt_tcf$bwt, list(bwt_tcf$race, bwt_tcf$smoke, bwt_tcf$ptl), mean)
## , , 0
##
## 0 1
## 1 3413.667 2902.225
## 2 2903.857 2474.500
## 3 2866.085 3065.750
##
## , , 1
##
## 0 1
## 1 4174.000 2452.000
## 2 2509.000 2622.000
## 3 2586.714 1766.667
##
## , , 2
##
## 0 1
## 1 3317 2601
## 2 NA NA
## 3 2055 3260
##
## , , 3
##
## 0 1
## 1 NA 3637
## 2 NA NA
## 3 NA NA
aggregate function
# aggregate one or more continuous variables by one factor
aggregate(bwt ~ race, data=bwt_tcf, mean)
| race | bwt |
|---|---|
| 1 | 3102.719 |
| 2 | 2719.692 |
| 3 | 2805.284 |
aggregate(cbind(bwt, lwt) ~ race, data=bwt_tcf, mean)
| race | bwt | lwt |
|---|---|---|
| 1 | 3102.719 | 132.0521 |
| 2 | 2719.692 | 146.8077 |
| 3 | 2805.284 | 120.0149 |
# aggregate one or more continuous variables by two factors
aggregate(bwt ~ race+smoke, data=bwt_tcf, mean)
| race | smoke | bwt |
|---|---|---|
| 1 | 0 | 3428.750 |
| 2 | 0 | 2854.500 |
| 3 | 0 | 2815.782 |
| 1 | 1 | 2826.846 |
| 2 | 1 | 2504.000 |
| 3 | 1 | 2757.167 |
aggregate(cbind(bwt, lwt) ~ race+smoke, data=bwt_tcf, mean)
| race | smoke | bwt | lwt |
|---|---|---|---|
| 1 | 0 | 3428.750 | 138.8409 |
| 2 | 0 | 2854.500 | 149.4375 |
| 3 | 0 | 2815.782 | 119.1455 |
| 1 | 1 | 2826.846 | 126.3077 |
| 2 | 1 | 2504.000 | 142.6000 |
| 3 | 1 | 2757.167 | 124.0000 |
# aggregate a number of continuous variables by one or more factors
aggregate(. ~ race, data=bwt_tcf[c("bwt", "lwt", "age", "race")], mean)
| race | bwt | lwt | age |
|---|---|---|---|
| 1 | 3102.719 | 132.0521 | 24.29167 |
| 2 | 2719.692 | 146.8077 | 21.53846 |
| 3 | 2805.284 | 120.0149 | 22.38806 |
aggregate(. ~ race+smoke, data=bwt_tcf[c("bwt", "lwt", "age", "race", "smoke")], mean)
| race | smoke | bwt | lwt | age |
|---|---|---|---|---|
| 1 | 0 | 3428.750 | 138.8409 | 26.02273 |
| 2 | 0 | 2854.500 | 149.4375 | 19.93750 |
| 3 | 0 | 2815.782 | 119.1455 | 22.36364 |
| 1 | 1 | 2826.846 | 126.3077 | 22.82692 |
| 2 | 1 | 2504.000 | 142.6000 | 24.10000 |
| 3 | 1 | 2757.167 | 124.0000 | 22.50000 |
# aggregate a continuous variable by many grouping variables
aggregate(bwt ~ ., data = bwt_tcf[c("race","smoke", "ht", "bwt")], mean)
| race | smoke | ht | bwt |
|---|---|---|---|
| 1 | 0 | 0 | 3436.395 |
| 2 | 0 | 0 | 2813.357 |
| 3 | 0 | 0 | 2874.824 |
| 1 | 1 | 0 | 2819.292 |
| 2 | 1 | 0 | 2656.111 |
| 3 | 1 | 0 | 2757.167 |
| 1 | 0 | 1 | 3100.000 |
| 2 | 0 | 1 | 3142.500 |
| 3 | 0 | 1 | 2063.000 |
| 1 | 1 | 1 | 2917.500 |
| 2 | 1 | 1 | 1135.000 |
# aggregate two continuous variables by many grouping variables
aggregate(cbind(bwt, lwt) ~ ., data = bwt_tcf[c("race","smoke", "ht", "bwt", "lwt")], mean)
| race | smoke | ht | bwt | lwt |
|---|---|---|---|---|
| 1 | 0 | 0 | 3436.395 | 139.2791 |
| 2 | 0 | 0 | 2813.357 | 143.4286 |
| 3 | 0 | 0 | 2874.824 | 118.9608 |
| 1 | 1 | 0 | 2819.292 | 121.9583 |
| 2 | 1 | 0 | 2656.111 | 137.6667 |
| 3 | 1 | 0 | 2757.167 | 124.0000 |
| 1 | 0 | 1 | 3100.000 | 120.0000 |
| 2 | 0 | 1 | 3142.500 | 191.5000 |
| 3 | 0 | 1 | 2063.000 | 121.5000 |
| 1 | 1 | 1 | 2917.500 | 178.5000 |
| 2 | 1 | 1 | 1135.000 | 187.0000 |
summaryBy function
-Opportunity to calculate multiple summary statistics for multiple variables
-Need to convert the tibble to data.frame for summaryBy analysis
library(doBy)
# Descriptive-statistics helper intended as the FUN argument of summaryBy().
#
# x    vector to summarise.
# ...  forwarded unchanged to every statistic below except length().
#      NOTE(review): min() and max() treat extra arguments as further data
#      values, so this is presumably meant to be called with no extras.
#
# Returns a named vector: nobs, minm, maxm, the quantile() results prefixed
# with "qntl." (e.g. "qntl.25%"), iqr, MAD, avg, med, stdev and variance.
# Missing values are dropped for every statistic except nobs, which is the
# raw length of x.
summary_fun <- function(x, ...) {
  n_values <- length(x)
  smallest <- min(x, na.rm = TRUE, ...)
  largest <- max(x, na.rm = TRUE, ...)
  quantile_values <- quantile(x, na.rm = TRUE, ...)
  interquartile <- IQR(x, na.rm = TRUE, ...)
  median_abs_dev <- mad(x, na.rm = TRUE, ...)
  average <- mean(x, na.rm = TRUE, ...)
  middle <- median(x, na.rm = TRUE, ...)
  std_dev <- sd(x, na.rm = TRUE, ...)
  spread <- var(x, na.rm = TRUE, ...)

  # c() prefixes the names of quantile_values with "qntl." on its own.
  c(
    nobs = n_values,
    minm = smallest,
    maxm = largest,
    qntl = quantile_values,
    iqr = interquartile,
    MAD = median_abs_dev,
    avg = average,
    med = middle,
    stdev = std_dev,
    variance = spread
  )
}
bwt_tcf_df <- as.data.frame(bwt_tcf) # tibble needs to be converted to a data.frame to work with the "doBy" package
# default function is 'mean'
summaryBy(bwt ~ race, data=bwt_tcf_df)
| race | bwt.mean |
|---|---|
| 1 | 3102.719 |
| 2 | 2719.692 |
| 3 | 2805.284 |
summaryBy(bwt ~ race+smoke, data=bwt_tcf_df)
| race | smoke | bwt.mean |
|---|---|---|
| 1 | 0 | 3428.750 |
| 1 | 1 | 2826.846 |
| 2 | 0 | 2854.500 |
| 2 | 1 | 2504.000 |
| 3 | 0 | 2815.782 |
| 3 | 1 | 2757.167 |
summaryBy(cbind(bwt,age) ~ race+smoke, data=bwt_tcf_df)
| race | smoke | bwt.mean | age.mean |
|---|---|---|---|
| 1 | 0 | 3428.750 | 26.02273 |
| 1 | 1 | 2826.846 | 22.82692 |
| 2 | 0 | 2854.500 | 19.93750 |
| 2 | 1 | 2504.000 | 24.10000 |
| 3 | 0 | 2815.782 | 22.36364 |
| 3 | 1 | 2757.167 | 22.50000 |
# default function is 'mean'
summaryBy(list(c("bwt", "age"), c("race", "smoke")), data=bwt_tcf_df )
| race | smoke | bwt.mean | age.mean |
|---|---|---|---|
| 1 | 0 | 3428.750 | 26.02273 |
| 1 | 1 | 2826.846 | 22.82692 |
| 2 | 0 | 2854.500 | 19.93750 |
| 2 | 1 | 2504.000 | 24.10000 |
| 3 | 0 | 2815.782 | 22.36364 |
| 3 | 1 | 2757.167 | 22.50000 |
# use "FUN" to define the functions
# output according to the order of the functions listed
summaryBy(bwt ~ race + smoke, data=bwt_tcf_df, FUN=c(min, max, quantile, IQR, mean, median, sd, var))
| race | smoke | bwt.FUN1 | bwt.FUN2 | bwt.FUN3 | bwt.FUN4 | bwt.FUN5 | bwt.FUN6 | bwt.FUN7 | bwt.FUN8 | bwt.FUN9 | bwt.FUN10 | bwt.FUN11 | bwt.FUN12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 1021 | 4990 | 1021 | 3062.00 | 3593.0 | 3873.00 | 4990 | 811.00 | 3428.750 | 3593.0 | 710.0989 | 504240.5 |
| 1 | 1 | 1790 | 4238 | 1790 | 2410.00 | 2775.5 | 3189.50 | 4238 | 779.50 | 2826.846 | 2775.5 | 626.4725 | 392467.8 |
| 2 | 0 | 1701 | 3860 | 1701 | 2480.75 | 2920.0 | 3331.25 | 3860 | 850.50 | 2854.500 | 2920.0 | 621.2543 | 385956.9 |
| 2 | 1 | 1135 | 3444 | 1135 | 2313.75 | 2381.0 | 2941.00 | 3444 | 627.25 | 2504.000 | 2381.0 | 637.0568 | 405841.3 |
| 3 | 0 | 1330 | 4054 | 1330 | 2313.00 | 2807.0 | 3253.00 | 4054 | 940.00 | 2815.782 | 2807.0 | 709.3493 | 503176.5 |
| 3 | 1 | 709 | 3572 | 709 | 2402.25 | 3146.5 | 3307.50 | 3572 | 905.25 | 2757.167 | 3146.5 | 810.0446 | 656172.3 |
# NOTE(review): with two response variables and an unnamed FUN list, the
# FUN1..FUN12 labels in the rendered table do not line up with the values
# (bwt.FUN2 shows 14, the minimum age, not the maximum bwt). The values look
# interleaved bwt/age per statistic; confirm before reading the table, or
# use a function that returns named results such as summary_fun.
summaryBy(cbind(bwt, age) ~ race + smoke, data=bwt_tcf_df, FUN=c(min, max, quantile, IQR, mean, median, sd, var))
| race | smoke | bwt.FUN1 | bwt.FUN2 | bwt.FUN3 | bwt.FUN4 | bwt.FUN5 | bwt.FUN6 | bwt.FUN7 | bwt.FUN8 | bwt.FUN9 | bwt.FUN10 | bwt.FUN11 | bwt.FUN12 | age.FUN1 | age.FUN2 | age.FUN3 | age.FUN4 | age.FUN5 | age.FUN6 | age.FUN7 | age.FUN8 | age.FUN9 | age.FUN10 | age.FUN11 | age.FUN12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 1021 | 14 | 4990 | 45 | 1021 | 3062.00 | 3593.0 | 3873.00 | 4990 | 14 | 22.00 | 24.5 | 30.00 | 45 | 811.00 | 8.00 | 3428.750 | 26.02273 | 3593.0 | 24.5 | 710.0989 | 6.017373 | 504240.5 | 36.20877 |
| 1 | 1 | 1790 | 16 | 4238 | 33 | 1790 | 2410.00 | 2775.5 | 3189.50 | 4238 | 16 | 19.00 | 21.5 | 26.25 | 33 | 779.50 | 7.25 | 2826.846 | 22.82692 | 2775.5 | 21.5 | 626.4725 | 4.925807 | 392467.8 | 24.26357 |
| 2 | 0 | 1701 | 15 | 3860 | 27 | 1701 | 2480.75 | 2920.0 | 3331.25 | 3860 | 15 | 17.00 | 18.5 | 23.25 | 27 | 850.50 | 6.25 | 2854.500 | 19.93750 | 2920.0 | 18.5 | 621.2543 | 3.889623 | 385956.9 | 15.12917 |
| 2 | 1 | 1135 | 18 | 3444 | 35 | 1135 | 2313.75 | 2381.0 | 2941.00 | 3444 | 18 | 20.00 | 22.0 | 25.50 | 35 | 627.25 | 5.50 | 2504.000 | 24.10000 | 2381.0 | 22.0 | 637.0568 | 5.952591 | 405841.3 | 35.43333 |
| 3 | 0 | 1330 | 14 | 4054 | 33 | 1330 | 2313.00 | 2807.0 | 3253.00 | 4054 | 14 | 19.00 | 22.0 | 25.00 | 33 | 940.00 | 6.00 | 2815.782 | 22.36364 | 2807.0 | 22.0 | 709.3493 | 4.452896 | 503176.5 | 19.82828 |
| 3 | 1 | 709 | 14 | 3572 | 31 | 709 | 2402.25 | 3146.5 | 3307.50 | 3572 | 14 | 19.25 | 22.5 | 26.50 | 31 | 905.25 | 7.25 | 2757.167 | 22.50000 | 3146.5 | 22.5 | 810.0446 | 5.107926 | 656172.3 | 26.09091 |
# using custom function
summaryBy(bwt ~ race, data=bwt_tcf_df, FUN=summary_fun)
| race | bwt.nobs | bwt.minm | bwt.maxm | bwt.qntl.0% | bwt.qntl.25% | bwt.qntl.50% | bwt.qntl.75% | bwt.qntl.100% | bwt.iqr | bwt.MAD | bwt.avg | bwt.med | bwt.stdev | bwt.variance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 96 | 1021 | 4990 | 1021 | 2584.75 | 3062 | 3651 | 4990 | 1066.25 | 867.3210 | 3102.719 | 3062 | 727.8861 | 529818.2 |
| 2 | 26 | 1135 | 3860 | 1135 | 2370.50 | 2849 | 3057 | 3860 | 686.50 | 693.8568 | 2719.692 | 2849 | 638.6839 | 407917.1 |
| 3 | 67 | 709 | 4054 | 709 | 2313.00 | 2835 | 3274 | 4054 | 961.00 | 693.8568 | 2805.284 | 2835 | 722.1944 | 521564.7 |
summaryBy(bwt ~ race + smoke, data=bwt_tcf_df, FUN=summary_fun)
| race | smoke | bwt.nobs | bwt.minm | bwt.maxm | bwt.qntl.0% | bwt.qntl.25% | bwt.qntl.50% | bwt.qntl.75% | bwt.qntl.100% | bwt.iqr | bwt.MAD | bwt.avg | bwt.med | bwt.stdev | bwt.variance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 44 | 1021 | 4990 | 1021 | 3062.00 | 3593.0 | 3873.00 | 4990 | 811.00 | 641.2245 | 3428.750 | 3593.0 | 710.0989 | 504240.5 |
| 1 | 1 | 52 | 1790 | 4238 | 1790 | 2410.00 | 2775.5 | 3189.50 | 4238 | 779.50 | 546.3381 | 2826.846 | 2775.5 | 626.4725 | 392467.8 |
| 2 | 0 | 16 | 1701 | 3860 | 1701 | 2480.75 | 2920.0 | 3331.25 | 3860 | 850.50 | 651.6027 | 2854.500 | 2920.0 | 621.2543 | 385956.9 |
| 2 | 1 | 10 | 1135 | 3444 | 1135 | 2313.75 | 2381.0 | 2941.00 | 3444 | 627.25 | 588.5922 | 2504.000 | 2381.0 | 637.0568 | 405841.3 |
| 3 | 0 | 55 | 1330 | 4054 | 1330 | 2313.00 | 2807.0 | 3253.00 | 4054 | 940.00 | 692.3742 | 2815.782 | 2807.0 | 709.3493 | 503176.5 |
| 3 | 1 | 12 | 709 | 3572 | 709 | 2402.25 | 3146.5 | 3307.50 | 3572 | 905.25 | 452.1930 | 2757.167 | 3146.5 | 810.0446 | 656172.3 |
# Tabular layout: transpose the race x smoke summary so that each statistic
# becomes a row and each group a column, then print it.
print(
  t(summaryBy(bwt ~ race + smoke, data = bwt_tcf_df, FUN = summary_fun)),
  quote = FALSE,
  digits = 5
)
## 1 2 3 4 5 6
## race 1.00 1.00 2.00 2.00 3.00 3.00
## smoke 0.00 1.00 0.00 1.00 0.00 1.00
## bwt.nobs 44.00 52.00 16.00 10.00 55.00 12.00
## bwt.minm 1021.00 1790.00 1701.00 1135.00 1330.00 709.00
## bwt.maxm 4990.00 4238.00 3860.00 3444.00 4054.00 3572.00
## bwt.qntl.0% 1021.00 1790.00 1701.00 1135.00 1330.00 709.00
## bwt.qntl.25% 3062.00 2410.00 2480.75 2313.75 2313.00 2402.25
## bwt.qntl.50% 3593.00 2775.50 2920.00 2381.00 2807.00 3146.50
## bwt.qntl.75% 3873.00 3189.50 3331.25 2941.00 3253.00 3307.50
## bwt.qntl.100% 4990.00 4238.00 3860.00 3444.00 4054.00 3572.00
## bwt.iqr 811.00 779.50 850.50 627.25 940.00 905.25
## bwt.MAD 641.22 546.34 651.60 588.59 692.37 452.19
## bwt.avg 3428.75 2826.85 2854.50 2504.00 2815.78 2757.17
## bwt.med 3593.00 2775.50 2920.00 2381.00 2807.00 3146.50
## bwt.stdev 710.10 626.47 621.25 637.06 709.35 810.04
## bwt.variance 504240.47 392467.78 385956.93 405841.33 503176.47 656172.33
group_by function
# number of observations by group characteristics
bwt_tcf %>%
  group_by(low, race) %>%
  count()  # no arguments: tallies the rows in each (low, race) group
| low | race | n |
|---|---|---|
| 0 | 1 | 73 |
| 0 | 2 | 15 |
| 0 | 3 | 42 |
| 1 | 1 | 23 |
| 1 | 2 | 11 |
| 1 | 3 | 25 |
# number of observations by group characteristics
# (race is passed to count() here rather than to group_by())
bwt_tcf %>%
  group_by(low) %>%
  count(race)
| low | race | n |
|---|---|---|
| 0 | 1 | 73 |
| 0 | 2 | 15 |
| 0 | 3 | 42 |
| 1 | 1 | 23 |
| 1 | 2 | 11 |
| 1 | 3 | 25 |
# one grouping variable: descriptive statistics of bwt within each race
bwt_tcf %>%
  group_by(race) %>%
  summarise(
    # group size and number of distinct bwt values
    no_of_obs = n(),
    no_of_distinct_obs = n_distinct(bwt),
    # centre and spread
    bwt_avg = mean(bwt, na.rm = TRUE),
    bwt_median = median(bwt, na.rm = TRUE),
    bwt_sd = sd(bwt, na.rm = TRUE),
    bwt_IQR = IQR(bwt, na.rm = TRUE),
    bwt_mad = mad(bwt, na.rm = TRUE),
    # range and median via quantile()
    bwt_min = min(bwt, na.rm = TRUE),
    bwt_max = max(bwt, na.rm = TRUE),
    bwt_50th_quantile = quantile(bwt, probs = 0.5, na.rm = TRUE),
    # positional picks within each group (row order as stored in bwt_tcf)
    # NOTE(review): "bwt_tirst" looks like a typo for "bwt_first"; the name is
    # kept unchanged so the result matches the rendered table.
    bwt_tirst = first(bwt),
    bwt_last = last(bwt),
    bwt_4th = nth(bwt, 4)
  )
| race | no_of_obs | no_of_distinct_obs | bwt_avg | bwt_median | bwt_sd | bwt_IQR | bwt_mad | bwt_min | bwt_max | bwt_50th_quantile | bwt_tirst | bwt_last | bwt_4th |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 96 | 80 | 3102.719 | 3062 | 727.8861 | 1066.25 | 867.3210 | 1021 | 4990 | 3062 | 2557 | 2495 | 2637 |
| 2 | 26 | 23 | 2719.692 | 2849 | 638.6839 | 686.50 | 693.8568 | 1135 | 3860 | 2849 | 2523 | 2495 | 2920 |
| 3 | 67 | 53 | 2805.284 | 2835 | 722.1944 | 961.00 | 693.8568 | 709 | 4054 | 2835 | 2551 | 2495 | 2722 |
# two grouping variables: descriptive statistics of bwt within each
# race x smoke combination
bwt_tcf %>%
  group_by(race, smoke) %>%
  summarise(
    # group size and number of distinct bwt values
    no_of_obs = n(),
    no_of_distinct_obs = n_distinct(bwt),
    # centre and spread
    bwt_avg = mean(bwt, na.rm = TRUE),
    bwt_median = median(bwt, na.rm = TRUE),
    bwt_sd = sd(bwt, na.rm = TRUE),
    bwt_IQR = IQR(bwt, na.rm = TRUE),
    bwt_mad = mad(bwt, na.rm = TRUE),
    # range and median via quantile()
    bwt_min = min(bwt, na.rm = TRUE),
    bwt_max = max(bwt, na.rm = TRUE),
    bwt_50th_quantile = quantile(bwt, probs = 0.5, na.rm = TRUE),
    # positional picks within each group (row order as stored in bwt_tcf)
    # NOTE(review): "bwt_tirst" looks like a typo for "bwt_first"; the name is
    # kept unchanged so the result matches the rendered table.
    bwt_tirst = first(bwt),
    bwt_last = last(bwt),
    bwt_4th = nth(bwt, 4)
  )
| race | smoke | no_of_obs | no_of_distinct_obs | bwt_avg | bwt_median | bwt_sd | bwt_IQR | bwt_mad | bwt_min | bwt_max | bwt_50th_quantile | bwt_tirst | bwt_last | bwt_4th |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 44 | 41 | 3428.750 | 3593.0 | 710.0989 | 811.00 | 641.2245 | 1021 | 4990 | 3593.0 | 2637 | 2353 | 2877 |
| 1 | 1 | 52 | 44 | 2826.846 | 2775.5 | 626.4725 | 779.50 | 546.3381 | 1790 | 4238 | 2775.5 | 2557 | 2495 | 2663 |
| 2 | 0 | 16 | 15 | 2854.500 | 2920.0 | 621.2543 | 850.50 | 651.6027 | 1701 | 3860 | 2920.0 | 2523 | 2495 | 2920 |
| 2 | 1 | 10 | 9 | 2504.000 | 2381.0 | 637.0568 | 627.25 | 588.5922 | 1135 | 3444 | 2381.0 | 2920 | 2381 | 3444 |
| 3 | 0 | 55 | 45 | 2815.782 | 2807.0 | 709.3493 | 940.00 | 692.3742 | 1330 | 4054 | 2807.0 | 2551 | 2495 | 2722 |
| 3 | 1 | 12 | 12 | 2757.167 | 3146.5 | 810.0446 | 905.25 | 452.1930 | 709 | 3572 | 3146.5 | 3090 | 2495 | 3303 |
# Move bwt, age and lwt to the front of bwt.df; everything() keeps the
# remaining columns after them. Then preview the first three rows.
bwt.df <- dplyr::select(bwt.df, bwt, age, lwt, everything())
head(bwt.df, 3)
| bwt | age | lwt | low | lowf | lowfl | race | racegr | racegf | racegfl | black | other | smoke | smokef | smokefl | ptl | ptlgr | ptlgf | ptlgfl | one_preterm_labor | twoOrMore_preterm_labor | ht | htf | htfl | ui | uif | uifl | ftv | ftvgr | ftvgf | ftvgfl | one_firstTimester_drVisit | twoOrMore_firstTimester_drVisit | bwt_category | bwt_cut | bwt_quantiles | bwt_tertile | bwt_tertile_order |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2523 | 19 | 182 | 0 | 0 | High | 2 | 1 | 1 | Black | 1 | 0 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 1 | 1 | Uterine_irritability | 0 | 0 | 0 | No_physician_visit | 0 | 0 | wt2 | wt2 | wt2 | 3 | high |
| 2551 | 33 | 155 | 0 | 0 | High | 3 | 2 | 2 | Other | 0 | 1 | 0 | 0 | Non-smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 3 | 2 | 2 | twoOrMore_firstTimester_drVisit | 0 | 1 | wt2 | wt2 | wt2 | 3 | high |
| 2557 | 20 | 105 | 0 | 0 | High | 1 | 0 | 0 | White | 0 | 0 | 1 | 1 | Smoker | 0 | 0 | 0 | No_preterm_labor | 0 | 0 | 0 | 0 | No_hypertension | 0 | 0 | No_uterine_irritability | 1 | 1 | 1 | One_firstTimester_drVisit | 1 | 0 | wt2 | wt2 | wt2 | 3 | high |
# Histograms (density scale) of the first three columns of bwt.df, drawn side
# by side. Each panel is overlaid with:
#   - a normal curve using the column's own mean and sd (dark red),
#   - a kernel density estimate (blue),
#   - a rug of the individual observations (red).
old_par <- par(mfrow = c(1, 3))  # 1 x 3 panel layout; keep the old settings
for (i in seq_len(3)) {
  # `[[` always yields a plain vector, whether bwt.df is a data.frame or a
  # tibble; extract once instead of re-subsetting for every call.
  values <- bwt.df[[i]]
  label <- names(bwt.df)[i]

  # freq = FALSE puts the y-axis on the density scale so the curves below are
  # comparable; xlab is set explicitly so the axis shows the column name
  # rather than the deparsed subsetting expression.
  hist(values, freq = FALSE, main = label, xlab = label)
  curve(
    dnorm(x, mean = mean(values), sd = sd(values)),
    col = "darkred",
    lwd = 2,
    add = TRUE
  )
  lines(density(values), col = "blue", lwd = 2)
  rug(values, col = "red", lwd = 2)
}
par(old_par)  # restore the previous layout so later plots are unaffected
# Boxplots of the first three columns of bwt.df, one panel per column, each
# titled with the column name.
old_par <- par(mfrow = c(1, 3))  # 1 x 3 panel layout; keep the old settings
for (i in seq_len(3)) {
  # `[[` always yields a plain vector, whether bwt.df is a data.frame or a
  # tibble.
  boxplot(bwt.df[[i]], main = names(bwt.df)[i])
}
par(old_par)  # restore the previous layout so later plots are unaffected