\(~\)
\(~\)
\(~\)
The first step is to read in the data. This can be done with a number of functions. Some common ones in base R include:
readRDS
read.csv
Help for functions can be found within the R terminal by use of the question mark: ?readRDS
Sometimes you may want to use newer or more specalized packages and libraries to read in or handle data. First you will need to make sure the library is installed with install.packages('readr')
then you can load the library with library('readr')
, and you will have access to all of the functionality withing the package.
Alternatively, you can access individual functions from libraries by using notation such as: readr::read_csv
; this tells R
to look in the readr
package for the function read_csv
. This can be useful in programming functions and packages as well as if multiple packages contain functions with the same name.
For excel files two useful packages::functions to read in excel are:
xlsx::read.xlsx
readxl::read_excel
\(~\)
\(~\)
\(~\)
Use the arrow <-
for assignments:
# read in the data
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
\(~\)
\(~\)
\(~\)
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin - Categorical
##### dmdeduc2 - Education level - Adults 20+ - Categorical
##### dmdmartl - Marital status - Categorical
##### indhhin2 - Annual household income - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
\(~\)
\(~\)
\(~\)
The R
Structure function will compactly display the internal structure of an R
object, a diagnostic function and an alternative to summary
.
To see help page for a function use ?
before the function name, for example try: ?str
#structure command
str(diab_pop)
## 'data.frame': 5719 obs. of 10 variables:
## $ seqn : num 83732 83733 83734 83735 83736 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 1 2 1 1 ...
## $ ridageyr: num 62 53 78 56 42 72 22 32 56 46 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 4 1 4 1 4 3 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 3 3 5 4 2 4 4 3 5 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 3 1 6 3 4 5 1 3 6 ...
## $ indhhin2: Factor w/ 14 levels "$0-$4,999","$5,000-$9,999",..: 10 4 5 10 NA 13 NA 6 3 3 ...
## $ bmxbmi : num 27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 1 2 2 2 2 2 1 2 ...
## $ lbxglu : num NA 101 84 NA 84 107 95 NA NA NA ...
\(~\)
\(~\)
\(~\)
We can view the first 10 records in the data with the head
command:
# show data
head(diab_pop,10)
## seqn riagendr ridageyr ridreth1 dmdeduc2
## 1 83732 Male 62 Non-Hispanic White College grad or above
## 2 83733 Male 53 Non-Hispanic White High school graduate/GED
## 3 83734 Male 78 Non-Hispanic White High school graduate/GED
## 4 83735 Female 56 Non-Hispanic White College grad or above
## 5 83736 Female 42 Non-Hispanic Black Some college or AA degrees
## 6 83737 Female 72 MexicanAmerican Grades 9-11th
## 7 83741 Male 22 Non-Hispanic Black Some college or AA degrees
## 8 83742 Female 32 MexicanAmerican Some college or AA degrees
## 9 83744 Male 56 Non-Hispanic Black High school graduate/GED
## 10 83747 Male 46 Non-Hispanic White College grad or above
## dmdmartl indhhin2 bmxbmi diq010 lbxglu
## 1 Married $65,000-$74,999 27.8 Diabetes NA
## 2 Divorced $15,000-$19,999 30.8 No Diabetes 101
## 3 Married $20,000-$24,999 28.8 Diabetes 84
## 4 Living with partner $65,000-$74,999 42.4 No Diabetes NA
## 5 Divorced <NA> 20.3 No Diabetes 84
## 6 Separated $75,000-$99,999 28.6 No Diabetes 107
## 7 Never married <NA> 28.0 No Diabetes 95
## 8 Married $25,000-$34,999 28.2 No Diabetes NA
## 9 Divorced $10,000-$14,999 33.6 Diabetes NA
## 10 Living with partner $10,000-$14,999 27.6 No Diabetes NA
\(~\)
\(~\)
\(~\)
To check which packages are loaded you can use:
# loaded packages
(.packages())
## [1] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [7] "base"
Make sure that the tidyverse
and dplyr
packages are installed. You can run install.packages(c('tidyverse','dplyr'))
to install both.
To see which packages are installed you can use:
# which packages are installed
library()$results[,1]
## [1] "abind" "additivityTests" "Amelia"
## [4] "AMR" "arm" "arsenal"
## [7] "askpass" "assertthat" "backports"
## [10] "base64enc" "beeswarm" "bench"
## [13] "BH" "biclust" "bipartite"
## [16] "bit" "bit64" "bitops"
## [19] "blob" "bookdown" "brew"
## [22] "brio" "broom" "broom.mixed"
## [25] "bslib" "cachem" "callr"
## [28] "car" "carData" "caret"
## [31] "caTools" "cellranger" "checkmate"
## [34] "chron" "cli" "clipr"
## [37] "clue" "coda" "coin"
## [40] "colorspace" "commonmark" "conflicted"
## [43] "conquer" "corrplot" "corrr"
## [46] "cowplot" "cpp11" "crayon"
## [49] "credentials" "crosstalk" "cubelyr"
## [52] "curl" "data.table" "DataExplorer"
## [55] "DBI" "dbplyr" "dendextend"
## [58] "DEoptimR" "Deriv" "desc"
## [61] "devtools" "dials" "DiceDesign"
## [64] "diffobj" "diffr" "digest"
## [67] "dotCall64" "downlit" "dplyr"
## [70] "DT" "dtplyr" "e1071"
## [73] "ellipse" "ellipsis" "equatiomatic"
## [76] "evaluate" "factoextra" "FactoMineR"
## [79] "fansi" "farver" "fastmap"
## [82] "fields" "flashClust" "flexclust"
## [85] "flexdashboard" "forcats" "foreach"
## [88] "Formula" "fs" "furrr"
## [91] "future" "gapminder" "gargle"
## [94] "gclus" "gdata" "generics"
## [97] "gert" "GGally" "gganimate"
## [100] "ggbeeswarm" "ggdendro" "ggforce"
## [103] "ggfortify" "ggplot2" "ggpubr"
## [106] "ggraph" "ggrepel" "ggsci"
## [109] "ggsignif" "ggthemes" "gh"
## [112] "gifski" "gitcreds" "glmnet"
## [115] "globals" "glue" "gmodels"
## [118] "googledrive" "googlesheets4" "gower"
## [121] "GPfit" "gplots" "graphlayouts"
## [124] "gridExtra" "gsubfn" "gtable"
## [127] "gtools" "hardhat" "haven"
## [130] "highr" "Hmisc" "hms"
## [133] "htmlTable" "htmltools" "htmlwidgets"
## [136] "httpuv" "httr" "ids"
## [139] "igraph" "infer" "ini"
## [142] "installr" "ipred" "isoband"
## [145] "iterators" "jpeg" "jquerylib"
## [148] "jsonlite" "kableExtra" "knitr"
## [151] "labeling" "labelled" "laeken"
## [154] "lars" "later" "latticeExtra"
## [157] "lava" "lazyeval" "leaps"
## [160] "lhs" "libcoin" "LiblineaR"
## [163] "lifecycle" "listenv" "lme4"
## [166] "lmtest" "lubridate" "magrittr"
## [169] "maps" "maptools" "markdown"
## [172] "MatrixModels" "matrixStats" "memoise"
## [175] "mice" "microbenchmark" "mime"
## [178] "minqa" "mitools" "mlbench"
## [181] "mnormt" "modeldata" "ModelMetrics"
## [184] "modelr" "modeltools" "monomvn"
## [187] "multcomp" "munsell" "mvtnorm"
## [190] "MyExamplePackage" "network" "networkD3"
## [193] "NHANES" "NHSRdatasets" "nloptr"
## [196] "NLP" "numDeriv" "OddsPlotty"
## [199] "openssl" "openxlsx" "packrat"
## [202] "paletteer" "parallelly" "parsnip"
## [205] "patchwork" "pbkrtest" "permute"
## [208] "pillar" "pkgbuild" "pkgconfig"
## [211] "pkgload" "plogr" "plotly"
## [214] "pls" "plsRglm" "plumber"
## [217] "plyr" "png" "polyclip"
## [220] "polynom" "praise" "prettyunits"
## [223] "prismatic" "pROC" "processx"
## [226] "prodlim" "profmem" "profvis"
## [229] "progress" "promises" "proto"
## [232] "proxy" "ps" "psych"
## [235] "purrr" "qap" "quadprog"
## [238] "quantreg" "R6" "randomForest"
## [241] "ranger" "rapidoc" "rappdirs"
## [244] "rcmdcheck" "RColorBrewer" "Rcpp"
## [247] "RcppArmadillo" "RcppEigen" "readr"
## [250] "readxl" "recipes" "redoc"
## [253] "registry" "rematch" "rematch2"
## [256] "remotes" "repr" "reprex"
## [259] "reshape" "reshape2" "reticulate"
## [262] "rio" "rJava" "rlang"
## [265] "rmarkdown" "robustbase" "roxygen2"
## [268] "rprojroot" "rsample" "rsconnect"
## [271] "rsq" "RSQLite" "rstatix"
## [274] "rstudioapi" "rversions" "rvest"
## [277] "sandwich" "sass" "scales"
## [280] "scatterplot3d" "selectr" "seriation"
## [283] "sessioninfo" "shape" "shiny"
## [286] "shinydashboard" "skimr" "slam"
## [289] "slider" "sna" "sodium"
## [292] "sourcetools" "sp" "spam"
## [295] "SparseM" "sqldf" "SQUAREM"
## [298] "statnet.common" "stepPlr" "stringi"
## [301] "stringr" "survey" "svglite"
## [304] "swagger" "sys" "systemfonts"
## [307] "tableone" "taskscheduleR" "testthat"
## [310] "TH.data" "tibble" "tidygraph"
## [313] "tidymodels" "tidyr" "tidyselect"
## [316] "tidyverse" "timeDate" "tinytex"
## [319] "tm" "TMB" "tmvnsim"
## [322] "tree" "TSP" "tune"
## [325] "tweenr" "usethis" "utf8"
## [328] "uuid" "varhandle" "vcd"
## [331] "vctrs" "vegan" "VIM"
## [334] "vipor" "viridis" "viridisLite"
## [337] "waldo" "warp" "webshot"
## [340] "webutils" "wesanderson" "whisker"
## [343] "withr" "wordcloud" "wordcloud2"
## [346] "workflows" "workflowsets" "xfun"
## [349] "xlsx" "xlsxjars" "xml2"
## [352] "xopen" "xtable" "yaml"
## [355] "yardstick" "zip" "zoo"
## [358] "base" "boot" "class"
## [361] "cluster" "codetools" "compiler"
## [364] "datasets" "foreign" "graphics"
## [367] "grDevices" "grid" "KernSmooth"
## [370] "lattice" "MASS" "Matrix"
## [373] "methods" "mgcv" "nlme"
## [376] "nnet" "parallel" "rpart"
## [379] "spatial" "splines" "stats"
## [382] "stats4" "survival" "tcltk"
## [385] "tools" "translations" "utils"
# some additional informaion on the package including the version
tibble::as_tibble(installed.packages())
## # A tibble: 387 x 16
## Package LibPath Version Priority Depends Imports LinkingTo Suggests Enhances
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 abind C:/Use~ 1.4-5 <NA> R (>= ~ method~ <NA> <NA> <NA>
## 2 additiv~ C:/Use~ 1.1-4 <NA> <NA> <NA> <NA> "knitr" <NA>
## 3 Amelia C:/Use~ 1.8.0 <NA> R (>= ~ foreig~ Rcpp (>=~ "tcltk,~ <NA>
## 4 AMR C:/Use~ 1.7.1 <NA> R (>= ~ <NA> <NA> "cleane~ <NA>
## 5 arm C:/Use~ 1.11-2 <NA> R (>= ~ abind,~ <NA> <NA> <NA>
## 6 arsenal C:/Use~ 3.6.3 <NA> R (>= ~ knitr ~ <NA> "broom ~ <NA>
## 7 askpass C:/Use~ 1.1 <NA> <NA> sys (>~ <NA> "testth~ <NA>
## 8 assertt~ C:/Use~ 0.2.1 <NA> <NA> tools <NA> "testth~ <NA>
## 9 backpor~ C:/Use~ 1.2.1 <NA> R (>= ~ <NA> <NA> <NA> <NA>
## 10 base64e~ C:/Use~ 0.1-3 <NA> R (>= ~ <NA> <NA> <NA> png
## # ... with 377 more rows, and 7 more variables: License <chr>,
## # License_is_FOSS <chr>, License_restricts_use <chr>, OS_type <chr>,
## # MD5sum <chr>, NeedsCompilation <chr>, Built <chr>
The following function will check if a list of packages are already installed, if not then it will install them:
# install package if missing
install_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
You can use the function like this:
# test function
install_if_not(c("caret","randomForest"))
## [1] "the package 'caret' is already installed"
## [2] "the package 'randomForest' is already installed"
We make sections of code accessable to installed packages by using library
command:
loaded_package_before <- (.packages())
# everthing below here can call functions in dplyr package
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glimpse(diab_pop)
## Rows: 5,719
## Columns: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744~
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
(.packages())
## [1] "dplyr" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
loaded_package_after <- (.packages())
setdiff(loaded_package_after, loaded_package_before)
## [1] "dplyr"
detach(package:dplyr)
#the dplyr package has now been detached. calls to functions may have errors
Additionally, we can access a function from a library without loading the entire library, this can be done by using a command such as package::function
. This notation is needed in any instance where two or more loaded packages have at least one function with the same name. This notation is also useful in development of functions and packages.
For instance, the glimpse
function from the dplyr
package can also be accessed by using the following command
# glimpse the data
dplyr::glimpse(diab_pop)
## Rows: 5,719
## Columns: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744~
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
And just notice that without the library loaded or the dplyr::
in front we can error:
# error
glimpse(diab_pop)
## Error in glimpse(diab_pop): could not find function "glimpse"
\(~\)
\(~\)
A Tibble is an more modern R
version of a dataframe, you can easily convert a dataframe to a tibble.
tibble::as_tibble(diab_pop)
## # A tibble: 5,719 x 10
## seqn riagendr ridageyr ridreth1 dmdeduc2 dmdmartl indhhin2 bmxbmi diq010
## <dbl> <fct> <dbl> <fct> <fct> <fct> <fct> <dbl> <fct>
## 1 83732 Male 62 Non-Hisp~ College g~ Married $65,000~ 27.8 Diabe~
## 2 83733 Male 53 Non-Hisp~ High scho~ Divorced $15,000~ 30.8 No Di~
## 3 83734 Male 78 Non-Hisp~ High scho~ Married $20,000~ 28.8 Diabe~
## 4 83735 Female 56 Non-Hisp~ College g~ Living w~ $65,000~ 42.4 No Di~
## 5 83736 Female 42 Non-Hisp~ Some coll~ Divorced <NA> 20.3 No Di~
## 6 83737 Female 72 MexicanA~ Grades 9-~ Separated $75,000~ 28.6 No Di~
## 7 83741 Male 22 Non-Hisp~ Some coll~ Never ma~ <NA> 28 No Di~
## 8 83742 Female 32 MexicanA~ Some coll~ Married $25,000~ 28.2 No Di~
## 9 83744 Male 56 Non-Hisp~ High scho~ Divorced $10,000~ 33.6 Diabe~
## 10 83747 Male 46 Non-Hisp~ College g~ Living w~ $10,000~ 27.6 No Di~
## # ... with 5,709 more rows, and 1 more variable: lbxglu <dbl>
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
summary
The summary
is a generic function used to produce result summaries of the results of various model fitting functions. See ?summary
for more information.
summary(diab_pop)
## seqn riagendr ridageyr ridreth1
## Min. :83732 Male :2747 Min. :20.00 MexicanAmerican : 995
## 1st Qu.:86171 Female:2972 1st Qu.:34.00 Other Hispanic : 768
## Median :88651 Median :49.00 Non-Hispanic White:1863
## Mean :88672 Mean :49.54 Non-Hispanic Black:1198
## 3rd Qu.:91166 3rd Qu.:64.00 Other : 895
## Max. :93702 Max. :80.00
##
## dmdeduc2 dmdmartl
## Less than 9th grade : 688 Married :2886
## Grades 9-11th : 676 Widowed : 421
## High school graduate/GED :1236 Divorced : 614
## Some college or AA degrees:1692 Separated : 192
## College grad or above :1422 Never married :1048
## NA's : 5 Living with partner: 555
## NA's : 3
## indhhin2 bmxbmi diq010 lbxglu
## $100,000+ : 910 Min. :14.50 Diabetes : 844 Min. : 21.0
## $25,000-$34,999: 596 1st Qu.:24.50 No Diabetes:4875 1st Qu.: 95.0
## $75,000-$99,999: 524 Median :28.50 Median :103.0
## $45,000-$54,999: 451 Mean :29.54 Mean :113.6
## $20,000-$24,999: 350 3rd Qu.:33.20 3rd Qu.:114.0
## (Other) :1590 Max. :67.30 Max. :479.0
## NA's :1298 NA's :313 NA's :3267
by(diab_pop, diab_pop$diq010, summary)
## diab_pop$diq010: Diabetes
## seqn riagendr ridageyr ridreth1
## Min. :83732 Male :456 Min. :21.00 MexicanAmerican :200
## 1st Qu.:86069 Female:388 1st Qu.:53.00 Other Hispanic :119
## Median :88610 Median :63.00 Non-Hispanic White:217
## Mean :88574 Mean :61.34 Non-Hispanic Black:203
## 3rd Qu.:90995 3rd Qu.:71.00 Other :105
## Max. :93689 Max. :80.00
##
## dmdeduc2 dmdmartl
## Less than 9th grade :161 Married :471
## Grades 9-11th :123 Widowed :100
## High school graduate/GED :185 Divorced :106
## Some college or AA degrees:229 Separated : 29
## College grad or above :145 Never married : 83
## NA's : 1 Living with partner: 55
##
## indhhin2 bmxbmi diq010 lbxglu
## $100,000+ : 89 Min. :16.0 Diabetes :844 Min. : 21
## $25,000-$34,999: 86 1st Qu.:27.4 No Diabetes: 0 1st Qu.:117
## $10,000-$14,999: 81 Median :31.4 Median :147
## $20,000-$24,999: 68 Mean :32.6 Mean :166
## $45,000-$54,999: 65 3rd Qu.:36.3 3rd Qu.:192
## (Other) :261 Max. :67.3 Max. :479
## NA's :194 NA's :55 NA's :462
## ------------------------------------------------------------
## diab_pop$diq010: No Diabetes
## seqn riagendr ridageyr ridreth1
## Min. :83733 Male :2291 Min. :20.00 MexicanAmerican : 795
## 1st Qu.:86190 Female:2584 1st Qu.:32.00 Other Hispanic : 649
## Median :88654 Median :46.00 Non-Hispanic White:1646
## Mean :88689 Mean :47.49 Non-Hispanic Black: 995
## 3rd Qu.:91193 3rd Qu.:61.00 Other : 790
## Max. :93702 Max. :80.00
##
## dmdeduc2 dmdmartl
## Less than 9th grade : 527 Married :2415
## Grades 9-11th : 553 Widowed : 321
## High school graduate/GED :1051 Divorced : 508
## Some college or AA degrees:1463 Separated : 163
## College grad or above :1277 Never married : 965
## NA's : 4 Living with partner: 500
## NA's : 3
## indhhin2 bmxbmi diq010 lbxglu
## $100,000+ : 821 Min. :14.50 Diabetes : 0 Min. : 64.0
## $25,000-$34,999: 510 1st Qu.:24.10 No Diabetes:4875 1st Qu.: 94.0
## $75,000-$99,999: 472 Median :28.00 Median :101.0
## $45,000-$54,999: 386 Mean :29.02 Mean :103.9
## $15,000-$19,999: 290 3rd Qu.:32.50 3rd Qu.:109.0
## (Other) :1292 Max. :64.60 Max. :397.0
## NA's :1104 NA's :258 NA's :2805
\(~\)
\(~\)
install_if_not("Amelia")
## [1] "the package 'Amelia' is already installed"
Amelia::missmap(diab_pop)
install_if_not("mice")
## [1] "the package 'mice' is already installed"
install_if_not("VIM")
## [1] "the package 'VIM' is already installed"
install_if_not("lattice")
## [1] "the package 'lattice' is already installed"
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(lattice)
#understand the missing value pattern
md.pattern(diab_pop)
## seqn riagendr ridageyr ridreth1 diq010 dmdmartl dmdeduc2 bmxbmi indhhin2
## 1876 1 1 1 1 1 1 1 1 1
## 2332 1 1 1 1 1 1 1 1 1
## 551 1 1 1 1 1 1 1 1 0
## 642 1 1 1 1 1 1 1 1 0
## 18 1 1 1 1 1 1 1 0 1
## 192 1 1 1 1 1 1 1 0 1
## 6 1 1 1 1 1 1 1 0 0
## 95 1 1 1 1 1 1 1 0 0
## 2 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 1 0 1 0
## 1 1 1 1 1 1 1 0 0 1
## 1 1 1 1 1 1 0 1 1 0
## 1 1 1 1 1 1 0 1 1 0
## 1 1 1 1 1 1 0 0 0 0
## 0 0 0 0 0 3 5 313 1298
## lbxglu
## 1876 1 0
## 2332 0 1
## 551 1 1
## 642 0 2
## 18 1 1
## 192 0 2
## 6 1 2
## 95 0 3
## 2 0 2
## 1 0 3
## 1 0 3
## 1 1 2
## 1 0 3
## 1 0 5
## 3267 4886
#plot the missing values
diab_pop_miss = aggr(diab_pop,
numbers=TRUE,
sortVars=TRUE,
labels=colnames(diab_pop),
cex.axis=.7,
gap=3,
ylab=c("Proportion of missingness","Missingness Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## lbxglu 0.5712537157
## indhhin2 0.2269627557
## bmxbmi 0.0547298479
## dmdeduc2 0.0008742787
## dmdmartl 0.0005245672
## seqn 0.0000000000
## riagendr 0.0000000000
## ridageyr 0.0000000000
## ridreth1 0.0000000000
## diq010 0.0000000000
#Drawing margin plot
marginplot(diab_pop[, c("lbxglu", "diq010")],
cex.numbers = 1.2,
pch = 19)
\(~\)
\(~\)
sqldf
You can also create summary tables with sql and the sqldf
:
install_if_not('sqldf')
## [1] "the package 'sqldf' is already installed"
# Average age by Diabetes
sqldf::sqldf('select diq010,
avg(ridageyr) as mean_age
from diab_pop
group by diq010
order by diq010')
## diq010 mean_age
## 1 Diabetes 61.33768
## 2 No Diabetes 47.49313
# Notice we can save our tables with assignments
# Counts by Diabetic and Age
T_sqldf <- sqldf::sqldf('select diq010,
riagendr,
count(*) as cnt
from diab_pop
group by diq010, riagendr
order by cnt desc')
str(T_sqldf)
## 'data.frame': 4 obs. of 3 variables:
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 1 1
## $ riagendr: Factor w/ 2 levels "Male","Female": 2 1 1 2
## $ cnt : int 2584 2291 456 388
T_sqldf
## diq010 riagendr cnt
## 1 No Diabetes Female 2584
## 2 No Diabetes Male 2291
## 3 Diabetes Male 456
## 4 Diabetes Female 388
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
dplyr
dplyr
is a popular package to manage many common data manipulation tasks and data summaries.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
### Average age by Diabetes
diab_pop %>%
group_by(diq010) %>%
summarise(mean_age = mean(ridageyr)) %>%
arrange(diq010)
## # A tibble: 2 x 2
## diq010 mean_age
## <fct> <dbl>
## 1 Diabetes 61.3
## 2 No Diabetes 47.5
### Counts by Diabetic and Age
T_dplyr_dm2_gender <- diab_pop %>%
group_by(diq010,riagendr) %>%
summarise(cnt= n()) %>%
ungroup() %>%
arrange(-cnt)
## `summarise()` has grouped output by 'diq010'. You can override using the `.groups` argument.
str(T_dplyr_dm2_gender)
## tibble [4 x 3] (S3: tbl_df/tbl/data.frame)
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 1 1
## $ riagendr: Factor w/ 2 levels "Male","Female": 2 1 1 2
## $ cnt : int [1:4] 2584 2291 456 388
T_dplyr_dm2_gender
## # A tibble: 4 x 3
## diq010 riagendr cnt
## <fct> <fct> <int>
## 1 No Diabetes Female 2584
## 2 No Diabetes Male 2291
## 3 Diabetes Male 456
## 4 Diabetes Female 388
dplyr
resourcesdplyr
: https://dplyr.tidyverse.org/dplyr
vignette: https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.htmldplyr
cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/data-transformation.pdf\(~\)
\(~\)
\(~\)
\(~\)
comparedf
The comparedf
function in the arsenal
package can be used to compare two data-frames:
install_if_not('arsenal')
## [1] "the package 'arsenal' is already installed"
my_comparison <- arsenal::comparedf(T_sqldf , T_dplyr_dm2_gender)
my_comparison
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = T_sqldf, y = T_dplyr_dm2_gender)
##
## Shared: 3 non-by variables and 4 observations.
## Not shared: 0 variables and 0 observations.
##
## Differences found in 0/3 variables compared.
## 0 variables compared have non-identical attributes.
summary(my_comparison)
##
##
## Table: Summary of data.frames
##
## version arg ncol nrow
## -------- ------------------- ----- -----
## x T_sqldf 3 4
## y T_dplyr_dm2_gender 3 4
##
##
##
## Table: Summary of overall comparison
##
## statistic value
## ------------------------------------------------------------ ------
## Number of by-variables 0
## Number of non-by variables in common 3
## Number of variables compared 3
## Number of variables in x but not y 0
## Number of variables in y but not x 0
## Number of variables compared with some values unequal 0
## Number of variables compared with all values equal 3
## Number of observations in common 4
## Number of observations in x but not y 0
## Number of observations in y but not x 0
## Number of observations with some compared variables unequal 0
## Number of observations with all compared variables equal 4
## Number of values unequal 0
##
##
##
## Table: Variables not shared
##
##
## ------------------------
## No variables not shared
## ------------------------
##
##
##
## Table: Other variables not compared
##
##
## --------------------------------
## No other variables not compared
## --------------------------------
##
##
##
## Table: Observations not shared
##
##
## ---------------------------
## No observations not shared
## ---------------------------
##
##
##
## Table: Differences detected by variable
##
## var.x var.y n NAs
## --------- --------- --- ----
## diq010 diq010 0 0
## riagendr riagendr 0 0
## cnt cnt 0 0
##
##
##
## Table: Differences detected
##
##
## ------------------------
## No differences detected
## ------------------------
##
##
##
## Table: Non-identical attributes
##
##
## ----------------------------
## No non-identical attributes
## ----------------------------
str(my_comparison)
## List of 4
## $ frame.summary:Classes 'comparedf.frame.summary' and 'data.frame': 2 obs. of 8 variables:
## ..$ version : chr [1:2] "x" "y"
## ..$ arg : chr [1:2] "T_sqldf" "T_dplyr_dm2_gender"
## ..$ ncol : int [1:2] 3 3
## ..$ nrow : int [1:2] 4 4
## ..$ by :List of 2
## .. ..$ : chr "..row.names.."
## .. ..$ : chr "..row.names.."
## .. ..- attr(*, "byrow")= logi TRUE
## ..$ attrs :List of 2
## .. ..$ :List of 3
## .. .. ..$ names : chr [1:3] "diq010" "riagendr" "cnt"
## .. .. ..$ row.names: int [1:4] 1 2 3 4
## .. .. ..$ class : chr "data.frame"
## .. ..$ :List of 3
## .. .. ..$ row.names: int [1:4] 1 2 3 4
## .. .. ..$ names : chr [1:3] "diq010" "riagendr" "cnt"
## .. .. ..$ class : chr [1:3] "tbl_df" "tbl" "data.frame"
## ..$ unique :List of 2
## .. ..$ :'data.frame': 0 obs. of 2 variables:
## .. .. ..$ ..row.names..: int(0)
## .. .. ..$ observation : int(0)
## .. ..$ :'data.frame': 0 obs. of 2 variables:
## .. .. ..$ ..row.names..: int(0)
## .. .. ..$ observation : int(0)
## ..$ n.shared: int [1:2] 4 4
## $ vars.summary :Classes 'comparedf.vars.summary' and 'data.frame': 4 obs. of 8 variables:
## ..$ var.x : chr [1:4] "diq010" "riagendr" "cnt" "..row.names.."
## ..$ pos.x : int [1:4] 1 2 3 4
## ..$ class.x:List of 4
## .. ..$ : chr "factor"
## .. ..$ : chr "factor"
## .. ..$ : chr "integer"
## .. ..$ : chr "integer"
## ..$ var.y : chr [1:4] "diq010" "riagendr" "cnt" "..row.names.."
## ..$ pos.y : int [1:4] 1 2 3 4
## ..$ class.y:List of 4
## .. ..$ : chr "factor"
## .. ..$ : chr "factor"
## .. ..$ : chr "integer"
## .. ..$ : chr "integer"
## ..$ values :List of 4
## .. ..$ :'data.frame': 0 obs. of 5 variables:
## .. .. ..$ ..row.names..: int(0)
## .. .. ..$ values.x : Factor w/ 2 levels "Diabetes","No Diabetes":
## .. .. ..$ values.y : Factor w/ 2 levels "Diabetes","No Diabetes":
## .. .. ..$ row.x : int(0)
## .. .. ..$ row.y : int(0)
## .. ..$ :'data.frame': 0 obs. of 5 variables:
## .. .. ..$ ..row.names..: int(0)
## .. .. ..$ values.x : Factor w/ 2 levels "Male","Female":
## .. .. ..$ values.y : Factor w/ 2 levels "Male","Female":
## .. .. ..$ row.x : int(0)
## .. .. ..$ row.y : int(0)
## .. ..$ :'data.frame': 0 obs. of 5 variables:
## .. .. ..$ ..row.names..: int(0)
## .. .. ..$ values.x : 'AsIs' int(0)
## .. .. ..$ values.y : 'AsIs' int(0)
## .. .. ..$ row.x : int(0)
## .. .. ..$ row.y : int(0)
## .. ..$ : chr "by-variable"
## ..$ attrs :List of 4
## .. ..$ :'data.frame': 0 obs. of 3 variables:
## .. .. ..$ name : chr(0)
## .. .. ..$ attr.x: Named list()
## .. .. ..$ attr.y: Named list()
## .. ..$ :'data.frame': 0 obs. of 3 variables:
## .. .. ..$ name : chr(0)
## .. .. ..$ attr.x: Named list()
## .. .. ..$ attr.y: Named list()
## .. ..$ : NULL
## .. ..$ : NULL
## $ control :List of 16
## ..$ tol.logical :List of 1
## .. ..$ :function (x, y)
## ..$ tol.num :List of 1
## .. ..$ :function (x, y, tol)
## ..$ tol.num.val : num 1.49e-08
## ..$ int.as.num : logi FALSE
## ..$ tol.char :List of 1
## .. ..$ :function (x, y)
## ..$ tol.factor :List of 1
## .. ..$ :function (x, y)
## ..$ factor.as.char : logi FALSE
## ..$ tol.date :List of 1
## .. ..$ :function (x, y, tol)
## ..$ tol.date.val : num 0
## ..$ tol.other :List of 1
## .. ..$ :function (x, y)
## ..$ tol.vars : chr "none"
## ..$ max.print.vars : logi NA
## ..$ max.print.obs : logi NA
## ..$ max.print.diffs.per.var: num 10
## ..$ max.print.diffs : num 50
## ..$ max.print.attrs : logi NA
## $ Call : language arsenal::comparedf(x = T_sqldf, y = T_dplyr_dm2_gender)
## - attr(*, "class")= chr "comparedf"
\(~\)
\(~\)
\(~\)
\(~\)
# The table command can give us a table of counts of categorical columns:
table(diab_pop$diq010, diab_pop$riagendr)
##
## Male Female
## Diabetes 456 388
## No Diabetes 2291 2584
table(diab_pop$dmdmartl, diab_pop$riagendr, diab_pop$diq010)
## , , = Diabetes
##
##
## Male Female
## Married 299 172
## Widowed 30 70
## Divorced 46 60
## Separated 11 18
## Never married 39 44
## Living with partner 31 24
##
## , , = No Diabetes
##
##
## Male Female
## Married 1229 1186
## Widowed 79 242
## Divorced 200 308
## Separated 58 105
## Never married 471 494
## Living with partner 253 247
# We can use similar call for the chi square test:
chisq.test(diab_pop$diq010, diab_pop$riagendr)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: diab_pop$diq010 and diab_pop$riagendr
## X-squared = 13.978, df = 1, p-value = 0.0001849
# We can store the results and then use them:
table.sex_dm2 <- table(diab_pop$riagendr, diab_pop$diq010)
table.sex_dm2
##
## Diabetes No Diabetes
## Male 456 2291
## Female 388 2584
chisq.test(table.sex_dm2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table.sex_dm2
## X-squared = 13.978, df = 1, p-value = 0.0001849
The R
functions can have multiple outputs. We can also see other data associated with the output, the structure command can be used to help:
chisq_results <- chisq.test(diab_pop$diq010, diab_pop$riagendr)
str(chisq_results)
## List of 9
## $ statistic: Named num 14
## ..- attr(*, "names")= chr "X-squared"
## $ parameter: Named int 1
## ..- attr(*, "names")= chr "df"
## $ p.value : num 0.000185
## $ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
## $ data.name: chr "diab_pop$diq010 and diab_pop$riagendr"
## $ observed : 'table' int [1:2, 1:2] 456 2291 388 2584
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## $ expected : num [1:2, 1:2] 405 2342 439 2533
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## $ residuals: 'table' num [1:2, 1:2] 2.51 -1.05 -2.42 1.01
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## $ stdres : 'table' num [1:2, 1:2] 3.78 -3.78 -3.78 3.78
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## - attr(*, "class")= chr "htest"
# We can access different portions of the output:
chisq_results$p.value
## [1] 0.0001849292
chisq_results$statistic
## X-squared
## 13.97834
chisq_results$method
## [1] "Pearson's Chi-squared test with Yates' continuity correction"
\(~\)
broom
The default output of the chisq.test
test is a list, the tidy
function in the broom
package will take many results from other packages and convert into tibbles for easier handling:
str(chisq_results)
## List of 9
## $ statistic: Named num 14
## ..- attr(*, "names")= chr "X-squared"
## $ parameter: Named int 1
## ..- attr(*, "names")= chr "df"
## $ p.value : num 0.000185
## $ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
## $ data.name: chr "diab_pop$diq010 and diab_pop$riagendr"
## $ observed : 'table' int [1:2, 1:2] 456 2291 388 2584
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## $ expected : num [1:2, 1:2] 405 2342 439 2533
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## $ residuals: 'table' num [1:2, 1:2] 2.51 -1.05 -2.42 1.01
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## $ stdres : 'table' num [1:2, 1:2] 3.78 -3.78 -3.78 3.78
## ..- attr(*, "dimnames")=List of 2
## .. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
## .. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
## - attr(*, "class")= chr "htest"
install_if_not('broom')
## [1] "the package 'broom' is already installed"
tidy_chisq_results <- broom::tidy(chisq_results)
str(tidy_chisq_results)
## tibble [1 x 4] (S3: tbl_df/tbl/data.frame)
## $ statistic: Named num 14
## ..- attr(*, "names")= chr "X-squared"
## $ p.value : num 0.000185
## $ parameter: Named int 1
## ..- attr(*, "names")= chr "df"
## $ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
tidy_chisq_results$p.value
## [1] 0.0001849292
\(~\)
We can also make some basic graphics that are in base R
:
plot(table.sex_dm2)
mosaicplot(table.sex_dm2,
main = "Diabetes by Sex Mosaic Plot",
xlab = "Sex",
ylab = "Diabetes",
las = 1,
border = "chocolate",
shade = TRUE)
Other graphics are accessible by installing various packages:
install_if_not('ggplot2')
## [1] "the package 'ggplot2' is already installed"
library(ggplot2)
plot <- ggplot(diab_pop, aes(x=factor(riagendr))) +
geom_bar(stat="count") +
facet_wrap(~diq010)
plot
install_if_not('gplots')
## [1] "the package 'gplots' is already installed"
gplots::balloonplot(table.sex_dm2,
main ="Balloon Plot for Sex by Diabetes \n Area is proportional to Freq.")
We can view the residuals from the chi-square test:
chisq_results$residuals
## diab_pop$riagendr
## diab_pop$diq010 Male Female
## Diabetes 2.513228 -2.416222
## No Diabetes -1.045721 1.005358
install_if_not('corrplot')
## [1] "the package 'corrplot' is already installed"
corrplot::corrplot(chisq_results$residuals, is.cor = FALSE)
install_if_not('vcd')
## [1] "the package 'vcd' is already installed"
vcd::assoc(table.sex_dm2, shade = TRUE, las=3)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
t.test(diab_pop$ridageyr ~ diab_pop$diq010)
##
## Welch Two Sample t-test
##
## data: diab_pop$ridageyr by diab_pop$diq010
## t = 26.406, df = 1407.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Diabetes and group No Diabetes is not equal to 0
## 95 percent confidence interval:
## 12.81607 14.87303
## sample estimates:
## mean in group Diabetes mean in group No Diabetes
## 61.33768 47.49313
library(ggplot2)
# Basic box plot
p <- ggplot(diab_pop, aes(x=diq010 , y=ridageyr)) +
geom_boxplot()
p
diab_pop.age.NoDM2 <- (diab_pop %>%
filter(diq010 == "No Diabetes"))$ridageyr
diab_pop.age.DM2 <- (diab_pop %>%
filter(diq010 == "Diabetes"))$ridageyr
t.test(diab_pop.age.NoDM2 , diab_pop.age.DM2 )
##
## Welch Two Sample t-test
##
## data: diab_pop.age.NoDM2 and diab_pop.age.DM2
## t = -26.406, df = 1407.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -14.87303 -12.81607
## sample estimates:
## mean of x mean of y
## 47.49313 61.33768
diab_pop %>%
ggplot( aes(riagendr,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010)))
ttest_reults <- t.test(diab_pop.age.NoDM2 , diab_pop.age.DM2 )
str(ttest_reults)
## List of 10
## $ statistic : Named num -26.4
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 1407
## ..- attr(*, "names")= chr "df"
## $ p.value : num 3.83e-125
## $ conf.int : num [1:2] -14.9 -12.8
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 47.5 61.3
## ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means"
## $ stderr : num 0.524
## $ alternative: chr "two.sided"
## $ method : chr "Welch Two Sample t-test"
## $ data.name : chr "diab_pop.age.NoDM2 and diab_pop.age.DM2"
## - attr(*, "class")= chr "htest"
ttest_reults$p.value
## [1] 3.825137e-125
broom::tidy(ttest_reults)
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -13.8 47.5 61.3 -26.4 3.83e-125 1407. -14.9 -12.8
## # ... with 2 more variables: method <chr>, alternative <chr>
broom::tidy(ttest_reults)$p.value
## [1] 3.825137e-125
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
ggplot2
library(ggplot2)
ggplot(diab_pop, aes(ridageyr, color=diq010) ) +
geom_freqpoly(binwidth = 1) +
labs(title="Age Distribution by Diabetes")
ggplot(diab_pop, aes(riagendr,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010))) +
labs(title="Box Plot of Age by Gender and Diabetes")
ggplot(diab_pop, aes(bmxbmi, fill=diq010) ) +
geom_histogram(binwidth = 1) +
labs(title="BMI Distribution by Diabetes")
## Warning: Removed 313 rows containing non-finite values (stat_bin).
ggplot(diab_pop, aes(ridageyr)) +
geom_density(aes(fill=factor(diq010)), alpha=0.8) +
labs(title="Density Plot of Age by Diabetes")
ggplot(diab_pop, aes(x=ridageyr, y=bmxbmi, shape=diq010, color=diq010)) +
geom_point() +
geom_smooth(method=lm , aes(fill=diq010)) +
labs(title="Scatter Plot of Age by BMI by Diabetes")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 313 rows containing non-finite values (stat_smooth).
## Warning: Removed 313 rows containing missing values (geom_point).
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
dplyr
RevisitedLet’s look at some more complicated summary tables that we can produce
diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~min(.), ~mean(.), ~max(.)))
## # A tibble: 12 x 12
## # Groups: diq010, riagendr [4]
## diq010 riagendr age_ntile ridageyr_min bmxbmi_min lbxglu_min ridageyr_mean
## <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 21 NA NA 31.5
## 2 Diabetes Male 2 39 NA NA 51.5
## 3 Diabetes Male 3 60 NA NA 69.9
## 4 Diabetes Female 1 21 NA NA 31.6
## 5 Diabetes Female 2 39 NA NA 50.2
## 6 Diabetes Female 3 60 NA NA 69.6
## 7 No Diabe~ Male 1 20 NA NA 29.4
## 8 No Diabe~ Male 2 39 NA NA 49.1
## 9 No Diabe~ Male 3 59 NA NA 70.2
## 10 No Diabe~ Female 1 20 NA NA 29.2
## 11 No Diabe~ Female 2 39 NA NA 48.6
## 12 No Diabe~ Female 3 59 NA NA 70.5
## # ... with 5 more variables: bmxbmi_mean <dbl>, lbxglu_mean <dbl>,
## # ridageyr_max <dbl>, bmxbmi_max <dbl>, lbxglu_max <dbl>
# We can tell R to remove NA values from computation:
diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr), list(~min(.,na.rm=TRUE), ~mean(.,na.rm=TRUE), ~max(.,na.rm=TRUE)))
## # A tibble: 12 x 6
## # Groups: diq010, riagendr [4]
## diq010 riagendr age_ntile min mean max
## <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 21 31.5 39
## 2 Diabetes Male 2 39 51.5 59
## 3 Diabetes Male 3 60 69.9 80
## 4 Diabetes Female 1 21 31.6 39
## 5 Diabetes Female 2 39 50.2 59
## 6 Diabetes Female 3 60 69.6 80
## 7 No Diabetes Male 1 20 29.4 39
## 8 No Diabetes Male 2 39 49.1 59
## 9 No Diabetes Male 3 59 70.2 80
## 10 No Diabetes Female 1 20 29.2 39
## 11 No Diabetes Female 2 39 48.6 59
## 12 No Diabetes Female 3 59 70.5 80
We can also store our results of our dplyr query as a new dataframe:
Table_Stats_1 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~min(.,na.rm=TRUE), ~mean(.,na.rm=TRUE), ~max(.,na.rm=TRUE))) %>%
ungroup()
Table_Stats_1
## # A tibble: 12 x 12
## diq010 riagendr age_ntile ridageyr_min bmxbmi_min lbxglu_min ridageyr_mean
## <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 21 19.8 76 31.5
## 2 Diabetes Male 2 39 16 21 51.5
## 3 Diabetes Male 3 60 17.7 68 69.9
## 4 Diabetes Female 1 21 24 84 31.6
## 5 Diabetes Female 2 39 19.6 75 50.2
## 6 Diabetes Female 3 60 17.7 50 69.6
## 7 No Diabe~ Male 1 20 16.2 70 29.4
## 8 No Diabe~ Male 2 39 15.1 74 49.1
## 9 No Diabe~ Male 3 59 16.4 64 70.2
## 10 No Diabe~ Female 1 20 16.3 74 29.2
## 11 No Diabe~ Female 2 39 14.5 77 48.6
## 12 No Diabe~ Female 3 59 14.5 64 70.5
## # ... with 5 more variables: bmxbmi_mean <dbl>, lbxglu_mean <dbl>,
## # ridageyr_max <dbl>, bmxbmi_max <dbl>, lbxglu_max <dbl>
We can pass in user defined functions where they make sense:
SumNa <- function(col){sum(is.na(col))}
Table_Stats_2 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~SumNa(.))) %>%
rename_at(vars(ridageyr,bmxbmi,lbxglu), list(~paste0(.,"_SumNa"))) %>%
ungroup()
Table_Stats_2
## # A tibble: 12 x 6
## diq010 riagendr age_ntile ridageyr_SumNa bmxbmi_SumNa lbxglu_SumNa
## <fct> <fct> <int> <int> <int> <int>
## 1 Diabetes Male 1 0 2 18
## 2 Diabetes Male 2 0 3 79
## 3 Diabetes Male 3 0 24 155
## 4 Diabetes Female 1 0 1 18
## 5 Diabetes Female 2 0 5 64
## 6 Diabetes Female 3 0 20 128
## 7 No Diabetes Male 1 0 57 531
## 8 No Diabetes Male 2 0 29 420
## 9 No Diabetes Male 3 0 49 360
## 10 No Diabetes Female 1 0 44 576
## 11 No Diabetes Female 2 0 43 515
## 12 No Diabetes Female 3 0 36 403
We can join our two tables by the ridageyr
,bmxbmi
, and lbxglu
values:
Table_Stats_3 <- Table_Stats_1 %>%
full_join(Table_Stats_2, by=c('diq010','riagendr','age_ntile'))
Table_Stats_3
## # A tibble: 12 x 15
## diq010 riagendr age_ntile ridageyr_min bmxbmi_min lbxglu_min ridageyr_mean
## <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 21 19.8 76 31.5
## 2 Diabetes Male 2 39 16 21 51.5
## 3 Diabetes Male 3 60 17.7 68 69.9
## 4 Diabetes Female 1 21 24 84 31.6
## 5 Diabetes Female 2 39 19.6 75 50.2
## 6 Diabetes Female 3 60 17.7 50 69.6
## 7 No Diabe~ Male 1 20 16.2 70 29.4
## 8 No Diabe~ Male 2 39 15.1 74 49.1
## 9 No Diabe~ Male 3 59 16.4 64 70.2
## 10 No Diabe~ Female 1 20 16.3 74 29.2
## 11 No Diabe~ Female 2 39 14.5 77 48.6
## 12 No Diabe~ Female 3 59 14.5 64 70.5
## # ... with 8 more variables: bmxbmi_mean <dbl>, lbxglu_mean <dbl>,
## # ridageyr_max <dbl>, bmxbmi_max <dbl>, lbxglu_max <dbl>,
## # ridageyr_SumNa <int>, bmxbmi_SumNa <int>, lbxglu_SumNa <int>
We can also get the same result with one query:
Table_Stats_3a <- Table_Stats_1 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup()
Let’s compare to make sure things are the same:
arsenal::comparedf(Table_Stats_3,Table_Stats_3a)
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = Table_Stats_3, y = Table_Stats_3a)
##
## Shared: 15 non-by variables and 12 observations.
## Not shared: 0 variables and 0 observations.
##
## Differences found in 0/15 variables compared.
## 0 variables compared have non-identical attributes.
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
tidyverse
Even when we create a summary table things can be complicated, this result will have 3+(3*6) = 21 columns. In this section we will see even more of the data wrangling capabilities in the tidyverse
Table_Stats_3b <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(cnt =~ n() ,
min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
sd = ~ sd(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup()
colnames(Table_Stats_3b)
## [1] "diq010" "riagendr" "age_ntile" "ridageyr_cnt"
## [5] "bmxbmi_cnt" "lbxglu_cnt" "ridageyr_min" "bmxbmi_min"
## [9] "lbxglu_min" "ridageyr_mean" "bmxbmi_mean" "lbxglu_mean"
## [13] "ridageyr_sd" "bmxbmi_sd" "lbxglu_sd" "ridageyr_max"
## [17] "bmxbmi_max" "lbxglu_max" "ridageyr_SumNa" "bmxbmi_SumNa"
## [21] "lbxglu_SumNa"
ncol(Table_Stats_3b)
## [1] 21
Table_Stats_3b
## # A tibble: 12 x 21
## diq010 riagendr age_ntile ridageyr_cnt bmxbmi_cnt lbxglu_cnt ridageyr_min
## <fct> <fct> <int> <int> <int> <int> <dbl>
## 1 Diabetes Male 1 25 25 25 21
## 2 Diabetes Male 2 136 136 136 39
## 3 Diabetes Male 3 295 295 295 60
## 4 Diabetes Female 1 33 33 33 21
## 5 Diabetes Female 2 124 124 124 39
## 6 Diabetes Female 3 231 231 231 60
## 7 No Diabet~ Male 1 889 889 889 20
## 8 No Diabet~ Male 2 750 750 750 39
## 9 No Diabet~ Male 3 652 652 652 59
## 10 No Diabet~ Female 1 960 960 960 20
## 11 No Diabet~ Female 2 896 896 896 39
## 12 No Diabet~ Female 3 728 728 728 59
## # ... with 14 more variables: bmxbmi_min <dbl>, lbxglu_min <dbl>,
## # ridageyr_mean <dbl>, bmxbmi_mean <dbl>, lbxglu_mean <dbl>,
## # ridageyr_sd <dbl>, bmxbmi_sd <dbl>, lbxglu_sd <dbl>, ridageyr_max <dbl>,
## # bmxbmi_max <dbl>, lbxglu_max <dbl>, ridageyr_SumNa <int>,
## # bmxbmi_SumNa <int>, lbxglu_SumNa <int>
The gather
command is used to “un-pivot” a dataframe, it makes a “wide” data-frame longer. Here I am gathering all columns except for diq010
,riagendr
,age_ntile
, the column names will be placed in Feature_Type
and the values will be placed in Value
:
Table_Stats_4 <- Table_Stats_3b %>%
tidyr::gather(-diq010,-riagendr,-age_ntile, key="Feature_Type", value="Value")
Table_Stats_4
## # A tibble: 216 x 5
## diq010 riagendr age_ntile Feature_Type Value
## <fct> <fct> <int> <chr> <dbl>
## 1 Diabetes Male 1 ridageyr_cnt 25
## 2 Diabetes Male 2 ridageyr_cnt 136
## 3 Diabetes Male 3 ridageyr_cnt 295
## 4 Diabetes Female 1 ridageyr_cnt 33
## 5 Diabetes Female 2 ridageyr_cnt 124
## 6 Diabetes Female 3 ridageyr_cnt 231
## 7 No Diabetes Male 1 ridageyr_cnt 889
## 8 No Diabetes Male 2 ridageyr_cnt 750
## 9 No Diabetes Male 3 ridageyr_cnt 652
## 10 No Diabetes Female 1 ridageyr_cnt 960
## # ... with 206 more rows
The separate
command can be used to separate a column into two columns using a delimiter. Here I am separating Feature_Type
into Feature
and Type
:
Table_Stats_5 <- Table_Stats_4 %>%
tidyr::separate(Feature_Type, into=c('Feature','Type'), sep = "_")
Table_Stats_5
## # A tibble: 216 x 6
## diq010 riagendr age_ntile Feature Type Value
## <fct> <fct> <int> <chr> <chr> <dbl>
## 1 Diabetes Male 1 ridageyr cnt 25
## 2 Diabetes Male 2 ridageyr cnt 136
## 3 Diabetes Male 3 ridageyr cnt 295
## 4 Diabetes Female 1 ridageyr cnt 33
## 5 Diabetes Female 2 ridageyr cnt 124
## 6 Diabetes Female 3 ridageyr cnt 231
## 7 No Diabetes Male 1 ridageyr cnt 889
## 8 No Diabetes Male 2 ridageyr cnt 750
## 9 No Diabetes Male 3 ridageyr cnt 652
## 10 No Diabetes Female 1 ridageyr cnt 960
## # ... with 206 more rows
The spread
function is the opposite of gather
, it is used to pivot a dataframe and makes data go from a “wide” to “long” format:
Table_Stats_6 <- Table_Stats_5 %>%
tidyr::spread(key='Type',value='Value')
Table_Stats_6
## # A tibble: 36 x 10
## diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
## <fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 bmxbmi 25 54 32.5 19.8 8.87 2
## 2 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
## 3 Diabetes Male 1 ridageyr 25 39 31.5 21 5.36 0
## 4 Diabetes Male 2 bmxbmi 136 57.4 32.6 16 8.12 3
## 5 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
## 6 Diabetes Male 2 ridageyr 136 59 51.5 39 5.23 0
## 7 Diabetes Male 3 bmxbmi 295 54.2 30.7 17.7 6.22 24
## 8 Diabetes Male 3 lbxglu 295 479 163. 68 68.0 155
## 9 Diabetes Male 3 ridageyr 295 80 69.9 60 6.41 0
## 10 Diabetes Female 1 bmxbmi 33 59.4 36.6 24 9.07 1
## # ... with 26 more rows
We can now add new columns that we might be interested in:
Table_Stats_7 <- Table_Stats_6 %>%
mutate(pct_NA = SumNa/cnt)
Table_Stats_7
## # A tibble: 36 x 11
## diq010 riagendr age_ntile Feature cnt max mean min sd SumNa pct_NA
## <fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Diabe~ Male 1 bmxbmi 25 54 32.5 19.8 8.87 2 0.08
## 2 Diabe~ Male 1 lbxglu 25 369 209. 76 120. 18 0.72
## 3 Diabe~ Male 1 ridage~ 25 39 31.5 21 5.36 0 0
## 4 Diabe~ Male 2 bmxbmi 136 57.4 32.6 16 8.12 3 0.0221
## 5 Diabe~ Male 2 lbxglu 136 454 176. 21 78.8 79 0.581
## 6 Diabe~ Male 2 ridage~ 136 59 51.5 39 5.23 0 0
## 7 Diabe~ Male 3 bmxbmi 295 54.2 30.7 17.7 6.22 24 0.0814
## 8 Diabe~ Male 3 lbxglu 295 479 163. 68 68.0 155 0.525
## 9 Diabe~ Male 3 ridage~ 295 80 69.9 60 6.41 0 0
## 10 Diabe~ Female 1 bmxbmi 33 59.4 36.6 24 9.07 1 0.0303
## # ... with 26 more rows
We can make R show us all the columns:
options(tibble.width = Inf)
Table_Stats_7
## # A tibble: 36 x 11
## diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
## <fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 bmxbmi 25 54 32.5 19.8 8.87 2
## 2 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
## 3 Diabetes Male 1 ridageyr 25 39 31.5 21 5.36 0
## 4 Diabetes Male 2 bmxbmi 136 57.4 32.6 16 8.12 3
## 5 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
## 6 Diabetes Male 2 ridageyr 136 59 51.5 39 5.23 0
## 7 Diabetes Male 3 bmxbmi 295 54.2 30.7 17.7 6.22 24
## 8 Diabetes Male 3 lbxglu 295 479 163. 68 68.0 155
## 9 Diabetes Male 3 ridageyr 295 80 69.9 60 6.41 0
## 10 Diabetes Female 1 bmxbmi 33 59.4 36.6 24 9.07 1
## pct_NA
## <dbl>
## 1 0.08
## 2 0.72
## 3 0
## 4 0.0221
## 5 0.581
## 6 0
## 7 0.0814
## 8 0.525
## 9 0
## 10 0.0303
## # ... with 26 more rows
Finally we can sort to see the feature with the highest percentage of missing values is lbxglu
among the lowest aged diabetic male population:
Table_Stats_8 <- Table_Stats_7 %>%
arrange(desc(pct_NA))
Table_Stats_8
## # A tibble: 36 x 11
## diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
## <fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
## 2 No Diabetes Female 1 lbxglu 960 335 96.0 74 15.3 576
## 3 No Diabetes Male 1 lbxglu 889 322 101. 70 17.0 531
## 4 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
## 5 No Diabetes Female 2 lbxglu 896 285 103. 77 20.5 515
## 6 No Diabetes Male 2 lbxglu 750 397 109. 74 27.5 420
## 7 Diabetes Female 3 lbxglu 231 461 160. 50 68.4 128
## 8 No Diabetes Female 3 lbxglu 728 309 109. 64 22.4 403
## 9 No Diabetes Male 3 lbxglu 652 234 108. 64 16.3 360
## 10 Diabetes Female 1 lbxglu 33 390 150. 84 79.9 18
## pct_NA
## <dbl>
## 1 0.72
## 2 0.6
## 3 0.597
## 4 0.581
## 5 0.575
## 6 0.56
## 7 0.554
## 8 0.554
## 9 0.552
## 10 0.545
## # ... with 26 more rows
\(~\)
\(~\)
Although we built the table one step at a time we can also string together our commands to achieve the same result:
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.2 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks mice::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
Table_Stats_9 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(cnt =~ n() ,
min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
sd = ~ sd(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup() %>%
gather(-diq010,-riagendr,-age_ntile, key="Feature_Type", value="Value") %>%
separate(Feature_Type,into=c('Feature','Type')) %>%
spread(key='Type',value='Value') %>%
mutate(pct_NA = SumNa/cnt) %>%
arrange(desc(pct_NA))
Table_Stats_9
## # A tibble: 36 x 11
## diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
## <fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
## 2 No Diabetes Female 1 lbxglu 960 335 96.0 74 15.3 576
## 3 No Diabetes Male 1 lbxglu 889 322 101. 70 17.0 531
## 4 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
## 5 No Diabetes Female 2 lbxglu 896 285 103. 77 20.5 515
## 6 No Diabetes Male 2 lbxglu 750 397 109. 74 27.5 420
## 7 Diabetes Female 3 lbxglu 231 461 160. 50 68.4 128
## 8 No Diabetes Female 3 lbxglu 728 309 109. 64 22.4 403
## 9 No Diabetes Male 3 lbxglu 652 234 108. 64 16.3 360
## 10 Diabetes Female 1 lbxglu 33 390 150. 84 79.9 18
## pct_NA
## <dbl>
## 1 0.72
## 2 0.6
## 3 0.597
## 4 0.581
## 5 0.575
## 6 0.56
## 7 0.554
## 8 0.554
## 9 0.552
## 10 0.545
## # ... with 26 more rows
arsenal::comparedf(Table_Stats_8,Table_Stats_9)
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = Table_Stats_8, y = Table_Stats_9)
##
## Shared: 11 non-by variables and 36 observations.
## Not shared: 0 variables and 0 observations.
##
## Differences found in 0/11 variables compared.
## 0 variables compared have non-identical attributes.
tidyverse
: https://www.tidyverse.org/R
for Data Science: https://r4ds.had.co.nz/R
with the tidyverse
: https://b-rodrigues.github.io/modern_R/\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
Often categorical variables are encoded into binary 0 or 1 flags of new features.
diab_pop %>%
group_by(diq010,riagendr) %>%
summarise(cnt = n())
## `summarise()` has grouped output by 'diq010'. You can override using the `.groups` argument.
## # A tibble: 4 x 3
## # Groups: diq010 [2]
## diq010 riagendr cnt
## <fct> <fct> <int>
## 1 Diabetes Male 456
## 2 Diabetes Female 388
## 3 No Diabetes Male 2291
## 4 No Diabetes Female 2584
diab_pop.EncodeSex <- diab_pop %>%
mutate(SexMale = ifelse(riagendr == 'Male',1,0)) %>%
mutate(DM2 = ifelse(diq010 == 'Diabetes',1,0))
glimpse(diab_pop.EncodeSex)
## Rows: 5,719
## Columns: 12
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744~
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
## $ SexMale <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0~
## $ DM2 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0~
diab_pop.EncodeSex %>%
group_by(DM2) %>%
summarise(cnt = sum(SexMale))
## # A tibble: 2 x 2
## DM2 cnt
## <dbl> <dbl>
## 1 0 2291
## 2 1 456
diab_pop.NoDm2 <- diab_pop.EncodeSex %>%
filter(diq010 == 'No Diabetes')
diab_pop.Dm2 <- diab_pop.EncodeSex %>%
filter(diq010 == 'Diabetes')
hist(diab_pop.NoDm2$SexMale, col=rgb(1,0,0,0.5), main="Overlapping Histogram", xlab="Sex Male")
hist(diab_pop.Dm2$SexMale, col=rgb(0,0,1,0.5), add=T)
box()
heatmap(as.matrix(diab_pop.EncodeSex[,c('DM2','SexMale')]))
Of course encoding each categorical variable can take time so we can use the dummyVars
function in the caret
package:
library('caret')
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# The return of dummyVars is a mapping which tells R which variables are factors and will need encoding
caret.dummyVars.diab_pop <- dummyVars(" ~ .", data = diab_pop)
str(caret.dummyVars.diab_pop)
## List of 9
## $ call : language dummyVars.default(formula = " ~ .", data = diab_pop)
## $ form :Class 'formula' language ~.
## .. ..- attr(*, ".Environment")=<environment: 0x000000002750a680>
## $ vars : chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
## $ facVars : chr [1:6] "riagendr" "ridreth1" "dmdeduc2" "dmdmartl" ...
## $ lvls :List of 6
## ..$ riagendr: chr [1:2] "Male" "Female"
## ..$ ridreth1: chr [1:5] "MexicanAmerican" "Other Hispanic" "Non-Hispanic White" "Non-Hispanic Black" ...
## ..$ dmdeduc2: chr [1:5] "Less than 9th grade" "Grades 9-11th" "High school graduate/GED" "Some college or AA degrees" ...
## ..$ dmdmartl: chr [1:6] "Married" "Widowed" "Divorced" "Separated" ...
## ..$ indhhin2: chr [1:14] "$0-$4,999" "$5,000-$9,999" "$10,000-$14,999" "$15,000-$19,999" ...
## ..$ diq010 : chr [1:2] "Diabetes" "No Diabetes"
## $ sep : chr "."
## $ terms :Classes 'terms', 'formula' language ~seqn + riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + diq010 + lbxglu
## .. ..- attr(*, "variables")= language list(seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, diq010, lbxglu)
## .. ..- attr(*, "factors")= int [1:10, 1:10] 1 0 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
## .. .. .. ..$ : chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
## .. ..- attr(*, "term.labels")= chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
## .. ..- attr(*, "order")= int [1:10] 1 1 1 1 1 1 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 0
## .. ..- attr(*, ".Environment")=<environment: 0x000000002750a680>
## .. ..- attr(*, "predvars")= language list(seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, diq010, lbxglu)
## .. ..- attr(*, "dataClasses")= Named chr [1:10] "numeric" "factor" "numeric" "factor" ...
## .. .. ..- attr(*, "names")= chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
## $ levelsOnly: logi FALSE
## $ fullRank : logi FALSE
## - attr(*, "class")= chr "dummyVars"
# We have to apply the mapping to the set:
predict_dummyVars_diab_pop <- predict(caret.dummyVars.diab_pop, newdata = diab_pop)
str(predict_dummyVars_diab_pop)
## num [1:5719, 1:38] 83732 83733 83734 83735 83736 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:5719] "1" "2" "3" "4" ...
## ..$ : chr [1:38] "seqn" "riagendr.Male" "riagendr.Female" "ridageyr" ...
# And then we can turn back into a tibble:
encode.diab_pop <- as_tibble(predict_dummyVars_diab_pop)
str(encode.diab_pop)
## tibble [5,719 x 38] (S3: tbl_df/tbl/data.frame)
## $ seqn : num [1:5719] 83732 83733 83734 83735 83736 ...
## $ riagendr.Male : num [1:5719] 1 1 1 0 0 0 1 0 1 1 ...
## $ riagendr.Female : num [1:5719] 0 0 0 1 1 1 0 1 0 0 ...
## $ ridageyr : num [1:5719] 62 53 78 56 42 72 22 32 56 46 ...
## $ ridreth1.MexicanAmerican : num [1:5719] 0 0 0 0 0 1 0 1 0 0 ...
## $ ridreth1.Other Hispanic : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
## $ ridreth1.Non-Hispanic White : num [1:5719] 1 1 1 1 0 0 0 0 0 1 ...
## $ ridreth1.Non-Hispanic Black : num [1:5719] 0 0 0 0 1 0 1 0 1 0 ...
## $ ridreth1.Other : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
## $ dmdeduc2.Less than 9th grade : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
## $ dmdeduc2.Grades 9-11th : num [1:5719] 0 0 0 0 0 1 0 0 0 0 ...
## $ dmdeduc2.High school graduate/GED : num [1:5719] 0 1 1 0 0 0 0 0 1 0 ...
## $ dmdeduc2.Some college or AA degrees: num [1:5719] 0 0 0 0 1 0 1 1 0 0 ...
## $ dmdeduc2.College grad or above : num [1:5719] 1 0 0 1 0 0 0 0 0 1 ...
## $ dmdmartl.Married : num [1:5719] 1 0 1 0 0 0 0 1 0 0 ...
## $ dmdmartl.Widowed : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
## $ dmdmartl.Divorced : num [1:5719] 0 1 0 0 1 0 0 0 1 0 ...
## $ dmdmartl.Separated : num [1:5719] 0 0 0 0 0 1 0 0 0 0 ...
## $ dmdmartl.Never married : num [1:5719] 0 0 0 0 0 0 1 0 0 0 ...
## $ dmdmartl.Living with partner : num [1:5719] 0 0 0 1 0 0 0 0 0 1 ...
## $ indhhin2.$0-$4,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$5,000-$9,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$10,000-$14,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 1 1 ...
## $ indhhin2.$15,000-$19,999 : num [1:5719] 0 1 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$20,000-$24,999 : num [1:5719] 0 0 1 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$25,000-$34,999 : num [1:5719] 0 0 0 0 NA 0 NA 1 0 0 ...
## $ indhhin2.$35,000-$44,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$45,000-$54,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$55,000-$64,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$65,000-$74,999 : num [1:5719] 1 0 0 1 NA 0 NA 0 0 0 ...
## $ indhhin2.20,000+ : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.less than $20,000 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ indhhin2.$75,000-$99,999 : num [1:5719] 0 0 0 0 NA 1 NA 0 0 0 ...
## $ indhhin2.$100,000+ : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
## $ bmxbmi : num [1:5719] 27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
## $ diq010.Diabetes : num [1:5719] 1 0 1 0 0 0 0 0 1 0 ...
## $ diq010.No Diabetes : num [1:5719] 0 1 0 1 1 1 1 1 0 1 ...
## $ lbxglu : num [1:5719] NA 101 84 NA 84 107 95 NA NA NA ...
heatmap(as.matrix(encode.diab_pop[,c('diq010.Diabetes','riagendr.Male','dmdmartl.Married','dmdmartl.Widowed','dmdmartl.Divorced', 'dmdmartl.Separated','dmdmartl.Never married')]))
In our last example we left a space in the column name dmdmartl.Never married
here is one way to remove it:
library('stringr')
diab_pop_s1 <- diab_pop %>%
mutate(dmdmartl = str_replace(dmdmartl ," ","_"))
glimpse(diab_pop_s1)
## Rows: 5,719
## Columns: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744~
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <chr> "Married", "Divorced", "Married", "Living_with partner", "Div~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
The stringr::str_replace
function only replaced the first whitespace so that “Living with partner” is now “Living_with partner”. We can use str_replace_all
instead to get “Living_with_partner”.
diab_pop_s1 <- diab_pop %>%
mutate(dmdmartl = str_replace_all(dmdmartl ," ","_")) %>%
mutate(dmdmartl = as.factor(dmdmartl))
glimpse(diab_pop_s1)
## Rows: 5,719
## Columns: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744~
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <fct> Married, Divorced, Married, Living_with_partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
diab_pop_f1 <- diab_pop %>%
mutate(dmdmartl = str_replace_all(dmdmartl," ","_")) %>%
mutate(dmdmartl = as.factor(dmdmartl)) %>%
mutate(ridreth1 = str_replace_all(ridreth1," ","_")) %>%
mutate(ridreth1 = as.factor(ridreth1)) %>%
mutate(diq010 = str_replace_all(ridreth1," ","_")) %>%
mutate(diq010 = as.factor(diq010)) %>%
mutate(dmdeduc2 = str_replace_all(dmdeduc2," ","_") ) %>%
mutate(dmdeduc2 = as.factor(dmdeduc2)) %>%
mutate(indhhin2 = str_replace_all(indhhin2," ","_") ) %>%
mutate(indhhin2 = as.factor(indhhin2))
caret.dummyVars.diab_pop_f1 <- dummyVars(" ~ .", data = diab_pop_f1)
predict_dummyVars_diab_pop <- predict(caret.dummyVars.diab_pop_f1, newdata = diab_pop_f1)
encode.diab_pop_f1 <- as_tibble(predict_dummyVars_diab_pop)
glimpse(encode.diab_pop_f1)
## Rows: 5,719
## Columns: 39
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736,~
## $ riagendr.Male <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0~
## $ riagendr.Female <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56~
## $ ridreth1.MexicanAmerican <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0~
## $ `ridreth1.Non-Hispanic_Black` <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0~
## $ `ridreth1.Non-Hispanic_White` <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0~
## $ ridreth1.Other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0~
## $ ridreth1.Other_Hispanic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1~
## $ dmdeduc2.College_grad_or_above <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0~
## $ `dmdeduc2.Grades_9-11th` <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0~
## $ `dmdeduc2.High_school_graduate/GED` <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0~
## $ dmdeduc2.Less_than_9th_grade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ dmdeduc2.Some_college_or_AA_degrees <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1~
## $ dmdmartl.Divorced <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0~
## $ dmdmartl.Living_with_partner <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1~
## $ dmdmartl.Married <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0~
## $ dmdmartl.Never_married <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0~
## $ dmdmartl.Separated <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0~
## $ dmdmartl.Widowed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ `indhhin2.$0-$4,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.$10,000-$14,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 1, 1, 0,~
## $ `indhhin2.$100,000+` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.$15,000-$19,999` <dbl> 0, 1, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.$20,000-$24,999` <dbl> 0, 0, 1, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.$25,000-$34,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 1, 0, 0, 0,~
## $ `indhhin2.$45,000-$54,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.$5,000-$9,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.$65,000-$74,999` <dbl> 1, 0, 0, 1, NA, 0, NA, 0, 0, 0, 1,~
## $ `indhhin2.$75,000-$99,999` <dbl> 0, 0, 0, 0, NA, 1, NA, 0, 0, 0, 0,~
## $ `indhhin2.20,000+` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ `indhhin2.less_than_$20,000` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6~
## $ diq010.MexicanAmerican <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0~
## $ `diq010.Non-Hispanic_Black` <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0~
## $ `diq010.Non-Hispanic_White` <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0~
## $ diq010.Other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0~
## $ diq010.Other_Hispanic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, ~
Notice that we are applying the same functions to the columns, so another way to solve this issue might be with a function:
my_replace_all_factor_function <- function(my_data_in , the_column){
enquo_the_column <- enquo(the_column)
my_data_out <- my_data_in %>%
mutate( !! enquo_the_column := str_replace_all(!! enquo_the_column," ","_")) %>%
mutate( !! enquo_the_column := as.factor(!! enquo_the_column))
}
Then we can apply our function itteratively:
result1 <- my_replace_all_factor_function(diab_pop,dmdmartl)
result2 <- my_replace_all_factor_function(result1,ridreth1)
result3 <- my_replace_all_factor_function(result2,diq010)
result4 <- my_replace_all_factor_function(result3,dmdeduc2)
result5 <- my_replace_all_factor_function(result4,indhhin2)
arsenal::comparedf(diab_pop_f1 , result5)
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = diab_pop_f1, y = result5)
##
## Shared: 10 non-by variables and 5719 observations.
## Not shared: 0 variables and 0 observations.
##
## Differences found in 1/10 variables compared.
## 1 variables compared have non-identical attributes.
Here’s another way to apply our function:
result_f <- my_replace_all_factor_function(diab_pop,dmdmartl) %>%
my_replace_all_factor_function(ridreth1) %>%
my_replace_all_factor_function(diq010) %>%
my_replace_all_factor_function(dmdeduc2) %>%
my_replace_all_factor_function(indhhin2)
glimpse(result_f)
## Rows: 5,719
## Columns: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744~
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic_White, Non-Hispanic_White, Non-Hispanic_White, N~
## $ dmdeduc2 <fct> College_grad_or_above, High_school_graduate/GED, High_school_~
## $ dmdmartl <fct> Married, Divorced, Married, Living_with_partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No_Diabetes, Diabetes, No_Diabetes, No_Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
arsenal::comparedf(result5 , result_f)
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = result5, y = result_f)
##
## Shared: 10 non-by variables and 5719 observations.
## Not shared: 0 variables and 0 observations.
##
## Differences found in 0/10 variables compared.
## 0 variables compared have non-identical attributes.
dplyr
vignette: https://cran.r-project.org/web/packages/dplyr/vignettes/programming.html\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
library('tidyverse')
diab_pop.NoDM2.age <- (diab_pop %>%
filter(diq010 == "No Diabetes"))$ridageyr
diab_pop.DM2.age <- (diab_pop %>%
filter(diq010 == "Diabetes"))$ridageyr
ks.test(diab_pop.NoDM2.age , diab_pop.DM2.age)$p.value
## Warning in ks.test(diab_pop.NoDM2.age, diab_pop.DM2.age): p-value will be
## approximate in the presence of ties
## [1] 0
Now let’s create a table of some ks-tests. We can write a function to assist us:
ks_wrapper <- function(df, factor, factor_level, continuous_feature){
F <- NULL
factor <- enquo(factor)
data.local <- df %>%
select(!! factor, continuous_feature)
X <- data.local %>%
filter(!! factor == factor_level)
Y <- data.local %>%
filter(!(!! factor == factor_level))
F$x <- X[,continuous_feature]
F$y <- Y[,continuous_feature]
ks_test_stat_T <- ks.test(F$x,F$y)
ks_test_stat <- ks_test_stat_T$p.value
result_row <- tibble::enframe(ks_test_stat)
result_row <- result_row %>%
mutate(Feature = continuous_feature)
return(result_row)
}
A single call to the function will give us a table with the p-value of the ks-test:
diab_pop_f2 <- diab_pop %>%
mutate(DM2_num = ifelse(diq010 == "Diabetes",1,0))
ks_wrapper(diab_pop_f2, DM2_num,1,"ridageyr")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(continuous_feature)` instead of `continuous_feature` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Warning in ks.test(F$x, F$y): p-value will be approximate in the presence of
## ties
## # A tibble: 1 x 3
## name value Feature
## <int> <dbl> <chr>
## 1 1 0 ridageyr
We can iterate our function over other columns and use purrr::map_dfr
to make a single table:
compute_ks_cols <- c('ridageyr','bmxbmi','lbxglu')
ks_test_results_table <- purrr::map_dfr(compute_ks_cols,
df = diab_pop_f2,
factor = DM2_num,
factor_level = 1,
ks_wrapper)
## Warning in ks.test(F$x, F$y): p-value will be approximate in the presence of
## ties
## Warning in ks.test(F$x, F$y): p-value will be approximate in the presence of
## ties
## Warning in ks.test(F$x, F$y): p-value will be approximate in the presence of
## ties
ks_test_results_table
## # A tibble: 3 x 3
## name value Feature
## <int> <dbl> <chr>
## 1 1 0 ridageyr
## 2 1 0 bmxbmi
## 3 1 0 lbxglu
ks_test_results_table <- ks_test_results_table %>%
rename(ks_test_p_value=value)
ks_test_results_table
## # A tibble: 3 x 3
## name ks_test_p_value Feature
## <int> <dbl> <chr>
## 1 1 0 ridageyr
## 2 1 0 bmxbmi
## 3 1 0 lbxglu
We can add to the table, by finding some related values to showcase along:
means.diab_pop_f2 <- diab_pop_f2 %>%
group_by(DM2_num) %>%
summarise_at(vars(c('ridageyr','bmxbmi','lbxglu')), list(~mean(.,na.rm = TRUE),~sd(.,na.rm = TRUE))) %>%
gather(-DM2_num, key="Feature_Type", value="Value") %>%
separate(Feature_Type, into= c('Feature','Type'), sep='_') %>%
mutate(DM2_stats_char = ifelse(DM2_num == 1,"Diabetes","No_Diabetes")) %>%
mutate(DM2_stats_char_Type = paste0(DM2_stats_char,"_",Type)) %>%
select(-DM2_num,-Type,-DM2_stats_char) %>%
select(Feature, DM2_stats_char_Type, Value) %>%
spread(key='DM2_stats_char_Type', value = 'Value')
means.diab_pop_f2
## # A tibble: 3 x 5
## Feature Diabetes_mean Diabetes_sd No_Diabetes_mean No_Diabetes_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 bmxbmi 32.6 7.73 29.0 6.82
## 2 lbxglu 166. 73.5 104. 20.7
## 3 ridageyr 61.3 13.3 47.5 17.6
Finally we can join our tables together:
ks_table_1 <- ks_test_results_table %>%
left_join(means.diab_pop_f2) %>%
select(Feature, Diabetes_mean, Diabetes_sd, ks_test_p_value, No_Diabetes_mean, No_Diabetes_sd)
## Joining, by = "Feature"
ks_table_1
## # A tibble: 3 x 6
## Feature Diabetes_mean Diabetes_sd ks_test_p_value No_Diabetes_mean
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ridageyr 61.3 13.3 0 47.5
## 2 bmxbmi 32.6 7.73 0 29.0
## 3 lbxglu 166. 73.5 0 104.
## No_Diabetes_sd
## <dbl>
## 1 17.6
## 2 6.82
## 3 20.7
We can also use R
to make the table more presentable:
install_if_not('kableExtra')
## [1] "the package 'kableExtra' is already installed"
library('kableExtra')
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
ks_table_style <- kable(ks_table_1) %>%
kable_styling("striped") %>%
add_header_above(c(" " = 1, "Diabetes" = 2, "ks-test pvalue" = 1, "Non Diabetes" = 2))
ks_table_style
Feature | Diabetes_mean | Diabetes_sd | ks_test_p_value | No_Diabetes_mean | No_Diabetes_sd |
---|---|---|---|---|---|
ridageyr | 61.33768 | 13.347476 | 0 | 47.49313 | 17.635870 |
bmxbmi | 32.59632 | 7.733781 | 0 | 29.01988 | 6.823482 |
lbxglu | 165.98953 | 73.475333 | 0 | 103.94155 | 20.720142 |
ggplot(diab_pop, aes(diq010,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010)))
ggplot(diab_pop, aes(x=ridageyr)) + geom_histogram() + facet_wrap(~diq010)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
diab_pop.NoDm2 <- diab_pop %>% filter(diq010 == 'No Diabetes')
diab_pop.Dm2 <- diab_pop %>% filter(diq010 == 'Diabetes')
hist(diab_pop.NoDm2$ridageyr, col=rgb(1,0,0,0.5), main="Overlapping Histograms Diabetics by Age", xlab="Age")
hist(diab_pop.Dm2$ridageyr, col=rgb(0,0,1,0.5), add=T)
legend("topright", c("No Diabetes", "Diabetes"), fill=c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))
box()
purrr
vignette: https://purrr.tidyverse.org/purrr
cheatsheet: https://github.com/rstudio/cheatsheets/blob/master/purrr.pdf\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
install_if_not('tableone')
## [1] "the package 'tableone' is already installed"
diab_pop2 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3))
my_table_one <- tableone::CreateTableOne(
vars = c('ridageyr','bmxbmi','lbxglu',"riagendr","age_ntile"),
strata = c("diq010"),
data = diab_pop2,
factorVars = c("riagendr","age_ntile")
)
my_table_one
## Stratified by diq010
## Diabetes No Diabetes p test
## n 844 4875
## ridageyr (mean (SD)) 61.34 (13.35) 47.49 (17.64) <0.001
## bmxbmi (mean (SD)) 32.60 (7.73) 29.02 (6.82) <0.001
## lbxglu (mean (SD)) 165.99 (73.48) 103.94 (20.72) <0.001
## riagendr = Female (%) 388 (46.0) 2584 (53.0) <0.001
## age_ntile (%) <0.001
## 1 58 ( 6.9) 1849 (37.9)
## 2 260 (30.8) 1646 (33.8)
## 3 526 (62.3) 1380 (28.3)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
install_if_not(c('arsenal','knitr','kableExtra'))
## [1] "the package 'arsenal' is already installed"
## [2] "the package 'knitr' is already installed"
## [3] "the package 'kableExtra' is already installed"
library('arsenal')
library('knitr')
library('kableExtra')
table_arsenal <- tableby(diq010 ~ riagendr + dmdmartl, data = diab_pop)
table_arsenal
## tableby Object
##
## Function Call:
## tableby(formula = diq010 ~ riagendr + dmdmartl, data = diab_pop)
##
## Variable(s):
## diq010 ~ riagendr, dmdmartl
summary(table_arsenal, title="Diabetes by Sex and Marital Status")
##
##
## Table: Diabetes by Sex and Marital Status
##
## | | Diabetes (N=844) | No Diabetes (N=4875) | Total (N=5719) | p value|
## |:-------------------------------------|:----------------:|:--------------------:|:--------------:|-------:|
## |**riagendr** | | | | < 0.001|
## | Male | 456 (54.0%) | 2291 (47.0%) | 2747 (48.0%) | |
## | Female | 388 (46.0%) | 2584 (53.0%) | 2972 (52.0%) | |
## |**dmdmartl** | | | | < 0.001|
## | N-Miss | 0 | 3 | 3 | |
## | Married | 471 (55.8%) | 2415 (49.6%) | 2886 (50.5%) | |
## | Widowed | 100 (11.8%) | 321 (6.6%) | 421 (7.4%) | |
## | Divorced | 106 (12.6%) | 508 (10.4%) | 614 (10.7%) | |
## | Separated | 29 (3.4%) | 163 (3.3%) | 192 (3.4%) | |
## | Never married | 83 (9.8%) | 965 (19.8%) | 1048 (18.3%) | |
## | Living with partner | 55 (6.5%) | 500 (10.3%) | 555 (9.7%) | |
summary.table_arsenal <- summary(table_arsenal, title="Diabetes by Sex and Marital Status",results="asis") %>%
kable() %>%
kable_styling("striped")
summary.table_arsenal
Diabetes (N=844) | No Diabetes (N=4875) | Total (N=5719) | p value | |
---|---|---|---|---|
riagendr | < 0.001 | |||
Male | 456 (54.0%) | 2291 (47.0%) | 2747 (48.0%) | |
Female | 388 (46.0%) | 2584 (53.0%) | 2972 (52.0%) | |
dmdmartl | < 0.001 | |||
N-Miss | 0 | 3 | 3 | |
Married | 471 (55.8%) | 2415 (49.6%) | 2886 (50.5%) | |
Widowed | 100 (11.8%) | 321 (6.6%) | 421 (7.4%) | |
Divorced | 106 (12.6%) | 508 (10.4%) | 614 (10.7%) | |
Separated | 29 (3.4%) | 163 (3.3%) | 192 (3.4%) | |
Never married | 83 (9.8%) | 965 (19.8%) | 1048 (18.3%) | |
Living with partner | 55 (6.5%) | 500 (10.3%) | 555 (9.7%) |
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
res.aov <- aov(ridageyr ~ diq010, data=diab_pop)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## diq010 1 137897 137897 473.2 <2e-16 ***
## Residuals 5717 1666115 291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
res.mov <- manova(cbind(ridageyr,bmxbmi) ~ diq010, data=diab_pop)
summary(res.mov)
## Df Pillai approx F num Df den Df Pr(>F)
## diq010 1 0.10247 308.43 2 5403 < 2.2e-16 ***
## Residuals 5404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(res.mov)
## Response ridageyr :
## Df Sum Sq Mean Sq F value Pr(>F)
## diq010 1 122870 122870 425.86 < 2.2e-16 ***
## Residuals 5404 1559176 289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response bmxbmi :
## Df Sum Sq Mean Sq F value Pr(>F)
## diq010 1 8619 8619.1 177.74 < 2.2e-16 ***
## Residuals 5404 262052 48.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## 313 observations deleted due to missingness
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
install_if_not("psych")
## [1] "the package 'psych' is already installed"
psych::describe(diab_pop)
## vars n mean sd median trimmed mad min max
## seqn 1 5719 88671.71 2877.82 88651.0 88662.57 3703.53 83732.0 93702.0
## riagendr* 2 5719 1.52 0.50 2.0 1.52 0.00 1.0 2.0
## ridageyr 3 5719 49.54 17.76 49.0 49.19 22.24 20.0 80.0
## ridreth1* 4 5719 3.04 1.29 3.0 3.05 1.48 1.0 5.0
## dmdeduc2* 5 5714 3.43 1.30 4.0 3.54 1.48 1.0 5.0
## dmdmartl* 6 5716 2.61 1.89 1.0 2.39 0.00 1.0 6.0
## indhhin2* 7 4421 8.47 4.32 8.0 8.59 5.93 1.0 14.0
## bmxbmi 8 5406 29.54 7.08 28.5 28.86 6.38 14.5 67.3
## diq010* 9 5719 1.85 0.35 2.0 1.94 0.00 1.0 2.0
## lbxglu 10 2452 113.61 41.33 103.0 105.08 13.34 21.0 479.0
## range skew kurtosis se
## seqn 9970.0 0.02 -1.20 38.05
## riagendr* 1.0 -0.08 -1.99 0.01
## ridageyr 60.0 0.11 -1.14 0.23
## ridreth1* 4.0 -0.12 -0.96 0.02
## dmdeduc2* 4.0 -0.49 -0.84 0.02
## dmdmartl* 5.0 0.63 -1.26 0.03
## indhhin2* 13.0 -0.03 -1.46 0.06
## bmxbmi 52.8 1.12 2.01 0.10
## diq010* 1.0 -1.99 1.95 0.00
## lbxglu 458.0 3.98 20.33 0.83
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describeBy(diab_pop, diab_pop$diq010, skew=FALSE, ranges=TRUE)
##
## Descriptive statistics by group
## group: Diabetes
## vars n mean sd min max range se
## seqn 1 844 88574.35 2845.20 83732 93689.0 9957.0 97.94
## riagendr* 2 844 1.46 0.50 1 2.0 1.0 0.02
## ridageyr 3 844 61.34 13.35 21 80.0 59.0 0.46
## ridreth1* 4 844 2.87 1.35 1 5.0 4.0 0.05
## dmdeduc2* 5 843 3.09 1.37 1 5.0 4.0 0.05
## dmdmartl* 6 844 2.19 1.65 1 6.0 5.0 0.06
## indhhin2* 7 650 7.24 4.23 1 14.0 13.0 0.17
## bmxbmi 8 789 32.60 7.73 16 67.3 51.3 0.28
## diq010* 9 844 1.00 0.00 1 1.0 0.0 0.00
## lbxglu 10 382 165.99 73.48 21 479.0 458.0 3.76
## ------------------------------------------------------------
## group: No Diabetes
## vars n mean sd min max range se
## seqn 1 4875 88688.57 2883.38 83733.0 93702.0 9969.0 41.30
## riagendr* 2 4875 1.53 0.50 1.0 2.0 1.0 0.01
## ridageyr 3 4875 47.49 17.64 20.0 80.0 60.0 0.25
## ridreth1* 4 4875 3.07 1.28 1.0 5.0 4.0 0.02
## dmdeduc2* 5 4871 3.49 1.28 1.0 5.0 4.0 0.02
## dmdmartl* 6 4872 2.68 1.92 1.0 6.0 5.0 0.03
## indhhin2* 7 3771 8.68 4.30 1.0 14.0 13.0 0.07
## bmxbmi 8 4617 29.02 6.82 14.5 64.6 50.1 0.10
## diq010* 9 4875 2.00 0.00 2.0 2.0 0.0 0.00
## lbxglu 10 2070 103.94 20.72 64.0 397.0 333.0 0.46
detach(package:psych)
\(~\)
\(~\)
GGally::ggpairs
install_if_not('GGally')
## [1] "the package 'GGally' is already installed"
data2 <- diab_pop %>% na.omit()
GGally::ggpairs(data2)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\(~\)
\(~\)
skimr
install_if_not('skimr')
## [1] "the package 'skimr' is already installed"
library('skimr')
skim(diab_pop) %>%
summary()
Name | diab_pop |
Number of rows | 5719 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
factor | 6 |
numeric | 4 |
________________________ | |
Group variables | None |
skim(diab_pop)
Name | diab_pop |
Number of rows | 5719 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
factor | 6 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
riagendr | 0 | 1.00 | FALSE | 2 | Fem: 2972, Mal: 2747 |
ridreth1 | 0 | 1.00 | FALSE | 5 | Non: 1863, Non: 1198, Mex: 995, Oth: 895 |
dmdeduc2 | 5 | 1.00 | FALSE | 5 | Som: 1692, Col: 1422, Hig: 1236, Les: 688 |
dmdmartl | 3 | 1.00 | FALSE | 6 | Mar: 2886, Nev: 1048, Div: 614, Liv: 555 |
indhhin2 | 1298 | 0.77 | FALSE | 12 | $10: 910, $25: 596, $75: 524, $45: 451 |
diq010 | 0 | 1.00 | FALSE | 2 | No : 4875, Dia: 844 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
seqn | 0 | 1.00 | 88671.71 | 2877.82 | 83732.0 | 86170.5 | 88651.0 | 91165.5 | 93702.0 | <U+2587><U+2587><U+2587><U+2587><U+2587> |
ridageyr | 0 | 1.00 | 49.54 | 17.76 | 20.0 | 34.0 | 49.0 | 64.0 | 80.0 | <U+2587><U+2587><U+2587><U+2587><U+2586> |
bmxbmi | 313 | 0.95 | 29.54 | 7.08 | 14.5 | 24.5 | 28.5 | 33.2 | 67.3 | <U+2585><U+2587><U+2582><U+2581><U+2581> |
lbxglu | 3267 | 0.43 | 113.61 | 41.33 | 21.0 | 95.0 | 103.0 | 114.0 | 479.0 | <U+2587><U+2582><U+2581><U+2581><U+2581> |
diab_pop %>%
group_by(diq010) %>%
skim()
Name | Piped data |
Number of rows | 5719 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
factor | 5 |
numeric | 4 |
________________________ | |
Group variables | diq010 |
Variable type: factor
skim_variable | diq010 | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|---|
riagendr | Diabetes | 0 | 1.00 | FALSE | 2 | Mal: 456, Fem: 388 |
riagendr | No Diabetes | 0 | 1.00 | FALSE | 2 | Fem: 2584, Mal: 2291 |
ridreth1 | Diabetes | 0 | 1.00 | FALSE | 5 | Non: 217, Non: 203, Mex: 200, Oth: 119 |
ridreth1 | No Diabetes | 0 | 1.00 | FALSE | 5 | Non: 1646, Non: 995, Mex: 795, Oth: 790 |
dmdeduc2 | Diabetes | 1 | 1.00 | FALSE | 5 | Som: 229, Hig: 185, Les: 161, Col: 145 |
dmdeduc2 | No Diabetes | 4 | 1.00 | FALSE | 5 | Som: 1463, Col: 1277, Hig: 1051, Gra: 553 |
dmdmartl | Diabetes | 0 | 1.00 | FALSE | 6 | Mar: 471, Div: 106, Wid: 100, Nev: 83 |
dmdmartl | No Diabetes | 3 | 1.00 | FALSE | 6 | Mar: 2415, Nev: 965, Div: 508, Liv: 500 |
indhhin2 | Diabetes | 194 | 0.77 | FALSE | 12 | $10: 89, $25: 86, $10: 81, $20: 68 |
indhhin2 | No Diabetes | 1104 | 0.77 | FALSE | 12 | $10: 821, $25: 510, $75: 472, $45: 386 |
Variable type: numeric
skim_variable | diq010 | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|---|
seqn | Diabetes | 0 | 1.00 | 88574.35 | 2845.20 | 83732.0 | 86068.75 | 88610.0 | 90994.5 | 93689.0 | <U+2587><U+2587><U+2587><U+2587><U+2587> |
seqn | No Diabetes | 0 | 1.00 | 88688.57 | 2883.38 | 83733.0 | 86189.50 | 88654.0 | 91193.0 | 93702.0 | <U+2587><U+2587><U+2587><U+2587><U+2587> |
ridageyr | Diabetes | 0 | 1.00 | 61.34 | 13.35 | 21.0 | 53.00 | 63.0 | 71.0 | 80.0 | <U+2581><U+2582><U+2585><U+2587><U+2587> |
ridageyr | No Diabetes | 0 | 1.00 | 47.49 | 17.64 | 20.0 | 32.00 | 46.0 | 61.0 | 80.0 | <U+2587><U+2587><U+2586><U+2586><U+2585> |
bmxbmi | Diabetes | 55 | 0.93 | 32.60 | 7.73 | 16.0 | 27.40 | 31.4 | 36.3 | 67.3 | <U+2582><U+2587><U+2582><U+2581><U+2581> |
bmxbmi | No Diabetes | 258 | 0.95 | 29.02 | 6.82 | 14.5 | 24.10 | 28.0 | 32.5 | 64.6 | <U+2583><U+2587><U+2582><U+2581><U+2581> |
lbxglu | Diabetes | 462 | 0.45 | 165.99 | 73.48 | 21.0 | 117.00 | 147.0 | 192.0 | 479.0 | <U+2583><U+2587><U+2582><U+2581><U+2581> |
lbxglu | No Diabetes | 2805 | 0.42 | 103.94 | 20.72 | 64.0 | 94.00 | 101.0 | 109.0 | 397.0 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
\(~\)
\(~\)
install_if_not('networkD3')
## [1] "the package 'networkD3' is already installed"
library(networkD3)
library(tidyverse)
by_sex <- diab_pop %>%
group_by(riagendr) %>%
summarise(cnt = n_distinct(seqn))
by_sex_by_eth <- diab_pop %>%
group_by(riagendr, ridreth1) %>%
summarise(cnt = n_distinct(seqn))
## `summarise()` has grouped output by 'riagendr'. You can override using the `.groups` argument.
by_sex_by_dm2 <- diab_pop %>%
group_by(riagendr, diq010) %>%
summarise(cnt = n_distinct(seqn))
## `summarise()` has grouped output by 'riagendr'. You can override using the `.groups` argument.
join1 <- by_sex %>%
left_join(by_sex_by_eth,
by= c('riagendr'='riagendr'),
suffix = c("_sex","_eth")) %>%
mutate(diff_sex_eth = cnt_sex - cnt_eth) %>%
mutate(source = riagendr,
target = ridreth1) %>%
mutate(diff = diff_sex_eth) %>%
select(source, target, diff)
join2 <- by_sex %>%
left_join(by_sex_by_dm2,
by= c('riagendr'='riagendr'),
suffix = c("_sex","_dm2")) %>%
mutate(diff_sex_dm2 = cnt_sex - cnt_dm2 )%>%
mutate(source = riagendr,
target = diq010) %>%
mutate(diff = diff_sex_dm2)%>%
select(source, target, diff)
by_dm2 <-diab_pop %>%
group_by(diq010)%>%
summarise(cnt = n_distinct(seqn))
by_dm2_by_eth <-diab_pop %>%
group_by(diq010, ridreth1)%>%
summarise(cnt = n_distinct(seqn))
## `summarise()` has grouped output by 'diq010'. You can override using the `.groups` argument.
join3 <- by_dm2 %>%
left_join(by_dm2_by_eth,
by= c('diq010'='diq010'),
suffix = c("_dm2","_eth")) %>%
mutate(diff_dm2_eth = cnt_dm2 - cnt_eth) %>%
mutate(source = ridreth1,
target = diq010) %>%
mutate(diff = diff_dm2_eth) %>%
select(source, target, diff)
JOIN <- bind_rows(join1, join2, join3)
nodes <- data.frame(
name=c( as.character(JOIN$source),
as.character(JOIN$target))
%>% unique()
)
JOIN$IDsource <- match(JOIN$source, nodes$name)-1
JOIN$IDtarget <- match(JOIN$target, nodes$name)-1
p <- sankeyNetwork(Links = JOIN,
Nodes = nodes,
Source = "IDsource",
Target = "IDtarget",
Value = "diff",
NodeID = "name",
sinksRight=FALSE)
## Links is a tbl_df. Converting to a plain data frame.
p
Sankey Network
resources:\(~\)
\(~\)
\(~\)
\(~\)
# read in the data
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
#### Variable in Data - Definition - Data Type
##### seqn - Respondent sequence number - Identifier
##### riagendr - Gender - Categorical
##### ridageyr - Age in years at screening - Continuous / Numerical
##### ridreth1 - Race/Hispanic origin - Categorical
##### dmdeduc2 - Education level - Adults 20+ - Categorical
##### dmdmartl - Marital status - Categorical
##### indhhin2 - Annual household income - Categorical
##### bmxbmi - Body Mass Index (kg/m**2) - Continuous / Numerical
##### diq010 - Doctor diagnosed diabetes - Categorical / Target
##### lbxglu - Fasting Glucose (mg/dL) - Continuous / Numerical
#structure command
str(diab_pop)
# show data
head(diab_pop,10)
# loaded packages
(.packages())
# which packages are installed
library()$results[,1]
# some additional informaion on the package including the version
tibble::as_tibble(installed.packages())
# install package if missing
install_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
# test function
install_if_not(c("caret","randomForest"))
loaded_package_before <- (.packages())
# everthing below here can call functions in dplyr package
library('dplyr')
glimpse(diab_pop)
(.packages())
loaded_package_after <- (.packages())
setdiff(loaded_package_after, loaded_package_before)
detach(package:dplyr)
#the dplyr package has now been detached. calls to functions may have errors
# glimpse the data
dplyr::glimpse(diab_pop)
# error
glimpse(diab_pop)
tibble::as_tibble(diab_pop)
summary(diab_pop)
by(diab_pop, diab_pop$diq010, summary)
install_if_not("Amelia")
Amelia::missmap(diab_pop)
install_if_not("mice")
install_if_not("VIM")
install_if_not("lattice")
library(mice)
library(VIM)
library(lattice)
#understand the missing value pattern
md.pattern(diab_pop)
#plot the missing values
diab_pop_miss = aggr(diab_pop,
numbers=TRUE,
sortVars=TRUE,
labels=colnames(diab_pop),
cex.axis=.7,
gap=3,
ylab=c("Proportion of missingness","Missingness Pattern"))
#Drawing margin plot
marginplot(diab_pop[, c("lbxglu", "diq010")],
cex.numbers = 1.2,
pch = 19)
install_if_not('sqldf')
# Average age by Diabetes
sqldf::sqldf('select diq010,
avg(ridageyr) as mean_age
from diab_pop
group by diq010
order by diq010')
# Notice we can save our tables with assignments
# Counts by Diabetic and Age
T_sqldf <- sqldf::sqldf('select diq010,
riagendr,
count(*) as cnt
from diab_pop
group by diq010, riagendr
order by cnt desc')
str(T_sqldf)
T_sqldf
library(dplyr)
### Average age by Diabetes
diab_pop %>%
group_by(diq010) %>%
summarise(mean_age = mean(ridageyr)) %>%
arrange(diq010)
### Counts by Diabetic and Age
T_dplyr_dm2_gender <- diab_pop %>%
group_by(diq010,riagendr) %>%
summarise(cnt= n()) %>%
ungroup() %>%
arrange(-cnt)
str(T_dplyr_dm2_gender)
T_dplyr_dm2_gender
install_if_not('arsenal')
my_comparison <- arsenal::comparedf(T_sqldf , T_dplyr_dm2_gender)
my_comparison
summary(my_comparison)
str(my_comparison)
# The table command can give us a table of counts of categorical columns:
table(diab_pop$diq010, diab_pop$riagendr)
table(diab_pop$dmdmartl, diab_pop$riagendr, diab_pop$diq010)
# We can use similar call for the chi square test:
chisq.test(diab_pop$diq010, diab_pop$riagendr)
# We can store the results and then use them:
table.sex_dm2 <- table(diab_pop$riagendr, diab_pop$diq010)
table.sex_dm2
chisq.test(table.sex_dm2)
chisq_results <- chisq.test(diab_pop$diq010, diab_pop$riagendr)
str(chisq_results)
# We can access different portions of the output:
chisq_results$p.value
chisq_results$statistic
chisq_results$method
str(chisq_results)
install_if_not('broom')
tidy_chisq_results <- broom::tidy(chisq_results)
str(tidy_chisq_results)
tidy_chisq_results$p.value
plot(table.sex_dm2)
mosaicplot(table.sex_dm2,
main = "Diabetes by Sex Mosaic Plot",
xlab = "Sex",
ylab = "Diabetes",
las = 1,
border = "chocolate",
shade = TRUE)
install_if_not('ggplot2')
library(ggplot2)
plot <- ggplot(diab_pop, aes(x=factor(riagendr))) +
geom_bar(stat="count") +
facet_wrap(~diq010)
plot
install_if_not('gplots')
gplots::balloonplot(table.sex_dm2,
main ="Balloon Plot for Sex by Diabetes \n Area is proportional to Freq.")
chisq_results$residuals
install_if_not('corrplot')
corrplot::corrplot(chisq_results$residuals, is.cor = FALSE)
install_if_not('vcd')
vcd::assoc(table.sex_dm2, shade = TRUE, las=3)
t.test(diab_pop$ridageyr ~ diab_pop$diq010)
library(ggplot2)
# Basic box plot
p <- ggplot(diab_pop, aes(x=diq010 , y=ridageyr)) +
geom_boxplot()
p
diab_pop.age.NoDM2 <- (diab_pop %>%
filter(diq010 == "No Diabetes"))$ridageyr
diab_pop.age.DM2 <- (diab_pop %>%
filter(diq010 == "Diabetes"))$ridageyr
t.test(diab_pop.age.NoDM2 , diab_pop.age.DM2 )
diab_pop %>%
ggplot( aes(riagendr,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010)))
ttest_reults <- t.test(diab_pop.age.NoDM2 , diab_pop.age.DM2 )
str(ttest_reults)
ttest_reults$p.value
broom::tidy(ttest_reults)
broom::tidy(ttest_reults)$p.value
library(ggplot2)
ggplot(diab_pop, aes(ridageyr, color=diq010) ) +
geom_freqpoly(binwidth = 1) +
labs(title="Age Distribution by Diabetes")
ggplot(diab_pop, aes(riagendr,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010))) +
labs(title="Box Plot of Age by Gender and Diabetes")
ggplot(diab_pop, aes(bmxbmi, fill=diq010) ) +
geom_histogram(binwidth = 1) +
labs(title="BMI Distribution by Diabetes")
ggplot(diab_pop, aes(ridageyr)) +
geom_density(aes(fill=factor(diq010)), alpha=0.8) +
labs(title="Density Plot of Age by Diabetes")
ggplot(diab_pop, aes(x=ridageyr, y=bmxbmi, shape=diq010, color=diq010)) +
geom_point() +
geom_smooth(method=lm , aes(fill=diq010)) +
labs(title="Scatter Plot of Age by BMI by Diabetes")
diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~min(.), ~mean(.), ~max(.)))
# We can tell R to remove NA values from computation:
diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr), list(~min(.,na.rm=TRUE), ~mean(.,na.rm=TRUE), ~max(.,na.rm=TRUE)))
Table_Stats_1 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~min(.,na.rm=TRUE), ~mean(.,na.rm=TRUE), ~max(.,na.rm=TRUE))) %>%
ungroup()
Table_Stats_1
SumNa <- function(col){sum(is.na(col))}
Table_Stats_2 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~SumNa(.))) %>%
rename_at(vars(ridageyr,bmxbmi,lbxglu), list(~paste0(.,"_SumNa"))) %>%
ungroup()
Table_Stats_2
Table_Stats_3 <- Table_Stats_1 %>%
full_join(Table_Stats_2, by=c('diq010','riagendr','age_ntile'))
Table_Stats_3
Table_Stats_3a <- Table_Stats_1 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup()
arsenal::comparedf(Table_Stats_3,Table_Stats_3a)
Table_Stats_3b <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(cnt =~ n() ,
min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
sd = ~ sd(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup()
colnames(Table_Stats_3b)
ncol(Table_Stats_3b)
Table_Stats_3b
Table_Stats_4 <- Table_Stats_3b %>%
tidyr::gather(-diq010,-riagendr,-age_ntile, key="Feature_Type", value="Value")
Table_Stats_4
Table_Stats_5 <- Table_Stats_4 %>%
tidyr::separate(Feature_Type, into=c('Feature','Type'), sep = "_")
Table_Stats_5
Table_Stats_6 <- Table_Stats_5 %>%
tidyr::spread(key='Type',value='Value')
Table_Stats_6
Table_Stats_7 <- Table_Stats_6 %>%
mutate(pct_NA = SumNa/cnt)
Table_Stats_7
options(tibble.width = Inf)
Table_Stats_7
Table_Stats_8 <- Table_Stats_7 %>%
arrange(desc(pct_NA))
Table_Stats_8
library('tidyverse')
Table_Stats_9 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(cnt =~ n() ,
min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
sd = ~ sd(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup() %>%
gather(-diq010,-riagendr,-age_ntile, key="Feature_Type", value="Value") %>%
separate(Feature_Type,into=c('Feature','Type')) %>%
spread(key='Type',value='Value') %>%
mutate(pct_NA = SumNa/cnt) %>%
arrange(desc(pct_NA))
Table_Stats_9
arsenal::comparedf(Table_Stats_8,Table_Stats_9)
diab_pop %>%
group_by(diq010,riagendr) %>%
summarise(cnt = n())
diab_pop.EncodeSex <- diab_pop %>%
mutate(SexMale = ifelse(riagendr == 'Male',1,0)) %>%
mutate(DM2 = ifelse(diq010 == 'Diabetes',1,0))
glimpse(diab_pop.EncodeSex)
diab_pop.EncodeSex %>%
group_by(DM2) %>%
summarise(cnt = sum(SexMale))
diab_pop.NoDm2 <- diab_pop.EncodeSex %>%
filter(diq010 == 'No Diabetes')
diab_pop.Dm2 <- diab_pop.EncodeSex %>%
filter(diq010 == 'Diabetes')
hist(diab_pop.NoDm2$SexMale, col=rgb(1,0,0,0.5), main="Overlapping Histogram", xlab="Sex Male")
hist(diab_pop.Dm2$SexMale, col=rgb(0,0,1,0.5), add=T)
box()
heatmap(as.matrix(diab_pop.EncodeSex[,c('DM2','SexMale')]))
library('caret')
# The return of dummyVars is a mapping which tells R which variables are factors and will need encoding
caret.dummyVars.diab_pop <- dummyVars(" ~ .", data = diab_pop)
str(caret.dummyVars.diab_pop)
# We have to apply the mapping to the set:
predict_dummyVars_diab_pop <- predict(caret.dummyVars.diab_pop, newdata = diab_pop)
str(predict_dummyVars_diab_pop)
# And then we can turn back into a tibble:
encode.diab_pop <- as_tibble(predict_dummyVars_diab_pop)
str(encode.diab_pop)
heatmap(as.matrix(encode.diab_pop[,c('diq010.Diabetes','riagendr.Male','dmdmartl.Married','dmdmartl.Widowed','dmdmartl.Divorced', 'dmdmartl.Separated','dmdmartl.Never married')]))
library('stringr')
diab_pop_s1 <- diab_pop %>%
mutate(dmdmartl = str_replace(dmdmartl ," ","_"))
glimpse(diab_pop_s1)
diab_pop_s1 <- diab_pop %>%
mutate(dmdmartl = str_replace_all(dmdmartl ," ","_")) %>%
mutate(dmdmartl = as.factor(dmdmartl))
glimpse(diab_pop_s1)
diab_pop_f1 <- diab_pop %>%
mutate(dmdmartl = str_replace_all(dmdmartl," ","_")) %>%
mutate(dmdmartl = as.factor(dmdmartl)) %>%
mutate(ridreth1 = str_replace_all(ridreth1," ","_")) %>%
mutate(ridreth1 = as.factor(ridreth1)) %>%
mutate(diq010 = str_replace_all(ridreth1," ","_")) %>%
mutate(diq010 = as.factor(diq010)) %>%
mutate(dmdeduc2 = str_replace_all(dmdeduc2," ","_") ) %>%
mutate(dmdeduc2 = as.factor(dmdeduc2)) %>%
mutate(indhhin2 = str_replace_all(indhhin2," ","_") ) %>%
mutate(indhhin2 = as.factor(indhhin2))
caret.dummyVars.diab_pop_f1 <- dummyVars(" ~ .", data = diab_pop_f1)
predict_dummyVars_diab_pop <- predict(caret.dummyVars.diab_pop_f1, newdata = diab_pop_f1)
encode.diab_pop_f1 <- as_tibble(predict_dummyVars_diab_pop)
glimpse(encode.diab_pop_f1)
my_replace_all_factor_function <- function(my_data_in , the_column){
enquo_the_column <- enquo(the_column)
my_data_out <- my_data_in %>%
mutate( !! enquo_the_column := str_replace_all(!! enquo_the_column," ","_")) %>%
mutate( !! enquo_the_column := as.factor(!! enquo_the_column))
}
result1 <- my_replace_all_factor_function(diab_pop,dmdmartl)
result2 <- my_replace_all_factor_function(result1,ridreth1)
result3 <- my_replace_all_factor_function(result2,diq010)
result4 <- my_replace_all_factor_function(result3,dmdeduc2)
result5 <- my_replace_all_factor_function(result4,indhhin2)
arsenal::comparedf(diab_pop_f1 , result5)
result_f <- my_replace_all_factor_function(diab_pop,dmdmartl) %>%
my_replace_all_factor_function(ridreth1) %>%
my_replace_all_factor_function(diq010) %>%
my_replace_all_factor_function(dmdeduc2) %>%
my_replace_all_factor_function(indhhin2)
glimpse(result_f)
arsenal::comparedf(result5 , result_f)
library('tidyverse')
diab_pop.NoDM2.age <- (diab_pop %>%
filter(diq010 == "No Diabetes"))$ridageyr
diab_pop.DM2.age <- (diab_pop %>%
filter(diq010 == "Diabetes"))$ridageyr
ks.test(diab_pop.NoDM2.age , diab_pop.DM2.age)$p.value
ks_wrapper <- function(df, factor, factor_level, continuous_feature){
F <- NULL
factor <- enquo(factor)
data.local <- df %>%
select(!! factor, continuous_feature)
X <- data.local %>%
filter(!! factor == factor_level)
Y <- data.local %>%
filter(!(!! factor == factor_level))
F$x <- X[,continuous_feature]
F$y <- Y[,continuous_feature]
ks_test_stat_T <- ks.test(F$x,F$y)
ks_test_stat <- ks_test_stat_T$p.value
result_row <- tibble::enframe(ks_test_stat)
result_row <- result_row %>%
mutate(Feature = continuous_feature)
return(result_row)
}
diab_pop_f2 <- diab_pop %>%
mutate(DM2_num = ifelse(diq010 == "Diabetes",1,0))
ks_wrapper(diab_pop_f2, DM2_num,1,"ridageyr")
compute_ks_cols <- c('ridageyr','bmxbmi','lbxglu')
ks_test_results_table <- purrr::map_dfr(compute_ks_cols,
df = diab_pop_f2,
factor = DM2_num,
factor_level = 1,
ks_wrapper)
ks_test_results_table
ks_test_results_table <- ks_test_results_table %>%
rename(ks_test_p_value=value)
ks_test_results_table
means.diab_pop_f2 <- diab_pop_f2 %>%
group_by(DM2_num) %>%
summarise_at(vars(c('ridageyr','bmxbmi','lbxglu')), list(~mean(.,na.rm = TRUE),~sd(.,na.rm = TRUE))) %>%
gather(-DM2_num, key="Feature_Type", value="Value") %>%
separate(Feature_Type, into= c('Feature','Type'), sep='_') %>%
mutate(DM2_stats_char = ifelse(DM2_num == 1,"Diabetes","No_Diabetes")) %>%
mutate(DM2_stats_char_Type = paste0(DM2_stats_char,"_",Type)) %>%
select(-DM2_num,-Type,-DM2_stats_char) %>%
select(Feature, DM2_stats_char_Type, Value) %>%
spread(key='DM2_stats_char_Type', value = 'Value')
means.diab_pop_f2
ks_table_1 <- ks_test_results_table %>%
left_join(means.diab_pop_f2) %>%
select(Feature, Diabetes_mean, Diabetes_sd, ks_test_p_value, No_Diabetes_mean, No_Diabetes_sd)
ks_table_1
install_if_not('kableExtra')
library('kableExtra')
ks_table_style <- kable(ks_table_1) %>%
kable_styling("striped") %>%
add_header_above(c(" " = 1, "Diabetes" = 2, "ks-test pvalue" = 1, "Non Diabetes" = 2))
ks_table_style
ggplot(diab_pop, aes(diq010,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010)))
ggplot(diab_pop, aes(x=ridageyr)) + geom_histogram() + facet_wrap(~diq010)
diab_pop.NoDm2 <- diab_pop %>% filter(diq010 == 'No Diabetes')
diab_pop.Dm2 <- diab_pop %>% filter(diq010 == 'Diabetes')
hist(diab_pop.NoDm2$ridageyr, col=rgb(1,0,0,0.5), main="Overlapping Histograms Diabetics by Age", xlab="Age")
hist(diab_pop.Dm2$ridageyr, col=rgb(0,0,1,0.5), add=T)
legend("topright", c("No Diabetes", "Diabetes"), fill=c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))
box()
install_if_not('tableone')
diab_pop2 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3))
my_table_one <- tableone::CreateTableOne(
vars = c('ridageyr','bmxbmi','lbxglu',"riagendr","age_ntile"),
strata = c("diq010"),
data = diab_pop2,
factorVars = c("riagendr","age_ntile")
)
my_table_one
install_if_not(c('arsenal','knitr','kableExtra'))
library('arsenal')
library('knitr')
library('kableExtra')
table_arsenal <- tableby(diq010 ~ riagendr + dmdmartl, data = diab_pop)
table_arsenal
summary(table_arsenal, title="Diabetes by Sex and Marital Status")
summary.table_arsenal <- summary(table_arsenal, title="Diabetes by Sex and Marital Status",results="asis") %>%
kable() %>%
kable_styling("striped")
summary.table_arsenal
res.aov <- aov(ridageyr ~ diq010, data=diab_pop)
summary(res.aov)
res.mov <- manova(cbind(ridageyr,bmxbmi) ~ diq010, data=diab_pop)
summary(res.mov)
summary.aov(res.mov)
install_if_not("psych")
psych::describe(diab_pop)
library(psych)
describeBy(diab_pop, diab_pop$diq010, skew=FALSE, ranges=TRUE)
detach(package:psych)
install_if_not('GGally')
data2 <- diab_pop %>% na.omit()
GGally::ggpairs(data2)
install_if_not('skimr')
library('skimr')
skim(diab_pop) %>%
summary()
skim(diab_pop)
diab_pop %>%
group_by(diq010) %>%
skim()
install_if_not('networkD3')
library(networkD3)
library(tidyverse)
by_sex <- diab_pop %>%
group_by(riagendr) %>%
summarise(cnt = n_distinct(seqn))
by_sex_by_eth <- diab_pop %>%
group_by(riagendr, ridreth1) %>%
summarise(cnt = n_distinct(seqn))
by_sex_by_dm2 <- diab_pop %>%
group_by(riagendr, diq010) %>%
summarise(cnt = n_distinct(seqn))
join1 <- by_sex %>%
left_join(by_sex_by_eth,
by= c('riagendr'='riagendr'),
suffix = c("_sex","_eth")) %>%
mutate(diff_sex_eth = cnt_sex - cnt_eth) %>%
mutate(source = riagendr,
target = ridreth1) %>%
mutate(diff = diff_sex_eth) %>%
select(source, target, diff)
join2 <- by_sex %>%
left_join(by_sex_by_dm2,
by= c('riagendr'='riagendr'),
suffix = c("_sex","_dm2")) %>%
mutate(diff_sex_dm2 = cnt_sex - cnt_dm2 )%>%
mutate(source = riagendr,
target = diq010) %>%
mutate(diff = diff_sex_dm2)%>%
select(source, target, diff)
by_dm2 <-diab_pop %>%
group_by(diq010)%>%
summarise(cnt = n_distinct(seqn))
by_dm2_by_eth <-diab_pop %>%
group_by(diq010, ridreth1)%>%
summarise(cnt = n_distinct(seqn))
join3 <- by_dm2 %>%
left_join(by_dm2_by_eth,
by= c('diq010'='diq010'),
suffix = c("_dm2","_eth")) %>%
mutate(diff_dm2_eth = cnt_dm2 - cnt_eth) %>%
mutate(source = ridreth1,
target = diq010) %>%
mutate(diff = diff_dm2_eth) %>%
select(source, target, diff)
JOIN <- bind_rows(join1, join2, join3)
nodes <- data.frame(
name=c( as.character(JOIN$source),
as.character(JOIN$target))
%>% unique()
)
JOIN$IDsource <- match(JOIN$source, nodes$name)-1
JOIN$IDtarget <- match(JOIN$target, nodes$name)-1
p <- sankeyNetwork(Links = JOIN,
Nodes = nodes,
Source = "IDsource",
Target = "IDtarget",
Value = "diff",
NodeID = "name",
sinksRight=FALSE)
p