The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions.
The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.
Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.
options(digits=3)
options(width=72) # narrow output
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)names(newds)
#> [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
#> [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
#> [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
#> [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables
#> 'data.frame': 453 obs. of 10 variables:
#> $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
#> $ female: int 0 0 0 1 0 1 1 0 1 0 ...
#> $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
#> $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
#> $ id : int 1 2 3 4 5 6 7 8 9 10 ...
#> $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
#> $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
#> $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
#> $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
#> $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...summary(newds[,1:10]) # summary of the first 10 variables
#> cesd female i1 i2
#> Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
#> 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
#> Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
#> Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
#> 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
#> Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
#> id treat f1a f1b
#> Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
#> 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
#> Median :233 Median :0.000 Median :2.00 Median :1.00
#> Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
#> 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
#> Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
#> f1c f1d
#> Min. :0.00 Min. :0.00
#> 1st Qu.:1.00 1st Qu.:0.00
#> Median :2.00 Median :1.00
#> Mean :1.92 Mean :1.56
#> 3rd Qu.:3.00 3rd Qu.:3.00
#> Max. :3.00 Max. :3.00head(newds, n=3)
#> cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
#> 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
#> 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
#> 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
#> f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
#> 1 3 0 1 2 2 2 2 3 3 2
#> 2 3 0 0 3 0 0 0 2 0 0
#> 3 1 0 1 3 2 0 0 3 2 0library(mosaic)
#> Loading required package: lattice
#> Loading required package: ggformula
#> Loading required package: ggplot2
#> Loading required package: ggstance
#>
#> Attaching package: 'ggstance'
#> The following objects are masked from 'package:ggplot2':
#>
#> geom_errorbarh, GeomErrorbarh
#>
#> New to ggformula? Try the tutorials:
#> learnr::run_tutorial("introduction", package = "ggformula")
#> learnr::run_tutorial("refining", package = "ggformula")
#> Loading required package: mosaicData
#> Loading required package: Matrix
#> Registered S3 method overwritten by 'mosaic':
#> method from
#> fortify.SpatialPolygonsDataFrame ggplot2
#>
#> The 'mosaic' package masks several functions from core packages in order to add
#> additional features. The original behavior of these functions should not be affected by this.
#>
#> Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
#>
#> Have you tried the ggformula package for your plots?
#>
#> Attaching package: 'mosaic'
#> The following object is masked from 'package:Matrix':
#>
#> mean
#> The following object is masked from 'package:ggplot2':
#>
#> stat
#> The following objects are masked from 'package:dplyr':
#>
#> count, do, tally
#> The following objects are masked from 'package:stats':
#>
#> binom.test, cor, cor.test, cov, fivenum, IQR, median,
#> prop.test, quantile, sd, t.test, var
#> The following objects are masked from 'package:base':
#>
#> max, mean, min, prod, range, sample, sum
tally(~ is.na(f1g), data=newds)
#> is.na(f1g)
#> TRUE FALSE
#> 1 452
favstats(~ f1g, data=newds)
#> min Q1 median Q3 max mean sd n missing
#> 0 1 2 3 3 1.73 1.1 452 1# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t))
nmisscesd = apply(is.na(cesditems), 1, sum)
ncesditems = cesditems
ncesditems[is.na(cesditems)] = 0
newcesd = apply(ncesditems, 1, sum)
imputemeancesd = 20/(20-nmisscesd)*newcesdlibrary(dplyr)
library(memisc)
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
#>
#> Attaching package: 'memisc'
#> The following object is masked from 'package:Matrix':
#>
#> as.array
#> The following object is masked from 'package:ggplot2':
#>
#> syms
#> The following objects are masked from 'package:dplyr':
#>
#> collect, recode, rename, syms
#> The following objects are masked from 'package:stats':
#>
#> contr.sum, contr.treatment, contrasts
#> The following object is masked from 'package:base':
#>
#> as.array
newds = dplyr::mutate(newds, drinkstat=
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0)))library(gmodels)
with(tmpds, CrossTable(drinkstat))
#>
#>
#> Cell Contents
#> |-------------------------|
#> | N |
#> | N / Table Total |
#> |-------------------------|
#>
#>
#> Total Observations in Table: 453
#>
#>
#> | abstinent | moderate | highrisk |
#> |-----------|-----------|-----------|
#> | 68 | 28 | 357 |
#> | 0.150 | 0.062 | 0.788 |
#> |-----------|-----------|-----------|
#>
#>
#>
#> with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
#>
#>
#> Cell Contents
#> |-------------------------|
#> | N |
#> | N / Row Total |
#> |-------------------------|
#>
#>
#> Total Observations in Table: 453
#>
#>
#> | female
#> drinkstat | 0 | 1 | Row Total |
#> -------------|-----------|-----------|-----------|
#> abstinent | 42 | 26 | 68 |
#> | 0.618 | 0.382 | 0.150 |
#> -------------|-----------|-----------|-----------|
#> moderate | 21 | 7 | 28 |
#> | 0.750 | 0.250 | 0.062 |
#> -------------|-----------|-----------|-----------|
#> highrisk | 283 | 74 | 357 |
#> | 0.793 | 0.207 | 0.788 |
#> -------------|-----------|-----------|-----------|
#> Column Total | 346 | 107 | 453 |
#> -------------|-----------|-----------|-----------|
#>
#>