Task: Select at random one school per county in the Caschool{Ecdat} data set and draw a scatter diagram of the average math score mathscr against the average reading score readscr for the sampled data set. Make sure your results are reproducible (i.e., the same random sample is drawn each time).
# load packages
pacman::p_load(Ecdat, dplyr, tidyr)
# read data
data(Caschool, package="Ecdat")
dat1 <- Caschool
# set random seed
set.seed(1234)
# select at random one school per county
dat1random <- group_by(dat1, county) %>%
  sample_n(1)
# load package for the addTextLabels() function (used to place labels)
pacman::p_load("basicPlotteR")
with(dat1random, plot(mathscr, readscr,
xlab = "Average math score",
ylab = "Average reading score",
pch = 20, bty="n", col="lightblue",
axes = FALSE))
axis(1, at = seq(600,750, by=10))
axis(2, at = seq(600,750, by=10))
grid()
# the basic way to add labels:
# with(dat1random, text(mathscr, readscr, labels = county, adj = c(0,0), cex=0.6))
# to avoid overlapping labels, use the function addTextLabels() from {basicPlotteR} instead
# install.packages("remotes")
# remotes::install_github("JosephCrispell/basicPlotteR")
addTextLabels(dat1random$mathscr,
              dat1random$readscr,
              dat1random$county,
              cex.label = .5,
              col.label = "black",
              lty = 2, col.line = rgb(0, 0, 0, 0.5))
lattice::xyplot(readscr ~ mathscr,
                groups = county,
                data = dat1random,
                type = c("p", "g"),
                auto.key = list(columns = 4))
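Because the task asks for a reproducible sample, a quick check (a sketch; the helper object dat1check is introduced only for this comparison) is to reset the seed, redraw the sample, and confirm that the two draws are the same:

set.seed(1234)
dat1check <- group_by(dat1, county) %>%
  sample_n(1)
identical(dat1random, dat1check)   # expected to return TRUE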
Task: Find 133 class-level 95%-confidence intervals for the language test score means of the nlschools{MASS} data set using the tidy approach. The tail end of the data object should look as follows:
classID | language_mean | language_lb | language_ub |
---|---|---|---|
131 | 11.273 | … | … |
132 | 10.550 | … | … |
133 | 10.643 | … | … |
# load package
pacman::p_load(MASS)
# load and input data
data(nlschools, package="MASS")
dat2 <- nlschools
dat2 %>%
  group_by(class) %>%                  # group
  summarize(language_mean = mean(lang),
            language_lb = language_mean - 1.96*sd(lang),
            language_ub = language_mean + 1.96*sd(lang)) %>%   # mean and 95%-confidence limits
  arrange(desc(language_mean)) %>%
  mutate(classID = paste0(1:n())) %>%
  dplyr::select(., c(5, 2, 3, 4)) %>%
  tail(., 3) %>%                       # the last 3 rows
  knitr::kable(digits = 3)
classID | language_mean | language_lb | language_ub |
---|---|---|---|
131 | 23.714 | 11.333 | 36.096 |
132 | 23.500 | 14.129 | 32.871 |
133 | 20.000 | 6.999 | 33.001 |
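Note that the limits above are computed as mean ± 1.96·sd(lang), which describes the spread of individual pupil scores rather than the uncertainty of the class mean. If confidence intervals for the means themselves are wanted, the usual large-sample formula divides the standard deviation by the square root of the class size; a minimal sketch of that variant (only the limit lines change) is:

dat2 %>%
  group_by(class) %>%
  summarize(language_mean = mean(lang),
            language_lb = language_mean - 1.96*sd(lang)/sqrt(n()),
            language_ub = language_mean + 1.96*sd(lang)/sqrt(n()))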
MASS::nlschools |>
  group_by(class) |>
  summarize(c_avg = mean(lang)) |>
  arrange(c_avg)
## # A tibble: 133 x 2
## class c_avg
## <fct> <dbl>
## 1 10380 20
## 2 4780 23.5
## 3 280 23.7
## 4 25880 28.4
## 5 25680 29.3
## 6 17980 30.2
## 7 1082 30.4
## 8 1280 30.9
## 9 1580 30.9
## 10 21480 32.5
## # ... with 123 more rows
Task: Convert the wide form of O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.
# load packages
pacman::p_load(carData, dplyr, readr)
# load and input data
data(OBrienKaiser, package = "carData")
dat3 <- OBrienKaiser
dat3L <- dat3 %>%
  mutate(ID = paste0(1:n())) %>%                       # augment with a subject ID
  rename(Treatment = treatment, Gender = gender) %>%   # rename
  pivot_longer(cols = starts_with(c("p", "f")),        # wide to long
               names_to = "phase",
               values_to = "Score") %>%
  separate(phase, c("Phase", "hour")) %>%              # split, e.g. "pre.1" into "pre" and "1"
  mutate(Hour = parse_number(hour)) %>%
  dplyr::select(-hour) %>%
  .[, c(1, 2, 5, 3, 4, 6)] %>%
  as.data.frame()
head(dat3L)
## Treatment Gender Score ID Phase Hour
## 1 control M 1 1 pre 1
## 2 control M 2 1 pre 2
## 3 control M 4 1 pre 3
## 4 control M 2 1 pre 4
## 5 control M 1 1 pre 5
## 6 control M 3 1 post 1
head(OBrienKaiserLong)
## treatment gender score id phase hour
## 1.1 control M 1 1 pre 1
## 1.2 control M 2 1 pre 2
## 1.3 control M 4 1 pre 3
## 1.4 control M 2 1 pre 4
## 1.5 control M 1 1 pre 5
## 1.6 control M 3 1 post 1
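A quick way to convince yourself that the conversion matches the packaged long form is to compare dimensions and the phase-by-hour layout (a sketch; it does not compare every value):

data(OBrienKaiserLong, package = "carData")
nrow(dat3L) == nrow(OBrienKaiserLong)   # both should equal nrow(OBrienKaiser) * 15
table(dat3L$Phase, dat3L$Hour)          # every phase-hour cell should hold one score per subject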
Information about student majors, participation in intramural sports, and survey responses about satisfaction with the on-campus cafeteria (the variety and quality of the food served, and the hours of operation) is recorded in three separate plain-text files by three different administrative units: majors, sports, and cafeteria.
Task: Input these files into an R session, merge them, and output the result as a comma-separated values (CSV) file.
# input data
majors    <- read.table("student_major.txt", header = TRUE)
sports    <- read.table("student_sport.txt", header = TRUE)
cafeteria <- read.table("student_cafe.txt", header = TRUE)
ms  <- merge(majors, sports, all = TRUE)
msc <- merge(ms, cafeteria, all = TRUE)
head(msc)
## id major sport variety quality hours
## 1 123 Mathematics badminton 3 4 2
## 2 123 Mathematics basketball 3 4 2
## 3 222 Computer Science badminton NA NA NA
## 4 345 Computer Science <NA> 4 4 2
## 5 385 Mathematics volleyball 1 1 2
## 6 385 Mathematics basketball 1 1 2
ms2  <- full_join(majors, sports)
## Joining, by = "id"
msc2 <- full_join(ms2, cafeteria)
## Joining, by = "id"
head(msc2)
## id major sport variety quality hours
## 1 123 Mathematics basketball 3 4 2
## 2 123 Mathematics badminton 3 4 2
## 3 222 Computer Science badminton NA NA NA
## 4 345 Computer Science <NA> 4 4 2
## 5 385 Mathematics basketball 1 1 2
## 6 385 Mathematics volleyball 1 1 2
# save as a csv file
# write.csv(msc, "msc.csv", row.names = FALSE)
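An equivalent tidyverse pipeline (a sketch; it reuses the three data frames read in above and the readr package loaded earlier, with msc_tidy as a hypothetical name) chains the two full joins and writes the result directly:

msc_tidy <- majors %>%
  full_join(sports, by = "id") %>%
  full_join(cafeteria, by = "id")
# readr::write_csv() never writes row names, so no row.names argument is needed
readr::write_csv(msc_tidy, "msc.csv")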
The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit.
Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care.
Eligible subjects were adults who spoke Spanish or English, reported alcohol, heroin, or cocaine as their first or second drug of choice, and either resided in proximity to the primary care clinic to which they would be referred or were homeless.
Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years.
A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions.
Task: The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.
# show the column names
names(newds)
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
# structure of the first 10 variables
str(newds[,1:10])
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
# summary of the first 10 variables
summary(newds[,1:10])
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
# show the top 3 rows of the data
head(newds, n=3)
## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
# attach a descriptive comment to the data frame, then display it
comment(newds) = "HELP baseline dataset"
comment(newds)
## [1] "HELP baseline dataset"
# save the data object in R's native (binary) format
save(ds, file="savedfile")
# save the data as a csv file
write.csv(ds, file="ds.csv")
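By default write.csv() also writes the row names as an unnamed first column; a small variation on the line above suppresses them if a clean file is preferred:

write.csv(ds, file="ds.csv", row.names = FALSE)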
# load package
library(foreign)
# export the data for SAS: writes a text data file plus a SAS code file that reads it
# arguments: df = newds, datafile = "file.dat", codefile = "file.sas"
write.foreign(newds, "file.dat", "file.sas", package="SAS")
# show rows 1 to 10 of the cesd column
with(newds, cesd[1:10])
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10))
## [1] 49 30 39 15 39 6 52 32 50 46
# show the cesd values greater than 56
with(newds, cesd[cesd > 56])
## [1] 57 58 57 60 58 58 57
# show the cesd values greater than 56 together with the id column
dplyr::filter(newds, cesd > 56) %>% dplyr::select(id, cesd)
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
# list the 4 lowest cesd scores
with(newds, sort(cesd)[1:4])
## [1] 1 3 3 4
# which.min: index of the (first) minimum of a numeric (or logical) vector
# row index of the lowest cesd score
with(newds, which.min(cesd))
## [1] 199
library(mosaic)
# count the NA values in the f1g column
tally(~ is.na(f1g), data=newds)
## is.na(f1g)
## TRUE FALSE
## 1 452
# similar to summary(newds$f1g) but with more information (sd, n, number missing)
favstats(~ f1g, data=newds)
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
# reverse-code f1d, f1h, f1l and f1p (these items are scored in the reverse direction)
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
                              (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o,
                              (3 - f1p), f1q, f1r, f1s, f1t))
# number of missing items per subject in cesditems
nmisscesd = apply(is.na(cesditems), 1, sum)
# copy the item matrix and set missing items to 0
ncesditems = cesditems
ncesditems[is.na(cesditems)] = 0
# row sums = total score, with missing items counted as 0
# apply(X, MARGIN, FUN); MARGIN = 1 indicates rows
newcesd = apply(ncesditems, 1, sum)
# prorate for missing items: 20 items in total, so scale by 20 / (20 - number missing)
imputemeancesd = 20/(20-nmisscesd)*newcesd
# list for comparison
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
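As a worked check of the proration, take the first flagged row above: one of the 20 items is missing and the 19 answered items sum to 15, so the prorated total is 20/19 × 15 ≈ 15.8, matching imputemeancesd:

20/19 * 15   # prorated score for a raw sum of 15 with one missing item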
library(memisc)
# add the drinkstat column (abstinent / moderate / highrisk) according to the study definitions
# cases(): distinguishes several cases defined by logical conditions
newds = dplyr::mutate(newds, drinkstat =
          memisc::cases("abstinent" = i1 == 0,
                        "moderate"  = (i1>0 & i1<=1 & i2<=3 & female==1) | (i1>0 & i1<=2 & i2<=4 & female==0),
                        "highrisk"  = ((i1>1 | i2>3) & female==1) | ((i1>2 | i2>4) & female==0)))
library(mosaic)
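# detach memisc and MASS to avoid name clashes with dplyr (e.g., MASS::select masks dplyr::select)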
detach(package:memisc)
detach(package:MASS)
library(dplyr)
# create a new data frame "tmpds" containing i1, i2, female and drinkstat from the newds data frame
tmpds = dplyr::select(newds, i1, i2, female, drinkstat)
# show rows 365 to 370
tmpds[365:370, ]
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
# select female subjects with a moderate drinking status
dplyr::filter(tmpds, drinkstat == "moderate" & female == 1)
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
library(gmodels)
# CrossTable: Cross Tabulation with Tests for Factor Independence
with(tmpds, CrossTable(drinkstat))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
# essentially a two-way contingency table of drinkstat by female
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE,
prop.c=FALSE,
prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
# create gender as a factor with labelled levels
newds = transform(newds,
                  gender = factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
## gender
## female Male Female
## 0 346 0
## 1 0 107
library(dplyr)
# order the rows of the ds data frame by the values of the cesd and i1 columns
newds = arrange(ds, cesd, i1)
newds[1:5, c("cesd", "i1", "id")]
## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
library(dplyr)
# subset the data to female subjects
females = filter(ds, female==1)
# calculate the mean cesd score among females
with(females, mean(cesd))
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
# calculate the mean cesd scores for males and females separately
with(ds, tapply(cesd, female, mean))
## 0 1
## 31.6 36.9
library(mosaic)
mean(cesd ~ female, data=ds)
## 0 1
## 31.6 36.9
Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.