# create reproducible ramdom process
set.seed(271)
dta_cashool <- Ecdat::Caschool
# split the data object by counties
dta_cashool_county <- split(dta_cashool, dta_cashool$county)
# define a function to sample one row in a data frame
sample_one_row <- function(x) {
x <- as.data.frame(x)
x[sample(nrow(x), size = 1), ]
}
# apply the function to each element in the list that generated by split
dta_cashool_county_sampled <- lapply(dta_cashool_county, sample_one_row)
# unsplit the outcome, which gives us a data frame
dta_cashool_county_sampled <- unsplit(dta_cashool_county_sampled, unique(dta_cashool$county))
# scatter plot
with(dta_cashool_county_sampled, plot(x = mathscr, y = readscr))
library(tidyverse)
# create reproducible ramdom process
set.seed(314)
dta_cashool <- Ecdat::Caschool %>%
# group the data by county
group_by(county) %>%
# samole 1 row from each group
sample_n(1)
with(dta_cashool, plot(x = mathscr, y = readscr))
## read data
dta_nlschools <- MASS::nlschools
head(dta_nlschools)
## lang IQ class GS SES COMB
## 1 46 15.0 180 29 23 0
## 2 45 14.5 180 29 10 0
## 3 33 9.5 180 29 15 0
## 4 46 11.0 180 29 23 0
## 5 20 8.0 180 29 10 0
## 6 30 9.5 180 29 10 0
# define a function to compute lower bound of confident interval
lowerBound <- function(x) {
# compute t scores for given df
t <- qt(.975, df = (length(x) - 1))
# lower bound formular
mean(x) - (t * sd(x)) / sqrt(length(x))
}
# define a function to compute lower bound of confident interval
upperBound <- function(x) {
# compute t scores for given df
t <- qt(.975, df = (length(x) - 1))
# upper bound formular
mean(x) + (t * sd(x)) / sqrt(length(x))
}
# compute mean by class
dta_nlschools_mean <- aggregate(data = dta_nlschools,
lang ~ class,
FUN = mean)
# rename the varibale
colnames(dta_nlschools_mean)[2] <- "language_mean"
# compute lower bound by class
dta_nlschools_lb <- aggregate(data = dta_nlschools,
lang ~ class,
FUN = lowerBound)
# rename the varibale
colnames(dta_nlschools_lb)[2] <- "language_lb"
# compute upper bound by class
dta_nlschools_ub <- aggregate(data = dta_nlschools,
lang ~ class,
FUN = upperBound)
# rename the varibale
colnames(dta_nlschools_lb)[2] <- "language_ub"
# set the digits to 4
options(digits = 4)
# merge three data objects
dta_nlschools_merge1 <- merge(dta_nlschools_mean, dta_nlschools_lb, by = "class")
dta_nlschools_summary <- merge(dta_nlschools_merge1, dta_nlschools_ub, by = "class")
# sort the data object by class number in a descending order
dta_nlschools_summary <- dta_nlschools_summary[order(dta_nlschools_summary$class), ]
# create classid column
dta_nlschools_summary$classid <- 1:133
# remove column class and set class id as the first column
dta_nlschools_summary <- dta_nlschools_summary[,c(5, 2:4)]
# print the last 3 rows
tail(dta_nlschools_summary[order(dta_nlschools_summary$class), ], 3)
## classid language_mean language_ub lang
## 91 131 38.09 34.27 41.91
## 92 132 29.30 19.80 38.80
## 93 133 28.43 21.98 34.88
dta_nlschools %>%
# group the data by class
group_by(class) %>%
# compute mean within each class
summarise(language_mean = mean(lang),
# compute lower bound
language_lb = lowerBound(lang),
# compute upper bound
language_ub = upperBound(lang)) %>%
mutate(classid = 1:133) %>%
# remove column class and set classid as the first column
select(c(5, 2:4)) %>%
# print the last 3 rows
tail(3)
## # A tibble: 3 x 4
## classid language_mean language_lb language_ub
## <int> <dbl> <dbl> <dbl>
## 1 131 38.1 34.3 41.9
## 2 132 29.3 19.8 38.8
## 3 133 28.4 22.0 34.9
library(car)
dta_prestige <- Prestige
head(dta_prestige)
## education income women prestige census type
## gov.administrators 13.11 12351 11.16 68.8 1113 prof
## general.managers 12.26 25879 4.02 69.1 1130 prof
## accountants 12.77 9271 15.70 63.4 1171 prof
## purchasing.officers 11.42 8865 9.11 56.8 1175 prof
## chemists 14.62 8403 11.68 73.5 2111 prof
## physicists 15.64 11030 5.13 77.6 2113 prof
# grouped summary table
aggregate(data = dta_prestige,
prestige ~ type,
FUN = "median")
## type prestige
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
# remove rows contain na value
dta_prestige1 <- na.omit(dta_prestige)
# large than median is high
# lower than median is low
dta_prestige1$prestige_level <- ifelse(dta_prestige1$type == "bc" & dta_prestige1$prestige < 35.9, "low", "high")
dta_prestige1$prestige_level <- ifelse(dta_prestige1$type == "prof" & dta_prestige1$prestige < 68.4, "low", "high")
dta_prestige1$prestige_level <- ifelse(dta_prestige1$type == "wc" & dta_prestige1$prestige < 41.5, "low", "high")
library(lattice)
xyplot(income ~ education | type,
data = dta_prestige1)
xyplot(income ~ education | type * prestige_level,
data = dta_prestige1)
dta_prestige %>%
na.omit() %>%
group_by(type) %>%
summarise(prestige_median = median(prestige))
## # A tibble: 3 x 2
## type prestige_median
## <fct> <dbl>
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
dta_prestige_summary <- dta_prestige %>% na.omit() %>% group_by(type) %>% mutate(prestige_level = ifelse(prestige >= median(prestige), "high", "low")) %>% select(education, income, prestige_level, type)
head(dta_prestige_summary)
## # A tibble: 6 x 4
## # Groups: type [1]
## education income prestige_level type
## <dbl> <int> <chr> <fct>
## 1 13.1 12351 high prof
## 2 12.3 25879 high prof
## 3 12.8 9271 low prof
## 4 11.4 8865 low prof
## 5 14.6 8403 high prof
## 6 15.6 11030 high prof
xyplot(income ~ education | prestige_level,
data = dta_prestige_summary)
xyplot(income ~ education | type * prestige_level,
data = dta_prestige_summary)
nbl_c <- read.table("nobel_countries.txt", header = T)
nbl_w <- read.table("nobel_winners.txt", header = T)
# return all rows from x, and all columns from x and y.
inner_join(nbl_w, nbl_c)
## Joining, by = "Year"
## Name Gender Year Country
## 1 Patrick Modiano Male 2014 France
## 2 Bertrand Russell Male 1950 UK
## 3 Kazuo Ishiguro Male 2017 UK
## 4 Bob Dylan Male 2016 US
## 5 Alice Munro Female 2013 Canada
## 6 Mo Yan Male 2012 China
inner_join(nbl_c, nbl_w)
## Joining, by = "Year"
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
# return all rows from x where there are matching values in y, keeping just columns from x.
semi_join(nbl_w, nbl_c)
## Joining, by = "Year"
## Name Gender Year
## 1 Patrick Modiano Male 2014
## 2 Bertrand Russell Male 1950
## 3 Kazuo Ishiguro Male 2017
## 4 Bob Dylan Male 2016
## 5 Alice Munro Female 2013
## 6 Mo Yan Male 2012
semi_join(nbl_c, nbl_w)
## Joining, by = "Year"
## Country Year
## 1 France 2014
## 2 UK 1950
## 3 UK 2017
## 4 US 2016
## 5 Canada 2013
## 6 China 2012
# return all rows from x where there are matching values in y, and all columns from x and y.
left_join(nbl_w, nbl_c)
## Joining, by = "Year"
## Name Gender Year Country
## 1 Patrick Modiano Male 2014 France
## 2 Bertrand Russell Male 1950 UK
## 3 Kazuo Ishiguro Male 2017 UK
## 4 Bob Dylan Male 2016 US
## 5 Alice Munro Female 2013 Canada
## 6 Mo Yan Male 2012 China
## 7 Pearl Buck Female 1938 <NA>
left_join(nbl_c, nbl_w)
## Joining, by = "Year"
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
# return all rows from x where there are not matching values in y, keeping just columns from x.
anti_join(nbl_w, nbl_c)
## Joining, by = "Year"
## Name Gender Year
## 1 Pearl Buck Female 1938
anti_join(nbl_c, nbl_w)
## Joining, by = "Year"
## Country Year
## 1 Russia 2015
## 2 Sweden 2011
# return all rows and all columns from both x and y.
# not matching returns NA
full_join(nbl_w, nbl_c)
## Joining, by = "Year"
## Name Gender Year Country
## 1 Patrick Modiano Male 2014 France
## 2 Bertrand Russell Male 1950 UK
## 3 Kazuo Ishiguro Male 2017 UK
## 4 Bob Dylan Male 2016 US
## 5 Alice Munro Female 2013 Canada
## 6 Mo Yan Male 2012 China
## 7 Pearl Buck Female 1938 <NA>
## 8 <NA> <NA> 2015 Russia
## 9 <NA> <NA> 2011 Sweden
full_join(nbl_c, nbl_w)
## Joining, by = "Year"
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
## 9 <NA> 1938 Pearl Buck Female
# read data
library(datasets)
fL <- "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
dta_sat <- read.table(fL, row.names=1)
# recode variable names
names(dta_sat) <- c("Spending", "PTR", "Salary", "PE", "Verbal", "Math", "SAT")
# merge two data object
dta_sat$division <- state.division
# remove the columns unrelated to this excercise
dta_sat <- dta_sat[, c("Salary", "SAT", "division")]
# print the first 6 lines
head(dta_sat)
## Salary SAT division
## Alabama 31.14 1029 East South Central
## Alaska 47.95 934 Pacific
## Arizona 32.17 944 Mountain
## Arkansas 28.93 1005 West South Central
## California 41.08 902 Pacific
## Colorado 34.57 980 Mountain
# subset the data object by division
dta_sat_division <- split(dta_sat, dta_sat$division)
# bulid a linear model for each division
dta_sat_lm <- lapply(dta_sat_division, function(x) lm(data = x, SAT ~ Salary))
# print the outcome
dta_sat_lm
## $`New England`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 913.883 -0.207
##
##
## $`Middle Atlantic`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 709.81 3.91
##
##
## $`South Atlantic`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 758.23 3.76
##
##
## $`East South Central`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 1106.41 -2.62
##
##
## $`West South Central`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 1795.6 -28.2
##
##
## $`East North Central`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 292.0 18.4
##
##
## $`West North Central`
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 1114.18 -1.32
##
##
## $Mountain
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 1419.7 -13.8
##
##
## $Pacific
##
## Call:
## lm(formula = SAT ~ Salary, data = x)
##
## Coefficients:
## (Intercept) Salary
## 907.404 0.356
New England, East South Central, West South Central, West North Central, Mountain, these 5 divisions’ estimated slopes are negative.
## ------------------------------------------------------------------------
# controls the number of significant digits to print, in this case 3
options(digits=3)
options(width=72) # narrow output
# read data from url
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")
# load package dplyr
library(dplyr)
# select columns by column names
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
## ------------------------------------------------------------------------
# print column names
names(newds)
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
## ------------------------------------------------------------------------
# first 3 rows of data frame newds
head(newds, n=3)
## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
## ------------------------------------------------------------------------
# make a comment in data object newds, you can check it by function comment() and attributes()
comment(newds) = "HELP baseline dataset"
# print the comment of data object newds
comment(newds)
## [1] "HELP baseline dataset"
# write an external representation of data object ds named savedfile
save(ds, file="savedfile")
## ------------------------------------------------------------------------
# write an external csv file for data object ds named ds.csv
write.csv(ds, file="ds.csv")
## ------------------------------------------------------------------------
# load library foreign
library(foreign)
# write two external SAS files for data object ds named file.dat and file.sas
write.foreign(newds, "file.dat", "file.sas", package="SAS")
## ------------------------------------------------------------------------
# print the first 10 elements of column cesd from data frame newds
with(newds, cesd[1:10])
## [1] 49 30 39 15 39 6 52 32 50 46
# print the first 10 elements of column cesd from data frame newds
with(newds, head(cesd, 10))
## [1] 49 30 39 15 39 6 52 32 50 46
## ------------------------------------------------------------------------
# print the elements that have number larger than 56 in column cesd from newds
with(newds, cesd[cesd > 56])
## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
# load dplyr
library(dplyr)
# remove those rows that contains numbers less than 56 in column 56 from newds
# select column id and cesd to print
filter(newds, cesd > 56) %>% select(id, cesd)
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
## ------------------------------------------------------------------------
# sort the data frame by column cesd in an increasing order
# print the first 4 element of cesd after sorting
with(newds, sort(cesd)[1:4])
## [1] 1 3 3 4
# print the number of row that contains the least number in column cesd
with(newds, which.min(cesd))
## [1] 199
## ------------------------------------------------------------------------
# load mosaic
library(mosaic)
# print how many na in column f1g from data object newds
tally(~ is.na(f1g), data=newds)
## is.na(f1g)
## TRUE FALSE
## 1 452
# give you descriptive statistics for column f1g from data newds
favstats(~ f1g, data=newds)
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t))
# create a matrix filled with logical value where TRUE is na value in the original data and FALSE is not na value.
# apply sum() to all the rows of the matrix
nmisscesd = apply(is.na(cesditems), 1, sum)
# create data object ncesditems and store the data from cesditems to it
ncesditems = cesditems
# change na value in ncesditems to 0
ncesditems[is.na(cesditems)] = 0
# apply sum to all the rows in ncesditems and store the output in newcesd
newcesd = apply(ncesditems, 1, sum)
# compute imputemeancesd by the formular
imputemeancesd = 20/(20-nmisscesd)*newcesd
## ------------------------------------------------------------------------
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
## ----createdrink,message=FALSE-------------------------------------------
library(memisc)
# create new column named drinksta in data frame newds
newds = mutate(newds, drinkstat=
# make three categories by some conditional operation performed in other columns
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0)))
## ----echo=FALSE----------------------------------------------------------
library(mosaic)
## ----echo=FALSE----------------------------------------------------------
# detach a database
detach(package:memisc)
detach(package:MASS)
## ------------------------------------------------------------------------
# select 4 columns in newds and store them in tmpds
tmpds = select(newds, i1, i2, female, drinkstat)
# print rows 365 to 370
tmpds[365:370,]
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
## ------------------------------------------------------------------------
filter(tmpds, drinkstat=="moderate" & female==1)
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
## ----message=FALSE-------------------------------------------------------
library(gmodels)
# compute freq and proportion for each value in column drinkstat
with(tmpds, CrossTable(drinkstat))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
## ------------------------------------------------------------------------
# create 6 category by two variables drinkstat and female
# compute freq and proportion for these 6 category
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
## ------------------------------------------------------------------------
newds = transform(newds,
gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
## gender
## female Male Female
## 0 346 0
## 1 0 107
## ------------------------------------------------------------------------
# sort ds by cesd and i1
# store the data in newds object after sorting
newds = arrange(ds, cesd, i1)
# select 3 columns to print
# print the first 5 rows
newds[1:5, c("cesd", "i1", "id")]
## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
## ------------------------------------------------------------------------
# create a subset of ds which only contains rows that have value 1 in column female
females = filter(ds, female==1)
# compute the subset females' average cesd
with(females, mean(cesd))
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
## ------------------------------------------------------------------------
# compute average cesd for each level of female
with(ds, tapply(cesd, female, mean))
## 0 1
## 31.6 36.9
# compute average cesd for each level of female
mean(cesd ~ female, data=ds)
## 0 1
## 31.6 36.9