Exercise-1

base R approach

# make the random sampling reproducible
set.seed(271)
dta_cashool <- Ecdat::Caschool 

# split the data object by counties
dta_cashool_county <- split(dta_cashool, dta_cashool$county)

# define a function to sample one row from a data frame
sample_one_row <- function(x) {
  x <- as.data.frame(x)
  x[sample(nrow(x), size = 1), ]
}

# apply the function to each element of the list generated by split()
dta_cashool_county_sampled <- lapply(dta_cashool_county, sample_one_row)
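Before unsplitting, note that this list of one-row data frames could equally be combined by row-binding its elements; a minimal sketch (dta_cashool_alt is an illustrative name):

# alternative (sketch): row-bind the sampled rows into one data frame
dta_cashool_alt <- do.call(rbind, dta_cashool_county_sampled)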

# unsplit the result back into a single data frame
dta_cashool_county_sampled <- unsplit(dta_cashool_county_sampled, unique(dta_cashool$county))

# scatter plot
with(dta_cashool_county_sampled, plot(x = mathscr, y = readscr))

tidyverse approach

library(tidyverse)

# make the random sampling reproducible
set.seed(314)

dta_cashool <- Ecdat::Caschool %>% 
  # group the data by county
  group_by(county) %>%
  # sample 1 row from each group
  sample_n(1)
with(dta_cashool, plot(x = mathscr, y = readscr))

Exercise-2

## read data
dta_nlschools <- MASS::nlschools
head(dta_nlschools)
##   lang   IQ class GS SES COMB
## 1   46 15.0   180 29  23    0
## 2   45 14.5   180 29  10    0
## 3   33  9.5   180 29  15    0
## 4   46 11.0   180 29  23    0
## 5   20  8.0   180 29  10    0
## 6   30  9.5   180 29  10    0

base R approach

# define a function to compute the lower bound of a 95% confidence interval
lowerBound <- function(x) {
  # compute t scores for given df
  t <- qt(.975, df = (length(x) - 1))
  # lower bound formula: mean - t * sd / sqrt(n)
  mean(x) - (t * sd(x)) / sqrt(length(x))
}

# define a function to compute the upper bound of a 95% confidence interval
upperBound <- function(x) {
  # compute t scores for given df
  t <- qt(.975, df = (length(x) - 1))
  # upper bound formula: mean + t * sd / sqrt(n)
  mean(x) + (t * sd(x)) / sqrt(length(x))
}
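As a cross-check on the two functions, stats::t.test() computes the same 95% interval; a minimal sketch (ci_check is an illustrative name):

# sanity check (sketch): t.test() returns the same 95% confidence bounds
ci_check <- function(x) t.test(x)$conf.int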

# compute mean by class
dta_nlschools_mean <- aggregate(data = dta_nlschools, 
                            lang ~ class, 
                            FUN = mean)
# rename the variable
colnames(dta_nlschools_mean)[2] <- "language_mean"

# compute lower bound by class
dta_nlschools_lb <- aggregate(data = dta_nlschools, 
                            lang ~ class, 
                            FUN = lowerBound)
# rename the variable
colnames(dta_nlschools_lb)[2] <- "language_lb"

# compute upper bound by class
dta_nlschools_ub <- aggregate(data = dta_nlschools, 
                            lang ~ class, 
                            FUN = upperBound)
# rename the variable
colnames(dta_nlschools_ub)[2] <- "language_ub"

# print numbers with 4 significant digits
options(digits = 4)

# merge three data objects
dta_nlschools_merge1 <- merge(dta_nlschools_mean, dta_nlschools_lb, by = "class")
dta_nlschools_summary <- merge(dta_nlschools_merge1, dta_nlschools_ub, by = "class")

# sort the data object by class number in ascending order
dta_nlschools_summary <- dta_nlschools_summary[order(dta_nlschools_summary$class), ]

# create classid column
dta_nlschools_summary$classid <- 1:133

# drop column class and make classid the first column
dta_nlschools_summary <- dta_nlschools_summary[,c(5, 2:4)]

# print the last 3 rows
tail(dta_nlschools_summary, 3)
##    classid language_mean language_lb language_ub
## 91     131         38.09       34.27       41.91
## 92     132         29.30       19.80       38.80
## 93     133         28.43       21.98       34.88
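The three aggregate() calls and two merges above could also be collapsed into a single call whose summary function returns all three statistics at once; a compact sketch (the three statistics arrive as a matrix column named lang):

# compact alternative (sketch): one aggregate() call for mean, lb, and ub
dta_nlschools_ci <- aggregate(data = dta_nlschools,
                              lang ~ class,
                              FUN = function(x) c(mean = mean(x),
                                                  lb = lowerBound(x),
                                                  ub = upperBound(x)))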

tidyverse approach

dta_nlschools  %>% 
  # group the data by class
  group_by(class) %>% 
  # compute mean within each class
  summarise(language_mean = mean(lang),
            # compute lower bound 
            language_lb = lowerBound(lang),
            # compute upper bound
            language_ub = upperBound(lang)) %>% 
  mutate(classid = 1:133) %>% 
  # remove column class and set classid as the first column
  select(c(5, 2:4)) %>% 
  # print the last 3 rows
  tail(3)
## # A tibble: 3 x 4
##   classid language_mean language_lb language_ub
##     <int>         <dbl>       <dbl>       <dbl>
## 1     131          38.1        34.3        41.9
## 2     132          29.3        19.8        38.8
## 3     133          28.4        22.0        34.9

Exercise-3

library(car)
dta_prestige <- Prestige
head(dta_prestige)
##                     education income women prestige census type
## gov.administrators      13.11  12351 11.16     68.8   1113 prof
## general.managers        12.26  25879  4.02     69.1   1130 prof
## accountants             12.77   9271 15.70     63.4   1171 prof
## purchasing.officers     11.42   8865  9.11     56.8   1175 prof
## chemists                14.62   8403 11.68     73.5   2111 prof
## physicists              15.64  11030  5.13     77.6   2113 prof

base R approach

# grouped summary table
aggregate(data = dta_prestige,
          prestige ~ type,
          FUN = "median")
##   type prestige
## 1   bc     35.9
## 2 prof     68.4
## 3   wc     41.5
# remove rows containing NA values
dta_prestige1 <- na.omit(dta_prestige)

# prestige at or above the within-type median is "high";
# below the median is "low"
# (successive ifelse() calls on the whole column would overwrite
#  one another, so compute the per-type median first)
type_median <- ave(dta_prestige1$prestige, dta_prestige1$type, FUN = median)
dta_prestige1$prestige_level <- ifelse(dta_prestige1$prestige >= type_median, "high", "low")
library(lattice)
xyplot(income ~ education | type,
       data = dta_prestige1)

xyplot(income ~ education | type * prestige_level,
       data = dta_prestige1)

tidyverse approach

dta_prestige %>%  
  na.omit() %>% 
  group_by(type) %>% 
  summarise(prestige_median = median(prestige))
## # A tibble: 3 x 2
##   type  prestige_median
##   <fct>           <dbl>
## 1 bc               35.9
## 2 prof             68.4
## 3 wc               41.5
dta_prestige_summary <- dta_prestige %>% 
  na.omit() %>% 
  group_by(type) %>% 
  # at or above the within-type median is "high"; below is "low"
  mutate(prestige_level = ifelse(prestige >= median(prestige), "high", "low")) %>% 
  select(education, income, prestige_level, type)

head(dta_prestige_summary)
## # A tibble: 6 x 4
## # Groups:   type [1]
##   education income prestige_level type 
##       <dbl>  <int> <chr>          <fct>
## 1      13.1  12351 high           prof 
## 2      12.3  25879 high           prof 
## 3      12.8   9271 low            prof 
## 4      11.4   8865 low            prof 
## 5      14.6   8403 high           prof 
## 6      15.6  11030 high           prof
xyplot(income ~ education | prestige_level,
       data = dta_prestige_summary)

xyplot(income ~ education | type * prestige_level,
       data = dta_prestige_summary)

Exercise-4

nbl_c <- read.table("nobel_countries.txt", header = T)
nbl_w <- read.table("nobel_winners.txt", header = T)

inner_join

# return all rows from x where there are matching values in y, and all columns from x and y.

inner_join(nbl_w, nbl_c)
## Joining, by = "Year"
##                Name Gender Year Country
## 1   Patrick Modiano   Male 2014  France
## 2 Bertrand  Russell   Male 1950      UK
## 3    Kazuo Ishiguro   Male 2017      UK
## 4        Bob  Dylan   Male 2016      US
## 5      Alice  Munro Female 2013  Canada
## 6            Mo Yan   Male 2012   China
inner_join(nbl_c, nbl_w)
## Joining, by = "Year"
##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male

semi_join

# return all rows from x where there are matching values in y, keeping just columns from x.


semi_join(nbl_w, nbl_c)
## Joining, by = "Year"
##                Name Gender Year
## 1   Patrick Modiano   Male 2014
## 2 Bertrand  Russell   Male 1950
## 3    Kazuo Ishiguro   Male 2017
## 4        Bob  Dylan   Male 2016
## 5      Alice  Munro Female 2013
## 6            Mo Yan   Male 2012
semi_join(nbl_c, nbl_w)
## Joining, by = "Year"
##   Country Year
## 1  France 2014
## 2      UK 1950
## 3      UK 2017
## 4      US 2016
## 5  Canada 2013
## 6   China 2012

left_join

# return all rows from x, and all columns from x and y; rows in x with no match in y get NA.
left_join(nbl_w, nbl_c)
## Joining, by = "Year"
##                Name Gender Year Country
## 1   Patrick Modiano   Male 2014  France
## 2 Bertrand  Russell   Male 1950      UK
## 3    Kazuo Ishiguro   Male 2017      UK
## 4        Bob  Dylan   Male 2016      US
## 5      Alice  Munro Female 2013  Canada
## 6            Mo Yan   Male 2012   China
## 7        Pearl Buck Female 1938    <NA>
left_join(nbl_c, nbl_w)
## Joining, by = "Year"
##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>

anti_join

# return all rows from x where there are no matching values in y, keeping just columns from x.
anti_join(nbl_w, nbl_c)
## Joining, by = "Year"
##         Name Gender Year
## 1 Pearl Buck Female 1938
anti_join(nbl_c, nbl_w)
## Joining, by = "Year"
##   Country Year
## 1  Russia 2015
## 2  Sweden 2011

full_join

# return all rows and all columns from both x and y.
# rows without a match are filled with NA
full_join(nbl_w, nbl_c)
## Joining, by = "Year"
##                Name Gender Year Country
## 1   Patrick Modiano   Male 2014  France
## 2 Bertrand  Russell   Male 1950      UK
## 3    Kazuo Ishiguro   Male 2017      UK
## 4        Bob  Dylan   Male 2016      US
## 5      Alice  Munro Female 2013  Canada
## 6            Mo Yan   Male 2012   China
## 7        Pearl Buck Female 1938    <NA>
## 8              <NA>   <NA> 2015  Russia
## 9              <NA>   <NA> 2011  Sweden
full_join(nbl_c, nbl_w)
## Joining, by = "Year"
##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>
## 9    <NA> 1938        Pearl Buck Female
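In all the calls above, dplyr inferred the key from the shared column name and reported it as Joining, by = "Year". The key can also be stated explicitly with the by argument, which silences that message and guards against accidental matches on other shared column names:

# make the join key explicit
full_join(nbl_w, nbl_c, by = "Year")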

Exercise-5

# read data
library(datasets)
fL <- "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
dta_sat <- read.table(fL, row.names=1)
# assign descriptive variable names
names(dta_sat) <- c("Spending", "PTR", "Salary", "PE", "Verbal", "Math", "SAT")
# append each state's Census division (rows follow the alphabetical state order of state.division)
dta_sat$division <- state.division
# keep only the columns relevant to this exercise
dta_sat <- dta_sat[, c("Salary", "SAT", "division")]
# print the first 6 lines
head(dta_sat)
##            Salary  SAT           division
## Alabama     31.14 1029 East South Central
## Alaska      47.95  934            Pacific
## Arizona     32.17  944           Mountain
## Arkansas    28.93 1005 West South Central
## California  41.08  902            Pacific
## Colorado    34.57  980           Mountain
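Assigning state.division directly assumes the rows of dta_sat are the 50 states in the same alphabetical order as R's built-in state.name vector; a quick sanity check (a sketch, assuming the row names are the full state names):

# sanity check (sketch): rows must line up with state.name for the
# division assignment above to be valid
stopifnot(identical(rownames(dta_sat), state.name))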

base R approach

# subset the data object by division
dta_sat_division <- split(dta_sat, dta_sat$division)
# fit a linear model for each division
dta_sat_lm <- lapply(dta_sat_division, function(x) lm(data = x, SAT ~ Salary))
# print the outcome
dta_sat_lm
## $`New England`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##     913.883       -0.207  
## 
## 
## $`Middle Atlantic`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##      709.81         3.91  
## 
## 
## $`South Atlantic`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##      758.23         3.76  
## 
## 
## $`East South Central`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##     1106.41        -2.62  
## 
## 
## $`West South Central`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##      1795.6        -28.2  
## 
## 
## $`East North Central`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##       292.0         18.4  
## 
## 
## $`West North Central`
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##     1114.18        -1.32  
## 
## 
## $Mountain
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##      1419.7        -13.8  
## 
## 
## $Pacific
## 
## Call:
## lm(formula = SAT ~ Salary, data = x)
## 
## Coefficients:
## (Intercept)       Salary  
##     907.404        0.356

Five divisions have negative estimated slopes: New England, East South Central, West South Central, West North Central, and Mountain.
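The signs can also be read off programmatically instead of scanning the printed output; a short sketch:

# extract the Salary slope from each fitted model
slopes <- sapply(dta_sat_lm, function(m) coef(m)[["Salary"]])
# divisions with a negative estimated slope
names(slopes)[slopes < 0]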

Exercise-6

## ------------------------------------------------------------------------

# controls the number of significant digits to print, in this case 3
options(digits=3)
options(width=72) # narrow output 
# read data from url
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")
# load package dplyr
library(dplyr)
# select columns by column names
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)

## ------------------------------------------------------------------------
# print column names
names(newds)
##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"
str(newds[,1:10]) # structure of the first 10 variables
## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables
##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00
## ------------------------------------------------------------------------
# first 3 rows of data frame newds
head(newds, n=3)
##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0
## ------------------------------------------------------------------------
# attach a comment to data object newds; retrieve it with comment() or attributes()
comment(newds) = "HELP baseline dataset"
# print the comment of data object newds
comment(newds)
## [1] "HELP baseline dataset"
# save data object ds to an external file named savedfile
save(ds, file="savedfile")

## ------------------------------------------------------------------------
# write data object ds to an external csv file named ds.csv
write.csv(ds, file="ds.csv")

## ------------------------------------------------------------------------
# load library foreign
library(foreign)
# write two external SAS files for data object newds, named file.dat and file.sas
write.foreign(newds, "file.dat", "file.sas", package="SAS")

## ------------------------------------------------------------------------
# print the first 10 elements of column cesd from data frame newds
with(newds, cesd[1:10])
##  [1] 49 30 39 15 39  6 52 32 50 46
# the same, using head()
with(newds, head(cesd, 10))
##  [1] 49 30 39 15 39  6 52 32 50 46
## ------------------------------------------------------------------------
# print the elements of column cesd in newds that are greater than 56
with(newds, cesd[cesd > 56])
## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
# load dplyr
library(dplyr)

# keep only the rows of newds where cesd is greater than 56,
# then select columns id and cesd to print
filter(newds, cesd > 56) %>% select(id, cesd)
##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57
## ------------------------------------------------------------------------
# sort the data frame by column cesd in an increasing order
# print the first 4 elements of cesd after sorting
with(newds, sort(cesd)[1:4])
## [1] 1 3 3 4
# print the row index of the smallest value in column cesd
with(newds, which.min(cesd))
## [1] 199
## ------------------------------------------------------------------------
# load mosaic
library(mosaic)
# count how many NA values column f1g of newds contains
tally(~ is.na(f1g), data=newds)
## is.na(f1g)
##  TRUE FALSE 
##     1   452
# descriptive statistics for column f1g of newds
favstats(~ f1g, data=newds)
##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1
## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t))
# is.na() yields a logical matrix: TRUE where a value is missing, FALSE otherwise;
# sum each row to count the missing items per respondent
nmisscesd = apply(is.na(cesditems), 1, sum)
# copy cesditems to a new object ncesditems
ncesditems = cesditems
# replace NA values in ncesditems with 0
ncesditems[is.na(cesditems)] = 0
# sum each row of ncesditems and store the result in newcesd
newcesd = apply(ncesditems, 1, sum)
# rescale the observed sum up to all 20 items
# (equivalent to mean imputation for the missing items)
imputemeancesd = 20/(20-nmisscesd)*newcesd
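To see what the rescaling does, a one-line worked example: with 1 of the 20 items missing and an observed sum of 19, the missing item is effectively set to the mean of the observed items.

# worked example (sketch): 1 of 20 items missing, observed sum of 19
20 / (20 - 1) * 19   # imputed total = 20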

## ------------------------------------------------------------------------
# compare the raw sum, the recorded cesd, and the imputed score
# for the rows with missing items
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1
## ----createdrink,message=FALSE-------------------------------------------

library(memisc)
# create a new column named drinkstat in data frame newds
newds = mutate(newds, drinkstat= 
                 # define three categories via conditions on the drinking and sex variables
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))
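For readers without memisc, the same derivation can be written with dplyr::case_when(), which evaluates its conditions in order; a sketch using the illustrative column name drinkstat2 (the TRUE fallback assumes the three categories are exhaustive, as they are here):

# equivalent (sketch) using dplyr::case_when()
newds = mutate(newds, drinkstat2 = case_when(
  i1 == 0 ~ "abstinent",
  (i1 > 0 & i1 <= 1 & i2 <= 3 & female == 1) |
    (i1 > 0 & i1 <= 2 & i2 <= 4 & female == 0) ~ "moderate",
  TRUE ~ "highrisk"))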

## ----echo=FALSE----------------------------------------------------------
library(mosaic)

## ----echo=FALSE----------------------------------------------------------
# detach packages to avoid masking conflicts
detach(package:memisc)
detach(package:MASS)

## ------------------------------------------------------------------------
# select 4 columns in newds and store them in tmpds
tmpds = select(newds, i1, i2, female, drinkstat)
# print rows 365 to 370
tmpds[365:370,]
##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk
## ------------------------------------------------------------------------

# list the women classified as moderate drinkers
filter(tmpds, drinkstat=="moderate" & female==1)
##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate
## ----message=FALSE-------------------------------------------------------
library(gmodels)
# tabulate frequency and proportion for each value of drinkstat
with(tmpds, CrossTable(drinkstat))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
## 
## ------------------------------------------------------------------------
# cross-tabulate drinkstat by female (6 cells),
# showing the frequency and row proportion for each cell
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
## 
## ------------------------------------------------------------------------
# recode the 0/1 indicator female as a labelled factor gender
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
##       gender
## female Male Female
##      0  346      0
##      1    0    107
## ------------------------------------------------------------------------
# sort ds by cesd and i1
# store the sorted data in object newds
newds = arrange(ds, cesd, i1)

# select 3 columns to print
# print the first 5 rows 
newds[1:5, c("cesd", "i1", "id")]
##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95
## ------------------------------------------------------------------------
# subset ds to the rows where female equals 1
females = filter(ds, female==1)
# compute the average cesd for the subset females
with(females, mean(cesd))
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
## ------------------------------------------------------------------------
# compute average cesd for each level of female
with(ds, tapply(cesd, female, mean))
##    0    1 
## 31.6 36.9
# the same, using the mosaic formula interface to mean()
mean(cesd ~ female, data=ds)
##    0    1 
## 31.6 36.9