Data Manipulation with actual examples
## Load datasets package full of data!
library(datasets)
## List all the datasets (suppressed due to too much output)
## data(package = "datasets")
## You have already seen vectors.
## Example of a 4-dimensional array (Class x Sex x Age x Survived)
data("Titanic")
Titanic
, , Age = Child, Survived = No
Sex
Class Male Female
1st 0 0
2nd 0 0
3rd 35 17
Crew 0 0
, , Age = Adult, Survived = No
Sex
Class Male Female
1st 118 4
2nd 154 13
3rd 387 89
Crew 670 3
, , Age = Child, Survived = Yes
Sex
Class Male Female
1st 5 1
2nd 11 13
3rd 13 14
Crew 0 0
, , Age = Adult, Survived = Yes
Sex
Class Male Female
1st 57 140
2nd 14 80
3rd 75 76
Crew 192 20
## Fixing the last two dimensions (Age = "Child", Survived = "No") selects the
## first stratum; the fixed dimensions are dropped, leaving a matrix
## (2-dimensional array) of Class by Sex
stratum1 <- Titanic[, , "Child", "No"]
stratum1
Sex
Class Male Female
1st 0 0
2nd 0 0
3rd 35 17
Crew 0 0
## Example of a list: three named elements (cov, center, n.obs)
data("Harman23.cor")
Harman23.cor
$cov
height arm.span forearm lower.leg weight bitro.diameter chest.girth chest.width
height 1.000 0.846 0.805 0.859 0.473 0.398 0.301 0.382
arm.span 0.846 1.000 0.881 0.826 0.376 0.326 0.277 0.415
forearm 0.805 0.881 1.000 0.801 0.380 0.319 0.237 0.345
lower.leg 0.859 0.826 0.801 1.000 0.436 0.329 0.327 0.365
weight 0.473 0.376 0.380 0.436 1.000 0.762 0.730 0.629
bitro.diameter 0.398 0.326 0.319 0.329 0.762 1.000 0.583 0.577
chest.girth 0.301 0.277 0.237 0.327 0.730 0.583 1.000 0.539
chest.width 0.382 0.415 0.345 0.365 0.629 0.577 0.539 1.000
$center
[1] 0 0 0 0 0 0 0 0
$n.obs
[1] 305
## Named list elements can be extracted with the $ operator
## (here: the "center" element of the Harman23.cor list)
Harman23.cor$center
[1] 0 0 0 0 0 0 0 0
## Data frame example: similar to a matrix, but each column (vector) can hold a different data type
## Columns are named and can be accessed with the $ operator
data("esoph")
## Preview only the first 20 rows instead of printing the whole data frame
head(esoph, n = 20L)
agegp alcgp tobgp ncases ncontrols
1 25-34 0-39g/day 0-9g/day 0 40
2 25-34 0-39g/day 10-19 0 10
3 25-34 0-39g/day 20-29 0 6
4 25-34 0-39g/day 30+ 0 5
5 25-34 40-79 0-9g/day 0 27
6 25-34 40-79 10-19 0 7
7 25-34 40-79 20-29 0 4
8 25-34 40-79 30+ 0 7
9 25-34 80-119 0-9g/day 0 2
10 25-34 80-119 10-19 0 1
11 25-34 80-119 30+ 0 2
12 25-34 120+ 0-9g/day 0 1
13 25-34 120+ 10-19 1 1
14 25-34 120+ 20-29 0 1
15 25-34 120+ 30+ 0 2
16 35-44 0-39g/day 0-9g/day 0 60
17 35-44 0-39g/day 10-19 1 14
18 35-44 0-39g/day 20-29 0 7
19 35-44 0-39g/day 30+ 0 8
20 35-44 40-79 0-9g/day 0 35
## Strip the data.frame class from the first 10 rows to expose the underlying
## list of columns (the row.names attribute is left in place)
esoph10 <- unclass(head(esoph, n = 10L))
esoph10
$agegp
[1] 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34
Levels: 25-34 < 35-44 < 45-54 < 55-64 < 65-74 < 75+
$alcgp
[1] 0-39g/day 0-39g/day 0-39g/day 0-39g/day 40-79 40-79 40-79 40-79 80-119 80-119
Levels: 0-39g/day < 40-79 < 80-119 < 120+
$tobgp
[1] 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19
Levels: 0-9g/day < 10-19 < 20-29 < 30+
$ncases
[1] 0 0 0 0 0 0 0 0 0 0
$ncontrols
[1] 40 10 6 5 27 7 4 7 2 1
attr(,"row.names")
[1] 1 2 3 4 5 6 7 8 9 10