save()save. My_data <- read.csv("studentgrades.csv", header = TRUE) save(My_data, file="My_data.RData")load()load rm('My_data')
load("My_data.RData")
My_data| StudentID | First | Last | Math | Science | Social.Studies |
|---|---|---|---|---|---|
| 11 | Bob | Smith | 90 | 80 | 67 |
| 12 | Jane | Weary | 75 | NA | 80 |
| 10 | Dan | Thornton | 65 | 75 | 70 |
| 40 | Mary | O’Leary | 90 | 95 | 92 |
x <- runif(20)
y <- list(a = 1, b = TRUE, c = "oops")
save(x, y, file = "xy.RData")
rm('x')
rm('y')
load('xy.RData')
x## [1] 0.42872105 0.60667648 0.04343634 0.77761082 0.49699601 0.58876900
## [7] 0.69749981 0.16832778 0.59985822 0.69661858 0.38214905 0.56301265
## [13] 0.59091740 0.99918827 0.46933250 0.27585323 0.93890536 0.06592883
## [19] 0.14667353 0.47795312
y## $a
## [1] 1
##
## $b
## [1] TRUE
##
## $c
## [1] "oops"
add_square <- function(x, y) return((x + y )^2)
save(add_square, file='myadd_square_function.Rdata')
rm('add_square')
load('myadd_square_function.Rdata')
add_square(1, 2)## [1] 9
add_square(10, 5)## [1] 225
pastepaste. The paste function allows you to concatenate multiple character vectors into a single vector. For more information on paste, please see the help file. ?paste x <- c('a', 'b', 'c', 'd', 'e')
y <- c('A', 'B', 'C', 'D', 'E')
paste(x, y)## [1] "a A" "b B" "c C" "d D" "e E"
sep argument: paste(x, y, sep='_')## [1] "a_A" "b_B" "c_C" "d_D" "e_E"
collapse argument. The value of collapse will be used as the separator in this value: paste(x, y, sep='-', collapse = '_')## [1] "a-A_b-B_c-C_d-D_e-E"
rbind and cbind
Sometimes, we would like to bind together multiple data frames or matrices. We can do this with the rbind and cbind functions.
The cbind function will combine objects by adding columns. Let’s start with the matrix.
A <- matrix(rnorm(12), 4, 3)
B <- matrix(rnorm(8), 4, 2)
A## [,1] [,2] [,3]
## [1,] 0.5750867 1.483409 -0.8729875
## [2,] -1.8740441 -1.966150 0.3645746
## [3,] 0.3842797 1.192968 0.4623019
## [4,] 1.1064848 2.087169 -1.2398803
B## [,1] [,2]
## [1,] -0.9726710 -0.8418112
## [2,] -0.9176559 -0.7047176
## [3,] 1.0523675 -0.4254123
## [4,] -1.0292503 0.3892591
cbind(A, B)## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.5750867 1.483409 -0.8729875 -0.9726710 -0.8418112
## [2,] -1.8740441 -1.966150 0.3645746 -0.9176559 -0.7047176
## [3,] 0.3842797 1.192968 0.4623019 1.0523675 -0.4254123
## [4,] 1.1064848 2.087169 -1.2398803 -1.0292503 0.3892591
Then, for the data frame.
grades <- read.csv("studentgrades.csv", header = TRUE)
grades| StudentID | First | Last | Math | Science | Social.Studies |
|---|---|---|---|---|---|
| 11 | Bob | Smith | 90 | 80 | 67 |
| 12 | Jane | Weary | 75 | NA | 80 |
| 10 | Dan | Thornton | 65 | 75 | 70 |
| 40 | Mary | O’Leary | 90 | 95 | 92 |
Now, let’s create a new data frame with two more columns
Chinese <- c(73, 82, 90, 68)
English <- c(89, 97, 82, 86)
more.cols <- data.frame(Chinese, English)
more.cols| Chinese | English |
|---|---|
| 73 | 89 |
| 82 | 97 |
| 90 | 82 |
| 68 | 86 |
Finally, let’s put together these two data frames:
Grades <- cbind(grades, more.cols)
Grades| StudentID | First | Last | Math | Science | Social.Studies | Chinese | English |
|---|---|---|---|---|---|---|---|
| 11 | Bob | Smith | 90 | 80 | 67 | 73 | 89 |
| 12 | Jane | Weary | 75 | NA | 80 | 82 | 97 |
| 10 | Dan | Thornton | 65 | 75 | 70 | 90 | 82 |
| 40 | Mary | O’Leary | 90 | 95 | 92 | 68 | 86 |
The rbind function will combine objects by adding rows. We can picture this as combining two tables vertically. For the matrix,
A <- matrix(rnorm(16), 4, 4)
B <- matrix(rnorm(12), 3, 4)
A## [,1] [,2] [,3] [,4]
## [1,] -0.6800206 0.3129961 -1.6470310 0.4876048
## [2,] -1.3049666 0.4789635 -0.6016342 -0.4401903
## [3,] -1.6451458 0.6233991 0.3725783 -1.7780730
## [4,] -1.1572684 1.0998613 0.3528149 0.2917322
B## [,1] [,2] [,3] [,4]
## [1,] 1.5221356 0.5455718 -0.2560096 -1.5297287
## [2,] -0.5258189 -1.5839013 -0.6406959 -1.6194910
## [3,] -0.1384319 0.9207622 -2.4610803 -0.4031037
rbind(A, B)## [,1] [,2] [,3] [,4]
## [1,] -0.6800206 0.3129961 -1.6470310 0.4876048
## [2,] -1.3049666 0.4789635 -0.6016342 -0.4401903
## [3,] -1.6451458 0.6233991 0.3725783 -1.7780730
## [4,] -1.1572684 1.0998613 0.3528149 0.2917322
## [5,] 1.5221356 0.5455718 -0.2560096 -1.5297287
## [6,] -0.5258189 -1.5839013 -0.6406959 -1.6194910
## [7,] -0.1384319 0.9207622 -2.4610803 -0.4031037
For the data frame, let’s create a new data frame with two more rows
StudentID <- c(43, 52)
First <- c('Ming', 'Qiang')
Last <- c('Li', 'Zhang')
Math <- c(93, 87)
Science <- c(84, 93)
Social.Studies <- c(71, 88)
Chinese <- c(98, 96)
English <- c(73, 80)
more.rows <- data.frame(StudentID, First, Last, Math, Science,
Social.Studies, Chinese, English)
more.rows| StudentID | First | Last | Math | Science | Social.Studies | Chinese | English |
|---|---|---|---|---|---|---|---|
| 43 | Ming | Li | 93 | 84 | 71 | 98 | 73 |
| 52 | Qiang | Zhang | 87 | 93 | 88 | 96 | 80 |
rbind(Grades, more.rows)| StudentID | First | Last | Math | Science | Social.Studies | Chinese | English |
|---|---|---|---|---|---|---|---|
| 11 | Bob | Smith | 90 | 80 | 67 | 73 | 89 |
| 12 | Jane | Weary | 75 | NA | 80 | 82 | 97 |
| 10 | Dan | Thornton | 65 | 75 | 70 | 90 | 82 |
| 40 | Mary | O’Leary | 90 | 95 | 92 | 68 | 86 |
| 43 | Ming | Li | 93 | 84 | 71 | 98 | 73 |
| 52 | Qiang | Zhang | 87 | 93 | 88 | 96 | 80 |
mergemerge command. More information can be found by ?mergeFor example
x <- data.frame(k1 = c(NA,NA,3,7,9), k2 = c(1,2,3,4,11))
y <- data.frame(k2 = c(1,2,3,4,12), k3 = c(NA,4,NA,6,8))
x| k1 | k2 |
|---|---|
| NA | 1 |
| NA | 2 |
| 3 | 3 |
| 7 | 4 |
| 9 | 11 |
y| k2 | k3 |
|---|---|
| 1 | NA |
| 2 | 4 |
| 3 | NA |
| 4 | 6 |
| 12 | 8 |
merge(x, y)| k2 | k1 | k3 |
|---|---|---|
| 1 | NA | NA |
| 2 | NA | 4 |
| 3 | 3 | NA |
| 4 | 7 | 6 |
merge(x, y, all.x = T)| k2 | k1 | k3 |
|---|---|---|
| 1 | NA | NA |
| 2 | NA | 4 |
| 3 | 3 | NA |
| 4 | 7 | 6 |
| 11 | 9 | NA |
merge(x, y, all.y = T)| k2 | k1 | k3 |
|---|---|---|
| 1 | NA | NA |
| 2 | NA | 4 |
| 3 | 3 | NA |
| 4 | 7 | 6 |
| 12 | NA | 8 |
merge(x, y, all = T)| k2 | k1 | k3 |
|---|---|---|
| 1 | NA | NA |
| 2 | NA | 4 |
| 3 | 3 | NA |
| 4 | 7 | 6 |
| 11 | 9 | NA |
| 12 | NA | 8 |
x <- data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5)
y <- data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5)
x| k1 | k2 | data |
|---|---|---|
| NA | 1 | 1 |
| NA | NA | 2 |
| 3 | NA | 3 |
| 4 | 4 | 4 |
| 5 | 5 | 5 |
y| k1 | k2 | data |
|---|---|---|
| NA | NA | 1 |
| 2 | NA | 2 |
| NA | 3 | 3 |
| 4 | 4 | 4 |
| 5 | 5 | 5 |
merge(x, y, by = c("k1","k2")) # NA's match| k1 | k2 | data.x | data.y |
|---|---|---|---|
| 4 | 4 | 4 | 4 |
| 5 | 5 | 5 | 5 |
| NA | NA | 2 | 1 |
merge(x, y, by = "k1") # NA's match, so 6 rows| k1 | k2.x | data.x | k2.y | data.y |
|---|---|---|---|---|
| 4 | 4 | 4 | 4 | 4 |
| 5 | 5 | 5 | 5 | 5 |
| NA | 1 | 1 | NA | 1 |
| NA | 1 | 1 | 3 | 3 |
| NA | NA | 2 | NA | 1 |
| NA | NA | 2 | 3 | 3 |
merge(x, y, by = "k2") # NA's match, so 6 rows| k2 | k1.x | data.x | k1.y | data.y |
|---|---|---|---|---|
| 4 | 4 | 4 | 4 | 4 |
| 5 | 5 | 5 | 5 | 5 |
| NA | NA | 2 | NA | 1 |
| NA | NA | 2 | 2 | 2 |
| NA | 3 | 3 | NA | 1 |
| NA | 3 | 3 | 2 | 2 |
merge(x, y, by = "k2", incomparables = NA) # 2 rows| k2 | k1.x | data.x | k1.y | data.y |
|---|---|---|---|---|
| 4 | 4 | 4 | 4 | 4 |
| 5 | 5 | 5 | 5 | 5 |
Whereas mathematical and statistical functions operate on numerical data, character functions extract information from textual data or reformat textual data for printing and reporting. For example, we may want to concatenate a person’s first name and last name, ensuring that the first letter of each is capitalized. Here are some of the most useful character functions.
nchar(x) :
nchar('123456abc')## [1] 9
x <- c("ab", "cde", "fghij")
length(x)## [1] 3
nchar(x[3])## [1] 5
substr(x, start, stop):
x <- "abcdef"
substr(x, 2, 4)## [1] "bcd"
substr(x, 2, 4) <- "22222"
x## [1] "a222ef"
grep(pattern, x, ignore.case=FALSE, fixed=FALSE):
grep("A", c("b","A","c"), fixed=TRUE)## [1] 2
#letters
grep("[a-z]", letters)## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
sub(pattern, replacement, x, ignore.case=FALSE, fixed=FALSE):
sub("\\s",".","Hello There")## [1] "Hello.There"
sub(" ",".","Hello There")## [1] "Hello.There"
sub("."," ",sub("\\s",".","Hello There"),fixed = TRUE)## [1] "Hello There"
gsub("\\s",".","Hello World There")## [1] "Hello.World.There"
gsub(" ",".","Hello World There")## [1] "Hello.World.There"
gsub("."," ",gsub("\\s",".","Hello Word There"),fixed = TRUE)## [1] "Hello Word There"
- **NOTE** that \s is the regular expression for whitespace; in an R string, write
“\\s” instead, because “\” is R’s escape character.
strsplit(x, split, fixed=FALSE):
Split the elements of character vector x at split. If fixed=FALSE, then split is a regular expression. If fixed=TRUE, then split is a text string.
strsplit("abc", "")## [[1]]
## [1] "a" "b" "c"
chartr(old, new, x):
chartr translates each character in x that is specified in old to the corresponding character specified in new. Ranges are supported in the specifications, but character classes and repeated characters are not. If old contains more characters than new, an error is signaled; if it contains fewer characters, the extra characters at the end of new are ignored.
x <- "Hello World there"
chartr("the", " He", x)## [1] "Hello World Here"
chartr("Th", "th", x)## [1] "Hello World there"
chartr("th", "Th", x)## [1] "Hello World There"
toupper(x) :
toupper("abc")## [1] "ABC"
tolower(x):
tolower("ABC")## [1] "abc"
| Function | Purpose |
|---|---|
| as.factor | coerces its argument to a factor. |
| as.numeric | attempts to turn its argument into numeric. |
| as.matrix | attempts to turn its argument into a matrix. |
| as.data.frame | coerce it into a data.frame. |
x <- c(1, 1, 2, 2, 2, 3, 3)
class(x)## [1] "numeric"
y <- as.factor(x)
class(y)## [1] "factor"
z <- as.numeric(y)
z## [1] 1 1 2 2 2 3 3
class(z)## [1] "numeric"
Chinese <- c(73, 82, 90, 68)
English <- c(89, 97, 82, 86)
my_data_1 <- data.frame(Chinese, English)
class(my_data_1)## [1] "data.frame"
my_data_1| Chinese | English |
|---|---|
| 73 | 89 |
| 82 | 97 |
| 90 | 82 |
| 68 | 86 |
my_data_2 <- as.matrix(my_data_1)
class(my_data_2)## [1] "matrix"
my_data_2## Chinese English
## [1,] 73 89
## [2,] 82 97
## [3,] 90 82
## [4,] 68 86
my_data_3 <- as.data.frame(my_data_2)
class(my_data_3)## [1] "data.frame"
my_data_3| Chinese | English |
|---|---|
| 73 | 89 |
| 82 | 97 |
| 90 | 82 |
| 68 | 86 |
transformtransform istransform(`_data`, ...)Notice that there aren’t any named arguments for this function. To use transform, we specify a data frame (as the first argument) and a set of expressions that use variables within the data frame. The transform function applies each expression to the data frame and then returns the final data frame.
head(airquality)| Ozone | Solar.R | Wind | Temp | Month | Day |
|---|---|---|---|---|---|
| 41 | 190 | 7.4 | 67 | 5 | 1 |
| 36 | 118 | 8.0 | 72 | 5 | 2 |
| 12 | 149 | 12.6 | 74 | 5 | 3 |
| 18 | 313 | 11.5 | 62 | 5 | 4 |
| NA | NA | 14.3 | 56 | 5 | 5 |
| 28 | NA | 14.9 | 66 | 5 | 6 |
NEW <- transform(airquality, new = -Ozone, Temp = (Temp-32)/1.8)
head(NEW)| Ozone | Solar.R | Wind | Temp | Month | Day | new |
|---|---|---|---|---|---|---|
| 41 | 190 | 7.4 | 19.44444 | 5 | 1 | -41 |
| 36 | 118 | 8.0 | 22.22222 | 5 | 2 | -36 |
| 12 | 149 | 12.6 | 23.33333 | 5 | 3 | -12 |
| 18 | 313 | 11.5 | 16.66667 | 5 | 4 | -18 |
| NA | NA | 14.3 | 13.33333 | 5 | 5 | NA |
| 28 | NA | 14.9 | 18.88889 | 5 | 6 | -28 |
apply function:
apply(X, MARGIN, FUN, ...)
Apply accepts three arguments: X is the array to which a function is applied, FUN is the function, and MARGIN specifies the dimensions to which we would like to apply a function. (Optionally, we can specify arguments to FUN as additional arguments to apply.) Here’s a simple example to show how this works.
x <- rnorm(20)
dim(x) <- c(5,4)
x## [,1] [,2] [,3] [,4]
## [1,] -0.1722572 0.1283156 0.2593631 0.38021267
## [2,] 0.7606900 -0.3660551 0.7553124 -1.38121944
## [3,] 0.2686112 -0.6668906 -1.8518024 0.65494485
## [4,] 1.1949534 -1.2119232 0.4495720 0.09011726
## [5,] 2.0180621 -0.6925227 0.2752041 1.84210159
apply(X = x , MARGIN = 1, FUN = max)## [1] 0.3802127 0.7606900 0.6549448 1.1949534 2.0180621
apply(X = x , MARGIN = 2, FUN = max)## [1] 2.0180621 0.1283156 0.7553124 1.8421016
apply(x , 1, function(x) sum(x)/length(x) )## [1] 0.14890854 -0.05781802 -0.39878423 0.13067987 0.86071127
apply(x , 2, function(x) sum(x)/length(x) )## [1] 0.81401191 -0.56181520 -0.02247016 0.31723139
apply(x , 1, var)## [1] 0.05642443 1.05917931 1.24630146 1.01286624 1.68598078
apply(x , 2, var)## [1] 0.7169713 0.2413421 1.0855665 1.3448906
apply(x , 1, function(x) sum((x-mean(x))^2)/(length(x)-1) )## [1] 0.05642443 1.05917931 1.24630146 1.01286624 1.68598078
apply(x , 2, function(x) sum((x-mean(x))^2)/(length(x)-1) )## [1] 0.7169713 0.2413421 1.0855665 1.3448906
Consider the following three-dimensional array:
x <- 1:27
dim(x) <- c(3,3,3)
x## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 10 13 16
## [2,] 11 14 17
## [3,] 12 15 18
##
## , , 3
##
## [,1] [,2] [,3]
## [1,] 19 22 25
## [2,] 20 23 26
## [3,] 21 24 27
apply(X=x, MARGIN=1, FUN=paste,collapse=",")## [1] "1,4,7,10,13,16,19,22,25" "2,5,8,11,14,17,20,23,26"
## [3] "3,6,9,12,15,18,21,24,27"
apply(X=x, MARGIN=2, FUN=paste,collapse=",")## [1] "1,2,3,10,11,12,19,20,21" "4,5,6,13,14,15,22,23,24"
## [3] "7,8,9,16,17,18,25,26,27"
apply(X=x, MARGIN=3, FUN=paste,collapse=",")## [1] "1,2,3,4,5,6,7,8,9" "10,11,12,13,14,15,16,17,18"
## [3] "19,20,21,22,23,24,25,26,27"
colSums, rowSums, colMeans and rowMeans. These functions are equivalent to use of apply with FUN = mean or FUN = sum with appropriate margins. x <- rnorm(20)
dim(x) <- c(5,4)
apply(x , 1, sum)## [1] 0.4162867 2.8209556 1.0353302 1.3544686 -0.5727775
rowSums(x)## [1] 0.4162867 2.8209556 1.0353302 1.3544686 -0.5727775
apply(x , 2, sum)## [1] 2.7959838 2.7201720 0.8091082 -1.2710004
colSums(x)## [1] 2.7959838 2.7201720 0.8091082 -1.2710004
apply(x , 1, mean)## [1] 0.1040717 0.7052389 0.2588325 0.3386172 -0.1431944
rowMeans(x)## [1] 0.1040717 0.7052389 0.2588325 0.3386172 -0.1431944
apply(x , 2, mean)## [1] 0.5591968 0.5440344 0.1618216 -0.2542001
colMeans(x)## [1] 0.5591968 0.5440344 0.1618216 -0.2542001
lapply. The function lapply requires two arguments: an object X and a function FUN. (We may specify additional arguments that will be passed to FUN.) Let’s look at a simple example of how to use lapply:
x <- as.list(1:5)
lapply(x, function(x) 2^x)## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 8
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 32
d <- data.frame(x=1:5, y=6:10)
d| x | y |
|---|---|
| 1 | 6 |
| 2 | 7 |
| 3 | 8 |
| 4 | 9 |
| 5 | 10 |
lapply(d, function(x) 2^x)## $x
## [1] 2 4 8 16 32
##
## $y
## [1] 64 128 256 512 1024
lapply(d, max)## $x
## [1] 5
##
## $y
## [1] 10
lapply(d, mean)## $x
## [1] 3
##
## $y
## [1] 8
sapply function. This function works exactly the same way as lapply, except that it returns a vector or matrix (when appropriate):
d <- data.frame(x=1:5, y=6:10)
sapply(d, function(x) 2^x)## x y
## [1,] 2 64
## [2,] 4 128
## [3,] 8 256
## [4,] 16 512
## [5,] 32 1024
d <- data.frame(x=1:5, y=6:10)
sapply(d, mean)## x y
## 3 8
vapply. vapply is similar to sapply, but has a pre-specified type of return value, so it can be safer (and sometimes faster) to use. i39 <- sapply(3:9, seq)
i39## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 1 2 3 4
##
## [[3]]
## [1] 1 2 3 4 5
##
## [[4]]
## [1] 1 2 3 4 5 6
##
## [[5]]
## [1] 1 2 3 4 5 6 7
##
## [[6]]
## [1] 1 2 3 4 5 6 7 8
##
## [[7]]
## [1] 1 2 3 4 5 6 7 8 9
sapply(i39, fivenum)## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.0 1.0 1 1.0 1.0 1.0 1
## [2,] 1.5 1.5 2 2.0 2.5 2.5 3
## [3,] 2.0 2.5 3 3.5 4.0 4.5 5
## [4,] 2.5 3.5 4 5.0 5.5 6.5 7
## [5,] 3.0 4.0 5 6.0 7.0 8.0 9
vapply(i39, fivenum,
c(Min. = 0, "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## Min. 1.0 1.0 1 1.0 1.0 1.0 1
## 1st Qu. 1.5 1.5 2 2.0 2.5 2.5 3
## Median 2.0 2.5 3 3.5 4.0 4.5 5
## 3rd Qu. 2.5 3.5 4 5.0 5.5 6.5 7
## Max. 3.0 4.0 5 6.0 7.0 8.0 9
x <- data.frame(cbind(x1=3, x2=c(2:1,4:5)))
x| x1 | x2 |
|---|---|
| 3 | 2 |
| 3 | 1 |
| 3 | 4 |
| 3 | 5 |
sapply(x, cumsum)## x1 x2
## [1,] 3 2
## [2,] 6 3
## [3,] 9 7
## [4,] 12 12
vapply(x,cumsum,FUN.VALUE=c('a'=0,'b'=0,'c'=0,'d'=0))## x1 x2
## a 3 2
## b 6 3
## c 9 7
## d 12 12
mapply, the “multivariate” version of sapply:
mapply(FUN, ..., MoreArgs = , SIMPLIFY = , USE.NAMES = )
For more information, see
?mapply mapply(rep, 1:4, 4:1)## [[1]]
## [1] 1 1 1 1
##
## [[2]]
## [1] 2 2 2
##
## [[3]]
## [1] 3 3
##
## [[4]]
## [1] 4
mapply(rep, times = 1:4, x = 4:1)## [[1]]
## [1] 4
##
## [[2]]
## [1] 3 3
##
## [[3]]
## [1] 2 2 2
##
## [[4]]
## [1] 1 1 1 1
mapply(rep, times = 1:4, MoreArgs = list(x = 42))## [[1]]
## [1] 42
##
## [[2]]
## [1] 42 42
##
## [[3]]
## [1] 42 42 42
##
## [[4]]
## [1] 42 42 42 42
mapply(function(x, y) seq_len(x) + y,
c(a = 1, b = 2, c = 3),
c(A = 10, B = 0, C = -10))## $a
## [1] 11
##
## $b
## [1] 1 2
##
## $c
## [1] -9 -8 -7
mapply(paste, c(1,2,3,4,5), c("a","b","c","d","e"),
c("A","B","C","D","E"), MoreArgs=list(sep="-"))## [1] "1-a-A" "2-b-B" "3-c-C" "4-d-D" "5-e-E"
NOTE: In the following subsection, in our example, we will use the MLB Batting Data, 2008 Season data.
library(nutshell)## Loading required package: nutshell.bbdb
## Loading required package: nutshell.audioscrobbler
data(batting.2008)
dim(batting.2008)## [1] 1384 32
summary(batting.2008)## nameLast nameFirst weight height
## Length:1384 Length:1384 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.:182.0 1st Qu.:72.00
## Mode :character Mode :character Median :195.0 Median :74.00
## Mean :197.4 Mean :73.59
## 3rd Qu.:210.0 3rd Qu.:75.00
## Max. :280.0 Max. :83.00
## bats throws debut birthYear
## Length:1384 Length:1384 Length:1384 Min. :1962
## Class :character Class :character Class :character 1st Qu.:1976
## Mode :character Mode :character Mode :character Median :1980
## Mean :1979
## 3rd Qu.:1982
## Max. :1988
## playerID yearID stint teamID
## Length:1384 Min. :2008 Min. :1.000 Length:1384
## Class :character 1st Qu.:2008 1st Qu.:1.000 Class :character
## Mode :character Median :2008 Median :1.000 Mode :character
## Mean :2008 Mean :1.068
## 3rd Qu.:2008 3rd Qu.:1.000
## Max. :2008 Max. :3.000
## lgID G G_batting AB
## Length:1384 Min. : 1.00 Min. : 0.00 Min. : 0.0
## Class :character 1st Qu.: 13.00 1st Qu.: 4.00 1st Qu.: 0.0
## Mode :character Median : 33.00 Median : 24.00 Median : 16.0
## Mean : 50.26 Mean : 44.12 Mean :120.5
## 3rd Qu.: 76.00 3rd Qu.: 74.00 3rd Qu.:182.2
## Max. :163.00 Max. :163.00 Max. :688.0
## R H 2B 3B
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.00 Median : 2.00 Median : 0.000 Median : 0.0000
## Mean : 16.32 Mean : 31.77 Mean : 6.513 Mean : 0.6402
## 3rd Qu.: 22.00 3rd Qu.: 45.00 3rd Qu.: 9.000 3rd Qu.: 0.0000
## Max. :125.00 Max. :213.00 Max. :54.000 Max. :19.0000
## HR RBI SB CS
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 1.00 Median : 0.000 Median : 0.0000
## Mean : 3.525 Mean : 15.56 Mean : 2.022 Mean : 0.7478
## 3rd Qu.: 3.000 3rd Qu.: 20.00 3rd Qu.: 1.000 3rd Qu.: 1.0000
## Max. :48.000 Max. :146.00 Max. :68.000 Max. :16.0000
## BB SO IBB HBP
## Min. : 0.0 Min. : 0.00 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 1.0 Median : 5.00 Median : 0.0000 Median : 0.000
## Mean : 11.8 Mean : 23.76 Mean : 0.9465 Mean : 1.208
## 3rd Qu.: 16.0 3rd Qu.: 34.00 3rd Qu.: 1.0000 3rd Qu.: 1.000
## Max. :111.0 Max. :204.00 Max. :34.0000 Max. :27.000
## SH SF GIDP G_old
## Min. : 0.000 Min. : 0.0000 Min. : 0.000 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 6.00
## Median : 0.000 Median : 0.0000 Median : 0.000 Median : 25.00
## Mean : 1.103 Mean : 0.9863 Mean : 2.806 Mean : 45.11
## 3rd Qu.: 1.000 3rd Qu.: 1.0000 3rd Qu.: 4.000 3rd Qu.: 74.00
## Max. :19.000 Max. :11.0000 Max. :32.000 Max. :163.00
cut is useful for taking a continuous variable and splitting it into discrete pieces. ?cutHere is the default form of cut for use with numeric vectors:
# numeric form
cut(x, breaks, labels = NULL,
include.lowest = FALSE, right = TRUE, dig.lab = 3,
ordered_result = FALSE, ...)There is also a version of cut for manipulating Date objects:
# Date form
cut(x, breaks, labels = NULL, start.on.monday = TRUE,
right = FALSE, ...)The cut function takes a numeric vector as input and returns a factor. Each level in the factor corresponds to an interval of values in the input vector.
Z <- rnorm(10000)
table(cut(Z, breaks = -6:6))##
## (-6,-5] (-5,-4] (-4,-3] (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3]
## 0 1 17 228 1378 3425 3418 1331 193
## (3,4] (4,5] (5,6]
## 9 0 0
# install.packages('nutshell')
# library(nutshell)
# data(batting.2008)
# first, add batting average to the data frame:
batting.2008.AB <- transform(batting.2008, AVG = H/AB)
dim(batting.2008.AB)## [1] 1384 33
batting.2008.AB[1:10,1:8]| nameLast | nameFirst | weight | height | bats | throws | debut | birthYear |
|---|---|---|---|---|---|---|---|
| Abreu | Bobby | 200 | 72 | L | R | 1996-09-01 | 1974 |
| Alou | Moises | 190 | 75 | R | R | 1990-07-26 | 1966 |
| Anderson | Garret | 190 | 75 | L | L | 1994-07-27 | 1972 |
| Anderson | Marlon | 198 | 71 | L | R | 1998-09-08 | 1974 |
| Ankiel | Rick | 210 | 73 | L | L | 1999-08-23 | 1979 |
| Ardoin | Danny | 218 | 72 | R | R | 2000-08-02 | 1974 |
| Armas | Tony | 205 | 76 | R | R | 1999-08-16 | 1978 |
| Arroyo | Bronson | 180 | 77 | R | R | 2000-06-12 | 1977 |
| Aurilia | Rich | 170 | 72 | R | R | 1995-09-06 | 1971 |
| Ausmus | Brad | 195 | 71 | R | R | 1993-07-28 | 1969 |
# now, select a subset of players with over 100 AB (for some
# statistical significance)
batting.2008.over100AB <- subset(batting.2008.AB, subset = (AB > 100))
# finally, split the results into 10 bins:
batting.2008.bins <- cut(batting.2008.over100AB$AVG, breaks = 10)
table(batting.2008.bins)## batting.2008.bins
## (0.137,0.163] (0.163,0.189] (0.189,0.215] (0.215,0.24] (0.24,0.266]
## 4 6 24 67 121
## (0.266,0.292] (0.292,0.318] (0.318,0.344] (0.344,0.37] (0.37,0.396]
## 132 70 11 5 2
make.groups function in the lattice package. library(lattice)
hat.sizes <- seq(from=6.25, to=7.75, by=.25)
pants.sizes <- c(30,31,32,33,34,36,38,40)
shoe.sizes <- seq(from=7, to=12)
make.groups(hat.sizes, pants.sizes, shoe.sizes)| data | which | |
|---|---|---|
| hat.sizes1 | 6.25 | hat.sizes |
| hat.sizes2 | 6.50 | hat.sizes |
| hat.sizes3 | 6.75 | hat.sizes |
| hat.sizes4 | 7.00 | hat.sizes |
| hat.sizes5 | 7.25 | hat.sizes |
| hat.sizes6 | 7.50 | hat.sizes |
| hat.sizes7 | 7.75 | hat.sizes |
| pants.sizes1 | 30.00 | pants.sizes |
| pants.sizes2 | 31.00 | pants.sizes |
| pants.sizes3 | 32.00 | pants.sizes |
| pants.sizes4 | 33.00 | pants.sizes |
| pants.sizes5 | 34.00 | pants.sizes |
| pants.sizes6 | 36.00 | pants.sizes |
| pants.sizes7 | 38.00 | pants.sizes |
| pants.sizes8 | 40.00 | pants.sizes |
| shoe.sizes1 | 7.00 | shoe.sizes |
| shoe.sizes2 | 8.00 | shoe.sizes |
| shoe.sizes3 | 9.00 | shoe.sizes |
| shoe.sizes4 | 10.00 | shoe.sizes |
| shoe.sizes5 | 11.00 | shoe.sizes |
| shoe.sizes6 | 12.00 | shoe.sizes |
subset function to select a subset of rows and columns from a data frame. batting.w.names.2008 <- subset(batting.2008.AB, yearID==2008)
dim(batting.w.names.2008)## [1] 1384 33
batting.w.names.2008[1:10,1:8]| nameLast | nameFirst | weight | height | bats | throws | debut | birthYear |
|---|---|---|---|---|---|---|---|
| Abreu | Bobby | 200 | 72 | L | R | 1996-09-01 | 1974 |
| Alou | Moises | 190 | 75 | R | R | 1990-07-26 | 1966 |
| Anderson | Garret | 190 | 75 | L | L | 1994-07-27 | 1972 |
| Anderson | Marlon | 198 | 71 | L | R | 1998-09-08 | 1974 |
| Ankiel | Rick | 210 | 73 | L | L | 1999-08-23 | 1979 |
| Ardoin | Danny | 218 | 72 | R | R | 2000-08-02 | 1974 |
| Armas | Tony | 205 | 76 | R | R | 1999-08-16 | 1978 |
| Arroyo | Bronson | 180 | 77 | R | R | 2000-06-12 | 1977 |
| Aurilia | Rich | 170 | 72 | R | R | 1995-09-06 | 1971 |
| Ausmus | Brad | 195 | 71 | R | R | 1993-07-28 | 1969 |
batting.w.names.2008.short <- subset(batting.2008.AB, yearID==2008,
c("nameFirst","nameLast","AB","H","BB"))
dim(batting.w.names.2008.short)## [1] 1384 5
head(batting.w.names.2008.short)| nameFirst | nameLast | AB | H | BB |
|---|---|---|---|---|
| Bobby | Abreu | 609 | 180 | 73 |
| Moises | Alou | 49 | 17 | 2 |
| Garret | Anderson | 557 | 163 | 29 |
| Marlon | Anderson | 138 | 29 | 9 |
| Rick | Ankiel | 413 | 109 | 42 |
| Danny | Ardoin | 51 | 12 | 2 |
sample function. The sample function returns a random sample of the elements of a vector:sample(x, size, replace = FALSE, prob = NULL)For example
sample(1:10, 5)## [1] 9 10 7 3 5
sample(1:10, 5, replace = TRUE)## [1] 8 1 1 6 3
Use sample to create a random sample of row numbers and then select these row numbers using an index operator. For example, let’s take a random sample of ten rows from the batting.2008 data set:
batting.2008[sample(1:nrow(batting.2008),10),][1:8]| nameLast | nameFirst | weight | height | bats | throws | debut | birthYear | |
|---|---|---|---|---|---|---|---|---|
| 1066 | Parra | Manny | 200 | 75 | L | L | 2007-07-20 | 1982 |
| 309 | Posada | Jorge | 190 | 74 | B | R | 1995-09-04 | 1971 |
| 580 | Ramirez | Horacio | 170 | 73 | L | L | 2003-04-02 | 1979 |
| 666 | Gonzalez | Adrian | 220 | 74 | L | L | 2004-04-18 | 1982 |
| 677 | DiNardo | Lenny | 195 | 76 | L | L | 2004-04-23 | 1979 |
| 1190 | Diaz | Robinzon | 225 | 72 | R | R | 2008-04-23 | 1983 |
| 43 | Buehrle | Mark | 200 | 74 | L | L | 2000-07-16 | 1979 |
| 1188 | Harman | Brad | 195 | 73 | R | R | 2008-04-22 | 1985 |
| 787 | Fiorentino | Jeff | 188 | 73 | L | R | 2005-05-12 | 1983 |
| 606 | Bautista | Jose | 192 | 72 | R | R | 2004-04-04 | 1980 |
tapplyThe function tapply is a very flexible function for summarizing a vector X. We can specify which subsets of X to summarize as well as the function used for summarization:
tapply(X, INDEX, FUN = , ..., simplify = )?tapplyFor example, we can use tapply to sum the number of home runs by team:
data(batting.2008)
tapply(X=batting.2008$HR,INDEX=list(batting.2008$teamID),FUN=sum)## ARI ATL BAL BOS CHA CHN CIN CLE COL DET FLO HOU KCA LAA LAN MIL MIN NYA
## 159 130 172 173 235 184 187 171 160 200 208 167 120 159 137 198 111 180
## NYN OAK PHI PIT SDN SEA SFN SLN TBA TEX TOR WAS
## 172 125 214 153 154 124 94 174 180 194 126 117
We can also apply a function that returns multiple items, such as fivenum (which returns a vector containing minimum, lower-hinge, median, upper-hinge, maximum) to the data. For example, here is the result of applying fivenum to the batting averages of each player, aggregated by league:
tapply(X=(batting.2008$H/batting.2008$AB),
INDEX=list(batting.2008$lgID),FUN=fivenum)## $AL
## [1] 0.0000000 0.1758242 0.2487923 0.2825485 1.0000000
##
## $NL
## [1] 0.0000000 0.0952381 0.2172524 0.2679739 1.0000000
We can also use tapply to calculate summaries over multiple dimensions. For example, we can calculate the mean number of home runs per player by league and batting hand:
tapply(X=(batting.2008$HR),INDEX=list(batting.w.names.2008$lgID,
batting.w.names.2008$bats), FUN=mean)## B L R
## AL 4.254902 4.564516 2.980198
## NL 4.104478 3.981395 3.203905
tapply is by. The function by works the same way as tapply, except that it works on data frames. The INDEX argument is replaced by an INDICES argument. Here is an example: require(stats)
head(warpbreaks)| breaks | wool | tension |
|---|---|---|
| 26 | A | L |
| 30 | A | L |
| 54 | A | L |
| 25 | A | L |
| 70 | A | L |
| 52 | A | L |
by(warpbreaks[, 1:2], warpbreaks[,"tension"], summary)## warpbreaks[, "tension"]: L
## breaks wool
## Min. :14.00 A:9
## 1st Qu.:26.00 B:9
## Median :29.50
## Mean :36.39
## 3rd Qu.:49.25
## Max. :70.00
## --------------------------------------------------------
## warpbreaks[, "tension"]: M
## breaks wool
## Min. :12.00 A:9
## 1st Qu.:18.25 B:9
## Median :27.00
## Mean :26.39
## 3rd Qu.:33.75
## Max. :42.00
## --------------------------------------------------------
## warpbreaks[, "tension"]: H
## breaks wool
## Min. :10.00 A:9
## 1st Qu.:15.25 B:9
## Median :20.50
## Mean :21.67
## 3rd Qu.:25.50
## Max. :43.00
by(warpbreaks[, 1], warpbreaks[, -1], summary)## wool: A
## tension: L
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 26.00 51.00 44.56 54.00 70.00
## --------------------------------------------------------
## wool: B
## tension: L
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 20.00 29.00 28.22 31.00 44.00
## --------------------------------------------------------
## wool: A
## tension: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 18 21 24 30 36
## --------------------------------------------------------
## wool: B
## tension: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 21.00 28.00 28.78 39.00 42.00
## --------------------------------------------------------
## wool: A
## tension: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 18.00 24.00 24.56 28.00 43.00
## --------------------------------------------------------
## wool: B
## tension: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 15.00 17.00 18.78 21.00 28.00
by(warpbreaks, warpbreaks[,"tension"],
function(x) lm(breaks ~ wool, data = x))## warpbreaks[, "tension"]: L
##
## Call:
## lm(formula = breaks ~ wool, data = x)
##
## Coefficients:
## (Intercept) woolB
## 44.56 -16.33
##
## --------------------------------------------------------
## warpbreaks[, "tension"]: M
##
## Call:
## lm(formula = breaks ~ wool, data = x)
##
## Coefficients:
## (Intercept) woolB
## 24.000 4.778
##
## --------------------------------------------------------
## warpbreaks[, "tension"]: H
##
## Call:
## lm(formula = breaks ~ wool, data = x)
##
## Coefficients:
## (Intercept) woolB
## 24.556 -5.778
aggregateaggregate. Here is the form of aggregate when applied to data frames: aggregate(x, by, FUN, ...)Use aggregate to summarize batting statistics by team:
aggregate(x = batting.2008[, c('AB', 'H', 'BB', '2B', '3B', 'HR')],
by = list(batting.2008$teamID), FUN = sum)| Group.1 | AB | H | BB | 2B | 3B | HR |
|---|---|---|---|---|---|---|
| ARI | 5409 | 1355 | 587 | 318 | 47 | 159 |
| ATL | 5604 | 1514 | 618 | 316 | 33 | 130 |
| BAL | 5559 | 1486 | 533 | 322 | 30 | 172 |
| BOS | 5596 | 1565 | 646 | 353 | 33 | 173 |
| CHA | 5553 | 1458 | 540 | 296 | 13 | 235 |
| CHN | 5588 | 1552 | 636 | 329 | 21 | 184 |
| CIN | 5465 | 1351 | 560 | 269 | 24 | 187 |
| CLE | 5543 | 1455 | 560 | 339 | 22 | 171 |
| COL | 5557 | 1462 | 570 | 310 | 28 | 160 |
| DET | 5641 | 1529 | 572 | 293 | 41 | 200 |
| FLO | 5499 | 1397 | 543 | 302 | 28 | 208 |
| HOU | 5451 | 1432 | 449 | 284 | 22 | 167 |
| KCA | 5608 | 1507 | 392 | 303 | 28 | 120 |
| LAA | 5540 | 1486 | 481 | 274 | 25 | 159 |
| LAN | 5506 | 1455 | 543 | 271 | 29 | 137 |
| MIL | 5535 | 1398 | 550 | 324 | 35 | 198 |
| MIN | 5641 | 1572 | 529 | 298 | 49 | 111 |
| NYA | 5572 | 1512 | 535 | 289 | 20 | 180 |
| NYN | 5606 | 1491 | 619 | 274 | 38 | 172 |
| OAK | 5451 | 1318 | 574 | 270 | 23 | 125 |
| PHI | 5509 | 1407 | 586 | 291 | 36 | 214 |
| PIT | 5628 | 1454 | 474 | 314 | 21 | 153 |
| SDN | 5568 | 1390 | 518 | 264 | 27 | 154 |
| SEA | 5643 | 1498 | 417 | 285 | 20 | 124 |
| SFN | 5543 | 1452 | 452 | 311 | 37 | 94 |
| SLN | 5636 | 1585 | 577 | 283 | 26 | 174 |
| TBA | 5541 | 1443 | 626 | 284 | 37 | 180 |
| TEX | 5728 | 1619 | 595 | 376 | 35 | 194 |
| TOR | 5503 | 1453 | 521 | 303 | 32 | 126 |
| WAS | 5491 | 1376 | 534 | 269 | 26 | 117 |
rowsumrowsum function: rowsum(x, group, reorder = TRUE, ...)Use rowsum to summarize batting statistics by team:
rowsum(batting.2008[,c("AB","H","BB","2B","3B","HR")],
group=batting.2008$teamID)| AB | H | BB | 2B | 3B | HR | |
|---|---|---|---|---|---|---|
| ARI | 5409 | 1355 | 587 | 318 | 47 | 159 |
| ATL | 5604 | 1514 | 618 | 316 | 33 | 130 |
| BAL | 5559 | 1486 | 533 | 322 | 30 | 172 |
| BOS | 5596 | 1565 | 646 | 353 | 33 | 173 |
| CHA | 5553 | 1458 | 540 | 296 | 13 | 235 |
| CHN | 5588 | 1552 | 636 | 329 | 21 | 184 |
| CIN | 5465 | 1351 | 560 | 269 | 24 | 187 |
| CLE | 5543 | 1455 | 560 | 339 | 22 | 171 |
| COL | 5557 | 1462 | 570 | 310 | 28 | 160 |
| DET | 5641 | 1529 | 572 | 293 | 41 | 200 |
| FLO | 5499 | 1397 | 543 | 302 | 28 | 208 |
| HOU | 5451 | 1432 | 449 | 284 | 22 | 167 |
| KCA | 5608 | 1507 | 392 | 303 | 28 | 120 |
| LAA | 5540 | 1486 | 481 | 274 | 25 | 159 |
| LAN | 5506 | 1455 | 543 | 271 | 29 | 137 |
| MIL | 5535 | 1398 | 550 | 324 | 35 | 198 |
| MIN | 5641 | 1572 | 529 | 298 | 49 | 111 |
| NYA | 5572 | 1512 | 535 | 289 | 20 | 180 |
| NYN | 5606 | 1491 | 619 | 274 | 38 | 172 |
| OAK | 5451 | 1318 | 574 | 270 | 23 | 125 |
| PHI | 5509 | 1407 | 586 | 291 | 36 | 214 |
| PIT | 5628 | 1454 | 474 | 314 | 21 | 153 |
| SDN | 5568 | 1390 | 518 | 264 | 27 | 154 |
| SEA | 5643 | 1498 | 417 | 285 | 20 | 124 |
| SFN | 5543 | 1452 | 452 | 311 | 37 | 94 |
| SLN | 5636 | 1585 | 577 | 283 | 26 | 174 |
| TBA | 5541 | 1443 | 626 | 284 | 37 | 180 |
| TEX | 5728 | 1619 | 595 | 376 | 35 | 194 |
| TOR | 5503 | 1453 | 521 | 303 | 32 | 126 |
| WAS | 5491 | 1376 | 534 | 269 | 26 | 117 |
The simplest function for counting the number of observations that take on a value is the tabulate function. This function counts the number of elements in a vector that take on each integer value and returns a vector with the counts. As an example, suppose that we wanted to count the number of players who hit 0 HR, 1 HR, 2 HR, 3 HR, and so on. This can be done with the function tabulate:
HR.cnts <- tabulate(batting.w.names.2008$HR)
# tabulate doesn't label results, so let's add names:
names(HR.cnts) <- 0:(length(HR.cnts)-1)
HR.cnts## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 92 63 45 20 15 26 23 21 22 15 15 18 12 10 12 4 9 3 3 13 9 7 10 4 8
## 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 2 5 2 4 0 1 6 6 3 1 2 4 1 0 0 0 0 0 0 0 0 0 1
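Another simple example, which shows that the i-th element of the result counts the occurrences of the value i (the bins start at 1, not 0):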
tabulate(c(2,3,5))## [1] 0 1 1 0 1
tabulate(c(2,3,3,5), nbins = 10)## [1] 0 1 2 0 1 0 0 0 0 0
table. Suppose that we are presented with some data that includes a few categorical values (encoded as factors in R) and wanted to count how many observations in the data had each categorical value. To do this, we can use the table function:table(...,
exclude = if (useNA == "no") c(NA, NaN),
useNA = c("no", "ifany", "always"),
dnn = list.names(...), deparse.level = 1)
More details can be found in the help file:
?table
table returns a table object showing the number of observations that have each possible categorical value.
table(batting.2008$bats)##
## B L R
## 118 401 865
table(batting.2008[,c('bats', 'throws')])## throws
## bats L R
## B 10 108
## L 240 161
## R 25 840
table is most often used with factors, but it also works on numeric vectors, treating each distinct value as a category:
x <- c(rep(0,3), 2, 4, 2, 6, 8, 4, 8, 9, 6, 10)
table(x)## x
## 0 2 4 6 8 9 10
## 3 2 2 2 2 1 1
xtabs, which creates contingency tables from factors using formulas:xtabs(formula = ~., data = parent.frame(), subset, sparse = FALSE,
na.action, addNA = FALSE, exclude = if(!addNA) c(NA, NaN),
drop.unused.levels = FALSE)xtabs works the same as table, but allows you to specify the groupings by specifying a formula and a data frame.
xtabs(~bats+lgID,batting.2008)## lgID
## bats AL NL
## B 51 67
## L 186 215
## R 404 461
is equivalent to
table(batting.2008[, c('bats', 'lgID')])## lgID
## bats AL NL
## B 51 67
## L 186 215
## R 404 461
t, which transposes objects. The t function takes one argument: an object to transpose. The object can be a matrix, vector, or data frame. Here is an example with a matrix: x <- matrix(rnorm(20), 4, 5)
x## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.2525134 -0.08562777 1.0002629 0.42522937 -0.3381858
## [2,] -0.5998721 1.01294184 -0.9190992 0.04490328 -0.3605422
## [3,] 0.3459427 -0.36187167 0.5567472 -1.84682487 -0.3512131
## [4,] 0.1076104 1.41009508 1.2193319 -0.53157659 1.0006366
t(x)## [,1] [,2] [,3] [,4]
## [1,] -0.25251340 -0.59987209 0.3459427 0.1076104
## [2,] -0.08562777 1.01294184 -0.3618717 1.4100951
## [3,] 1.00026293 -0.91909924 0.5567472 1.2193319
## [4,] 0.42522937 0.04490328 -1.8468249 -0.5315766
## [5,] -0.33818577 -0.36054217 -0.3512131 1.0006366
?reshape df3 <- data.frame(id = 1:4, age = c(40,50,60,50),
dose1 = c(1,2,1,2),dose2 = c(2,1,2,1),
dose4 = c(3,3,3,3))
df3| id | age | dose1 | dose2 | dose4 |
|---|---|---|---|---|
| 1 | 40 | 1 | 2 | 3 |
| 2 | 50 | 2 | 1 | 3 |
| 3 | 60 | 1 | 2 | 3 |
| 4 | 50 | 2 | 1 | 3 |
reshape(df3, direction = "long", varying = 3:5, sep = "")| id | age | time | dose | |
|---|---|---|---|---|
| 1.1 | 1 | 40 | 1 | 1 |
| 2.1 | 2 | 50 | 1 | 2 |
| 3.1 | 3 | 60 | 1 | 1 |
| 4.1 | 4 | 50 | 1 | 2 |
| 1.2 | 1 | 40 | 2 | 2 |
| 2.2 | 2 | 50 | 2 | 1 |
| 3.2 | 3 | 60 | 2 | 2 |
| 4.2 | 4 | 50 | 2 | 1 |
| 1.4 | 1 | 40 | 4 | 3 |
| 2.4 | 2 | 50 | 4 | 3 |
| 3.4 | 3 | 60 | 4 | 3 |
| 4.4 | 4 | 50 | 4 | 3 |
df <- data.frame(id = rep(1:4, rep(2,4)),
visit = I(rep(c("Before","After"), 4)),
x = rnorm(4), y = runif(4))
df| id | visit | x | y |
|---|---|---|---|
| 1 | Before | -0.5240285 | 0.7799757 |
| 1 | After | 2.1108666 | 0.7812365 |
| 2 | Before | 1.1071719 | 0.4277117 |
| 2 | After | 0.4413647 | 0.7871837 |
| 3 | Before | -0.5240285 | 0.7799757 |
| 3 | After | 2.1108666 | 0.7812365 |
| 4 | Before | 1.1071719 | 0.4277117 |
| 4 | After | 0.4413647 | 0.7871837 |
reshape(df, timevar = "visit", idvar = "id", direction = "wide")| id | x.Before | y.Before | x.After | y.After | |
|---|---|---|---|---|---|
| 1 | 1 | -0.5240285 | 0.7799757 | 2.1108666 | 0.7812365 |
| 3 | 2 | 1.1071719 | 0.4277117 | 0.4413647 | 0.7871837 |
| 5 | 3 | -0.5240285 | 0.7799757 | 2.1108666 | 0.7812365 |
| 7 | 4 | 1.1071719 | 0.4277117 | 0.4413647 | 0.7871837 |
duplicated function. This function returns a logical vector showing which elements are duplicates of values with lower indices. x <- c(9:20, 1:5, 3:7, 0:8)
duplicated(x)## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
## [23] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## extract unique elements
xu <- x[!duplicated(x)]
xu## [1] 9 10 11 12 13 14 15 16 17 18 19 20 1 2 3 4 5 6 7 0 8
is equivalent to
unique(x)## [1] 9 10 11 12 13 14 15 16 17 18 19 20 1 2 3 4 5 6 7 0 8
levels(factor(x))## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13"
## [15] "14" "15" "16" "17" "18" "19" "20"
(x <- c(sort(sample(1:20, 9)), NA))## [1] 6 9 11 13 14 15 17 18 20 NA
(y <- c(sort(sample(3:23, 7)), NA))## [1] 5 6 7 13 18 19 22 NA
union(x, y)## [1] 6 9 11 13 14 15 17 18 20 NA 5 7 19 22
intersect(x, y)## [1] 6 13 18 NA
setdiff(x, y)## [1] 9 11 14 15 17 20
setdiff(y, x)## [1] 5 7 19 22
setequal(x, y)## [1] FALSE
is.element(x, y) ## [1] TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
x%in%y## [1] TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
is.element(y, x) ## [1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
y%in%x## [1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
The function is.na can be used to detect NA’s.
#install.packages('mice')
library(mice)
data(nhanes2)
dim(nhanes2)## [1] 25 4
head(nhanes2)| age | bmi | hyp | chl |
|---|---|---|---|
| 20-39 | NA | NA | NA |
| 40-59 | 22.7 | no | 187 |
| 20-39 | NA | no | 187 |
| 60-99 | NA | NA | NA |
| 20-39 | 20.4 | no | 113 |
| 60-99 | NA | NA | 184 |
is.na(nhanes2)## age bmi hyp chl
## 1 FALSE TRUE TRUE TRUE
## 2 FALSE FALSE FALSE FALSE
## 3 FALSE TRUE FALSE FALSE
## 4 FALSE TRUE TRUE TRUE
## 5 FALSE FALSE FALSE FALSE
## 6 FALSE TRUE TRUE FALSE
## 7 FALSE FALSE FALSE FALSE
## 8 FALSE FALSE FALSE FALSE
## 9 FALSE FALSE FALSE FALSE
## 10 FALSE TRUE TRUE TRUE
## 11 FALSE TRUE TRUE TRUE
## 12 FALSE TRUE TRUE TRUE
## 13 FALSE FALSE FALSE FALSE
## 14 FALSE FALSE FALSE FALSE
## 15 FALSE FALSE FALSE TRUE
## 16 FALSE TRUE TRUE TRUE
## 17 FALSE FALSE FALSE FALSE
## 18 FALSE FALSE FALSE FALSE
## 19 FALSE FALSE FALSE FALSE
## 20 FALSE FALSE FALSE TRUE
## 21 FALSE TRUE TRUE TRUE
## 22 FALSE FALSE FALSE FALSE
## 23 FALSE FALSE FALSE FALSE
## 24 FALSE FALSE FALSE TRUE
## 25 FALSE FALSE FALSE FALSE
The complete.cases function detects rows in a data.frame that do not contain any missing value.
complete.cases(nhanes2)## [1] FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## [12] FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
## [23] TRUE FALSE TRUE
The na.omit function can be used to remove incomplete records from the data.frame.
na.omit(nhanes2)| age | bmi | hyp | chl | |
|---|---|---|---|---|
| 2 | 40-59 | 22.7 | no | 187 |
| 5 | 20-39 | 20.4 | no | 113 |
| 7 | 20-39 | 22.5 | no | 118 |
| 8 | 20-39 | 30.1 | no | 187 |
| 9 | 40-59 | 22.0 | no | 238 |
| 13 | 60-99 | 21.7 | no | 206 |
| 14 | 40-59 | 28.7 | yes | 204 |
| 17 | 60-99 | 27.2 | yes | 284 |
| 18 | 40-59 | 26.3 | yes | 199 |
| 19 | 20-39 | 35.3 | no | 218 |
| 22 | 20-39 | 33.2 | no | 229 |
| 23 | 20-39 | 27.5 | no | 131 |
| 25 | 40-59 | 27.4 | no | 186 |
Note: Outliers do not equal errors. They should be detected, but not necessarily removed. Their inclusion in the analysis is a statistical decision.
The boxplot.stats function can be used to detect outliers.
# ?boxplot.stats
x <- c(rnorm(100), 10, -7) boxplot.stats(x)$out## [1] 10 -7
Another useful function is outlier. This function finds the value with the largest difference from the sample mean.
# install.packages('outliers')
library(outliers)
outlier(x)## [1] 10
We can also use ‘boxplot’ to detect outliers visually.
boxplot(x)sort function to sort the elements of an object:sort(x, partial = NULL, na.last = NA, decreasing = FALSE,
method = c("auto", "shell", "quick", "radix"), index.return = FALSE)
w <- c(5, 4, 7, 2, 7, 1)
sort(w)## [1] 1 2 4 5 7 7
sort(w, decreasing = TRUE, index.return = T)## $x
## [1] 7 7 5 4 2 1
##
## $ix
## [1] 3 5 1 2 4 6
na.last argument:length(w) <- 8
w## [1] 5 4 7 2 7 1 NA NA
sort(w,na.last=TRUE)## [1] 1 2 4 5 7 7 NA NA
sort(w, decreasing = TRUE, na.last=FALSE)## [1] NA NA 7 7 5 4 2 1
dplyrif_elseUsage
if_else(condition, true, false, missing = NULL) library(dplyr)
x <- c(-5:5, NA)
if_else(x < 0, NA_integer_, x) ## [1] NA NA NA NA NA 0 1 2 3 4 5 NA
if_else(x < 0, "negative", "positive", "missing")## [1] "negative" "negative" "negative" "negative" "negative" "positive"
## [7] "positive" "positive" "positive" "positive" "positive" "missing"
Unlike ifelse, if_else preserves types.
x <- factor(sample(letters[1:5], 10, replace = TRUE))
x## [1] a c a a a c a e e e
## Levels: a c e
ifelse(x %in% c("a", "b", "c"), x, factor(NA)) ## [1] 1 2 1 1 1 2 1 NA NA NA
if_else(x %in% c("a", "b", "c"), x, factor(NA)) ## [1] a c a a a c a <NA> <NA> <NA>
## Levels: a c e
ifelse(rnorm(20)<0, 1, 0)## [1] 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0
lead or lag lead(1:10, 1) ## [1] 2 3 4 5 6 7 8 9 10 NA
lead(1:10, 2)## [1] 3 4 5 6 7 8 9 10 NA NA
lag(1:10, 1) ## [1] NA 1 2 3 4 5 6 7 8 9
lag(1:10, 2)## [1] NA NA 1 2 3 4 5 6 7 8
case_when: this function vectorises multiple if else() statements. If no cases match, NA is returned.
x <- 1:50
case_when( x %% 35 == 0 ~ "fizz buzz",
x %% 5 == 0 ~ "fizz",
x %% 7 == 0 ~ "buzz",
TRUE ~ as.character(x) )## [1] "1" "2" "3" "4" "fizz"
## [6] "6" "buzz" "8" "9" "fizz"
## [11] "11" "12" "13" "buzz" "fizz"
## [16] "16" "17" "18" "19" "fizz"
## [21] "buzz" "22" "23" "24" "fizz"
## [26] "26" "27" "buzz" "29" "fizz"
## [31] "31" "32" "33" "34" "fizz buzz"
## [36] "36" "37" "38" "39" "fizz"
## [41] "41" "buzz" "43" "44" "fizz"
## [46] "46" "47" "48" "buzz" "fizz"
If none of the cases match, NA is used:
case_when( x %% 35 == 0 ~ "fizz buzz",
x %% 5 == 0 ~ "fizz",
x %% 7 == 0 ~ "buzz")## [1] NA NA NA NA "fizz"
## [6] NA "buzz" NA NA "fizz"
## [11] NA NA NA "buzz" "fizz"
## [16] NA NA NA NA "fizz"
## [21] "buzz" NA NA NA "fizz"
## [26] NA NA "buzz" NA "fizz"
## [31] NA NA NA NA "fizz buzz"
## [36] NA NA NA NA "fizz"
## [41] NA "buzz" NA NA "fizz"
## [46] NA NA NA "buzz" "fizz"
foo.rda or foo.RData._, and vice versa.apply() and sapply().