This document serves as a reference sheet with examples for the basics of R programming. It is more of a brain dump of functions and examples I find useful.
+ - * /
< <= > >= == != | &
<- is the assignment operator
x <- 1
Use " " (double quotes) to create character strings
c("My", "name", "is")
## [1] "My" "name" "is"
Combining elements
my_char <- c("My", "name", "is")
paste(my_char, collapse = " ")
## [1] "My name is"
# Vectorized
paste(1:3, c("X", "Y", "Z"), sep = "")
## [1] "1X" "2Y" "3Z"
# recycling
paste(LETTERS, 1:4, sep = "-")
## [1] "A-1" "B-2" "C-3" "D-4" "E-1" "F-2" "G-3" "H-4" "I-1" "J-2" "K-3"
## [12] "L-4" "M-1" "N-2" "O-3" "P-4" "Q-1" "R-2" "S-3" "T-4" "U-1" "V-2"
## [23] "W-3" "X-4" "Y-1" "Z-2"
class(1)
## [1] "numeric"
class (1L)
## [1] "integer"
1/0
## [1] Inf
1/Inf
## [1] 0
0/0
## [1] NaN
z <- complex(real = 1, imaginary = 2)
z
## [1] 1+2i
my_vector <-1:10
my_logical <- my_vector == 5
my_logical
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
Sample attributes:
temp <- 1:10
length (temp)
## [1] 10
Attributes can be accessed using attributes()
The simplest way to create a sequence is using the : operator
temp <- 1:10
temp
## [1] 1 2 3 4 5 6 7 8 9 10
temp <- pi:10
temp
## [1] 3.141593 4.141593 5.141593 6.141593 7.141593 8.141593 9.141593
temp <- 15:1
temp
## [1] 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
Alternatively you can use seq()
my_seq <- seq(0, 20)
my_seq <- seq(0, 10, by=0.5)
my_seq <- seq(5, 10, length=30)
seq_along (my_seq)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30
my_seq <- seq(along.with = my_seq)
rep(0, times = 40)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0
rep(c(0, 1, 2), times = 10)
## [1] 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2
rep(c(0, 1, 2), each = 10)
## [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
Vectors can only contain objects of same class
List is a vector that can contain objects of different classes
x <- vector("numeric", length = 10)
x
## [1] 0 0 0 0 0 0 0 0 0 0
x <- c(0.5, 0.6)
x <- c(TRUE, FALSE) ## logical
x <- c(T, F) ## logical
x <- c("a", "b", "c") ## character
x <- 9:29 ## integer
x <- c(1+0i, 2+4i) ## complex
Coercion occurs when mixing objects
y <- c(1.7, "a") ## character
y
## [1] "1.7" "a"
y <- c(TRUE, 2) ## numeric
y
## [1] 1 2
y <- c("a", TRUE) ## character
y
## [1] "a" "TRUE"
use the as.* functions
x <- 0:6
class(x)
## [1] "integer"
as.numeric(x)
## [1] 0 1 2 3 4 5 6
as.logical(x)
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE
as.character(x)
## [1] "0" "1" "2" "3" "4" "5" "6"
Note: nonsensical coercion results in NA
x <- c("a", "b", "c")
as.numeric(x)
## Warning: NAs introduced by coercion
## [1] NA NA NA
x <- rnorm(100)
x[1:10]
## [1] -0.09395277 -0.57027407 1.94268854 -0.68255566 0.59790498
## [6] 0.91508766 0.26957541 -0.81158391 -0.23726188 -1.50521598
#subset everything except 2 and 10
x[c(-2, -10)]
## [1] -0.09395277 1.94268854 -0.68255566 0.59790498 0.91508766
## [6] 0.26957541 -0.81158391 -0.23726188 0.25465048 2.81339326
## [11] -1.19823288 1.49524733 0.04258432 -0.55912845 1.95869557
## [16] 0.04002838 -0.75245094 -0.12954895 -0.02649436 1.01778248
## [21] -1.65050478 0.45054472 1.42448902 -0.72963672 1.07730775
## [26] -0.29340742 0.55711576 -0.09962152 -0.43695608 -0.34377805
## [31] 0.36087754 -1.29022680 -0.30197948 0.23639093 -0.15399597
## [36] 0.30102203 0.29552396 -0.84844166 0.74993891 -0.39342129
## [41] 0.62084805 0.68010494 0.48228051 -0.52695683 -1.40116875
## [46] -0.68463419 -2.29923708 0.71177792 -1.12863548 -0.91979920
## [51] 0.51470227 -1.05691883 -0.11445467 -0.75333934 0.59201581
## [56] 1.37231180 1.34272400 -0.00330700 -0.15438057 0.95481515
## [61] -1.10379628 -1.55768665 1.10785064 1.41759311 1.18387602
## [66] 0.46881241 -2.35489957 -1.35077875 -0.57805577 0.62606079
## [71] -1.56333473 0.93642731 -1.10128476 1.54773458 1.12321609
## [76] -0.74726358 1.00009303 -0.98599075 -0.37965946 1.58991656
## [81] -0.08269716 -0.07365312 0.84002278 -0.31310774 0.35137289
## [86] -1.32260399 -1.45120209 -0.47992977 -1.09667338 -1.21224085
## [91] 0.58221614 -1.16962665 1.05707835 0.22598321 0.08169949
## [96] 1.11492505 -0.90819208 2.73815977
x[-c(2, 10)]
## [1] -0.09395277 1.94268854 -0.68255566 0.59790498 0.91508766
## [6] 0.26957541 -0.81158391 -0.23726188 0.25465048 2.81339326
## [11] -1.19823288 1.49524733 0.04258432 -0.55912845 1.95869557
## [16] 0.04002838 -0.75245094 -0.12954895 -0.02649436 1.01778248
## [21] -1.65050478 0.45054472 1.42448902 -0.72963672 1.07730775
## [26] -0.29340742 0.55711576 -0.09962152 -0.43695608 -0.34377805
## [31] 0.36087754 -1.29022680 -0.30197948 0.23639093 -0.15399597
## [36] 0.30102203 0.29552396 -0.84844166 0.74993891 -0.39342129
## [41] 0.62084805 0.68010494 0.48228051 -0.52695683 -1.40116875
## [46] -0.68463419 -2.29923708 0.71177792 -1.12863548 -0.91979920
## [51] 0.51470227 -1.05691883 -0.11445467 -0.75333934 0.59201581
## [56] 1.37231180 1.34272400 -0.00330700 -0.15438057 0.95481515
## [61] -1.10379628 -1.55768665 1.10785064 1.41759311 1.18387602
## [66] 0.46881241 -2.35489957 -1.35077875 -0.57805577 0.62606079
## [71] -1.56333473 0.93642731 -1.10128476 1.54773458 1.12321609
## [76] -0.74726358 1.00009303 -0.98599075 -0.37965946 1.58991656
## [81] -0.08269716 -0.07365312 0.84002278 -0.31310774 0.35137289
## [86] -1.32260399 -1.45120209 -0.47992977 -1.09667338 -1.21224085
## [91] 0.58221614 -1.16962665 1.05707835 0.22598321 0.08169949
## [96] 1.11492505 -0.90819208 2.73815977
vect <- c(foo = 11, bar = 2, norf = NA)
#view names
names (vect)
## [1] "foo" "bar" "norf"
Naming a vector after creation
vect2 <- c(11,2,NA)
names(vect2)<- c("foo", "bar", "norf")
Subset by name
vect[c("foo", "bar")]
## foo bar
## 11 2
m <- matrix(nrow = 2, ncol = 3)
m
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA NA NA
dim(m)
## [1] 2 3
attributes(m)
## $dim
## [1] 2 3
m <- matrix (1:6, nrow = 2, ncol =3)
m
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
m<-1:10
m
## [1] 1 2 3 4 5 6 7 8 9 10
dim(m)<- c(2,5)
m
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
x <- 1:3
y <- 10:12
cbind (x,y)
## x y
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12
rbind (x,y)
## [,1] [,2] [,3]
## x 1 2 3
## y 10 11 12
Matrices can be subsetted in the usual way with (i,j) type indices.
x <- matrix(1:6, 2, 3)
x[1, 2]
x[2, 1]
Subsetting a single element results in a vector of length 1 rather than a 1x1 matrix. This can be changed using ‘drop = FALSE’, e.g. x[1, 2, drop = FALSE].
x <- matrix(1:6, 2, 3)
x[1, 2]
x[2, 1]
Similarly, subsetting a single column or a single row will give you a vector, not a matrix (by default).
x <- matrix(1:6, 2, 3)
x[1, ]
x[1, , drop = FALSE]
As mentioned before, a list is a vector that can contain elements of different classes
x <- list(1, "a", TRUE, 1 + 4i)
x
## [[1]]
## [1] 1
##
## [[2]]
## [1] "a"
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] 1+4i
x <- list(foo = 1:4, bar = 0.6)
x$foo
## [1] 1 2 3 4
x$bar
## [1] 0.6
x[["bar"]]
## [1] 0.6
x["bar"]
## $bar
## [1] 0.6
x <- list(foo = 1:4, bar = 0.6, baz = "hello")
x[c(1, 3)]
## $foo
## [1] 1 2 3 4
##
## $baz
## [1] "hello"
The [[ operator can be used with computed indices; $ can only be used with literal names.
x <- list(foo = 1:4, bar = 0.6, baz = "hello")
name <- "foo"
x[[name]] ## computed index for 'foo'
x$name ## element 'name' doesn't exist!
x$foo
The [[ can take an integer sequence.
x <- list(a = list(10, 12, 14), b = c(3.14, 2.81))
x[[c(1, 3)]]
x[[1]][[3]]
x[[c(2, 1)]]
Factors are used to represent categorical data (e.g. male and female)
x <- factor(c("yes", "yes", "no", "yes", "no"))
x
## [1] yes yes no yes no
## Levels: no yes
table (x)
## x
## no yes
## 2 3
unclass(x)
## [1] 2 2 1 2 1
## attr(,"levels")
## [1] "no" "yes"
x<- factor(c("yes", "yes", "no", "yes", "no"),
levels = c("yes", "no"))
x
## [1] yes yes no yes no
## Levels: yes no
Missing values are denoted by NA or NaN for undefined mathematical operations.
is.na() is used to test objects if they are NA
is.nan() is used to test for NaN
NA values have a class also, so there are integer NA, character NA, etc.
A NaN value is also NA but the converse is not true
Be very careful when combining missing values with logical expressions
x <- c(1, 2, NA, 10, 3)
is.na(x)
## [1] FALSE FALSE TRUE FALSE FALSE
is.nan(x)
## [1] FALSE FALSE FALSE FALSE FALSE
x <- c(1, 2, NaN, NA, 4)
is.na(x)
## [1] FALSE FALSE TRUE TRUE FALSE
is.nan(x)
## [1] FALSE FALSE TRUE FALSE FALSE
x[is.na(x)]
## [1] NaN NA
x[!is.na(x)]
## [1] 1 2 4
x[is.na(x)] <-0
x <- c(1, 2, NA, 4, NA, 5)
y <- c("a", "b", NA, "d", NA, "f")
good <- complete.cases(x, y)
good
x[good]
y[good]
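- Data frames are used to store tabular data
- Represented as a list in which every element has the same length
- Have a special attribute called ‘row.names’
- Can be created using ‘read.table()’ and ‘read.csv()’
- Can be converted to a matrix with ‘data.matrix()’; be careful, because non-numeric columns are coerced to numbers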
x <- data.frame(foo = 1:4, bar = c(T, T, F, F))
x
## foo bar
## 1 1 TRUE
## 2 2 TRUE
## 3 3 FALSE
## 4 4 FALSE
nrow(x)
## [1] 4
ncol(x)
## [1] 2
R objects can have names as shown before.
x <- 1:3
names(x)
## NULL
names(x) <- c("foo", "bar", "norf")
x
## foo bar norf
## 1 2 3
names(x)
## [1] "foo" "bar" "norf"
x <- list (a =1, b = 2, c = 3)
m <- matrix(1:4, nrow = 2, ncol = 2)
dimnames(m) <- list(c("a", "b"), c("c", "d"))
m
## c d
## a 1 3
## b 2 4
There are a few principal functions for reading data into R:
read.table, read.csv, for reading tabular data
readLines, for reading lines of a text file
source, for reading in R code files (inverse of dump)
dget, for reading in R code files (inverse of dput)
load, for reading in saved workspaces
unserialize, for reading single R objects in binary form
The read.table function is one of the most commonly used functions for reading data.
For small to moderately sized datasets, you can usually call read.table without specifying any other arguments
data <- read.table("foo.txt")
With much larger datasets, doing the following things will make your life easier and will prevent R from choking.
Read the help page for read.table, which contains many hints
Make a rough calculation of the memory required to store your dataset. If the dataset is larger than the amount of RAM on your computer, you can probably stop right here.
Set comment.char = "" if there are no commented lines in your file.
Use the colClasses argument. Specifying this option instead of using the default can make ‘read.table’ run MUCH faster, often twice as fast. In order to use this option, you have to know the class of each column in your data frame. If all of the columns are ‘numeric’, for example, then you can just set colClasses = "numeric". A quick and dirty way to figure out the classes of each column is the following:
initial <- read.table("datatable.txt", nrows = 100)
classes <- sapply(initial, class)
tabAll <- read.table("datatable.txt",
colClasses = classes)
con <- gzfile("words.gz")
x <- readLines(con, 10)
x
writeLines takes a character vector and writes each element one line at a time to a text file.
readLines can be useful for reading in lines of webpages
## This might take time
con <- url("http://www.jhsph.edu", "r")
x <- readLines(con)
> head(x)
There are analogous functions for writing data to files: write.table, writeLines, dump, dput, save, serialize
Dumping and dputting are useful because the resulting textual format is editable, and in the case of corruption, potentially recoverable.
Unlike writing out a table or csv file, dump and dput preserve the metadata (sacrificing some readability), so that another user doesn't have to specify it all over again.
Textual formats can work much better with version control programs like subversion or git which can only track changes meaningfully in text files
Textual formats can be longer-lived; if there is corruption somewhere in the file, it can be easier to fix the problem
Textual formats adhere to the ‘Unix philosophy’
Downside: The format is not very space-efficient
Another way to pass data around is by deparsing the R object with dput and reading it back in using dget.
y <- data.frame(a = 1, b = "a")
dput(y)
structure(list(a = 1,
b = structure(1L, .Label = "a",
class = "factor")),
.Names = c("a", "b"), row.names = c(NA, -1L),
class = "data.frame")
dput(y, file = "y.R")
new.y <- dget("y.R")
new.y
Multiple objects can be deparsed using the dump function and read back in using source.
x <- "foo"
y <- data.frame(a = 1, b = "a")
dump(c("x", "y"), file = "data.R")
rm(x, y)
source("data.R")
y
x
Data are read in using connection interfaces. Connections can be made to files (most common) or to other more exotic things.
file, opens a connection to a file
gzfile, opens a connection to a file compressed with gzip
bzfile, opens a connection to a file compressed with bzip2
url, opens a connection to a webpage
> str(file)
function (description = "", open = "", blocking = TRUE,
encoding = getOption("encoding"))
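description is the name of the file.
open is a code indicating the mode in which the connection is opened (e.g. "r" for reading, "w" for writing, "a" for appending; "rb", "wb", "ab" for the binary-mode equivalents).
In general, connections are powerful tools that let you navigate files or other external objects. In practice, we often don't need to deal with the connection interface directly.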
con <- file("foo.txt", "r")
data <- read.csv(con)
close(con)
is the same as
data <- read.csv("foo.txt")
Control structures in R allow you to control the flow of execution of the program, depending on runtime conditions. Common structures are
if, else: testing a condition
for: execute a loop a fixed number of times
while: execute a loop while a condition is true
repeat: execute an infinite loop
break: break the execution of a loop
next: skip an iteration of a loop
return: exit a function
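Example of how to use if. The general form is if (<condition>) { ... } else { ... }; the whole construct returns a value, so its result can be assigned: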
y <- if(x > 3) {
10
} else {
0
}
for loops take an iterator variable and assign it successive values from a sequence or vector. For loops are most commonly used for iterating over the elements of an object (list, vector, etc.)
for(i in 1:10) {
print(i)
}
for loops can be nested.
x <- matrix(1:6, 2, 3)
for(i in seq_len(nrow(x))) {
for(j in seq_len(ncol(x))) {
print(x[i, j])
}
}
While loops begin by testing a condition. If it is true, then they execute the loop body. Once the loop body is executed, the condition is tested again, and so forth.
count <- 0
while(count < 10) {
print(count)
count <- count + 1
}
Sometimes there will be more than one condition in the test.
z <- 5
while(z >= 3 && z <= 10) {
print(z)
coin <- rbinom(1, 1, 0.5)
if(coin == 1) { ## random walk
z <- z + 1
} else {
z <- z - 1
}
}
Repeat initiates an infinite loop; these are not commonly used in statistical applications but they do have their uses. The only way to exit a repeat loop is to call break.
x0 <- 1
tol <- 1e-8
repeat {
x1 <- computeEstimate()
if(abs(x1 - x0) < tol) {
break
} else {
x0 <- x1
}
}
The previous loop is a bit dangerous because there's no guarantee it will stop. Better to set a hard limit on the number of iterations (e.g. using a for loop) and then report whether convergence was achieved or not.
next is used to skip an iteration of a loop
for(i in 1:100) {
if(i <= 20) {
## Skip the first 20 iterations
next
}
## Do something here
}
return signals that a function should exit and return a given value
rnorm
sample
print() prints values
identical(vect, vect2) tests whether two objects are exactly equal