Jeho Park
Jan 23, 2017
This seminar is a compact version of HMC R Bootcamp, a 2-day workshop offered during the summer break.
Some materials were adapted from the following websites:
Or just download (or clone) all the files from https://github.com/jehopark/r_seminar_math161.git
Module 3: Working with Data (20 min)
Module 4: Programming in R (10 min)
# Persist every object in the current workspace to a named file.
save.image(file = "r-seminar.Rdata")
# Clear the workspace: remove all objects that ls() lists.
rm(list = ls())
# Bring the saved objects back from the file written above.
load(file = "r-seminar.Rdata")
# With no arguments, save.image() writes to ".RData" in the working directory.
save.image()
# Remember the absolute path of the current working directory so that it can
# be restored once the setwd() demonstration is over.
curr_wd <- getwd()
setwd("data") # relative path: change working directory to the data folder
setwd(file.path("~", "Desktop")) # file.path() assembles a portable path
# Return to the starting directory; later examples read files through paths
# relative to it (e.g. "./data/...").
setwd(curr_wd)
# Attach the datasets package; require() returns FALSE (with a warning)
# instead of stopping when a package is missing.
require("datasets")
ls("package:datasets") # names of the objects the package provides
airmiles # the airmiles object found in the datasets package
# Assigning to the same name creates a new object in the workspace that masks
# the packaged one; the package's own copy is not modified.
airmiles <- 0
datasets::airmiles # the package namespace still reaches the original
rm(airmiles) # remove the user-defined copy; the packaged data is visible again
demo() # display available demos
demo(graphics) # try graphics demo
library() # with no arguments: show available packages on the computer
search() # show the search path (attached packages and environments)
?hist # open the help page for the hist function
??histogram # search the help system for documents matching "histogram"
R workspace stores objects like vectors, datasets and functions in memory (the available space for calculation is limited to the size of the RAM).
# Create two objects; R names are case-sensitive, so a and A are distinct.
a <- 5 # notice a in your Environment window
A <- "text"
a
A
ls() # list the objects in the current workspace
print(c(a,A)) # c() combines both values into one vector, which is printed
# NOTE(review): the second positional argument of print() is not another value
# to print (for print.default it is `digits`), so this call likely errors;
# presumably shown as a contrast to the line above -- confirm intent.
print(a, A)
1+1
2+runif(1,0,1) # arguments matched by position
2+runif(1,min=0,max=1) # the same call with min and max given by name
3^2
3*3
sqrt(3*3) # comments
# comments are preceded by hash sign
Numerical Integral of
\( \displaystyle\int_0^{\infty} \frac{1}{(x+1)\sqrt{x}}dx \)
# Integrand f(x) = 1 / ((x + 1) * sqrt(x)); it works elementwise on a vector
# of x values, which is what integrate() passes in.
integrand <- function(x) {
  denominator <- (x + 1) * sqrt(x)
  1 / denominator
}
# Numerically integrate f from 0 to infinity.
integrate(integrand, lower = 0, upper = Inf)
3.141593 with absolute error < 2.7e-05
The most basic form of an R object.
Scalar values are vectors of length one.
A vector is a one-dimensional array of data elements that all have the same type (homogeneous).
class(a) # class of the object holding a number
class(A) # class of the object holding text
B <- c(a, A) # c() concatenates its arguments into a single vector
B # see the values
class(B) # why? a vector holds one type, so the elements were coerced
a <- rnorm(10) # ten draws from the standard normal distribution
a[3:5] <- NA # NA marks a missing value
a
R has five basic or “atomic” classes of objects:
A vector contains a set of data in any one of the atomic classes.
A matrix is a two-dimensional rectangular object whose data elements all have the same type (homogeneous).
# Fill a 3 x 2 matrix with six standard-normal draws.
mat <- matrix(data = rnorm(6), nrow = 3, ncol = 2)
mat # print the matrix
dim(mat) # number of rows and columns
t(mat) # transpose: rows become columns
summary(mat) # summary statistics for each column
A list is an object that can store different types of vectors.
# A list can hold elements of different types, each under its own name.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
aList <- list(name = c("Joseph"), married = TRUE, kids = 2)
aList
aList$kids <- aList$kids + 1 # update a single element through $
aList$kids
# A list can store vectors of different types side by side ...
aList2 <- list(numeric_data = a, character_data = A)
aList2
# ... and lists can themselves be elements of another list.
allList <- list(aList, aList2)
allList
A data frame is a list of vectors of equal length with possibly different types. It is used for storing rectangular data tables (where columns are variables and rows are observations).
n <- c(2, 3, 5) # a numeric vector
s <- c("aa", "bb", "cc") # a character vector
b <- c(TRUE, FALSE, TRUE) # a logical vector
# Since R 4.0.0 data.frame() keeps strings as character by default, so the
# string-to-factor conversion this example discusses is requested explicitly.
df <- data.frame(n, s, b, stringsAsFactors = TRUE) # a data frame
df
class(df$s) # was a string vector but now a factor column. why?
mtcars # a data frame that ships with R (datasets package)
mtcars$mpg # extract a single column with $
# Build a data frame with three columns of 100 standard-normal draws each.
myFrame <- data.frame(
  y1 = rnorm(100),
  y2 = rnorm(100),
  y3 = rnorm(100)
)
head(myFrame) # display the first few rows
names(myFrame) # display the column names
summary(myFrame) # the output depends on each column's data type
plot(myFrame) # plot the data frame
# Read a comma-separated text file straight from a URL; its first line holds
# the column names. TRUE is spelled out because T can be reassigned.
myFrame2 <- read.table(
  file = "http://scicomp.hmc.edu/data/R/Rtest.txt",
  header = TRUE,
  sep = ","
)
myFrame2
v <- c("a", "b", "c", "c", "b")
x <- factor(v) # convert the character vector into a factor object
z <- factor(v, ordered = TRUE) # same levels, but treated as ordered
x
z
table(x) # number of observations at each level
Use of the as() family of functions. Type as. and wait to see the list of as() functions.
integers <- 1:10
as.character(integers) # numbers -> strings
as.numeric(c("3.7", "4.8")) # strings -> numbers
indices <- c(1.7, 2.3)
# sometimes R is too generous: fractional indices are truncated, not rejected
integers[indices]
# close to 1 but truncated to 0, so no element is selected
integers[0.999999999]
# Convert the matrix into a data frame; each matrix column becomes a variable.
df <- as.data.frame(x = mat)
df
# Read a CSV file from the data folder below the working directory.
cpds <- read.csv(file.path(".", "data", "cpds.csv"))
head(cpds) # good to look at a few lines
class(cpds) # data.frame
# Read a whitespace-separated table from a URL; its first line is a header.
# TRUE is spelled out because T is a regular variable that can be reassigned.
data <- read.table(
  file = "http://scicomp.hmc.edu/data/R/normtemp.txt",
  header = TRUE
)
tail(data) # look at the last few rows
# First read: ask for strings to be converted to factors. This was the default
# before R 4.0.0; it is stated explicitly so the example behaves the same on
# current versions of R. `header` is spelled out instead of the partial `head`.
rta <- read.table("./data/RTADataSub.csv", sep = ",", header = TRUE,
                  stringsAsFactors = TRUE)
dim(rta)
rta[1:5, 1:5] # first five rows and columns
class(rta)
class(rta$time) # what? let's see ?read.table more carefully
# Second read: keep strings as character vectors and compare the column class.
rta2 <- read.table("./data/RTADataSub.csv", sep = ",", header = TRUE,
                   stringsAsFactors = FALSE)
class(rta2$time)
# Export the data frame as CSV without a row-name column.
write.csv(data, file = "temp.csv", row.names = FALSE)
# Open a 7 x 7 PDF device first; plots drawn afterwards go into the file.
pdf(file = "myplot.pdf", width = 7, height = 7)
x <- rnorm(10)
y <- rnorm(10)
plot(x, y)
dev.off() # close the device
Operators that can be used to extract subsets of R objects.
x <- c("a", "b", "c", "c", "d", "a")
x[1] # a single element by position
x[1:4] # a range of positions
x[x > "a"] # elements for which the comparison is TRUE
u <- x > "a" # what's u here? a logical vector, one entry per element of x
u
x[u] # subsetting using a boolean vector
y <- list(foo = x, bar = x[u])
y
y[[1]] # [[ extracts the first element itself
y$bar # $ extracts an element by name
subset(mtcars, gear == 5) # use of subset function for data frames
attach(mtcars) # put mtcars on the search path
plot(wt, mpg) # columns are now found by bare name, without mtcars$
# The same scatterplot with a title and axis labels supplied in the call.
plot(wt, mpg,
     main = "Regression of MPG on Weight", xlab = "Weight", ylab = "MPG")
# Alternatively, draw without annotation and add the pieces afterwards.
plot(wt, mpg, ann = FALSE)
abline(h = 25) # a horizontal reference line
abline(lm(mpg ~ wt)) # look at the argument, what's lm?
title(main = "Regression of MPG on Weight", xlab = "Weight", ylab = "MPG")
par() # view current settings
# Save only the settable parameters: plain par() also returns read-only ones
# (e.g. "cin", "din"), and handing those back to par() raises warnings.
orig_par <- par(no.readonly = TRUE)
par(col.lab = "red") # red x and y labels
plot(wt, mpg) # create a plot with these new settings
par(orig_par) # restore original settings
plot(wt, mpg)
plot(wt, mpg, col.lab = "red") # change settings within plot()
?par # see all the options
# Multiply two values. Both arguments default to 1, so the function can be
# called with two, one or no arguments.
mult_fun <- function(a = 1, b = 1) {
  product <- a * b
  product
}
mult_fun # the name without parentheses shows the function's code
mult_fun(2, 3) # function call
mult_fun() # would this be an error? no: both defaults are used
x <- 10
y <- 20
x + y # the usual infix form
`+`(x, y) # the same addition, calling the operator like a function
# for: run the body once for each value 1, 2, ..., 10
for (i in seq_len(10)) {
  print(i)
}
# while: repeat for as long as the condition holds
i <- 0
while (i < 5) {
  i <- i + 1
  print(i)
}
########## a slow loop: updates the matrix one element at a time
set.seed(42)
m <- 1000
n <- 1000
mymat <- replicate(m, rnorm(n)) # create matrix of normal random numbers
# replicate() returns a matrix with n rows and m columns, so the loop bounds
# are taken from the matrix itself rather than assuming that m equals n.
system.time(
  for (i in seq_len(nrow(mymat))) {
    for (j in seq_len(ncol(mymat))) {
      mymat[i, j] <- mymat[i, j] + 10 * sin(0.75 * pi)
    }
  }
)
#### vectorized version: one arithmetic operation on the whole matrix
set.seed(42)
m <- 1000
n <- 1000
mymat1 <- replicate(m, rnorm(n))
system.time(
  mymat1 <- mymat1 + 10 * sin(0.75 * pi)
)
Stopping on a line
Read https://support.rstudio.com/hc/en-us/articles/205612627-Debugging-with-RStudio
# Emit a message that depends on the sign of y and on a `debug` flag.
# NOTE(review): no `debug` variable is defined in the visible part of this
# file; unless one is defined elsewhere, the name resolves to the base
# function debug(), which `&&` cannot use as a logical value -- confirm.
# NOTE(review): the else branch also runs when y < 0 but debug is FALSE, so
# "Y is not negative" can be reported for a negative y -- confirm intent.
if (y < 0 && debug) {
message("Y is negative")
} else {
message("Y is not negative")
}