# Originally published in:
# https://eehh-stanford.github.io/SNA-workshop/intro-r.html#summarizing-data
# The easiest and most frequently used ways to read data in to R is reading data from plain-text (ASCII) files. These files can be space, tab, or comma delimited.
# R expects delimited files to be "white-space delimited" with values separated by either tabs or spaces and rows separated by carriage returns.
# It's always a good idea to specify whether or not you have a header (i.e., column names).
# If you don't, say `header=FALS`; if you do, say `header=TRUE`
(kids <- read.table("https://web.stanford.edu/class/earthsys214/data/strayer_strayer1976-fig2.txt", header=FALSE))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1 0 1 3 4 1 0 0 1 1 0 1 0 7 0 1 0 0
## 2 1 0 7 8 2 1 1 12 3 0 1 1 4 1 0 0 2
## 3 1 4 0 7 3 2 2 0 1 0 8 1 5 5 0 0 1
## 4 3 3 2 0 3 1 13 3 5 1 0 0 8 3 0 2 1
## 5 1 0 0 3 0 4 6 0 8 5 1 0 1 3 0 2 1
## 6 0 0 0 0 0 0 2 8 11 0 4 0 4 3 0 1 0
## 7 1 0 1 9 3 4 0 2 0 0 1 0 7 9 1 1 0
## 8 0 0 0 1 1 1 2 0 7 5 1 1 1 0 0 0 0
## 9 1 1 1 2 5 11 0 3 0 0 0 0 1 0 0 0 0
## 10 0 0 0 0 0 0 1 0 0 0 0 11 0 1 0 1 4
## 11 4 0 4 3 3 2 1 0 0 0 0 3 11 5 0 2 2
## 12 0 0 0 0 0 0 0 0 0 2 0 0 2 0 8 0 0
## 13 0 1 9 3 0 3 6 0 0 0 11 2 0 1 0 7 5
## 14 0 1 4 0 1 2 1 0 0 0 1 0 1 0 0 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0
## 16 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0
## 17 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
# If your file is delimited by something other than spaces, ti's a good idea to use a slightly different function, `read.delim()` and specify exactly what the delimiter is.
# Frequently, there will be non-tabular information at the top of a file (e.g., meta-data describing the data set). Use the `skip=n` option, where `n` is the number of lines you want skipped.
quercus <- read.delim("https://web.stanford.edu/class/earthsys214/data/quercus.txt", skip=24, sep="\t", header=TRUE)
head(quercus)
## Species Region Range acorn.size tree.height
## 1 Quercus alba L. Atlantic 24196 1.4 27
## 2 Quercus bicolor Willd. Atlantic 7900 3.4 21
## 3 Quercus macrocarpa Michx. Atlantic 23038 9.1 25
## 4 Quercus prinoides Willd. Atlantic 17042 1.6 3
## 5 Quercus Prinus L. Atlantic 7646 10.5 24
## 6 Quercus stellata Wang. Atlantic 19938 2.5 17
# R handles data in a manner that is different than many statistical packages.
# In particular, you are not limited to a single rectangular data matrix at a time.
# The workspace holds all the objects (e.g., data frames, variables, functions) that you have created or read in.
# You can essentially have as many data frames as your machine's memory will allow.
# To find out what lurks in your workspace, use `objects()` command.
# To remove an object, use `rm()`.
# If you really want to clear your whole workspace, you can use the following syntax: `rm(list=ls())`.
# Beware, though. Once you do this, you don't get the data back.
objects()
## [1] "kids" "quercus"
rm(kids)
rm(list=ls())
objects()
## character(0)
# Because the R workspace can contain many different varialbes and even multiple data frames, you must be aware of scope.
# When we extract columns of a data frame (e.g., if we wanted to plot them) we need to use the syntax `data.frame$col.name`.
quercus <- read.delim("https://web.stanford.edu/class/earthsys214/data/quercus.txt", skip=24, sep="\t", header=TRUE)
plot(quercus$tree.height, quercus$acorn.size, pch=16, col='red', xlab='Tree Height (m)', ylab = 'Acorn Size (cm3)')

# It can be a hassle having to type the data frame name (and dollar sign) over and over again.
# With the `with()` function, we can set up a local scoping rule that allows us to drop the need to type the data frame naem (and dollar sign) to access columns of a data frame.
quercus <- read.delim("https://web.stanford.edu/class/earthsys214/data/quercus.txt", skip=24, sep="\t", header=TRUE)
with(quercus, plot(tree.height, acorn.size, pch=16, col='blue', xlab='Tree Height (m)', ylab='Acorn Size (cm3)'))

# Index (and access) the elements of a vector using square brackets. `myvec[1]` takes the first element of a vector called `myvec`.
# Use the colon(:) operator for sequences. `myvec[1:5]` takes the first five elemnts of `myvec`.
# R allows negative indexing: `myvec[-1]` takes all elements of except the first one. To exclude a sequence, you need to place the sequence within parentheses: `myvec[-(1:5)]`.
# Vector indices don't have to be consecutive: `myvec[c(2, 5, 1, 11)]`
myvec <- c(1, 2, 3, 4, 5, 6, 66, 77, 7, 8, 9, 10)
myvec[1]
## [1] 1
myvec[1:5]
## [1] 1 2 3 4 5
myvec[-1]
## [1] 2 3 4 5 6 66 77 7 8 9 10
myvec[-(1:5)]
## [1] 6 66 77 7 8 9 10
myvec[c(2, 5, 1, 11)]
## [1] 2 5 1 9
# Access the elements of a data frame using the dollar sign.
# Subsetting anything other than a data frame users square brackets.
dim(quercus)
## [1] 39 5
(size <- quercus$acorn.size)
## [1] 1.4 3.4 9.1 1.6 10.5 2.5 0.9 6.8 1.8 0.3 0.9 0.8 2.0 1.1 0.6
## [16] 1.8 4.8 1.1 3.6 1.1 1.1 3.6 8.1 3.6 1.8 0.4 1.1 1.2 4.1 1.6
## [31] 2.0 5.5 5.9 2.6 6.0 1.0 17.1 0.4 7.1
size[1:3] # first 3 elements
## [1] 1.4 3.4 9.1
size[17] # only 17th element
## [1] 4.8
size[-39] # all but the 39th element
## [1] 1.4 3.4 9.1 1.6 10.5 2.5 0.9 6.8 1.8 0.3 0.9 0.8 2.0 1.1 0.6
## [16] 1.8 4.8 1.1 3.6 1.1 1.1 3.6 8.1 3.6 1.8 0.4 1.1 1.2 4.1 1.6
## [31] 2.0 5.5 5.9 2.6 6.0 1.0 17.1 0.4
size[c(3, 6, 9)] # 3rd, 6th, 9th elements
## [1] 9.1 2.5 1.8
size[quercus$Region=="California"] # use a logical test to subset
## [1] 4.1 1.6 2.0 5.5 5.9 2.6 6.0 1.0 17.1 0.4 7.1
quercus[3, 4] # access an element of an array or data frame by X[row, col]
## [1] 9.1
quercus[, "tree.height"]
## [1] 27.0 21.0 25.0 3.0 24.0 17.0 15.0 0.3 24.0 11.0 15.0 23.0 24.0 3.0 13.0
## [16] 30.0 9.0 27.0 9.0 24.0 23.0 27.0 24.0 23.0 18.0 9.0 9.0 4.0 18.0 6.0
## [31] 17.0 20.0 30.0 23.0 26.0 21.0 15.0 1.0 18.0
# The comma with nothing in front of it means take every row in the column named `tree.height`.
# Positive indices include, negative indices exclude elements.
# 1:3 means a sequence from 1 to 3
# You can only use a single negative subscript, i.e., you can't use quercus$acorn.size[-1:3]
# Of course, you can get around this by enclosing the vector in parentheses `quercus$acorn.size[-(1:3)]
# Logical operator: `&&` (another and), `||` (another or)
# `&` and `!` work elementwise on vectors: element 1 is compared in two vectors, then element 2, and so on.
# `&&` and `||` are tricky. These logical tests evaluate left to right, examining only the first element of each vector (they go until a result is determine dfor `||`.).
# Why would you want that? In general, you don't. It makes some calculations faster.
# When you refer to a variable in a data frame, you must specify the dataf frame named followed a dollar sigh and the variable name `quercus$acorn.size`.
# Testing for equality is just a special case of a logical test. We frequently want to identify numbers either above or below some criterion.
myvec <- c(1, 2, 3, 4, 5, 6, 66, 7, 77, 8, 9, 10)
myvec <- myvec[myvec<=10]
myvec
## [1] 1 2 3 4 5 6 7 8 9 10
x <- 1:7
# elements that are greater than 2 but less than 6
(x>2) & (x<6)
## [1] FALSE FALSE TRUE TRUE TRUE FALSE FALSE
# This is not intuitive
(x>2) && (x<6)
## [1] FALSE
# but dis doe ...
(x<2) && (x<6)
## [1] TRUE
is.even <- rep(c(FALSE, TRUE), 5)
(evens <- myvec[is.even])
## [1] 2 4 6 8 10
# `NA` is a special code for missing data. It pretty much means "Don't know".
# You can't test for a `NA` the way you would test for any other value (i.e., using `--` operator) since `variable==NA` is like asking in English, "Is the variable equal to some number I don't know?"
# It also doesn't make any sense to add one to something you don't know.
# R therefore provides the function `is.na()` that allows us to subset using locals.
aaa <- c(1,2,3,NA,4,5,6,NA,NA,7,8,9,NA,10)
aaa <- aaa[!is.na(aaa)]
aaa
## [1] 1 2 3 4 5 6 7 8 9 10
aaa <- c(1,2,3,NA,4,5,6,NA,NA,7,8,9,NA,10)
is.na(aaa)
## [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [13] TRUE FALSE
!is.na(aaa)
## [1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
## [13] FALSE TRUE
# There are a couple other special values of objects.
# One is `Inf`, which means 'infinity'. It may result from dividing zero by zero.
# Another is `NaN`, which means 'not a number'. You'll get this, e.g., you try to make a logarithm of a negative number.
# The function `table()` is a very useful way of exploring data.
aaa <- rpois(100,5)
table(aaa)
## aaa
## 1 2 3 4 5 6 7 8 9 10
## 6 8 12 20 18 18 9 5 3 1
donner <- read.table("https://web.stanford.edu/class/earthsys214/data/donner.dat", header=TRUE, skip=2)
# survival=0 == died; male=0 == female
with(donner, table(male,survival))
## survival
## male 0 1
## 0 5 10
## 1 20 10
# table along 3 dimensions
with(donner, table(male, survival, age))
## , , age = 15
##
## survival
## male 0 1
## 0 0 1
## 1 1 0
##
## , , age = 18
##
## survival
## male 0 1
## 0 0 0
## 1 0 1
##
## , , age = 20
##
## survival
## male 0 1
## 0 0 1
## 1 0 1
##
## , , age = 21
##
## survival
## male 0 1
## 0 0 1
## 1 0 0
##
## , , age = 22
##
## survival
## male 0 1
## 0 0 1
## 1 0 0
##
## , , age = 23
##
## survival
## male 0 1
## 0 0 1
## 1 2 1
##
## , , age = 24
##
## survival
## male 0 1
## 0 0 1
## 1 1 0
##
## , , age = 25
##
## survival
## male 0 1
## 0 1 1
## 1 5 1
##
## , , age = 28
##
## survival
## male 0 1
## 0 0 0
## 1 2 2
##
## , , age = 30
##
## survival
## male 0 1
## 0 0 0
## 1 3 1
##
## , , age = 32
##
## survival
## male 0 1
## 0 0 2
## 1 0 1
##
## , , age = 35
##
## survival
## male 0 1
## 0 0 0
## 1 1 0
##
## , , age = 40
##
## survival
## male 0 1
## 0 0 1
## 1 1 1
##
## , , age = 45
##
## survival
## male 0 1
## 0 2 0
## 1 0 0
##
## , , age = 46
##
## survival
## male 0 1
## 0 0 0
## 1 0 1
##
## , , age = 47
##
## survival
## male 0 1
## 0 1 0
## 1 0 0
##
## , , age = 50
##
## survival
## male 0 1
## 0 1 0
## 1 0 0
##
## , , age = 57
##
## survival
## male 0 1
## 0 0 0
## 1 1 0
##
## , , age = 60
##
## survival
## male 0 1
## 0 0 0
## 1 1 0
##
## , , age = 62
##
## survival
## male 0 1
## 0 0 0
## 1 1 0
##
## , , age = 65
##
## survival
## male 0 1
## 0 0 0
## 1 1 0
cage <- rep(0, length(donner$age))
# simplify by defining 2 age classes: over/under 25
cage[donner$age<=25] <- 1
cage[donner$age>25] <- 2
(donner <- data.frame(donner, cage=cage))
## age male survival cage
## 01 23 1 0 1
## 02 40 0 1 2
## 03 40 1 1 2
## 04 30 1 0 2
## 05 28 1 0 2
## 06 40 1 0 2
## 07 45 0 0 2
## 08 62 1 0 2
## 09 65 1 0 2
## 10 45 0 0 2
## 11 25 0 0 1
## 12 28 1 1 2
## 13 28 1 0 2
## 14 23 1 0 1
## 15 22 0 1 1
## 16 23 0 1 1
## 17 28 1 1 2
## 18 15 0 1 1
## 19 47 0 0 2
## 20 57 1 0 2
## 21 20 0 1 1
## 22 18 1 1 1
## 23 25 1 0 1
## 24 60 1 0 2
## 25 25 1 1 1
## 26 20 1 1 1
## 27 32 1 1 2
## 28 32 0 1 2
## 29 24 0 1 1
## 30 30 1 1 2
## 31 15 1 0 1
## 32 50 0 0 2
## 33 21 0 1 1
## 34 25 1 0 1
## 35 46 1 1 2
## 36 32 0 1 2
## 37 30 1 0 2
## 38 25 1 0 1
## 39 25 1 0 1
## 40 25 1 0 1
## 41 30 1 0 2
## 42 35 1 0 2
## 43 23 1 1 1
## 44 24 1 0 1
## 45 25 0 1 1
(with(donner, table(male, survival, cage)))
## , , cage = 1
##
## survival
## male 0 1
## 0 1 7
## 1 9 4
##
## , , cage = 2
##
## survival
## male 0 1
## 0 4 3
## 1 11 6
# It's sometimes useful to sort a vector
aaa <- rpois(100,5)
sort(aaa)
## [1] 0 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3
## [26] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5
## [51] 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6
## [76] 6 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 9 9 9 9 9 10
# decreasing order
sort(aaa, decreasing=TRUE)
## [1] 10 9 9 9 9 9 8 8 8 8 8 8 8 7 7 7 7 7 7 7 7 7 7 7 6
## [26] 6 6 6 6 6 6 6 6 6 6 6 6 6 5 5 5 5 5 5 5 5 5 5 5 5
## [51] 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [76] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 1 1 1 1 0
# Sorting data frames is a bit more involved, but still straightforward
# use the funciton `order()`.
# five columns of data again
satu <- c(1,2,3,4,5)
dua <- c("a","b","c","d","e")
tiga <- sample(c(TRUE, FALSE), 5, replace=TRUE)
empat <- LETTERS[7:11]
lima <- rnorm(5)
# construct a data frame
(collection <- data.frame(satu, dua, tiga, empat, lima))
## satu dua tiga empat lima
## 1 1 a TRUE G 1.6971778
## 2 2 b FALSE H -0.5200089
## 3 3 c FALSE I -0.8786923
## 4 4 d FALSE J 0.2653374
## 5 5 e FALSE K -0.4019141
o <- order(collection$lima)
collection[o,]
## satu dua tiga empat lima
## 3 3 c FALSE I -0.8786923
## 2 2 b FALSE H -0.5200089
## 5 5 e FALSE K -0.4019141
## 4 4 d FALSE J 0.2653374
## 1 1 a TRUE G 1.6971778
# The matrix of aggressive interactions among kids had neither colum nor row names.
# We can add the codes used in the Strayer and Streayer (1976) paper.
kids <- read.table("https://web.stanford.edu/class/earthsys214/data/strayer_strayer1976-fig2.txt", header=FALSE)
kid.names <- c("Ro","Ss","Br","If","Td","Sd","Pe","Ir","Cs","Ka",
"Ch","Ty","Gl","Sa", "Me","Ju","Sh")
colnames(kids) <- kid.names
rownames(kids) <- kid.names
# `colnames()` and `rownames()` are convenient functions.
# `apply()` applies a function along the margins of a matrix
# `lapply()` applies a function to a list and generates a list as its output.
# `sapply()` is similar to `lapply()` but generates a vector as its output.
aaa <- list(alpha=1:10, beta=rnorm(100), x=sample(1:100, 100, replace=TRUE))
lapply(aaa, mean)
## $alpha
## [1] 5.5
##
## $beta
## [1] 0.1370515
##
## $x
## [1] 47.17
# more compact as a vector
sapply(aaa, mean)
## alpha beta x
## 5.5000000 0.1370515 47.1700000
# cross-tabulation of sex partners from NHSLS
sextable <- read.csv("https://web.stanford.edu/class/earthsys214/data/nhsls_sextable.txt", header=FALSE)
dimnames(sextable)[[1]] <- c("white","black","hispanic","asian","other")
dimnames(sextable)[[2]] <- c("white","black","hispanic","asian","other")
sextable
## white black hispanic asian other
## white 1131 12 16 3 15
## black 5 268 5 0 0
## hispanic 39 1 115 0 3
## asian 12 0 0 10 4
## other 7 0 1 0 18
# calculate marginals
(row.sums <- apply(sextable, 1, sum))
## white black hispanic asian other
## 1177 278 158 26 26
(col.sums <- apply(sextable, 2, sum))
## white black hispanic asian other
## 1194 281 137 13 40
# using sapply() gives similar output
sapply(sextable,sum)
## white black hispanic asian other
## 1194 281 137 13 40
# compare the output of sapply() to lapply()
lapply(sextable, sum)
## $white
## [1] 1194
##
## $black
## [1] 281
##
## $hispanic
## [1] 137
##
## $asian
## [1] 13
##
## $other
## [1] 40
# `if` allows you to conditionally evaluate expressions.
# The basic syntax of an `if` statement is: `if(condition) true.branch else false.branch`
# The `else` part of the statement is optional.
(coin <- sample(c("heads", "tails"), 1))
## [1] "tails"
if(coin=="tails") b <- 1 else b <- 0
b
## [1] 1
# Sometimes you can use the very efficient `ifelse` statement
(x <- 4:-2)
## [1] 4 3 2 1 0 -1 -2
# sqrt(x) produces warnings, but using ifelse to check works without producing warnings.
(sqrt(ifelse(x>=0, x, NA)))
## [1] 2.000000 1.732051 1.414214 1.000000 0.000000 NA NA
# If you want to repeat an action over and over again you need a loop.
# Loops are mostly generated using `for` statements.
# The basic syntax of a `for` loop is: `for(item in sequence) statement(s)`.
x <- 1:5
for(i in 1:5) print(x[i])
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# If there are multiple statements executed by a `for` loop, those statements must be enclosed in curly braces, `{}`.
# We need to be careful with `for` loops because they can slow code down, particularly when they are nested and the number of iterations is very large.
# Vectorizing and using mapping functions like `apply` and its relatives can greatly speed your code up.
# Much of the functionality of R comes from the many contributed packages.
# Installing can be done by: `install.packages(package.name)`
# It is often more convenient to use a menu command under: `Tools > Install Packages...`
# Once a pckage is installed, you must load it: `library()`
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
g <- graph(c(1,2, 1,3, 2,3, 3,5), n=5)
plot(g)
