# Originally published in: 
# https://eehh-stanford.github.io/SNA-workshop/intro-r.html#summarizing-data
# The easiest and most frequently used ways to read data in to R is reading data from plain-text (ASCII) files. These files can be space, tab, or comma delimited.

# R expects delimited files to be "white-space delimited" with values separated by either tabs or spaces and rows separated by carriage returns.

# It's always a good idea to specify whether or not you have a header (i.e., column names).
# If you don't, say `header=FALS`; if you do, say `header=TRUE`

(kids <- read.table("https://web.stanford.edu/class/earthsys214/data/strayer_strayer1976-fig2.txt", header=FALSE))
##    V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17
## 1   0  1  3  4  1  0  0  1  1   0   1   0   7   0   1   0   0
## 2   1  0  7  8  2  1  1 12  3   0   1   1   4   1   0   0   2
## 3   1  4  0  7  3  2  2  0  1   0   8   1   5   5   0   0   1
## 4   3  3  2  0  3  1 13  3  5   1   0   0   8   3   0   2   1
## 5   1  0  0  3  0  4  6  0  8   5   1   0   1   3   0   2   1
## 6   0  0  0  0  0  0  2  8 11   0   4   0   4   3   0   1   0
## 7   1  0  1  9  3  4  0  2  0   0   1   0   7   9   1   1   0
## 8   0  0  0  1  1  1  2  0  7   5   1   1   1   0   0   0   0
## 9   1  1  1  2  5 11  0  3  0   0   0   0   1   0   0   0   0
## 10  0  0  0  0  0  0  1  0  0   0   0  11   0   1   0   1   4
## 11  4  0  4  3  3  2  1  0  0   0   0   3  11   5   0   2   2
## 12  0  0  0  0  0  0  0  0  0   2   0   0   2   0   8   0   0
## 13  0  1  9  3  0  3  6  0  0   0  11   2   0   1   0   7   5
## 14  0  1  4  0  1  2  1  0  0   0   1   0   1   0   0   0   0
## 15  0  0  0  0  0  0  0  0  0   0   0   3   0   0   0   0   0
## 16  0  0  0  0  0  1  0  0  0   0   0   1   1   0   0   0   0
## 17  0  0  0  0  0  0  0  0  1   1   0   0   0   0   0   0   0
# If your file is delimited by something other than spaces, ti's a good idea to use a slightly different function, `read.delim()` and specify exactly what the delimiter is.

# Frequently, there will be non-tabular information at the top of a file (e.g., meta-data describing the data set). Use the `skip=n` option, where `n` is the number of lines you want skipped.

quercus <- read.delim("https://web.stanford.edu/class/earthsys214/data/quercus.txt", skip=24, sep="\t", header=TRUE)
head(quercus)
##                     Species   Region Range acorn.size tree.height
## 1           Quercus alba L. Atlantic 24196        1.4          27
## 2  Quercus bicolor Willd.   Atlantic  7900        3.4          21
## 3 Quercus macrocarpa Michx. Atlantic 23038        9.1          25
## 4  Quercus prinoides Willd. Atlantic 17042        1.6           3
## 5         Quercus Prinus L. Atlantic  7646       10.5          24
## 6    Quercus stellata Wang. Atlantic 19938        2.5          17
# R handles data in a manner that is different than many statistical packages.
# In particular, you are not limited to a single rectangular data matrix at a time.
# The workspace holds all the objects (e.g., data frames, variables, functions) that you have created or read in.
# You can essentially have as many data frames as your machine's memory will allow.

# To find out what lurks in your workspace, use `objects()` command.
# To remove an object, use `rm()`.
# If you really want to clear your whole workspace, you can use the following syntax: `rm(list=ls())`.
# Beware, though. Once you do this, you don't get the data back.

objects()
## [1] "kids"    "quercus"
rm(kids)
rm(list=ls())
objects()
## character(0)
# Because the R workspace can contain many different varialbes and even multiple data frames, you must be aware of scope.
# When we extract columns of a data frame (e.g., if we wanted to plot them) we need to use the syntax `data.frame$col.name`.

quercus <- read.delim("https://web.stanford.edu/class/earthsys214/data/quercus.txt", skip=24, sep="\t", header=TRUE)
plot(quercus$tree.height, quercus$acorn.size, pch=16, col='red', xlab='Tree Height (m)', ylab = 'Acorn Size (cm3)')

# It can be a hassle having to type the data frame name (and dollar sign) over and over again.
# With the `with()` function, we can set up a local scoping rule that allows us to drop the need to type the data frame naem (and dollar sign) to access columns of a data frame.

quercus <- read.delim("https://web.stanford.edu/class/earthsys214/data/quercus.txt", skip=24, sep="\t", header=TRUE)
with(quercus, plot(tree.height, acorn.size, pch=16, col='blue', xlab='Tree Height (m)', ylab='Acorn Size (cm3)'))

# Index (and access) the elements of a vector using square brackets. `myvec[1]` takes the first element of a vector called `myvec`.

# Use the colon(:) operator for sequences. `myvec[1:5]` takes the first five elemnts of `myvec`.

# R allows negative indexing: `myvec[-1]` takes all elements of except the first one. To exclude a sequence, you need to place the sequence within parentheses: `myvec[-(1:5)]`.

# Vector indices don't have to be consecutive: `myvec[c(2, 5, 1, 11)]`

myvec <- c(1, 2, 3, 4, 5, 6, 66, 77, 7, 8, 9, 10)
myvec[1]
## [1] 1
myvec[1:5]
## [1] 1 2 3 4 5
myvec[-1]
##  [1]  2  3  4  5  6 66 77  7  8  9 10
myvec[-(1:5)]
## [1]  6 66 77  7  8  9 10
myvec[c(2, 5, 1, 11)]
## [1] 2 5 1 9
# Access the elements of a data frame using the dollar sign.
# Subsetting anything other than a data frame users square brackets.

dim(quercus)
## [1] 39  5
(size <- quercus$acorn.size)
##  [1]  1.4  3.4  9.1  1.6 10.5  2.5  0.9  6.8  1.8  0.3  0.9  0.8  2.0  1.1  0.6
## [16]  1.8  4.8  1.1  3.6  1.1  1.1  3.6  8.1  3.6  1.8  0.4  1.1  1.2  4.1  1.6
## [31]  2.0  5.5  5.9  2.6  6.0  1.0 17.1  0.4  7.1
size[1:3] # first 3 elements
## [1] 1.4 3.4 9.1
size[17] # only 17th element
## [1] 4.8
size[-39] # all but the 39th element
##  [1]  1.4  3.4  9.1  1.6 10.5  2.5  0.9  6.8  1.8  0.3  0.9  0.8  2.0  1.1  0.6
## [16]  1.8  4.8  1.1  3.6  1.1  1.1  3.6  8.1  3.6  1.8  0.4  1.1  1.2  4.1  1.6
## [31]  2.0  5.5  5.9  2.6  6.0  1.0 17.1  0.4
size[c(3, 6, 9)] # 3rd, 6th, 9th elements
## [1] 9.1 2.5 1.8
size[quercus$Region=="California"] # use a logical test to subset
##  [1]  4.1  1.6  2.0  5.5  5.9  2.6  6.0  1.0 17.1  0.4  7.1
quercus[3, 4] # access an element of an array or data frame by X[row, col]
## [1] 9.1
quercus[, "tree.height"]
##  [1] 27.0 21.0 25.0  3.0 24.0 17.0 15.0  0.3 24.0 11.0 15.0 23.0 24.0  3.0 13.0
## [16] 30.0  9.0 27.0  9.0 24.0 23.0 27.0 24.0 23.0 18.0  9.0  9.0  4.0 18.0  6.0
## [31] 17.0 20.0 30.0 23.0 26.0 21.0 15.0  1.0 18.0
# The comma with nothing in front of it means take every row in the column named `tree.height`.
# Positive indices include, negative indices exclude elements.

# 1:3 means a sequence from 1 to 3

# You can only use a single negative subscript, i.e., you can't use quercus$acorn.size[-1:3]

# Of course, you can get around this by enclosing the vector in parentheses `quercus$acorn.size[-(1:3)]

# Logical operator: `&&` (another and), `||` (another or)

# `&` and `!` work elementwise on vectors: element 1 is compared in two vectors, then element 2, and so on.

# `&&` and `||` are tricky. These logical tests evaluate left to right, examining only the first element of each vector (they go until a result is determine dfor `||`.).

# Why would you want that? In general, you don't. It makes some calculations faster.

# When you refer to a variable in a data frame, you must specify the dataf frame named followed a dollar sigh and the variable name `quercus$acorn.size`.

# Testing for equality is just a special case of a logical test. We frequently want to identify numbers either above or below some criterion.

myvec <- c(1, 2, 3, 4, 5, 6, 66, 7, 77, 8, 9, 10)
myvec <- myvec[myvec<=10]
myvec
##  [1]  1  2  3  4  5  6  7  8  9 10
x <- 1:7
# elements that are greater than 2 but less than 6
(x>2) & (x<6)
## [1] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE
# This is not intuitive
(x>2) && (x<6)
## [1] FALSE
# but dis doe ...
(x<2) && (x<6)
## [1] TRUE
is.even <- rep(c(FALSE, TRUE), 5)
(evens <- myvec[is.even])
## [1]  2  4  6  8 10
# `NA` is a special code for missing data. It pretty much means "Don't know".

# You can't test for a `NA` the way you would test for any other value (i.e., using `--` operator) since `variable==NA` is like asking in English, "Is the variable equal to some number I don't know?"

# It also doesn't make any sense to add one to something you don't know.

# R therefore provides the function `is.na()` that allows us to subset using locals.

aaa <- c(1,2,3,NA,4,5,6,NA,NA,7,8,9,NA,10)
aaa <- aaa[!is.na(aaa)]
aaa
##  [1]  1  2  3  4  5  6  7  8  9 10
aaa <- c(1,2,3,NA,4,5,6,NA,NA,7,8,9,NA,10)
is.na(aaa)
##  [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [13]  TRUE FALSE
!is.na(aaa)
##  [1]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [13] FALSE  TRUE
# There are a couple other special values of objects.
# One is `Inf`, which means 'infinity'. It may result from dividing zero by zero.
# Another is `NaN`, which means 'not a number'. You'll get this, e.g., you try to make a logarithm of a negative number.
# The function `table()` is a very useful way of exploring data.
aaa <- rpois(100,5)
table(aaa)
## aaa
##  1  2  3  4  5  6  7  8  9 10 
##  6  8 12 20 18 18  9  5  3  1
donner <- read.table("https://web.stanford.edu/class/earthsys214/data/donner.dat", header=TRUE, skip=2)
# survival=0 == died; male=0 == female
with(donner, table(male,survival))
##     survival
## male  0  1
##    0  5 10
##    1 20 10
# table along 3 dimensions
with(donner, table(male, survival, age))
## , , age = 15
## 
##     survival
## male 0 1
##    0 0 1
##    1 1 0
## 
## , , age = 18
## 
##     survival
## male 0 1
##    0 0 0
##    1 0 1
## 
## , , age = 20
## 
##     survival
## male 0 1
##    0 0 1
##    1 0 1
## 
## , , age = 21
## 
##     survival
## male 0 1
##    0 0 1
##    1 0 0
## 
## , , age = 22
## 
##     survival
## male 0 1
##    0 0 1
##    1 0 0
## 
## , , age = 23
## 
##     survival
## male 0 1
##    0 0 1
##    1 2 1
## 
## , , age = 24
## 
##     survival
## male 0 1
##    0 0 1
##    1 1 0
## 
## , , age = 25
## 
##     survival
## male 0 1
##    0 1 1
##    1 5 1
## 
## , , age = 28
## 
##     survival
## male 0 1
##    0 0 0
##    1 2 2
## 
## , , age = 30
## 
##     survival
## male 0 1
##    0 0 0
##    1 3 1
## 
## , , age = 32
## 
##     survival
## male 0 1
##    0 0 2
##    1 0 1
## 
## , , age = 35
## 
##     survival
## male 0 1
##    0 0 0
##    1 1 0
## 
## , , age = 40
## 
##     survival
## male 0 1
##    0 0 1
##    1 1 1
## 
## , , age = 45
## 
##     survival
## male 0 1
##    0 2 0
##    1 0 0
## 
## , , age = 46
## 
##     survival
## male 0 1
##    0 0 0
##    1 0 1
## 
## , , age = 47
## 
##     survival
## male 0 1
##    0 1 0
##    1 0 0
## 
## , , age = 50
## 
##     survival
## male 0 1
##    0 1 0
##    1 0 0
## 
## , , age = 57
## 
##     survival
## male 0 1
##    0 0 0
##    1 1 0
## 
## , , age = 60
## 
##     survival
## male 0 1
##    0 0 0
##    1 1 0
## 
## , , age = 62
## 
##     survival
## male 0 1
##    0 0 0
##    1 1 0
## 
## , , age = 65
## 
##     survival
## male 0 1
##    0 0 0
##    1 1 0
cage <- rep(0, length(donner$age))
# simplify by defining 2 age classes: over/under 25
cage[donner$age<=25] <- 1
cage[donner$age>25] <- 2
(donner <- data.frame(donner, cage=cage))
##    age male survival cage
## 01  23    1        0    1
## 02  40    0        1    2
## 03  40    1        1    2
## 04  30    1        0    2
## 05  28    1        0    2
## 06  40    1        0    2
## 07  45    0        0    2
## 08  62    1        0    2
## 09  65    1        0    2
## 10  45    0        0    2
## 11  25    0        0    1
## 12  28    1        1    2
## 13  28    1        0    2
## 14  23    1        0    1
## 15  22    0        1    1
## 16  23    0        1    1
## 17  28    1        1    2
## 18  15    0        1    1
## 19  47    0        0    2
## 20  57    1        0    2
## 21  20    0        1    1
## 22  18    1        1    1
## 23  25    1        0    1
## 24  60    1        0    2
## 25  25    1        1    1
## 26  20    1        1    1
## 27  32    1        1    2
## 28  32    0        1    2
## 29  24    0        1    1
## 30  30    1        1    2
## 31  15    1        0    1
## 32  50    0        0    2
## 33  21    0        1    1
## 34  25    1        0    1
## 35  46    1        1    2
## 36  32    0        1    2
## 37  30    1        0    2
## 38  25    1        0    1
## 39  25    1        0    1
## 40  25    1        0    1
## 41  30    1        0    2
## 42  35    1        0    2
## 43  23    1        1    1
## 44  24    1        0    1
## 45  25    0        1    1
(with(donner, table(male, survival, cage)))
## , , cage = 1
## 
##     survival
## male  0  1
##    0  1  7
##    1  9  4
## 
## , , cage = 2
## 
##     survival
## male  0  1
##    0  4  3
##    1 11  6
# It's sometimes useful to sort a vector
aaa <- rpois(100,5)
sort(aaa)
##   [1]  0  1  1  1  1  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [26]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5
##  [51]  5  5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6  6  6  6  6  6  6  6  6
##  [76]  6  7  7  7  7  7  7  7  7  7  7  7  8  8  8  8  8  8  8  9  9  9  9  9 10
# decreasing order
sort(aaa, decreasing=TRUE)
##   [1] 10  9  9  9  9  9  8  8  8  8  8  8  8  7  7  7  7  7  7  7  7  7  7  7  6
##  [26]  6  6  6  6  6  6  6  6  6  6  6  6  6  5  5  5  5  5  5  5  5  5  5  5  5
##  [51]  5  5  5  5  5  5  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
##  [76]  3  3  3  3  3  3  3  3  3  3  3  3  3  2  2  2  2  2  2  2  1  1  1  1  0
# Sorting data frames is a bit more involved, but still straightforward
# use the funciton `order()`.

# five columns of data again
satu <- c(1,2,3,4,5)
dua <- c("a","b","c","d","e")
tiga <- sample(c(TRUE, FALSE), 5, replace=TRUE)
empat <- LETTERS[7:11]
lima <- rnorm(5)
# construct a data frame
(collection <- data.frame(satu, dua, tiga, empat, lima))
##   satu dua  tiga empat       lima
## 1    1   a  TRUE     G  1.6971778
## 2    2   b FALSE     H -0.5200089
## 3    3   c FALSE     I -0.8786923
## 4    4   d FALSE     J  0.2653374
## 5    5   e FALSE     K -0.4019141
o <- order(collection$lima)
collection[o,]
##   satu dua  tiga empat       lima
## 3    3   c FALSE     I -0.8786923
## 2    2   b FALSE     H -0.5200089
## 5    5   e FALSE     K -0.4019141
## 4    4   d FALSE     J  0.2653374
## 1    1   a  TRUE     G  1.6971778
# The matrix of aggressive interactions among kids had neither colum nor row names.
# We can add the codes used in the Strayer and Streayer (1976) paper.

kids <- read.table("https://web.stanford.edu/class/earthsys214/data/strayer_strayer1976-fig2.txt", header=FALSE)
kid.names <- c("Ro","Ss","Br","If","Td","Sd","Pe","Ir","Cs","Ka",
                "Ch","Ty","Gl","Sa", "Me","Ju","Sh")
colnames(kids) <- kid.names
rownames(kids) <- kid.names

# `colnames()` and `rownames()` are convenient functions.
# `apply()` applies a function along the margins of a matrix
# `lapply()` applies a function to a list and generates a list as its output.
# `sapply()` is similar to `lapply()` but generates a vector as its output.

aaa <- list(alpha=1:10, beta=rnorm(100), x=sample(1:100, 100, replace=TRUE))
lapply(aaa, mean)
## $alpha
## [1] 5.5
## 
## $beta
## [1] 0.1370515
## 
## $x
## [1] 47.17
# more compact as a vector
sapply(aaa, mean)
##      alpha       beta          x 
##  5.5000000  0.1370515 47.1700000
# cross-tabulation of sex partners from NHSLS
sextable <- read.csv("https://web.stanford.edu/class/earthsys214/data/nhsls_sextable.txt", header=FALSE)
dimnames(sextable)[[1]] <- c("white","black","hispanic","asian","other")
dimnames(sextable)[[2]] <- c("white","black","hispanic","asian","other")
sextable
##          white black hispanic asian other
## white     1131    12       16     3    15
## black        5   268        5     0     0
## hispanic    39     1      115     0     3
## asian       12     0        0    10     4
## other        7     0        1     0    18
# calculate marginals
(row.sums <- apply(sextable, 1, sum))
##    white    black hispanic    asian    other 
##     1177      278      158       26       26
(col.sums <- apply(sextable, 2, sum))
##    white    black hispanic    asian    other 
##     1194      281      137       13       40
# using sapply() gives similar output
sapply(sextable,sum)
##    white    black hispanic    asian    other 
##     1194      281      137       13       40
# compare the output of sapply() to lapply()
lapply(sextable, sum)
## $white
## [1] 1194
## 
## $black
## [1] 281
## 
## $hispanic
## [1] 137
## 
## $asian
## [1] 13
## 
## $other
## [1] 40
# `if` allows you to conditionally evaluate expressions.
# The basic syntax of an `if` statement is: `if(condition) true.branch else false.branch`
# The `else` part of the statement is optional.

(coin <- sample(c("heads", "tails"), 1))
## [1] "tails"
if(coin=="tails") b <- 1 else b <- 0
b
## [1] 1
# Sometimes you can use the very efficient `ifelse` statement
(x <- 4:-2)
## [1]  4  3  2  1  0 -1 -2
# sqrt(x) produces warnings, but using ifelse to check works without producing warnings.
(sqrt(ifelse(x>=0, x, NA)))
## [1] 2.000000 1.732051 1.414214 1.000000 0.000000       NA       NA
# If you want to repeat an action over and over again you need a loop.
# Loops are mostly generated using `for` statements.
# The basic syntax of a `for` loop is: `for(item in sequence) statement(s)`.

x <- 1:5
for(i in 1:5) print(x[i])
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# If there are multiple statements executed by a `for` loop, those statements must be enclosed in curly braces, `{}`.
# We need to be careful with `for` loops because they can slow code down, particularly when they are nested and the number of iterations is very large.

# Vectorizing and using mapping functions like `apply` and its relatives can greatly speed your code up.
# Much of the functionality of R comes from the many contributed packages.
# Installing can be done by: `install.packages(package.name)`
# It is often more convenient to use a menu command under: `Tools > Install Packages...`
# Once a pckage is installed, you must load it: `library()`
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
g <- graph(c(1,2, 1,3, 2,3, 3,5), n=5)
plot(g)