SAA Seminar - R Basic Concept and Syntax

R - Basic Concepts and Syntax

Intro

Basic data types and working with data structures. R has three main data types: character, numeric, and logical. These data types are used as single objects, or within data structures including: vector, matrix, list, and data.frame.

Characters

# a simple character value is written in quotes
# "a" # a quoted string is valid on its own and prints: [1] "a"
# but not unquoted, because R expects that to be an object defined as something
# print(a) # returns: Error: object 'a' not found
# use the assignment operator '<-' to assign a value to the object 'a'
a <- "x"
print(a)

[1] "x"

class(a) # tells us the data type of the object a: [1] "character"

[1] "character"

# now the object 'a' is assigned the character value "x"

## That object can be used in a printed string with the paste0() function
print(paste0("object a contains the value: ", a))

[1] "object a contains the value: x"

# we can change the value of 'a' by reassigning it with another value
a <- "y" # R does not warn you that 'a' already is assigned a value!
print(paste0("object a contains the value: ", a))

[1] "object a contains the value: y"

Numeric

## object 'a' can also be a numeric
a <- 2
print(a)

[1] 2

# assign another number to object 'b'
b <- 5
# now we can add the two objects since they are just placeholders for numbers
a + b

[1] 7

# this could also be accomplished simply by typing:
2 + 5

[1] 7

# but using objects allows for the declaration of values, types, and all sorts of stuff

Vectors

Character vector

# character vector
v <- c("1", "x", "ch")
print(v)

[1] "1"  "x"  "ch"

v1 <- c("The", "brown", "dog")
print(v1)

[1] "The"   "brown" "dog"

v2 <- c("runs", "fast!")
print(v2)

[1] "runs"  "fast!"

# join the two character vectors end to end
v3 <- c(v1, v2)
# collapse the words into one space-separated sentence
new_sentence <- paste(v3, collapse = " ")
print(new_sentence)

[1] "The brown dog runs fast!"

Numeric vector

# numeric vector 
bvec <- c(1,4,78)
print(bvec)

[1]  1  4 78

# logical vector
# note: naming this object 'c' reuses the name of the c() function; calls such
# as c(...) still work because R looks for a function when a name is called
c <- c(TRUE, FALSE, TRUE)
print(c)

[1]  TRUE FALSE  TRUE

# coerce the logical vector to numeric: TRUE becomes 1 and FALSE becomes 0
as.numeric(c)

[1] 1 0 1

Numeric vector from sequences

# build the integer sequence 1 to 10 as a vector
bvec1 <- seq_len(10)
# length() reports how many elements the vector holds
length(bvec1)

[1] 10

# seq() builds the same run of values from explicit start, end, and step
bvec2 <- seq(1, 10, by = 1)
# vectors of equal length are added element by element
bvec1 + bvec2

 [1]  2  4  6  8 10 12 14 16 18 20

# vector of 40 bootstrapped samples drawn with replacement from vector bvec1
# no seed is set earlier in this script, so the values drawn differ between runs
boot_samp <- sample(bvec1, 40, replace = TRUE)
print(boot_samp)

 [1]  8  8  6  7  3  8  7  2  9  8  3 10  9  7  4  1  5  1  7  3  1  1  6
[24] 10 10  8  8  4  1  5  8  4  8  8  8 10  3  2  9  1

# draw 1000 standard normal values (mean 0, sd 1) after fixing the seed
set.seed(717)
rand_norm <- rnorm(n = 1000, mean = 0, sd = 1)
length(rand_norm)

[1] 1000

Factors

# draw 15 color names at random (with replacement) and store them as a factor
colors <- sample(x = c("orange", "green", "blue"), size = 15, replace = TRUE)
fact1 <- factor(colors)
print(fact1)

 [1] green  green  orange orange green  orange orange blue   blue   blue  
[11] orange green  orange green  green 
Levels: blue green orange

# prints as color names, but is stored internally as integer values assigned based on alphabetical order
print(fact1)

 [1] green  green  orange orange green  orange orange blue   blue   blue  
[11] orange green  orange green  green 
Levels: blue green orange

levels(fact1)

[1] "blue"   "green"  "orange"

# the underlying integer codes: each value's position within levels(fact1)
as.numeric(fact1)

 [1] 2 2 3 3 2 3 3 1 1 1 3 2 3 2 2

as.character(fact1)

 [1] "green"  "green"  "orange" "orange" "green"  "orange" "orange"
 [8] "blue"   "blue"   "blue"   "orange" "green"  "orange" "green" 
[15] "green"

## Sometimes the relative order of the levels matters
fact2 <- factor(fact1, levels = c("blue", "orange", "green"))
levels(fact2)

[1] "blue"   "orange" "green"

## In other cases the levels must carry an explicit ranking
ord_fact <- factor(
  colors,
  levels = c("blue", "orange", "green"),
  ordered = TRUE
)
# the Levels line now reads blue < orange < green, i.e. the levels are ranked
print(ord_fact)

 [1] green  green  orange orange green  orange orange blue   blue   blue  
[11] orange green  orange green  green 
Levels: blue < orange < green

Qualifying and Quantifying vectors

table(v)

v
 1 ch  x 
 1  1  1

table(c)

c
FALSE  TRUE 
    1     2

table(boot_samp)

boot_samp
 1  2  3  4  5  6  7  8  9 10 
 6  2  4  3  2  2  4 10  3  4

sum(boot_samp)

[1] 231

mean(rand_norm)

[1] -0.01100348

var(rand_norm)

[1] 0.9722378

# squaring the standard deviation gives back the variance computed by var()
sd(rand_norm)^2

[1] 0.9722378

quantile(rand_norm)

        0%        25%        50%        75%       100% 
-2.9095517 -0.6568909 -0.0269866  0.6667781  2.8128756

Creating a matrix

# fill a 5 x 4 matrix column by column with the integers 1 to 20
m1 <- matrix(data = 1:20, nrow = 5, ncol = 4)
print(m1)

     [,1] [,2] [,3] [,4]
[1,]    1    6   11   16
[2,]    2    7   12   17
[3,]    3    8   13   18
[4,]    4    9   14   19
[5,]    5   10   15   20

# 20 values do not divide evenly into 6 columns, so R issues a warning and
# recycles the data from the start to fill the remaining cells
m2 <- matrix(1:20, ncol = 6)

Warning in matrix(1:20, ncol = 6): data length [20] is not a sub-multiple
or multiple of the number of columns [6]

print(m2)

     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    5    9   13   17    1
[2,]    2    6   10   14   18    2
[3,]    3    7   11   15   19    3
[4,]    4    8   12   16   20    4

# rebuild m2 with 2 columns so the 20 values fit exactly
m2 <- matrix(data = 1:20, ncol = 2)
print(m2)

      [,1] [,2]
 [1,]    1   11
 [2,]    2   12
 [3,]    3   13
 [4,]    4   14
 [5,]    5   15
 [6,]    6   16
 [7,]    7   17
 [8,]    8   18
 [9,]    9   19
[10,]   10   20

# a character matrix: the first 20 lowercase letters laid out in 2 columns
m3 <- matrix(data = letters[seq_len(20)], ncol = 2)
print(m3)

      [,1] [,2]
 [1,] "a"  "k" 
 [2,] "b"  "l" 
 [3,] "c"  "m" 
 [4,] "d"  "n" 
 [5,] "e"  "o" 
 [6,] "f"  "p" 
 [7,] "g"  "q" 
 [8,] "h"  "r" 
 [9,] "i"  "s" 
[10,] "j"  "t"

dim(m3) # dimensions of matrix

[1] 10  2

nrow(m3) # number of rows in matrix

[1] 10

ncol(m3) # number of columns in matrix

[1] 2

summary(m3) # summary of values in columns

       V1          V2   
 a      :1   k      :1  
 b      :1   l      :1  
 c      :1   m      :1  
 d      :1   n      :1  
 e      :1   o      :1  
 f      :1   p      :1  
 (Other):4   (Other):4

# summarize the columns before renaming them; V1 and V2 are the labels
# summary() shows while the matrix has no column names of its own
summary(m2) # default column names

       V1              V2       
 Min.   : 1.00   Min.   :11.00  
 1st Qu.: 3.25   1st Qu.:13.25  
 Median : 5.50   Median :15.50  
 Mean   : 5.50   Mean   :15.50  
 3rd Qu.: 7.75   3rd Qu.:17.75  
 Max.   :10.00   Max.   :20.00

# label the two columns; the rows stay unnamed
dimnames(m2) <- list(NULL, c("Column 1", "Column 2"))
print(m2) # columns now print under their new names

      Column 1 Column 2
 [1,]        1       11
 [2,]        2       12
 [3,]        3       13
 [4,]        4       14
 [5,]        5       15
 [6,]        6       16
 [7,]        7       17
 [8,]        8       18
 [9,]        9       19
[10,]       10       20

t(m2) # transposed matrix

         [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
Column 1    1    2    3    4    5    6    7    8    9    10
Column 2   11   12   13   14   15   16   17   18   19    20

# combining numbers and letters in one vector coerces everything to character,
# so the resulting matrix is all character
m4 <- matrix(data = c(seq_len(10), letters[seq_len(10)]), ncol = 2)
print(m4)

      [,1] [,2]
 [1,] "1"  "a" 
 [2,] "2"  "b" 
 [3,] "3"  "c" 
 [4,] "4"  "d" 
 [5,] "5"  "e" 
 [6,] "6"  "f" 
 [7,] "7"  "g" 
 [8,] "8"  "h" 
 [9,] "9"  "i" 
[10,] "10" "j"

Lists

# ex. a list of three elements: 1) a single numeric, 2) a character vector, 3) a numeric matrix
l1 <- list(b, new_sentence, m2)
print(l1)

[[1]]
[1] 5

[[2]]
[1] "The brown dog runs fast!"

[[3]]
      Column 1 Column 2
 [1,]        1       11
 [2,]        2       12
 [3,]        3       13
 [4,]        4       14
 [5,]        5       15
 [6,]        6       16
 [7,]        7       17
 [8,]        8       18
 [9,]        9       19
[10,]       10       20

# simulate a binary response and two numeric predictors, fit a binomial glm,
# and bundle the data together with the model summary in a named list
y_var <- rbinom(n = 10, size = 1, prob = 0.5)
x_vars <- matrix(
  c(rnorm(n = 10, mean = 0, sd = 1), rnorm(n = 10, mean = 4, sd = 0.5)),
  ncol = 2
)
mod1 <- glm(y_var ~ x_vars, family = "binomial")
model_list <- list(y = y_var, x = x_vars, model = summary(mod1))
print(model_list)

$y
 [1] 0 1 1 1 0 0 0 0 0 0

$x
            [,1]     [,2]
 [1,]  2.2097418 3.940926
 [2,]  0.4195389 4.051594
 [3,]  1.4387764 3.996274
 [4,]  0.2971389 3.554381
 [5,] -0.3063221 4.630435
 [6,]  0.9295945 3.331206
 [7,] -0.8209835 4.342619
 [8,] -0.1748246 4.195883
 [9,]  0.9464769 4.331505
[10,] -2.8249518 5.097590

$model

Call:
glm(formula = y_var ~ x_vars, family = "binomial")

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4566  -0.7169  -0.5534   0.7764   1.5488  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)   7.0780     8.7534   0.809    0.419
x_vars1       0.1175     0.8451   0.139    0.889
x_vars2      -1.9667     2.1264  -0.925    0.355

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 12.217  on 9  degrees of freedom
Residual deviance: 10.588  on 7  degrees of freedom
AIC: 16.588

Number of Fisher Scoring iterations: 5

Indexing and subsetting

## Each of the data structures can be indexed and subsetted to retrieve elements, rows, vectors, etc...
# for vectors
char1 <- v1[2]
print(char1)

[1] "brown"

num1 <- bvec[3]
print(num1)

[1] 78

num2 <- boot_samp[3:15]
print(num2)

 [1]  6  7  3  8  7  2  9  8  3 10  9  7  4

Indexing matrices

## Matrices are indexed by row (n) then column (m) as [n,m]
# single value from row 1, column 1
m2[1,1]

Column 1 
       1

# single value from row 5, column 2
m2[5,2]

Column 2 
      15

# get entire row - index by row number, but leave column index blank
# returns a named vector (add drop = FALSE to keep a single-row matrix)
m2[5,]

Column 1 Column 2 
       5       15

# get entire column - index by column number, but leave row index blank
# returns a vector
m2[,2]

 [1] 11 12 13 14 15 16 17 18 19 20

# or return a single column matrix
m2[,2,drop = FALSE]

      Column 2
 [1,]       11
 [2,]       12
 [3,]       13
 [4,]       14
 [5,]       15
 [6,]       16
 [7,]       17
 [8,]       18
 [9,]       19
[10,]       20

# return a range of rows, same works for columns
m2[1:3,]

     Column 1 Column 2
[1,]        1       11
[2,]        2       12
[3,]        3       13

Appending to a matrix

## growing a matrix by rows or by columns
# values to append: a fixed new row, and a random ordering of 1 to 11 for a column
new_row <- c(99, 109)
new_col <- sample(seq_len(11), size = 11)
# rbind() stacks the new row underneath m2
new_matrix <- rbind(m2, new_row)
print(new_matrix)

        Column 1 Column 2
               1       11
               2       12
               3       13
               4       14
               5       15
               6       16
               7       17
               8       18
               9       19
              10       20
new_row       99      109

# add column with cbind() function
new_matrix <- cbind(new_matrix, new_col)
print(new_matrix)

        Column 1 Column 2 new_col
               1       11       1
               2       12       2
               3       13       7
               4       14       9
               5       15       5
               6       16       8
               7       17       3
               8       18      10
               9       19       4
              10       20       6
new_row       99      109      11

# name the columns and clear the row names in a single assignment
dimnames(new_matrix) <- list(NULL, c("col1", "col2", "col3"))
print(new_matrix)

      col1 col2 col3
 [1,]    1   11    1
 [2,]    2   12    2
 [3,]    3   13    7
 [4,]    4   14    9
 [5,]    5   15    5
 [6,]    6   16    8
 [7,]    7   17    3
 [8,]    8   18   10
 [9,]    9   19    4
[10,]   10   20    6
[11,]   99  109   11

data.frames

data.frames are similar to matrices, but can store values of either num, char, or logic for each column. data.frames are more general than matrices and are a very common data format for analysis.

df1 <- data.frame(new_matrix)
print(df1)

   col1 col2 col3
1     1   11    1
2     2   12    2
3     3   13    7
4     4   14    9
5     5   15    5
6     6   16    8
7     7   17    3
8     8   18   10
9     9   19    4
10   10   20    6
11   99  109   11

## df1 contains only numeric data, but we can add a column of characters using cbind()
# a matrix would silently coerce every value to character, whereas a
# data.frame keeps each column's own type
# stringsAsFactors = TRUE is forwarded to data.frame() so the new column is
# stored as a factor on every R version (R >= 4.0 would otherwise keep it as
# character); seq_len() is the safe way to count from 1 to nrow(df1)
df1 <- cbind(df1, col4 = letters[seq_len(nrow(df1))], stringsAsFactors = TRUE)
print(df1)

   col1 col2 col3 col4
1     1   11    1    a
2     2   12    2    b
3     3   13    7    c
4     4   14    9    d
5     5   15    5    e
6     6   16    8    f
7     7   17    3    g
8     8   18   10    h
9     9   19    4    i
10   10   20    6    j
11   99  109   11    k

## It can be indexed the same way as a matrix using [row,column]
# returns a vector of the second column
df1[,2]

 [1]  11  12  13  14  15  16  17  18  19  20 109

# or single values
df1[5,2]

[1] 15

## data.frames can also be indexed by column names in two different ways
# 1) quoted string with brackets as above, e.g. [,"column_name"]
# note: the factor output shown relies on strings being converted to factors,
# which was the default before R 4.0 (stringsAsFactors = TRUE); on R >= 4.0 the
# column stays character unless stringsAsFactors = TRUE is passed explicitly
df1[,"col4"]

 [1] a b c d e f g h i j k
Levels: a b c d e f g h i j k

# or index by the column as its own object
## or multiple columns using the c() function and column names
df1[ ,c("col1", "col4")]

   col1 col4
1     1    a
2     2    b
3     3    c
4     4    d
5     5    e
6     6    f
7     7    g
8     8    h
9     9    i
10   10    j
11   99    k

# or 2) with the '$' operator 
df1$col1

 [1]  1  2  3  4  5  6  7  8  9 10 99