Basic data types and working with data structures. R has three main data types: character, numeric, and logical. These data types are used as single objects, or within data structures including: vector, matrix, list, and data.frame.
# simple character
# "a" # a quoted string is a valid character value; it prints: [1] "a"
# but not unquoted, because R expects that to be an object defined as something
# print(a) # returns: Error: object 'a' not found
# use the assignment operator '<-' to assign a value to the object 'a'
a <- "x"
print(a)
[1] "x"
class(a) # tells us the data type of the object a: [1] "character"
[1] "character"
# now the object 'a' is assigned the character value "x"
## That object can be used in a printed string with the paste0() function
print(paste0("object a contains the value: ", a))
[1] "object a contains the value: x"
# we can change the value of 'a' by reassigning it with another value
a <- "y" # R does not warn you that 'a' already is assigned a value!
print(paste0("object a contains the value: ", a))
[1] "object a contains the value: y"
## object 'a' can also be a numeric
a <- 2
print(a)
[1] 2
# assign another number to object 'b'
b <- 5
# now we can add the two objects since they are just placeholders for numbers
a + b
[1] 7
# this could also be accomplished simply by typing:
2 + 5
[1] 7
# but using objects allows for the declaration of values, types, and all sorts of stuff
# character vector
v <- c("1", "x", "ch")
print(v)
[1] "1" "x" "ch"
v1 <- c("The", "brown", "dog")
print(v1)
[1] "The" "brown" "dog"
v2 <- c("runs", "fast!")
print(v2)
[1] "runs" "fast!"
# concatenate character vectors
v3 <- c(v1, v2)
# compose into sentence
new_sentence <- (paste0(v3, collapse = ' '))
print(new_sentence)
[1] "The brown dog runs fast!"
# numeric vector
bvec <- c(1,4,78)
print(bvec)
[1] 1 4 78
# logical vector
c <- c(TRUE, FALSE, TRUE)
print(c)
[1] TRUE FALSE TRUE
as.numeric(c)
[1] 1 0 1
# make a numeric vector from a sequence
bvec1 <- 1:10
# find the length of a vector
length(bvec1)
[1] 10
# or - to get the same thing
bvec2 <- seq(from = 1, to = 10, by = 1)
# add vectors
bvec1 + bvec2
[1] 2 4 6 8 10 12 14 16 18 20
# vector of bootstrapped samples (drawn with replacement) from vector bvec1
boot_samp <- sample(bvec1, 40, replace = TRUE)
print(boot_samp)
[1] 8 8 6 7 3 8 7 2 9 8 3 10 9 7 4 1 5 1 7 3 1 1 6
[24] 10 10 8 8 4 1 5 8 4 8 8 8 10 3 2 9 1
# a vector of length 1000 filled with random standard normals
set.seed(717)
rand_norm <- rnorm(1000,0,1)
length(rand_norm)
[1] 1000
# make a factor data type
colors <- sample(c("orange", "green", "blue"), 15, replace = TRUE)
fact1 <- factor(colors)
print(fact1)
[1] green green orange orange green orange orange blue blue blue
[11] orange green orange green green
Levels: blue green orange
# prints as color names, but is stored internally as integer values assigned based on alphabetical order
print(fact1)
[1] green green orange orange green orange orange blue blue blue
[11] orange green orange green green
Levels: blue green orange
levels(fact1)
[1] "blue" "green" "orange"
as.numeric(fact1)
[1] 2 2 3 3 2 3 3 1 1 1 3 2 3 2 2
as.character(fact1)
[1] "green" "green" "orange" "orange" "green" "orange" "orange"
[8] "blue" "blue" "blue" "orange" "green" "orange" "green"
[15] "green"
## Sometimes the relative order of the levels matters
fact2 <- factor(fact1, levels = c("blue", "orange", "green"))
levels(fact2)
[1] "blue" "orange" "green"
## Other times, factors need to be explicitly ordered
ord_fact <- ordered(colors, levels = c("blue", "orange", "green"))
# notice that the Levels attribute shows that blue is less than orange is less than green
print(ord_fact)
[1] green green orange orange green orange orange blue blue blue
[11] orange green orange green green
Levels: blue < orange < green
table(v)
v
1 ch x
1 1 1
table(c)
c
FALSE TRUE
1 2
table(boot_samp)
boot_samp
1 2 3 4 5 6 7 8 9 10
6 2 4 3 2 2 4 10 3 4
sum(boot_samp)
[1] 231
mean(rand_norm)
[1] -0.01100348
var(rand_norm)
[1] 0.9722378
sd(rand_norm)^2
[1] 0.9722378
quantile(rand_norm)
0% 25% 50% 75% 100%
-2.9095517 -0.6568909 -0.0269866 0.6667781 2.8128756
m1 <- matrix(1:20, nrow = 5)
print(m1)
[,1] [,2] [,3] [,4]
[1,] 1 6 11 16
[2,] 2 7 12 17
[3,] 3 8 13 18
[4,] 4 9 14 19
[5,] 5 10 15 20
m2 <- matrix(1:20, ncol = 6)
Warning in matrix(1:20, ncol = 6): data length [20] is not a sub-multiple
or multiple of the number of columns [6]
print(m2)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 5 9 13 17 1
[2,] 2 6 10 14 18 2
[3,] 3 7 11 15 19 3
[4,] 4 8 12 16 20 4
m2 <- matrix(1:20, ncol = 2)
print(m2)
[,1] [,2]
[1,] 1 11
[2,] 2 12
[3,] 3 13
[4,] 4 14
[5,] 5 15
[6,] 6 16
[7,] 7 17
[8,] 8 18
[9,] 9 19
[10,] 10 20
m3 <- matrix(letters[1:20], ncol = 2)
print(m3)
[,1] [,2]
[1,] "a" "k"
[2,] "b" "l"
[3,] "c" "m"
[4,] "d" "n"
[5,] "e" "o"
[6,] "f" "p"
[7,] "g" "q"
[8,] "h" "r"
[9,] "i" "s"
[10,] "j" "t"
dim(m3) # dimensions of matrix
[1] 10 2
nrow(m3) # number of rows in matrix
[1] 10
ncol(m3) # number of columns in matrix
[1] 2
summary(m3) # summary of values in columns
V1 V2
a :1 k :1
b :1 l :1
c :1 m :1
d :1 n :1
e :1 o :1
f :1 p :1
(Other):4 (Other):4
# change column names
summary(m2) # default column names
V1 V2
Min. : 1.00 Min. :11.00
1st Qu.: 3.25 1st Qu.:13.25
Median : 5.50 Median :15.50
Mean : 5.50 Mean :15.50
3rd Qu.: 7.75 3rd Qu.:17.75
Max. :10.00 Max. :20.00
colnames(m2) <- c("Column 1", "Column 2")
print(m2) # new column names
Column 1 Column 2
[1,] 1 11
[2,] 2 12
[3,] 3 13
[4,] 4 14
[5,] 5 15
[6,] 6 16
[7,] 7 17
[8,] 8 18
[9,] 9 19
[10,] 10 20
t(m2) # transposed matrix
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
Column 1 1 2 3 4 5 6 7 8 9 10
Column 2 11 12 13 14 15 16 17 18 19 20
# mixing char and num vectors results in all character matrix
m4 <- matrix(c(1:10,letters[1:10]), ncol = 2)
print(m4)
[,1] [,2]
[1,] "1" "a"
[2,] "2" "b"
[3,] "3" "c"
[4,] "4" "d"
[5,] "5" "e"
[6,] "6" "f"
[7,] "7" "g"
[8,] "8" "h"
[9,] "9" "i"
[10,] "10" "j"
# e.g. a list of three elements: 1) a single numeric, 2) a character vector, 3) a numeric matrix
l1 <- list(b, new_sentence, m2)
print(l1)
[[1]]
[1] 5
[[2]]
[1] "The brown dog runs fast!"
[[3]]
Column 1 Column 2
[1,] 1 11
[2,] 2 12
[3,] 3 13
[4,] 4 14
[5,] 5 15
[6,] 6 16
[7,] 7 17
[8,] 8 18
[9,] 9 19
[10,] 10 20
y_var <- rbinom(10,1,0.5)
x_vars <- matrix(c(rnorm(10,0,1),rnorm(10,4,0.5)),ncol = 2)
mod1 <- glm(y_var ~ x_vars, family = "binomial")
model_list <- list(y = y_var, x = x_vars, model = summary(mod1))
print(model_list)
$y
[1] 0 1 1 1 0 0 0 0 0 0
$x
[,1] [,2]
[1,] 2.2097418 3.940926
[2,] 0.4195389 4.051594
[3,] 1.4387764 3.996274
[4,] 0.2971389 3.554381
[5,] -0.3063221 4.630435
[6,] 0.9295945 3.331206
[7,] -0.8209835 4.342619
[8,] -0.1748246 4.195883
[9,] 0.9464769 4.331505
[10,] -2.8249518 5.097590
$model
Call:
glm(formula = y_var ~ x_vars, family = "binomial")
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4566 -0.7169 -0.5534 0.7764 1.5488
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 7.0780 8.7534 0.809 0.419
x_vars1 0.1175 0.8451 0.139 0.889
x_vars2 -1.9667 2.1264 -0.925 0.355
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 12.217 on 9 degrees of freedom
Residual deviance: 10.588 on 7 degrees of freedom
AIC: 16.588
Number of Fisher Scoring iterations: 5
## Each of the data structures can be indexed and subsetted to retrieve elements, rows, vectors, etc...
# for vectors
char1 <- v1[2]
print(char1)
[1] "brown"
num1 <- bvec[3]
print(num1)
[1] 78
num2 <- boot_samp[3:15]
print(num2)
[1] 6 7 3 8 7 2 9 8 3 10 9 7 4
## Matrices are indexed by row (n) then column (m) as [n,m]
# single value from row 1, column 1
m2[1,1]
Column 1
1
# single value from row 5, column 2
m2[5,2]
Column 2
15
# get entire row - index by row number, but leave column index blank
# returns a named vector, because the matrix dimensions are dropped (use drop = FALSE to keep a single-row matrix)
m2[5,]
Column 1 Column 2
5 15
# get entire column - index by column number, but leave row index blank
# returns a vector
m2[,2]
[1] 11 12 13 14 15 16 17 18 19 20
# or return a single column matrix
m2[,2,drop = FALSE]
Column 2
[1,] 11
[2,] 12
[3,] 13
[4,] 14
[5,] 15
[6,] 16
[7,] 17
[8,] 18
[9,] 19
[10,] 20
# return a range of rows, same works for columns
m2[1:3,]
Column 1 Column 2
[1,] 1 11
[2,] 2 12
[3,] 3 13
## adding rows or columns
# create some random data to append
new_row <- c(99,109)
new_col <- sample(1:11,11)
# add row with rbind() function
new_matrix <- rbind(m2,new_row)
print(new_matrix)
Column 1 Column 2
1 11
2 12
3 13
4 14
5 15
6 16
7 17
8 18
9 19
10 20
new_row 99 109
# add column with cbind() function
new_matrix <- cbind(new_matrix, new_col)
print(new_matrix)
Column 1 Column 2 new_col
1 11 1
2 12 2
3 13 7
4 14 9
5 15 5
6 16 8
7 17 3
8 18 10
9 19 4
10 20 6
new_row 99 109 11
# set row and column names
colnames(new_matrix) <- c("col1", "col2", "col3")
rownames(new_matrix) <- NULL
print(new_matrix)
col1 col2 col3
[1,] 1 11 1
[2,] 2 12 2
[3,] 3 13 7
[4,] 4 14 9
[5,] 5 15 5
[6,] 6 16 8
[7,] 7 17 3
[8,] 8 18 10
[9,] 9 19 4
[10,] 10 20 6
[11,] 99 109 11
data.frames are similar to matrices, but each column can store its own data type (numeric, character, or logical). data.frames are more general than matrices and are a very common data format for analysis.
df1 <- data.frame(new_matrix)
print(df1)
col1 col2 col3
1 1 11 1
2 2 12 2
3 3 13 7
4 4 14 9
5 5 15 5
6 6 16 8
7 7 17 3
8 8 18 10
9 9 19 4
10 10 20 6
11 99 109 11
## df1 contains only numeric data, but we can add a column of characters using cbind()
# a matrix would silently convert all data to characters (as happened with m4 above), without any warning
df1 <- cbind(df1, col4 = letters[1:nrow(df1)])
print(df1)
col1 col2 col3 col4
1 1 11 1 a
2 2 12 2 b
3 3 13 7 c
4 4 14 9 d
5 5 15 5 e
6 6 16 8 f
7 7 17 3 g
8 8 18 10 h
9 9 19 4 i
10 10 20 6 j
11 99 109 11 k
## It can be indexed the same way as a matrix using [row,column]
# returns a vector of the second column
df1[,2]
[1] 11 12 13 14 15 16 17 18 19 20 109
# or single values
df1[5,2]
[1] 15
## data.frames can also be indexed by column names in two different ways
# 1) quoted string with brackets as above, e.g. [,"column_name"]
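# note that it was automatically converted to a factor when cbind() added it to the data.frame
# (this is R < 4.0 behaviour; since R 4.0 stringsAsFactors defaults to FALSE and the column stays character)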
df1[,"col4"]
[1] a b c d e f g h i j k
Levels: a b c d e f g h i j k
# or index by the column as its own object
## or multiple columns using the c() function and column names
df1[ ,c("col1", "col4")]
col1 col4
1 1 a
2 2 b
3 3 c
4 4 d
5 5 e
6 6 f
7 7 g
8 8 h
9 9 i
10 10 j
11 99 k
# or 2) with the '$' operator
df1$col1
[1] 1 2 3 4 5 6 7 8 9 10 99