This document provides an overview of the key data types in R used
for statistical analysis. Below are the examples of a few basic data
types and data structures that are used while doing data analysis.
The numeric data type is used to represent quantities. It covers both
whole numbers and decimal (floating-point) values.
Example: weight, height, distance
# Example of Numeric
x <- 42 # Numeric (no L suffix, so it is not an integer)
y <- 3.14 # Numeric (floating-point)
class(x)
## [1] "numeric"
class(y)
## [1] "numeric"
# Arithmetic operations on Numeric
# NOTE: `c` below is only a demo variable name. Calls such as c(1, 2) still
# work afterwards because R skips non-function objects when looking up a call.
# Addition
a <- x + y
print(a)
## [1] 45.14
# Subtraction
b <- x - y
print(b)
## [1] 38.86
# Multiplication
c <- x * y
print(c)
## [1] 131.88
# Division
d <- x / y
print(d)
## [1] 13.3758
# Floor division: divide, then round the quotient down to a whole number
e <- floor(x / y)
print(e)
## [1] 13
# Power: `^` is the exponent operator (x * 2 would only double x)
f <- x^2
print(f)
## [1] 1764
# Modulus (remainder of the division); 0 means x is divisible by 3
g <- x %% 3
print(g)
## [1] 0
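The integer data type represents whole numbers. A number must be written
with an L suffix (e.g. 10L) to be stored as an integer; without the
suffix it is stored as numeric.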
Example: Number of people
# Integer example: the L suffix stores the value as an integer
x <- 10L
class(x)
## [1] "integer"
# Arithmetic operations work similarly to those shown for numeric values
The character data type represents text or categorical data as strings.
It is often used for labeling.
Example: Naming variable, labelling experiment types like control,
tumor, non_tumor
# Character example: text values are written between quotes
name <- "Amy"
class(name)
## [1] "character"
A vector is the simplest data structure in R. It is a
one-dimensional array that contains elements of the
same type (e.g., numeric, character, logical).
Characteristics:
- Homogeneous (all elements must be of the same
type).
- Created using the c() function.
# Example of a Vector: one vector for each basic type, built with c()
numeric_vector <- c(1, 2, 3, 4) # Numeric
character_vector <- c("A", "B", "C") # Character
logical_vector <- c(TRUE, FALSE, TRUE) # Logical
print(numeric_vector)
## [1] 1 2 3 4
print(character_vector)
## [1] "A" "B" "C"
print(logical_vector)
## [1]  TRUE FALSE  TRUE
class(numeric_vector)
## [1] "numeric"
# Other ways of creating vectors
# --- rep(): repetition ---
# 1. Repeat a single value a number of times
repeated_vector <- rep(5, times = 4) # the value 5, four times
print(repeated_vector)
## [1] 5 5 5 5
# 2. Repeat a whole sequence
repeated_sequence <- rep(c(1, 2, 3), times = 2) # 1 2 3, then 1 2 3 again
print(repeated_sequence)
## [1] 1 2 3 1 2 3
# 3. Repeat every element individually before moving to the next one
each_repeated <- rep(c("A", "B"), each = 3)
print(each_repeated)
## [1] "A" "A" "A" "B" "B" "B"
# --- the : operator ---
# 4. Increasing sequence from 1 to 10
sequence_colon <- 1:10
print(sequence_colon)
##  [1]  1  2  3  4  5  6  7  8  9 10
# 5. Decreasing sequence from 10 to 1
reverse_sequence <- 10:1
print(reverse_sequence)
##  [1] 10  9  8  7  6  5  4  3  2  1
# --- seq() ---
# 6. Sequence with a fixed step size
step_sequence <- seq(1, 10, by = 2)
print(step_sequence)
## [1] 1 3 5 7 9
# 7. Sequence with a fixed number of evenly spaced values
length_sequence <- seq(0, 1, length.out = 5)
print(length_sequence)
## [1] 0.00 0.25 0.50 0.75 1.00
# --- c(): concatenation ---
# 8. Combining elements of different types
combined_vector <- c(1, 2, 3, "A", "B")
# Vectors are homogeneous, so every element is converted to character
print(combined_vector)
## [1] "1" "2" "3" "A" "B"
# --- replicate() ---
# 9. Evaluate an expression repeatedly: 5 random draws between 1 and 10
random_values <- replicate(5, sample(1:10, 1))
print(random_values)
# Example output only; the values differ from run to run
## [1] 4 6 6 6 8
# Accessing elements in a vector. Three cases are shown:
# a particular element, a range of elements, and elements meeting a condition
# Example vector
my_vector <- c(10, 20, 30, 40, 50)
# Particular element: the second one
second_element <- my_vector[2]
print(second_element)
## [1] 20
# Range of elements: the first three
first_three <- head(my_vector, 3)
print(first_three)
## [1] 10 20 30
# Conditional elements: only those greater than 30
greater_than_30 <- my_vector[my_vector > 30]
print(greater_than_30)
## [1] 40 50
# Modifying an element in a vector
# Set the third element to 100
my_vector[3] <- 100
print(my_vector)
## [1]  10  20 100  40  50
# Adding an element at the end of the vector using c()
# Append 60
my_vector <- c(my_vector, 60)
print(my_vector)
## [1]  10  20 100  40  50  60
# Removing elements from a vector
# A negative index drops that position: remove the second element
my_vector <- my_vector[-2]
print(my_vector)
## [1]  10 100  40  50  60
# Filtering data with a logical condition
# Keep only values greater than 30
filtered_vector <- my_vector[my_vector > 30]
print(filtered_vector)
## [1] 100  40  50  60
# Element-wise mathematical operations
# Multiply each element by 2
doubled_vector <- 2 * my_vector
print(doubled_vector)
## [1]  20 200  80 100 120
# Add a constant to each element
increased_vector <- 10 + my_vector
print(increased_vector)
## [1]  20 110  50  60  70
# Functions that summarise the whole vector
# Sum of the elements
sum_value <- sum(my_vector)
print(sum_value)
## [1] 260
# Mean of the elements
mean_value <- mean(my_vector)
print(mean_value)
## [1] 52
# Largest element
max_value <- max(my_vector)
print(max_value)
## [1] 100
A matrix is a two-dimensional array where all elements are of the
same type.
Characteristics:
-Homogeneous (all elements must be of the same
type).
-Created using the matrix() function.
-Accessed via row and column indices.
# Example of a Matrix
# 3 rows and 3 columns. matrix() fills column by column by default;
# byrow = TRUE fills row by row instead.
my_matrix <- matrix(1:9, nrow = 3, byrow = TRUE)
print(my_matrix)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
# Accessing elements in a matrix with [row, column]
# Single element: 2nd row, 3rd column
element <- my_matrix[2, 3]
print(element)
## [1] 6
# Entire second row (leave the column index empty)
row2 <- my_matrix[2, ]
print(row2)
## [1] 4 5 6
# Entire third column (leave the row index empty)
col3 <- my_matrix[, 3]
print(col3)
## [1] 3 6 9
# Submatrix: rows 1 to 2 and columns 2 to 3
submatrix <- my_matrix[1:2, 2:3]
print(submatrix)
##      [,1] [,2]
## [1,]    2    3
## [2,]    5    6
# Adding rows and columns to an existing matrix
# rbind() appends a row; the new row is labelled with the variable name
new_row <- c(10, 11, 12)
matrix_with_row <- rbind(my_matrix, new_row)
print(matrix_with_row)
##         [,1] [,2] [,3]
##            1    2    3
##            4    5    6
##            7    8    9
## new_row   10   11   12
# cbind() appends a column; the new column is labelled with the variable name
new_col <- c(13, 14, 15)
matrix_with_col <- cbind(my_matrix, new_col)
print(matrix_with_col)
##            new_col
## [1,] 1 2 3      13
## [2,] 4 5 6      14
## [3,] 7 8 9      15
# Applying a function over rows or columns with apply()
# Second argument 1 = over rows: sum of each row
row_sums <- apply(my_matrix, 1, sum)
print(row_sums)
## [1]  6 15 24
# Second argument 2 = over columns: sum of each column
col_sums <- apply(my_matrix, 2, sum)
print(col_sums)
## [1] 12 15 18
# Mathematical operations on matrices
# Two 2 x 3 matrices to work with
matrix1 <- matrix(1:6, nrow = 2, ncol = 3, byrow = TRUE)
matrix2 <- matrix(7:12, nrow = 2, ncol = 3, byrow = TRUE)
# Matrix multiplication with %*%
# matrix2 is transposed to 3 x 2 so that the inner dimensions match
product <- matrix1 %*% t(matrix2)
print(product)
##      [,1] [,2]
## [1,]   50   68
## [2,]  122  167
# Element-wise addition of two matrices of the same shape
sum_matrix <- matrix1 + matrix2
print(sum_matrix)
##      [,1] [,2] [,3]
## [1,]    8   10   12
## [2,]   14   16   18
# Element-wise multiplication (plain *, not %*%)
prod_matrix <- matrix1 * matrix2
print(prod_matrix)
##      [,1] [,2] [,3]
## [1,]    7   16   27
## [2,]   40   55   72
# Transposing a matrix with t(): rows become columns
transpose_matrix <- t(my_matrix)
print(transpose_matrix)
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
# Naming rows and columns with rownames() and colnames();
# the names are supplied as character vectors built with c()
colnames(my_matrix) <- c("Col1", "Col2", "Col3")
rownames(my_matrix) <- c("Row1", "Row2", "Row3")
print(my_matrix)
##      Col1 Col2 Col3
## Row1    1    2    3
## Row2    4    5    6
## Row3    7    8    9
An array is a multi-dimensional data structure. It is an extension of
matrices to higher dimensions.
Characteristics:
-Homogeneous (all elements must be of the same
type).
-Created using the array() function.
-Accessed via indices for each dimension.
# Example of a 3D Array
# dim = c(rows, columns, layers); the values 1:8 fill it column by column,
# one layer after the other
my_array <- array(1:8, dim = c(2, 2, 2))
print(my_array)
## , , 1
##
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
##
## , , 2
##
##      [,1] [,2]
## [1,]    5    7
## [2,]    6    8
# Accessing elements: one index per dimension
my_array[1, 1, 2] # row 1, column 1, layer 2
## [1] 5
A list is a heterogeneous data structure that can contain elements of
different types, including other lists, vectors, matrices, or
arrays.
Characteristics:
-Heterogeneous (can store different data types).
-Created using the list() function.
-Accessed via indexing ([[ ]]) double square brackets or names
($).
# Example of a List holding a string, a number, a vector and a matrix
my_list <- list(
  name = "Amy",
  age = 25,
  scores = c(90, 85, 88),
  matrix = matrix(1:4, nrow = 2)
)
print(my_list)
## $name
## [1] "Amy"
##
## $age
## [1] 25
##
## $scores
## [1] 90 85 88
##
## $matrix
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# Accessing elements
my_list$name # by name
## [1] "Amy"
my_list[[2]] # by position
## [1] 25
# List slicing: single brackets return another list that contains
# only the chosen elements of the original list
my_new_list <- my_list[3]
print(my_new_list)
## $scores
## [1] 90 85 88
# Slicing a group of consecutive elements using :
my_slice_list <- my_list[1:3]
# Changing an element in the list using $name
my_list$age <- 100
# Changing values inside an element: [[ ]] picks the element,
# a further [ ] indexes into it
sample_list <- list(3:4, "Amy", matrix(1:10, nrow = 2))
sample_list[[1]][2] <- 5
sample_list[[3]][2, 2] <- 300
print(sample_list)
## [[1]]
## [1] 3 5
##
## [[2]]
## [1] "Amy"
##
## [[3]]
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2  300    6    8   10
# Different for lists: arithmetic on list elements goes through the apply
# family of functions.
# lapply() applies a function to each element of a list or vector and always
#   returns a list.
# sapply() does the same but simplifies the result to a vector or matrix when
#   it can, so its return type depends on the input.
# vapply() is like sapply(), but the type and length of each result are
#   declared up front and checked, so the return type is always the same.
# mapply() applies a function to the elements of several vectors or lists in
#   parallel. Syntax: mapply(<function>, <list$element1>, <list$element2>)
# Create a list of numeric vectors
my_list2 <- list(vec1 = c(1, 2, 3), vec2 = c(4, 5, 6))
# Element-wise addition of the two vectors inside the list
sum_of_vectors <- mapply(function(x, y) x + y, my_list2$vec1, my_list2$vec2)
print(sum_of_vectors)
## [1] 5 7 9
# Add a constant value to every vector in the list (result stays a list)
added_values <- lapply(my_list2, function(x) x + 10)
print(added_values)
## $vec1
## [1] 11 12 13
##
## $vec2
## [1] 14 15 16
# Square each element
# NOTE: this reuses the name my_list, replacing any object stored under it
my_list <- list(a = 1, b = 2, c = 3)
# numeric(1) declares that every call must return exactly one number
result <- vapply(my_list, function(x) x^2, numeric(1))
print(result)
## a b c
## 1 4 9
A data frame is a two-dimensional tabular data structure whose columns
can hold different data types. It is like a table in a database.
Characteristics:
-Rows and Columns:
Rows represent observations.
Columns represent variables.
-Heterogeneous Columns:
Each column can have a different data type. For example, one column
might store numeric values, while another stores character
strings.
-Column Names and Row Names:
Columns typically have names (headers), which can be assigned or
accessed.
Rows may also have names, but they default to numeric indices if not
specified.
-Structure:
It is a list of equal-length vectors, where each vector represents
a column.
# Create a data frame from a numeric vector, a character vector and a
# one-column logical matrix (each becomes one column)
mynum <- c(11, 22, 33)
mystr <- c("I", "love", "math")
mymat <- matrix(c(TRUE, FALSE, FALSE), ncol = 1)
mydf <- data.frame(mynum, mystr, mymat)
print(mydf)
##   mynum mystr mymat
## 1    11     I  TRUE
## 2    22  love FALSE
## 3    33  math FALSE
# Accessing a particular column. Three ways are shown:
# $colname, [, "colname"] and [, colnumber]
# Using $colname
col1 <- mydf$mynum
print(col1)
## [1] 11 22 33
# Using [ , "colname"]
col2 <- mydf[, "mynum"]
print(col2)
## [1] 11 22 33
# Using [ , colnumber]
col3 <- mydf[, 2]
print(col3)
## [1] "I"    "love" "math"
# Accessing a row (leave the column index empty)
row1 <- mydf[1, ]
print(row1)
##   mynum mystr mymat
## 1    11     I  TRUE
# If the rows are named, say "a", "b" and "c"
mydf1 <- mydf
rownames(mydf1) <- c("a", "b", "c")
# Print the copy that carries the new row names
print(mydf1)
##   mynum mystr mymat
## a    11     I  TRUE
## b    22  love FALSE
## c    33  math FALSE
row2 <- mydf1["a", ]
print(row2)
##   mynum mystr mymat
## a    11     I  TRUE
# Accessing a particular cell using df["rowname", "colname"]
print(mydf1["a", "mystr"])
## [1] "I"
# An all-numeric data frame for the arithmetic examples
column1 <- c(1, 2, 3, 4, 5)
column2 <- c(6, 7, 8, 9, 10)
column3 <- c(11, 12, 13, 14, 15)
ar_df <- data.frame(column1, column2, column3)
# Mean / sum / any function over every row: apply() with second argument 1
sum_row <- apply(ar_df, 1, sum)
print(sum_row)
## [1] 18 21 24 27 30
# Mean / sum / any function over every column: apply() with second argument 2
sum_col <- apply(ar_df, 2, sum)
print(sum_col)
## column1 column2 column3
##      15      40      65
# Mean of a particular column, here mydf$mynum
mean_num <- mean(mydf$mynum)
print(mean_num)
## [1] 22
# Define the new column (one value per existing row)
column_new <- rep("new", 5)
# Adding a column to the existing data frame using cbind()
ar_df <- cbind(ar_df, column_new)
print(ar_df)
##   column1 column2 column3 column_new
## 1       1       6      11        new
## 2       2       7      12        new
## 3       3       8      13        new
## 4       4       9      14        new
## 5       5      10      15        new
# Adding a row to an existing data frame using rbind()
# NOTE: this row is all character, so the numeric columns are converted to
# character and stay character even after the row is removed again below.
# To keep column types, bind a one-row data frame with matching types.
row_new <- rep("new", 4)
ar_df <- rbind(ar_df, row_new)
print(ar_df)
##   column1 column2 column3 column_new
## 1       1       6      11        new
## 2       2       7      12        new
## 3       3       8      13        new
## 4       4       9      14        new
## 5       5      10      15        new
## 6     new     new     new        new
# Removing a row using a negative row number
ar_df <- ar_df[-6, ]
print(ar_df)
##   column1 column2 column3 column_new
## 1       1       6      11        new
## 2       2       7      12        new
## 3       3       8      13        new
## 4       4       9      14        new
## 5       5      10      15        new
# Removing several rows using df[-c(1, 2, 3, ...), ]
ar_df2 <- ar_df[-c(1, 2), ]
print(ar_df2)
##   column1 column2 column3 column_new
## 3       3       8      13        new
## 4       4       9      14        new
## 5       5      10      15        new
# Removing a column by column number: a negative index drops the column
# (a positive index such as ar_df[, 4] would extract it instead)
ar_df1 <- ar_df[, -4]
print(ar_df1)
##   column1 column2 column3
## 1       1       6      11
## 2       2       7      12
## 3       3       8      13
## 4       4       9      14
## 5       5      10      15
# Removing a column by column name
ar_df <- ar_df[, !names(ar_df) %in% "column_new"]
print(ar_df)
##   column1 column2 column3
## 1       1       6      11
## 2       2       7      12
## 3       3       8      13
## 4       4       9      14
## 5       5      10      15
# Removing multiple columns using df[, -c(x, y, z, ...)]
# drop = FALSE keeps the result a data frame when only one column is left;
# without it the result collapses to a plain vector
ar_df1 <- ar_df
ar_df1 <- ar_df1[, -c(1, 2), drop = FALSE]
print(ar_df1)
##   column3
## 1      11
## 2      12
## 3      13
## 4      14
## 5      15
When we have a data object and want to know its type, R provides
functions for checking it, such as class() and the is.*() family
(for example is.numeric()).
R can also change the data type (and mode) of a data object, using the
as.*() family of functions (for example as.character()).
# Check the data type using the is.<type>() functions -> TRUE / FALSE
# A data frame is not a plain vector
is.vector(ar_df)
## [1] FALSE
# Check if the data is numeric (a data frame as a whole never is;
# test a single column, e.g. is.numeric(ar_df$column1), instead)
is.numeric(ar_df)
## [1] FALSE
# Change values to character using the as.<type>() functions
# Convert one column at a time: calling as.character() on a whole data
# frame turns each column into a single deparsed string instead
char1 <- as.character(ar_df$column1)
print(char1)
## [1] "1" "2" "3" "4" "5"
# Checking for a missing value (NA) at a position you already know
# is.na() returns TRUE when the value at that position is missing
sample_na <- c(2, 4, 5, 6, 7, NA, 3)
is.na(sample_na[3])
## [1] FALSE
is.na(sample_na[6])
## [1] TRUE