R Data Structures

There are 5 basic data structures in R:
Vectors - (homogeneous - Similar type of Items)
Lists - (heterogeneous - Dissimilar type of Items)
Matrices - (homogeneous - Similar type of Items)
Data Frames - (heterogeneous - Dissimilar type of Items)
Arrays - (homogeneous - Similar type of Items)

Vectors

Vectors are a homogeneous data structure in R. A vector can hold only numbers, only characters, only logicals, or only complex values.

## Creating Vectors ##
x <- c(0.5, 0.6)      # numeric
x
## [1] 0.5 0.6
x <- c(TRUE, FALSE)   # logical
x
## [1]  TRUE FALSE
x <- c(T, F)          # logical
x
## [1]  TRUE FALSE
x <- c("a", "b", "c") # character
x
## [1] "a" "b" "c"
x <- c(9:29)          # integer
x
##  [1]  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
x <- c(1+0i, 2+4i)    # complex
x
## [1] 1+0i 2+4i

Using the vector function:

x <- vector("numeric", length = 10)
x
##  [1] 0 0 0 0 0 0 0 0 0 0

Adding elements in a vector:

x <-c(88,233,45,67)
x <- c(x[1:3],168,x[4]) ## insert 168 before 67
x
## [1]  88 233  45 168  67

length of a vector

x <- c(1,2,4)
length(x)
## [1] 3

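0 element vector (note: 1:length(x) yields 1 0 for an empty vector, so prefer seq_along(x) when iterating over a vector)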

x <- c()
x
## NULL
length(x)
## [1] 0
1:length(x)
## [1] 1 0

Implicit Coercion

y <- c(1.7, "a")    # character, 1.7 is converted into "1.7"
y
## [1] "1.7" "a"
y <- c(TRUE, 2)   # numeric, TRUE is converted into number (1)
y
## [1] 1 2
y <- c("a", TRUE) # character, TRUE is converted to "TRUE"
y
## [1] "a"    "TRUE"

using seq() function

x <- seq(from = 12,to = 30, by = 3)
x
## [1] 12 15 18 21 24 27 30
x <- seq(from=1.1,to=2,length = 10)
x
##  [1] 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0

vector sequences with rep()

x <- rep(8,4)
x
## [1] 8 8 8 8
rep(c(5,12,13),3)
## [1]  5 12 13  5 12 13  5 12 13
rep(c(5,12,13),each=2)
## [1]  5  5 12 12 13 13
rep(1:3,2)
## [1] 1 2 3 1 2 3

indexing Vectors

y <- c(1.2,3.9,0.4,0.12)
y[c(1,3)] # extract elements 1 and 3 of y
## [1] 1.2 0.4
y[2:3]
## [1] 3.9 0.4
v <- 3:4
y[v]
## [1] 0.40 0.12

Duplicates are allowed in indexing

x <- c(4,2,17,5)
x
## [1]  4  2 17  5
y <- x[c(1,1,3)]
y
## [1]  4  4 17

Negative subscripts mean we want to exclude the given elements

z <- c(5,12,13)
z[-1] # exclude element 1
## [1] 12 13
z
## [1]  5 12 13
z[-1:-2] # exclude elements 1 through 2
## [1] 13
z
## [1]  5 12 13
z[1:(length(z)-1)] # keep all elements except the last
## [1]  5 12
z[-length(z)]
## [1]  5 12

using all() and any()

x <- 1:10
any(x > 8)
## [1] TRUE
any(x > 88)
## [1] FALSE
all(x > 88)
## [1] FALSE
all(x > 0)
## [1] TRUE

Example of Vectors using combine function, arithmetic and logical vectors with conditions

student.marks <- c(10, 20, 30,  40) #using combine function
student.marks #print variable content
## [1] 10 20 30 40
#Vectorized operations :Flavor I :Input - Single vector, Output - Scalar
mean(student.marks)
## [1] 25
#Vectorized operations :Flavor II :Input - Single vector, output - Single vector
student.marks <- student.marks + 5 #arithmetic operator
student.marks
## [1] 15 25 35 45
student.marks >= 30 #logical operator
## [1] FALSE FALSE  TRUE  TRUE
#Vectorized operations :Flavor III :Input - Multiple vectors, output - Single vector 
student.physics.marks <- c(20,40,30,50)
student.chemistry.marks <- c(30,20,50,20)
student.total.marks <- student.physics.marks + student.chemistry.marks #Addition
student.total.marks
## [1] 50 60 80 70

Lists

Lists are a heterogeneous data structure in R. A list can hold a combination of elements of different types, such as numbers and characters.

Assigning values to vectors (used below to build the lists)

student.names <- c("Raj","Rahul","Priya","Poonam")
student.names
## [1] "Raj"    "Rahul"  "Priya"  "Poonam"
student.weights <- c( 60.5, 72.5 , 45.2,  47.5)
student.weights
## [1] 60.5 72.5 45.2 47.5
student.genders <- factor(c("Male","Male","Female","Female"))
student.genders
## [1] Male   Male   Female Female
## Levels: Female Male
student.physics.marks <- c( 70L , 75L , 80L,  85L)
student.physics.marks
## [1] 70 75 80 85
student.chemistry.marks <- c(60L, 70L, 85L, 70L)
student.chemistry.marks
## [1] 60 70 85 70
Creating unnamed list
student1 <- list(student.names[1], student.weights[1], student.genders[1], 
                 student.physics.marks[1], student.chemistry.marks[1])
str(student1)
## List of 5
##  $ : chr "Raj"
##  $ : num 60.5
##  $ : Factor w/ 2 levels "Female","Male": 2
##  $ : int 70
##  $ : int 60
student1
## [[1]]
## [1] "Raj"
## 
## [[2]]
## [1] 60.5
## 
## [[3]]
## [1] Male
## Levels: Female Male
## 
## [[4]]
## [1] 70
## 
## [[5]]
## [1] 60

Creating named list

student1 <- list(name = student.names[1], 
                 weight = student.weights[1], 
                 gender = student.genders[1], 
                 physics = student.physics.marks[1],
                 chemistry = student.chemistry.marks[1])

str(student1)
## List of 5
##  $ name     : chr "Raj"
##  $ weight   : num 60.5
##  $ gender   : Factor w/ 2 levels "Female","Male": 2
##  $ physics  : int 70
##  $ chemistry: int 60

Final List with aggregates of physics and chemistry marks

student1 <- list(name = student.names[1], 
                 weight = student.weights[1], 
                 gender = student.genders[1], 
                 marks = c(student.physics.marks[1], student.chemistry.marks[1]))
str(student1)
## List of 4
##  $ name  : chr "Raj"
##  $ weight: num 60.5
##  $ gender: Factor w/ 2 levels "Female","Male": 2
##  $ marks : int [1:2] 70 60

Subsetting: Extract element(s) using unnamed list

#Subsetting: Extract element(s) using unnamed list
student1 <- list(student.names[1], student.weights[1], student.genders[1],
                 student.physics.marks[1], student.chemistry.marks[1])

student1[1] #Single brackets [] return a list (the same type as the original object)
## [[1]]
## [1] "Raj"
typeof(student1[1]) 
## [1] "list"
student1[[1]] #double brackets [[]] return the object in its own type
## [1] "Raj"
typeof(student1[[1]]) 
## [1] "character"
student1[1:3] #Access multiple elements using index
## [[1]]
## [1] "Raj"
## 
## [[2]]
## [1] 60.5
## 
## [[3]]
## [1] Male
## Levels: Female Male

Subsetting: Extract element(s) using named list

#Subsetting: Extract element(s) using named list
student1 <- list(name = student.names[1], 
                 weight = student.weights[1], 
                 gender = student.genders[1], 
                 physics = student.physics.marks[1],
                 chemistry = student.chemistry.marks[1])
student1[["name"]]
## [1] "Raj"
student1$gender #Access element using element name or label
## [1] Male
## Levels: Female Male
student1[c("physics","chemistry")] #Access multiple elements using label names
## $physics
## [1] 70
## 
## $chemistry
## [1] 60
length(student1)
## [1] 5

Matrix

Matrices are another homogeneous data structure in R. They are 2-dimensional and can hold only numbers, only characters, only logicals, or only complex values.

Example of a Matrix

it also shows the use of cbind and rbind
student.physics.marks <- c( 70L , 75L , 80L,  85L)
student.chemistry.marks <- c(60L, 70L, 85L, 70L)
student.marks <-rbind(student.physics.marks , student.chemistry.marks)
student.marks
##                         [,1] [,2] [,3] [,4]
## student.physics.marks     70   75   80   85
## student.chemistry.marks   60   70   85   70
student.marks <-cbind(student.physics.marks , student.chemistry.marks)
student.marks
##      student.physics.marks student.chemistry.marks
## [1,]                    70                      60
## [2,]                    75                      70
## [3,]                    80                      85
## [4,]                    85                      70
rownames(student.marks) <- c("Raj","Rahul","Priya","Poonam")
student.marks
##        student.physics.marks student.chemistry.marks
## Raj                       70                      60
## Rahul                     75                      70
## Priya                     80                      85
## Poonam                    85                      70
str(student.marks)
##  int [1:4, 1:2] 70 75 80 85 60 70 85 70
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:4] "Raj" "Rahul" "Priya" "Poonam"
##   ..$ : chr [1:2] "student.physics.marks" "student.chemistry.marks"
student.marks <- matrix(c( 70L , 75L , 80L,  85L, 60L, 70L, 85L, 70L),ncol=2,nrow=4)
student.marks
##      [,1] [,2]
## [1,]   70   60
## [2,]   75   70
## [3,]   80   85
## [4,]   85   70
student.marks <- matrix(c( 70L , 75L , 80L,  85L, 60L, 70L, 85L, 70L),ncol=4,nrow=2
                        ,byrow=TRUE)
student.marks
##      [,1] [,2] [,3] [,4]
## [1,]   70   75   80   85
## [2,]   60   70   85   70

Common Operations on Matrix

## Creating a matrix for operations
student.physics.marks <- c( 70L , 75L , 80L,  85L)
student.chemistry.marks <- c(60L, 70L, 85L, 70L)
student.marks <-cbind(student.physics.marks , student.chemistry.marks)
rownames(student.marks) <- c("Raj","Rahul","Priya","Poonam")
student.marks
##        student.physics.marks student.chemistry.marks
## Raj                       70                      60
## Rahul                     75                      70
## Priya                     80                      85
## Poonam                    85                      70

Subsetting a Matrix

#Subsetting: Extract element(s) from matrix
student.marks[,] #row number,column number
##        student.physics.marks student.chemistry.marks
## Raj                       70                      60
## Rahul                     75                      70
## Priya                     80                      85
## Poonam                    85                      70
student.marks[2,2] 
## [1] 70
student.marks[2,]
##   student.physics.marks student.chemistry.marks 
##                      75                      70
student.marks[,2]
##    Raj  Rahul  Priya Poonam 
##     60     70     85     70
student.marks[1:3,]
##       student.physics.marks student.chemistry.marks
## Raj                      70                      60
## Rahul                    75                      70
## Priya                    80                      85
student.marks[c(1,3),]
##       student.physics.marks student.chemistry.marks
## Raj                      70                      60
## Priya                    80                      85
student.marks[c(T,F,F,T),]
##        student.physics.marks student.chemistry.marks
## Raj                       70                      60
## Poonam                    85                      70

Summary of Students Matrix

#summary
student.marks
##        student.physics.marks student.chemistry.marks
## Raj                       70                      60
## Rahul                     75                      70
## Priya                     80                      85
## Poonam                    85                      70
rowSums(student.marks) #Row wise sum
##    Raj  Rahul  Priya Poonam 
##    130    145    165    155
colSums(student.marks) #Column wise sum
##   student.physics.marks student.chemistry.marks 
##                     310                     285
colMeans(student.marks) #Column wise mean
##   student.physics.marks student.chemistry.marks 
##                   77.50                   71.25

Data Frames

Data Frames are a heterogeneous data structure in R. They can hold a combination of numbers and characters. This is one of the most used data structures in R, especially for machine learning datasets.

Creating a Data frame from vectors and checking data types of data frames

student.names <- c("Raj","Rahul","Priya","Poonam")
student.weights <- c( 60.5, 72.5 , 45.2,  47.5)
student.genders <- factor(c("Male","Male","Female","Female"))
student.physics.marks <- c( 70L , 75L , 80L,  85L)
student.chemistry.marks <- c(60L, 70L, 85L, 70L)
students <- data.frame(student.names,student.weights,student.genders,
                       student.physics.marks, student.chemistry.marks)
typeof(students)
## [1] "list"
students
##   student.names student.weights student.genders student.physics.marks
## 1           Raj            60.5            Male                    70
## 2         Rahul            72.5            Male                    75
## 3         Priya            45.2          Female                    80
## 4        Poonam            47.5          Female                    85
##   student.chemistry.marks
## 1                      60
## 2                      70
## 3                      85
## 4                      70
str(students)
## 'data.frame':    4 obs. of  5 variables:
##  $ student.names          : Factor w/ 4 levels "Poonam","Priya",..: 4 3 2 1
##  $ student.weights        : num  60.5 72.5 45.2 47.5
##  $ student.genders        : Factor w/ 2 levels "Female","Male": 2 2 1 1
##  $ student.physics.marks  : int  70 75 80 85
##  $ student.chemistry.marks: int  60 70 85 70

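To avoid implicit conversion of strings to factors we may include the extra argument stringsAsFactors = FALSE (in R 4.0.0 and later this is already the default, so the str() output above would show chr instead of Factor for student.names).

#Use stringsAsFactors = FALSE to avoid conversion of character vectors to factors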
students <- data.frame(student.names,student.weights,student.genders,
                       student.physics.marks, student.chemistry.marks, 
                       stringsAsFactors = FALSE)
str(students)
## 'data.frame':    4 obs. of  5 variables:
##  $ student.names          : chr  "Raj" "Rahul" "Priya" "Poonam"
##  $ student.weights        : num  60.5 72.5 45.2 47.5
##  $ student.genders        : Factor w/ 2 levels "Female","Male": 2 2 1 1
##  $ student.physics.marks  : int  70 75 80 85
##  $ student.chemistry.marks: int  60 70 85 70

Common operations on Data Frames

Subsetting values in data frames and use of [], [[]]

students[1] #Single brackets [] return a data frame (the same type as the original object)
##   student.names
## 1           Raj
## 2         Rahul
## 3         Priya
## 4        Poonam
typeof(students[1])  
## [1] "list"
students[[1]] #double brackets [[]] return the object in its own type
## [1] "Raj"    "Rahul"  "Priya"  "Poonam"
typeof(students[[1]])
## [1] "character"
students[["student.names"]] #double brackets [[]] return the object in its own type
## [1] "Raj"    "Rahul"  "Priya"  "Poonam"
typeof(students[["student.names"]])
## [1] "character"
students$student.names #$ return the object in its own type
## [1] "Raj"    "Rahul"  "Priya"  "Poonam"
typeof(students$student.names)
## [1] "character"
students[1:3]
##   student.names student.weights student.genders
## 1           Raj            60.5            Male
## 2         Rahul            72.5            Male
## 3         Priya            45.2          Female
## 4        Poonam            47.5          Female
students[c("student.physics.marks","student.chemistry.marks")]
##   student.physics.marks student.chemistry.marks
## 1                    70                      60
## 2                    75                      70
## 3                    80                      85
## 4                    85                      70
students
##   student.names student.weights student.genders student.physics.marks
## 1           Raj            60.5            Male                    70
## 2         Rahul            72.5            Male                    75
## 3         Priya            45.2          Female                    80
## 4        Poonam            47.5          Female                    85
##   student.chemistry.marks
## 1                      60
## 2                      70
## 3                      85
## 4                      70
students[1,2] #Row number, Column number
## [1] 60.5
students[1:3,1:2]
##   student.names student.weights
## 1           Raj            60.5
## 2         Rahul            72.5
## 3         Priya            45.2
students[c(1,2),c(1,3)]
##   student.names student.genders
## 1           Raj            Male
## 2         Rahul            Male
students[,1]
## [1] "Raj"    "Rahul"  "Priya"  "Poonam"
students[1,]
##   student.names student.weights student.genders student.physics.marks
## 1           Raj            60.5            Male                    70
##   student.chemistry.marks
## 1                      60
students[c(T,F,T,F),]
##   student.names student.weights student.genders student.physics.marks
## 1           Raj            60.5            Male                    70
## 3         Priya            45.2          Female                    80
##   student.chemistry.marks
## 1                      60
## 3                      85
students[student.genders == "Male",]
##   student.names student.weights student.genders student.physics.marks
## 1           Raj            60.5            Male                    70
## 2         Rahul            72.5            Male                    75
##   student.chemistry.marks
## 1                      60
## 2                      70
students[student.physics.marks >= 75,]
##   student.names student.weights student.genders student.physics.marks
## 2         Rahul            72.5            Male                    75
## 3         Priya            45.2          Female                    80
## 4        Poonam            47.5          Female                    85
##   student.chemistry.marks
## 2                      70
## 3                      85
## 4                      70

More operations on Data Frames

# Create data frame
# A dataset is ~ table (list of vectors)
id <- c(1,2,3)
name <- c("John", "Kirk", "AJ")
age <- c(21,27,18)
employees <- data.frame(ID=id, Name=name, Age=age)
employees
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
## 3  3   AJ  18
city <- c("New York","Chicago","London")
address <- data.frame(ID=id, City=city)
address
##   ID     City
## 1  1 New York
## 2  2  Chicago
## 3  3   London
more.id <- c(11,12,13)
more.name <- c("Kira", "Jen", "Liz")
more.age <- c(25,27,21)
more.employees <- data.frame(ID=more.id, Name=more.name, Age=more.age)
more.employees
##   ID Name Age
## 1 11 Kira  25
## 2 12  Jen  27
## 3 13  Liz  21
# ----------------------------------
# Inspect data frames
# check first few rows
head(employees)
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
## 3  3   AJ  18
# check some last rows
tail(employees)
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
## 3  3   AJ  18
# ----------------------------------
# Accessing elements of data frame
# data frames are addressed by row and columns in the matrix notation

# get a value from a cell (a particular row and a particular column)
employees[1,2] # first row, second column
## [1] John
## Levels: AJ John Kirk
employees[1,"Name"] # first row, column by name
## [1] John
## Levels: AJ John Kirk
employees[1,]$Name # first row, column by name
## [1] John
## Levels: AJ John Kirk
# get one row
employees[1,]
##   ID Name Age
## 1  1 John  21
# get one column
employees[,2]
## [1] John Kirk AJ  
## Levels: AJ John Kirk
employees[,"Name"]
## [1] John Kirk AJ  
## Levels: AJ John Kirk
employees$Name
## [1] John Kirk AJ  
## Levels: AJ John Kirk
# get multiple rows/columns (subset)
employees[1:2,] # returns 2 rows
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
employees[,1:2] # returns 2 columns
##   ID Name
## 1  1 John
## 2  2 Kirk
## 3  3   AJ
employees[,c(1, 2)] # returns 2 columns
##   ID Name
## 1  1 John
## 2  2 Kirk
## 3  3   AJ
employees[,c("ID", "Name")] # returns 2 columns
##   ID Name
## 1  1 John
## 2  2 Kirk
## 3  3   AJ
# get rows that pass a test
employees[employees$Age > 20, ]
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
# ----------------------------------
# Data Frame properties
# number of rows
nrow(employees)
## [1] 3
# number of columns
ncol(employees)
## [1] 3
# summary stats
summary(employees)
##        ID        Name        Age      
##  Min.   :1.0   AJ  :1   Min.   :18.0  
##  1st Qu.:1.5   John:1   1st Qu.:19.5  
##  Median :2.0   Kirk:1   Median :21.0  
##  Mean   :2.0            Mean   :22.0  
##  3rd Qu.:2.5            3rd Qu.:24.0  
##  Max.   :3.0            Max.   :27.0
# structure
str(employees)
## 'data.frame':    3 obs. of  3 variables:
##  $ ID  : num  1 2 3
##  $ Name: Factor w/ 3 levels "AJ","John","Kirk": 2 3 1
##  $ Age : num  21 27 18
# ----------------------------------
# Manipulate data frame
# set value
employees[3,"Age"] <- 29
# order
employees[order(employees$Age),]
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
## 3  3   AJ  29
# reverse order
employees[order(employees$Age, decreasing=T),]
##   ID Name Age
## 3  3   AJ  29
## 2  2 Kirk  27
## 1  1 John  21
# merging data frames
merge(employees, address, by="ID")
##   ID Name Age     City
## 1  1 John  21 New York
## 2  2 Kirk  27  Chicago
## 3  3   AJ  29   London
# add rows
all.employees <- rbind(employees, more.employees)
all.employees
##   ID Name Age
## 1  1 John  21
## 2  2 Kirk  27
## 3  3   AJ  29
## 4 11 Kira  25
## 5 12  Jen  27
## 6 13  Liz  21
# add columns
cbind(employees, city) # city is a character vector; it is added as a new column
##   ID Name Age     city
## 1  1 John  21 New York
## 2  2 Kirk  27  Chicago
## 3  3   AJ  29   London
# grouping
# aggregate is similar to group by in SQL. Here is the number of employees grouped by age
aggregate(all.employees[,2], list(Age=all.employees$Age), FUN=length)
##   Age x
## 1  21 2
## 2  25 1
## 3  27 2
## 4  29 1
# A column of a data frame is a vector, so all vector operations can be applied to it e.g. math/stats functions (a row, by contrast, is a one-row data frame)
mean(all.employees$Age)
## [1] 25
# ----------------------------------
# Test for data frame
is.data.frame(employees)
## [1] TRUE

Array

Arrays are another homogeneous data structure in R. They can have any number of dimensions (a matrix is the 2-dimensional special case) and can hold only numbers, only characters, only logicals, or only complex values.

Creating Arrays

class1.student.physics.marks <- c( 70L , 75L , 80L,  85L)
class1.student.chemistry.marks <- c(60L, 70L, 85L, 70L)
class1.student.marks <-cbind(class1.student.physics.marks 
                             , class1.student.chemistry.marks)
class1.student.marks
##      class1.student.physics.marks class1.student.chemistry.marks
## [1,]                           70                             60
## [2,]                           75                             70
## [3,]                           80                             85
## [4,]                           85                             70
class2.student.physics.marks <- c( 71L , 76L , 81L,  86L)
class2.student.chemistry.marks <- c(61L, 71L, 86L, 71L)
class2.student.marks <-cbind(class2.student.physics.marks 
                             , class2.student.chemistry.marks)
class2.student.marks
##      class2.student.physics.marks class2.student.chemistry.marks
## [1,]                           71                             61
## [2,]                           76                             71
## [3,]                           81                             86
## [4,]                           86                             71
student.marks <- array(c(class1.student.marks,class2.student.marks), dim=c(4,2,2))
student.marks
## , , 1
## 
##      [,1] [,2]
## [1,]   70   60
## [2,]   75   70
## [3,]   80   85
## [4,]   85   70
## 
## , , 2
## 
##      [,1] [,2]
## [1,]   71   61
## [2,]   76   71
## [3,]   81   86
## [4,]   86   71
student.marks[2,2,2] #Row number, column number, sheet number
## [1] 71
student.marks[1:3,2,]
##      [,1] [,2]
## [1,]   60   61
## [2,]   70   71
## [3,]   85   86

I hope the data structures described above and their usage in R are helpful for beginners.