Data Frame Operations in R

This assignment is to introduce Data Frame operations in R.

1. Creating a Data Frame

Method 1: Loading an existing built-in data set in R (eg:mtcars)

head(mtcars,5) #preview first 5 rows of mtcars
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
str(mtcars) #structure of dataset mtcars
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

Method 2: Loading data frame from existing data set file using read.table(), read.csv() or read.delim() functions.

Sales_record=read.csv("C:/Users/User/Documents/R/Transaction_Records.csv")
head(Sales_record,5) #preview first 5 rows of Sales_record dataset
##          Date Description  Deposits Withdrawls   Balance
## 1 20-Aug-2020        NEFT 23,237.00      00.00 37,243.31
## 2 20-Aug-2020        NEFT     00.00   3,724.33 33,518.98
## 3 20-Aug-2020  Commission    245.00      00.00 33,763.98
## 4 20-Aug-2020        NEFT 12,480.00      00.00 46,243.98
## 5 20-Aug-2020        RTGS     00.00  11,561.00 34,682.98
str(Sales_record) #Structure of data
## 'data.frame':    100 obs. of  5 variables:
##  $ Date       : chr  "20-Aug-2020" "20-Aug-2020" "20-Aug-2020" "20-Aug-2020" ...
##  $ Description: chr  "NEFT" "NEFT" "Commission" "NEFT" ...
##  $ Deposits   : chr  "23,237.00" "00.00" "245.00" "12,480.00" ...
##  $ Withdrawls : chr  "00.00" "3,724.33" "00.00" "00.00" ...
##  $ Balance    : chr  "37,243.31" "33,518.98" "33,763.98" "46,243.98" ...

Method 3: Creating new data frame by using data.frame() to coerce list of vectors of equal length

student_db = data.frame(int_vec = c(10,23,56,92,335),
                        char_vec = c('Esther','Daniel','Isabel','Adams','Clint'),
                        bool_vec = c(TRUE,FALSE,TRUE,TRUE,FALSE),
                        float_vec = c(93.4,28.36,37.86,14.2,45.85))
student_db
##   int_vec char_vec bool_vec float_vec
## 1      10   Esther     TRUE     93.40
## 2      23   Daniel    FALSE     28.36
## 3      56   Isabel     TRUE     37.86
## 4      92    Adams     TRUE     14.20
## 5     335    Clint    FALSE     45.85
class(student_db) # class function shows the type of data
## [1] "data.frame"

2. Understanding the Contents of a Data Frame

head(student_db) #preview of dataframe
##   int_vec char_vec bool_vec float_vec
## 1      10   Esther     TRUE     93.40
## 2      23   Daniel    FALSE     28.36
## 3      56   Isabel     TRUE     37.86
## 4      92    Adams     TRUE     14.20
## 5     335    Clint    FALSE     45.85
str(student_db) #structure of data
## 'data.frame':    5 obs. of  4 variables:
##  $ int_vec  : num  10 23 56 92 335
##  $ char_vec : chr  "Esther" "Daniel" "Isabel" "Adams" ...
##  $ bool_vec : logi  TRUE FALSE TRUE TRUE FALSE
##  $ float_vec: num  93.4 28.4 37.9 14.2 45.9
summary(student_db) #summary of dataframe
##     int_vec        char_vec          bool_vec         float_vec    
##  Min.   : 10.0   Length:5           Mode :logical   Min.   :14.20  
##  1st Qu.: 23.0   Class :character   FALSE:2         1st Qu.:28.36  
##  Median : 56.0   Mode  :character   TRUE :3         Median :37.86  
##  Mean   :103.2                                      Mean   :43.93  
##  3rd Qu.: 92.0                                      3rd Qu.:45.85  
##  Max.   :335.0                                      Max.   :93.40
dim(student_db) #dimensions of dataframe
## [1] 5 4
ncol(student_db) #no of columns of dataframe
## [1] 4
nrow(student_db) #no of rows of dataframe
## [1] 5
names(student_db) #names of columns
## [1] "int_vec"   "char_vec"  "bool_vec"  "float_vec"
row.names(student_db) #names of rows
## [1] "1" "2" "3" "4" "5"

3. Renaming Rows and Columns

Renaming row(s):

#renaming one row
row.names(student_db)[row.names(student_db)=='3'] <- 'Student3'; 
row.names(student_db) 
## [1] "1"        "2"        "Student3" "4"        "5"
#renaming multiple rows
row.names(student_db) <- c('S1','S2','S3','S4','S5');
student_db 
##    int_vec char_vec bool_vec float_vec
## S1      10   Esther     TRUE     93.40
## S2      23   Daniel    FALSE     28.36
## S3      56   Isabel     TRUE     37.86
## S4      92    Adams     TRUE     14.20
## S5     335    Clint    FALSE     45.85

Renaming Column:

#Renaming specific column
names(student_db)[names(student_db)=='char_vec'] <- 'Name';
names(student_db)
## [1] "int_vec"   "Name"      "bool_vec"  "float_vec"

4. Accessing/Extracting Data from Data Frame

Given data frame:

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     93.40
## S2      23 Daniel    FALSE     28.36
## S3      56 Isabel     TRUE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

a. Accessing an element in a column

Accessing like matrix (by index or name):

student_db[3,1] # Get data from 3rd row, 1st column
## [1] 56
student_db['S1','float_vec'] #Get data from "S1" row, "float_vec" column
## [1] 93.4
student_db[5,'Name'] #Get data from 5th row, "Name" column
## [1] "Clint"

Accessing like list:

#Example 1
student_db$Name[3] # Get 3rd record from "Name" column
## [1] "Isabel"
#Example 2
student_db[['bool_vec']][1] # Get 1st record from "bool_vec" column
## [1] TRUE
#Example 3
student_db[[1]][2] # Get 2nd record from 1st column
## [1] 23

b. Extracting Column (using indices or names)

In R, single column data.frames are often converted to vectors when manipulated. This happens when operators $, [[]] or [,] is used.

For example, the following manipulations return values as vector:

#Example 1
#Extract values of int_vec column
eg1<-student_db$int_vec; eg1; class(eg1) 
## [1]  10  23  56  92 335
## [1] "numeric"
#Example 2
#Extract values of 2nd column
eg2<-student_db[[2]]; eg2; class(eg2) 
## [1] "Esther" "Daniel" "Isabel" "Adams"  "Clint"
## [1] "character"
#Example 3
#Extract values of bool_vec column
eg3<-student_db[['bool_vec']]; eg3; class(eg3) 
## [1]  TRUE FALSE  TRUE  TRUE FALSE
## [1] "logical"
#Example 4
#Extract values of 4th column
eg4<-student_db[,4]; eg4; class(eg4)
## [1] 93.40 28.36 37.86 14.20 45.85
## [1] "numeric"

To avoid having vector as returned type, drop=FALSE can be used as the third argument in a [ , ] case to return a normal data frame.

eg4<-student_db[,4,drop=FALSE]; eg4; class(eg4)
##    float_vec
## S1     93.40
## S2     28.36
## S3     37.86
## S4     14.20
## S5     45.85
## [1] "data.frame"

Another way to extract a column as a data frame is by using []:

#Example 1
eg1<-student_db[3]; eg1; class(eg1) #Extract 3rd column
##    bool_vec
## S1     TRUE
## S2    FALSE
## S3     TRUE
## S4     TRUE
## S5    FALSE
## [1] "data.frame"
#Example 2
eg2<-student_db['float_vec']; eg2; class(eg2) #Extract float_vec column
##    float_vec
## S1     93.40
## S2     28.36
## S3     37.86
## S4     14.20
## S5     45.85
## [1] "data.frame"

c. Extracting Specific Row(s)

#Example 1
student_db[1,] # Extract 1st column
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE      93.4
#Example 2
student_db[1:3,] #Extract 1st to 3rd row
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     93.40
## S2      23 Daniel    FALSE     28.36
## S3      56 Isabel     TRUE     37.86
#Example 3
student_db[c('S2','S4'),] #Extract "S2" and "S4" row
##    int_vec   Name bool_vec float_vec
## S2      23 Daniel    FALSE     28.36
## S4      92  Adams     TRUE     14.20

d. Extracting Row(s) and Column(s)

#Example 1
student_db[c(1,4),c(1:3)] #Extract 1st and 4th row, of 1st to 3rd column
##    int_vec   Name bool_vec
## S1      10 Esther     TRUE
## S4      92  Adams     TRUE
#Example 2
student_db['S3',c(1,2,4)] #Extract "S3" row, of 1st, 2nd and 4th column
##    int_vec   Name float_vec
## S3      56 Isabel     37.86

5. Modifying the Data Frame

Given data frame:

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     93.40
## S2      23 Daniel    FALSE     28.36
## S3      56 Isabel     TRUE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

a. Reassigning an element in a column using coordinates or names

#Example 1
student_db[1,4]=50.2; student_db #Reassign value of 1st row, 4th column
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      56 Isabel     TRUE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85
#Example 2
student_db[3,'bool_vec']=FALSE; student_db #Reassign value of 3rd row, bool_vec column
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      56 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85
#Example 3
student_db$int_vec[3]=30; student_db #Reassign 3rd record of int_vec column
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

b. Adding Rows

New Entry details:

new_entry <- c(82,'Cardiff',FALSE,90.47);new_entry #new data
## [1] "82"      "Cardiff" "FALSE"   "90.47"

Using rbind():

AddRow1 <- rbind(student_db,new_entry) ;AddRow1
##    int_vec    Name bool_vec float_vec
## S1      10  Esther     TRUE      50.2
## S2      23  Daniel    FALSE     28.36
## S3      30  Isabel    FALSE     37.86
## S4      92   Adams     TRUE      14.2
## S5     335   Clint    FALSE     45.85
## 6       82 Cardiff    FALSE     90.47

Using nrow to assign values to new row:

AddRow2=student_db #Duplicate content of student_db dataframe to AddRow2
AddRow2[nrow(AddRow2)+1,] <- new_entry ;AddRow2 #Add new row into dataframe
##    int_vec    Name bool_vec float_vec
## S1      10  Esther     TRUE      50.2
## S2      23  Daniel    FALSE     28.36
## S3      30  Isabel    FALSE     37.86
## S4      92   Adams     TRUE      14.2
## S5     335   Clint    FALSE     45.85
## 6       82 Cardiff    FALSE     90.47

c. Adding Column

Adding column using $ Operator

mod_df = student_db
mod_df$test1 <- 1:5; mod_df #add new column named test1
##    int_vec   Name bool_vec float_vec test1
## S1      10 Esther     TRUE     50.20     1
## S2      23 Daniel    FALSE     28.36     2
## S3      30 Isabel    FALSE     37.86     3
## S4      92  Adams     TRUE     14.20     4
## S5     335  Clint    FALSE     45.85     5

Adding column using Square brackets [] or [[]]:

mod_df['test2'] <- 6:10; mod_df #add new column named test2
##    int_vec   Name bool_vec float_vec test1 test2
## S1      10 Esther     TRUE     50.20     1     6
## S2      23 Daniel    FALSE     28.36     2     7
## S3      30 Isabel    FALSE     37.86     3     8
## S4      92  Adams     TRUE     14.20     4     9
## S5     335  Clint    FALSE     45.85     5    10

Adding column using cbind function:

mod_df <- cbind(mod_df,test3=11:15); mod_df #add new column "test3"
##    int_vec   Name bool_vec float_vec test1 test2 test3
## S1      10 Esther     TRUE     50.20     1     6    11
## S2      23 Daniel    FALSE     28.36     2     7    12
## S3      30 Isabel    FALSE     37.86     3     8    13
## S4      92  Adams     TRUE     14.20     4     9    14
## S5     335  Clint    FALSE     45.85     5    10    15

Combining data of existing columns to create new column of data

#Add new column "Sum", which is the total of test1, test2, and test3
mod_df <- cbind(mod_df, Sum = mod_df$test1 + mod_df$test2 + mod_df$test3); mod_df
##    int_vec   Name bool_vec float_vec test1 test2 test3 Sum
## S1      10 Esther     TRUE     50.20     1     6    11  18
## S2      23 Daniel    FALSE     28.36     2     7    12  21
## S3      30 Isabel    FALSE     37.86     3     8    13  24
## S4      92  Adams     TRUE     14.20     4     9    14  27
## S5     335  Clint    FALSE     45.85     5    10    15  30

d. Removing Column

#Example 1
mod_df['Sum'] <- NULL; mod_df #Remove "Sum" column
##    int_vec   Name bool_vec float_vec test1 test2 test3
## S1      10 Esther     TRUE     50.20     1     6    11
## S2      23 Daniel    FALSE     28.36     2     7    12
## S3      30 Isabel    FALSE     37.86     3     8    13
## S4      92  Adams     TRUE     14.20     4     9    14
## S5     335  Clint    FALSE     45.85     5    10    15
#Example 2
mod_df$test1 <- NULL; mod_df #Remove "test1" column
##    int_vec   Name bool_vec float_vec test2 test3
## S1      10 Esther     TRUE     50.20     6    11
## S2      23 Daniel    FALSE     28.36     7    12
## S3      30 Isabel    FALSE     37.86     8    13
## S4      92  Adams     TRUE     14.20     9    14
## S5     335  Clint    FALSE     45.85    10    15
#Example 3
mod_df[4] <- NULL; mod_df #Remove 4th column
##    int_vec   Name bool_vec test2 test3
## S1      10 Esther     TRUE     6    11
## S2      23 Daniel    FALSE     7    12
## S3      30 Isabel    FALSE     8    13
## S4      92  Adams     TRUE     9    14
## S5     335  Clint    FALSE    10    15
#Example 4
mod_df <- mod_df[-3];mod_df #Remove 3rd column
##    int_vec   Name test2 test3
## S1      10 Esther     6    11
## S2      23 Daniel     7    12
## S3      30 Isabel     8    13
## S4      92  Adams     9    14
## S5     335  Clint    10    15

e. Removing multiple columns

mod_df <- mod_df[-c(1,3)];mod_df #Remove 1st and 3rd column
##      Name test3
## S1 Esther    11
## S2 Daniel    12
## S3 Isabel    13
## S4  Adams    14
## S5  Clint    15

f. Removing row

Original dataframe

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

Removing row by index:

mod_df <- student_db[-c(2,4),];mod_df #remove 2nd and 4th row
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S3      30 Isabel    FALSE     37.86
## S5     335  Clint    FALSE     45.85

Removing row by name:

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85
mod_df <- student_db['S1',];mod_df #Remove "S1" column
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE      50.2

g. Reordering columns

Before Reordering:

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

After Reordering:

# Reorder columns according to specific sequence
Reorder_df <- student_db[c(2,1,4,3)];Reorder_df 
##      Name int_vec float_vec bool_vec
## S1 Esther      10     50.20     TRUE
## S2 Daniel      23     28.36    FALSE
## S3 Isabel      30     37.86    FALSE
## S4  Adams      92     14.20     TRUE
## S5  Clint     335     45.85    FALSE

h. Reordering rows

Before Reordering:

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

After Reordering:

# Reorder rows according to values of a column in ascending order
Reorder_df <- student_db[order(student_db$float_vec),];Reorder_df
##    int_vec   Name bool_vec float_vec
## S4      92  Adams     TRUE     14.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S5     335  Clint    FALSE     45.85
## S1      10 Esther     TRUE     50.20

6. Creating a new dataframe from an existing dataframe

Examples shown in Sections 4 and 5 for data extraction (rows and columns) and modification of data frame of an existing data frame can be used to create a new data frame.

Alternatively, the following functions can be used in creating a new data frame from an existing data frame.

Existing data frame:

student_db
##    int_vec   Name bool_vec float_vec
## S1      10 Esther     TRUE     50.20
## S2      23 Daniel    FALSE     28.36
## S3      30 Isabel    FALSE     37.86
## S4      92  Adams     TRUE     14.20
## S5     335  Clint    FALSE     45.85

a. Using data.frame command

#Create a new data frame using the Name, float_vec, and bool_vec columns of student_db data frame
new_df <- data.frame(student_db$Name, student_db$float_vec, student_db$bool_vec); new_df
##   student_db.Name student_db.float_vec student_db.bool_vec
## 1          Esther                50.20                TRUE
## 2          Daniel                28.36               FALSE
## 3          Isabel                37.86               FALSE
## 4           Adams                14.20                TRUE
## 5           Clint                45.85               FALSE
names(new_df) <- c('Names','Marks','Pass');new_df #Reassign column names
##    Names Marks  Pass
## 1 Esther 50.20  TRUE
## 2 Daniel 28.36 FALSE
## 3 Isabel 37.86 FALSE
## 4  Adams 14.20  TRUE
## 5  Clint 45.85 FALSE
row.names(new_df) <- row.names(student_db); new_df; class(new_df) #copy row names
##     Names Marks  Pass
## S1 Esther 50.20  TRUE
## S2 Daniel 28.36 FALSE
## S3 Isabel 37.86 FALSE
## S4  Adams 14.20  TRUE
## S5  Clint 45.85 FALSE
## [1] "data.frame"

b. Using subset command

#use subset function to select columns by name to create new data frame
new_df2 <- subset(student_db,select=c(int_vec,bool_vec)); new_df2; class(new_df2)
##    int_vec bool_vec
## S1      10     TRUE
## S2      23    FALSE
## S3      30    FALSE
## S4      92     TRUE
## S5     335    FALSE
## [1] "data.frame"
#use subset function to select columns by index to create new data frame
new_df3 <- subset(student_db,select=c(2,4)); new_df3; class(new_df3)
##      Name float_vec
## S1 Esther     50.20
## S2 Daniel     28.36
## S3 Isabel     37.86
## S4  Adams     14.20
## S5  Clint     45.85
## [1] "data.frame"