This assignment is to introduce Data Frame operations in R.
Method 1: Loading an existing built-in data set in R (eg:mtcars)
head(mtcars,5) #preview first 5 rows of mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
str(mtcars) #structure of dataset mtcars
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
Method 2: Loading data frame from existing data set file using read.table(), read.csv() or read.delim() functions.
Sales_record=read.csv("C:/Users/User/Documents/R/Transaction_Records.csv")
head(Sales_record,5) #preview first 5 rows of Sales_record dataset
## Date Description Deposits Withdrawls Balance
## 1 20-Aug-2020 NEFT 23,237.00 00.00 37,243.31
## 2 20-Aug-2020 NEFT 00.00 3,724.33 33,518.98
## 3 20-Aug-2020 Commission 245.00 00.00 33,763.98
## 4 20-Aug-2020 NEFT 12,480.00 00.00 46,243.98
## 5 20-Aug-2020 RTGS 00.00 11,561.00 34,682.98
str(Sales_record) #Structure of data
## 'data.frame': 100 obs. of 5 variables:
## $ Date : chr "20-Aug-2020" "20-Aug-2020" "20-Aug-2020" "20-Aug-2020" ...
## $ Description: chr "NEFT" "NEFT" "Commission" "NEFT" ...
## $ Deposits : chr "23,237.00" "00.00" "245.00" "12,480.00" ...
## $ Withdrawls : chr "00.00" "3,724.33" "00.00" "00.00" ...
## $ Balance : chr "37,243.31" "33,518.98" "33,763.98" "46,243.98" ...
Method 3: Creating new data frame by using data.frame() to coerce list of vectors of equal length
student_db = data.frame(int_vec = c(10,23,56,92,335),
char_vec = c('Esther','Daniel','Isabel','Adams','Clint'),
bool_vec = c(TRUE,FALSE,TRUE,TRUE,FALSE),
float_vec = c(93.4,28.36,37.86,14.2,45.85))
student_db
## int_vec char_vec bool_vec float_vec
## 1 10 Esther TRUE 93.40
## 2 23 Daniel FALSE 28.36
## 3 56 Isabel TRUE 37.86
## 4 92 Adams TRUE 14.20
## 5 335 Clint FALSE 45.85
class(student_db) # class function shows the type of data
## [1] "data.frame"
head(student_db) #preview of dataframe
## int_vec char_vec bool_vec float_vec
## 1 10 Esther TRUE 93.40
## 2 23 Daniel FALSE 28.36
## 3 56 Isabel TRUE 37.86
## 4 92 Adams TRUE 14.20
## 5 335 Clint FALSE 45.85
str(student_db) #structure of data
## 'data.frame': 5 obs. of 4 variables:
## $ int_vec : num 10 23 56 92 335
## $ char_vec : chr "Esther" "Daniel" "Isabel" "Adams" ...
## $ bool_vec : logi TRUE FALSE TRUE TRUE FALSE
## $ float_vec: num 93.4 28.4 37.9 14.2 45.9
summary(student_db) #summary of dataframe
## int_vec char_vec bool_vec float_vec
## Min. : 10.0 Length:5 Mode :logical Min. :14.20
## 1st Qu.: 23.0 Class :character FALSE:2 1st Qu.:28.36
## Median : 56.0 Mode :character TRUE :3 Median :37.86
## Mean :103.2 Mean :43.93
## 3rd Qu.: 92.0 3rd Qu.:45.85
## Max. :335.0 Max. :93.40
dim(student_db) #dimensions of dataframe
## [1] 5 4
ncol(student_db) #no of columns of dataframe
## [1] 4
nrow(student_db) #no of rows of dataframe
## [1] 5
names(student_db) #names of columns
## [1] "int_vec" "char_vec" "bool_vec" "float_vec"
row.names(student_db) #names of rows
## [1] "1" "2" "3" "4" "5"
Renaming row(s):
#renaming one row
row.names(student_db)[row.names(student_db)=='3'] <- 'Student3';
row.names(student_db)
## [1] "1" "2" "Student3" "4" "5"
#renaming multiple rows
row.names(student_db) <- c('S1','S2','S3','S4','S5');
student_db
## int_vec char_vec bool_vec float_vec
## S1 10 Esther TRUE 93.40
## S2 23 Daniel FALSE 28.36
## S3 56 Isabel TRUE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
Renaming Column:
#Renaming specific column
names(student_db)[names(student_db)=='char_vec'] <- 'Name';
names(student_db)
## [1] "int_vec" "Name" "bool_vec" "float_vec"
Given data frame:
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 93.40
## S2 23 Daniel FALSE 28.36
## S3 56 Isabel TRUE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
a. Accessing an element in a column
Accessing like matrix (by index or name):
student_db[3,1] # Get data from 3rd row, 1st column
## [1] 56
student_db['S1','float_vec'] #Get data from "S1" row, "float_vec" column
## [1] 93.4
student_db[5,'Name'] #Get data from 5th row, "Name" column
## [1] "Clint"
Accessing like list:
#Example 1
student_db$Name[3] # Get 3rd record from "Name" column
## [1] "Isabel"
#Example 2
student_db[['bool_vec']][1] # Get 1st record from "bool_vec" column
## [1] TRUE
#Example 3
student_db[[1]][2] # Get 2nd record from 1st column
## [1] 23
b. Extracting Column (using indices or names)
In R, single column data.frames are often converted to vectors when manipulated. This happens when operators $, [[]] or [,] is used.
For example, the following manipulations return values as vector:
#Example 1
#Extract values of int_vec column
eg1<-student_db$int_vec; eg1; class(eg1)
## [1] 10 23 56 92 335
## [1] "numeric"
#Example 2
#Extract values of 2nd column
eg2<-student_db[[2]]; eg2; class(eg2)
## [1] "Esther" "Daniel" "Isabel" "Adams" "Clint"
## [1] "character"
#Example 3
#Extract values of bool_vec column
eg3<-student_db[['bool_vec']]; eg3; class(eg3)
## [1] TRUE FALSE TRUE TRUE FALSE
## [1] "logical"
#Example 4
#Extract values of 4th column
eg4<-student_db[,4]; eg4; class(eg4)
## [1] 93.40 28.36 37.86 14.20 45.85
## [1] "numeric"
To avoid having vector as returned type, drop=FALSE can be used as the third argument in a [ , ] case to return a normal data frame.
eg4<-student_db[,4,drop=FALSE]; eg4; class(eg4)
## float_vec
## S1 93.40
## S2 28.36
## S3 37.86
## S4 14.20
## S5 45.85
## [1] "data.frame"
Another way to extract a column as a data frame is by using []:
#Example 1
eg1<-student_db[3]; eg1; class(eg1) #Extract 3rd column
## bool_vec
## S1 TRUE
## S2 FALSE
## S3 TRUE
## S4 TRUE
## S5 FALSE
## [1] "data.frame"
#Example 2
eg2<-student_db['float_vec']; eg2; class(eg2) #Extract float_vec column
## float_vec
## S1 93.40
## S2 28.36
## S3 37.86
## S4 14.20
## S5 45.85
## [1] "data.frame"
c. Extracting Specific Row(s)
#Example 1
student_db[1,] # Extract 1st column
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 93.4
#Example 2
student_db[1:3,] #Extract 1st to 3rd row
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 93.40
## S2 23 Daniel FALSE 28.36
## S3 56 Isabel TRUE 37.86
#Example 3
student_db[c('S2','S4'),] #Extract "S2" and "S4" row
## int_vec Name bool_vec float_vec
## S2 23 Daniel FALSE 28.36
## S4 92 Adams TRUE 14.20
d. Extracting Row(s) and Column(s)
#Example 1
student_db[c(1,4),c(1:3)] #Extract 1st and 4th row, of 1st to 3rd column
## int_vec Name bool_vec
## S1 10 Esther TRUE
## S4 92 Adams TRUE
#Example 2
student_db['S3',c(1,2,4)] #Extract "S3" row, of 1st, 2nd and 4th column
## int_vec Name float_vec
## S3 56 Isabel 37.86
Given data frame:
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 93.40
## S2 23 Daniel FALSE 28.36
## S3 56 Isabel TRUE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
a. Reassigning an element in a column using coordinates or names
#Example 1
student_db[1,4]=50.2; student_db #Reassign value of 1st row, 4th column
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 56 Isabel TRUE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
#Example 2
student_db[3,'bool_vec']=FALSE; student_db #Reassign value of 3rd row, bool_vec column
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 56 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
#Example 3
student_db$int_vec[3]=30; student_db #Reassign 3rd record of int_vec column
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
b. Adding Rows
New Entry details:
new_entry <- c(82,'Cardiff',FALSE,90.47);new_entry #new data
## [1] "82" "Cardiff" "FALSE" "90.47"
Using rbind():
AddRow1 <- rbind(student_db,new_entry) ;AddRow1
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.2
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.2
## S5 335 Clint FALSE 45.85
## 6 82 Cardiff FALSE 90.47
Using nrow to assign values to new row:
AddRow2=student_db #Duplicate content of student_db dataframe to AddRow2
AddRow2[nrow(AddRow2)+1,] <- new_entry ;AddRow2 #Add new row into dataframe
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.2
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.2
## S5 335 Clint FALSE 45.85
## 6 82 Cardiff FALSE 90.47
c. Adding Column
Adding column using $ Operator
mod_df = student_db
mod_df$test1 <- 1:5; mod_df #add new column named test1
## int_vec Name bool_vec float_vec test1
## S1 10 Esther TRUE 50.20 1
## S2 23 Daniel FALSE 28.36 2
## S3 30 Isabel FALSE 37.86 3
## S4 92 Adams TRUE 14.20 4
## S5 335 Clint FALSE 45.85 5
Adding column using Square brackets [] or [[]]:
mod_df['test2'] <- 6:10; mod_df #add new column named test2
## int_vec Name bool_vec float_vec test1 test2
## S1 10 Esther TRUE 50.20 1 6
## S2 23 Daniel FALSE 28.36 2 7
## S3 30 Isabel FALSE 37.86 3 8
## S4 92 Adams TRUE 14.20 4 9
## S5 335 Clint FALSE 45.85 5 10
Adding column using cbind function:
mod_df <- cbind(mod_df,test3=11:15); mod_df #add new column "test3"
## int_vec Name bool_vec float_vec test1 test2 test3
## S1 10 Esther TRUE 50.20 1 6 11
## S2 23 Daniel FALSE 28.36 2 7 12
## S3 30 Isabel FALSE 37.86 3 8 13
## S4 92 Adams TRUE 14.20 4 9 14
## S5 335 Clint FALSE 45.85 5 10 15
Combining data of existing columns to create new column of data
#Add new column "Sum", which is the total of test1, test2, and test3
mod_df <- cbind(mod_df, Sum = mod_df$test1 + mod_df$test2 + mod_df$test3); mod_df
## int_vec Name bool_vec float_vec test1 test2 test3 Sum
## S1 10 Esther TRUE 50.20 1 6 11 18
## S2 23 Daniel FALSE 28.36 2 7 12 21
## S3 30 Isabel FALSE 37.86 3 8 13 24
## S4 92 Adams TRUE 14.20 4 9 14 27
## S5 335 Clint FALSE 45.85 5 10 15 30
d. Removing Column
#Example 1
mod_df['Sum'] <- NULL; mod_df #Remove "Sum" column
## int_vec Name bool_vec float_vec test1 test2 test3
## S1 10 Esther TRUE 50.20 1 6 11
## S2 23 Daniel FALSE 28.36 2 7 12
## S3 30 Isabel FALSE 37.86 3 8 13
## S4 92 Adams TRUE 14.20 4 9 14
## S5 335 Clint FALSE 45.85 5 10 15
#Example 2
mod_df$test1 <- NULL; mod_df #Remove "test1" column
## int_vec Name bool_vec float_vec test2 test3
## S1 10 Esther TRUE 50.20 6 11
## S2 23 Daniel FALSE 28.36 7 12
## S3 30 Isabel FALSE 37.86 8 13
## S4 92 Adams TRUE 14.20 9 14
## S5 335 Clint FALSE 45.85 10 15
#Example 3
mod_df[4] <- NULL; mod_df #Remove 4th column
## int_vec Name bool_vec test2 test3
## S1 10 Esther TRUE 6 11
## S2 23 Daniel FALSE 7 12
## S3 30 Isabel FALSE 8 13
## S4 92 Adams TRUE 9 14
## S5 335 Clint FALSE 10 15
#Example 4
mod_df <- mod_df[-3];mod_df #Remove 3rd column
## int_vec Name test2 test3
## S1 10 Esther 6 11
## S2 23 Daniel 7 12
## S3 30 Isabel 8 13
## S4 92 Adams 9 14
## S5 335 Clint 10 15
e. Removing multiple columns
mod_df <- mod_df[-c(1,3)];mod_df #Remove 1st and 3rd column
## Name test3
## S1 Esther 11
## S2 Daniel 12
## S3 Isabel 13
## S4 Adams 14
## S5 Clint 15
f. Removing row
Original dataframe
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
Removing row by index:
mod_df <- student_db[-c(2,4),];mod_df #remove 2nd and 4th row
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S3 30 Isabel FALSE 37.86
## S5 335 Clint FALSE 45.85
Removing row by name:
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
mod_df <- student_db['S1',];mod_df #Remove "S1" column
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.2
g. Reordering columns
Before Reordering:
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
After Reordering:
# Reorder columns according to specific sequence
Reorder_df <- student_db[c(2,1,4,3)];Reorder_df
## Name int_vec float_vec bool_vec
## S1 Esther 10 50.20 TRUE
## S2 Daniel 23 28.36 FALSE
## S3 Isabel 30 37.86 FALSE
## S4 Adams 92 14.20 TRUE
## S5 Clint 335 45.85 FALSE
h. Reordering rows
Before Reordering:
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
After Reordering:
# Reorder rows according to values of a column in ascending order
Reorder_df <- student_db[order(student_db$float_vec),];Reorder_df
## int_vec Name bool_vec float_vec
## S4 92 Adams TRUE 14.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S5 335 Clint FALSE 45.85
## S1 10 Esther TRUE 50.20
Examples shown in Sections 4 and 5 for data extraction (rows and columns) and modification of data frame of an existing data frame can be used to create a new data frame.
Alternatively, the following functions can be used in creating a new data frame from an existing data frame.
Existing data frame:
student_db
## int_vec Name bool_vec float_vec
## S1 10 Esther TRUE 50.20
## S2 23 Daniel FALSE 28.36
## S3 30 Isabel FALSE 37.86
## S4 92 Adams TRUE 14.20
## S5 335 Clint FALSE 45.85
a. Using data.frame command
#Create a new data frame using the Name, float_vec, and bool_vec columns of student_db data frame
new_df <- data.frame(student_db$Name, student_db$float_vec, student_db$bool_vec); new_df
## student_db.Name student_db.float_vec student_db.bool_vec
## 1 Esther 50.20 TRUE
## 2 Daniel 28.36 FALSE
## 3 Isabel 37.86 FALSE
## 4 Adams 14.20 TRUE
## 5 Clint 45.85 FALSE
names(new_df) <- c('Names','Marks','Pass');new_df #Reassign column names
## Names Marks Pass
## 1 Esther 50.20 TRUE
## 2 Daniel 28.36 FALSE
## 3 Isabel 37.86 FALSE
## 4 Adams 14.20 TRUE
## 5 Clint 45.85 FALSE
row.names(new_df) <- row.names(student_db); new_df; class(new_df) #copy row names
## Names Marks Pass
## S1 Esther 50.20 TRUE
## S2 Daniel 28.36 FALSE
## S3 Isabel 37.86 FALSE
## S4 Adams 14.20 TRUE
## S5 Clint 45.85 FALSE
## [1] "data.frame"
b. Using subset command
#use subset function to select columns by name to create new data frame
new_df2 <- subset(student_db,select=c(int_vec,bool_vec)); new_df2; class(new_df2)
## int_vec bool_vec
## S1 10 TRUE
## S2 23 FALSE
## S3 30 FALSE
## S4 92 TRUE
## S5 335 FALSE
## [1] "data.frame"
#use subset function to select columns by index to create new data frame
new_df3 <- subset(student_db,select=c(2,4)); new_df3; class(new_df3)
## Name float_vec
## S1 Esther 50.20
## S2 Daniel 28.36
## S3 Isabel 37.86
## S4 Adams 14.20
## S5 Clint 45.85
## [1] "data.frame"