Introduction: This assignment explores various ways we can modify a data frame.
Creating a data frame from vectors
# Build each column as its own vector, then bind them into a data frame.
name <- c("Anne", "Pete", "Frank", "Julia", "Cath")
age <- c(28, 30, 21, 39, 40)
income <- c(3000, 5000, 3000, 10000, 12000)
debt_payment <- c(510.5, 1230.4, 210.80, 5088.30, 4500)

# Column names are spelled out explicitly; they match the vector names.
df <- data.frame(
  name = name,
  age = age,
  income = income,
  debt_payment = debt_payment
)
df
## name age income debt_payment
## 1 Anne 28 3000 510.5
## 2 Pete 30 5000 1230.4
## 3 Frank 21 3000 210.8
## 4 Julia 39 10000 5088.3
## 5 Cath 40 12000 4500.0
Notice that str() only abbreviates debt_payment for display (by default it shows numbers with 3 significant digits); it does not round the data, and the data frame still retains the original values.
# Compact structure overview: one line per column with its type and first values.
str(df)
## 'data.frame': 5 obs. of 4 variables:
## $ name : chr "Anne" "Pete" "Frank" "Julia" ...
## $ age : num 28 30 21 39 40
## $ income : num 3000 5000 3000 10000 12000
## $ debt_payment: num 510 1230 211 5088 4500
Basic functions for inspecting a data frame
# Column names of the data frame.
names(df)
## [1] "name" "age" "income" "debt_payment"
# Number of rows (observations).
nrow(df)
## [1] 5
# Number of columns (variables).
ncol(df)
## [1] 4
The dim() function returns nrow and ncol at the same time, while length() returns the same value as ncol(), because a data frame is a list of columns.
# Both dimensions at once: rows first, then columns.
dim(df)
## [1] 5 4
# length() of a data frame counts its columns.
length(df)
## [1] 4
Naming the rows and columns of a data frame
# Default row names are the row numbers stored as character strings.
row.names(df)
## [1] "1" "2" "3" "4" "5"
# Replace them with one custom label per row.
row.names(df) <- c("ind_1", "ind_2" ,"ind_3","ind_4","ind_5")
df
## name age income debt_payment
## ind_1 Anne 28 3000 510.5
## ind_2 Pete 30 5000 1230.4
## ind_3 Frank 21 3000 210.8
## ind_4 Julia 39 10000 5088.3
## ind_5 Cath 40 12000 4500.0
# Rename a single column by locating it through its current name.
names(df)[names(df)=="debt_payment"] <- "installment"
df
## name age income installment
## ind_1 Anne 28 3000 510.5
## ind_2 Pete 30 5000 1230.4
## ind_3 Frank 21 3000 210.8
## ind_4 Julia 39 10000 5088.3
## ind_5 Cath 40 12000 4500.0
# Rename every column at once by supplying a full vector of names.
names(df) <- c("Name","Age","Income","Installment")
df
## Name Age Income Installment
## ind_1 Anne 28 3000 510.5
## ind_2 Pete 30 5000 1230.4
## ind_3 Frank 21 3000 210.8
## ind_4 Julia 39 10000 5088.3
## ind_5 Cath 40 12000 4500.0
If the number of names supplied does not match the number of columns, the remaining names will be NA.
# Supply a single name for four columns to show how the rest are filled with NA.
names(df) <- c("Name")
df
## Name NA NA NA
## ind_1 Anne 28 3000 510.5
## ind_2 Pete 30 5000 1230.4
## ind_3 Frank 21 3000 210.8
## ind_4 Julia 39 10000 5088.3
## ind_5 Cath 40 12000 4500.0
Different ways of accessing a data frame
# Restore the full set of column names before accessing columns by name.
names(df) <- c("Name","Age","Income","Installment")
# Single brackets with a column name keep the data frame structure.
df["Income"]
## Income
## ind_1 3000
## ind_2 5000
## ind_3 3000
## ind_4 10000
## ind_5 12000
# $, [[name]] and [[index]] extract the column itself as a vector.
df$Income
## [1] 3000 5000 3000 10000 12000
df[["Income"]]
## [1] 3000 5000 3000 10000 12000
df[[3]]
## [1] 3000 5000 3000 10000 12000
# With [row, column] indexing, drop decides whether a single column
# collapses to a vector (TRUE) or stays a data frame (FALSE).
df[,3, drop = TRUE]
## [1] 3000 5000 3000 10000 12000
df[,3, drop = FALSE]
## Income
## ind_1 3000
## ind_2 5000
## ind_3 3000
## ind_4 10000
## ind_5 12000
# class() of the result of each access form.
class(df["Income"])
## [1] "data.frame"
class(df$Income)
## [1] "numeric"
class(df[["Income"]])
## [1] "numeric"
class(df[[3]])
## [1] "numeric"
class(df[,3, drop = TRUE])
## [1] "numeric"
class(df[,3, drop = FALSE])
## [1] "data.frame"
# typeof() of the result of each access form: data frames report "list",
# extracted numeric columns report "double".
typeof(df["Income"])
## [1] "list"
typeof(df$Income)
## [1] "double"
typeof(df[["Income"]])
## [1] "double"
typeof(df[[3]])
## [1] "double"
typeof(df[,3, drop = TRUE])
## [1] "double"
typeof(df[,3, drop = FALSE])
## [1] "list"
Notice that only df["column"] and df[, column, drop = FALSE] return a data.frame, while the rest return vectors.
Accessing the data frame by row
# Select the first row by position; the result is still a data frame.
df[1, ]
## Name Age Income Installment
## ind_1 Anne 28 3000 510.5
# Select a row by its row name held in a variable. A length-one character
# vector can be used as the index directly, so no c() wrapper is needed.
id <- "ind_3"
df[id, ]
## Name Age Income Installment
## ind_3 Frank 21 3000 210.8
Eliminating specific rows or columns
# Negative indices exclude: drop column 2.
df[,-2]
## Name Income Installment
## ind_1 Anne 3000 510.5
## ind_2 Pete 5000 1230.4
## ind_3 Frank 3000 210.8
## ind_4 Julia 10000 5088.3
## ind_5 Cath 12000 4500.0
# Drop row 5.
df[-5,]
## Name Age Income Installment
## ind_1 Anne 28 3000 510.5
## ind_2 Pete 30 5000 1230.4
## ind_3 Frank 21 3000 210.8
## ind_4 Julia 39 10000 5088.3
# Drop row 5 and column 4 in one step.
df[-5,-4]
## Name Age Income
## ind_1 Anne 28 3000
## ind_2 Pete 30 5000
## ind_3 Frank 21 3000
## ind_4 Julia 39 10000
# Extract the Income vector, then drop its third element.
df[["Income"]][-3]
## [1] 3000 5000 10000 12000
# Take row 1, then drop its first column.
df[1,][-1]
## Age Income Installment
## ind_1 28 3000 510.5
Accessing a single element in a row or column
# Extract a column as a vector first, then take its third element.
df[["Income"]][3]
## [1] 3000
df$Income[3]
## [1] 3000
df[, 1][3]
## [1] "Frank"
# Selecting a row first keeps a data frame, so [3] picks the third column
# and the result is a one-cell data frame rather than a bare value.
df[2, ][3]
## Income
## ind_2 5000
# Same, with the row chosen by the row name stored in `id`; the scalar is
# used as the index directly instead of being wrapped in c().
df[id, ][3]
## Income
## ind_3 3000
Code that returns a subset of rows based on a rule
# Logical mask: TRUE for the rows where Age equals 21.
idx <- df$Age == 21
df[idx, ]
## Name Age Income Installment
## ind_3 Frank 21 3000 210.8
# which() turns the condition into the positions of the matching rows.
iddx <- which(df$Age > 30)
df[iddx, ]
## Name Age Income Installment
## ind_4 Julia 39 10000 5088.3
## ind_5 Cath 40 12000 4500.0
Adding an observation (row) or a variable (column)
# Build a one-row data frame with the same columns as df and its own row name.
tom <- data.frame(
  Name = "Tom",
  Age = 45,
  Income = 7000,
  Installment = 3000,
  row.names = "ind_6"
)
# rbind() returns a new data frame; the result is only printed here and not
# assigned, so df itself still has five rows afterwards.
rbind(df, tom)
## Name Age Income Installment
## ind_1 Anne 28 3000 510.5
## ind_2 Pete 30 5000 1230.4
## ind_3 Frank 21 3000 210.8
## ind_4 Julia 39 10000 5088.3
## ind_5 Cath 40 12000 4500.0
## ind_6 Tom 45 7000 3000.0
# Add a derived column: income divided by installment, rounded to 2 decimals.
# The division is already vectorised, so no c() wrapper is required.
df$debtcapacity <- round(df$Income / df$Installment, digits = 2)
df
## Name Age Income Installment debtcapacity
## ind_1 Anne 28 3000 510.5 5.88
## ind_2 Pete 30 5000 1230.4 4.06
## ind_3 Frank 21 3000 210.8 14.23
## ind_4 Julia 39 10000 5088.3 1.97
## ind_5 Cath 40 12000 4500.0 2.67
Sorting a data frame
# sort() returns the sorted values only, not the rows they belong to.
sort(df$Age)
## [1] 21 28 30 39 40
# order() returns the row positions that put Age in ascending order.
ranks <- order(df$Age)
ranks
## [1] 3 1 2 4 5
# Use those positions as a row index to reorder the whole data frame.
df[ranks,]
## Name Age Income Installment debtcapacity
## ind_3 Frank 21 3000 210.8 14.23
## ind_1 Anne 28 3000 510.5 5.88
## ind_2 Pete 30 5000 1230.4 4.06
## ind_4 Julia 39 10000 5088.3 1.97
## ind_5 Cath 40 12000 4500.0 2.67
# Descending order in a single step.
df[order(df$Age, decreasing = TRUE), ]
## Name Age Income Installment debtcapacity
## ind_5 Cath 40 12000 4500.0 2.67
## ind_4 Julia 39 10000 5088.3 1.97
## ind_2 Pete 30 5000 1230.4 4.06
## ind_1 Anne 28 3000 510.5 5.88
## ind_3 Frank 21 3000 210.8 14.23
Summary of the whole data frame and of specific columns or rows
# Structure and per-column summary statistics of the whole data frame.
str(df)
## 'data.frame': 5 obs. of 5 variables:
## $ Name : chr "Anne" "Pete" "Frank" "Julia" ...
## $ Age : num 28 30 21 39 40
## $ Income : num 3000 5000 3000 10000 12000
## $ Installment : num 510 1230 211 5088 4500
## $ debtcapacity: num 5.88 4.06 14.23 1.97 2.67
summary(df)
## Name Age Income Installment
## Length:5 Min. :21.0 Min. : 3000 Min. : 210.8
## Class :character 1st Qu.:28.0 1st Qu.: 3000 1st Qu.: 510.5
## Mode :character Median :30.0 Median : 5000 Median :1230.4
## Mean :31.6 Mean : 6600 Mean :2308.0
## 3rd Qu.:39.0 3rd Qu.:10000 3rd Qu.:4500.0
## Max. :40.0 Max. :12000 Max. :5088.3
## debtcapacity
## Min. : 1.970
## 1st Qu.: 2.670
## Median : 4.060
## Mean : 5.762
## 3rd Qu.: 5.880
## Max. :14.230
# The same two calls on a single column kept as a one-column data frame.
str(df["Income"])
## 'data.frame': 1 obs. of 1 variable:
## $ Income: num 3000 5000 3000 10000 12000
summary(df["Income"])
## Income
## Min. : 3000
## 1st Qu.: 3000
## Median : 5000
## Mean : 6600
## 3rd Qu.:10000
## Max. :12000
# On a single row, every statistic collapses to that row's own value.
str(df[2,])
## 'data.frame': 1 obs. of 5 variables:
## $ Name : chr "Pete"
## $ Age : num 30
## $ Income : num 5000
## $ Installment : num 1230
## $ debtcapacity: num 4.06
summary(df[2,])
## Name Age Income Installment debtcapacity
## Length:1 Min. :30 Min. :5000 Min. :1230 Min. :4.06
## Class :character 1st Qu.:30 1st Qu.:5000 1st Qu.:1230 1st Qu.:4.06
## Mode :character Median :30 Median :5000 Median :1230 Median :4.06
## Mean :30 Mean :5000 Mean :1230 Mean :4.06
## 3rd Qu.:30 3rd Qu.:5000 3rd Qu.:1230 3rd Qu.:4.06
## Max. :30 Max. :5000 Max. :1230 Max. :4.06
# Drop row 2 and column 3 before inspecting and summarising.
str(df[-2,-3])
## 'data.frame': 4 obs. of 4 variables:
## $ Name : chr "Anne" "Frank" "Julia" "Cath"
## $ Age : num 28 21 39 40
## $ Installment : num 510 211 5088 4500
## $ debtcapacity: num 5.88 14.23 1.97 2.67
summary(df[-2,-3])
## Name Age Installment debtcapacity
## Length:4 Min. :21.00 Min. : 210.8 Min. : 1.970
## Class :character 1st Qu.:26.25 1st Qu.: 435.6 1st Qu.: 2.495
## Mode :character Median :33.50 Median :2505.2 Median : 4.275
## Mean :32.00 Mean :2577.4 Mean : 6.188
## 3rd Qu.:39.25 3rd Qu.:4647.1 3rd Qu.: 7.968
## Max. :40.00 Max. :5088.3 Max. :14.230