Introduction: This assignment explores various ways we can inspect and modify a data frame.

Creating a data frame from vectors

# Build each column as its own vector first ...
name <- c("Anne", "Pete", "Frank", "Julia", "Cath")
age <- c(28, 30, 21, 39, 40)
income <- c(3000, 5000, 3000, 10000, 12000)
debt_payment <- c(510.5, 1230.4, 210.80, 5088.30, 4500)
# ... then bind them column-wise; each column is named explicitly after the
# vector it comes from.
df <- data.frame(
  name = name,
  age = age,
  income = income,
  debt_payment = debt_payment
)
df
##    name age income debt_payment
## 1  Anne  28   3000        510.5
## 2  Pete  30   5000       1230.4
## 3 Frank  21   3000        210.8
## 4 Julia  39  10000       5088.3
## 5  Cath  40  12000       4500.0

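Notice that str() does not change the data: it only abbreviates debt_payment in its display (numbers are formatted to about 3 significant digits by default, so 510.5 is shown as 510), while the data frame still retains the original values.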

# str() prints a compact view of the data frame: its dimensions, then one line
# per column with the column's type and first values.
# Numeric values are abbreviated for display only (510.5 appears as 510 below);
# the values stored in df are unchanged.
str(df)
## 'data.frame':    5 obs. of  4 variables:
##  $ name        : chr  "Anne" "Pete" "Frank" "Julia" ...
##  $ age         : num  28 30 21 39 40
##  $ income      : num  3000 5000 3000 10000 12000
##  $ debt_payment: num  510 1230 211 5088 4500

Functions of data frame

# names() lists the column names of the data frame.
names(df)
## [1] "name"         "age"          "income"       "debt_payment"
# nrow() gives the number of rows (observations).
nrow(df)
## [1] 5
# ncol() gives the number of columns (variables).
ncol(df)
## [1] 4

The dim() function returns the number of rows and columns at the same time, while length() returns the same value as ncol(), because a data frame is a list of columns.

# dim() returns the row count followed by the column count in a single call.
dim(df)
## [1] 5 4
# length() of a data frame counts its columns, hence the same value as ncol(df).
length(df)
## [1] 4

Naming the rows and columns of a data frame

# By default the row names are the row positions, stored as character strings.
row.names(df)
## [1] "1" "2" "3" "4" "5"
# Replace them with the labels "ind_1" ... "ind_5", one per row.
row.names(df) <- paste0("ind_", seq_len(nrow(df)))
df
##        name age income debt_payment
## ind_1  Anne  28   3000        510.5
## ind_2  Pete  30   5000       1230.4
## ind_3 Frank  21   3000        210.8
## ind_4 Julia  39  10000       5088.3
## ind_5  Cath  40  12000       4500.0
# Rename a single column by matching on its current name.
names(df)[names(df) == "debt_payment"] <- "installment"
df
##        name age income installment
## ind_1  Anne  28   3000       510.5
## ind_2  Pete  30   5000      1230.4
## ind_3 Frank  21   3000       210.8
## ind_4 Julia  39  10000      5088.3
## ind_5  Cath  40  12000      4500.0
# Replace every column name at once with a vector of the same length.
names(df) <- c("Name", "Age", "Income", "Installment")
df
##        Name Age Income Installment
## ind_1  Anne  28   3000       510.5
## ind_2  Pete  30   5000      1230.4
## ind_3 Frank  21   3000       210.8
## ind_4 Julia  39  10000      5088.3
## ind_5  Cath  40  12000      4500.0

If the length of the new names vector does not match the number of columns, the remaining column names will be NA.

# Deliberately supply only one name for four columns: the columns without a
# supplied name end up with NA as their name (see the header row printed below).
names(df) <- c("Name")
df
##        Name NA    NA     NA
## ind_1  Anne 28  3000  510.5
## ind_2  Pete 30  5000 1230.4
## ind_3 Frank 21  3000  210.8
## ind_4 Julia 39 10000 5088.3
## ind_5  Cath 40 12000 4500.0

Different ways of accessing data frame

# Restore the full set of column names before demonstrating column access.
names(df) <- c("Name","Age","Income","Installment")
# Single brackets with a column name keep the data frame wrapper (one column).
df["Income"]
##       Income
## ind_1   3000
## ind_2   5000
## ind_3   3000
## ind_4  10000
## ind_5  12000
# `$` extracts the column itself as a plain vector.
df$Income
## [1]  3000  5000  3000 10000 12000
# Double brackets with a name also extract the underlying vector.
df[["Income"]]
## [1]  3000  5000  3000 10000 12000
# Double brackets by position: Income is the third column.
df[[3]]
## [1]  3000  5000  3000 10000 12000
# Matrix-style indexing with drop = TRUE collapses the single column to a vector.
df[,3, drop = TRUE]
## [1]  3000  5000  3000 10000 12000
# With drop = FALSE the result stays a one-column data frame.
df[,3, drop = FALSE]
##       Income
## ind_1   3000
## ind_2   5000
## ind_3   3000
## ind_4  10000
## ind_5  12000
# class() confirms which access forms return a data frame and which return a
# numeric vector.
class(df["Income"])
## [1] "data.frame"
class(df$Income)
## [1] "numeric"
class(df[["Income"]])
## [1] "numeric"
class(df[[3]])
## [1] "numeric"
class(df[,3, drop = TRUE])
## [1] "numeric"
class(df[,3, drop = FALSE])
## [1] "data.frame"
# typeof() shows the underlying storage: the data frame results are lists, the
# extracted column is a double vector.
typeof(df["Income"])
## [1] "list"
typeof(df$Income)
## [1] "double"
typeof(df[["Income"]])
## [1] "double"
typeof(df[[3]])
## [1] "double"
typeof(df[,3, drop = TRUE])
## [1] "double"
typeof(df[,3, drop = FALSE])
## [1] "list"

Notice that only df["column"] and df[, column, drop = FALSE] return a data.frame, while the rest return vectors.

Accessing the data frame by row

# Select a row by position; the empty column slot keeps every column.
df[1, ]
##       Name Age Income Installment
## ind_1 Anne  28   3000       510.5
# A row can also be selected by its row name, here held in a variable.
id <- "ind_3"
df[id, ]
##        Name Age Income Installment
## ind_3 Frank  21   3000       210.8

Eliminating specific rows or columns

# None of these results are assigned, so df itself is left unchanged.
# A negative column index drops that column: here column 2 (Age).
df[,-2]
##        Name Income Installment
## ind_1  Anne   3000       510.5
## ind_2  Pete   5000      1230.4
## ind_3 Frank   3000       210.8
## ind_4 Julia  10000      5088.3
## ind_5  Cath  12000      4500.0
# A negative row index drops that row: here row 5.
df[-5,]
##        Name Age Income Installment
## ind_1  Anne  28   3000       510.5
## ind_2  Pete  30   5000      1230.4
## ind_3 Frank  21   3000       210.8
## ind_4 Julia  39  10000      5088.3
# Both at once: drop row 5 and column 4 (Installment).
df[-5,-4]
##        Name Age Income
## ind_1  Anne  28   3000
## ind_2  Pete  30   5000
## ind_3 Frank  21   3000
## ind_4 Julia  39  10000
# Extract Income as a vector, then drop its third element.
df[["Income"]][-3]
## [1]  3000  5000 10000 12000
# Take row 1 (still a data frame), then drop its first column (Name).
df[1,][-1]
##       Age Income Installment
## ind_1  28   3000       510.5

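Accessing an element in a row or column (note that indexing a row with single brackets still returns a one-cell data frame rather than a bare value)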

# Extract the Income vector, then take its third element.
df[["Income"]][3]
## [1] 3000
# Same result using `$` to extract the column.
df$Income[3]
## [1] 3000
# First column as a vector, then its third element.
df[,1][3]
## [1] "Frank"
# Row 2 is still a data frame, so [3] selects its third column: the result is a
# one-cell data frame, not a bare number.
df[2,][3]
##       Income
## ind_2   5000
# Same idea, with the row picked by the row name stored in `id`.
df[c(id),][3]
##       Income
## ind_3   3000

Code that returns rows based on a set rule

# Logical vector: TRUE where Age equals 21; used directly as a row filter.
idx <- df$Age == 21
df[idx, ]
##        Name Age Income Installment
## ind_3 Frank  21   3000       210.8
# which() turns the logical condition into the positions of the matching rows,
# which are then used as a row index.
iddx <- which(df$Age > 30)
df[iddx, ]
##        Name Age Income Installment
## ind_4 Julia  39  10000      5088.3
## ind_5  Cath  40  12000      4500.0

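Adding an additional observation or variable (note that rbind() returns a new data frame; df itself is unchanged unless the result is assigned back)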

# A new observation is a one-row data frame with the same column names as df.
tom <- data.frame(
  Name = "Tom",
  Age = 45,
  Income = 7000,
  Installment = 3000,
  row.names = "ind_6"
)
# rbind() returns the combined data frame. The result is only printed here, not
# assigned, so df still has five rows afterwards.
rbind(df, tom)
##        Name Age Income Installment
## ind_1  Anne  28   3000       510.5
## ind_2  Pete  30   5000      1230.4
## ind_3 Frank  21   3000       210.8
## ind_4 Julia  39  10000      5088.3
## ind_5  Cath  40  12000      4500.0
## ind_6   Tom  45   7000      3000.0
# A new variable is added by assigning to a column name that does not exist yet:
# income divided by installment, rounded to two decimals.
df$debtcapacity <- round(df$Income / df$Installment, digits = 2)
df
##        Name Age Income Installment debtcapacity
## ind_1  Anne  28   3000       510.5         5.88
## ind_2  Pete  30   5000      1230.4         4.06
## ind_3 Frank  21   3000       210.8        14.23
## ind_4 Julia  39  10000      5088.3         1.97
## ind_5  Cath  40  12000      4500.0         2.67

Sorting data frame

# sort() returns the sorted values themselves.
sort(df$Age)
## [1] 21 28 30 39 40
# order() returns the row positions that put Age in ascending order.
ranks <- order(df$Age)
ranks
## [1] 3 1 2 4 5
# Indexing the rows by those positions yields the data frame sorted by Age.
df[ranks,]
##        Name Age Income Installment debtcapacity
## ind_3 Frank  21   3000       210.8        14.23
## ind_1  Anne  28   3000       510.5         5.88
## ind_2  Pete  30   5000      1230.4         4.06
## ind_4 Julia  39  10000      5088.3         1.97
## ind_5  Cath  40  12000      4500.0         2.67
# decreasing = TRUE reverses the ordering, so the highest Age comes first.
df[order(df$Age, decreasing = TRUE), ]
##        Name Age Income Installment debtcapacity
## ind_5  Cath  40  12000      4500.0         2.67
## ind_4 Julia  39  10000      5088.3         1.97
## ind_2  Pete  30   5000      1230.4         4.06
## ind_1  Anne  28   3000       510.5         5.88
## ind_3 Frank  21   3000       210.8        14.23

Summary of the data frame as a whole, and of specific columns or rows

# Structure of the whole data frame, now including the debtcapacity column.
str(df)
## 'data.frame':    5 obs. of  5 variables:
##  $ Name        : chr  "Anne" "Pete" "Frank" "Julia" ...
##  $ Age         : num  28 30 21 39 40
##  $ Income      : num  3000 5000 3000 10000 12000
##  $ Installment : num  510 1230 211 5088 4500
##  $ debtcapacity: num  5.88 4.06 14.23 1.97 2.67
# summary() reports min, quartiles, median, mean and max for each numeric
# column, and length/class/mode for the character column.
summary(df)
##      Name                Age           Income       Installment    
##  Length:5           Min.   :21.0   Min.   : 3000   Min.   : 210.8  
##  Class :character   1st Qu.:28.0   1st Qu.: 3000   1st Qu.: 510.5  
##  Mode  :character   Median :30.0   Median : 5000   Median :1230.4  
##                     Mean   :31.6   Mean   : 6600   Mean   :2308.0  
##                     3rd Qu.:39.0   3rd Qu.:10000   3rd Qu.:4500.0  
##                     Max.   :40.0   Max.   :12000   Max.   :5088.3  
##   debtcapacity   
##  Min.   : 1.970  
##  1st Qu.: 2.670  
##  Median : 4.060  
##  Mean   : 5.762  
##  3rd Qu.: 5.880  
##  Max.   :14.230
# Restricting to a single column: df["Income"] is a one-column data frame.
str(df["Income"])
## 'data.frame':    5 obs. of  1 variable:
##  $ Income: num  3000 5000 3000 10000 12000
summary(df["Income"])
##      Income     
##  Min.   : 3000  
##  1st Qu.: 3000  
##  Median : 5000  
##  Mean   : 6600  
##  3rd Qu.:10000  
##  Max.   :12000
# Restricting to a single row: with one observation every summary statistic
# equals that row's value.
str(df[2,])
## 'data.frame':    1 obs. of  5 variables:
##  $ Name        : chr "Pete"
##  $ Age         : num 30
##  $ Income      : num 5000
##  $ Installment : num 1230
##  $ debtcapacity: num 4.06
summary(df[2,])
##      Name                Age         Income      Installment    debtcapacity 
##  Length:1           Min.   :30   Min.   :5000   Min.   :1230   Min.   :4.06  
##  Class :character   1st Qu.:30   1st Qu.:5000   1st Qu.:1230   1st Qu.:4.06  
##  Mode  :character   Median :30   Median :5000   Median :1230   Median :4.06  
##                     Mean   :30   Mean   :5000   Mean   :1230   Mean   :4.06  
##                     3rd Qu.:30   3rd Qu.:5000   3rd Qu.:1230   3rd Qu.:4.06  
##                     Max.   :30   Max.   :5000   Max.   :1230   Max.   :4.06
# Excluding row 2 and column 3 (Income) before summarising.
str(df[-2,-3])
## 'data.frame':    4 obs. of  4 variables:
##  $ Name        : chr  "Anne" "Frank" "Julia" "Cath"
##  $ Age         : num  28 21 39 40
##  $ Installment : num  510 211 5088 4500
##  $ debtcapacity: num  5.88 14.23 1.97 2.67
summary(df[-2,-3])
##      Name                Age         Installment      debtcapacity   
##  Length:4           Min.   :21.00   Min.   : 210.8   Min.   : 1.970  
##  Class :character   1st Qu.:26.25   1st Qu.: 435.6   1st Qu.: 2.495  
##  Mode  :character   Median :33.50   Median :2505.2   Median : 4.275  
##                     Mean   :32.00   Mean   :2577.4   Mean   : 6.188  
##                     3rd Qu.:39.25   3rd Qu.:4647.1   3rd Qu.: 7.968  
##                     Max.   :40.00   Max.   :5088.3   Max.   :14.230