Creating New Columns

Prologue

You may want to create a new column with numeric values by (1) Splitting variables base a single column or multiple columns into groups (2) Transforming variables of a single column and (3) Arithmetic output of a single column or multiple columns, among other applications.
You may want to create a new column with character strings by (1) Splitting variables base a single column or multiple columns into groups (2) Replacement of multiple strings of a single column, i.e. renaming variables, among other applications
Let’s create a data frame to play with.

# Set seed
set.seed(1)

# Create data frame
df <- data.frame(ID=c(1:100), number1=sample(1:100, size=100, replace=TRUE), number2=sample(1:100, size=100, replace=TRUE), string1=sample(c("A", "B", "C", "D"), size=100, replace=TRUE), string2=sample(c("A", "B", "C", "D"), size=100, replace=TRUE))

Creating new columns with numeric values

Transforming variables of a single column.

df$number3 <- log10(df$number1)
head(df)

##   ID number1 number2 string1 string2  number3
## 1  1      27      66       B       C 1.431364
## 2  2      38      36       A       A 1.579784
## 3  3      58      28       C       B 1.763428
## 4  4      91     100       B       B 1.959041
## 5  5      21      64       A       B 1.322219
## 6  6      90      22       C       D 1.954243

df$number3 <- log2(df$number1)
head(df)

##   ID number1 number2 string1 string2  number3
## 1  1      27      66       B       C 4.754888
## 2  2      38      36       A       A 5.247928
## 3  3      58      28       C       B 5.857981
## 4  4      91     100       B       B 6.507795
## 5  5      21      64       A       B 4.392317
## 6  6      90      22       C       D 6.491853

df$number3 <- log(df$number1)
head(df)

##   ID number1 number2 string1 string2  number3
## 1  1      27      66       B       C 3.295837
## 2  2      38      36       A       A 3.637586
## 3  3      58      28       C       B 4.060443
## 4  4      91     100       B       B 4.510860
## 5  5      21      64       A       B 3.044522
## 6  6      90      22       C       D 4.499810

Arithmetic calculations on single column and across multiple columns.

# Single column
df$number3 <- df$number1*10
head(df)

##   ID number1 number2 string1 string2 number3
## 1  1      27      66       B       C     270
## 2  2      38      36       A       A     380
## 3  3      58      28       C       B     580
## 4  4      91     100       B       B     910
## 5  5      21      64       A       B     210
## 6  6      90      22       C       D     900

# Multiple columns
df$number3 <- df$number1*10 + (df$number2+5)
head(df)

##   ID number1 number2 string1 string2 number3
## 1  1      27      66       B       C     341
## 2  2      38      36       A       A     421
## 3  3      58      28       C       B     613
## 4  4      91     100       B       B    1015
## 5  5      21      64       A       B     279
## 6  6      90      22       C       D     927

Use the ifelse() function if you would like a quick way to create a new column with 2 groups base on a single column or multiple columns.

# Split by the mean
df$number3 <- ifelse(df$number1 > mean(df$number1), "high", "low")
head(df)

##   ID number1 number2 string1 string2 number3
## 1  1      27      66       B       C     low
## 2  2      38      36       A       A     low
## 3  3      58      28       C       B    high
## 4  4      91     100       B       B    high
## 5  5      21      64       A       B     low
## 6  6      90      22       C       D    high

# Split by a specific value
df$number3 <- ifelse(df$number1 > 20, "above 20", "below 20")
head(df)

##   ID number1 number2 string1 string2  number3
## 1  1      27      66       B       C above 20
## 2  2      38      36       A       A above 20
## 3  3      58      28       C       B above 20
## 4  4      91     100       B       B above 20
## 5  5      21      64       A       B above 20
## 6  6      90      22       C       D above 20

# Split base on multiple columns
df$number3 <- ifelse(df$number1 > mean(df$number1) & df$number1 > mean(df$number1), "Double high", "Others")
head(df)

##   ID number1 number2 string1 string2     number3
## 1  1      27      66       B       C      Others
## 2  2      38      36       A       A      Others
## 3  3      58      28       C       B Double high
## 4  4      91     100       B       B Double high
## 5  5      21      64       A       B      Others
## 6  6      90      22       C       D Double high

Use the cut2() function from the Hmisc library if you would like to split a numeric column into equal-sized groups. without knowing beforehand the boundaries of the numeric values.
Use the g argument to define the number of groups.

library(Hmisc)

df$number3 <- cut2(df$number1, g=5)
head(df)

##   ID number1 number2 string1 string2  number3
## 1  1      27      66       B       C [26, 44)
## 2  2      38      36       A       A [26, 44)
## 3  3      58      28       C       B [44, 66)
## 4  4      91     100       B       B [80,100]
## 5  5      21      64       A       B [ 2, 26)
## 6  6      90      22       C       D [80,100]

data.frame(table(df$number3))

##       Var1 Freq
## 1 [ 2, 26)   20
## 2 [26, 44)   21
## 3 [44, 66)   20
## 4 [66, 80)   20
## 5 [80,100]   19

The cuts argument allow you to define your groups when you know beforehand the boundaries of the numeric values that you would like to use.
This method does not always guarantee equal-sized groups.

df$number3 <- cut2(df$number1, cuts=c(0, 20, 40, 60, 80, 100))
head(df)

##   ID number1 number2 string1 string2   number3
## 1  1      27      66       B       C [ 20, 40)
## 2  2      38      36       A       A [ 20, 40)
## 3  3      58      28       C       B [ 40, 60)
## 4  4      91     100       B       B [ 80,100]
## 5  5      21      64       A       B [ 20, 40)
## 6  6      90      22       C       D [ 80,100]

data.frame(table(df$number3))

##        Var1 Freq
## 1 [  0, 20)   13
## 2 [ 20, 40)   24
## 3 [ 40, 60)   19
## 4 [ 60, 80)   25
## 5 [ 80,100]   19

Note the cut2() returns a factor variable.
Also note that the square bracket means inclusive while the parenthesis means exclusive. so [0,20) means interval from 0 to 20 inclusive of 0 but exclusive of 20.

Creating new columns with character strings

Use the ifelse() function if you would like a quick way to create a new column with 2 groups base on a single column and multiple columns.

# Split base on a single column
df$string3 <- ifelse(df$string1 == "A", "A", "non-A")
head(df)

##   ID number1 number2 string1 string2   number3 string3
## 1  1      27      66       B       C [ 20, 40)   non-A
## 2  2      38      36       A       A [ 20, 40)       A
## 3  3      58      28       C       B [ 40, 60)   non-A
## 4  4      91     100       B       B [ 80,100]   non-A
## 5  5      21      64       A       B [ 20, 40)       A
## 6  6      90      22       C       D [ 80,100]   non-A

# Split base on multiple columns
df$string3 <- ifelse(df$string1 == "B" & df$string2 == "C", "BC", "Others")
head(df)

##   ID number1 number2 string1 string2   number3 string3
## 1  1      27      66       B       C [ 20, 40)      BC
## 2  2      38      36       A       A [ 20, 40)  Others
## 3  3      58      28       C       B [ 40, 60)  Others
## 4  4      91     100       B       B [ 80,100]  Others
## 5  5      21      64       A       B [ 20, 40)  Others
## 6  6      90      22       C       D [ 80,100]  Others

Use the gsub() function to rename a single variable of a single column. Use the pattern argument to define the target string and the replacement argument to define its replacement. Variables not specified will be returned as they are.

df$string3 <- gsub(pattern="A", replacement="Alice", df$string1)
head(df)

##   ID number1 number2 string1 string2   number3 string3
## 1  1      27      66       B       C [ 20, 40)       B
## 2  2      38      36       A       A [ 20, 40)   Alice
## 3  3      58      28       C       B [ 40, 60)       C
## 4  4      91     100       B       B [ 80,100]       B
## 5  5      21      64       A       B [ 20, 40)   Alice
## 6  6      90      22       C       D [ 80,100]       C

Use the gsubfun() function from the gsubfn to rename multiple variables of a single column.

library(gsubfn)

df$string3 <- gsubfn(".", list("A"="Alice", "B"="Bob", "C"="Catherine", "D"="Dylan"), as.character(df$string1))
head(df)

##   ID number1 number2 string1 string2   number3   string3
## 1  1      27      66       B       C [ 20, 40)       Bob
## 2  2      38      36       A       A [ 20, 40)     Alice
## 3  3      58      28       C       B [ 40, 60) Catherine
## 4  4      91     100       B       B [ 80,100]       Bob
## 5  5      21      64       A       B [ 20, 40)     Alice
## 6  6      90      22       C       D [ 80,100] Catherine

Use the multigsub() function from the qdap library achieves the same thing as gsubfun(). But instead of simultaneously defining the patterns and the replacements, multigsub() allows you to define the patterns first followed by the replacements.
This is especially useful when given a file containing a list of original patterns and the corresponding alternative names.
You might want to run the following code in your terminal if you have problem loading the library: sudo R CMD javareconf

library(qdap)

df$string3 <- multigsub(c("A", "B", "C", "D"), c("Alice", "Bob", "Catherine", "Dylan"), df$string1)
head(df)

##   ID number1 number2 string1 string2   number3   string3
## 1  1      27      66       B       C [ 20, 40)       Bob
## 2  2      38      36       A       A [ 20, 40)     Alice
## 3  3      58      28       C       B [ 40, 60) Catherine
## 4  4      91     100       B       B [ 80,100]       Bob
## 5  5      21      64       A       B [ 20, 40)     Alice
## 6  6      90      22       C       D [ 80,100] Catherine