regexp test in R
References
Spring Cleaning Data: 5 of 6- 2 ifelse vs Merge example with gsub()
## Sample data: twelve labels of the form 'City (n)' in a single column, dist
## header = TRUE (spelled out, since T can be reassigned) reads "dist" as the column name
dat <- read.table(header = TRUE, text = "
dist
'Boston (1)'
'New York (2)'
'Philadelphia (3)'
'Cleveland (4)'
'Richmond (5)'
'Atlanta (6)'
'Chicago (7)'
'St. Louis (8)'
'Minneapolis (9)'
'Kansas City (10)'
'Dallas (11)'
'San Francisco (12)'
")
## Keep an untouched copy of the raw data (reused later for the speed test)
dat.orig <- dat
## Numbers only
## Matches any one of these characters [a-zA-Z .()], and replaces it with ""
## so only the digits remain (the result is still a character vector)
dat$num <- gsub(pattern = "[a-zA-Z .()]", replacement = "", x = dat$dist)
## City names
## Matches single space, literal (, and anything that follows until the line end
dat$name <- gsub(pattern = " \\(.*$", replacement = "", x = dat$dist)
## Check result
dat
dist num name
1 Boston (1) 1 Boston
2 New York (2) 2 New York
3 Philadelphia (3) 3 Philadelphia
4 Cleveland (4) 4 Cleveland
5 Richmond (5) 5 Richmond
6 Atlanta (6) 6 Atlanta
7 Chicago (7) 7 Chicago
8 St. Louis (8) 8 St. Louis
9 Minneapolis (9) 9 Minneapolis
10 Kansas City (10) 10 Kansas City
11 Dallas (11) 11 Dallas
12 San Francisco (12) 12 San Francisco
## Inspect the column types; num and name are character because gsub() returns
## character vectors
str(dat)
'data.frame': 12 obs. of 3 variables:
$ dist: Factor w/ 12 levels "Atlanta (6)",..: 2 8 9 4 10 1 3 12 7 6 ...
$ num : chr "1" "2" "3" "4" ...
$ name: chr "Boston" "New York" "Philadelphia" "Cleveland" ...
Older try
## Numbers
## pattern = "^[a-zA-Z .]{1,}\\(([0-9]{1,})\\)$" matches as follows
## ^ beginning of line
## [a-zA-Z .]{1,} any of a-zA-Z(space)(period) times one or more
## \\( literal (
## ([0-9]{1,}) any of 0-9 times one or more, and saves it as backreference \\1
## \\) literal )
## $ end of line
##
## replacement = "\\1" recalls the backreference
dat$num2 <- gsub(pattern = "^[a-zA-Z .]{1,}\\(([0-9]{1,})\\)$", replacement = "\\1", x = dat$dist)
## Convert the digits extracted just above to numeric. This must read num2
## itself; converting dat$num here would discard the backreference result.
dat$num2 <- as.numeric(dat$num2)
## Check result
dat
dist num name num2
1 Boston (1) 1 Boston 1
2 New York (2) 2 New York 2
3 Philadelphia (3) 3 Philadelphia 3
4 Cleveland (4) 4 Cleveland 4
5 Richmond (5) 5 Richmond 5
6 Atlanta (6) 6 Atlanta 6
7 Chicago (7) 7 Chicago 7
8 St. Louis (8) 8 St. Louis 8
9 Minneapolis (9) 9 Minneapolis 9
10 Kansas City (10) 10 Kansas City 10
11 Dallas (11) 11 Dallas 11
12 San Francisco (12) 12 San Francisco 12
Backreference simplified
## Captures the run of one or more digits between literal "(" and ")" and
## replaces the whole match with that captured group (backreference \\1)
gsub(
  pattern = ".*\\(([0-9]{1,})\\)",
  replacement = "\\1",
  x = dat$dist
)
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
Suggestions from Tyler Rinker
library(qdap)
## bracketXtract() Apply bracket extraction to character vectors. (returns a list)
## as.numeric() then turns that list into a plain numeric vector; presumably this
## relies on every element holding exactly one bracketed value -- TODO confirm
as.numeric(bracketXtract(dat$dist))
[1] 1 2 3 4 5 6 7 8 9 10 11 12
## bracketX() Apply bracket removal to character vectors
## Trim() Remove leading/trailing white space.
## Nested call: the bracketed part is removed first, then the result is trimmed
Trim(bracketX(dat$dist))
[1] "Boston" "New York" "Philadelphia" "Cleveland" "Richmond" "Atlanta" "Chicago"
[8] "St. Louis" "Minneapolis" "Kansas City" "Dallas" "San Francisco"
## Environment based hash table useful for large vector lookups.
## Here the dist values are passed both as the terms to look up and as the keys,
## and the keys are reassigned to 1:12
lookup(terms = dat$dist, key.match = dat$dist, key.reassign = 1:12)
[1] 1 2 3 4 5 6 7 8 9 10 11 12
Speed test
## Create 120,000 observations
## Repeat the row indices of the original data 10,000 times. seq_len() stays
## correct even for zero rows, and drop = FALSE keeps the single-column result
## a data frame instead of collapsing it to a vector.
dat.rep1 <- dat.orig[rep(seq_len(nrow(dat.orig)), 10000), , drop = FALSE]
## Second copy so each method is timed on its own data
dat.rep2 <- dat.rep1
## gsub() method
## Time both base-R extractions (numeric id and city name) on the replicated data
system.time({
  dat.rep1$num <- as.numeric(
    gsub("[a-zA-Z .()]", "", dat.rep1$dist)
  )
  dat.rep1$name <- gsub(" \\(.*$", "", dat.rep1$dist)
})
user system elapsed
0.833 0.003 0.835
## qdap method
## Times the qdap counterparts of the gsub() run above on the second copy:
## bracketXtract() for the number and Trim(bracketX()) for the city name
system.time({
dat.rep2$num <- as.numeric(bracketXtract(dat.rep2$dist))
dat.rep2$name <- Trim(bracketX(dat.rep2$dist))
})
user system elapsed
21.319 0.149 21.475