# Current date and time as a single character string; the base:: prefix
# calls the base R function explicitly
base::date()
## [1] "Fri Sep 29 21:32:24 2017"
# Current date
Sys.Date()
## [1] "2017-09-29"
# Current date-time, printed with the local time zone
Sys.time()
## [1] "2017-09-29 21:32:24 EDT"
library(tidyverse)
library(stringr)
library(RCurl)
# split every sentence on single spaces (one list element per sentence)
mystrings <- c("the cat in the hat", "green eggs and ham", "fox in socks")
strsplit(mystrings, split = " ")
## [[1]]
## [1] "the" "cat" "in" "the" "hat"
##
## [[2]]
## [1] "green" "eggs" "and" "ham"
##
## [[3]]
## [1] "fox" "in" "socks"
# keep only the names that contain "United"
world <- c("United Kingdom", "United States", "Russia")
grep(pattern = "United", x = world, value = TRUE)
## [1] "United Kingdom" "United States"
# first five characters of each string (shorter strings come back whole)
strings <- c("elephant", "aardvark", "chicken", "dog", "duck", "frog")
substring(strings, first = 1, last = 5)
## [1] "eleph" "aardv" "chick" "dog" "duck" "frog"
# first rows of the built-in USArrests data set
head(USArrests, n = 6)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# the state names are stored as the row names of the data frame
states <- row.names(USArrests)
states
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
# Abbreviation
# shorten every state name (the result is named after the full names)
states2 <- abbreviate(states)
states2
## Alabama Alaska Arizona Arkansas California
## "Albm" "Alsk" "Arzn" "Arkn" "Clfr"
## Colorado Connecticut Delaware Florida Georgia
## "Clrd" "Cnnc" "Dlwr" "Flrd" "Gerg"
## Hawaii Idaho Illinois Indiana Iowa
## "Hawa" "Idah" "Illn" "Indn" "Iowa"
## Kansas Kentucky Louisiana Maine Maryland
## "Knss" "Kntc" "Losn" "Main" "Mryl"
## Massachusetts Michigan Minnesota Mississippi Missouri
## "Mssc" "Mchg" "Mnns" "Msss" "Mssr"
## Montana Nebraska Nevada New Hampshire New Jersey
## "Mntn" "Nbrs" "Nevd" "NwHm" "NwJr"
## New Mexico New York North Carolina North Dakota Ohio
## "NwMx" "NwYr" "NrtC" "NrtD" "Ohio"
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## "Oklh" "Orgn" "Pnns" "RhdI" "SthC"
## South Dakota Tennessee Texas Utah Vermont
## "SthD" "Tnns" "Texs" "Utah" "Vrmn"
## Virginia Washington West Virginia Wisconsin Wyoming
## "Vrgn" "Wshn" "WstV" "Wscn" "Wymn"
# drop the names so only the abbreviations are printed
states2 <- unname(states2)
states2
## [1] "Albm" "Alsk" "Arzn" "Arkn" "Clfr" "Clrd" "Cnnc" "Dlwr" "Flrd" "Gerg"
## [11] "Hawa" "Idah" "Illn" "Indn" "Iowa" "Knss" "Kntc" "Losn" "Main" "Mryl"
## [21] "Mssc" "Mchg" "Mnns" "Msss" "Mssr" "Mntn" "Nbrs" "Nevd" "NwHm" "NwJr"
## [31] "NwMx" "NwYr" "NrtC" "NrtD" "Ohio" "Oklh" "Orgn" "Pnns" "RhdI" "SthC"
## [41] "SthD" "Tnns" "Texs" "Utah" "Vrmn" "Vrgn" "Wshn" "WstV" "Wscn" "Wymn"
# Getting the longest name
# number of characters in every state name
state_chars <- nchar(states)
# keep the name(s) whose length equals the maximum (ties are all returned)
states[state_chars == max(state_chars)]
## [1] "North Carolina" "South Carolina"
# Counting the number of a's
# positions of every "a" or "A" in each state name
positions_a <- gregexpr(pattern = "a", text = states, ignore.case = TRUE)
# how many a's per name? gregexpr() reports "no match" as a single -1, so
# such an element has length 1 and must be counted as 0 explicitly.
# vapply() guarantees an integer vector whatever the input, and a plain
# if/else replaces ifelse(), which is meant for vectors, not scalars.
num_a <- vapply(
  positions_a,
  function(pos) if (pos[1] > 0) length(pos) else 0L,
  integer(1)
)
num_a
## [1] 4 3 2 3 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0
## [36] 2 0 2 1 2 2 0 1 1 0 1 1 1 0 0
# load stringr (remember to install it first)
library(stringr)
# number of lower-case a's in each name (this call is case sensitive, so
# a leading "A" is not counted -- compare with num_a above)
str_count(states, "a")
## [1] 3 2 1 2 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0
## [36] 2 0 2 1 2 2 0 1 1 0 1 1 1 0 0
# Since str_count() does not have an ignore.case argument, we need to transform all letters to lower case, and then count the number of a's like this:
# number of a's per name, ignoring case by lower-casing the names first
str_count(string = tolower(states), pattern = "a")
## [1] 4 3 2 3 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0
## [36] 2 0 2 1 2 2 0 1 1 0 1 1 1 0 0
# Counting the number of vowels
# vector of vowels
vowels <- c("a", "e", "i", "o", "u")
# total occurrences of each vowel across all state names.
# vapply() sizes the result from 'vowels' (no hard-coded length), names it
# after the vowels, and leaves no loop index or scratch variable behind.
# Names are lower-cased so that upper-case vowels are counted as well.
num_vowels <- vapply(
  vowels,
  function(v) sum(str_count(tolower(states), v)),
  integer(1)
)
# total number of vowels
num_vowels
## a e i o u
## 61 28 44 36 8
# sort them in decreasing order
sort(num_vowels, decreasing = TRUE)
## a i o e u
## 61 44 36 28 8
# No quotes
# a sample string
my_string <- "programming with data is fun"
# print() shows the surrounding quotes
print(my_string)
## [1] "programming with data is fun"
# print() without the quotes
print(my_string, quote = FALSE)
## [1] programming with data is fun
# cat() writes the bare text (no quotes, no index)
cat(my_string)
## programming with data is fun
# cat() concatenates its arguments, separated by a space by default
cat(my_string, "with R")
## programming with data is fun with R
# specifying 'sep'
cat(my_string, "with R", sep = " =) ")
## programming with data is fun =) with R
# numbers are converted to text before being joined
cat(1:10, sep = "-")
## 1-2-3-4-5-6-7-8-9-10
# first six months
cat(month.name[1:6], sep = " ")
## January February March April May June
# 'fill = 30' breaks the output into lines of at most 30 characters
cat(
  "Loooooooooong strings", "can be displayed", "in a nice format",
  "by using the 'fill' argument",
  fill = 30
)
## Loooooooooong strings
## can be displayed
## in a nice format
## by using the 'fill' argument
# format()
# format() turns a number into a character string
format(13.7)
## [1] "13.7"
# by default 7 significant digits are kept
format(13.12345678)
## [1] "13.12346"
# 'nsmall' sets the minimum number of decimals
format(13.7, nsmall = 3)
## [1] "13.700"
# noquote() prints a string without quotes
noquote(my_string)
## [1] programming with data is fun
# noquote() returns an object of class "noquote"
no_quotes <- noquote(c("some", "quoted", "text", "!%^(&="))
# display
no_quotes
## [1] some quoted text !%^(&=
# check class
class(no_quotes)
## [1] "noquote"
# it is still a character vector
is.character(no_quotes)
## [1] TRUE
# subsetting keeps the class, so the quotes stay hidden
no_quotes[2:3]
## [1] quoted text
# justification options for character vectors
format(c("A", "BB", "CCC"), width = 5, justify = "centre")
## [1] " A " " BB " " CCC "
format(c("A", "BB", "CCC"), width = 5, justify = "left")
## [1] "A " "BB " "CCC "
format(c("A", "BB", "CCC"), width = 5, justify = "right")
## [1] " A" " BB" " CCC"
format(c("A", "BB", "CCC"), width = 5, justify = "none")
## [1] "A" "BB" "CCC"
# 'digits' sets the number of significant digits
format(1 / 1:5, digits = 2)
## [1] "1.00" "0.50" "0.33" "0.25" "0.20"
# the formatted strings can be formatted again, e.g. centred in 6 characters
format(format(1 / 1:5, digits = 2), width = 6, justify = "centre")
## [1] " 1.00 " " 0.50 " " 0.33 " " 0.25 " " 0.20 "
# 'big.mark' inserts a thousands separator
format(123456789, big.mark = ",")
## [1] "123,456,789"
# The function sprintf() is a wrapper for the C function sprintf() that returns a formatted string combining text and variable values. The nice feature of sprintf() is that it provides a very flexible way of formatting vector elements as character strings. Its usage has the following form: sprintf(fmt, ...). The argument fmt is a character vector of format strings. The allowed conversion specifications start with the symbol % followed by numbers and letters. For demonstration purposes, here are several ways in which the number pi can be formatted:
# '%f' indicates 'fixed point' decimal notation (6 decimals by default)
sprintf("%f", pi)
## [1] "3.141593"
# decimal notation with 3 decimal digits
sprintf("%.3f", pi)
## [1] "3.142"
# minimum field width 1 and 0 decimal digits
sprintf("%1.0f", pi)
## [1] "3"
# minimum field width 5 (padded with spaces) and 1 decimal digit
sprintf("%5.1f", pi)
## [1] "  3.1"
# same width and precision, padded with leading zeros instead
sprintf("%05.1f", pi)
## [1] "003.1"
# print with sign (positive)
sprintf("%+f", pi)
## [1] "+3.141593"
# prefix a space
sprintf("% f", pi)
## [1] " 3.141593"
# left adjustment within a field of width 10
sprintf("%-10f", pi) # left justified
## [1] "3.141593  "
# exponential decimal notation 'e'
sprintf("%e", pi)
## [1] "3.141593e+00"
# exponential decimal notation 'E'
sprintf("%E", pi)
## [1] "3.141593E+00"
# number of significant digits (6 by default)
sprintf("%g", pi)
## [1] "3.14159"
# toString() examples (commented out, not run)
# default usage
#toString(17.04)
# combining two objects
#toString(c(17.04, 1978))
# combining several objects
#toString(c("Bonjour", 123, TRUE, NA, log(exp(1))))
# One of the nice features of toString() is that you can specify its argument width to fix a maximum field width.
# use of 'width'
#toString(c("one", "two", "3333333333"), width = 8)
# Function      Description
# nchar()       number of characters
# tolower()     convert to lower case
# toupper()     convert to upper case
# casefold()    case folding
# chartr()      character translation
# abbreviate()  abbreviation
# substring()   substrings of a character vector
# substr()      substrings of a character vector
# number of characters in each element of a character vector
nchar(c("How", "many", "characters?"))
## [1] 3 4 11
# number of characters in a single string (spaces and punctuation count)
nchar("How many characters?")
## [1] 20
# length() counts elements, not characters
length(c("How", "many", "characters?"))
## [1] 3
# a single string is a vector with one element
length("How many characters?")
## [1] 1
# to lower case
tolower(c("aLL ChaRacterS in LoweR caSe", "ABCDE"))
## [1] "all characters in lower case" "abcde"
# to upper case
toupper(c("All ChaRacterS in Upper Case", "abcde"))
## [1] "ALL CHARACTERS IN UPPER CASE" "ABCDE"
# Usage: casefold(x, upper = FALSE)
# lower case folding (no 'upper' argument given)
casefold("aLL ChaRacterS in LoweR caSe")
## [1] "all characters in lower case"
# upper case folding, requested with upper = TRUE
casefold("All ChaRacterS in Upper Case", upper = TRUE)
## [1] "ALL CHARACTERS IN UPPER CASE"
# Usage: chartr(old, new, x)
# translate every 'a' into 'A'
chartr(old = "a", new = "A", x = "This is a boring string")
## [1] "This is A boring string"
# several translations at once: a -> #, e -> !, i -> ?
crazy <- c("Here's to the crazy ones", "The misfits", "The rebels")
chartr(old = "aei", new = "#!?", x = crazy)
## [1] "H!r!'s to th! cr#zy on!s" "Th! m?sf?ts"
## [3] "Th! r!b!ls"
# Another useful function for basic manipulation of character strings is abbreviate(). Its usage has the following structure: abbreviate(names.arg, minlength = 4, dot = FALSE, strict = FALSE, method = c("left.kept", "both.sides"))
# the first four built-in color names
some_colors <- colors()[1:4]
some_colors
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
# default abbreviation
colors1 <- abbreviate(some_colors)
colors1
## white aliceblue antiquewhite antiquewhite1
## "whit" "alcb" "antq" "ant1"
# abbreviate with 'minlength'
colors2 <- abbreviate(some_colors, minlength = 5)
colors2
## white aliceblue antiquewhite antiquewhite1
## "white" "alcbl" "antqw" "antq1"
# abbreviate with 'minlength' and the "both.sides" method
colors3 <- abbreviate(some_colors, minlength = 3, method = "both.sides")
colors3
## white aliceblue antiquewhite antiquewhite1
## "wht" "alc" "ant" "an1"
# Usage: substr(x, start, stop)
# characters 2 to 4
substr("abcdef", start = 2, stop = 4)
## [1] "bcd"
# overwrite the 2nd letter of every word with a hash symbol
x <- c("may", "the", "force", "be", "with", "you")
substr(x, start = 2, stop = 2) <- "#"
x
## [1] "m#y" "t#e" "f#rce" "b#" "w#th" "y#u"
# overwrite the 2nd and 3rd letters with a happy face
# (a word that is too short only receives what fits)
y <- c("may", "the", "force", "be", "with", "you")
substr(y, start = 2, stop = 3) <- ":)"
y
## [1] "m:)" "t:)" "f:)ce" "b:" "w:)h" "y:)"
# the replacement values are recycled along the vector
z <- c("may", "the", "force", "be", "with", "you")
substr(z, start = 2, stop = 3) <- c("#", "@")
z
## [1] "m#y" "t@e" "f#rce" "b@" "w#th" "y@u"
# Usage: substring(text, first, last = 1000000L)
# substring() and substr() agree when both positions are given
substring("ABCDEF", first = 2, last = 4)
## [1] "BCD"
substr("ABCDEF", start = 2, stop = 4)
## [1] "BCD"
# vectorised positions: pull out each letter separately
substring("ABCDEF", first = 1:6, last = 1:6)
## [1] "A" "B" "C" "D" "E" "F"
# start positions and replacement values are both recycled
text <- c("more", "emotions", "are", "better", "than", "less")
substring(text, first = 1:3) <- c(" ", "zzz")
text
## [1] " ore" "ezzzions" "ar " "zzzter" "t an" "lezz"
# Function       Description
# union()        set union
# intersect()    intersection
# setdiff()      set difference
# setequal()     equal sets
# identical()    exact equality
# is.element()   is element
# %in%           contains
# sort()         sorting
# paste(rep())   repetition
# union: every distinct element of either vector (duplicates dropped)
set1 <- c("some", "random", "words", "some")
set2 <- c("some", "many", "none", "few")
union(set1, set2)
## [1] "some" "random" "words" "many" "none" "few"
# intersection: elements present in both vectors
set3 <- c("some", "random", "few", "words")
set4 <- c("some", "many", "none", "few")
intersect(set3, set4)
## [1] "some" "few"
# difference: elements of the first vector missing from the second
set5 <- c("some", "random", "few", "words")
set6 <- c("some", "many", "none", "few")
setdiff(set5, set6)
## [1] "random" "words"
# set equality ignores order; identical() does not
set7 <- c("some", "random", "strings")
set8 <- c("some", "many", "none", "few")
set9 <- c("strings", "random", "some")
# set7 == set8?
setequal(set7, set8)
## [1] FALSE
# set7 == set9?
setequal(set7, set9)
## [1] TRUE
# set7 identical to set7?
identical(set7, set7)
## [1] TRUE
# set7 identical to set9? (same elements, different order)
identical(set7, set9)
## [1] FALSE
# membership tests
set10 <- c("some", "stuff", "to", "play", "with")
elem1 <- "play"
elem2 <- "crazy"
# elem1 in set10?
is.element(elem1, set10)
## [1] TRUE
# elem2 in set10?
is.element(elem2, set10)
## [1] FALSE
# the %in% operator answers the same questions
elem1 %in% set10
## [1] TRUE
elem2 %in% set10
## [1] FALSE
set11 <- c("today", "produced", "example", "beautiful", "a", "nicely")
# sort (increasing order, the default)
sort(set11)
## [1] "a" "beautiful" "example" "nicely" "produced" "today"
# sort (decreasing order)
sort(set11, decreasing = TRUE)
## [1] "today" "produced" "nicely" "example" "beautiful" "a"
# numbers are sorted the same way
sort(c(2, 3, 4, 1))
## [1] 1 2 3 4
# Function      Description                               Similar to
# str_c()       string concatenation                      paste()
# str_length()  number of characters                      nchar()
# str_sub()     extracts substrings                       substring()
# str_dup()     duplicates characters                     none
# str_trim()    removes leading and trailing whitespace   none
# str_pad()     pads a string                             none
# str_wrap()    wraps a string paragraph                  strwrap()
# str_trim()    trims a string                            none
# str_c() joins its arguments with no separator by default
str_c("May", "The", "Force", "Be", "With", "You")
## [1] "MayTheForceBeWithYou"
# zero-length arguments (NULL, character(0)) are silently dropped
str_c("May", "The", "Force", NULL, "Be", "With", "You", character(0))
## [1] "MayTheForceBeWithYou"
# a custom separator
str_c("May", "The", "Force", "Be", "With", "You", sep = "_")
## [1] "May_The_Force_Be_With_You"
# some text (NA included)
some_text <- c("one", "two", "three", NA, "five")
# compare 'str_length' with 'nchar'
nchar(some_text)
## [1] 3 3 5 NA 4
str_length(some_text)
## [1] 3 3 5 NA 4
# a factor with two levels
some_factor <- factor(c(1, 1, 1, 2, 2, 2), labels = c("good", "bad"))
some_factor
## [1] good good good bad bad bad
## Levels: good bad
# nchar() refuses factors (call left commented out because it errors)
#nchar(some_factor)
## Error: ’nchar()’ requires a character vector
# str_length() works on the factor labels
str_length(some_factor)
## [1] 4 4 4 3 3 3
# Usage: str_sub(string, start = 1L, end = -1L)
# some text
lorem <- "Lorem Ipsum"
# characters 1 to 5 with 'str_sub'
str_sub(lorem, start = 1, end = 5)
## [1] "Lorem"
# the base R equivalent
substring(lorem, first = 1, last = 5)
## [1] "Lorem"
# vectorised start: from 1 to the end, from 2 to the end, from 3 to the end
str_sub("adios", 1:3)
## [1] "adios" "dios" "ios"
# some strings
resto <- c("brasserie", "bistrot", "creperie", "bouchon")
# negative positions count backwards from the end of the string
str_sub(resto, start = -4, end = -1)
## [1] "erie" "trot" "erie" "chon"
# substring() has no such convention and returns empty strings
substring(resto, first = -4, last = -1)
## [1] "" "" "" ""
# every suffix, starting at each position in turn
str_sub(lorem, seq_len(nchar(lorem)))
## [1] "Lorem Ipsum" "orem Ipsum" "rem Ipsum" "em Ipsum" "m Ipsum"
## [6] " Ipsum" "Ipsum" "psum" "sum" "um"
## [11] "m"
substring(lorem, seq_len(nchar(lorem)))
## [1] "Lorem Ipsum" "orem Ipsum" "rem Ipsum" "em Ipsum" "m Ipsum"
## [6] " Ipsum" "Ipsum" "psum" "sum" "um"
## [11] "m"
# negative starts give the suffixes from shortest to longest
str_sub(lorem, -seq_len(nchar(lorem)))
## [1] "m" "um" "sum" "psum" "Ipsum"
## [6] " Ipsum" "m Ipsum" "em Ipsum" "rem Ipsum" "orem Ipsum"
## [11] "Lorem Ipsum"
# substring() returns the whole string for every negative start
substring(lorem, -seq_len(nchar(lorem)))
## [1] "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum"
## [6] "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum"
## [11] "Lorem Ipsum"
# replacing 'Lorem' with 'Nullam' (the replacement may be longer)
lorem <- "Lorem Ipsum"
str_sub(lorem, 1, 5) <- "Nullam"
lorem
## [1] "Nullam Ipsum"
# replacing from a negative position (here: the last character)
lorem <- "Lorem Ipsum"
str_sub(lorem, -1) <- "Nullam"
lorem
## [1] "Lorem IpsuNullam"
# several position pairs produce one result string per pair
lorem <- "Lorem Ipsum"
str_sub(lorem, c(1, 7), c(5, 8)) <- c("Nullam", "Enim")
lorem
## [1] "Nullam Ipsum" "Lorem Enimsum"
# replacing 'Lorem' with the empty string deletes it
lorem <- "Lorem Ipsum"
str_sub(lorem, 1, 5) <- ""
lorem
## [1] " Ipsum"
# Usage: str_dup(string, times)
# repeat a string three times
str_dup("hola", times = 3)
## [1] "holaholahola"
# use with different 'times'
str_dup("adios", times = 1:3)
## [1] "adios" "adiosadios" "adiosadiosadios"
# use with a string vector
words <- c("lorem", "ipsum", "dolor", "sit", "amet")
str_dup(words, times = 2)
## [1] "loremlorem" "ipsumipsum" "dolordolor" "sitsit" "ametamet"
# one 'times' value per word
str_dup(words, times = 1:5)
## [1] "lorem" "ipsumipsum" "dolordolordolor"
## [4] "sitsitsitsit" "ametametametametamet"
# Usage: str_pad(string, width, side = "left", pad = " ")
# default usage: pad on the left with spaces up to 'width' characters
str_pad("hola", width = 7)
## [1] "   hola"
# pad both sides
str_pad("adios", width = 7, side = "both")
## [1] " adios "
# left padding with '#'
str_pad("hashtag", width = 8, pad = "#")
## [1] "#hashtag"
# pad both sides with '-'
str_pad("hashtag", width = 9, side = "both", pad = "-")
## [1] "-hashtag-"
# Usage: str_wrap(string, width = 80, indent = 0, exdent = 0)
# quote (by Douglas Adams), one fragment per element
some_quote <- c(
  "I may not have gone",
  "where I intended to go,",
  "but I think I have ended up",
  "where I needed to be"
)
# glue the fragments into a single paragraph
some_quote <- paste(some_quote, collapse = " ")
# wrap the paragraph to lines of width 30
cat(str_wrap(some_quote, width = 30))
## I may not have gone where I
## intended to go, but I think I
## have ended up where I needed
## to be
# 'indent' indents the first line (here by 2)
cat(str_wrap(some_quote, width = 30, indent = 2), "\n")
## I may not have gone where I
## intended to go, but I think I
## have ended up where I needed
## to be
# 'exdent' indents the following lines (here by 3)
cat(str_wrap(some_quote, width = 30, exdent = 3), "\n")
## I may not have gone where I
## intended to go, but I think I
## have ended up where I needed
## to be
# Usage: str_trim(string, side = "both")
# text with surrounding whitespace (the last element starts with a newline)
bad_text <- c("This", " example ", "has several ", "\nwhitespaces ")
# strip whitespace on the left side only
str_trim(bad_text, side = "left")
## [1] "This" "example " "has several " "whitespaces "
# strip whitespace on the right side only
str_trim(bad_text, side = "right")
## [1] "This" " example" "has several" "\nwhitespaces"
# strip whitespace on both sides
str_trim(bad_text, side = "both")
## [1] "This" "example" "has several" "whitespaces"
# Usage: word(string, start = 1L, end = start, sep = fixed(" "))
# two short sentences
change <- c("Be the change", "you want to be")
# first word of each sentence
word(change, start = 1)
## [1] "Be" "you"
# second word
word(change, start = 2)
## [1] "the" "want"
# last word (negative positions count from the end)
word(change, start = -1)
## [1] "change" "be"
# everything from the second word to the last
word(change, start = 2, end = -1)
## [1] "the change" "want to be"
# a string that starts with a regex metacharacter
money <- "$money"
# in R the metacharacter is escaped with a double backslash
sub(pattern = "\\$", replacement = "", x = money)
## [1] "money"
# the same escaping applies to every metacharacter:
# dollar
sub("\\$", "", "$Peace-Love")
## [1] "Peace-Love"
# dot
sub("\\.", "", "Peace.Love")
## [1] "PeaceLove"
# plus
sub("\\+", "", "Peace+Love")
## [1] "PeaceLove"
# caret
sub("\\^", "", "Peace^Love")
## [1] "PeaceLove"
# vertical bar
sub("\\|", "", "Peace|Love")
## [1] "PeaceLove"
# opening round bracket
sub("\\(", "", "Peace(Love)")
## [1] "PeaceLove)"
# closing round bracket
sub("\\)", "", "Peace(Love)")
## [1] "Peace(Love"
# opening square bracket
sub("\\[", "", "Peace[Love]")
## [1] "PeaceLove]"
# closing square bracket
sub("\\]", "", "Peace[Love]")
## [1] "Peace[Love"
# opening curly bracket
sub("\\{", "", "Peace{Love}")
## [1] "PeaceLove}"
# closing curly bracket
sub("\\}", "", "Peace{Love}")
## [1] "Peace{Love"
# a literal backslash needs four backslashes in the pattern
sub("\\\\", "", "Peace\\Love")
## [1] "PeaceLove"
# opening and closing square bracket: the alternation '\\[|\\]' matches
# either bracket. The pattern must not contain stray spaces -- they are
# matched literally, so a pattern like "\\[| \\] " never finds the "]".
# sub() replaces the first match only ...
sub("\\[|\\]", "", "Peace[Love]")
## [1] "PeaceLove]"
# ... so gsub() is needed to remove both brackets
gsub("\\[|\\]", "", "Peace[Love]")
## [1] "PeaceLove"
# strip the square brackets from every element
n <- c("[Dave]", "[Tony]", "[Sara]")
gsub(pattern = "\\[|\\]", replacement = "", x = n)
## [1] "Dave" "Tony" "Sara"
# the stringr equivalent, written as a pipeline
n %>% str_replace_all("\\[|\\]", "")
## [1] "Dave" "Tony" "Sara"
"Peace[Love]" %>% str_replace_all("\\[|\\]", "")
## [1] "PeaceLove"
# Digits and non-digits
# '\\d' matches a digit: first match only, then every match
sub(pattern = "\\d", replacement = "_", x = "the dandelion war 2010")
## [1] "the dandelion war _010"
gsub(pattern = "\\d", replacement = "_", x = "the dandelion war 2010")
## [1] "the dandelion war ____"
# '\\D' matches anything that is not a digit
sub(pattern = "\\D", replacement = "_", x = "the dandelion war 2010")
## [1] "_he dandelion war 2010"
gsub(pattern = "\\D", replacement = "_", x = "the dandelion war 2010")
## [1] "__________________2010"
# '\\s' matches a whitespace character
sub(pattern = "\\s", replacement = "_", x = "the dandelion war 2010")
## [1] "the_dandelion war 2010"
gsub(pattern = "\\s", replacement = "_", x = "the dandelion war 2010")
## [1] "the_dandelion_war_2010"
# '\\S' matches anything that is not whitespace
sub(pattern = "\\S", replacement = "_", x = "the dandelion war 2010")
## [1] "_he dandelion war 2010"
gsub(pattern = "\\S", replacement = "_", x = "the dandelion war 2010")
## [1] "___ _________ ___ ____"
# Words and non-words
# '\\b' is the word boundary (a zero-width position): insert '_' there
sub("\\b", "_", "the dandelion war 2010")
## [1] "_the dandelion war 2010"
# NOTE(review): this output comes from the run recorded here; results for
# zero-width matches with gsub() may differ between regex engines -- confirm
gsub("\\b", "_", "the dandelion war 2010")
## [1] "_t_h_e_ _d_a_n_d_e_l_i_o_n_ _w_a_r_ _2_0_1_0_"
# '\\B' is any position that is NOT a word boundary: insert '_' there
sub("\\B", "_", "the dandelion war 2010")
## [1] "t_he dandelion war 2010"
gsub("\\B", "_", "the dandelion war 2010")
## [1] "t_he d_an_de_li_on w_ar 2_01_0"
# '\\w' is a word character (letter, digit or underscore): replace with '_'
sub("\\w", "_", "the dandelion war 2010")
## [1] "_he dandelion war 2010"
gsub("\\w", "_", "the dandelion war 2010")
## [1] "___ _________ ___ ____"
# '\\W' is a non-word character (here: the spaces): replace with '_'
sub("\\W", "_", "the dandelion war 2010")
## [1] "the_dandelion war 2010"
gsub("\\W", "_", "the dandelion war 2010")
## [1] "the_dandelion_war_2010"
# '[ei]' matches strings containing an 'e' or an 'i'
transport <- c("car", "bike", "plane", "boat")
grep(pattern = "[ei]", x = transport, value = TRUE)
## [1] "bike" "plane"
# some strings with and without digits
numerics <- c("123", "17-April", "I-II-III", "R 3.0.1")
# strings containing a 0 or a 1
grep(pattern = "[01]", x = numerics, value = TRUE)
## [1] "123" "17-April" "R 3.0.1"
# strings containing any digit
grep(pattern = "[0-9]", x = numerics, value = TRUE)
## [1] "123" "17-April" "R 3.0.1"
# strings containing at least one character that is not a digit
grep(pattern = "[^0-9]", x = numerics, value = TRUE)
## [1] "17-April" "I-II-III" "R 3.0.1"
# a string mixing letters, digits, punctuation, a newline and a tab
la_vie <- "La vie en #FFC0CB (rose);\nCes't la vie! \ttres jolie 78"
# print() shows the escape sequences
print(la_vie)
## [1] "La vie en #FFC0CB (rose);\nCes't la vie! \ttres jolie 78"
# cat() interprets them
cat(la_vie)
## La vie en #FFC0CB (rose);
## Ces't la vie! tres jolie 78
# remove blank characters (spaces and tabs; the newline stays)
gsub(pattern = "[[:blank:]]", replacement = "", x = la_vie)
## [1] "Lavieen#FFC0CB(rose);\nCes'tlavie!tresjolie78"
# remove punctuation
gsub(pattern = "[[:punct:]]", replacement = "", x = la_vie)
## [1] "La vie en FFC0CB rose\nCest la vie \ttres jolie 78"
# remove hexadecimal digits (0-9, a-f and A-F, so some letters go too)
gsub(pattern = "[[:xdigit:]]", replacement = "", x = la_vie)
## [1] "L vi n # (ros);\ns't l vi! \ttrs joli "
# remove printable characters
gsub(pattern = "[[:print:]]", replacement = "", x = la_vie)
## [1] "\n\t"
# remove non-printable characters
gsub(pattern = "[^[:print:]]", replacement = "", x = la_vie)
## [1] "La vie en #FFC0CB (rose);Ces't la vie! tres jolie 78"
# remove graphical characters
gsub(pattern = "[[:graph:]]", replacement = "", x = la_vie)
## [1] " \n \t "
# remove non-graphical characters
gsub(pattern = "[^[:graph:]]", replacement = "", x = la_vie)
## [1] "Lavieen#FFC0CB(rose);Ces'tlavie!tresjolie78"
# remove everything that is not a letter
gsub(pattern = "[^[:alpha:]]", replacement = "", x = la_vie)
## [1] "LavieenFFCCBroseCestlavietresjolie"
# Another set of regex elements are the so-called quantifiers. These are used when we want to match a certain number of characters that meet certain criteria.
# people names
people <- c(
  "rori", "emilia", "matteo", "mehmet", "filipe", "anna", "tyler",
  "rasmus", "jacob", "youna", "flora", "adi"
)
# ?      the preceding item is optional (matched at most once).
# Zero occurrences are allowed, so every name matches 'm?'
grep(pattern = "m?", people, value = TRUE)
## [1] "rori" "emilia" "matteo" "mehmet" "filipe" "anna" "tyler"
## [8] "rasmus" "jacob" "youna" "flora" "adi"
# {n}    the preceding item is matched exactly n times.
# grep() searches anywhere in the string, so 'm{1}' finds every name with
# at least one 'm' (including "mehmet", which has two)
grep(pattern = "m{1}", people, value = TRUE, perl = FALSE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# {n,m}  the preceding item is matched at least n but not more than m times
grep(pattern = "m{1,1}", people, value = TRUE, perl = FALSE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# {n,}   the preceding item is matched n or more times
grep(pattern = "m{1,}", people, value = TRUE, perl = FALSE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# *      the preceding item is matched zero or more times:
# zero or more 'm' followed by 't' (any name containing a 't')
grep(pattern = "m*t", people, value = TRUE)
## [1] "matteo" "mehmet" "tyler"
# zero or more 't' followed by 'm' (any name containing an 'm')
grep(pattern = "t*m", people, value = TRUE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# +      the preceding item is matched one or more times
grep(pattern = "m+", people, value = TRUE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# one or more 'm', then any single character, then 't'
grep(pattern = "m+.t", people, value = TRUE)
## [1] "matteo" "mehmet"
# two consecutive 't'
grep(pattern = "t{2}", people, value = TRUE)
## [1] "matteo"
# grep() finding regex matches which elements are matched (index or value)
# grepl() finding regex matches which elements are matched (TRUE & FALSE)
# regexpr() finding regex matches positions of the first match
# gregexpr() finding regex matches positions of all matches
# regexec() finding regex matches hybrid of regexpr() and gregexpr()
# sub() replacing regex matches only first match is replaced
# gsub() replacing regex matches all matches are replaced
# strsplit() splitting regex matches split vector according to matches
# Function Description
# str_detect() Detect the presence or absence of a pattern in a string
# str_extract() Extract first piece of a string that matches a pattern
# str_extract_all() Extract all pieces of a string that match a pattern
# str_match() Extract first matched group from a string
# str_match_all() Extract all matched groups from a string
# str_locate() Locate the position of the first occurrence of a pattern in a string
# str_locate_all() Locate the position of all occurrences of a pattern in a string
# str_replace() Replace first occurrence of a matched pattern in a string
# str_replace_all() Replace all occurrences of a matched pattern in a string
# str_split() Split up a string into a variable number of pieces
# str_split_fixed() Split up a string into a fixed number of pieces
The important thing to keep in mind is that all pattern matching functions in stringr have the following general form: str_function(string, pattern)
# Function: purpose -- characteristic
# regmatches(): extract or replace matches -- use with data from regexpr(), gregexpr() or regexec()
# match(): value matching -- finding positions of (first) matches
# pmatch(): partial string matching -- finding positions
# charmatch(): similar to pmatch() -- finding positions
# Function Description
# apropos() find objects by (partial) name
# browseEnv() browse objects in environment
# glob2rx() change wildcard or globbing pattern into Regular Expression
# help.search() search the help system
# list.files() list the files in a directory/folder
The first five grep()-like functions are grep(), grepl(), regexpr(), gregexpr(), and regexec(). The goal is the same for all these functions: finding a match. The difference between them is in the format of the output. Essentially these functions require two main arguments: a pattern (i.e. regular expression), and a text to match.
grep(pattern, text) grepl(pattern, text) regexpr(pattern, text) gregexpr(pattern, text) regexec(pattern, text)
grep() is perhaps the most basic function that allows us to match a pattern in a string vector. The first argument in grep() is a regular expression that specifies the pattern to match.
# sample strings to search
text <- c("one word", "a sentence", "you and me", "three two one")
# regular expression to look for
pat <- "one"
# by default grep() returns the indices of the elements that match
grep(pattern = pat, x = text)
## [1] 1 4
# value = TRUE returns the matching elements themselves
grep(pattern = pat, x = text, value = TRUE)
## [1] "one word" "three two one"
# invert = TRUE returns the indices of the elements that do NOT match
grep(pattern = pat, x = text, invert = TRUE)
## [1] 2 3
# invert together with value returns the non-matching elements
grep(pattern = pat, x = text, value = TRUE, invert = TRUE)
## [1] "a sentence" "you and me"
To find exactly where the pattern is found in a given string, we can use the regexpr() function. This function returns more detailed information than grep() providing us: a) which elements of the text vector actually contain the regex pattern, and b) identifies the position of the substring that is matched by the regular expression pattern.
# sample strings to search
text <- c("one word", "a sentence", "you and me", "three two one")
# start position and length of the first match in each element
regexpr(pattern = "one", text = text)
## [1] 1 -1 -1 11
## attr(,"match.length")
## [1] 3 -1 -1 3
## attr(,"useBytes")
## [1] TRUE
At first glance the output from regexpr() may look a bit messy but it’s very simple to interpret. What we have in the output are three displayed elements. The first element is an integer vector of the same length as text giving the starting positions of the first match. In this example the number 1 indicates that the pattern “one” starts at the position 1 of the first element in text. The negative index -1 means that there was no match; the number 11 indicates the position of the substring that was matched in the fourth element of text. The attribute “match.length” gives us the length of the match in each element of text. Again, a negative value of -1 means that there was no match in that element. Finally, the attribute “useBytes” has a value of TRUE which means that the matching was done byte-by-byte rather than character-by-character.
The function gregexpr() does practically the same thing as regexpr(): identify where a pattern is within a string vector, by searching each element separately. The only difference is that gregexpr() has an output in the form of a list. In other words, gregexpr() returns a list of the same length as text, each element of which is of the same form as the return value for regexpr(), except that the starting positions of every (disjoint) match are given.
# sample strings to search
text <- c("one word", "a sentence", "you and me", "three two one")
# regular expression to look for
pat <- "one"
# one list element per string, holding the positions of its matches
gregexpr(pattern = pat, text = text)
## [[1]]
## [1] 1
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 11
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
The function regexec() is very close to gregexpr() in the sense that the output is also a list of the same length as text. Each element of the list contains the starting position of the match. A value of -1 reflects that there is no match. In addition, each element of the list has the attribute “match.length” giving the lengths of the matches (or -1 for no match):
# sample strings to search
text <- c("one word", "a sentence", "you and me", "three two one")
# regular expression to look for
pat <- "one"
# one list element per string, holding the position of its match
regexec(pattern = pat, text = text)
## [[1]]
## [1] 1
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 11
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
# extract the matched substring from each element ("" where nothing matched)
x <- regexpr(pattern = pat, text = text)
substring(text, first = x, last = x + attr(x, "match.length") - 1)
## [1] "one" "" "" "one"
# an NA element gives NA for both the position and the match length
regexpr(pattern = pat, text = c(text, NA))
## [1] 1 -1 -1 11 NA
## attr(,"match.length")
## [1] 3 -1 -1 3 NA
## attr(,"useBytes")
## [1] TRUE
Sometimes finding a pattern in a given string vector is all we want. However, there are occasions in which we might also be interested in replacing one pattern with another one. For this purpose we can use the substitution functions sub() and gsub(). The difference between sub() and gsub() is that the former replaces only the first occurrence of a pattern whereas the latter replaces all occurrences. The replacement functions require three main arguments: a regex pattern to be matched, a replacement for the matched pattern, and the text where matches are sought. The basic usage is: sub(pattern, replacement, text) gsub(pattern, replacement, text)
The function sub() replaces the first occurrence of a pattern in a given text. This means that if there is more than one occurrence of the pattern in each element of a string vector, only the first one will be replaced. For example, suppose we have the following text vector containing various strings:
# strings in which 'R' occurs zero, one or several times
Rstring <- c(
  "The R Foundation",
  "for Statistical Computing",
  "R is FREE software",
  "R is a collaborative project"
)
# replace only the first 'R' of each element with 'RR'
sub(pattern = "R", replacement = "RR", x = Rstring)
## [1] "The RR Foundation" "for Statistical Computing"
## [3] "RR is FREE software" "RR is a collaborative project"
To replace not only the first pattern occurrence, but all of the occurrences, we should use gsub() (think of it as global substitution). If we take the same vector Rstring and patterns of the last example, this is what we obtain when we apply gsub():
# strings in which 'R' occurs zero, one or several times
Rstring <- c(
  "The R Foundation",
  "for Statistical Computing",
  "R is FREE software",
  "R is a collaborative project"
)
# replace every 'R' of each element with 'RR'
gsub(pattern = "R", replacement = "RR", x = Rstring)
## [1] "The RR Foundation" "for Statistical Computing"
## [3] "RR is FRREE software" "RR is a collaborative project"
Besides the operations of finding patterns and replacing patterns, another common task is splitting a string based on a pattern. To do this R comes with the function strsplit() which is designed to split the elements of a character vector into substrings according to regex matches. If you check the help documentation —help(strsplit)— you will see that the basic usage of strsplit() requires two main arguments: strsplit(x, split) where x is the character vector and split is the regular expression pattern. However, in order to keep the same notation that we’ve been using with the other grep() functions, it is better if we think of x as text, and split as pattern. In this way we can express the usage of strsplit() as: strsplit(text, pattern)
# a sentence to break into words
sentence <- "R is a collaborative project with many contributors"
# split on single blanks
strsplit(x = sentence, split = " ")
## [[1]]
## [1] "R" "is" "a" "collaborative"
## [5] "project" "with" "many" "contributors"
# telephone numbers written as dash-separated groups of digits
tels <- c("510-548-2238", "707-231-2440", "650-752-1300")
# split each number on the dashes
strsplit(x = tels, split = "-")
## [[1]]
## [1] "510" "548" "2238"
##
## [[2]]
## [1] "707" "231" "2440"
##
## [[3]]
## [1] "650" "752" "1300"
In the previous chapter we briefly presented the functions of the R package stringr for regular expressions. As we mentioned, all the stringr functions share a common usage structure: str_function(string, pattern) The main two arguments are: a string vector to be processed, and a single pattern (i.e. regular expression) to match. Moreover, all the function names begin with the prefix str_, followed by the name of the action to be performed. For example, to locate the position of the first occurrence, we should use str_locate(); to locate the positions of all matches we should use str_locate_all().
For detecting whether a pattern is present (or absent) in a string vector, we can use the function str_detect(). Actually, this function is a wrapper of grepl():
# a few object names
some_objs <- c("pen", "pencil", "marker", "spray")
# which names contain "pen"?
str_detect(string = some_objs, pattern = "pen")
## [1] TRUE TRUE FALSE FALSE
# keep only the names that matched
some_objs[str_detect(string = some_objs, pattern = "pen")]
## [1] "pen" "pencil"
The pattern matches dates of the form day-month-year:
# candidate strings, only some of which are day-month-year dates
strings <- c(
  "12 Jun 2002", " 8 September 2004 ", "22-July-2009 ",
  "01 01 2001", "date", "02.06.2000",
  "xxx-yyy-zzzz", "$2,600"
)
# date pattern (month as text): day, separator, month name, separator, year
dates <- "([0-9]{1,2})[- .]([a-zA-Z]+)[- .]([0-9]{4})"
# detect dates
str_detect(string = strings, pattern = dates)
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
For extracting a string containing a pattern, we can use the function str_extract(). In fact, this function extracts the first piece of a string that matches a given pattern. For example, imagine that we have a character vector with some tweets about Paris, and that we want to extract the hashtags. We can do this simply by defining a #hashtag pattern like #[a-zA-Z]{1,}
# tweets about 'Paris' (the last one has no hashtag)
paris_tweets <- c(
  "#Paris is chock-full of cultural and culinary attractions",
  "Some time in #Paris along Canal St.-Martin famous by #Amelie",
  "While you're in #Paris, stop at cafe: http://goo.gl/yaCbW",
  "Paris, the city of light"
)
# hashtag pattern: '#' followed by one or more letters
hash <- "#[a-zA-Z]{1,}"
# first hashtag of each tweet
str_extract(string = paris_tweets, pattern = hash)
## [1] "#Paris" "#Paris" "#Paris" NA
In addition to str_extract(), stringr also provides the function str_extract_all(). As its name indicates, we use str_extract_all() to extract all patterns in a string vector. Taking the same string as in the previous example, we can extract all the hashtag matches like so:
# every hashtag of each tweet, returned as a list
str_extract_all(string = paris_tweets, pattern = "#[a-zA-Z]{1,}")
## [[1]]
## [1] "#Paris"
##
## [[2]]
## [1] "#Paris" "#Amelie"
##
## [[3]]
## [1] "#Paris"
##
## [[4]]
## character(0)
Closely related to str_extract(), the package stringr offers another extracting function: str_match(). This function not only extracts the matched pattern but it also shows each of the matched (parenthesized) groups of the regex pattern.
# candidate strings, only some of which are day-month-year dates
strings <- c(
  "12 Jun 2002", " 8 September 2004 ", "22-July-2009 ",
  "01 01 2001", "date", "02.06.2000",
  "xxx-yyy-zzzz", "$2,600"
)
# date pattern (month as text) with three capture groups
dates <- "([0-9]{1,2})[- .]([a-zA-Z]+)[- .]([0-9]{4})"
# full match plus one column per capture group
str_match(string = strings, pattern = dates)
## [,1] [,2] [,3] [,4]
## [1,] "12 Jun 2002" "12" "Jun" "2002"
## [2,] "8 September 2004" "8" "September" "2004"
## [3,] "22-July-2009" "22" "July" "2009"
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## [7,] NA NA NA NA
## [8,] NA NA NA NA
If what we’re looking for is extracting all matched patterns in a string vector, instead of using str_match() we should use str_match_all():
# tweets about 'Paris' (the last one has no hashtag)
paris_tweets <- c(
  "#Paris is chock-full of cultural and culinary attractions",
  "Some time in #Paris along Canal St.-Martin famous by #Amelie",
  "While you're in #Paris, stop at cafe: http://goo.gl/yaCbW",
  "Paris, the city of light"
)
# all hashtag matches of each tweet, one matrix per tweet
str_match_all(string = paris_tweets, pattern = "#[a-zA-Z]{1,}")
## [[1]]
## [,1]
## [1,] "#Paris"
##
## [[2]]
## [,1]
## [1,] "#Paris"
## [2,] "#Amelie"
##
## [[3]]
## [,1]
## [1,] "#Paris"
##
## [[4]]
## [,1]
Besides detecting, extracting and matching regex patterns, stringr allows us to locate occurrences of patterns. For locating the position of the first occurrence of a pattern in a string vector, we should use str_locate().
# start and end position of the first hashtag in each tweet
str_locate(string = paris_tweets, pattern = "#[a-zA-Z]{1,}")
## start end
## [1,] 1 6
## [2,] 14 19
## [3,] 17 22
## [4,] NA NA
To locate not just the first but all the occurrences of a pattern in a string vector, we should use str_locate_all():
# start and end positions of every hashtag in each tweet
str_locate_all(string = paris_tweets, pattern = "#[a-zA-Z]{1,}")
## [[1]]
## start end
## [1,] 1 6
##
## [[2]]
## start end
## [1,] 14 19
## [2,] 54 60
##
## [[3]]
## start end
## [1,] 17 22
##
## [[4]]
## start end
For replacing the first occurrence of a matched pattern in a string, we can use str_replace(). Its usage has the following form: str_replace(string, pattern, replacement)
In addition to the main 2 inputs of the rest of the functions, str_replace() requires a third argument that indicates the replacement pattern. Say we have the city names of San Francisco, Barcelona, Naples and Paris in a vector. And let’s suppose that we want to replace the first vowel in each name with a semicolon. Here’s how we can do that:
# names of four cities
cities <- c("San Francisco", "Barcelona", "Naples", "Paris")
# replace the first lowercase vowel of each name with ';'
str_replace(string = cities, pattern = "[aeiou]", replacement = ";")
## [1] "S;n Francisco" "B;rcelona" "N;ples" "P;ris"
Now, suppose that we want to replace the first consonant in each name. We just need to modify the pattern with a negated class:
# replace the first character that is not a lowercase vowel with ';'
str_replace(string = cities, pattern = "[^aeiou]", replacement = ";")
## [1] ";an Francisco" ";arcelona" ";aples" ";aris"
For replacing all occurrences of a matched pattern in a string, we can use str_replace_all(). Once again, consider a vector with some city names, and let’s suppose that we want to replace all the vowels in each name:
# names of four cities
cities <- c("San Francisco", "Barcelona", "Naples", "Paris")
# replace every lowercase vowel with ';'
str_replace_all(string = cities, pattern = "[aeiou]", replacement = ";")
## [1] "S;n Fr;nc;sc;" "B;rc;l;n;" "N;pl;s" "P;r;s"
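To replace all consonants with a semicolon in each name, we just need to change the pattern with a negated class. Note that the negated class [^aeiou] matches every character that is not a lowercase vowel, so blank spaces (as in “San Francisco”) are replaced as well: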
# replace every character that is not a lowercase vowel with ';'
str_replace_all(string = cities, pattern = "[^aeiou]", replacement = ";")
## [1] ";a;;;;a;;i;;o" ";a;;e;o;a" ";a;;e;" ";a;i;"
Similar to strsplit(), stringr gives us the function str_split() to separate a character vector into a number of pieces. This function has the following usage: str_split(string, pattern, n = Inf) The argument n is the maximum number of pieces to return. The default value (n = Inf) implies that all possible split positions are used. Let’s see the same example of strsplit() in which we wish to split up a sentence into individual words:
# a sentence to break into words
sentence <- "R is a collaborative project with many contributors"
# split on single blanks
str_split(string = sentence, pattern = " ")
## [[1]]
## [1] "R" "is" "a" "collaborative"
## [5] "project" "with" "many" "contributors"
Likewise, we can break apart the portions of a telephone number by splitting those sets of digits joined by a dash “-”:
# telephone numbers written as dash-separated groups of digits
tels <- c("510-548-2238", "707-231-2440", "650-752-1300")
# split each number on the dashes
str_split(string = tels, pattern = "-")
## [[1]]
## [1] "510" "548" "2238"
##
## [[2]]
## [1] "707" "231" "2440"
##
## [[3]]
## [1] "650" "752" "1300"
Let’s consider a vector with flavors “chocolate”, “vanilla”, “cinnamon”, “mint”, and “lemon”. Suppose we want to split each flavor name, defining as pattern the class of vowels:
# flavor names
flavors <- c("chocolate", "vanilla", "cinnamon", "mint", "lemon")
# split each name at every lowercase vowel
str_split(string = flavors, pattern = "[aeiou]")
## [[1]]
## [1] "ch" "c" "l" "t" ""
##
## [[2]]
## [1] "v" "n" "ll" ""
##
## [[3]]
## [1] "c" "nn" "m" "n"
##
## [[4]]
## [1] "m" "nt"
##
## [[5]]
## [1] "l" "m" "n"
Now let’s modify the maximum number of pieces to n = 2. This means that str_split() will split each element into a maximum of 2 pieces. Here’s what we obtain:
# split each name at its first lowercase vowel only (at most 2 pieces)
str_split(string = flavors, pattern = "[aeiou]", n = 2)
## [[1]]
## [1] "ch" "colate"
##
## [[2]]
## [1] "v" "nilla"
##
## [[3]]
## [1] "c" "nnamon"
##
## [[4]]
## [1] "m" "nt"
##
## [[5]]
## [1] "l" "mon"
In addition to str_split(), there is also the str_split_fixed() function that splits up a string into a fixed number of pieces. Its usage has the following form: str_split_fixed(string, pattern, n) Note that the argument n does not have a default value. In other words, we need to specify an integer to indicate the number of pieces. Consider again the same vector of flavors, and the letter “n” as the pattern to match. Let’s see the behavior of str_split_fixed() with n = 2.
# flavor names
flavors <- c("chocolate", "vanilla", "cinnamon", "mint", "lemon")
# split each name on "n" into exactly 2 pieces (a 2-column matrix)
str_split_fixed(string = flavors, pattern = "n", n = 2)
## [,1] [,2]
## [1,] "chocolate" ""
## [2,] "va" "illa"
## [3,] "ci" "namon"
## [4,] "mi" "t"
## [5,] "lemo" ""
If we change the value n = 3, we will obtain a matrix with three columns:
# split each flavor name on "n" into exactly 3 pieces (a 3-column matrix)
str_split_fixed(string = flavors, pattern = "n", n = 3)
## [,1] [,2] [,3]
## [1,] "chocolate" "" ""
## [2,] "va" "illa" ""
## [3,] "ci" "" "amon"
## [4,] "mi" "t" ""
## [5,] "lemo" "" ""
# function that reverses a string by characters
# Reverse a string character by character.
#
# Args:
#   string: a character string; only its first element is reversed
#           (use lapply() to reverse every element of a vector).
# Returns:
#   A single string with the characters in reverse order; an empty string
#   gives back an empty string.
reverse_chars <- function(string) {
  # split string by characters
  string_split <- strsplit(string, split = "")
  # reverse with rev() rather than indexing by nchar(string):1, because for
  # an empty string that sequence is c(0, 1) and the result became "NA"
  reversed_chars <- rev(string_split[[1]])
  # collapse reversed characters
  paste(reversed_chars, collapse = "")
}
# quick check of 'reverse_chars' on a short string
reverse_chars(string = "abcdefg")
## [1] "gfedcba"
# Reverse a string character by character.
#
# Args:
#   string: the value to reverse; it is converted with as.character(), so
#           non-character input such as numbers is accepted. Only the first
#           element is reversed (use lapply() for vectors).
# Returns:
#   A single string with the characters in reverse order; an empty string
#   gives back an empty string.
reverse_chars <- function(string) {
  string_split <- strsplit(as.character(string), split = "")
  # rev() instead of indexing by nchar(string):1, which is c(0, 1) for an
  # empty string and made the function return "NA"
  reversed_split <- rev(string_split[[1]])
  paste(reversed_split, collapse = "")
}
# a single word
reverse_chars(string = "atmosphere")
## [1] "erehpsomta"
# several words: blanks are reversed like any other character
reverse_chars(string = "the big bang theory")
## [1] "yroeht gnab gib eht"
# a short string
reverse_chars(string = "abcdefg")
## [1] "gfedcba"
# non-character input
reverse_chars(string = 12345)
## [1] "54321"
# reverse every element of a vector (by characters)
lapply(X = c("the big bang theory", "atmosphere"), FUN = reverse_chars)
## [[1]]
## [1] "yroeht gnab gib eht"
##
## [[2]]
## [1] "erehpsomta"
The second type of reversing operation is to reverse a string by words. In this case the procedure involves splitting up a string by words, re-arranging them in reverse order, and pasting them back in one sentence. Here’s how we can define our reverse_words() function:
# # function that reverses a string by words
# reverse_words <- function(string) {
#   # split string by blank spaces
#   string_split = strsplit(as.character(string), split = " ")
#   # how many split terms?
#   string_length = length(string_split[[1]])
#   # decide what to do
#   if (string_length == 1) {
#     # one word (do nothing)
#     reversed_string = string_split[[1]]
#   } else {
#     # more than one word (collapse them)
#     reversed_split = string_split[[1]][string_length:1]
#     reversed_string = paste(reversed_split, collapse = " ")
#   }
#   # output
#   return(reversed_string)
# }
# examples
# reverse_words("atmosphere")
#
# reverse_words("the big bang theory")
# reverse vector (by words)
#lapply(c("the big bang theory", "atmosphere"), reverse_words)
The second practical example that we will discuss consists of matching an email address. We will work with usual email addresses having one (or a similar variant) of the following forms: somename@email.com, somename99@email.com, some.name@email.com, some.name@an-email.com, some.name@an.email.com. Since our goal is to match an email address, this implies that we need to define a corresponding regex pattern. If we look at the previous email forms it is possible to see that they have a general structure that can be broken into three parts. The first part is the username (e.g. somename99). The second part is an @ symbol. The third part is the domain name (e.g. an.email.com). The username pattern can be defined as: ^([a-z0-9_\.-]+) The username pattern starts with a caret ^ to indicate the beginning of the string. Then we have a group indicated with parentheses. It matches one or more lowercase letters, numbers, underscores, dots, or hyphens. The domain name pattern can be defined as: ([\da-z\.-]+)\.([a-z\.]{2,6})$ The domain name should be one or more digits, lowercase letters, dots, or hyphens. Then another (escaped) dot, followed by an extension of two to six letters or dots. And finally the end of the string ($). The complete regular expression pattern (in R) for an email address is: "^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$" Let’s test our pattern with a minimalist example:
# pattern: username, '@', domain name, '.', extension of 2 to 6 letters/dots
email_pat <- "^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$"
# perl = TRUE is used below so that \\d inside the bracket expression is
# read as the digit class; R's default regex engine is not guaranteed to
# support shorthand classes inside [...] (see ?regex)
# string that matches
grepl(pattern = email_pat, x = "gaston@abc.com", perl = TRUE)
## [1] TRUE
# another string that matches
grep(pattern = email_pat, x = "gaston.sanchez@research-center.fr", perl = TRUE)
## [1] 1
# unmatched email (TLD too long)
grep(pattern = email_pat, x = "gaston@abc.something", perl = TRUE)
## integer(0)
# candidate email addresses
emails <- c(
  "simple@example.com",
  "johnsmith@email.gov",
  "marie.curie@college.edu",
  "very.common@example.com",
  "a.little.lengthy.but.ok@dept.example.com",
  "disposable.style.email.with+symbol@example.com",
  "not_good@email.address"
)
# which candidates match the email pattern?
str_detect(emails, email_pat)
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE
The first step is to create a vector of character strings that will contain the lines of the mailing lists webpage. We can create this vector by simply passing the URL name to readLines():
# read html content
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# one character string per line of the mailing lists web page
mail_lists <- readLines(con = "http://www.r-project.org/mail.html")
# first few lines of the page
head(mail_lists)
## [1] "<!DOCTYPE html>"
## [2] "<html lang=\"en\">"
## [3] " <head>"
## [4] " <meta charset=\"utf-8\">"
## [5] " <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">"
## [6] " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">"
# build a tm corpus with one document per line of the page
docs <- mail_lists %>%
  VectorSource() %>%
  Corpus()
docs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 153
Once we’ve read the HTML content of the R mailing lists webpage, the next step is to define our regex pattern that matches the SIG links: '^.*<td> *<a href="(https.*)">.*$'
# SIG's href pattern: a table cell whose link target starts with "https"
sig_pattern <- '^.*<td> *<a href="(https.*)">.*$'
# lines of the page that contain a SIG link
sig_hrefs <- grep(sig_pattern, mail_lists, value = TRUE)
# preview at most the first 5 matches, each shortened to 70 characters;
# head() copes with fewer than 5 matches, whereas looping over 1:5 indexed
# past the end of an empty 'sig_hrefs' and produced "NA" strings
shorten_sigs <- vapply(head(sig_hrefs, 5), toString, character(1),
                       width = 70, USE.NAMES = FALSE)
shorten_sigs
## character(0)
# get first matched group (the link target)
sub(sig_pattern, "\\1", sig_hrefs)
## character(0)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
# download the data set with data.table's fast reader
mydat <- fread(input = 'http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat')
# first rows
mydat %>% head()
## V1 V2 V3 V4 V5
## 1: 1 307 930 36.58 0
## 2: 2 307 940 36.73 0
## 3: 3 307 950 36.93 0
## 4: 4 307 1000 37.15 0
## 5: 5 307 1010 37.23 0
## 6: 6 307 1020 37.24 0
Read TXT files with read.table(): if you have a .txt or a tab-delimited text file, you can easily import it with the basic R function read.table(). The example below reads a text file directly from a URL:
# address of the text file
url <- "http://norvig.com/big.txt"
# read it as a comma-separated table, keeping strings as character
# NOTE(review): the warning below suggests the file is free text rather than
# a table -- readLines() may be the better reader; confirm
biomed <- read.table(file = url, header = TRUE, sep = ",",
                     stringsAsFactors = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : EOF within quoted string
head(biomed)
## The.Project.Gutenberg.EBook.of.The.Adventures.of.Sherlock.Holmes
## 1 by Sir Arthur Conan Doyle
## 2 (
## 3 Copyright laws are changing all over the world. Be sure to check the
## 4 copyright laws for your country before downloading or redistributing
## 5 this or any other Project Gutenberg eBook.
## 6 This header should be the first thing seen when viewing this Project
# compact overview of the data frame (one example value per column)
str(object = biomed, vec.len = 1)
## 'data.frame': 47359 obs. of 1 variable:
## $ The.Project.Gutenberg.EBook.of.The.Adventures.of.Sherlock.Holmes: chr "by Sir Arthur Conan Doyle" ...
# str_replace_all() works on character vectors: given the whole data frame
# it collapsed everything into a single string, which is why the original
# run reported one element of 1027773 "words". Work on the text column.
biomed <- biomed[[1]]
# remove punctuation
biomed <- str_replace_all(biomed, pattern = "[[:punct:]]", "")
#biomed%>%head()
# collapse runs of whitespace and drop leading/trailing blanks so that they
# do not turn into empty "words" when splitting
biomed <- str_trim(str_replace_all(biomed, pattern = "\\s+", " "))
#biomed
# split each line by words (gives a list with one element per line)
biomed <- str_split(biomed, pattern = " ")
# show first 2 elements
#biomed
# how many words per line
words_biomed <- lengths(biomed)
# table of frequencies
table(words_biomed)
# first few per-line word counts
head(words_biomed)
# (printed output not shown: it depends on the downloaded text)
# read a small whitespace-separated file that has no header row
df <- read.table(
  file = "https://s3.amazonaws.com/assets.datacamp.com/blog_assets/test.txt",
  header = FALSE
)
df
## V1 V2 V3
## 1 1 6 a
## 2 2 7 b
## 3 3 8 c
## 4 4 9 d
## 5 5 10 e
library(RCurl)
# download the CSV file as one character string
# NOTE(review): both ssl.* options are set to FALSE, which presumably turns
# off certificate checking for this request -- confirm this is acceptable
myfile <- getURL(
  'https://sakai.unc.edu/access/content/group/3d1eb92e-7848-4f55-90c3-7c72a54e7e43/public/data/bycatch.csv',
  ssl.verifyhost = FALSE,
  ssl.verifypeer = FALSE
)
head(myfile)
## [1] "Season,Area,Gear Type,Time,Tows,Bycatch\r1989-90,North,Bottom,Day,48,0\r1989-90,North,Bottom,Night,6,0\r1989-90,North,Mid-Water,Night,1,0\r1989-90,South,Bottom,Day,139,0\r1989-90,South,Mid-Water,Day,6,0\r1989-90,South,Bottom,Night,6,0\r1989-90,South,Mid-Water,Night,90,23\r1990-91,North,Bottom,Day,2,0\r1990-91,South,Bottom,Day,47,0\r1990-91,South,Mid-Water,Day,110,0\r1990-91,South,Bottom,Night,12,0\r1990-91,South,Mid-Water,Night,73,0\r1991-92,North,Bottom,Day,101,0\r1991-92,North,Mid-Water,Day,4,0\r1991-92,North,Bottom,Night,36,2\r1991-92,North,Mid-Water,Night,3,5\r1991-92,South,Bottom,Day,74,1\r1991-92,South,Mid-Water,Day,3,0\r1991-92,South,Bottom,Night,7,5\r1991-92,South,Mid-Water,Night,15,16\r1992-93,North,Bottom,Day,135,0\r1992-93,North,Mid-Water,Day,3,0\r1992-93,North,Bottom,Night,22,0\r1992-93,North,Mid-Water,Night,16,0\r1992-93,South,Bottom,Day,112,0\r1992-93,South,Bottom,Night,6,0\r1992-93,South,Mid-Water,Night,28,9\r1993-94,North,Bottom,Day,78,0\r1993-94,North,Mid-Water,Day,19,0\r1993-94,North,Bottom,Night,13,0\r1993-94,North,Mid-Water,Night,28,0\r1993-94,South,Bottom,Day,155,0\r1993-94,South,Mid-Water,Day,20,0\r1993-94,South,Bottom,Night,14,0\r1993-94,South,Mid-Water,Night,71,8\r1994-95,North,Bottom,Day,17,0\r1994-95,North,Mid-Water,Day,80,0\r1994-95,North,Bottom,Night,9,0\r1994-95,North,Mid-Water,Night,74,0\r1994-95,South,Bottom,Day,41,0\r1994-95,South,Mid-Water,Day,73,6\r1994-95,South,Bottom,Night,13,0\r1994-95,South,Mid-Water,Night,74,15"
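What are the arguments ssl.verifyhost=F and ssl.verifypeer=F doing? They switch off the SSL checks that are normally made when connecting over https: ssl.verifypeer controls whether the server’s certificate is verified, and ssl.verifyhost controls whether the certificate is checked against the host name. If I’m having trouble reading from a URL I try specifying these arguments, and changing one or both to FALSE almost always circumvents whatever error I’m getting; keep in mind that doing so also removes the protection those checks provide, so only use it with sources you trust. This grabs the content residing at the specified URL, but doesn’t return a data.frame object. It has simply put the URL’s content into a string.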
# the download is a plain character string, not a data frame
class(x = myfile)
## [1] "character"
So how to get this into a data.frame object? We’ll use textConnection() to open a “connection” with the string, much like you would open a connection with a file on your hard drive in order to read it. Then we’ll have read.csv() (or you could use read.table() or fread() or similar) to read the string object like a text file and create a data.frame object.
# open a text connection on the downloaded string and read it as CSV
con <- textConnection(myfile)
mydat <- read.csv(con, header = TRUE)
# read.csv() only closes connections it opened itself; a text connection is
# already open, so close it explicitly instead of leaking it
close(con)
head(mydat)
## Season Area Gear.Type Time Tows Bycatch
## 1 1989-90 North Bottom Day 48 0
## 2 1989-90 North Bottom Night 6 0
## 3 1989-90 North Mid-Water Night 1 0
## 4 1989-90 South Bottom Day 139 0
## 5 1989-90 South Mid-Water Day 6 0
## 6 1989-90 South Bottom Night 6 0
# pieces of the weather-history URL for airport KMDW
baseURL <- 'http://www.wunderground.com/history/airport/KMDW'
suffixURL <- 'DailyHistory.html?HideSpecis=1&format=1'
# today's date, formatted as the year/month/day path segment
Date <- Sys.Date()
datestring <- format(Date, '%Y/%m/%d')
datestring
## [1] "2017/09/29"
# full URL for today's history
url2fetch <- paste(baseURL, datestring, suffixURL, sep='/')
url2fetch
## [1] "http://www.wunderground.com/history/airport/KMDW/2017/09/29/DailyHistory.html?HideSpecis=1&format=1"
# fetch it (reusing url2fetch instead of pasting the same pieces again)
getURL(url2fetch)
## [1] "Bad Request - Blocked at Akamai"
# same request for a fixed past date
url_content <- getURL("https://www.wunderground.com/history/airport/KMDW/2017/07/29/DailyHistory.html?HideSpecis=1&format=1")
url_content
## [1] "Bad Request - Blocked at Akamai"
# parse the response as CSV; the text connection is closed explicitly because
# read.csv() does not close a connection that was already open
# NOTE: in the run shown above the server answered with an error message, so
# weather_data does not contain weather records
con <- textConnection(url_content)
weather_data <- read.csv(con)
close(con)
#head(weather_data)%>%mutate(TimeCDT=as.POSIXct.Date(TimeCDT),DateUTC=as.POSIXct(DateUTC.br...))
# read the same "$"-delimited file with read.delim() and read.delim2()
delim_url <- "https://s3.amazonaws.com/assets.datacamp.com/blog_assets/test_delim.txt"
df1 <- read.delim(file = delim_url, sep = "$")
df2 <- read.delim2(file = delim_url, sep = "$")
# both calls give the same result for this file
df1
## Col1 Col2 Col3
## 1 1 2 3
## 2 4 5 6
## 3 7 8 9
## 4 a b c
df2
## Col1 Col2 Col3
## 1 1 2 3
## 2 4 5 6
## 3 7 8 9
## 4 a b c