set.seed(19181112)
library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
library(ggwordcloud)
getRegEx<-function(re, v) {
  # Extract every non-overlapping substring of v that matches the regular
  # expression re.
  #
  # re: a single regular expression (default engine of grepl()/gregexpr())
  # v:  a character vector (or a single string) to search
  #
  # Returns a character vector of the matched substrings, or NULL when nothing
  # matches. Matches are ordered round-robin: the first match from each element
  # of v (in input order), then the second match from each element, and so on.
  #
  # All matches are located in the ORIGINAL text. Deleting a match and
  # searching the spliced remainder again (the previous approach) reports
  # matches that only exist after splicing -- "ab" was found twice in "aabb" --
  # and never terminates for a pattern that can match the empty string.
  candidates<-v[grepl(re,v)] # grepl() is FALSE for NA, so NA entries drop out
  if (length(candidates)==0) return(NULL)
  perString<-regmatches(candidates,gregexpr(re,candidates))
  # zero-length matches carry no text; discard them
  perString<-lapply(perString,function(m) m[nzchar(m)])
  allMatches<-unlist(perString,use.names=FALSE)
  if (length(allMatches)==0) return(NULL)
  # rank of each match inside its own string; order() leaves ties in their
  # original order, which produces the round-robin ordering described above
  matchRank<-sequence(lengths(perString))
  return(allMatches[order(matchRank)])
}
require(ggwordcloud)
myGgWordCloud<-function(wordSpec, max_size=24,title="",caption="") {
  # Build a word cloud plot with ggwordcloud.
  #
  # wordSpec: tibble with required columns `words` (the labels) and
  #           `frequency` (mapped to label size). Columns `angle` and `color`
  #           are used when present and generated at random otherwise.
  # max_size: forwarded to scale_size_area(max_size = )
  # title, caption: forwarded to labs()
  #
  # Returns the ggplot object; the caller prints it.
  if (!("angle" %in% names(wordSpec))) {
    # random multiples of 45 degrees, weighted towards horizontal (0)
    wordSpec <- wordSpec %>%
      mutate(angle = 45 * sample(-2:2, n(), replace = TRUE, prob = c(1, 2, 4, 2, 1)))
  }
  if (!("color" %in% names(wordSpec))) {
    # one of ten color groups per word, chosen at random
    wordSpec <- wordSpec %>%
      mutate(color = factor(sample.int(10, nrow(wordSpec), replace = TRUE)))
  }
  plotReturn<-ggplot(
    wordSpec,
    aes(
      label = words, size = frequency,
      color = color,
      angle = angle
    )
  ) +
    geom_text_wordcloud_area(area_corr = TRUE) +
    scale_size_area(max_size = max_size) +
    labs(title=title, caption = caption) +
    # Add the theme to this plot directly. theme_set() returns the PREVIOUS
    # theme and changes the session default, so adding its result applied the
    # wrong theme here and altered every later plot as a side effect.
    theme_minimal(base_size = 15)
  return(plotReturn)
}
# Sample text for the examples: a passage from Richard II, one line of text per
# vector element (element 2 is an empty string).
richardII <- c(
  "THIS EXAMPLE from Shakespeare's Richard II:",
  "",
  "1050. Gaunt: This royal throne of kings, this sceptred isle,",
  "This earth of majesty, this seat of Mars,",
  "This other Eden, demi-paradise,",
  "This fortress built by Nature for herself",
  "Against infection and the hand of war,",
  "1055. This happy breed of men, this little world,",
  "This precious stone set in the silver sea,",
  "Which serves it in the office of a wall",
  "Or as a moat defensive to a house,",
  "Against the envy of less happier lands,--",
  "1060. This blessed plot, this earth, this realm, this England.",
  "<snip>",
  "Yorke: The King is come, deale mildly with his youth,"
)
# Replace every lowercase "this" with "for that"; matching is case sensitive,
# so "This" and "THIS" are left alone.
print(gsub(pattern = "this", replacement = "for that", x = richardII))
## [1] "THIS EXAMPLE from Shakespeare's Richard II:"
## [2] ""
## [3] "1050. Gaunt: This royal throne of kings, for that sceptred isle,"
## [4] "This earth of majesty, for that seat of Mars,"
## [5] "This other Eden, demi-paradise,"
## [6] "This fortress built by Nature for herself"
## [7] "Against infection and the hand of war,"
## [8] "1055. This happy breed of men, for that little world,"
## [9] "This precious stone set in the silver sea,"
## [10] "Which serves it in the office of a wall"
## [11] "Or as a moat defensive to a house,"
## [12] "Against the envy of less happier lands,--"
## [13] "1060. This blessed plot, for that earth, for that realm, for that England."
## [14] "<snip>"
## [15] "Yorke: The King is come, deale mildly with his youth,"
## # A tibble: 10 x 2
## original formatted
## <chr> <chr>
## 1 136-540-1970 (136) 540-1970
## 2 967-469-4943 (967) 469-4943
## 3 515-071-8078 (515) 071-8078
## 4 163.070.3292 1234 (163) 070-3292 ext. 1234
## 5 (621) 902-2951 ext 220 (621) 902-2951 ext. 220
## 6 985-954-1652 (985) 954-1652
## 7 668.771.5174 (668) 771-5174
## 8 (418) 915-8969 (418) 915-8969
## 9 3233449931 (323) 344-9931
## 10 835-255-3199 (835) 255-3199
## # A tibble: 10 x 4
## fullName lastName firstName middleName
## <chr> <chr> <chr> <chr>
## 1 " David R Al-Hashimi" Al-Hashimi David R
## 2 "Christopher L Brown " Brown Christopher L
## 3 "Davis-Mbali, Jane " Davis-Mbali Jane <NA>
## 4 "Garcia, Daniel S " Garcia Daniel S
## 5 "Jones, James D " Jones James D
## 6 "O'Brien, Roberta June " O'Brien Roberta June
## 7 "Rodriguez,Brian C " Rodriguez Brian C
## 8 "Smith, John M " Smith John M
## 9 Thomas, Joseph Anthony Thomas Joseph Anthony
## 10 "Williams, Michael A " Williams Michael A
The short answer is coders of many apps and languages:
R
python
java
javascript
shell scripts, perl, PowerShell
Tableau
even Excel after the usual bending over backwards
Google Search supports some regular expressions: try (gray|red) (fox|wolf)
in the simple cases:
Everybody has a favorite way of formatting phone numbers.
Let’s format these as (<area code>) <exchange>-<line> ext. <extension>, adding the extension only when there is one:
# Sample phone numbers in assorted layouts; two of them carry an extension.
originalPhones <- c(
  "136-540-1970",
  "967-469-4943",
  "515-071-8078",
  "163.070.3292 1234",
  "(621) 902-2951 ext 220",
  "985-954-1652",
  "668.771.5174",
  "(418) 915-8969",
  "3233449931",
  "835-255-3199"
)
print(c("original phone numbers", originalPhones))
## [1] "original phone numbers" "136-540-1970"
## [3] "967-469-4943" "515-071-8078"
## [5] "163.070.3292 1234" "(621) 902-2951 ext 220"
## [7] "985-954-1652" "668.771.5174"
## [9] "(418) 915-8969" "3233449931"
## [11] "835-255-3199"
# Two substitutions, written as nested calls.
# Inner sub(): from the beginning, skip any non-digits and capture 3 digits,
#   skip non-digits and capture 3 digits, skip non-digits and capture 4 digits,
#   then skip non-digits and capture whatever digits remain up to the end.
#   The four captures are laid out as "(\1) \2-\3 ext. \4".
# Outer sub(): numbers without an extension now end in " ext. "; strip that
#   suffix ("\\." is a literal period).
formattedPhones <- sub(
  " ext\\. $",
  "",
  sub(
    "^\\D*(\\d{3})\\D*(\\d{3})\\D*(\\d{4})\\D*(\\d*)$",
    "(\\1) \\2-\\3 ext. \\4",
    originalPhones
  )
)
cat("\n")
print(c("formatted phone numbers", formattedPhones))
## [1] "formatted phone numbers" "(136) 540-1970"
## [3] "(967) 469-4943" "(515) 071-8078"
## [5] "(163) 070-3292 ext. 1234" "(621) 902-2951 ext. 220"
## [7] "(985) 954-1652" "(668) 771-5174"
## [9] "(418) 915-8969" "(323) 344-9931"
## [11] "(835) 255-3199"
# a function to do that
nameConvert<-function(fullName) {
  # Split full names into their last, first and middle parts.
  #
  # fullName: character vector; each element is written either as
  #           "last, first [middle name/initial]" or as
  #           "first [middle name/initial] last"
  #
  # Returns a tibble with one row per element of fullName and the character
  # columns fullName, lastName, firstName and middleName (NA when absent).

  # zero-row tibble: fixes the column names and types, also for empty input
  xfr<-tibble(
    fullName=character(0),
    lastName=character(0),
    firstName=character(0),
    middleName=character(0)
  )

  getNames<-function(fullName) {
    # internal helper: turn ONE name string into a one-row tibble.
    # Each pattern swallows the entire string from beginning to end and has a
    # single capture group, so sub() returns just the captured name chunk.
    if (grepl(",",fullName)) {
      # format: last, first [middle name/initial]
      regex<-"^\\s*([A-Za-z'\\-]+)\\s*,.*$"
      lastName<-sub(regex,"\\1",fullName)
      regex<-"^\\s*[A-Za-z'\\-]+\\s*,\\s*([A-Za-z'\\-]+).*$"
      firstName<-sub(regex,"\\1",fullName)
      regex<-"^\\s*[A-Za-z'\\-]+\\s*,\\s*[A-Za-z'\\-]+\\s*([A-Za-z'\\-]*).*$"
      middleName<-sub(regex,"\\1",fullName)
    } else {
      # format: first [middle name/initial] last
      regex<-"^\\s*([A-Za-z'\\-]+)\\s*.*$"
      firstName<-sub(regex,"\\1",fullName)
      regex<-"^\\s*[A-Za-z'\\-]+\\s*([A-Za-z'\\-]+).*$"
      middleName<-sub(regex,"\\1",fullName)
      regex<-"^\\s*[A-Za-z'\\-]+\\s*[A-Za-z'\\-]+\\s*([A-Za-z'\\-]*).*$"
      lastName<-sub(regex,"\\1",fullName)
      # no middle name/initial: the last name was captured as the middle one
      if (lastName=="") { lastName<-middleName; middleName<-NA_character_ }
    }
    # Test is.na() first: NA == "" evaluates to NA, which if() rejects with
    # "missing value where TRUE/FALSE needed" (hit by names like "John Smith").
    if (is.na(middleName) || middleName=="") middleName<-NA_character_
    tibble(
      fullName=fullName,
      lastName=lastName,
      firstName=firstName,
      middleName=middleName
    )
  }

  # build all rows first and bind them once instead of growing xfr row by row
  rows<-lapply(as.character(fullName),getNames)
  return(do.call(rbind,c(list(xfr),rows)))
}
library(stringr)
# Sample names in mixed layouts ("last, first middle" and "first middle last")
# with stray characters, inconsistent case and uneven spacing.
myData <- tibble(
  fullName = c(
    "Williams, Michael A ",
    "Christopher L Brown ",
    "$Smith, John M$ ",
    "O'Brien, Roberta June ",
    "Jones, James D ",
    "Rodriguez,Brian C. ",
    "Davis-Mbali, JANE ",
    " David R Al-Hashimi",
    "Thomas, Joseph Anthony",
    "Garcia, Daniel S "
  )
)
print("here are the original names:")
## [1] "here are the original names:"
print(myData)
## # A tibble: 10 x 1
## fullName
## <chr>
## 1 "Williams, Michael A "
## 2 "Christopher L Brown "
## 3 "$Smith, John M$ "
## 4 "O'Brien, Roberta June "
## 5 "Jones, James D "
## 6 "Rodriguez,Brian C. "
## 7 "Davis-Mbali, JANE "
## 8 " David R Al-Hashimi"
## 9 Thomas, Joseph Anthony
## 10 "Garcia, Daniel S "
myCleanerData<- myData %>%
  # toss anything that is not whitespace, a letter, a comma, an apostrophe or
  # a hyphen (note the double escapes \\s and \\-; perl = TRUE is passed so
  # that \\s is understood inside the bracket expression)
  mutate(fullName=gsub("[^\\sA-Za-z,'\\-]","",fullName,perl = TRUE)) %>%
  # Capitalize the first letter of every name part with stringr.
  # Fix-up hack so that O'brien => O'B... and Al-hashimi => Al-H...:
  # put a space after "'" or "-" so the following letter starts a new word,
  mutate(fullName=gsub("(['\\-])([A-Za-z])","\\1 \\2",fullName)) %>%
  # title-case every word,
  mutate(fullName=str_to_title(fullName)) %>%
  # and undo the fix-up by removing the space after "'" or "-".
  # (A former extra step, gsub("(['\\-])[A-Za-z]","\\1 ", ...), was dropped: it
  # could never match at this point and would have deleted a letter if it had.)
  mutate(fullName=gsub("(['\\-]) ","\\1",fullName))
# extract the separate name parts
myCleanData<-nameConvert(myCleanerData$fullName)
# sort in alphabetic order by last, first, middle
myCleanData<- myCleanData %>%
  arrange( lastName, firstName, middleName )
# and out it goes:
(myCleanData)
## # A tibble: 10 x 4
## fullName lastName firstName middleName
## <chr> <chr> <chr> <chr>
## 1 " David R Al-Hashimi" Al-Hashimi David R
## 2 "Christopher L Brown " Brown Christopher L
## 3 "Davis-Mbali, Jane " Davis-Mbali Jane <NA>
## 4 "Garcia, Daniel S " Garcia Daniel S
## 5 "Jones, James D " Jones James D
## 6 "O'Brien, Roberta June " O'Brien Roberta June
## 7 "Rodriguez,Brian C " Rodriguez Brian C
## 8 "Smith, John M " Smith John M
## 9 Thomas, Joseph Anthony Thomas Joseph Anthony
## 10 "Williams, Michael A " Williams Michael A
# Download the tweet archive as a character vector, one element per line of
# the file, then show the first six lines.
trumpsTweets.txt <- read_lines(
  file = url("https://raw.githubusercontent.com/sdutky/mcData110/master/regexTutorial/trumpTweets12Aug15Dec.txt")
)
head(trumpsTweets.txt, n = 6L)
## [1] "08-12-2019 04:01:28"
## [2] "Thank you NEGOP! https://t.co/S5A24nleW8 [Twitter for iPhone]"
## [3] "08-12-2019 04:02:50"
## [4] "RT @TrumpWarRoom: Joe Biden: “Poor kids are just as bright and just as talented as white kids.” https://t.co/YhDSMnoRce [Twitter for iPhone]"
## [5] "08-12-2019 04:03:51"
## [6] "RT @marc_lotter: STRONG! The American economy is doing very well [under @realDonaldTrump] & wages are up but we cannot rely on the media… [Twitter for iPhone]"
> **<date>**
> **<text> [<source>]**
# indexes of the lines that end with a "..-..-2019 ..:..:NN" date/time stamp
dateTimeIndex <- grep(
  pattern = "..-..-2019 ..:..:[0-9]{2}$",
  x = trumpsTweets.txt
)
$ => the pattern is anchored at the end of the line, so we look for lines ending with:
..-.. => two groups of any two characters separated by a hyphen, followed by
-2019 => a hyphen and the literal 2019, followed by a space that is followed by
..: => any two characters followed by a colon, followed by
..: => another two characters followed by a colon, and
[0-9]{2}$ => exactly two digits followed by the end of the line
# Preview the first six date-line indexes and the indexes of the lines right
# after them.
cat("\nhead(dateTimeIndex):\n")
##
## head(dateTimeIndex):
head(dateTimeIndex, n = 6L)
## [1] 1 3 5 7 9 11
cat("\nhead(dateTimeIndex+1):\n")
##
## head(dateTimeIndex+1):
head(dateTimeIndex + 1, n = 6L)
## [1] 2 4 6 8 10 12
# Preview the lines those indexes select: the dates, then the tweet text.
cat("\nhead(trumpsTweets.txt[dateTimeIndex]):\n")
##
## head(trumpsTweets.txt[dateTimeIndex]):
head(trumpsTweets.txt[dateTimeIndex], n = 6L)
## [1] "08-12-2019 04:01:28" "08-12-2019 04:02:50" "08-12-2019 04:03:51"
## [4] "08-12-2019 19:31:56" "08-12-2019 19:32:03" "08-12-2019 19:33:09"
cat("\nhead(trumpsTweets.txt[dateTimeIndex+1]):\n")
##
## head(trumpsTweets.txt[dateTimeIndex+1]):
head(trumpsTweets.txt[dateTimeIndex + 1], n = 6L)
## [1] "Thank you NEGOP! https://t.co/S5A24nleW8 [Twitter for iPhone]"
## [2] "RT @TrumpWarRoom: Joe Biden: “Poor kids are just as bright and just as talented as white kids.” https://t.co/YhDSMnoRce [Twitter for iPhone]"
## [3] "RT @marc_lotter: STRONG! The American economy is doing very well [under @realDonaldTrump] & wages are up but we cannot rely on the media… [Twitter for iPhone]"
## [4] "Scaramucci who like so many others had nothing to do with my Election victory is only upset that I didn’t want him back in the Administration (where he desperately wanted to be). Also I seldom had time to return his many calls to me. He just wanted to be on TV! [Twitter for iPhone]"
## [5] "RT @WhiteHouse: JUST ANNOUNCED: President @realDonaldTrump will enforce a Clinton-era law to ensure that non-citizens do not abuse our publ… [Twitter for iPhone]"
## [6] "RT @RandPaul: Why is the United States always stuck paying for everyone when prosperous nations don’t pay their fair share? @RichardGrenell… [Twitter for iPhone]"
# One row per tweet: the date line, the line that follows it (the tweet text),
# and a retweetFrom column that starts out as NA.
trumpsTweets.tib <- tibble(
  date = trumpsTweets.txt[dateTimeIndex],
  tweet = trumpsTweets.txt[dateTimeIndex + 1],
  retweetFrom = NA
)
str(object = trumpsTweets.tib)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3810 obs. of 3 variables:
## $ date : chr "08-12-2019 04:01:28" "08-12-2019 04:02:50" "08-12-2019 04:03:51" "08-12-2019 19:31:56" ...
## $ tweet : chr "Thank you NEGOP! https://t.co/S5A24nleW8 [Twitter for iPhone]" "RT @TrumpWarRoom: Joe Biden: “Poor kids are just as bright and just as talented as white kids.” https://t.co/Yh"| __truncated__ "RT @marc_lotter: STRONG! The American economy is doing very well [under @realDonaldTrump] & wages are up bu"| __truncated__ "Scaramucci who like so many others had nothing to do with my Election victory is only upset that I didn’t want "| __truncated__ ...
## $ retweetFrom: logi NA NA NA NA NA NA ...
We’re going to convert the dates to R’s favorite format. Note the use of capturing parentheses and back references.
trumpsTweets.tib<- trumpsTweets.tib %>%
  mutate(
    # parse date strings such as "08-12-2019 04:01:28" into date-times
    date=parse_date_time(date,"%m-%d-%YHMS",tz="EST"),
    # We're looking to get <source> from the trailing [<source>], so:
    # find an open bracket, capture as many characters as possible that are
    # not a close bracket, followed by a close bracket and any number of
    # spaces at the end of the tweet, and return the capture via \\1
    source=sub("^.*[[]([^]]*)[]] *$","\\1",tweet),
    # then drop that trailing [<source>] from the tweet text
    tweet=sub("[[][^]]*[]] *$","",tweet),
    # isolate retweets ("RT @handle: ..."), but keep the tweeter's handle;
    # retweetFrom is "" when the tweet does not start with "RT @..."
    retweetFrom=sub("^(?:RT (@[^:]+))*.*$","\\1",tweet),
    tweet=sub("^(?:RT @[^:]+: )","",tweet),
    # get rid of urls: either http or https followed by :// followed by
    # anything up to the next space or the end of the tweet.
    # [^ ]* stops at the first space; the former ".*" was greedy and deleted
    # everything from the first url to the end of the tweet.
    tweet=gsub("https?://[^ ]*( |$)","",tweet)
  )
# Collect every @handle: those mentioned in the tweet text plus the handles of
# the accounts that were retweeted (non-empty retweetFrom values).
handles <- c(
  getRegEx("@[A-Za-z0-9_]+", trumpsTweets.tib$tweet),
  trumpsTweets.tib$retweetFrom[trumpsTweets.tib$retweetFrom != ""]
)
# count the handles, most frequent first, and put the counts in a tibble
hanTable <- sort(table(handles), decreasing = TRUE)
handles.tib <- tibble(words = names(hanTable), frequency = hanTable)
handles.tib
## # A tibble: 720 x 2
## words frequency
## <chr> <table>
## 1 @realDonaldTrump 453
## 2 @WhiteHouse 140
## 3 @Jim_Jordan 73
## 4 @FoxNews 69
## 5 @GOPChairwoman 64
## 6 @RepMarkMeadows 53
## 7 @foxandfriends 50
## 8 @TeamTrump 44
## 9 @POTUS 40
## 10 @SteveScalise 37
## # … with 710 more rows
# @realDonaldTrump appears far more often than any other handle, so the word
# cloud is sized by log(frequency) for the 20 most frequent handles
a <- handles.tib[seq_len(20), ]
a$frequency <- log(a$frequency)
wcHandles <- myGgWordCloud(
  a,
  max_size = 8,
  title = "top 20 handles in trump tweets 2019 12 Aug - 15 Dec"
)
wcHandles
# Runs of all-caps words: one word made of capitals/apostrophes and a space,
# then at least two more such words (periods allowed), each followed by a
# space, "!" or the end of the text.
allCaps <- getRegEx(
  "[A-Z']+ (?:[A-Z.']+( |!|$)){2,}",
  trumpsTweets.tib$tweet
)
# count the phrases, most frequent first
allCapsTable <- sort(table(allCaps), decreasing = TRUE)
allCaps.tib <- tibble(words = names(allCapsTable), frequency = allCapsTable)
allCaps.tib
## # A tibble: 156 x 2
## words frequency
## <chr> <table>
## 1 KEEP AMERICA GREAT! 6
## 2 MAKE AMERICA GREAT AGAIN! 6
## 3 DRAIN THE SWAMP! 5
## 4 I AM DRAINING THE SWAMP! 5
## 5 I WANT NOTHING! 5
## 6 "GET OUT AND VOTE " 3
## 7 JOBS JOBS JOBS! 3
## 8 READ THE TRANSCRIPT! 3
## 9 SCHIFF'S FACT WITNESSES! 3
## 10 BIG NIGHT FOR THE REPUBLICAN PARTY. CONGRATULATIONS TO ALL! 2
## # … with 146 more rows
# word cloud of the 20 most frequent all-caps phrases
a <- allCaps.tib[seq_len(20), ]
a$angle <- 0 # display all phrases horizontally
wcAllCaps <- myGgWordCloud(
  a,
  max_size = 8,
  title = "top 20 all caps in trump tweets 2019 12 Aug - 15 Dec"
)
wcAllCaps
# Runs of capitalized words: one capital followed by lowercase letters and a
# space, then at least two more such words, each followed by a space, "!" or
# the end of the text. A leading "The " is removed from each phrase found.
titleStrings <- gsub(
  "^The ",
  "",
  getRegEx("[A-Z][a-z]+ (?:[A-Z][a-z]+( |!|$)){2,}", trumpsTweets.tib$tweet)
)
# count the phrases, most frequent first
titleTable <- sort(table(titleStrings), decreasing = TRUE)
titleStrings.tib <- tibble(words = names(titleTable), frequency = titleTable)
titleStrings.tib
## # A tibble: 698 x 2
## words frequency
## <chr> <table>
## 1 "Do Nothing Democrats " 34
## 2 "Fake News Media " 27
## 3 "New York Times " 16
## 4 "Do Nothing Dems " 14
## 5 "United States " 14
## 6 "Failing New York Times " 10
## 7 "Nervous Nancy Pelosi " 8
## 8 "Shifty Adam Schiff " 8
## 9 Do Nothing Democrats! 7
## 10 "Fake News " 7
## # … with 688 more rows
# word cloud of the 20 most frequent title-case phrases, sized by
# log(frequency)
a <- titleStrings.tib[seq_len(20), ]
a$frequency <- log(a$frequency)
a$angle <- 0 # display all phrases horizontally
wcTitleStrings <- myGgWordCloud(
  a,
  max_size = 8,
  title = "top 20 title strings in trump tweets 2019 12 Aug - 15 Dec"
)
wcTitleStrings
# show the sample text again
print(richardII)
## [1] "THIS EXAMPLE from Shakespeare's Richard II:"
## [2] ""
## [3] "1050. Gaunt: This royal throne of kings, this sceptred isle,"
## [4] "This earth of majesty, this seat of Mars,"
## [5] "This other Eden, demi-paradise,"
## [6] "This fortress built by Nature for herself"
## [7] "Against infection and the hand of war,"
## [8] "1055. This happy breed of men, this little world,"
## [9] "This precious stone set in the silver sea,"
## [10] "Which serves it in the office of a wall"
## [11] "Or as a moat defensive to a house,"
## [12] "Against the envy of less happier lands,--"
## [13] "1060. This blessed plot, this earth, this realm, this England."
## [14] "<snip>"
## [15] "Yorke: The King is come, deale mildly with his youth,"
# lines containing the literal string "ess": their indexes, then their text
grep(pattern = "ess", x = richardII)
## [1] 6 12 13
grep(pattern = "ess", x = richardII, value = TRUE)
## [1] "This fortress built by Nature for herself"
## [2] "Against the envy of less happier lands,--"
## [3] "1060. This blessed plot, this earth, this realm, this England."
# lines with a comma in the middle: lowercase letter, comma, space, lowercase
# letter
grep(pattern = "[a-z], [a-z]", x = richardII)
## [1] 3 4 5 8 13 15
grep(pattern = "[a-z], [a-z]", x = richardII, value = TRUE)
## [1] "1050. Gaunt: This royal throne of kings, this sceptred isle,"
## [2] "This earth of majesty, this seat of Mars,"
## [3] "This other Eden, demi-paradise,"
## [4] "1055. This happy breed of men, this little world,"
## [5] "1060. This blessed plot, this earth, this realm, this England."
## [6] "Yorke: The King is come, deale mildly with his youth,"
# lines with a capital letter that is not the first character: any character
# followed by a capital
grep(pattern = ".[A-Z]", x = richardII)
## [1] 1 3 4 5 6 8 13 15
grep(pattern = ".[A-Z]", x = richardII, value = TRUE)
## [1] "THIS EXAMPLE from Shakespeare's Richard II:"
## [2] "1050. Gaunt: This royal throne of kings, this sceptred isle,"
## [3] "This earth of majesty, this seat of Mars,"
## [4] "This other Eden, demi-paradise,"
## [5] "This fortress built by Nature for herself"
## [6] "1055. This happy breed of men, this little world,"
## [7] "1060. This blessed plot, this earth, this realm, this England."
## [8] "Yorke: The King is come, deale mildly with his youth,"
# lines that start with one or more digits
richardII[grep("^\\d+", richardII)]
## [1] "1050. Gaunt: This royal throne of kings, this sceptred isle,"
## [2] "1055. This happy breed of men, this little world,"
## [3] "1060. This blessed plot, this earth, this realm, this England."
# the same uppercase-in-the-middle search, as a filter on a tibble column
tibble(text = richardII) %>%
  filter(grepl(pattern = ".[A-Z]", x = text))
## # A tibble: 8 x 1
## text
## <chr>
## 1 THIS EXAMPLE from Shakespeare's Richard II:
## 2 1050. Gaunt: This royal throne of kings, this sceptred isle,
## 3 This earth of majesty, this seat of Mars,
## 4 This other Eden, demi-paradise,
## 5 This fortress built by Nature for herself
## 6 1055. This happy breed of men, this little world,
## 7 1060. This blessed plot, this earth, this realm, this England.
## 8 Yorke: The King is come, deale mildly with his youth,
# plain replacement of the first "blessed" in each line; show line 13
sub(pattern = "blessed", replacement = "darned", x = richardII)[13]
## [1] "1060. This darned plot, this earth, this realm, this England."
# capture and backreference: "this <Capitalized word>" becomes
# "<Capitalized word>'s fair and pleasant land"
sub(
  pattern = "this ([A-Z][a-z]*)",
  replacement = "\\1's fair and pleasant land",
  x = richardII
)[13]
## [1] "1060. This blessed plot, this earth, this realm, England's fair and pleasant land."
# characters in the scene with dialog: keep the lines holding a capitalized
# word directly followed by a colon, then capture that word -- skip everything
# before the first capital, capture the capitalized word up to the colon, and
# discard the rest of the line
tibble(text = richardII) %>%
  filter(grepl(pattern = "[A-Z][a-z]+:", x = text)) %>%
  mutate(
    character = sub(
      pattern = "^[^A-Z]*([A-Z][a-z]+):.*$",
      replacement = "\\1",
      x = text
    )
  )
## # A tibble: 2 x 2
## text character
## <chr> <chr>
## 1 1050. Gaunt: This royal throne of kings, this sceptred isle, Gaunt
## 2 Yorke: The King is come, deale mildly with his youth, Yorke
```
# swap the words on either side of " of " in every line that contains "of"
of <- richardII[grep("of", richardII)]
gsub(
  pattern = "([A-Za-z]*) of ([A-Za-z]*)",
  replacement = "\\2 of \\1",
  x = of
)
## [1] "1050. Gaunt: This royal kings of throne, this sceptred isle,"
## [2] "This majesty of earth, this Mars of seat,"
## [3] "Against infection and the war of hand,"
## [4] "1055. This happy men of breed, this little world,"
## [5] "Which serves it in the a of office wall"
## [6] "Against the less of envy happier lands,--"
# split each line into phrases at runs of punctuation (anything that is not a
# letter, digit or space) together with any spaces that follow
strsplit(x = richardII, split = "[^A-Za-z0-9 ]+ *")
## [[1]]
## [1] "THIS EXAMPLE from Shakespeare" "s Richard II"
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "1050" "Gaunt"
## [3] "This royal throne of kings" "this sceptred isle"
##
## [[4]]
## [1] "This earth of majesty" "this seat of Mars"
##
## [[5]]
## [1] "This other Eden" "demi" "paradise"
##
## [[6]]
## [1] "This fortress built by Nature for herself"
##
## [[7]]
## [1] "Against infection and the hand of war"
##
## [[8]]
## [1] "1055" "This happy breed of men"
## [3] "this little world"
##
## [[9]]
## [1] "This precious stone set in the silver sea"
##
## [[10]]
## [1] "Which serves it in the office of a wall"
##
## [[11]]
## [1] "Or as a moat defensive to a house"
##
## [[12]]
## [1] "Against the envy of less happier lands"
##
## [[13]]
## [1] "1060" "This blessed plot" "this earth"
## [4] "this realm" "this England"
##
## [[14]]
## [1] "" "snip"
##
## [[15]]
## [1] "Yorke" "The King is come"
## [3] "deale mildly with his youth"
# show the sample text once more before the next example
print(richardII)
## [1] "THIS EXAMPLE from Shakespeare's Richard II:"
## [2] ""
## [3] "1050. Gaunt: This royal throne of kings, this sceptred isle,"
## [4] "This earth of majesty, this seat of Mars,"
## [5] "This other Eden, demi-paradise,"
## [6] "This fortress built by Nature for herself"
## [7] "Against infection and the hand of war,"
## [8] "1055. This happy breed of men, this little world,"
## [9] "This precious stone set in the silver sea,"
## [10] "Which serves it in the office of a wall"
## [11] "Or as a moat defensive to a house,"
## [12] "Against the envy of less happier lands,--"
## [13] "1060. This blessed plot, this earth, this realm, this England."
## [14] "<snip>"
## [15] "Yorke: The King is come, deale mildly with his youth,"
# 2nd through 4th words of each line: find the first space that is followed by
# three words, each a run of letters followed by spaces and/or commas
matchStart <- regexpr(pattern = " ([A-Za-z]+[ ,]+){3}", text = richardII)
matchEnd <- matchStart + attr(matchStart, "match.length") - 1
# trim the leading space and the final separator character from each match
substr(x = richardII, start = matchStart + 1, stop = matchEnd - 1)
## [1] "" "" "This royal throne"
## [4] "earth of majesty," "" "fortress built by"
## [7] "infection and the" "This happy breed" "precious stone set"
## [10] "serves it in" "as a moat" "the envy of"
## [13] "This blessed plot," "" "The King is"