Instructions:
Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.
Rpubs link: http://rpubs.com/jefflittlejohn/Data_607_Lab_3
Load the raw data and packages.
# Raw input: names and phone numbers run together in a single string with no
# delimiters between records. The chunks below pull the names out with regex.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
Extract names from the raw data using the example code from p. 236.
# Pull out every run of two or more letters, periods, commas, or spaces.
# Digits, hyphens, and parentheses from the phone numbers break the runs,
# which leaves one element per person.
name <- raw.data %>%
  str_extract_all("[[:alpha:]., ]{2,}") %>%
  unlist()
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Problem 3:
# Regex for the honorifics to strip (Dr., Rev., Gen.), each followed by a space.
# The period is escaped so it matches a literal "."; left unescaped it matches
# any character, so "Gen. " would also match the start of "Geno Smith".
# The leading \\b stops a title from matching at the end of a longer word.
title <- "\\b(?:Dr|Rev|Gen)\\. "
# Create vector with titles removed
name_no_title <- str_replace_all(name, title, "")
name_no_title
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
Now that titles have been removed, strip middle initials (a single letter followed by a period), then convert any name written as "last name, first name" (identified by the comma) to "first name last name".
# Drop middle initials: a space, one letter, and a period (e.g. " C.")
name_no_mid_initial <- str_replace_all(
  string = name_no_title,
  pattern = " [[:alpha:]]\\.",
  replacement = ""
)
name_no_mid_initial
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
# Swap "Last, First" into "First Last"; names without a comma pass through
# unchanged because the pattern does not match them.
name_conforming <- name_no_mid_initial %>%
  str_replace("(\\w+), (\\w+)", "\\2 \\1")
# Original names next to their standardized form
name_df <- data.frame(name = name, name_conforming = name_conforming)
name_df
## name name_conforming
## 1 Moe Szyslak Moe Szyslak
## 2 Burns, C. Montgomery Montgomery Burns
## 3 Rev. Timothy Lovejoy Timothy Lovejoy
## 4 Ned Flanders Ned Flanders
## 5 Simpson, Homer Homer Simpson
## 6 Dr. Julius Hibbert Julius Hibbert
# Flag which names carry a title, reusing the `title` pattern from the
# title-removal step
title_test <- name %>% str_detect(title)
title_df <- data.frame(name = name, title_test = title_test)
title_df
## name title_test
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
# Rough heuristic: once the title is gone, two or more whitespace characters
# means the name has more than two parts, i.e. a second name or initial
second_name_test <- (name_no_title %>% str_count("[[:space:]]")) >= 2
second_name_df <- data.frame(name = name, second_name_test = second_name_test)
second_name_df
## name second_name_test
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
Problem 4 Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
# One or more digits immediately followed by a literal dollar sign. The match
# can sit anywhere in the string and nothing before the digits is checked, so
# "125.15$" and "-25.13$" still match on their trailing digits ("15$", "13$").
test_string_a <- c(
  "steve", "$123", "123$",
  "$125.15", "125.15$", "-25.13$"
)
str_extract(string = test_string_a, pattern = "[0-9]+\\$")
## [1] NA NA "123$" NA "15$" "13$"
# A stand-alone lowercase word of 1 to 4 letters, delimited by word boundaries;
# str_extract returns only the first such word in each string
test_string_b <- c(
  "steve", "ant", "123$", "by", "spoon",
  "the car", "booster seat", "Sara", "dad hat"
)
str_extract(string = test_string_b, pattern = "\\b[a-z]{1,4}\\b")
## [1] NA "ant" NA "by" NA "the" "seat" NA "dad"
# Any string that ends in ".txt"; nothing may follow the extension
test_string_c <- c(
  "steve.txt", "steve.doc", ".txt", "123.txt",
  "dog cat.txt", "steve.txt 2", "steve.txt2"
)
str_extract(string = test_string_c, pattern = ".*?\\.txt$")
## [1] "steve.txt" NA ".txt" "123.txt" "dog cat.txt"
## [6] NA NA
# Two digits, a forward slash, two digits, a forward slash, then four digits.
# This looks like a date (e.g. mm/dd/yyyy), but the values are not validated,
# and the pattern can be picked out of a longer string.
test_string_d <- c(
  "11/22/1986", "11221986", "11-22-1986", "1986-11-22",
  "aa/bb/dddd", "11/11/22/1986", "11/22/1986/1986", "cat 11/22/1986"
)
str_extract(string = test_string_d, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] "11/22/1986" NA NA NA NA
## [6] "11/22/1986" "11/22/1986" "11/22/1986"
# Tag pairs: an opening tag, at least one character of content (a space
# counts), and a closing tag whose name repeats the opening tag's name via the
# \\1 backreference
test_string_e <- c(
  "<b> </b>", "<b> ahhhhh </b>", "<b></b>", "<b> bold </c>",
  "<b> bold <b>", "cat <b> bold </b>", "<bb> bbbbb </bb>"
)
str_extract(string = test_string_e, pattern = "<(.+?)>.+?</\\1>")
## [1] "<b> </b>" "<b> ahhhhh </b>" NA
## [4] NA NA "<b> bold </b>"
## [7] "<bb> bbbbb </bb>"
Problem 9 Decipher…
# Scrambled text for Problem 9. The literal spans three source lines, so the
# string contains two embedded newline characters.
# NOTE(review): the line breaks are presumably a copy/wrap artifact; confirm
# against the original source. They do not affect the character-class
# extractions used below.
mystery_str <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
Check a few different extraction methods… Alphabet characters only
# Every letter, upper or lower case, one element per character
mystery_str %>% str_extract_all("[[:alpha:]]")
## [[1]]
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
## [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
## [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
## [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
## [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
## [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f" "U"
## [120] "r" "b" "z" "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z" "E" "c"
## [137] "r" "o" "p" "w" "A" "g" "n" "b" "S" "q" "o" "U" "f" "P" "a" "o" "t"
## [154] "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f" "y" "n" "N" "d"
## [171] "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n" "h" "D" "k" "g"
## [188] "r"
# Every digit, one element per character
mystery_str %>% str_extract_all("[[:digit:]]")
## [[1]]
## [1] "1" "0" "8" "7" "7" "9" "2" "8" "5" "5" "0" "7" "8" "0" "3" "5" "3"
## [18] "0" "7" "5" "5" "3" "3" "6" "4" "1" "1" "6" "2" "2" "4" "9" "0" "5"
## [35] "6" "5" "1" "7" "2" "4" "6" "3" "9" "5" "8" "9" "6" "5" "9" "4" "9"
## [52] "0" "5" "4" "5"
# Every uppercase letter, one element per character
mystery_str %>% str_extract_all("[[:upper:]]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
# Every lowercase letter, one element per character
mystery_str %>% str_extract_all("[[:lower:]]")
## [[1]]
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
## [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
## [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"
Uppercase is the key - see message below.
# The uppercase letters, read in order, spell out the hidden message
mystery_str %>% str_extract_all("[[:upper:]]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
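To turn this into a single string with paste, note that str_extract_all returns a list: pasting the list directly deparses it into a c("C", "O", …) string instead of joining the letters, so the list has to be unlisted first. Message is “CONGRATULATIONS YOU ARE A SUPER NERD.”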
# str_extract_all() returns a list with one character vector per input string.
# paste() on that list deparses the vector into a single 'c("C", "O", ...)'
# string, so unlist first and then collapse the letters into one string.
paste(unlist(str_extract_all(mystery_str, "[[:upper:]]")), collapse = "")
## [1] "CONGRATULATIONSYOUAREASUPERNERD"