Lab 3 - R Character Manipulation

Instructions:

Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.

Rpubs link: http://rpubs.com/jefflittlejohn/Data_607_Lab_3

Github link: https://github.com/littlejohnjeff/DATA607_Fall2018/blob/master/Lab%203%20-%20R%20Character%20Manipulation%20-%20Jeff%20Littlejohn.Rmd

Load the raw data and packages.

# Raw exercise text: phone numbers and character names run together in a
# single string with no delimiter between records.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# NOTE(review): no dplyr function appears to be called in the visible code;
# confirm whether this attach is still needed.
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

Extract names from the raw data using the example code from p. 236.

# Pull out every run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the first (only) list element returned by
# str_extract_all() holds all of the matches.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Problem 3:

Use the tools from the chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Honorifics that may precede a name, written as one alternation pattern.
# The periods are escaped so they match a literal "."; an unescaped "." is a
# regex wildcard and would also strip text such as "Dry " or "Reva ".
title <- "Dr\\. |Rev\\. |Gen\\. "
# Create vector with titles removed
name_no_title <- str_replace_all(name, title, "")
name_no_title

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Timothy Lovejoy"     
## [4] "Ned Flanders"         "Simpson, Homer"       "Julius Hibbert"

Now that titles have been removed, first drop middle initials (a single letter followed by a period). Then detect names written as "last name, first name" by looking for a comma, and swap the two parts.

# Strip middle initials: a space, a single letter, then a literal period
name_no_mid_initial <- str_remove_all(name_no_title, " [[:alpha:]]\\.")
name_no_mid_initial

## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"

# Entries written "Last, First" are flipped to "First Last" using the two
# capture groups; entries without a comma do not match and pass through as-is.
name_conforming <- str_replace(
  string = name_no_mid_initial,
  pattern = "(\\w+), (\\w+)",
  replacement = "\\2 \\1"
)
# Original names side by side with their conforming versions
name_df <- data.frame(name = name, name_conforming = name_conforming)
name_df

##                   name  name_conforming
## 1          Moe Szyslak      Moe Szyslak
## 2 Burns, C. Montgomery Montgomery Burns
## 3 Rev. Timothy Lovejoy  Timothy Lovejoy
## 4         Ned Flanders     Ned Flanders
## 5       Simpson, Homer    Homer Simpson
## 6   Dr. Julius Hibbert   Julius Hibbert

Construct a logical vector indicating whether a character has a title (i.e. Rev. and Dr.).

# Reuse the title pattern from part a: TRUE where a name contains a title
title_test <- str_detect(string = name, pattern = title)
title_df <- data.frame(name = name, title_test = title_test)
title_df

##                   name title_test
## 1          Moe Szyslak      FALSE
## 2 Burns, C. Montgomery      FALSE
## 3 Rev. Timothy Lovejoy       TRUE
## 4         Ned Flanders      FALSE
## 5       Simpson, Homer      FALSE
## 6   Dr. Julius Hibbert       TRUE

Construct a logical vector indicating whether a character has a second name.

# Rough heuristic: with the title stripped, a name that contains more than one
# whitespace character is treated as having a second name
second_name_test <- str_count(name_no_title, pattern = "[[:space:]]") > 1
second_name_df <- data.frame(
  name = name,
  second_name_test = second_name_test
)
second_name_df

##                   name second_name_test
## 1          Moe Szyslak            FALSE
## 2 Burns, C. Montgomery             TRUE
## 3 Rev. Timothy Lovejoy            FALSE
## 4         Ned Flanders            FALSE
## 5       Simpson, Homer            FALSE
## 6   Dr. Julius Hibbert            FALSE

Problem 4: Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$

# One or more digits immediately followed by a literal dollar sign. The match
# may start mid-number: only the digit run touching the "$" is returned.
test_string_a <- c(
  "steve", "$123", "123$", "$125.15", "125.15$", "-25.13$"
)
str_extract(string = test_string_a, pattern = "[0-9]+\\$")

## [1] NA     NA     "123$" NA     "15$"  "13$"

\b[a-z]{1,4}\b

# A whole word of one to four lower-case letters, delimited by word
# boundaries; str_extract() returns the first such word in each string
test_string_b <- c(
  "steve", "ant", "123$", "by", "spoon",
  "the car", "booster seat", "Sara", "dad hat"
)
str_extract(string = test_string_b, pattern = "\\b[a-z]{1,4}\\b")

## [1] NA     "ant"  NA     "by"   NA     "the"  "seat" NA     "dad"

.*?\.txt$

# Any string whose final four characters are ".txt"; whatever precedes the
# extension (possibly nothing) is included in the match
test_string_c <- c(
  "steve.txt", "steve.doc", ".txt", "123.txt",
  "dog cat.txt", "steve.txt 2", "steve.txt2"
)
str_extract(string = test_string_c, pattern = ".*?\\.txt$")

## [1] "steve.txt"   NA            ".txt"        "123.txt"     "dog cat.txt"
## [6] NA            NA

\d{2}/\d{2}/\d{4}

# Two digits, a forward slash, two digits, a forward slash, then four digits.
# Presumably a date layout, but nothing checks that the date is valid, and the
# pattern can be picked out of a longer string.
test_string_d <- c(
  "11/22/1986", "11221986", "11-22-1986", "1986-11-22",
  "aa/bb/dddd", "11/11/22/1986", "11/22/1986/1986", "cat 11/22/1986"
)
str_extract(string = test_string_d, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] "11/22/1986" NA           NA           NA           NA          
## [6] "11/22/1986" "11/22/1986" "11/22/1986"

<(.+?)>.+?</\1>

# Tag pairs: an opening tag, at least one character of content (a space
# counts), and a closing tag whose name repeats the opening tag's name via
# the \1 backreference
test_string_e <- c(
  "<b> </b>", "<b> ahhhhh </b>", "<b></b>", "<b> bold </c>",
  "<b> bold <b>", "cat <b> bold </b>", "<bb> bbbbb </bb>"
)
str_extract(string = test_string_e, pattern = "<(.+?)>.+?</\\1>")

## [1] "<b> </b>"         "<b> ahhhhh </b>"  NA                
## [4] NA                 NA                 "<b> bold </b>"   
## [7] "<bb> bbbbb </bb>"

Problem 9 Decipher…

# Scrambled text for Problem 9. The literal spans three source lines, so the
# string itself contains two newline characters.
mystery_str <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

Check a few different extraction methods… Alphabet characters only

# Every alphabetic character, one match per letter
str_extract_all(string = mystery_str, pattern = "[[:alpha:]]")

## [[1]]
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
##  [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
##  [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
##  [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
##  [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
##  [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f" "U"
## [120] "r" "b" "z" "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z" "E" "c"
## [137] "r" "o" "p" "w" "A" "g" "n" "b" "S" "q" "o" "U" "f" "P" "a" "o" "t"
## [154] "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f" "y" "n" "N" "d"
## [171] "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n" "h" "D" "k" "g"
## [188] "r"

# Every digit, one match per character
str_extract_all(string = mystery_str, pattern = "[[:digit:]]")

## [[1]]
##  [1] "1" "0" "8" "7" "7" "9" "2" "8" "5" "5" "0" "7" "8" "0" "3" "5" "3"
## [18] "0" "7" "5" "5" "3" "3" "6" "4" "1" "1" "6" "2" "2" "4" "9" "0" "5"
## [35] "6" "5" "1" "7" "2" "4" "6" "3" "9" "5" "8" "9" "6" "5" "9" "4" "9"
## [52] "0" "5" "4" "5"

# Upper-case letters only
str_extract_all(string = mystery_str, pattern = "[[:upper:]]")

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

# Lower-case letters only
str_extract_all(string = mystery_str, pattern = "[[:lower:]]")

## [[1]]
##   [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
##  [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
##  [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
##  [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
##  [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
##  [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"

Uppercase is the key - see message below.

# The upper-case letters, read in order, spell out the hidden message
str_extract_all(string = mystery_str, pattern = "[[:upper:]]")

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

Tried to use paste to turn this into singular string, but getting weird formatting error (reproduced below). Message is “CONGRATULATIONS YOU ARE A SUPER NERD.”

# str_extract_all() returns a list with one character vector per input
# string. Handing that list straight to paste() deparses the element into the
# text c("C", "O", ...) instead of joining the letters. Unlist to a plain
# character vector first, then collapse it into a single string.
paste(unlist(str_extract_all(mystery_str, "[[:upper:]]")), collapse = "")

## [1] "CONGRATULATIONSYOUAREASUPERNERD"

Lab 3 - R Character Manipulation

Jeff Littlejohn

September 15, 2018