Data 607 HW3

Rpub link:

GitHub link:

Question 3

library(tidyverse)

## -- Attaching packages ----- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts -------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(stringr)

# Raw input for Question 3: names and phone numbers run together in a
# single string, with no delimiter between the records.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
 
# Echo the string so the rendered report shows the exact input.
raw.data

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

The names extracted from the original string are shown below:

# A name is any run of two or more letters, periods, commas, or spaces;
# the digits and brackets of the phone numbers act as separators.
# raw.data is a single string, so the match list has exactly one element.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

library(stringr)

# Last names, step 1: capture the surname together with its delimiter.
# It is either a word directly followed by a comma ("Burns,") or a space
# that directly follows a word character plus the word after it
# (" Szyslak").
last1 <- name %>%
  str_extract("[[:alpha:]]{1,}\\,|\\b [[:alpha:]]{1,}")
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Step 2: keep only the letters, dropping the comma or the leading space.
last <- last1 %>%
  str_extract("[[:alpha:]]{1,}")

last

## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

# First names, step 1: capture the first name together with its delimiter.
# Three layouts are covered, tried in this order at each position:
#   a word followed by a space              ("Moe ")
#   a period, a space, then a word          (". Timothy")
#   a comma, a space, then 2 or more letters (", Homer")
first1 <- name %>%
  str_extract("[[:alpha:]]{1,} |\\. [[:alpha:]]{1,}|\\, [[:alpha:]]{2,}")

# Step 2: keep only the letters of each match.
first <- first1 %>%
  str_extract("[[:alpha:]]{1,}")

first1

## [1] "Moe "         ". Montgomery" ". Timothy"    "Ned "        
## [5] ", Homer"      ". Julius"

first

## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"

# Question 3 (cont'd): collect the parsed names into a data frame, one
# row per person, following the first-name / last-name convention.
df <- data.frame(first = first, last = last)

df

##        first     last
## 1        Moe  Szyslak
## 2 Montgomery    Burns
## 3    Timothy  Lovejoy
## 4        Ned Flanders
## 5      Homer  Simpson
## 6     Julius  Hibbert

# Flag the names that carry a title. The periods are escaped so that only
# the literal abbreviations "Rev." and "Dr." match; an unescaped "." is a
# wildcard, so a name such as "Drew" would wrongly be flagged as a title.
df$title <- str_detect(name, "Rev\\.|Dr\\.")

df

##        first     last title
## 1        Moe  Szyslak FALSE
## 2 Montgomery    Burns FALSE
## 3    Timothy  Lovejoy  TRUE
## 4        Ned Flanders FALSE
## 5      Homer  Simpson FALSE
## 6     Julius  Hibbert  TRUE

# Flag the names that contain an abbreviated second name: a space, then
# one or more letters, then a literal period (e.g. " C." in
# "Burns, C. Montgomery").
df[["middlename"]] <- name %>%
  str_detect(" [[:alpha:]]{1,}\\.")

df

##        first     last title middlename
## 1        Moe  Szyslak FALSE      FALSE
## 2 Montgomery    Burns FALSE       TRUE
## 3    Timothy  Lovejoy  TRUE      FALSE
## 4        Ned Flanders FALSE      FALSE
## 5      Homer  Simpson FALSE      FALSE
## 6     Julius  Hibbert  TRUE      FALSE

# FINAL data frame: add a display column holding "first last".
# paste() joins element-wise and separates with a single space by default.
df[["first_last"]] <- paste(df[["first"]], df[["last"]])
df

##        first     last title middlename       first_last
## 1        Moe  Szyslak FALSE      FALSE      Moe Szyslak
## 2 Montgomery    Burns FALSE       TRUE Montgomery Burns
## 3    Timothy  Lovejoy  TRUE      FALSE  Timothy Lovejoy
## 4        Ned Flanders FALSE      FALSE     Ned Flanders
## 5      Homer  Simpson FALSE      FALSE    Homer Simpson
## 6     Julius  Hibbert  TRUE      FALSE   Julius Hibbert

Question 4 and its associated 5 sub questions

[0-9]+\$

ans:

This regex matches one or more digits immediately followed by a literal $ sign.

# Example input: each run of digits directly followed by "$" is a match.
test <- "999$a###6-77$.2.2$a11$"

test %>%
  str_extract_all("[0-9]+\\$") %>%
  unlist()

## [1] "999$" "77$"  "2$"   "11$"

\b[a-z]{1,4}\b

ans:

This regex matches a run of 1 to 4 lowercase letters that stands alone as a whole word, i.e. it is bounded by a word boundary (\b) at both ends.

# Example input: only stand-alone lowercase words of 1 to 4 letters match.
test2 <- "z-one3 55.tree !CRAZY%$-czdz"

test2 %>%
  str_extract_all("\\b[a-z]{1,4}\\b") %>%
  unlist()

## [1] "z"    "tree" "czdz"

.*?\.txt$

ans:

This regex matches any sequence of characters (zero or more, matched lazily) that ends with the literal .txt at the very end of the string.

# Example input: the "$" anchor ties ".txt" to the end of the string, so
# the single match starts at the first character.
test3 <- "911!889]]2.txt else.pdf----%&@.txt"
test3 %>%
  str_extract_all(".*?\\.txt$") %>%
  unlist()

## [1] "911!889]]2.txt else.pdf----%&@.txt"

\d{2}/\d{2}/\d{4}

ans: This regex matches a date-like pattern: two digits (usually the month), a slash, two digits (the day), a slash, and then four digits (the year).

# Example input: the last date starts with a four-digit year and does not
# fit the two/two/four digit layout.
test4 <- "09/11/2019 02/02/2002 1986/01/02"
test4 %>%
  str_extract_all("\\d{2}/\\d{2}/\\d{4}") %>%
  unlist()

## [1] "09/11/2019" "02/02/2002"

<(.+?)>.+?</\1>

ans:

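This regex matches an opening tag <...>, followed by one or more characters, followed by a closing tag </...> whose name is the same as the opening tag's (the backreference \1 repeats whatever group 1 captured).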

# Example input: only <oor> has a closing tag with the same name, so the
# match runs from <oor> to </oor>.
test5 <- "<oor><pi>jacki007<//ollaah>12ee22<li>once upon a time in 1999 </oor>"

test5 %>%
  str_extract_all("<(.+?)>.+?</\\1>") %>%
  unlist()

## [1] "<oor><pi>jacki007<//ollaah>12ee22<li>once upon a time in 1999 </oor>"

Question 9. Secret Message

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

encrypted <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"


# Keep every character that is neither a lowercase letter nor a digit;
# what remains (uppercase letters and punctuation) is the hidden message.
# A single negated class is used because "|" inside a character class is
# not alternation: it is a literal member of the set, so the nested form
# "[^[a-z]|[0-9]]" would also discard any "|" found in the input.
cracked <- unlist(str_extract_all(encrypted, "[^a-z0-9]"))

cracked

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

# Glue the single characters back together into one string.
cracked1 <- paste(cracked, collapse = "")

cracked1

## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"

# The periods stand in for spaces; fixed() treats "." as a literal
# character rather than as the regex wildcard.
str_replace_all(cracked1, fixed("."), " ")

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"

Data 607 HW3 - RegEX

Sufian

9/9/2019

Question 3

Question 4 and its associated 5 sub questions

Question 9. Secret Message