# Wk 9 Assignment_Hiral Purohit

library(tidyverse)

## -- Attaching packages -------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Sections: Introduction, Prerequisites, String Basics, String Length, Combining Strings, Subsetting Strings, Locales; Exercises
# 14.2 String basics ----
# Either quote character can delimit a string; single quotes let the text
# carry literal double quotes without escaping.
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'

# A lone quote character, written here with the opposite delimiter rather
# than a backslash escape.
double_quote <- '"'
single_quote <- "'"

# Printing shows the escaped form; writeLines() shows the raw contents.
x <- c('"', "\\")
x
#> [1] "\"" "\\"
writeLines(x)
#> "
#> \

# A non-ASCII character given as a Unicode escape.
x <- "\u00b5"
x
#> [1] "µ"

# Several strings held in one character vector.
c("one", "two", "three")
#> [1] "one"   "two"   "three"

# 14.2.1 String length ----
# str_length() reports the number of characters per element; NA stays NA.
c("a", "R for data science", NA) %>%
  str_length()
#> [1]  1 18 NA

# 14.2.2 Combining strings ----
# With no `sep`, the pieces are joined directly.
str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"

# `sep` is inserted between the pieces.
str_c("x", "y", sep = ", ")
#> [1] "x, y"

# A missing input makes the combined element missing ...
x <- c("abc", NA)
str_c("|-", x, "-|")
#> [1] "|-abc-|" NA

# ... unless it is first converted to the literal text "NA".
x %>%
  str_replace_na() %>%
  str_c("|-", ., "-|")
#> [1] "|-abc-|" "|-NA-|"

# Length-one pieces are repeated for every element of the longer vector.
str_c("prefix-", c("a", "b", "c"), "-suffix")
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

# The `if` has no `else`; with `birthday` FALSE the output below shows that
# this piece contributes nothing to the greeting.
str_c(
  "Good ",
  time_of_day,
  " ",
  name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
#> [1] "Good morning Hadley."

# `collapse` folds a whole vector into a single string.
c("x", "y", "z") %>%
  str_c(collapse = ", ")
#> [1] "x, y, z"

# 14.2.3 Subsetting strings ----
x <- c("Apple", "Banana", "Pear")

# Characters 1 through 3 of every element.
str_sub(x, start = 1, end = 3)
#> [1] "App" "Ban" "Pea"

# Negative positions count backwards from the end of the string.
str_sub(x, start = -3, end = -1)
#> [1] "ple" "ana" "ear"

# Asking for more characters than exist returns what is available.
str_sub("a", start = 1, end = 5)
#> [1] "a"

# The assignment form overwrites the selected range; here the first letter
# of each element is replaced by its lower-case version.
str_sub(x, start = 1, end = 1) <- x %>%
  str_sub(1, 1) %>%
  str_to_lower()
x
#> [1] "apple"  "banana" "pear"

# 14.2.4 Locales ----
# Turkish has two i's, with and without a dot, and a different rule for
# capitalising them. The dotless i is written as the escape \u0131 so the
# result does not depend on the encoding of the source file or console
# (a mangled literal is the likely reason the original run printed "I" "I"
# for the Turkish locale).
str_to_upper(c("i", "\u0131"))
#> [1] "I" "I"

# With the Turkish locale the dotted i upper-cases to the dotted capital.
str_to_upper(c("i", "\u0131"), locale = "tr")
#> [1] "İ" "I"

x <- c("apple", "eggplant", "banana")

# Sort order also depends on the locale.
str_sort(x, locale = "en")  # English
#> [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian
#> [1] "apple"    "eggplant" "banana"

# 14.2.5 Exercises -- 3 ----
# Pull out the middle character of each string. For an even length the
# position is length / 2, i.e. the left of the two central characters,
# as the output for "abcd" and "abcdef" shows.
x <- c("a", "abc", "abcd", "abcde", "abcdef")
L <- x %>% str_length()
m <- ceiling(L / 2)
str_sub(x, start = m, end = m)
#> [1] "a" "b" "b" "c" "c"

# 2 - Sections: Matching patterns with regular expressions, Basic Matches;
# Exercises: None
# 14.3 Matching patterns with regular expressions ----
# install.packages("htmlwidgets")

x <- c("apple", "banana", "pear")

# A plain pattern matches the exact text.
str_view(x, pattern = "an")

# "." stands for any single character.
str_view(x, pattern = ".a.")

# The regex for a literal dot is \. and the R string needs the backslash
# doubled ...
dot <- "\\."

# ... although the string itself holds only one backslash.
writeLines(dot)
#> \.

# Used inside a pattern, it looks for an explicit ".".
str_view(c("abc", "a.c", "bef"), pattern = "a\\.c")

# A string that contains one literal backslash.
x <- "a\\b"
writeLines(x)
#> a\b

# Matching that backslash takes four in the R source: two for the regex,
# each doubled again for the string literal.
str_view(x, pattern = "\\\\")

# Sections: Anchors; Exercises ----
x <- c("apple", "banana", "pear")

# "^" anchors the match to the start of the string, "$" to the end.
str_view(x, pattern = "^a")
str_view(x, pattern = "a$")

x <- c("apple pie", "apple", "apple cake")

# Unanchored, the pattern matches inside longer strings too.
str_view(x, pattern = "apple")

# Anchored at both ends, only the complete string "apple" matches.
str_view(x, pattern = "^apple$")

# Sections: Repetition; Exercises ----
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"

# ?: zero or one, +: one or more.
str_view(x, pattern = "CC?")
str_view(x, pattern = "CC+")
str_view(x, pattern = "C[LX]+")

# {n}: exactly n, {n,}: n or more, {n,m}: between n and m.
str_view(x, pattern = "C{2}")
str_view(x, pattern = "C{2,}")
str_view(x, pattern = "C{2,3}")

# A trailing "?" makes the quantifier lazy, so it takes the shortest match.
str_view(x, pattern = "C{2,3}?")
str_view(x, pattern = "C[LX]+?")

# 14.3.4.1 Exercises ----
# Words that start with three non-vowel characters. An exact count such as
# {3} has nothing to be lazy about, so no "?" modifier is used.
str_view(words, "^[^aeiou]{3}", match = TRUE)

# Words with three or more vowels in a row. The greedy quantifier
# highlights the whole vowel run; a lazy {3,}? would stop after three.
str_view(words, "[aeiou]{3,}", match = TRUE)

# Words with two or more vowel/non-vowel pairs in a row.
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)

# Sections: Grouping and Back references; Exercises: 1 ----
# "(..)" captures two characters and "\\1" requires the same two again.
str_view(fruit, pattern = "(..)\\1", match = TRUE)

# 14.3.5.1 Exercises ----
# What each expression matches:
# (.)\1\1                 one character three times in a row, e.g. "aaa".
# "(.)(.)\\2\\1"          two characters followed by the same two in
#                         reverse order, e.g. "abba".
# (..)\1                  any two characters repeated, e.g. "a1a1".
# "(.).\\1.\\1"           a character, any character, the first character,
#                         any character, the first character again,
#                         e.g. "abaca", "b8b.b".
# "(.)(.)(.).*\\3\\2\\1"  three characters, then zero or more characters of
#                         any kind, then the same three in reverse order,
#                         e.g. "abcsgasgddsadgsdgcba", "abccba", "abc1cba".

# Sections: Tools, Detect Matches; Exercises ----
x <- c("apple", "banana", "pear")

# One TRUE/FALSE per element: does it contain the pattern?
str_detect(x, pattern = "e")
#> [1]  TRUE FALSE  TRUE

# How many common words start with t? (Summing a logical counts the TRUEs.)
words %>%
  str_detect("^t") %>%
  sum()
#> [1] 65

# What proportion of common words end with a vowel?
words %>%
  str_detect("[aeiou]$") %>%
  mean()
#> [1] 0.2765306

# Two routes to "contains no vowel":
# negate the test for at least one vowel ...
no_vowels_1 <- str_detect(words, "[aeiou]", negate = TRUE)
# ... or require the whole word to consist of non-vowels.
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
#> [1] TRUE

# Selecting the matching elements: logical subsetting ...
words[str_detect(words, pattern = "x$")]
#> [1] "box" "sex" "six" "tax"

# ... or the str_subset() wrapper.
str_subset(words, pattern = "x$")
#> [1] "box" "sex" "six" "tax"

# The same selection inside a data frame: keep each word with its position
# and filter the rows whose word ends in "x".
df <- tibble(word = words, i = seq_along(word))
filter(df, str_detect(word, "x$"))
#> # A tibble: 4 x 2
#>   word      i
#>   <chr> <int>
#> 1 box     108
#> 2 sex     747
#> 3 six     772
#> 4 tax     841

x <- c("apple", "banana", "pear")

# str_count() gives the number of matches per element.
str_count(x, pattern = "a")
#> [1] 1 3 1

# On average, how many vowels per word?
words %>%
  str_count("[aeiou]") %>%
  mean()
#> [1] 1.991837

# Vowel and non-vowel counts as new columns.
mutate(
  df,
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
#> # A tibble: 980 x 4
#>   word         i vowels consonants
#>   <chr>    <int>  <int>      <int>
#> 1 a            1      1          0
#> 2 able         2      2          2
#> 3 about        3      3          2
#> 4 absolute     4      4          4
#> 5 accept       5      2          4
#> 6 account      6      3          4
#> # … with 974 more rows

# Matches do not overlap: "aba" is counted twice in "abababa", not three
# times.
str_count("abababa", pattern = "aba")
#> [1] 2
str_view_all("abababa", pattern = "aba")

# 14.4.1.1 Exercises ----
# Find all words that start or end with x.

# With a single regular expression (the view is rendered once; the
# original issued the identical call twice).
str_view(words, "^x|x$", match = TRUE)

words[str_detect(words, "^x|x$")]
#> [1] "box" "sex" "six" "tax"

# With two simpler str_detect() calls combined by |.
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
#> [1] "box" "sex" "six" "tax"

# Find all words that start with a vowel and end with a non-vowel.
str_subset(words, "^[aieou].*[^aeiou]$")
#>   [1] "about"       "accept"      "account"     "across"      "act"        
#>   [6] "actual"      "add"         "address"     "admit"       "affect"     
#>  [11] "afford"      "after"       "afternoon"   "again"       "against"    
#>  [16] "agent"       "air"         "all"         "allow"       "almost"     
#>  [21] "along"       "already"     "alright"     "although"    "always"     
#>  [26] "amount"      "and"         "another"     "answer"      "any"        
#>  [31] "apart"       "apparent"    "appear"      "apply"       "appoint"    
#>  [36] "approach"    "arm"         "around"      "art"         "as"         
#>  [41] "ask"         "at"          "attend"      "authority"   "away"       
#>  [46] "awful"       "each"        "early"       "east"        "easy"       
#>  [51] "eat"         "economy"     "effect"      "egg"         "eight"      
#>  [56] "either"      "elect"       "electric"    "eleven"      "employ"     
#>  [61] "end"         "english"     "enjoy"       "enough"      "enter"      
#>  [66] "environment" "equal"       "especial"    "even"        "evening"    
#>  [71] "ever"        "every"       "exact"       "except"      "exist"      
#>  [76] "expect"      "explain"     "express"     "identify"    "if"         
#>  [81] "important"   "in"          "indeed"      "individual"  "industry"   
#>  [86] "inform"      "instead"     "interest"    "invest"      "it"         
#>  [91] "item"        "obvious"     "occasion"    "odd"         "of"         
#>  [96] "off"         "offer"       "often"       "okay"        "old"        
#> [101] "on"          "only"        "open"        "opportunity" "or"         
#> [106] "order"       "original"    "other"       "ought"       "out"        
#> [111] "over"        "own"         "under"       "understand"  "union"      
#> [116] "unit"        "university"  "unless"      "until"       "up"         
#> [121] "upon"        "usual"

# Words containing every one of a, e, i, o, u: narrow the list one vowel at
# a time. None of the common words qualifies.
words %>%
  str_subset("a") %>%
  str_subset("e") %>%
  str_subset("i") %>%
  str_subset("o") %>%
  str_subset("u")
#> character(0)

# Word(s) with the highest share of vowels among their characters.
prop_vowels <- str_count(words, "[aeiou]") / str_length(words)
words[prop_vowels == max(prop_vowels)]
#> [1] "a"

# Sections: Extract matches; Exercises: 2 ----
length(sentences)
#> [1] 720

head(sentences)
#> [1] "The birch canoe slid on the smooth planks." 
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."     
#> [4] "These days a chicken leg is a rare dish."   
#> [5] "Rice is often served in round bowls."       
#> [6] "The juice of lemons makes fine punch."

# Join the colour names with "|" to get one alternation pattern.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- colours %>% str_c(collapse = "|")
colour_match
#> [1] "red|orange|yellow|green|blue|purple"

# Keep the sentences that mention a colour, then pull out the first colour
# found in each.
has_colour <- sentences %>% str_subset(colour_match)
matches <- has_colour %>% str_extract(colour_match)
head(matches)
#> [1] "blue" "blue" "red"  "red"  "red"  "blue"

# Sentences with more than one colour match.
more <- sentences[str_count(sentences, colour_match) > 1]
str_view_all(more, pattern = colour_match)

# str_extract() returns only the first match per element ...
str_extract(more, pattern = colour_match)
#> [1] "blue"   "green"  "orange"

# ... str_extract_all() returns every match, as a list ...
str_extract_all(more, pattern = colour_match)
#> [[1]]
#> [1] "blue" "red" 
#> 
#> [[2]]
#> [1] "green" "red"  
#> 
#> [[3]]
#> [1] "orange" "red"

# ... or, with simplify = TRUE, as a character matrix.
str_extract_all(more, pattern = colour_match, simplify = TRUE)
#>      [,1]     [,2] 
#> [1,] "blue"   "red"
#> [2,] "green"  "red"
#> [3,] "orange" "red"

# Rows with fewer matches are padded with "" up to the widest row.
x <- c("a", "a b", "a b c")
str_extract_all(x, pattern = "[a-z]", simplify = TRUE)
#>      [,1] [,2] [,3]
#> [1,] "a"  ""   ""  
#> [2,] "a"  "b"  ""  
#> [3,] "a"  "b"  "c"

# 14.4.2.1 Exercises ----

# First word of each sentence, taking a word to be a run of letters
# (the stray duplicate "A" in the character class is removed).
str_extract(sentences, "[A-Za-z]+") %>% head()
#> [1] "The"   "Glue"  "It"    "These" "Rice"  "The"

# Same, but allowing apostrophes after the first letter so that "It's"
# is kept whole.
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
#> [1] "The"   "Glue"  "It's"  "These" "Rice"  "The"

# All distinct words ending in "ing".
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
# No pre-filtering is needed: sentences without a match contribute an
# empty vector, which unlist() drops.
sentences %>%
  str_extract_all(pattern) %>%
  unlist() %>%
  unique() %>%
  head()
#> [1] "spring"  "evening" "morning" "winding" "living"  "king"

# Candidate plurals: words of at least four letters that end in "s".
sentences %>%
  str_extract_all("\\b[A-Za-z]{3,}s\\b") %>%
  unlist() %>%
  unique() %>%
  head()
#> [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"

# Sections: Grouped Matches; Exercises: 1, 2 ----
# Heuristic for a noun: the word that follows "a" or "the". The two
# parenthesised groups capture the article and the following word.
noun <- "(a|the) ([^ ]+)"

has_noun <- head(str_subset(sentences, noun), 10)

# str_extract() returns the complete match only.
str_extract(has_noun, noun)
#>  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
#>  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

# str_match() returns the complete match plus one column per group.
str_match(has_noun, noun)
#>       [,1]         [,2]  [,3]     
#>  [1,] "the smooth" "the" "smooth" 
#>  [2,] "the sheet"  "the" "sheet"  
#>  [3,] "the depth"  "the" "depth"  
#>  [4,] "a chicken"  "a"   "chicken"
#>  [5,] "the parked" "the" "parked" 
#>  [6,] "the sun"    "the" "sun"    
#>  [7,] "the huge"   "the" "huge"   
#>  [8,] "the ball"   "the" "ball"   
#>  [9,] "the woman"  "the" "woman"  
#> [10,] "a helps"    "a"   "helps"

# For data frames, tidyr::extract() puts each group in its own named
# column; remove = FALSE keeps the original sentence column.
tidyr::extract(
  tibble(sentence = sentences),
  sentence,
  c("article", "noun"),
  noun,
  remove = FALSE
)

# 14.4.3.1 Exercises ----
# A number word ("one" to "ten") followed by the next word.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences %>%
  str_subset(numword) %>%
  str_extract(numword)
#>  [1] "seven books"   "two met"       "two factors"   "three lists"  
#>  [5] "seven is"      "two when"      "ten inches"    "one war"      
#>  [9] "one button"    "six minutes"   "ten years"     "two shares"   
#> [13] "two distinct"  "five cents"    "two pins"      "five robins"  
#> [17] "four kinds"    "three story"   "three inches"  "six comes"    
#> [21] "three batches" "two leaves"

# Sections: Replacing Matches; Exercises: 1, 2 ----
x <- c("apple", "pear", "banana")

# str_replace() changes only the first match in each element ...
str_replace(x, pattern = "[aeiou]", replacement = "-")
#> [1] "-pple"  "p-ar"   "b-nana"

# ... str_replace_all() changes every match.
str_replace_all(x, pattern = "[aeiou]", replacement = "-")
#> [1] "-ppl-"  "p--r"   "b-n-n-"

# A named vector performs several replacements at once (name = pattern,
# value = replacement).
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
#> [1] "one house"    "two cars"     "three people"

# Back references in the replacement: swap the second and third words.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  5
)
#> [1] "The canoe birch slid on the smooth planks." 
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."     
#> [4] "These a days chicken leg is a rare dish."   
#> [5] "Rice often is served in round bowls."

# 14.4.4.1 Exercises ----
# Replace every forward slash with a backslash. The printed result shows
# each backslash in its escaped form.
"past/present/future" %>%
  str_replace_all("/", "\\\\")
#> [1] "past\\present\\future"

# Lookup for a hand-rolled lower-casing: each upper-case letter A-Z is a
# name whose value is the matching lower-case letter.
replacements <- setNames(letters, LETTERS)
# Apply the letter-by-letter lookup to the word list.
lower_words <- words %>% str_replace_all(pattern = replacements)
head(lower_words)
#> [1] "a"        "able"     "about"    "absolute" "accept"   "account"

# Sections: Splitting; Exercises: 1, 2, 3 ----
# Splitting on a space gives one character vector per sentence, returned
# as a list because the pieces differ in number.
str_split(head(sentences, 5), pattern = " ")
#> [[1]]
#> [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
#> [8] "planks."
#> 
#> [[2]]
#> [1] "Glue"        "the"         "sheet"       "to"          "the"        
#> [6] "dark"        "blue"        "background."
#> 
#> [[3]]
#> [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
#> 
#> [[4]]
#> [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
#> [8] "rare"    "dish."  
#> 
#> [[5]]
#> [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."

# For a single string, take the first list element directly. The "|" is
# escaped so that it is matched literally.
str_split("a|b|c|d", pattern = "\\|")[[1]]
#> [1] "a" "b" "c" "d"

# simplify = TRUE returns a matrix instead, padded with "".
str_split(head(sentences, 5), pattern = " ", simplify = TRUE)
#>      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
#> [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
#> [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
#> [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
#> [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
#> [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
#>      [,9]   
#> [1,] ""     
#> [2,] ""     
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""

# `n` caps the number of pieces.
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
str_split(fields, pattern = ": ", n = 2, simplify = TRUE)
#>      [,1]      [,2]    
#> [1,] "Name"    "Hadley"
#> [2,] "Country" "NZ"    
#> [3,] "Age"     "35"

# Splitting by boundary instead of by pattern.
x <- "This is a sentence.  This is another sentence."
str_view_all(x, pattern = boundary("word"))

# A plain space split leaves punctuation attached and yields an empty
# piece at the double space ...
str_split(x, pattern = " ")[[1]]
#> [1] "This"      "is"        "a"         "sentence." ""          "This"     
#> [7] "is"        "another"   "sentence."

# ... whereas the word boundary returns just the words.
str_split(x, pattern = boundary("word"))[[1]]
#> [1] "This"     "is"       "a"        "sentence" "This"     "is"       "another" 
#> [8] "sentence"

# 14.4.5.1 Exercises ----
# Split a comma-separated list into its items; the optional "and " after
# a comma is consumed as part of the separator.
x <- "apples, pears, and bananas"
str_split(x, pattern = ", +(and +)?")[[1]]
#> [1] "apples"  "pears"   "bananas"

# Splitting on " " versus boundary("word"): the space split keeps
# punctuation attached to the words, the word boundary drops it.
sentence <- "The quick (“brown”) fox can’t jump 32.3 feet, right?"
str_split(sentence, pattern = " ")
#> [[1]]
#> [1] "The"       "quick"     "(“brown”)" "fox"       "can’t"     "jump"     
#> [7] "32.3"      "feet,"     "right?"

str_split(sentence, pattern = boundary("word"))
#> [[1]]
#> [1] "The"   "quick" "brown" "fox"   "can’t" "jump"  "32.3"  "feet"  "right"

# Sections: Find Matches; Exercises: 1 ----
# A bare pattern string ...
str_view(fruit, pattern = "nana")

# ... is shorthand for wrapping it in regex().
str_view(fruit, pattern = regex("nana"))

# Matching is case-sensitive unless ignore_case = TRUE is set.
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, pattern = "banana")
str_view(bananas, pattern = regex("banana", ignore_case = TRUE))

# By default "^" matches only at the start of the whole string ...
x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, pattern = "^Line")[[1]]
#> [1] "Line"

# ... with multiline = TRUE it matches at the start of every line.
str_extract_all(x, pattern = regex("^Line", multiline = TRUE))[[1]]
#> [1] "Line" "Line" "Line"

# Phone-number pattern written with comments = TRUE, which lets the regex
# carry whitespace and "#" comments. The final group captures four digits;
# with three, the last digit of "514-791-8141" was dropped from the match.
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # final four numbers
  ", comments = TRUE)

# Column 1 is the full match; columns 2-4 are the three captured groups.
str_match("514-791-8141", phone)
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "514-791-8141" "514" "791" "8141"

# install.packages("microbenchmark")

# Time a fixed() literal match against the default regex match for the
# same text, 20 runs each.
microbenchmark::microbenchmark(
  fixed = str_detect(sentences, pattern = fixed("the")),
  regex = str_detect(sentences, pattern = "the"),
  times = 20
)

# Two ways to write "a with acute accent": one precomposed code point, or
# a plain "a" followed by a combining accent.
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
#> [1] "á" "a´"

# The two strings are not equal as written ...
a1 == a2
#> [1] FALSE

# ... and fixed() does not match one against the other ...
str_detect(a1, pattern = fixed(a2))
#> [1] FALSE

# ... but coll() does.
str_detect(a1, pattern = coll(a2))
#> [1] TRUE

# The four i's: plain capital, dotted capital (\u0130), plain small and
# dotless small (\u0131). The non-ASCII letters are written as Unicode
# escapes so they survive regardless of source-file or console encoding;
# in the original run the literals had collapsed to plain "I"/"i", which
# made the two comparisons below indistinguishable.
i <- c("I", "\u0130", "i", "\u0131")
i
#> [1] "I" "İ" "i" "ı"

# Case-insensitive collation under the default locale pairs "I" with "i".
str_subset(i, coll("i", ignore_case = TRUE))
#> [1] "I" "i"

# Under the Turkish locale the dotted capital is the partner of "i".
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
#> [1] "İ" "i"

# Show the locale currently in effect for stringi.
stringi::stri_locale_info()
#> $Language
#> [1] "en"
#> 
#> $Country
#> [1] "US"
#> 
#> $Variant
#> [1] ""
#> 
#> $Name
#> [1] "en_US"

# boundary() matches boundaries between words rather than a text pattern;
# extracting on it returns the words without the trailing full stop.
x <- "This is a sentence."
str_view_all(x, pattern = boundary("word"))

str_extract_all(x, pattern = boundary("word"))
#> [[1]]
#> [1] "This"     "is"       "a"        "sentence"

# 14.5.1 Exercises ----
# Find strings containing a backslash.

# As a regex, the backslash must be escaped: four in the R source.
c("a\\b", "ab") %>%
  str_subset("\\\\")
#> [1] "a\\b"

# As a fixed() literal no regex escaping applies: two in the R source.
c("a\\b", "ab") %>%
  str_subset(fixed("\\"))
#> [1] "a\\b"

# Wk 9 Assignment_Hiral Purohit
#
# Hiral Purohit
#
# 7/6/2020