library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.6      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.5.0 
## v readr   2.1.2      v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(stringr)
library(stringi)
## Warning: package 'stringi' was built under R version 4.0.5

Exercise 1

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset (linked in the assignment), provide code that identifies the majors that contain either “DATA” or “STATISTICS”.

# Download the FiveThirtyEight majors list. read.csv() already returns a
# data frame, so the result is used directly.
majors_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
readfile <- read.csv(majors_url, header = TRUE)

# Peek at the first rows to confirm the column layout.
head(readfile)
##   FOD1P                                 Major                  Major_Category
## 1  1100                   GENERAL AGRICULTURE Agriculture & Natural Resources
## 2  1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3  1102                AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4  1103                       ANIMAL SCIENCES Agriculture & Natural Resources
## 5  1104                          FOOD SCIENCE Agriculture & Natural Resources
## 6  1105            PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
# Keep the major names containing "DATA" or "STATISTICS"; value = TRUE makes
# grep() return the matching names rather than their positions.
data_stat_df <- grep(
  pattern = "DATA|STATISTICS",
  x = readfile$Major,
  value = TRUE
)

head(data_stat_df)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"

Exercise 2

Write code that transforms the data below:

[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"
[9] "elderberry"   "lime"         "lychee"       "mulberry"
[13] "olive"        "salal berry"

Into a format like this:

c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

# Exercise 2: turn a console printout of a character vector back into the
# R literal c("...", "...") that would recreate it.

# The raw printout, kept verbatim as one string (index markers, padding and
# newlines included).
new_df <- '
[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
[9] "elderberry"   "lime"         "lychee"       "mulberry"    
[13] "olive"        "salal berry"
'

# Extract every double-quoted token. Matching on the quotes skips the [n]
# index markers, the alignment spaces and the line breaks in one pass, which
# simply deleting quotes and newlines cannot do.
quoted_items <- regmatches(new_df, gregexpr('"[^"]+"', new_df))[[1]]

# Drop the surrounding quotes to obtain a real character vector with one
# element per fruit (14 in total).
new_list <- gsub('"', "", quoted_items, fixed = TRUE)

# Render the vector as the requested c("...", ...) literal. paste0() is used
# instead of dput() so the result is a single line regardless of console width.
new_list_code <- paste0(
  "c(",
  paste0('"', new_list, '"', collapse = ", "),
  ")"
)
writeLines(new_list_code)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

Exercise 3

Describe, in words, what these expressions will match: `(.)\1\1` matches any single character that appears three times in a row (e.g. the "sss" in "goddesssship").

# Sample words: three contain a tripled character, two do not.
test_word <- c(
  "goddesssship", "juice", "skullllike", "skull", "hostesssship"
)
# Keep only the words in which some character occurs three times in a row.
str_subset(string = test_word, pattern = "(.)\\1\\1")
## [1] "goddesssship" "skullllike"   "hostesssship"

`(.)(.)\2\1` matches a pair of characters immediately followed by the same pair in reverse order, i.e. a four-character palindrome such as "abba".

# Sample words: only "abba" is two characters followed by their mirror image.
test_word2 <- c(
  "abba", "music", "fruits", "candy", "choco"
)
# Keep the words containing a character pair followed by the reversed pair.
str_subset(string = test_word2, pattern = "(.)(.)\\2\\1")
## [1] "abba"

`(..)\1` matches any two characters immediately followed by the same two characters (e.g. the "anan" in "banana").

# Sample words: only "banana" has a two-character sequence repeated back to back.
test_word3 <- c(
  "banana", "sausage", "music", "silver", "muscle"
)
# Keep the words where a two-character sequence is immediately repeated.
str_subset(string = test_word3, pattern = "(..)\\1")
## [1] "banana"

`(.).\1.\1` matches a character that occurs three times with exactly one arbitrary character between consecutive occurrences (e.g. the "anana" in "banana" or the "abama" in "Alabama").

# Sample words: "banana" and "Alabama" repeat "a" at every second position.
test_word4 <- c(
  "banana", "Alabama", "sushi", "sugar", "bowl"
)
# Keep the words where one character appears three times, each occurrence
# separated from the next by exactly one other character.
str_subset(string = test_word4, pattern = "(.).\\1.\\1")
## [1] "banana"  "Alabama"

`(.)(.)(.).*\3\2\1` matches any three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order (e.g. "xyz1zyx" or "doggod").

# Sample words: the first three start with three characters that reappear
# reversed later in the word.
test_word5 <- c(
  "xyz1zyx", "doggod", "sworddrows", "music", "mussle"
)
# Keep the words with three characters, any filler (possibly none), and then
# those three characters in reverse order.
str_subset(string = test_word5, pattern = "(.)(.)(.).*\\3\\2\\1")
## [1] "xyz1zyx"    "doggod"     "sworddrows"

Exercise 4

Construct regular expressions to match words that: Start and end with the same character.

# Sample words: "dad" and "razor" begin and end with the same character.
test_word6 <- c(
  "dad", "music", "razor", "banana", "burritos"
)
# Anchor at both ends: capture the first character, then require either that
# the word ends with it again, or that the word is only one or two copies of it.
str_subset(string = test_word6, pattern = "^(.)((.*\\1$)|\\1?$)")
## [1] "dad"   "razor"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# Sample words: "church", "banana" and "jujube" each reuse a two-letter pair.
test_word7 <- c(
  "church", "monster", "banana", "jujube", "star"
)
# Capture two adjacent letters and require the same pair again later on,
# with any amount of text in between.
str_subset(string = test_word7, pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "church" "banana" "jujube"

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Sample words: "eleven" (three "e") and "banana" (three "a") qualify.
test_word8 <- c(
  "eleven", "balloon", "expression", "repeat", "banana"
)
# Capture one lowercase letter and require it twice more anywhere later in
# the word.
str_subset(string = test_word8, pattern = "([a-z]).*\\1.*\\1")
## [1] "eleven" "banana"