library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.0 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.3 v tibble 3.1.8
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(dplyr)
# 1. Using the 173 majors listed in fivethirtyeight.com's College Majors
# dataset
# [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/],
# provide code that identifies the majors that contain either "DATA" or
# "STATISTICS".

# Read the majors list directly from the fivethirtyeight GitHub repository.
col_majors <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv",
  header = TRUE,
  sep = ","
)

# Keep only the rows whose Major contains "DATA" or "STATISTICS".
col_major_mod <- dplyr::filter(
  col_majors,
  stringr::str_detect(Major, "DATA|STATISTICS")
)
glimpse(col_major_mod)
## Rows: 3
## Columns: 3
## $ FOD1P <chr> "6212", "2101", "3702"
## $ Major <chr> "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS", "COMPU~
## $ Major_Category <chr> "Business", "Computers & Mathematics", "Computers & Mat~
#2 Write code that transforms the data below:
# [1] "bell pepper" "bilberry" "blackberry" "blood orange"
# [5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
# [9] "elderberry" "lime" "lychee" "mulberry"
# [13] "olive" "salal berry"
# Into a format like this:
# c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
# "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee",
# "mulberry", "olive", "salal berry")
# The two exercises below (#3 and #4) are taken from R for Data Science.
# Exercise 2: rebuild the printed fruit vector as the text of a valid
# `c("...", "...")` call.

# The raw values, spelled as in the exercise prompt ("cantaloupe").
produce_items <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange",
  "blueberry", "cantaloupe", "chili pepper", "cloudberry",
  "elderberry", "lime", "lychee", "mulberry",
  "olive", "salal berry"
)

# Keep `produce` as a data frame, but give its single column a readable name
# instead of the auto-generated one.
produce <- data.frame(item = produce_items, stringsAsFactors = FALSE)
# view produce
produce
## item
## 1 bell pepper
## 2 bilberry
## 3 blackberry
## 4 blood orange
## 5 blueberry
## 6 cantaloupe
## 7 chili pepper
## 8 cloudberry
## 9 elderberry
## 10 lime
## 11 lychee
## 12 mulberry
## 13 olive
## 14 salal berry
# Quote every item, join the items with ", ", and wrap the result in c( ).
# `collapse` is an argument of paste0(); handed to cat() it is simply printed
# as one more piece of text, which is what left a stray "," after the output.
produce_code <- paste0(
  "c(",
  paste0('"', produce$item, '"', collapse = ", "),
  ")"
)
cat(produce_code, "\n", sep = "")
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
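#3 Describe, in words, what these expressions will match:
# (.)\1\1 ANS - as a regular expression, (.)\1\1 matches any single character
# repeated three times in a row, e.g. "aaa". (Typed as the R string "(.)\1\1"
# without doubled backslashes, "\1" is an octal escape for the control
# character U+0001, not a backreference.)
# "(.)(.)\\2\\1" ANS - matches any two characters immediately followed by the
# same two characters in reverse order, e.g. "abba" or "emme".
# (..)\1 ANS - matches any two characters that are immediately repeated,
# e.g. "coco" in "cocoa".
# "(.).\\1.\\1" ANS - matches a character, followed by any other character,
# followed by the original character, followed by any other character, and
# the original character again, e.g. "acada" in "abracadabra".
# "(.)(.)(.).*\\3\\2\\1" ANS - matches any three characters, followed by zero
# or more characters of any kind, followed by the same three characters in
# reverse order, e.g. "abcxcba".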
# Exercise 3 demo: run each pattern against a small sample vector.
#
# Inside an R string literal every regex backslash has to be doubled: "\\1" is
# the backreference \1, whereas "\1" is an octal escape for the control
# character U+0001 and is never seen by the regex engine as a backreference.
# The third sample is "cocoa" so that the (..)\1 pattern has something to match.
my_string <- c(
  "abracadabra", "emme", "cocoa", "aaa",
  "civic", "peep", "aabbaa", "ccc"
)
# (.)\1\1 -- the same character three times in a row
x <- str_view(my_string, "(.)\\1\\1")
print(x)
## [4] | <aaa>
## [8] | <ccc>
# (.)(.)\2\1 -- two characters followed by the same two in reverse, e.g. "abba"
y <- str_view(my_string, "(.)(.)\\2\\1")
print(y)
## [2] | <emme>
## [6] | <peep>
## [7] | a<abba>a
# (..)\1 -- any two characters immediately repeated, e.g. "coco"
z <- str_view(my_string, "(..)\\1")
print(z)
## [3] | <coco>a
# (.).\1.\1 -- one character appearing three times, each separated by exactly
# one arbitrary character
a <- str_view(my_string, "(.).\\1.\\1")
print(a)
## [1] | abr<acada>bra
# (.)(.)(.).*\3\2\1 -- three characters, then anything (possibly nothing),
# then the same three characters in reverse order
b <- str_view(my_string, "(.)(.)(.).*\\3\\2\\1")
print(b)
## [7] | <aabbaa>
# 4. Construct regular expressions to match words that:
#   - Start and end with the same character.
#   - Contain a repeated pair of letters (e.g. "church" contains "ch" repeated
#     twice.)
#   - Contain one letter repeated in at least three places (e.g. "eleven"
#     contains three "e"s.)
# Every pattern is matched case-insensitively, so "Emme" counts as starting
# and ending with the same letter.
mystring2 <- c("rubber", "Emme", "Toronto", "decide", "melee", "termite")

# Words that start and end with the same character. The tail `(.*\\1)?` is
# optional so that a one-character word also matches.
sc <- str_subset(mystring2, regex("^(.)(.*\\1)?$", ignore_case = TRUE))
print(sc)
## [1] "rubber" "Emme"
# Words that contain a repeated pair of letters (e.g. "church" contains "ch"
# repeated twice.)
rc <- str_subset(mystring2, regex("([A-Za-z][A-Za-z]).*\\1", ignore_case = TRUE))
print(rc)
## [1] "Toronto" "decide" "termite"
# Words that contain one letter repeated in at least three places.
tr <- str_subset(mystring2, regex("([A-Za-z]).*\\1.*\\1", ignore_case = TRUE))
print(tr)
## [1] "Toronto" "melee"