Data 607 Assignment 3

library(tidyverse)

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.0     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.3     v tibble    3.1.8
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.2     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringr)
library(dplyr)

#1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Load the FiveThirtyEight college-majors list directly from GitHub.
col_majors <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv",
  header = TRUE,
  sep = ","
)

# Keep only the rows whose Major name mentions DATA or STATISTICS.
col_major_mod <- filter(col_majors, str_detect(Major, "DATA|STATISTICS"))

# Compact overview of the matching rows.
glimpse(col_major_mod)

## Rows: 3
## Columns: 3
## $ FOD1P          <chr> "6212", "2101", "3702"
## $ Major          <chr> "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS", "COMPU~
## $ Major_Category <chr> "Business", "Computers & Mathematics", "Computers & Mat~

#2 Write code that transforms the data below:
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
Into a format like this: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

The two exercises below are taken from R for Data Science:

# One-column data frame holding the produce names from the exercise.
# The vector is passed inline on purpose: data.frame() derives the column
# name from the deparsed argument, which is the header shown below.
produce <- data.frame(c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry"))

#view produce
produce

##    c..bell.pepper....bilberry....blackberry....blood.orange....blueberry...
## 1                                                               bell pepper
## 2                                                                  bilberry
## 3                                                                blackberry
## 4                                                              blood orange
## 5                                                                 blueberry
## 6                                                                cantaloupe
## 7                                                              chili pepper
## 8                                                                cloudberry
## 9                                                                elderberry
## 10                                                                     lime
## 11                                                                   lychee
## 12                                                                 mulberry
## 13                                                                    olive
## 14                                                              salal berry

# Build the literal c("...", "...") text explicitly: wrap every value in
# double quotes, join the values with ", ", then surround with c( and ).
# `collapse` is an argument of paste0(); handing it to cat() only prints
# the separator as an extra trailing token.
produce_code <- paste0(
  "c(",
  paste0('"', produce[[1]], '"', collapse = ", "),
  ")"
)
cat(produce_code)

## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

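#3 Describe, in words, what these expressions will match: (.)\1\1 ANS - “(.)\1\1” matches any single character repeated three times in a row, e.g. “aaa”. Here \1 is a back-reference to the first capture group; inside an R string it must be written "(.)\\1\\1", because a bare "\1" is read as the control character \001 rather than as a back-reference.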

“(.)(.)\2\1” ANS - “(.)(.)\2\1” matches a pair of characters immediately followed by the same pair in reverse order, e.g. “abba” or “emme”

(..)\1 ANS - (..)\1 matches any two characters that are immediately repeated, e.g. “anan” in “banana”

“(.).\1.\1” ANS - “(.).\1.\1” matches a character (say “a”), followed by any character, then the original character again, then any character, then the original character once more, e.g. “acada” in “abracadabra”

“(.)(.)(.).*\3\2\1” - ANS - This expression matches any three characters, followed by zero or more of any characters, followed by the same three characters in reverse order, e.g. “aabbaa”

# Sample words used to demonstrate each back-reference pattern below.
# Inside an R string literal a regex back-reference must be written "\\1";
# a bare "\1" is an octal escape for the control character \001, so the
# pattern would look for that character instead of repeating the group.
my_string <- c(
  "abracadabra", "emme", "cucumber", "aaa", "civic", "peep", "aabbaa", "ccc"
)

# (.)\1\1 : any character repeated three times in a row
x <- str_view(my_string, "(.)\\1\\1")
print(x)

## [4] | <aaa>
## [8] | <ccc>

# (.)(.)\2\1 : two characters followed by the same two reversed, e.g. abba
y <- str_view(my_string, "(.)(.)\\2\\1")
print(y)

## [2] | <emme>
## [6] | <peep>
## [7] | a<abba>a

# (..)\1 : any two characters immediately repeated, e.g. cucu
z <- str_view(my_string, "(..)\\1")
print(z)

## [3] | <cucu>mber

# (.).\1.\1 : the same character in positions 1, 3 and 5 of a 5-character run
a <- str_view(my_string, "(.).\\1.\\1")
print(a)

## [1] | abr<acada>bra

# (.)(.)(.).*\3\2\1 : three characters, anything, then those three reversed
b <- str_view(my_string, "(.)(.)(.).*\\3\\2\\1")
print(b)

## [7] | <aabbaa>

#4 Construct regular expressions to match words that: Start and end with the same character. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Words used to exercise the three patterns below.
mystring2 <- c("rubber", "Emme", "Toronto", "decide", "melee", "termite")

# Start and end with the same character (case-insensitive).
# The tail "(.*\\1)?" is optional so a one-character word also qualifies.
sc <- str_subset(mystring2, regex("^(.)(.*\\1)?$", ignore_case = TRUE))
print(sc)

## [1] "rubber" "Emme"

# Contain a repeated pair of letters (e.g. "church" contains "ch" twice).
rc <- str_subset(
  mystring2,
  regex("([A-Za-z][A-Za-z]).*\\1", ignore_case = TRUE)
)
print(rc)

## [1] "Toronto" "decide"  "termite"

# Contain one letter repeated in at least three places (e.g. "eleven").
tr <- str_subset(
  mystring2,
  regex("([A-Za-z]).*\\1.*\\1", ignore_case = TRUE)
)
print(tr)

## [1] "Toronto" "melee"

Data 607 Assignment 3

Carol Campbell

2023-09-23