# Attach stringr first; the Exercise 4 snippets call its str_* functions.
library(stringr)

# Raw GitHub URL of the majors list CSV.
data <- "https://raw.githubusercontent.com/curiostegui/CUNY-SPS/main/Data%20607/Assignment%20Week%203/majors-list.csv"

# Read the comma-separated file, treating its first row as column names.
majors <- read.csv(file = data, header = TRUE, sep = ",")

Exercise 1

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset, provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Return the majors whose name contains "DATA" or "STATISTICS", as the
# exercise asks. `|` is regex alternation, ignore.case = TRUE makes the match
# case-insensitive, and value = TRUE returns the matching names themselves
# rather than their positions.
grep(
  pattern = "DATA|STATISTICS",
  x = majors$Major,
  value = TRUE,
  ignore.case = TRUE
)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"

Exercise 2

Write code that transforms the data below

[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”

Into a format like this: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# The fruit names from the exercise, stored as one character vector wrapped
# in a single-element list.
food <- list(
  c(
    "bell pepper", "bilberry", "blackberry", "blood orange",
    "blueberry", "cantaloupe", "chili pepper", "cloudberry",
    "elderberry", "lime", "lychee", "mulberry",
    "olive", "salal berry"
  )
)

I created a list and then used the cat and paste functions to transform it into the target format.

# Build the text c("...", "...") explicitly: wrap each name in double quotes,
# join the quoted names with ", " and surround the result with c( ).
# `collapse` is an argument of paste()/paste0(); passing it to cat() only
# prints a stray "," after the text. unlist() flattens `food`, so this works
# whether `food` is a list holding the vector or the bare character vector.
cat(
  paste0("c(", paste0('"', unlist(food), '"', collapse = ", "), ")"),
  "\n",
  sep = ""
)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

Exercise 3

Describe, in words, what these expressions will match:

(.)\1\1

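As a regular expression, this matches any single character followed by the same character two more times, i.e. the same character three times in a row. For ex: aaa, 111. (If it is typed into R as the string "(.)\1\1" with single backslashes, \1 is read as a string escape rather than a backreference, so in R code the regex must be written "(.)\\1\\1".)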

“(.)(.)\2\1”

Matches any two characters followed by the same two characters in reverse order. For ex: 1221, 4554, chhc

(..)\1

Matches any two characters that are then immediately repeated in the same order. For ex: abab, 1212

“(.).\1.\1”

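Matches a character, then any character, then the first character again, then any character, then the first character once more, i.e. the same character in the first, third and fifth positions of a five-character stretch. For ex: abaca, or the “anana” in “banana”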

“(.)(.)(.).*\3\2\1”

Matches any three characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order. For ex: abccba, abc123cba

Exercise #4

Construct regular expressions to match words that:

  1. Start and end with the same character
# Candidate words. "creiucdiu" starts with "c" but ends with "u", so it must
# not be reported; "level", "aa" and "a" all start and end with the same
# character.
exer1 <- c("creiucdiu", "level", "aa", "a")

# ^(.) captures the first character and \\1$ requires that same character at
# the very end of the word. The anchors make the pattern test the whole word
# instead of extracting a substring such as "creiuc". The group (.*\\1) is
# optional so that a one-letter word also counts, and .* (rather than .+)
# lets two-letter words like "aa" match.
exer_str <- str_subset(exer1, "^(.)(.*\\1)?$")
print(exer_str)
## [1] "level" "aa"    "a"
  2. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# Test word: the pair "ca" opens the word and shows up again further along.
exer2 <- "cacehjhhcadejnk"

# (..) captures any two characters, .* allows anything in between, and \\1
# requires the captured pair to occur a second time.
exer2_str <- str_extract_all(string = exer2, pattern = "(..).*\\1")
print(exer2_str)
## [[1]]
## [1] "cacehjhhca"
  3. Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Test word: "a" occurs three times, "b" only once.
exer3 <- "banana"

# (.) captures one character and each \\1 demands it again, with .* allowing
# any characters between the occurrences. The match starts at the first "a"
# because "b" never repeats.
exer3_str <- str_extract_all(string = exer3, pattern = "(.).*\\1.*\\1")
print(exer3_str)
## [[1]]
## [1] "anana"