College Majors

We are using the 173 majors listed in fivethirtyeight.com’s College Majors dataset (https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/).

# Attach dplyr first so that glimpse() is available further down
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the majors list (a CSV file hosted on GitHub) into a data frame
url <- "https://raw.githubusercontent.com/Nweoomon/Assignment3_607_091224/main/majors-list.csv"
data <- read.csv(file = url)

# Compact overview: dimensions, column types and the first few values
glimpse(data)
## Rows: 174
## Columns: 3
## $ FOD1P          <chr> "1100", "1101", "1102", "1103", "1104", "1105", "1106",…
## $ Major          <chr> "GENERAL AGRICULTURE", "AGRICULTURE PRODUCTION AND MANA…
## $ Major_Category <chr> "Agriculture & Natural Resources", "Agriculture & Natur…

Identifying the majors that contain either “DATA” or “STATISTICS”

# Keep the majors whose name mentions "data" or "statistics", in any letter case
data$Major[grepl("data|statistics", data$Major, ignore.case = TRUE)]
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"

Transforming the data

# Console printout of a character vector, pasted as one multi-line string
text <- '"bell pepper"  "bilberry"     "blackberry"   "blood orange"

"blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  

"elderberry"   "lime"         "lychee"       "mulberry"    

"olive"        "salal berry"'

# Put ", " after every quoted item, swallowing whatever whitespace
# (spaces or line breaks) originally followed it
with_commas <- gsub('("[^"]+")\\s*', '\\1, ', text)

# Drop the dangling ", " left after the last item, then wrap everything in
# c( ... ) so the string reads like R code that builds the vector
formatted_text <- paste0("c(", sub(", $", "", with_commas), ")")

# Show the finished string
cat(formatted_text)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

Describing in words what the expressions will match:

# Candidate strings used to exercise each pattern below
test_strings <- c("aaa", "abba", "abab", "abaca", "abccba", "abcidkcba")

# 1. (.)\1\1 exactly as written: inside an R string a single backslash turns
#    "\1" into the control character with code 1 instead of a backreference,
#    so none of the sample strings match
test1 <- test_strings[grepl("(.)\1\1", test_strings)]
cat("Matches for (.)\1\1:", test1)
## Matches for (.):
# 1. Revised: "(.)\\1\\1" - the same character three times in a row
test1 <- test_strings[grepl("(.)\\1\\1", test_strings)]
cat("Matches for (.)\\1\\1:", test1)
## Matches for (.)\1\1: aaa
# 2. "(.)(.)\\2\\1" - two characters followed by the same two in reverse order
test2 <- test_strings[grepl("(.)(.)\\2\\1", test_strings)]
cat("Matches for (.)(.)\\2\\1:", test2)
## Matches for (.)(.)\2\1: abba abccba
# 3. (..)\1 exactly as written: same escaping problem as in test 1, so
#    nothing matches
test3 <- test_strings[grepl("(..)\1", test_strings)]
cat("Matches for (..)\1:", test3)
## Matches for (..):
# 3. Revised: "(..)\\1" - a pair of characters immediately repeated
test3 <- test_strings[grepl("(..)\\1", test_strings)]
cat("Matches for (..)\\1:", test3)
## Matches for (..)\1: abab
# 4. "(.).\\1.\\1" - a character, then any character, then the first
#    character again, then any character, then the first character once more
test4 <- test_strings[grepl("(.).\\1.\\1", test_strings)]
cat("Matches for (.).\\1.\\1:", test4)
## Matches for (.).\1.\1: abaca
# 5. "(.)(.)(.).*\\3\\2\\1" - three characters, then any run of characters
#    (possibly empty), then the same three characters in reverse order
test5 <- test_strings[grepl("(.)(.)(.).*\\3\\2\\1", test_strings)]
cat("Matches for (.)(.)(.).*\\3\\2\\1:", test5)
## Matches for (.)(.)(.).*\3\2\1: abccba abcidkcba

Constructing regular expressions to match words

# Sample list of words to test
words <- c("racecar", "church", "eleven", "noon")

# 1. Test: Words that start and end with the same character.
#    The tail "(.*\\1)?" is optional so that a one-character word (whose first
#    and last character are the same character) also qualifies; the earlier
#    form "^(.).*\\1$" required at least two characters.
#    perl = TRUE selects the PCRE engine for the backreference that sits
#    inside the optional group.
same_ends_pattern <- "^(.)(.*\\1)?$"
test1 <- grep(same_ends_pattern, words, value = TRUE, perl = TRUE)
cat("Matches for words that start and end with the same character:", test1)
## Matches for words that start and end with the same character: racecar noon
# 2. Test: Words containing a repeated pair of letters.
#    The captured pair is restricted to letters, so a repeated pair of
#    punctuation marks or spaces is not counted.
repeated_pair_pattern <- "([[:alpha:]]{2}).*\\1"
test2 <- grep(repeated_pair_pattern, words, value = TRUE)
cat("Matches for words containing a repeated pair of letters:", test2)
## Matches for words containing a repeated pair of letters: church
# 3. Test: Words containing one letter repeated in at least three places.
#    Only a letter may be captured, for the same reason as in test 2.
triple_letter_pattern <- "([[:alpha:]]).*\\1.*\\1"
test3 <- grep(triple_letter_pattern, words, value = TRUE)
cat("Matches for words containing one letter repeated at least three times:", test3)
## Matches for words containing one letter repeated at least three times: eleven