# Load the libraries
library(RCurl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringr)
# Download the raw majors list (CSV text) from FiveThirtyEight's GitHub repo.
majors_url <- paste0(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/",
  "college-majors/majors-list.csv"
)
majorscsv <- getURL(majors_url)
# Parse the downloaded text into a data frame. Strings are kept as character
# so the pattern matching below works on text rather than factor codes.
majorsdf <- read.csv(text = majorscsv, stringsAsFactors = FALSE)
# Pull the Major column out as a plain character vector for matching.
majorsvec <- majorsdf[["Major"]]
# Flag the majors that mention "data" or "statistic" (case-insensitive).
# str_detect() answers the yes/no question directly, so there is no need to
# extract the whole string and then discard the NA non-matches.
is_match <- str_detect(majorsvec, regex("data|statistic", ignore_case = TRUE))
# Keep only the matching rows. which() drops both FALSE and NA, and
# drop = FALSE keeps a one-column data frame with a readable column name
# ("Major") and the original row numbers.
majorsdf <- majorsdf[which(is_match), "Major", drop = FALSE]
# The final result shows 3 matches
majorsdf
## Major
## 44 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
## 52 COMPUTER PROGRAMMING AND DATA PROCESSING
## 59 STATISTICS AND DECISION SCIENCE
[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"
Into a format like this: c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Creates the fruit vector
fruit <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)
# Wrap every element in double quotes. encodeString() applies R's own quoting
# rules (escaping any embedded quotes), so the result is valid R source;
# shQuote() is meant for shell commands and emits single quotes instead.
fruit_quoted <- encodeString(fruit, quote = '"')
# Collapse the quoted elements into one comma-separated string.
fruit_joined <- paste(fruit_quoted, collapse = ", ")
# Wrap the list in the c( ... ) call syntax. paste0() avoids the stray spaces
# that paste() would insert just inside the parentheses.
mystring <- paste0("c(", fruit_joined, ")")
# The new string is R code that recreates the original fruit vector, which is
# left untouched in `fruit`. writeLines() shows the raw text; auto-printing
# would display the inner quotes as \".
writeLines(mystring)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
This creates a single-character capture group (.) followed by two back-references to it (\1\1), i.e., the captured character repeats two more times, so the same character appears three times in a row. Ex. www
(.)\1\1
This creates two single-character capture groups (.)(.) followed by a back-reference to group 2 (\2) and then a back-reference to group 1 (\1), i.e., a two-character sequence followed by its mirror image. Ex. abba "(.)(.)\2\1"
This creates a single two-character capture group (..) followed by a back-reference to itself (\1), i.e., the pair repeats once immediately. Ex. abab (..)\1
This creates a single-character capture group (.), followed by any character (.), a back-reference to the captured character (\1), any character (.), and the back-reference again (\1). Ex. a2axa "(.).\1.\1"
This creates three single-character capture groups (.)(.)(.) followed by zero or more of any character (.*), then back-references to group 3 (\3), group 2 (\2), and group 1 (\1), i.e., the three captured characters reappear in reverse order. Ex. abc11111111cba "(.)(.)(.).*\3\2\1"
"^(.)(.*\1$)"
([A-Za-z][A-Za-z]).*\1
([a-z]).\1.\1