#Install package
#install.packages("stringr")
#Import libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(dplyr)
# Read the FiveThirtyEight college-majors list straight from GitHub.
theUrl <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
# read.csv() already defaults to header = TRUE and sep = ",", so neither
# argument needs to be spelled out.
majors_df <- read.csv(theUrl)
# Preview the first six rows (columns: FOD1P, Major, Major_Category).
head(majors_df, n = 6L)
## FOD1P Major Major_Category
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
# Keep only the majors whose name contains "DATA" or "STATISTICS".
# The regex alternation `|` lets one str_detect() call test for both words;
# the match is case-sensitive.
df <- majors_df
filter(df, str_detect(Major, "DATA|STATISTICS"))
## FOD1P Major Major_Category
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
Write code that transforms the data below:
[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"
Into a format like this:
c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Build the target `c("...", "...")` text with str_c.
# The inner str_c() wraps every name in double quotes and joins the names with
# ", " (collapse = ...); the outer str_c() adds the surrounding `c(` and `)`.
# Using sep = "," alone would only glue the names into one unquoted string.
fruits <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)
fruits <- str_c("c(", str_c("\"", fruits, "\"", collapse = ", "), ")")
# writeLines() shows the text itself rather than an escaped R string.
writeLines(fruits)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Parse the raw printed output into a character vector with str_extract_all.
# The input is the console text exactly as given in the exercise, including
# the [n] index markers and the quotes around each name.
raw_fruits <- str_c(
  "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"",
  "[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\"",
  "[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\"",
  "[13] \"olive\" \"salal berry\"",
  sep = "\n"
)
# A fruit name is one or more lowercase words separated by single spaces.
# Index markers contain only digits and brackets, and adjacent names are
# separated by quotes, so neither can be picked up or merged by this pattern.
# str_extract_all() returns a list with one element per input string;
# unlist() flattens it to a plain character vector.
fruits <- unlist(str_extract_all(raw_fruits, "[a-z]+( [a-z]+)*"))
fruits
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
The two exercises below are taken from R for Data Science, section 14.3.5.1 of the online version:
# Preview the first six entries of `fruit` (it prints as a plain character
# vector) before trying the regex patterns against it.
head(fruit, n = 6L)
## [1] "apple" "apricot" "avocado" "banana" "bell pepper"
## [6] "bilberry"
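#Assigning the pattern and using the fruit character vector to see what it returns with str_subset and str_match
#The book writes this regex as (.)\1\1; as an R string it must be quoted and the backslashes escaped
#pattern <- "(.)\\1\\1"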
#fruit %>%
#str_subset(pattern)%>%
#str_match(pattern)
#Assigning the pattern and using the fruit data frame to see what it returns with str_subset and str_match
#pattern <- "(.)(.)\\2\\1"
#fruit %>%
#str_subset(pattern)%>%
#str_match(pattern)
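#Assigning the pattern and using the fruit character vector to see what it returns with str_subset and str_match
#The book writes this regex as (..)\1; as an R string it must be quoted and the backslash escaped
#pattern <- "(..)\\1"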
#fruit %>%
#str_subset(pattern)%>%
#str_match(pattern)
#Assigning the pattern and using the fruit data frame to see what it returns with str_subset and str_match
#pattern <- "(.).\\1.\\1"
#fruit %>%
#str_subset(pattern)%>%
#str_match(pattern)
#Assigning the pattern and using the fruit data frame to see what it returns with str_subset and str_match
#pattern <- "(.)(.)(.).*\\3\\2\\1"
#fruit %>%
#str_subset(pattern)%>%
#str_match(pattern)
# Regexes for the second exercise: match words that ...

# (a) start and end with the same character.
# `^` and `$` pin the match to the whole word; without `^` any word whose
# last character also appears earlier (e.g. "banana") would match.
# The optional group lets a one-character word match as well.
pattern_same_ends <- "^(.)(.*\\1)?$"
str_view("anemia", pattern_same_ends)

# (b) contain a repeated pair of letters ("church" has "ch" twice).
pattern_repeated_pair <- "([A-Za-z][A-Za-z]).*\\1"
str_view("church", pattern_repeated_pair)

# (c) contain one letter in at least three places ("eleven" has three e's).
# There is deliberately no trailing `.`: the third occurrence may be the
# final character of the word (e.g. "banana").
pattern_triple_letter <- "([A-Za-z]).*\\1.*\\1"
str_view("eleven", pattern_triple_letter)