College Majors
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
# Download the FiveThirtyEight majors list as raw CSV text.
CollegeM <- getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
# Parse the downloaded text; read.csv() already returns a data.frame, so no
# extra data.frame() wrapper is needed.
collegeMajors <- read.csv(text = CollegeM, header = TRUE)
head(collegeMajors)
# Keep the rows whose Major contains "STATISTICS" or "DATA". Inside filter()
# the column is referenced by its bare name so the test is evaluated on the
# data flowing through the pipe rather than on the global data frame.
collegeMajors %>%
  filter(str_detect(Major, "STATISTICS|DATA"))
Transformation
# Character vector of fruit and vegetable names used in the joining example.
x <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)
print(x)
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
# Join every element of x into a single comma-separated string.
str_c(x, collapse = ", ")
## [1] "bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry"
# Test strings for the back-reference examples that follow.
x <- c(
  "apple", "banana", "kiki", "pear", "Mississippi", "azzzbra",
  "abaca", "baba", "abba", "ababa", "abbcccddddcccbba"
)
“(.)\1\1” This will look for a character that is the same three times in a row.
# Show only the strings that contain one character three times in a row.
str_view(string = x, pattern = "(.)\\1\\1", match = TRUE)
“(.)(.)\2\1” This will find two characters followed by the same two characters in reverse order, such as “abba”. For Mississippi, it finds the “issi” pattern.
# Show only the strings that contain a two-character sequence followed by its
# mirror image.
str_view(string = x, pattern = "(.)(.)\\2\\1", match = TRUE)
“(..)\1” This will look for repeated patterns like “baba”.
# Show only the strings in which a pair of characters is immediately repeated.
str_view(string = x, pattern = "(..)\\1", match = TRUE)
“(.).\1.\1” This will look for a character and then another character then the 1st character again and then another character and then the first character again.
# Show only the strings matching "(.).\1.\1": a character, any character, the
# first character again, any character, and the first character once more
# (e.g. "abaca" or "ababa").
str_view(x, "(.).\\1.\\1", match = TRUE)
"(.)(.)(.).*\3\2\1" This expression looks for three characters, followed by any number of characters, followed by the same three characters in reverse order. For example, “ississi” in Mississippi.
# Show only the strings where three characters reappear later in reverse order.
str_view(string = x, pattern = "(.)(.)(.).*\\3\\2\\1", match = TRUE)
# Test strings for the str_subset() exercises that follow.
y <- c(
  "apple", "banana", "kiki", "pear", "Mississippi", "azzzbra", "abaca",
  "baba", "abba", "ababa", "abbcccddddcccbba", "church", "eleven"
)
Start and end with the same character: Uses the anchors of ^ at the beginning of the string and $ at the end of the string.
# Keep the strings whose first and last characters are the same; the \\1?$
# alternative also accepts one- and two-character strings.
str_subset(string = y, pattern = "^(.)((.*\\1$)|\\1?$)")
## [1] "azzzbra" "abaca" "abba" "ababa"
## [5] "abbcccddddcccbba"
Contain a repeated pair of letters: searches the string for a pair of letters that occurs again later, such as “ki” in “kiki”.
# Keep the strings in which some two-letter pair appears again later on.
str_subset(string = y, pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "banana" "kiki" "Mississippi" "baba"
## [5] "ababa" "abbcccddddcccbba" "church"
Contain one letter repeated in at least 3 places: searches the string for a letter that occurs at least three times.
# Keep the strings in which one lowercase letter occurs at least three times.
# ".*" is used between every occurrence so that any number of characters
# (including none) may separate them; this also catches "zzz" in "azzzbra".
str_subset(y, "([a-z]).*\\1.*\\1")
## [1] "banana" "Mississippi" "azzzbra" "abaca"
## [5] "ababa" "abbcccddddcccbba" "eleven"