library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the grad-students table from the fivethirtyeight GitHub repository
# and take a first look at its columns.
csv_data <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
)
glimpse(csv_data)
## Rows: 173
## Columns: 22
## $ Major_code <int> 5601, 6004, 6211, 2201, 2001, 3201, 6206,…
## $ Major <chr> "CONSTRUCTION SERVICES", "COMMERCIAL ART …
## $ Major_category <chr> "Industrial Arts & Consumer Services", "A…
## $ Grad_total <int> 9173, 53864, 24417, 5411, 9109, 1542, 190…
## $ Grad_sample_size <int> 200, 882, 437, 72, 171, 22, 3738, 386, 98…
## $ Grad_employed <int> 7098, 40492, 18368, 3590, 7512, 1008, 151…
## $ Grad_full_time_year_round <int> 6511, 29553, 14784, 2701, 5622, 860, 1230…
## $ Grad_unemployed <int> 681, 2482, 1465, 316, 466, 0, 8324, 473, …
## $ Grad_unemployment_rate <dbl> 0.08754339, 0.05775585, 0.07386679, 0.080…
## $ Grad_median <dbl> 75000, 60000, 65000, 47000, 57000, 75000,…
## $ Grad_P25 <int> 53000, 40000, 45000, 24500, 40600, 55000,…
## $ Grad_P75 <dbl> 110000, 89000, 100000, 85000, 83700, 1200…
## $ Nongrad_total <int> 86062, 461977, 179335, 37575, 53819, 8921…
## $ Nongrad_employed <int> 73607, 347166, 145597, 29738, 43163, 6967…
## $ Nongrad_full_time_year_round <int> 62435, 250596, 113579, 23249, 34231, 6063…
## $ Nongrad_unemployed <int> 3928, 25484, 7409, 1661, 3389, 518, 45519…
## $ Nongrad_unemployment_rate <dbl> 0.05066099, 0.06838588, 0.04842294, 0.052…
## $ Nongrad_median <dbl> 65000, 48000, 50000, 41600, 52000, 50000,…
## $ Nongrad_P25 <int> 47000, 34000, 35000, 29000, 36000, 34000,…
## $ Nongrad_P75 <dbl> 98000, 71000, 75000, 60000, 78000, 75000,…
## $ Grad_share <dbl> 0.09631963, 0.10441977, 0.11983686, 0.125…
## $ Grad_premium <dbl> 0.15384615, 0.25000000, 0.30000000, 0.129…
# Majors whose name contains "DATA" or "STATISTIC"; head() caps the
# result at its default of six rows.
string_match <- head(filter(csv_data, str_detect(Major, "DATA|STATISTIC")))
string_match
## Major_code Major
## 1 2101 COMPUTER PROGRAMMING AND DATA PROCESSING
## 2 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
## 3 3702 STATISTICS AND DECISION SCIENCE
## Major_category Grad_total Grad_sample_size Grad_employed
## 1 Computers & Mathematics 5611 98 4716
## 2 Business 41970 963 36227
## 3 Computers & Mathematics 21973 429 16979
## Grad_full_time_year_round Grad_unemployed Grad_unemployment_rate Grad_median
## 1 3981 119 0.02461220 85000
## 2 32121 1459 0.03871464 89000
## 3 14113 751 0.04235759 92000
## Grad_P25 Grad_P75 Nongrad_total Nongrad_employed Nongrad_full_time_year_round
## 1 56000 114000 28314 22024 18381
## 2 64500 116000 150110 129179 115177
## 3 64000 125000 22210 17024 13665
## Nongrad_unemployed Nongrad_unemployment_rate Nongrad_median Nongrad_P25
## 1 2222 0.09164398 60000 40000
## 2 5690 0.04218909 72000 50000
## 3 874 0.04883227 75000 45000
## Nongrad_P75 Grad_share Grad_premium
## 1 85000 0.1653943 0.4166667
## 2 100000 0.2185027 0.2361111
## 3 106000 0.4973180 0.2266667
[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"
Into a format like this:
c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Console printout of a character vector, kept verbatim as a single string
# (index markers such as [1] and the line breaks are part of the text).
str_fruits <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'

# Capture whatever sits between each pair of double quotes. Keying on the
# quotes (instead of "letters, one optional character, letters") keeps names
# with three or more words, hyphens, or a single letter in one piece.
# str_match_all() returns one matrix per input string; column 2 holds the
# capture group, i.e. the name without its surrounding quotes.
str_fruits_pattern <- str_match_all(str_fruits, '"([^"]+)"')[[1]][, 2]
str_fruits_pattern
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
# Rebuild the vector as R source text of the form c("a", "b", ...).
# The names are joined with `", "` and then wrapped in `c("` ... `")` using
# str_c(), which inserts no separator of its own. This yields the exact text
# directly, so no whitespace clean-up that depends on how the first or last
# name happens to be spelled is required.
newformat <- str_c(str_fruits_pattern, collapse = "\", \"")
newformat <- str_c("c(\"", newformat, "\")")
cat(newformat)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Sample words used to exercise the backreference patterns below.
test_string <- c(
  "aaa", "bbb", "cat", "doll", "aba", "cda",
  "kind", "noon", "moon", "byby", "axaxa", "racecar"
)
This pattern matches any single character that appears three times in a row (for example, "aaa").
# Three identical characters in a row (e.g. "aaa").
str_match(string = test_string, pattern = "(.)\\1\\1")
## [,1] [,2]
## [1,] "aaa" "a"
## [2,] "bbb" "b"
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] NA NA
## [11,] NA NA
## [12,] NA NA
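This pattern matches a pair of characters followed by the same pair in reverse order, i.e. one character, a second character repeated twice, then the first character again (for example, "noon").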
# A two-character pair followed by its mirror image (e.g. "noon").
str_match(string = test_string, pattern = "(.)(.)\\2\\1")
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] NA NA NA
## [4,] NA NA NA
## [5,] NA NA NA
## [6,] NA NA NA
## [7,] NA NA NA
## [8,] "noon" "n" "o"
## [9,] NA NA NA
## [10,] NA NA NA
## [11,] NA NA NA
## [12,] NA NA NA
This expression was given as a plain regular expression rather than as an R string. It matches any two characters that are immediately repeated in the same order (for example, "byby").
# A two-character sequence immediately repeated (e.g. "byby").
str_match(string = test_string, pattern = "(..)\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] "byby" "by"
## [11,] "axax" "ax"
## [12,] NA NA
This pattern matches a character, followed by any character, then the first character again, then any character, then the first character once more (for example, "axaxa").
# The same character at positions 1, 3 and 5 of a five-character
# stretch, with any characters in between (e.g. "axaxa").
str_match(string = test_string, pattern = "(.).\\1.\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] NA NA
## [11,] "axaxa" "a"
## [12,] NA NA
This pattern matches three characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order (for example, "racecar").
# Three characters, any filler, then those three reversed (e.g. "racecar").
str_match(string = test_string, pattern = "(.)(.)(.).*\\3\\2\\1")
## [,1] [,2] [,3] [,4]
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## [7,] NA NA NA NA
## [8,] NA NA NA NA
## [9,] NA NA NA NA
## [10,] NA NA NA NA
## [11,] NA NA NA NA
## [12,] "racecar" "r" "a" "c"
# Second set of sample words.
pr4_str <- c(
  "wow", "ada", "peep", "mama",
  "kaka", "eleven", "rarer"
)
# Anchored pattern: the first character must also be the last one.
str_match(string = pr4_str, pattern = "^(.).*\\1$")
## [,1] [,2]
## [1,] "wow" "w"
## [2,] "ada" "a"
## [3,] "peep" "p"
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] "rarer" "r"
# A two-character sequence that shows up again later in the string.
str_match(string = pr4_str, pattern = "(..).*\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] "mama" "ma"
## [5,] "kaka" "ka"
## [6,] NA NA
## [7,] NA NA
# One character occurring at least three times anywhere in the string.
str_match(string = pr4_str, pattern = "(.).*\\1.*\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] "eleve" "e"
## [7,] "rarer" "r"