library(dplyr)
library(downloader)
library(stringr)
library(htmlTable)
# Source data: FiveThirtyEight's list of college majors (raw CSV on GitHub).
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
# Local file name; the download is written to the current working directory.
download_file <- "majors_list.csv"
downloader::download(url, download_file)

# Load the CSV into a data frame, keeping text columns as character vectors.
majors_list <- read.csv(
  download_file,
  header = TRUE,
  stringsAsFactors = FALSE
)

# Keep the majors whose name contains any of the keywords. The keywords are
# joined with "|" so the regular expression matches either one; their order
# in the vector therefore does not matter.
keywords <- c("DATA", "STATISTICS")
keyword_regex <- paste(keywords, collapse = "|")
has_keyword <- str_detect(majors_list[["Major"]], keyword_regex)
filtered_majors <- majors_list[has_keyword, ]

# Render the matching rows as an HTML table.
htmlTable(filtered_majors)
| | FOD1P | Major | Major_Category |
|---|---|---|---|
| 44 | 6212 | MANAGEMENT INFORMATION SYSTEMS AND STATISTICS | Business |
| 52 | 2101 | COMPUTER PROGRAMMING AND DATA PROCESSING | Computers & Mathematics |
| 59 | 3702 | STATISTICS AND DECISION SCIENCE | Computers & Mathematics |
# Console printout of a character vector, including the "[n]" index markers.
strdata <- "[1] 'bell pepper' 'bilberry' 'blackberry' 'blood orange' [5] 'blueberry' 'cantaloupe' 'chili pepper' 'cloudberry' [9] 'elderberry' 'lime' 'lychee' 'mulberry' [13] 'olive' 'salal berry'"

# Reduce the printout to the items separated by "\",\"", one clean-up per step.
quoted_items <- strdata %>%
  str_replace_all("([[0-9]])", "") %>%         # remove every digit
  str_replace_all("\\[\\]", "") %>%            # remove the emptied "[]" markers
  str_replace_all("[[:punct:]]+", ",") %>%     # runs of punctuation -> ","
  str_replace_all(",[[:space:]]+,", ",") %>%   # ", ," between items -> ","
  str_replace_all(" ,", "") %>%                # drop space-comma pairs
  str_replace_all(",$", "") %>%                # drop the trailing comma
  str_replace_all(",", "\",\"") %>%            # separate items with ","
  str_replace_all("\'", "\"")                  # any single quote -> double quote

# Wrap the items so the text reads as an R c("...", "...") call.
strdata8 <- str_c("c(\"", quoted_items, "\")")

# Render the resulting string as an HTML table.
htmlTable(strdata8)
| c(“bell pepper”,“bilberry”,“blackberry”,“blood orange”,“blueberry”,“cantaloupe”,“chili pepper”,“cloudberry”,“elderberry”,“lime”,“lychee”,“mulberry”,“olive”,“salal berry”) |
`(.)\1\1`: `(.)` matches any character except line breaks (`\n`, and usually `\r`) and captures it as group 1.
`\1` is a back-reference to the subexpression captured by the first pair of parentheses, so it must match that same character again; the second `\1` repeats it once more. In total, the pattern matches the same character three times in a row.
This would match any of the following strings: “aaa” “bbb” “ccc”
`(.)(.)\2\1`: each `.` matches any character except line breaks (`\n`, and usually `\r`).
The two groups at the start capture the first two characters.
Back-references refer to previously matched subexpressions grouped by `()`: `\2` repeats the second captured character and is followed by `\1`, which repeats the first.
This would match any of the following strings: “abba” “poop” “cddc”
`(..)\1`: `(..)` matches any two characters, and `\1` requires the same two characters to repeat immediately afterwards.
Given fruits like “salal berry”, “banana”,“papaya”, str_match(fruits,above_pattern) will return
[1,] “alal” “al” [2,] “anan” “an” [3,] “papa” “pa”
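`(.).\1.\1`: `(.)` captures the first character, and the following `.` lets the second character be any character.
`\1` then repeats the captured character, and `.\1` repeats it once more after another wildcard character, so the captured character appears in positions 1, 3 and 5 of the match.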
Given fruits like “salal berry”, “banana”,“papaya”, str_match(fruits,above_pattern) will return
[2,] “anana” “a” [3,] “apaya” “a”
`(.)(.)(.).*\3\2\1`: the three groups capture the first three characters, `.*` allows any number of characters after them, and the match must end with the third, then the second, then the first captured character. Example: “tomapotatomot”
[Result] “tomapotatomot” “t” “o” “m”
# Sample words used to try out the back-reference patterns.
fruits <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry", "banana", "apapaya",
  "tomapotatomotot", "church"
)

# Candidate patterns; exactly one is left active at a time.
# pattern <- "(.)\\1\\1"
# pattern <- "(.)(.)\\2\\1"
# pattern <- "(..)\\1"
# pattern <- "(.).\\1.\\1"
# pattern <- "(.)(.)(.).*\\3\\2\\1"
# pattern <- "^(.).*\\1$"
# pattern <- "(..).*\\1"
pattern <- "([a-z]).*\\1.*\\1"
# pattern <- "(.)\\..\\..\\.."

# One row per word: the complete match first, then the captured group
# (NA where the word does not match).
fruits %>%
  str_match(pattern)
## [,1] [,2]
## [1,] "ell peppe" "e"
## [2,] NA NA
## [3,] NA NA
## [4,] "ood o" "o"
## [5,] NA NA
## [6,] NA NA
## [7,] "pepp" "p"
## [8,] NA NA
## [9,] "elderbe" "e"
## [10,] NA NA
## [11,] NA NA
## [12,] NA NA
## [13,] NA NA
## [14,] NA NA
## [15,] "anana" "a"
## [16,] "apapaya" "a"
## [17,] "tomapotatomotot" "t"
## [18,] NA NA
# Words that start and end with the same character: group 1 captures the
# first character and the back-reference must be the last one. The backslash
# is doubled because "\1" in an R string literal is an octal escape (U+0001),
# not a regex back-reference.
pattern <- "^(.).*\\1$"
[16,] “apapaya” “a”
[17,] “tomapotatomotot” “t”
# Words containing a repeated pair of characters: group 1 captures two
# characters and the back-reference finds them again later in the word. The
# backslash is doubled because "\1" in an R string literal is an octal escape
# (U+0001), not a regex back-reference.
pattern <- "(..).*\\1"
[18,] “church” “ch”
# Words in which one lowercase letter occurs at least three times: group 1
# captures the letter and each ".*\\1" finds it again further along. Uses
# straight quotes and doubled backslashes so the line is valid R and "\\1"
# reaches the regex engine as a back-reference.
pattern <- "([a-z]).*\\1.*\\1"