library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(htmltab)
library(stringr)
# Source page: the FiveThirtyEight college-majors table.
file <- "https://projects.fivethirtyeight.com/mid-levels/college-majors/"

# Read the first HTML table on that page and inspect its structure.
table1 <- htmltab(doc = file, which = 1)
str(table1)
## 'data.frame': 173 obs. of 12 variables:
## $ V1 : chr "1" "2" "3" "4" ...
## $ MAJOR : chr "Petroleum Eng." "Mining & Mineral Eng." "Metallurgical Eng." "Naval Architecture & Marine Eng." ...
## $ # OF MAJORS : chr "2,339" "756" "856" "1,258" ...
## $ EARNINGS (x1,000) >> HIDDEN : chr "110" "75" "73" "70" ...
## $ EARNINGS (x1,000) >> MED. : chr "$110" "$75" "$73" "$70" ...
## $ EARNINGS (x1,000) >> HIDDEN : chr "95" "55" "50" "43" ...
## $ EARNINGS (x1,000) >> 25TH : chr "$95" "$55" "$50" "$43" ...
## $ EARNINGS (x1,000) >> HIDDEN : chr "125" "90" "105" "80" ...
## $ EARNINGS (x1,000) >> 75TH : chr "$125" "$90" "$105" "$80" ...
## $ % WORKING IN JOBS >> PART-TIME : chr "13" "23" "19" "12" ...
## $ % WORKING IN JOBS >> NON-COLLEGE: chr "19" "42" "28" "16" ...
## $ % WORKING IN JOBS >> LOW-PAYING : chr "10" "8" "0" "0" ...
# Keep the majors whose name mentions "statistics" or "data", ignoring case.
# Case-insensitivity is requested once through regex(ignore_case = TRUE)
# instead of repeating the inline (?i) flag inside every alternative.
majors <- table1$MAJOR
matching <- str_detect(majors, regex("statistics|data", ignore_case = TRUE))
majors[matching]
## [1] "Mgmt. Information Systems & Statistics"
## [2] "Statistics & Decision Science"
## [3] "Computer Programming & Data Processing"
**given data: [1] “bell pepper” “bilberry” “blackberry” “blood orange” [5] “blueberry” “cantaloupe” “chili pepper” “cloudberry” [9] “elderberry” “lime” “lychee” “mulberry” [13] “olive” “salal berry”
**required output: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
# Raw console printout of a character vector, index markers ([1], [5], ...)
# included, held as one multi-line string.
myword <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'

# Capture whatever sits between each pair of double quotes. Keying on the
# quotes (rather than on runs of word characters) keeps names of any length
# and any number of words intact, and never picks up the [n] index markers.
# Column 2 of each match matrix is the capture group, i.e. the unquoted name.
myword1 <- lapply(
  str_match_all(myword, '"([^"]+)"'),
  function(hits) hits[, 2]
)

# Flatten the one-element list into a plain character vector;
# dput(myword2) would print it as a c("...", ...) literal.
myword2 <- unlist(myword1)
class(myword2)
## [1] "character"
myword2
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
**(.)\1\1: the same character three times in a row, e.g. “aaa”
**“(.)(.)\2\1”: two characters followed by the same two characters in reverse order, e.g. “abba”
**(..)\1: any two characters immediately repeated, e.g. “abab”
**“(.).\1.\1”: a character that appears three times, with exactly one arbitrary character between consecutive occurrences, e.g. “abaca”
“(.)(.)(.).*\3\2\1”: three characters, then zero or more arbitrary characters, then the same three characters in reverse order, e.g. “abcxyzcba”
**matching-1: Start and end with the same character: “^(.)(.*\1)?$”
# Task 1: strings that start and end with the same character.
# `^` ties the captured character to the first position; without it the
# pattern only requires that the last character also occurs somewhere earlier,
# so "banana" would pass. The optional group lets one- and two-character
# strings such as "a" and "aa" qualify as well.
same_ends_pattern <- "^(.)(.*\\1)?$"
matching1 <- c("apa", "lol", "nope", "mango", "eighteen")
str_view(matching1, same_ends_pattern, match = TRUE)
**matching-2: Contain a repeated pair of letters: “([A-Za-z]{2}).*\1”
# Task 2: strings that contain a repeated pair of letters.
# `.*` (rather than `.+`) also accepts a pair repeated back to back, such as
# the "cucu" in "cucumber"; the character class restricts the pair to letters
# so digits or punctuation do not count.
# Stored under its own name so the task-1 sample vector is not overwritten.
repeated_pair_pattern <- "([A-Za-z]{2}).*\\1"
matching2 <- c("church", "ppeppero", "nope", "mango", "eighteen")
str_view(matching2, repeated_pair_pattern, match = TRUE)
**matching-3: Contain one letter repeated in at least three places: “([a-zA-Z]).*\1.*\1”
# Task 3: strings in which one letter occurs in at least three places.
# The letter is captured once and must reappear twice more, with any amount of
# text (including none) between occurrences. A pattern that fixes the gap to
# four characters and requires the later occurrences to be adjacent matches
# "eighteen" only by coincidence and misses "papaya" and "banana".
triple_letter_pattern <- "([A-Za-z]).*\\1.*\\1"
matching3 <- c("okra", "papaya", "banana", "mango", "eighteen")
str_view(matching3, triple_letter_pattern, match = TRUE)