## Overview: This assignment uses regular expressions in R.
# Location of the college-majors list (raw CSV hosted on GitHub).
git_hub_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
# Download the CSV and parse it into a data frame.
major_data <- read_csv(file = git_hub_url)
## Parsed with column specification:
## cols(
## FOD1P = col_character(),
## Major = col_character(),
## Major_Category = col_character()
## )
# Give the three columns descriptive names (assigned by position), then
# preview the first six rows.
names(major_data) <- c("Field_of_Degree_ID", "Major", "Major_Category")
head(major_data, n = 6L)
## # A tibble: 6 x 3
## Field_of_Degree_ID Major Major_Category
## <chr> <chr> <chr>
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Reso…
## 2 1101 AGRICULTURE PRODUCTION AND MAN… Agriculture & Natural Reso…
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Reso…
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Reso…
## 5 1104 FOOD SCIENCE Agriculture & Natural Reso…
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Reso…
# Pull the Major column out of the data as a plain vector.
majors <- major_data[["Major"]]
# Keep the majors that mention "data" or "science", in any letter case.
majors[grepl("data|science", majors, ignore.case = TRUE)]
## [1] "ANIMAL SCIENCES"
## [2] "FOOD SCIENCE"
## [3] "PLANT SCIENCE AND AGRONOMY"
## [4] "SOIL SCIENCE"
## [5] "ENVIRONMENTAL SCIENCE"
## [6] "BIOCHEMICAL SCIENCES"
## [7] "NEUROSCIENCE"
## [8] "COGNITIVE SCIENCE AND BIOPSYCHOLOGY"
## [9] "ACTUARIAL SCIENCE"
## [10] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [11] "COMPUTER SCIENCE"
## [12] "INFORMATION SCIENCES"
## [13] "STATISTICS AND DECISION SCIENCE"
## [14] "MATHEMATICS AND COMPUTER SCIENCE"
## [15] "SCIENCE AND COMPUTER TEACHER EDUCATION"
## [16] "SOCIAL SCIENCE OR HISTORY TEACHER EDUCATION"
## [17] "LIBRARY SCIENCE"
## [18] "ENGINEERING MECHANICS PHYSICS AND SCIENCE"
## [19] "MATERIALS ENGINEERING AND MATERIALS SCIENCE"
## [20] "MATERIALS SCIENCE"
## [21] "NUTRITION SCIENCES"
## [22] "COMMUNICATION DISORDERS SCIENCES AND SERVICES"
## [23] "PHARMACY PHARMACEUTICAL SCIENCES AND ADMINISTRATION"
## [24] "FAMILY AND CONSUMER SCIENCES"
## [25] "TRANSPORTATION SCIENCES AND TECHNOLOGIES"
## [26] "PHYSICAL SCIENCES"
## [27] "ATMOSPHERIC SCIENCES AND METEOROLOGY"
## [28] "GEOLOGY AND EARTH SCIENCE"
## [29] "GEOSCIENCES"
## [30] "MULTI-DISCIPLINARY OR GENERAL SCIENCE"
## [31] "INTERDISCIPLINARY SOCIAL SCIENCES"
## [32] "GENERAL SOCIAL SCIENCES"
## [33] "POLITICAL SCIENCE AND GOVERNMENT"
## [34] "MISCELLANEOUS SOCIAL SCIENCES"
Write code that transforms the data below:
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
Into a format like this:
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
# Console printout of a character vector, pasted in as one multi-line string.
food_raw <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# Extract every double-quoted token. Matching the tokens directly means the
# "[n]" index prefixes, line breaks and spacing never need separate clean-up,
# and digits or brackets inside an item would be left intact.
quoted <- regmatches(food_raw, gregexpr('"[^"]*"', food_raw))[[1]]
# Strip the surrounding quote characters so each element is the bare name
# (no quote characters embedded in the values).
food <- substr(quoted, 2L, nchar(quoted) - 1L)
# Display the vector in the requested c("...", "...") form.
writeLines(paste0("c(", paste0('"', food, '"', collapse = ", "), ")"))
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
#3 Describe, in words, what these expressions will match:
“(.)\1\1”
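This will match any single character followed by two copies of the character that \1 denotes in an R string. Because the pattern is written as the R string "(.)\1\1" with single backslashes, R reads \1 as an octal escape (the control character U+0001) before the regex engine ever sees it, so it is not a backreference. Written with doubled backslashes, "(.)\\1\\1" would instead match the same character three times in a row (e.g. "aaa").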
Examples: “a\1\1”,“2\1\1”
# In an R string literal "\1" is an octal escape (the control character
# U+0001), so this pattern is "(.)" followed by two such characters.
octal_triple <- "(.)\1\1"
string1 <- "a\1\1"
str_view(string1, octal_triple)
string2 <- "2\1\1"
str_view(string2, octal_triple)
“(..)\1”
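This will match any two characters followed by the character that \1 denotes in an R string (the control character U+0001), because the single backslash is consumed by R before the regex engine sees the pattern. Written as "(..)\\1", it would instead match any pair of characters immediately repeated (e.g. "abab").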
Examples “8d\1”, “ee\1”
# "\1" is again an octal escape in the R string, so the pattern is any two
# characters followed by one U+0001 character.
octal_pair <- "(..)\1"
string1 <- "8d\1"
str_view(string1, octal_pair)
string2 <- "ee\1"
str_view(string2, octal_pair)
“(.).\1.\1”
This will match any character, followed by any character, followed by the first character again, followed by any character, followed by the first character again.
The portion of the string that matches this regex must have the same 1st, 3rd and 5th character.
# Backreference pattern: the 1st, 3rd and 5th characters of the match are
# the same; the 2nd and 4th can be anything.
same_135 <- "(.).\\1.\\1"
string1 <- "abaDa"
str_view(string1, same_135)
string2 <- "YabadabaDoo"
str_view(string2, same_135)
"(.)(.)(.).*\\3\\2\\1"
The first three characters can be anything,
followed by any sequence of zero or more characters (they do not have to be the same character),
followed by the third character, the second character, and the first character, i.e. the first three characters in reverse order.
# Three captured characters, any filler, then the same three in reverse order.
mirrored_ends <- "(.)(.)(.).*\\3\\2\\1"
string1 <- "abcddddddddddddddddcba"
str_view(string1, mirrored_ends)
string2 <- "xyzuzyx"
str_view(string2, mirrored_ends)
#4 Construct regular expressions to match words that: Start and end with the same character.
# Words that start and end with the same character.
# The group "(.*\\1)?" is optional, so a one-character word (whose only
# character is both first and last) and a two-letter word such as "aa" match
# as well; a mandatory middle like ".+" would reject both.
same_ends <- "^(.)(.*\\1)?$"
test1 <- "deed"
str_view(test1, same_ends)
test2 <- "googling"
str_view(test2, same_ends)
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# Words containing a repeated pair of characters.
# ".*" allows zero characters between the two occurrences, so back-to-back
# repeats such as the "an" in "banana" are found; ".+" would require at least
# one character in between and miss them.
repeated_pair <- "(..).*\\1"
test1 <- "church"
str_view(test1, repeated_pair)
test2 <- "sheesh"
str_view(test2, repeated_pair)
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Words in which one character occurs in at least three places.
# Both backreferences are mandatory, so the captured character must appear
# two more times after its first occurrence, with anything (or nothing) in
# between. An optional backreference ("\\1?") would let any string of two or
# more characters match.
three_times <- "(.).*\\1.*\\1"
test1 <- "melee"
str_view(test1, three_times)
test2 <- "eleven"
str_view(test2, three_times)
test3 <- "anacondas"
str_view(test3, three_times)
test4 <- "telephone"
str_view(test4, three_times)
test5 <- "googling"
str_view(test5, three_times)
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.