library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 1.0.1
## v tibble 3.1.6 v dplyr 1.0.10
## v tidyr 1.2.0 v stringr 1.5.0
## v readr 2.1.2 v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dbplyr)
##
## Attaching package: 'dbplyr'
##
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(stringr)
library(stringi)
## Warning: package 'stringi' was built under R version 4.0.5
Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset (the URL is given in the code below), provide code that identifies the majors that contain either “DATA” or “STATISTICS”.
## Load the majors list from the fivethirtyeight GitHub repository,
## then preview its first rows to check the column names
majors_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
readfile <- read.csv(file = majors_url, header = TRUE)
readfile <- data.frame(readfile)
head(readfile)
## FOD1P Major Major_Category
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
## Keep the majors whose name contains "DATA" or "STATISTICS".
## value = TRUE makes grep() return the matching strings, not their positions.
data_stat_df <- grep(
  pattern = "DATA|STATISTICS",
  x = readfile[["Major"]],
  value = TRUE
)
head(data_stat_df)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
Write code that transforms the data below:
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
Into a format like this:
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”,
“chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”,
“mulberry”, “olive”, “salal berry”)
## Input: the console printout of the fruit vector, stored as one raw string
new_df <- '
[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry" 
[9] "elderberry" "lime" "lychee" "mulberry" 
[13] "olive" "salal berry"
'
new_df
## [1] "\n[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n[13] \"olive\" \"salal berry\"\n"
## Extract every double-quoted item. The [n] index markers and the line
## breaks sit outside the quotes, so they are discarded without extra cleanup.
## (Deleting the quotes first would lose the item boundaries, because some
## fruit names contain a space.)
quoted_items <- regmatches(new_df, gregexpr('"[^"]+"', new_df))[[1]]
## Strip the surrounding quotes to get one vector element per fruit
new_list <- gsub('"', "", quoted_items, fixed = TRUE)
## Render the vector in the requested c("...", "...") format
new_list_code <- paste0("c(", paste0('"', new_list, '"', collapse = ", "), ")")
writeLines(new_list_code)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
Describe, in words, what these expressions will match:
* `(.)\1\1`: matches any single character that appears three times in a row (e.g. the “sss” in “goddesssship”).
## Sample words; some contain the same character three times in a row
test_word <- c(
  "goddesssship", "juice", "skullllike", "skull", "hostesssship"
)
## Keep only the words in which the pattern matches somewhere
str_subset(string = test_word, pattern = "(.)\\1\\1")
## [1] "goddesssship" "skullllike" "hostesssship"
* `(.)(.)\2\1`: matches a pair of characters immediately followed by the same pair in reverse order (e.g. “abba”).
## Sample words; "abba" is two characters followed by their mirror image
test_word2 <- c(
  "abba", "music", "fruits", "candy", "choco"
)
## Keep only the words in which the pattern matches somewhere
str_subset(string = test_word2, pattern = "(.)(.)\\2\\1")
## [1] "abba"
* `(..)\1`: matches any two characters immediately repeated (e.g. the “anan” in “banana”).
## Sample words; "banana" contains the two-character run "an" twice in a row
test_word3 <- c(
  "banana", "sausage", "music", "silver", "muscle"
)
## Keep only the words in which the pattern matches somewhere
str_subset(string = test_word3, pattern = "(..)\\1")
## [1] "banana"
* `(.).\1.\1`: matches a character that occurs three times with exactly one arbitrary character between each occurrence (e.g. the “anana” in “banana” or the “abama” in “Alabama”).
## Sample words; "banana" and "Alabama" repeat "a" at every other position
test_word4 <- c(
  "banana", "Alabama", "sushi", "sugar", "bowl"
)
## Keep only the words in which the pattern matches somewhere
str_subset(string = test_word4, pattern = "(.).\\1.\\1")
## [1] "banana" "Alabama"
* `(.)(.)(.).*\3\2\1`: matches three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order (e.g. “xyz1zyx” or “doggod”).
## Sample words; the first three start with three characters that reappear
## later in reverse order
test_word5 <- c(
  "xyz1zyx", "doggod", "sworddrows", "music", "mussle"
)
## ".*" allows any number of characters (including none) between the
## three captured characters and their reversed copy
str_subset(string = test_word5, pattern = "(.)(.)(.).*\\3\\2\\1")
## [1] "xyz1zyx" "doggod" "sworddrows"
Construct regular expressions to match words that:
* Start and end with the same character.
## Sample words; "dad" and "razor" begin and end with the same character
test_word6 <- c(
  "dad", "music", "razor", "banana", "burritos"
)
## The first alternative covers words of two or more characters; the
## second ("\\1?$") also lets a one-character word count as a match
str_subset(string = test_word6, pattern = "^(.)((.*\\1$)|\\1?$)")
## [1] "dad" "razor"
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
## Sample words; "church", "banana" and "jujube" each contain some
## two-letter pair twice
test_word7 <- c(
  "church", "monster", "banana", "jujube", "star"
)
## Capture two letters, then require the same pair again anywhere later
str_subset(string = test_word7, pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "church" "banana" "jujube"
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
## Sample words; "eleven" and "banana" each contain one letter three times
test_word8 <- c("eleven", "balloon", "expression", "repeat", "banana")
## Capture one letter, then require it twice more anywhere later in the word.
## The class is [A-Za-z] (not just [a-z]) so that upper-case words are also
## handled, matching the letter class used for the repeated-pair pattern.
## The backreference stays case-sensitive: "E" and "e" are different letters.
str_subset(test_word8, "([A-Za-z]).*\\1.*\\1")
## [1] "eleven" "banana"