##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Question 1
# Load the college-majors table from the assignment's GitHub repository.
# To work from a local copy instead, use:
#   cmajors <- read.csv("college-majors.csv")
cmajors <- read.csv(
  file = "https://raw.githubusercontent.com/tagensingh/SPS-DATA607-ASSIGNMENT-3/main/college-majors.csv"
)
# Show the class of the object that read.csv() returned.
class(cmajors)
## [1] "data.frame"
# Preview the data as a tibble (compact print with column types).
# as_tibble() is the dedicated coercion for an existing data frame; tibble()
# is the constructor for building a tibble from individual columns.
as_tibble(cmajors)
## # A tibble: 174 x 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
## 7 1106 SOIL SCIENCE Agriculture & Natural Resources
## 8 1199 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources
## 9 1302 FORESTRY Agriculture & Natural Resources
## 10 1303 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources
## # ... with 164 more rows
#_________________________________________________________
# Answer to Question 1: keep the rows whose Major contains
# "DATA" or "STATISTICS".
cmajors %>%
  dplyr::filter(grepl(pattern = "DATA|STATISTICS", x = Major))
## FOD1P Major Major_Category
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
#__________________________________________________________
Question 2
# Fruits containing an "l" that is followed somewhere later by an "e".
# ".*" also allows zero characters in between, so a bare "le" qualifies.
matches <- str_subset(string = fruit, pattern = "l.*e")
matches
## [1] "apple" "bell pepper" "bilberry"
## [4] "blackberry" "blood orange" "blueberry"
## [7] "cantaloupe" "chili pepper" "clementine"
## [10] "cloudberry" "elderberry" "huckleberry"
## [13] "lemon" "lime" "lychee"
## [16] "mulberry" "olive" "pineapple"
## [19] "purple mangosteen" "salal berry"
# Split `matches` by the stricter pattern "l.+e", which requires at least one
# character between the "l" and the "e". local() keeps the helper vector out
# of the global environment.
local({
  strict_matches <- str_subset(fruit, pattern = "l.+e")
  list(
    match = intersect(matches, strict_matches),
    no_match = setdiff(matches, strict_matches)
  )
})
## $match
## [1] "bell pepper" "bilberry" "blackberry"
## [4] "blood orange" "blueberry" "cantaloupe"
## [7] "chili pepper" "clementine" "cloudberry"
## [10] "elderberry" "huckleberry" "lime"
## [13] "lychee" "mulberry" "olive"
## [16] "purple mangosteen" "salal berry"
##
## $no_match
## [1] "apple" "lemon" "pineapple"
Question 3
1. (.)\1\1 - The same character appears three times in a row. E.g. “zzz”
2. "(.)(.)\\2\\1" - Two characters followed by the same two characters in reverse order. E.g. “noon” or “saas”.
3. (..)\1 - Any two characters, immediately repeated. E.g. “a1a1” or “U2U2”.
4. "(.).\\1.\\1" - A character, then any character, then the original character, then any character, then the original character again. E.g. “abaca”, “z0z0z”.
5. "(.)(.)(.).*\\3\\2\\1" - Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. “abctagessinghcba” or “123456321” or “a1a6789a1a”
Question 4
### Q4-1
#### Words from the "words" dataset that start and end with the same character
# "^(.)" captures the first character. The rest must either end with that
# character again (".*\\1$") or be at most one repeat of it ("\\1?$"), which
# covers one-character words such as "a".
words %>%
  str_subset(pattern = "^(.)((.*\\1$)|\\1?$)")
## [1] "a" "america" "area" "dad" "dead"
## [6] "depend" "educate" "else" "encourage" "engine"
## [11] "europe" "evidence" "example" "excuse" "exercise"
## [16] "expense" "experience" "eye" "health" "high"
## [21] "knock" "level" "local" "nation" "non"
## [26] "rather" "refer" "remember" "serious" "stairs"
## [31] "test" "tonight" "transport" "treat" "trust"
## [36] "window" "yesterday"
### Q4-2
#### A repeated pair of letters - "london" in this case ("on" occurs twice)
str_subset(string = "london", pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "london"
#### A repeated pair of letters - every match in the "words" dataset
# The two-letter group is captured once and must appear again later in the word.
words %>%
  str_subset(pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "appropriate" "church" "condition" "decide" "environment"
## [6] "london" "paragraph" "particular" "photograph" "prepare"
## [11] "pressure" "remember" "represent" "require" "sense"
## [16] "therefore" "understand" "whether"
### Q4-3
#### One letter appearing in at least three places - "nineteen" in this case
# Both "n" and "e" occur three times in "nineteen".
str_subset(string = "nineteen", pattern = "([a-z]).*\\1.*\\1")
## [1] "nineteen"
#### One letter appearing in at least three places - every match in the "words" dataset
# The captured letter must recur twice more, with anything in between.
words %>%
  str_subset(pattern = "([a-z]).*\\1.*\\1")
## [1] "appropriate" "available" "believe" "between" "business"
## [6] "degree" "difference" "discuss" "eleven" "environment"
## [11] "evidence" "exercise" "expense" "experience" "individual"
## [16] "paragraph" "receive" "remember" "represent" "telephone"
## [21] "therefore" "tomorrow"