## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Question 1

# Load the college-majors table from the copy hosted on GitHub.
# Offline alternative (local copy): read.csv("college-majors.csv")
cmajors <- utils::read.csv(
  file = "https://raw.githubusercontent.com/tagensingh/SPS-DATA607-ASSIGNMENT-3/main/college-majors.csv"
)

# Check the class of the freshly loaded object.
class(cmajors)
## [1] "data.frame"
# Convert with as_tibble(), the dedicated coercion function for an existing
# data frame; tibble() is the constructor for building one from columns.
as_tibble(cmajors)
## # A tibble: 174 x 3
##    FOD1P Major                                 Major_Category                 
##    <chr> <chr>                                 <chr>                          
##  1 1100  GENERAL AGRICULTURE                   Agriculture & Natural Resources
##  2 1101  AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
##  3 1102  AGRICULTURAL ECONOMICS                Agriculture & Natural Resources
##  4 1103  ANIMAL SCIENCES                       Agriculture & Natural Resources
##  5 1104  FOOD SCIENCE                          Agriculture & Natural Resources
##  6 1105  PLANT SCIENCE AND AGRONOMY            Agriculture & Natural Resources
##  7 1106  SOIL SCIENCE                          Agriculture & Natural Resources
##  8 1199  MISCELLANEOUS AGRICULTURE             Agriculture & Natural Resources
##  9 1302  FORESTRY                              Agriculture & Natural Resources
## 10 1303  NATURAL RESOURCES MANAGEMENT          Agriculture & Natural Resources
## # ... with 164 more rows
#_________________________________________________________

#**The Answer to Question 1**

# Keep only the majors whose name contains "DATA" or "STATISTICS".
cmajors %>%
  dplyr::filter(grepl(pattern = "DATA|STATISTICS", x = Major))
##   FOD1P                                         Major          Major_Category
## 1  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 2  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics
#__________________________________________________________

Question 2

# Fruit names containing an "l" that is followed, anywhere later, by an "e".
matches <- str_subset(string = fruit, pattern = "l.*e")
print(matches)
##  [1] "apple"             "bell pepper"       "bilberry"         
##  [4] "blackberry"        "blood orange"      "blueberry"        
##  [7] "cantaloupe"        "chili pepper"      "clementine"       
## [10] "cloudberry"        "elderberry"        "huckleberry"      
## [13] "lemon"             "lime"              "lychee"           
## [16] "mulberry"          "olive"             "pineapple"        
## [19] "purple mangosteen" "salal berry"
# Partition `matches` by whether at least one character sits between the "l"
# and the "e" ("l.+e"). The stricter subset is computed once and reused.
local({
  gapped <- str_subset(string = fruit, pattern = "l.+e")
  list(
    match = intersect(matches, gapped),
    no_match = setdiff(matches, gapped)
  )
})
## $match
##  [1] "bell pepper"       "bilberry"          "blackberry"       
##  [4] "blood orange"      "blueberry"         "cantaloupe"       
##  [7] "chili pepper"      "clementine"        "cloudberry"       
## [10] "elderberry"        "huckleberry"       "lime"             
## [13] "lychee"            "mulberry"          "olive"            
## [16] "purple mangosteen" "salal berry"      
## 
## $no_match
## [1] "apple"     "lemon"     "pineapple"

Question 3

1. (.)\1\1 - The same character appears three times in a row. E.g. “zzz”

2. “(.)(.)\2\1” - This is a pair of characters followed by the same pair of characters in reversed order. E.g. “noon” or “saas”.

3. (..)\1 - This is a pair of characters immediately repeated. E.g. “a1a1” or “U2U2”.

4. “(.).\1.\1” - A character followed by any character, then the original character, then any character, then the original character again. E.g. “abaca”, “z0z0z”.

5. "(.)(.)(.).*\3\2\1" - Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. “abctagessinghcba” or “123456321” or “a1a6789a1a”

Question 4

### Q4-1

#### Words that start and end with the same character, from the "words" dataset

# "^(.)" captures the first character. The optional group "(.*\\1)?" requires
# the word to end in that same character; when the group is skipped, one-letter
# words such as "a" match as well.
str_subset(words, "^(.)(.*\\1)?$")
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"
### Q4-2

#### A repeated pair of letters - "london" in this case

# Two consecutive letters are captured and must occur again later in the string.
str_subset(string = "london", pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "london"
#### A repeated pair of letters - from the "words" dataset.

# Same pattern as the single-word check: a two-letter pair that occurs again.
words %>%
  str_subset(pattern = "([A-Za-z][A-Za-z]).*\\1")
##  [1] "appropriate" "church"      "condition"   "decide"      "environment"
##  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
## [11] "pressure"    "remember"    "represent"   "require"     "sense"      
## [16] "therefore"   "understand"  "whether"
### Q4-3

#### One letter occurring in at least three places - "nineteen" in this case

# A lowercase letter is captured, then must be found two more times later on.
"nineteen" %>%
  str_subset(pattern = "([a-z]).*\\1.*\\1")
## [1] "nineteen"
#### One letter occurring in at least three places - from the "words" dataset.

# Same pattern as the single-word check, applied to every entry of `words`.
str_subset(string = words, pattern = "([a-z]).*\\1.*\\1")
##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
## [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
## [21] "therefore"   "tomorrow"