#1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

majors <- read.csv(url('https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv'), stringsAsFactors = F)
str(majors)
## 'data.frame':    174 obs. of  3 variables:
##  $ FOD1P         : chr  "1100" "1101" "1102" "1103" ...
##  $ Major         : chr  "GENERAL AGRICULTURE" "AGRICULTURE PRODUCTION AND MANAGEMENT" "AGRICULTURAL ECONOMICS" "ANIMAL SCIENCES" ...
##  $ Major_Category: chr  "Agriculture & Natural Resources" "Agriculture & Natural Resources" "Agriculture & Natural Resources" "Agriculture & Natural Resources" ...
grep(pattern = 'STATISTICS|DATA', majors$Major, value = TRUE, ignore.case = TRUE)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"
majors$Major[grepl("DATA", majors$Major)]
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"
majors$Major[grepl("STATISTICS", majors$Major)]
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "STATISTICS AND DECISION SCIENCE"

#2 Write code that transforms the data below: [1] “bell pepper” “bilberry” “blackberry” “blood orange” [5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry” Into a format like this: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

Source_data = '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"

[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  

[9] "elderberry"   "lime"         "lychee"       "mulberry"    

[13] "olive"        "salal berry"'


library(stringr)
healthy <- str_extract_all(Source_data, '[:alpha:]+\\s[:alpha:]+|[:alpha:]+')
unlist(healthy)
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"
cat(paste0("c(",paste0(sep = '"',healthy, collapse = ', ', sep='"'),paste(")")))
## c("c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")")

#The two exercises below are taken from R for Data Science, 14.3.5.1 in the on-line version: #3 Describe, in words, what these expressions will match: (.)\1\1 = ‘(.)’ matches any one character and the ‘/1’ means the same character repeated a second time and the next ‘/1’ means the same character appearing three times in a row such as ‘bbb’ (Are two slashes not needed?)

“(.)(.)\2\1” = ‘(.)(.)’ represents a pair of characters, and //2//1 means the next pair of characters match it exactly with case sensitivity such as hiih

(..)\1 = ‘(..)’shows two characters, and’/1’ represents that those two characters are repeated such as hihi (Are two slashes not needed?)

“(.).\1.\1” = the 1st, 3rd, and 5th ch match. Character A, followed by any character, Character A, followed by any character, Character A, such as 23242.

“(.)(.)(.).*\3\2\1” = First 3 characters match the last 3 characters in rever order, with any characters in between such as 12398321

#4 Construct regular expressions to match words that: Start and end with the same character.

str_subset(words, "^(.)((.*\\1$)|\\1?$)")
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

str_subset(words, "([A-Za-z][A-Za-z]).*\\1")
##  [1] "appropriate" "church"      "condition"   "decide"      "environment"
##  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
## [11] "pressure"    "remember"    "represent"   "require"     "sense"      
## [16] "therefore"   "understand"  "whether"

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

str_subset(words, "([a-z]).*\\1.*\\1")
##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
## [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
## [21] "therefore"   "tomorrow"

#Latex error-> tinytex

```