library(openintro)
library(stringr)
library(RCurl)
library(tidyverse)
library(dplyr)

Exercise 1:

Provide code that identifies the majors with “Data” or “Statistics” in the name.

# Download the college-majors CSV as one text string via RCurl, then parse
# that text into a data frame.
majors_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
x <- getURL(majors_url)
college_data <- read.csv(text = x)

# Keep only the majors whose name contains DATA or STATISTICS.
str_subset(college_data$Major, "DATA|STATISTICS")
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [2] "STATISTICS AND DECISION SCIENCE"              
## [3] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"

Exercise 2:

Write code that transforms the printed fruit vector below into a single comma-separated string of quoted fruit names:

# Raw text: the console printout of a character vector of fruit names.
dat <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"

[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  

[9] "elderberry"   "lime"         "lychee"       "mulberry"    

[13] "olive"        "salal berry"'

# Extract each fruit name: a run of lowercase letters, optionally followed by
# a single space and a second run, so two-word names stay together.
fruit_names <- unlist(str_extract_all(dat, "[a-z]+ ?[a-z]+"))

# Wrap each whole name in double quotes *before* collapsing with ", ".
# Inserting a quote at every word boundary (\b) instead would split two-word
# names, e.g. "bell pepper" would become "bell" "pepper".
fruit_dat <- str_c('"', fruit_names, '"', collapse = ", ")

# Print as a separate step: cat() returns NULL, so assigning its result would
# throw away the transformed string.
cat(fruit_dat)
## "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry"

Exercise 3

Describe what each of the following patterns matches.

part 1

“(.)\1\1”

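Written this way in an R string, "\1" is not a backreference: R reads it as an octal escape for the control character U+0001, so this pattern matches any single character followed by two U+0001 characters. It looks like a backreference, but you need to escape the backslash ("(.)\\1\\1") to use it as one; the escaped form matches the same character three times in a row (e.g. "aaa").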

Example:

# Demo strings; "\1" inside an R string literal is the character U+0001.
x <- c("(.)\1\1", "check\1\1", "aaa", "b)\1\1")

# Single backslashes on purpose: this is the un-escaped form being discussed.
pattern <- "(.)\1\1"

# Highlight every match of the pattern in each demo string.
str_view_all(string = x, pattern = pattern)

part 2

“(.)(.)\2\1”

This matches any two characters followed by the same two characters in reverse order, i.e. a four-character palindrome such as "abba" or "elle".

Example:

# Two captured characters, then the second one again, then the first one again.
pattern <- "(.)(.)\\2\\1"

# Show only the entries of `words` that contain a match.
str_view(string = words, pattern = pattern, match = TRUE)

part 3

“(..)\1”

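Because the backslash is not escaped, "\1" here is the control character U+0001 rather than a backreference, so this pattern matches any two characters followed by one U+0001 character. With the backslash escaped ("(..)\\1") it would match a pair of characters that is immediately repeated (e.g. "anan" in "banana").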

Example:

# Demo strings; "\1" inside an R string literal is the character U+0001.
x <- c("(.)\1\1", "check\1\1", "aaa", "b)\1\1")

# Single backslash on purpose: this is the un-escaped form being discussed.
pattern <- "(..)\1"

# Highlight every match of the pattern in each demo string.
str_view_all(string = x, pattern = pattern)

part 4

“(.).\1.\1”

This matches a character that appears three times with exactly one arbitrary character between each occurrence (e.g. "abaca", or "eleve" in "eleven").

Example:

# One captured character, repeated twice more with any single character
# between the occurrences.
pattern <- "(.).\\1.\\1"

# Show only the entries of `words` that contain a match.
str_view(string = words, pattern = pattern, match = TRUE)

part 5

"(.)(.)(.).*\3\2\1"

This matches any three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order (e.g. "abcxyzcba").

Example:

# Three captured characters, any run of characters, then the three captures
# again in reverse order.
pattern <- "(.)(.)(.).*\\3\\2\\1"

# Show only the entries of `words` that contain a match.
str_view(string = words, pattern = pattern, match = TRUE)

Exercise 4:

Construct regular expressions that meet each of the following requirements.

part 1

Start and end with the same character.

# Capture the first character (^(.)) and require the same character again as
# the last one (\\1$), with anything in between.
pattern <- "^(.).*\\1$"

# Example: apply the pattern to the first 300 entries of `words`.
candidates <- words[1:300]
result <- unlist(str_subset(candidates, pattern))

result
##  [1] "america"    "area"       "dad"        "dead"       "depend"    
##  [6] "educate"    "else"       "encourage"  "engine"     "europe"    
## [11] "evidence"   "example"    "excuse"     "exercise"   "expense"   
## [16] "experience" "eye"

part 2

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# Capture any two adjacent characters and require that same pair to occur
# again later in the string.
pattern <- "(..).*\\1"

# Example: apply the pattern to the first 300 entries of `words`.
candidates <- words[1:300]
result <- unlist(str_subset(candidates, pattern))

result
## [1] "appropriate" "church"      "condition"   "decide"      "environment"

part 3

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Capture one character and require it to occur at least twice more, with any
# run of characters allowed between the occurrences.
pattern <- "(.).*\\1.*\\1"

# Example: apply the pattern to the first 300 entries of `words`.
candidates <- words[1:300]
result <- unlist(str_subset(candidates, pattern))

result
##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"
LS0tDQp0aXRsZTogIkRhdGEgQ29sbGVjdGlvbiBBc3NpZ25tZW50IDMiDQphdXRob3I6ICJKYWNrIFdyaWdodCINCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCINCm91dHB1dDoNCiAgb3BlbmludHJvOjpsYWJfcmVwb3J0OiBkZWZhdWx0DQogIGh0bWxfZG9jdW1lbnQ6IGRlZmF1bHQNCi0tLQ0KDQpgYGB7ciBsb2FkLXBhY2thZ2VzLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShvcGVuaW50cm8pDQpsaWJyYXJ5KHN0cmluZ3IpDQpsaWJyYXJ5KFJDdXJsKQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KGRwbHlyKQ0KYGBgDQoNCiMjIyAgRXhlcmNpc2UgMToNCg0KUHJvdmlkZSBDb2RlIHRoYXQgSWRlbnRpZmllcyBNYWpvcnMgd2l0aCAiRGF0YSIgb3IgIlN0YXRpc3RpY3MiIGluIHRoZSBuYW1lDQoNCmBgYHtyLCBtZXNzYWdlPUZBTFNFfQ0KDQojUmVhZCBkYXRhIGludG8gUiB1c2luZyBSQ3VybA0KeCA8LSBnZXRVUkwoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9maXZldGhpcnR5ZWlnaHQvZGF0YS9tYXN0ZXIvY29sbGVnZS1tYWpvcnMvYWxsLWFnZXMuY3N2IikNCmNvbGxlZ2VfZGF0YSA8LSByZWFkLmNzdih0ZXh0ID0geCkNCg0KDQpzdHJfc3Vic2V0KGNvbGxlZ2VfZGF0YSRNYWpvciwoIkRBVEF8U1RBVElTVElDUyIpKQ0KDQoNCg0KYGBgDQoNCg0KIyMjIEV4ZXJjaXNlIDI6DQoNCg0Kd3JpdGUgY29kZSB0aGF0IHRyYW5zZm9ybXMgZGF0YToNCg0KYGBge3J9DQoNCiNyYXcgdGV4dA0KZGF0IDwtICdbMV0gImJlbGwgcGVwcGVyIiAgImJpbGJlcnJ5IiAgICAgImJsYWNrYmVycnkiICAgImJsb29kIG9yYW5nZSINCg0KWzVdICJibHVlYmVycnkiICAgICJjYW50YWxvdXBlIiAgICJjaGlsaSBwZXBwZXIiICJjbG91ZGJlcnJ5IiAgDQoNCls5XSAiZWxkZXJiZXJyeSIgICAibGltZSIgICAgICAgICAibHljaGVlIiAgICAgICAibXVsYmVycnkiICAgIA0KDQpbMTNdICJvbGl2ZSIgICAgICAgICJzYWxhbCBiZXJyeSInDQoNCiNleHRyYWN0aW5nIHRoZSBwaHJhc2VzDQpmcnVpdF9kYXQ8LXN0cl9leHRyYWN0X2FsbChkYXQsIlthLXpdKyA/W2Etel0rIiklPiV1bmxpc3QoKQ0KDQojY29sbGFwc2luZyB3aXRoICcsICcNCmZydWl0X2RhdDwtc3RyX2MoZnJ1aXRfZGF0LCBjb2xsYXBzZT0nLCAnKQ0KDQoNCiNhZGRpbmcgaW4gcXVvdGVzDQpmcnVpdF9kYXQ8LWNhdChnc3ViKCJcXGIiLCAnIicsIGZydWl0X2RhdCwgcGVybD1UKSkNCg0KDQoNCg0KDQpgYGANCg0KDQojIyMgRXhlcmNpc2UgMw0KDQpkZXNjcmliZSB0aGUgZm9sbG93aW5nIHBhdHRlcm4NCg0KcGFydCAxDQoNCiIoLilcMVwxIg0KDQpUaGlzIHdpbGwgT05MWSBtYXRjaCAiKGFueSBjaGFyYWN0ZXIgb3IgIi4iKVwxXDEiLiBJdCBsb29rcyBsaWtlIGEgYmFja3JlZmVyZW5jZSwgYnV0IHlvdSBuZWVkIHRvIGVzY2FwZSB0aGUgIlwiIGlmIHlvdSB3YW50IHRvIHVzZSBpdCBhcyBhIGJhY2tyZWZlcmVuY2UuDQoNCkV4YW1wbGU6
DQpgYGB7cn0NCg0KeDwtYygiKC4pXDFcMSIsImNoZWNrXDFcMSIsImFhYSIsImIpXDFcMSIpDQoNCnBhdHRlcm48LSIoLilcMVwxIg0KDQpzdHJfdmlld19hbGwoeCxwYXR0ZXJuKQ0KDQoNCg0KYGBgDQoNCnBhcnQgMg0KDQoiKC4pKC4pXFwyXFwxIg0KDQpUaGlzIHdpbGwgbWF0Y2ggYSBmaXJzdCBncm91cCwgYSBzZWNvbmQgZ3JvdXAsIGZvbGxvd2VkIGJ5IHRoZSBzZWNvbmQgZ3JvdXAgYWdhaW4gYW5kIHRoZW4gdGhlIGZpcnN0IGdyb3VwIGFnYWluLg0KDQpFeGFtcGxlOg0KYGBge3J9DQoNCg0KDQpwYXR0ZXJuPC0iKC4pKC4pXFwyXFwxIg0KDQpzdHJfdmlldyh3b3JkcyxwYXR0ZXJuLCBtYXRjaD1UUlVFKQ0KDQoNCg0KYGBgDQoNCg0KcGFydCAzDQoNCiIoLi4pXDEiDQoNCnRoaXMgd2lsbCBvbmx5IG1hdGNoIA0KIihhbnkgdHdvIGNoYXJhY3RlcnMgb3IgLi4pXDEiIGJlY2F1c2UgaXQgaXMgbm90IGEgYmFja3JlZmVyZW5jZQ0KDQpFeGFtcGxlOg0KYGBge3J9DQoNCng8LWMoIiguKVwxXDEiLCJjaGVja1wxXDEiLCJhYWEiLCJiKVwxXDEiKQ0KDQpwYXR0ZXJuPC0iKC4uKVwxIg0KDQpzdHJfdmlld19hbGwoeCxwYXR0ZXJuKQ0KDQoNCg0KYGBgDQoNCnBhcnQgNA0KDQoiKC4pLlxcMS5cXDEiDQoNCnRoaXMgd2lsbCBtYXRjaCBhIGdyb3VwLCBmb2xsb3dlZCBieSBhbnkgY2hhcmFjdGVyLCBmb2xsb3dlZCBieSB0aGUgZ3JvdXAsIGZvbGxvd2VkIGJ5IGFueSBjaGFyYWN0ZXIsIGZvbGxvd2VkIGJ5IHRoZSBncm91cC4NCg0KRXhhbXBsZToNCmBgYHtyfQ0KDQoNCg0KcGF0dGVybjwtIiguKS5cXDEuXFwxIg0KDQpzdHJfdmlldyh3b3JkcyxwYXR0ZXJuLCBtYXRjaD1UUlVFKQ0KDQoNCg0KYGBgDQoNCnBhcnQgNQ0KDQoiKC4pKC4pKC4pLipcXDNcXDJcXDEiDQoNCnRoaXMgd2lsbCBtYXRjaCB0aHJlZSBncm91cHMsIGZvbGxvd2VkIGJ5IGFueSBudW1iZXIgb2YgY2hhcmFjdGVycyB1bnRpbCB0aGUgZ3JvdXAgaXMgcmVwZWF0ZWQgaW4gcmV2ZXJzZS4NCg0KRXhhbXBsZToNCmBgYHtyfQ0KDQoNCg0KcGF0dGVybjwtIiguKSguKSguKS4qXFwzXFwyXFwxIg0KDQpzdHJfdmlldyh3b3JkcyxwYXR0ZXJuLCBtYXRjaD1UUlVFKQ0KDQoNCg0KYGBgDQoNCg0KDQojIyMgRXhjZXJzaXNlIDQ6DQoNCkNvbnN0cnVjdCByZWd1bGFyIGV4cHJlc3Npb25zIHRoYXQgbWVldCByZXF1aXJlbWVudHMNCg0KDQpwYXJ0IDENCg0KU3RhcnQgYW5kIGVuZCB3aXRoIHRoZSBzYW1lIGNoYXJhY3Rlci4NCg0KYGBge3J9DQoNCnBhdHRlcm48LSJeKC4pLipcXDEkIg0KDQoNCiNFeGFtcGxlDQoNCnJlc3VsdDwtc3RyX3N1YnNldCh3b3Jkc1sxOjMwMF0scGF0dGVybiklPiV1bmxpc3QoKQ0KDQpyZXN1bHQNCmBgYA0KDQoNCnBhcnQgMg0KDQpDb250YWluIGEgcmVwZWF0ZWQgcGFpciBvZiBsZXR0ZXJzIChlLmcuICJjaHVyY2giIGNvbnRhaW5zICJjaCIgcmVwZWF0ZWQgdHdpY2UuKQ0KDQpgYGB7cn0NCg0KcGF0dGVybjwtIiguLikuKlxcMSINCg0KcmVz
dWx0PC1zdHJfc3Vic2V0KHdvcmRzWzE6MzAwXSxwYXR0ZXJuKSU+JXVubGlzdCgpDQoNCnJlc3VsdA0KDQpgYGANCg0KcGFydCAzDQoNCkNvbnRhaW4gb25lIGxldHRlciByZXBlYXRlZCBpbiBhdCBsZWFzdCB0aHJlZSBwbGFjZXMgKGUuZy4gImVsZXZlbiIgY29udGFpbnMgdGhyZWUgImUicy4pDQoNCmBgYHtyfQ0KDQpwYXR0ZXJuPC0iKC4pLipcXDEuKlxcMSINCg0KcmVzdWx0PC1zdHJfc3Vic2V0KHdvcmRzWzE6MzAwXSxwYXR0ZXJuKSU+JXVubGlzdCgpDQoNCnJlc3VsdA0KDQpgYGANCg0K