Assignment 3 will cover topics related to data manipulation and regular expressions.
# Download the FiveThirtyEight list of college majors from GitHub
college.majors.data <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
)
# Keep only the rows whose Major contains DATA or STATISTICS.
# grepl() yields one TRUE/FALSE per row, so the two tests are OR-ed into a
# single row mask; drop = FALSE keeps the result a data frame.
college.majors.subset <- college.majors.data[
  grepl("DATA", college.majors.data$Major) |
    grepl("STATISTICS", college.majors.data$Major),
  ,
  drop = FALSE
]
print(college.majors.subset)
## FOD1P Major Major_Category
## 44 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 52 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
# Fruit names exactly as they appear in printed console output, including
# the "[n]" index marker at the start of each output line.
fruit.data <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# Split the text into elements; scan() keeps each double-quoted name
# together as one element even when it contains a space.
fruit.data.scan <- scan(text = fruit.data, what = "character", quiet = TRUE)
# Remove the index markers ("[1]", "[5]", ...) by matching their pattern
# instead of relying on fixed positions, so the step still works if the
# printed output wraps at different places.
is.index.marker <- grepl("^\\[[0-9]+\\]$", fruit.data.scan)
fruit.data.scan <- fruit.data.scan[!is.index.marker]
# dput() prints the vector in c("...", "...") form and returns it invisibly
fruit.data.reformat <- dput(fruit.data.scan)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
## "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
## "lychee", "mulberry", "olive", "salal berry")
(.)\1\1 matches any single character repeated three times in a row. Example: aaa
"(.)(.)\\2\\1" matches two characters followed by the same two characters in reverse order (a 4-character palindrome). Example: abba
(..)\1 matches any two characters immediately followed by the same two characters. Example: abab
"(.).\\1.\\1" matches a character that appears three times, with exactly one arbitrary character between each occurrence. Example: abaca
"(.)(.)(.).*\\3\\2\\1" matches three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order. Examples: "abccba" "abc33345cba"
# Example words used to try out the regular expressions below
words <- c(
  "eye", "noon", "harsh", "church",
  "shush", "eleven", "banana", "eevee"
)
Words that start and end with the same character. The regular expression is: ^(.)((.*\\1$)|\\1?$)
# Regular Expression: ^(.)((.*\\1$)|\\1?$)
# (.) captures the first character; the word must then either end with that
# same character, or consist of nothing but that character once or twice.
# perl = TRUE selects the PCRE engine for the backreference (in the default
# extended-regex mode backreferences are only an extension), and TRUE is
# spelled out because T is an ordinary variable that can be reassigned.
grep("^(.)((.*\\1$)|\\1?$)", words, value = TRUE, perl = TRUE)
## [1] "eye" "noon" "harsh" "eevee"
Words that contain a repeated pair of letters. The regular expression is: ([A-Za-z][A-Za-z]).*\\1
# Regular Expression: ([A-Za-z][A-Za-z]).*\\1
# The group captures a pair of letters and \\1 requires that same pair to
# occur again later in the word (e.g. "ch" in "church", "an" in "banana").
# perl = TRUE selects the PCRE engine so that matches starting after the
# first character, such as "banana", are reported; the previously recorded
# output omitted it. TRUE is spelled out because T can be reassigned.
grep("([A-Za-z][A-Za-z]).*\\1", words, value = TRUE, perl = TRUE)
## [1] "church" "shush" "banana" "eevee"
Words that contain one letter repeated in at least three places. The regular expression is: ([A-Za-z]).*\\1.*\\1
# Regular Expression: ([A-Za-z]).*\\1.*\\1
# The group captures one letter and the two \\1 backreferences require it to
# appear at least twice more later in the word (e.g. the three a's in
# "banana").
# perl = TRUE selects the PCRE engine so that matches starting after the
# first character, such as "banana", are reported; the previously recorded
# output omitted it. TRUE is spelled out because T can be reassigned.
grep("([A-Za-z]).*\\1.*\\1", words, value = TRUE, perl = TRUE)
## [1] "eleven" "banana" "eevee"