607assignment3

Question1

Loading the data

# Raw CSV of the college majors list in the fivethirtyeight/data GitHub repo.
major_list_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"

# Download the file and parse it into a data frame.
majors <- read.csv(file = major_list_url)

pulling the data

The majors whose names contain either "DATA" or "STATISTICS" are:

##    FOD1P                                         Major          Major_Category
## 44  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 52  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics

Question2

Inputting the text and checking

# Raw console printout of the produce vector, held as ONE string: the [n]
# index markers, the embedded double quotes and the line breaks are all part
# of the text.
sampletext <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
[9] "elderberry"   "lime"         "lychee"       "mulberry"    
[13] "olive"        "salal berry"'

# Show the string as R stores it (quotes escaped, line breaks as \n).
print(sampletext)

## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n[13] \"olive\"        \"salal berry\""

Removing the index markers and quote characters, then displaying the result

# Pull every double-quoted token out of the raw printout. Anything between a
# pair of double quotes is taken as one item, so one- or two-letter names,
# hyphenated names and names with more than two words are kept as well.
# The calls are namespace-qualified because stringr is only attached further
# down the file.
sampletextonlytext <- unlist(
  stringr::str_extract_all(sampletext, pattern = "\"[^\"]+\"")
)

# Strip the surrounding quote characters to leave the bare item names.
sampletext2 <- stringr::str_replace_all(sampletextonlytext, "\"", "")

# Emit the vector as R source, i.e. in c("...", "...") form.
dput(sampletext2)

## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", 
## "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", 
## "lychee", "mulberry", "olive", "salal berry")

Question3

inputting random data

(.)\1\1

Tested: as typed, it does not work because the backslashes are not escaped; inside an R string it has to be written "(.)\\1\\1". I believe it is meant to match a character repeated three times in a row, like aaa or bbb.

"(.)(.)\\2\\1"

A pair of characters followed by the same pair in reverse order, like abba or 1221.

(..)\1

Tested: as typed, it does not work because the backslash is not escaped; inside an R string it has to be written "(..)\\1". I believe it is meant to match a pair of characters repeated twice in a row, like abab or 1212.

"(.).\\1.\\1"

A character that occurs three times with exactly one arbitrary character between each occurrence, like a1a2a or 11111.

"(.)(.)(.).*\\3\\2\\1"

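Three characters, then any number of other characters (possibly none), then the same three characters in reverse order, like abccba or 1abba1. This is the three-character version of "(.)(.)\\2\\1".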

library("stringr")
# Patterns under test, stored as R string literals.
# expression1 and expression3 contain a single backslash: R reads "\1" inside
# a string literal as the control character with code 1, not as the regex
# back-reference \1. The other three spell the back-reference with a doubled
# backslash.
expression1 <- "(.)\1\1"
expression2 <- "(.)(.)\\2\\1"
expression3 <- "(..)\1"
expression4 <- "(.).\\1.\\1"
expression5 <- "(.)(.)(.).*\\3\\2\\1"

# Sample strings used to exercise each pattern.
dataforq3 <- c(
  "111", "aaaa", "aaa", "1212", "1221", "a1a1", "a111a",
  "1abba1", "2211aa", "11111a111", "aaa11aaaaa1", "12a", "abba"
)

# Keep the sample strings in which expression1 finds a match, then show them.
result1 <- str_subset(string = dataforq3, pattern = expression1)
print(result1)

## character(0)

# Keep the sample strings in which expression2 finds a match, then show them.
result2 <- str_subset(string = dataforq3, pattern = expression2)
print(result2)

## [1] "aaaa"        "1221"        "1abba1"      "11111a111"   "aaa11aaaaa1"
## [6] "abba"

# Keep the sample strings in which expression3 finds a match, then show them.
result3 <- str_subset(string = dataforq3, pattern = expression3)
print(result3)

## character(0)

# Keep the sample strings in which expression4 finds a match, then show them.
result4 <- str_subset(string = dataforq3, pattern = expression4)
print(result4)

## [1] "11111a111"   "aaa11aaaaa1"

# Keep the sample strings in which expression5 finds a match, then show them.
result5 <- str_subset(string = dataforq3, pattern = expression5)
print(result5)

## [1] "1abba1"      "11111a111"   "aaa11aaaaa1"

Question4

Construct regular expressions to match words that: a. Start and end with the same character. b. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice). c. Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s).

# Test words for the three Question 4 patterns.
dataforq4 <- c("1234", "5678", "church", "mom", "dad", "eleven", "steventeen")

# a. Starts and ends with the same character. The tail "(.*\\1)" is optional
#    so that a one-character string, whose first character is also its last,
#    counts as a match.
expressiona <- "^(.)(.*\\1)?$"
# b. Contains a pair of lowercase letters that occurs again later on.
expressionb <- "([a-z][a-z]).*\\1"
# c. Contains one lowercase letter in at least three places.
expressionc <- "([a-z]).*\\1.*\\1"

# Keep the test words that expressiona matches, then show them.
resulta <- str_subset(string = dataforq4, pattern = expressiona)
print(resulta)

## [1] "mom" "dad"

# Keep the test words that expressionb matches, then show them.
resultb <- str_subset(string = dataforq4, pattern = expressionb)
print(resultb)

## [1] "church"     "steventeen"

# Keep the test words that expressionc matches, then show them.
resultc <- str_subset(string = dataforq4, pattern = expressionc)
print(resultc)

## [1] "eleven"     "steventeen"