Question 1 College Major - #1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”
# Load fivethirtyeight's list of college majors directly from the raw CSV on
# GitHub (the URL points at a specific commit of the data repository).
collegemajors <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/2d2ff3e9457549d51f8e571c52099bfe9b2017ad/college-majors/majors-list.csv"
)
Now select the majors whose names contain DATA or STATISTICS:
# Return the major names (value = TRUE) that contain either keyword.
grep(x = collegemajors$Major, pattern = "DATA|STATISTICS", value = TRUE)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
Question 2 #2 Write code that transforms the data below: #[1] “bell
pepper” “bilberry” “blackberry” “blood orange” #[5] “blueberry”
“cantaloupe” “chili pepper” “cloudberry”
#[9] “elderberry” “lime” “lychee” “mulberry”
#[13] “olive” “salal berry” #Into a format like this: #c(“bell pepper”,
“bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”,
“chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”,
“mulberry”, “olive”, “salal berry”)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Console printout of the fruit vector, stored as ONE multi-line string.
# It still contains the "[n]" index markers, the embedded double quotes and
# the newlines; the steps that follow try to clean those out.
fruit = c(
'[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry" ' )
VIEW
# Show the raw string; embedded quotes and newlines print as \" and \n.
print(fruit)
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n[13] \"olive\" \"salal berry\" "
# Need to remove the index markers [1], [5], [9] and [13], brackets included
library(stringr)
# Strip the embedded double quotes first, then remove the literal "[1]" index
# marker from that intermediate result (not from the untouched `fruit`).
# The brackets are escaped: an unescaped "[1]" is a regex character class
# that deletes every "1" character (e.g. turning "[13]" into "[3]").
newfruitv1 <- str_remove_all(string = fruit, pattern = "\"")
newfruitv1 <- str_remove_all(string = newfruitv1, pattern = "\\[1\\]")
# `fruit` itself is unchanged by the two calls above.
fruit
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n[13] \"olive\" \"salal berry\" "
# Remove every double-quote character; fixed = TRUE matches it literally.
fruit3 <- gsub(pattern = '\"', replacement = '', x = fruit, fixed = TRUE)
print(fruit3)
## [1] "[1] bell pepper bilberry blackberry blood orange\n[5] blueberry cantaloupe chili pepper cloudberry \n[9] elderberry lime lychee mulberry \n[13] olive salal berry "
# Getting rid of the [5], [9] and [13] markers one by one ([1] is still left at the start)
# Delete the newline plus "[5]" marker; fixed = TRUE keeps the brackets literal.
fruit3 <- gsub(pattern = '\n[5]', replacement = "", x = fruit3, fixed = TRUE)
print(fruit3)
## [1] "[1] bell pepper bilberry blackberry blood orange blueberry cantaloupe chili pepper cloudberry \n[9] elderberry lime lychee mulberry \n[13] olive salal berry "
# Delete the newline plus "[9]" marker, again as a literal (non-regex) match.
fruit3 <- gsub(pattern = '\n[9]', replacement = "", x = fruit3, fixed = TRUE)
print(fruit3)
## [1] "[1] bell pepper bilberry blackberry blood orange blueberry cantaloupe chili pepper cloudberry elderberry lime lychee mulberry \n[13] olive salal berry "
# Delete the newline plus "[13]" marker, again as a literal (non-regex) match.
fruit3 <- gsub(pattern = '\n[13]', replacement = "", x = fruit3, fixed = TRUE)
print(fruit3)
## [1] "[1] bell pepper bilberry blackberry blood orange blueberry cantaloupe chili pepper cloudberry elderberry lime lychee mulberry olive salal berry "
# I want to get rid of the backslashes and numbers in fewer moves than it took to make fruit3
# Turn every character outside A-Z, a-z and 0-9 (quotes, brackets, spaces,
# newlines) into a dot, starting again from the raw `fruit` string.
fruit2 <- gsub(pattern = "[^A-Za-z0-9]", replacement = ".", x = fruit)
print(fruit2)
## [1] ".1...bell.pepper....bilberry.......blackberry.....blood.orange...5...blueberry......cantaloupe.....chili.pepper...cloudberry.....9...elderberry.....lime...........lychee.........mulberry.......13...olive..........salal.berry.."
# Delete every run of digits (what is left of the index markers).
fruit2 <- gsub(pattern = '[0-9]+', replacement = '', x = fruit2)
print(fruit2)
## [1] "....bell.pepper....bilberry.......blackberry.....blood.orange......blueberry......cantaloupe.....chili.pepper...cloudberry........elderberry.....lime...........lychee.........mulberry..........olive..........salal.berry.."
FOUND THE PATTERN
# Highlight every non-word character still present in fruit2.
# str_view() marks all matches as of stringr 1.5.0, so the deprecated
# str_view_all() (and the lifecycle warning it triggers) is not needed.
str_view(fruit2, "\\W")
## [1] │ <.><.><.><.>bell<.>pepper<.><.><.><.>bilberry<.><.><.><.><.><.><.>blackberry<.><.><.><.><.>blood<.>orange<.><.><.><.><.><.>blueberry<.><.><.><.><.><.>cantaloupe<.><.><.><.><.>chili<.>pepper<.><.><.>cloudberry<.><.><.><.><.><.><.><.>elderberry<.><.><.><.><.>lime<.><.><.><.><.><.><.><.><.><.><.>lychee<.><.><.><.><.><.><.><.><.>mulberry<.><.><.><.><.><.><.><.><.><.>olive<.><.><.><.><.><.><.><.><.><.>salal<.>berry<.><.>
#THANKKKKKK YOU JESUS
# Trying to figure out how to replace all the non-word characters with " " and not get a backslash
# Swap each non-word character for a space followed by a double quote.
fruits2update <- str_replace_all(
  string = fruit2,
  pattern = "\\W",
  replacement = ' "'
)
print(fruits2update)
## [1] " \" \" \" \"bell \"pepper \" \" \" \"bilberry \" \" \" \" \" \" \"blackberry \" \" \" \" \"blood \"orange \" \" \" \" \" \"blueberry \" \" \" \" \" \"cantaloupe \" \" \" \" \"chili \"pepper \" \" \"cloudberry \" \" \" \" \" \" \" \"elderberry \" \" \" \" \"lime \" \" \" \" \" \" \" \" \" \" \"lychee \" \" \" \" \" \" \" \" \"mulberry \" \" \" \" \" \" \" \" \" \"olive \" \" \" \" \" \" \" \" \" \"salal \"berry \" \""
# Same replacement, but with a single quote so no escaping shows when printed.
fruits2update <- str_replace_all(
  string = fruit2,
  pattern = "\\W",
  replacement = " '"
)
print(fruits2update)
## [1] " ' ' ' 'bell 'pepper ' ' ' 'bilberry ' ' ' ' ' ' 'blackberry ' ' ' ' 'blood 'orange ' ' ' ' ' 'blueberry ' ' ' ' ' 'cantaloupe ' ' ' ' 'chili 'pepper ' ' 'cloudberry ' ' ' ' ' ' ' 'elderberry ' ' ' ' 'lime ' ' ' ' ' ' ' ' ' ' 'lychee ' ' ' ' ' ' ' ' 'mulberry ' ' ' ' ' ' ' ' ' 'olive ' ' ' ' ' ' ' ' ' 'salal 'berry ' '"
# str_remove() drops only the FIRST non-word character, so just one of the
# leading dots disappears.
fruits2update <- str_remove(string = fruit2, pattern = "\\W")
print(fruits2update)
## [1] "...bell.pepper....bilberry.......blackberry.....blood.orange......blueberry......cantaloupe.....chili.pepper...cloudberry........elderberry.....lime...........lychee.........mulberry..........olive..........salal.berry.."
#fruits2update = str_remove_all(fruit2,".")
#fruits2update
# Oops: an unescaped "." matches every character, so the call above would delete the whole string. Going back to fruit2 as the answer.
# Show the dotted string again.
print(fruit2)
## [1] "....bell.pepper....bilberry.......blackberry.....blood.orange......blueberry......cantaloupe.....chili.pepper...cloudberry........elderberry.....lime...........lychee.........mulberry..........olive..........salal.berry.."
# Answer to Question 2: pull every double-quoted token out of the raw text,
# strip the surrounding quotes, and show the result in c(...) form.
# (as.list(fruit2) only wrapped the single dotted string in a one-element
# list, so it never produced the individual fruit names.)
fruit_matches <- regmatches(fruit, gregexpr('"[^"]+"', fruit))[[1]]
fruit_list <- gsub('"', "", fruit_matches, fixed = TRUE)
# A wide cutoff keeps the deparsed vector on a single line.
writeLines(deparse(fruit_list, width.cutoff = 500L))
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Replace literal backslash characters with spaces. The backslashes shown
# when `fruit` is printed are only escape notation, so the printed result is
# the same as `fruit`.
fruit5 <- gsub(pattern = "\\\\", replacement = " ", x = fruit)
print(fruit5)
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n[13] \"olive\" \"salal berry\" "
Question 3
#3 Describe, in words, what these expressions will match: #(.)\1\1 #“(.)(.)\2\1” #(..)\1 #“(.).\1.\1” #“(.)(.)(.).*\3\2\1”
Testing string
# Try the tripled-character pattern on a string with no repeats.
# Inside an R string literal "\1" is the octal escape for the control
# character U+0001, not a regex backreference, so the backslashes must be
# doubled for the regex engine to receive \1. "abc" contains no character
# three times in a row, so the result is still empty.
input_string <- "abc"
matches <- str_extract_all(input_string, "(.)\\1\\1")
matches
## [[1]]
## character(0)
# Candidates for "(.)\\1\\1" (one character three times in a row); the last
# element is the pattern text itself.
strings <- c(
  "aaa", "111", "&&&", "abb", "abccba", "xyz", "xyyzz", "(.)\\1\\1"
)
matches2 <- str_extract_all(string = strings, pattern = "(.)\\1\\1")
print(matches2)
## [[1]]
## [1] "aaa"
##
## [[2]]
## [1] "111"
##
## [[3]]
## [1] "&&&"
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
##
## [[7]]
## character(0)
##
## [[8]]
## character(0)
# Candidates for "(.)(.)\\2\\1": two characters followed by the same two in
# reverse order.
strings3 <- c(
  "abba", "cccc", "chhc", "papa", "paap", "coco", "church", "yaya", "yaay"
)
matches3 <- str_extract_all(string = strings3, pattern = "(.)(.)\\2\\1")
print(matches3)
## [[1]]
## [1] "abba"
##
## [[2]]
## [1] "cccc"
##
## [[3]]
## [1] "chhc"
##
## [[4]]
## character(0)
##
## [[5]]
## [1] "paap"
##
## [[6]]
## character(0)
##
## [[7]]
## character(0)
##
## [[8]]
## character(0)
##
## [[9]]
## [1] "yaay"
# Candidates for "(..)\\1": any two characters immediately repeated.
string4 <- c(
  "abba", "cccc", "chhc", "papa", "paap", "coco", "church", "yaya", "yaay",
  "dada", "dadaegeg", "dadadada", "abaa", "aaaabbbb", "a1a1"
)
matches4 <- str_extract_all(string = string4, pattern = "(..)\\1")
print(matches4)
## [[1]]
## character(0)
##
## [[2]]
## [1] "cccc"
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "papa"
##
## [[5]]
## character(0)
##
## [[6]]
## [1] "coco"
##
## [[7]]
## character(0)
##
## [[8]]
## [1] "yaya"
##
## [[9]]
## character(0)
##
## [[10]]
## [1] "dada"
##
## [[11]]
## [1] "dada" "egeg"
##
## [[12]]
## [1] "dada" "dada"
##
## [[13]]
## character(0)
##
## [[14]]
## [1] "aaaa" "bbbb"
##
## [[15]]
## [1] "a1a1"
Answer: each expression is described in turn below.
1) (.)\1\1: The same character appearing three times in a row, e.g. “aaa”. 2) “(.)(.)\2\1”: A pair of characters followed by the same pair in reversed order, e.g. “abba”. 3) (..)\1: Any two characters repeated, e.g. “a1a1”. 4) “(.).\1.\1”: A character, followed by any character, the original character, any character again, and the original character once more, e.g. “abaca”, “b8b.b”. 5) “(.)(.)(.).*\3\2\1”: Three characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order, e.g. “abcsgasgddsadgsdgcba”, “abccba” or “abc1cba”.
Question 4
#Construct regular expressions to match words that: #Start and end with the same character. #Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) #Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Test words for Question 4, stored as a list of single strings.
Q_list <- list(
  "dad", "eye", "london", "papa", "arm", "newyork", "arizona", "church",
  "tomato", "baba", "tio", "rawr", "mom", "wow", "pop", "fun",
  "encourage", "remember", "noon", "sense", "banana", "octogon", "habibi",
  "kefir", "genes", "economic"
)
print(Q_list)
## [[1]]
## [1] "dad"
##
## [[2]]
## [1] "eye"
##
## [[3]]
## [1] "london"
##
## [[4]]
## [1] "papa"
##
## [[5]]
## [1] "arm"
##
## [[6]]
## [1] "newyork"
##
## [[7]]
## [1] "arizona"
##
## [[8]]
## [1] "church"
##
## [[9]]
## [1] "tomato"
##
## [[10]]
## [1] "baba"
##
## [[11]]
## [1] "tio"
##
## [[12]]
## [1] "rawr"
##
## [[13]]
## [1] "mom"
##
## [[14]]
## [1] "wow"
##
## [[15]]
## [1] "pop"
##
## [[16]]
## [1] "fun"
##
## [[17]]
## [1] "encourage"
##
## [[18]]
## [1] "remember"
##
## [[19]]
## [1] "noon"
##
## [[20]]
## [1] "sense"
##
## [[21]]
## [1] "banana"
##
## [[22]]
## [1] "octogon"
##
## [[23]]
## [1] "habibi"
##
## [[24]]
## [1] "kefir"
##
## [[25]]
## [1] "genes"
##
## [[26]]
## [1] "economic"
Answer 1
# Words that start and end with the same character: capture the first
# character, then accept either anything that ends in it, or (for one- and
# two-character words) an optional repeat of it right before the end.
str_view(
  pattern = "^(.)((.*\\1$)|\\1?$)",
  string = Q_list
)
## [1] │ <dad>
## [2] │ <eye>
## [7] │ <arizona>
## [12] │ <rawr>
## [13] │ <mom>
## [14] │ <wow>
## [15] │ <pop>
## [17] │ <encourage>
## [18] │ <remember>
## [19] │ <noon>
Answer Question4_2
# Words containing a two-letter pair that shows up again later in the word.
str_subset(string = Q_list, pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "london" "papa" "church" "tomato" "baba" "remember" "sense"
## [8] "banana" "habibi"
AnswerQuestion4_3
# Words in which one lowercase letter occurs in at least three places.
str_subset(string = Q_list, pattern = "([a-z]).*\\1.*\\1")
## [1] "remember" "banana" "octogon"
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.