Question 1 College Major - #1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

#Read and upload the csv file of the college major list

# Load the FiveThirtyEight majors list as a data frame. The URL points at a
# specific commit hash of the repository rather than a branch name.
collegemajors <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/2d2ff3e9457549d51f8e571c52099bfe9b2017ad/college-majors/majors-list.csv"
)

Now select the majors whose names contain DATA or STATISTICS

# Return the names (value = TRUE), not the positions, of the majors whose
# title contains either "DATA" or "STATISTICS".
grep(
  x = collegemajors[["Major"]],
  pattern = "DATA|STATISTICS",
  value = TRUE
)

## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"

Question 2 #2 Write code that transforms the data below: #[1] “bell pepper” “bilberry” “blackberry” “blood orange” #[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
#[9] “elderberry” “lime” “lychee” “mulberry”
#[13] “olive” “salal berry” #Into a format like this: #c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──

## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ stringr 1.5.0
## ✔ tidyr   1.3.0     ✔ forcats 1.0.0
## ✔ readr   2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Raw console printout of the fruit vector, stored as ONE string. Each piece
# below is one printed line; the padding between entries and the trailing
# spaces at the end of a line are part of the data, so they are kept inside
# the quotes where they stay visible. The lines are joined with newlines.
fruit <- paste(
  '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"',
  '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  ',
  '[9] "elderberry"   "lime"         "lychee"       "mulberry"    ',
  '[13] "olive"        "salal berry" ',
  sep = "\n"
)

VIEW

# Show the raw string; the backslashes in the printout are escape notation
# for the embedded quotes and newlines.
print(fruit)

## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n[13] \"olive\"        \"salal berry\" "

#Need to remove the index markers [1], [5], [9] and [13], including their brackets

library(stringr)

# Clean the raw text in two chained steps. Each call works on the result of
# the previous one, so neither removal is discarded.
# Step 1: drop the literal double quotes.
newfruitv1 <- str_remove_all(string = fruit, pattern = "\"")

# Step 2: drop the bracketed index markers ([1], [5], [9], [13]). The brackets
# are escaped because an unescaped "[1]" is a character class that matches
# every "1" in the text rather than the literal text "[1]".
newfruitv1 <- str_remove_all(string = newfruitv1, pattern = "\\[\\d+\\]")

fruit

## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n[13] \"olive\"        \"salal berry\" "

# Strip every double-quote character. fixed = TRUE makes gsub() treat the
# pattern as plain text instead of a regular expression.
fruit3 <- gsub(pattern = "\"", replacement = "", x = fruit, fixed = TRUE)
print(fruit3)

## [1] "[1] bell pepper  bilberry     blackberry   blood orange\n[5] blueberry    cantaloupe   chili pepper cloudberry  \n[9] elderberry   lime         lychee       mulberry    \n[13] olive        salal berry "

#Getting rid of the index markers [5], [9], [13] and [1] one by one

# Remove the line break together with the "[5]" marker that follows it.
# fixed = TRUE keeps the brackets literal.
fruit3 <- gsub(pattern = "\n[5]", replacement = "", x = fruit3, fixed = TRUE)
print(fruit3)

## [1] "[1] bell pepper  bilberry     blackberry   blood orange blueberry    cantaloupe   chili pepper cloudberry  \n[9] elderberry   lime         lychee       mulberry    \n[13] olive        salal berry "

# Remove the line break together with the "[9]" marker that follows it.
# fixed = TRUE keeps the brackets literal.
fruit3 <- gsub(pattern = "\n[9]", replacement = "", x = fruit3, fixed = TRUE)
print(fruit3)

## [1] "[1] bell pepper  bilberry     blackberry   blood orange blueberry    cantaloupe   chili pepper cloudberry   elderberry   lime         lychee       mulberry    \n[13] olive        salal berry "

# Remove the line break together with the "[13]" marker that follows it.
# fixed = TRUE keeps the brackets literal.
fruit3 <- gsub(pattern = "\n[13]", replacement = "", x = fruit3, fixed = TRUE)
print(fruit3)

## [1] "[1] bell pepper  bilberry     blackberry   blood orange blueberry    cantaloupe   chili pepper cloudberry   elderberry   lime         lychee       mulberry     olive        salal berry "

#I want to get rid of the backslashes and numbers in fewer steps than it took to make fruit3

# Swap every character that is not a letter or a digit (quotes, brackets,
# spaces, newlines) for a single "." placeholder.
fruit2 <- gsub(pattern = "[^A-Za-z0-9]", replacement = ".", x = fruit)
print(fruit2)

## [1] ".1...bell.pepper....bilberry.......blackberry.....blood.orange...5...blueberry......cantaloupe.....chili.pepper...cloudberry.....9...elderberry.....lime...........lychee.........mulberry.......13...olive..........salal.berry.."

# Delete each run of digits, i.e. the numbers left over from the index
# markers.
fruit2 <- gsub(pattern = "[0-9]+", replacement = "", x = fruit2)
print(fruit2)

## [1] "....bell.pepper....bilberry.......blackberry.....blood.orange......blueberry......cantaloupe.....chili.pepper...cloudberry........elderberry.....lime...........lychee.........mulberry..........olive..........salal.berry.."

FOUND THE PATTERN

# Highlight every non-word character left in fruit2. str_view() replaces the
# original str_view_all() call, which is deprecated as of stringr 1.5.0 and
# raised a lifecycle warning; in that version str_view() already shows all
# matches in each string.
str_view(fruit2, "\\W")

## [1] │ <.><.><.><.>bell<.>pepper<.><.><.><.>bilberry<.><.><.><.><.><.><.>blackberry<.><.><.><.><.>blood<.>orange<.><.><.><.><.><.>blueberry<.><.><.><.><.><.>cantaloupe<.><.><.><.><.>chili<.>pepper<.><.><.>cloudberry<.><.><.><.><.><.><.><.>elderberry<.><.><.><.><.>lime<.><.><.><.><.><.><.><.><.><.><.>lychee<.><.><.><.><.><.><.><.><.>mulberry<.><.><.><.><.><.><.><.><.><.>olive<.><.><.><.><.><.><.><.><.><.>salal<.>berry<.><.>

#THANKKKKKK YOU JESUS

#Trying to figure out how to replace all the non-word characters with ” ” without getting a backslash

# Replace each non-word character with a space followed by a double quote.
# The printout shows the quote as \" because that is how R escapes it.
fruits2update <- str_replace_all(
  string = fruit2,
  pattern = "\\W",
  replacement = " \""
)
print(fruits2update)

## [1] " \" \" \" \"bell \"pepper \" \" \" \"bilberry \" \" \" \" \" \" \"blackberry \" \" \" \" \"blood \"orange \" \" \" \" \" \"blueberry \" \" \" \" \" \"cantaloupe \" \" \" \" \"chili \"pepper \" \" \"cloudberry \" \" \" \" \" \" \" \"elderberry \" \" \" \" \"lime \" \" \" \" \" \" \" \" \" \" \"lychee \" \" \" \" \" \" \" \" \"mulberry \" \" \" \" \" \" \" \" \" \"olive \" \" \" \" \" \" \" \" \" \"salal \"berry \" \""

# Same replacement as before, but with a single quote so that the printed
# result contains no escape backslashes.
fruits2update <- str_replace_all(
  string = fruit2,
  pattern = "\\W",
  replacement = " '"
)
print(fruits2update)

## [1] " ' ' ' 'bell 'pepper ' ' ' 'bilberry ' ' ' ' ' ' 'blackberry ' ' ' ' 'blood 'orange ' ' ' ' ' 'blueberry ' ' ' ' ' 'cantaloupe ' ' ' ' 'chili 'pepper ' ' 'cloudberry ' ' ' ' ' ' ' 'elderberry ' ' ' ' 'lime ' ' ' ' ' ' ' ' ' ' 'lychee ' ' ' ' ' ' ' ' 'mulberry ' ' ' ' ' ' ' ' ' 'olive ' ' ' ' ' ' ' ' ' 'salal 'berry ' '"

Okay, I’m going to go back to the original and leave it as is for my answer to Question 2

# str_remove() deletes only the FIRST non-word character, so just one leading
# "." disappears and the rest of the string is left as it was.
fruits2update <- str_remove(string = fruit2, pattern = "\\W")
print(fruits2update)

## [1] "...bell.pepper....bilberry.......blackberry.....blood.orange......blueberry......cantaloupe.....chili.pepper...cloudberry........elderberry.....lime...........lychee.........mulberry..........olive..........salal.berry.."

#fruits2update = str_remove_all(fruit2,".")
#fruits2update

#OPPPPPS #Go BACK to fruit2 as answer

# Show the dotted placeholder string again.
print(fruit2)

## [1] "....bell.pepper....bilberry.......blackberry.....blood.orange......blueberry......cantaloupe.....chili.pepper...cloudberry........elderberry.....lime...........lychee.........mulberry..........olive..........salal.berry.."

# Build a list with ONE entry per fruit. The previous as.list(fruit2) only
# wrapped the whole dotted string in a length-1 list. Here each name is pulled
# straight out of the raw text: a name is one or more lowercase words joined
# by single spaces, so the bracketed indices, quotes, newlines and the padding
# between entries are skipped.
fruit_list <- as.list(str_extract_all(fruit, "[a-z]+(?: [a-z]+)*")[[1]])
fruit_list

## [[1]]
## [1] "bell pepper"
##
## [[2]]
## [1] "bilberry"
##
## [[3]]
## [1] "blackberry"
##
## [[4]]
## [1] "blood orange"
##
## [[5]]
## [1] "blueberry"
##
## [[6]]
## [1] "cantaloupe"
##
## [[7]]
## [1] "chili pepper"
##
## [[8]]
## [1] "cloudberry"
##
## [[9]]
## [1] "elderberry"
##
## [[10]]
## [1] "lime"
##
## [[11]]
## [1] "lychee"
##
## [[12]]
## [1] "mulberry"
##
## [[13]]
## [1] "olive"
##
## [[14]]
## [1] "salal berry"

# Render the names in the c("...", "...") format that Question 2 asks for.
writeLines(
  paste0("c(", paste0("\"", unlist(fruit_list), "\"", collapse = ", "), ")")
)

## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

# The regex "\\\\" matches one literal backslash character. The backslashes
# seen when fruit is printed are only escape notation, not characters in the
# string, so nothing is replaced and fruit5 prints the same as fruit.
fruit5 <- gsub(pattern = "\\\\", replacement = " ", x = fruit)
print(fruit5)

## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n[13] \"olive\"        \"salal berry\" "

Question 3

#3 Describe, in words, what these expressions will match: #(.)\1\1 #“(.)(.)\2\1” #(..)\1 #“(.).\1.\1” #“(.)(.)(.).*\3\2\1”

Testing string

# Try the first expression on a string with no repeated characters.
# The backreference must be written "\\1" inside an R string literal: a bare
# "\1" is an octal escape for the control character U+0001, so the original
# pattern looked for two control characters instead of a repeated capture.
input_string <- "abc"
matches <- str_extract_all(input_string, "(.)\\1\\1")
matches

## [[1]]
## character(0)

# Candidates for "(.)\\1\\1": only strings holding the same character three
# times in a row should produce a match. The last entry is the pattern text
# itself, included as a non-matching control.
strings <- c(
  "aaa", "111", "&&&", "abb",
  "abccba", "xyz", "xyyzz", "(.)\\1\\1"
)
matches2 <- str_extract_all(string = strings, pattern = "(.)\\1\\1")
print(matches2)

## [[1]]
## [1] "aaa"
## 
## [[2]]
## [1] "111"
## 
## [[3]]
## [1] "&&&"
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)
## 
## [[6]]
## character(0)
## 
## [[7]]
## character(0)
## 
## [[8]]
## character(0)

# Candidates for "(.)(.)\\2\\1": two characters followed by the same two
# characters in reverse order (e.g. "abba").
strings3 <- c(
  "abba", "cccc", "chhc",
  "papa", "paap", "coco",
  "church", "yaya", "yaay"
)
matches3 <- str_extract_all(string = strings3, pattern = "(.)(.)\\2\\1")
print(matches3)

## [[1]]
## [1] "abba"
## 
## [[2]]
## [1] "cccc"
## 
## [[3]]
## [1] "chhc"
## 
## [[4]]
## character(0)
## 
## [[5]]
## [1] "paap"
## 
## [[6]]
## character(0)
## 
## [[7]]
## character(0)
## 
## [[8]]
## character(0)
## 
## [[9]]
## [1] "yaay"

# Candidates for "(..)\\1": any two characters immediately repeated
# (e.g. "papa"). Longer strings can yield more than one match.
string4 <- c(
  "abba", "cccc", "chhc", "papa", "paap",
  "coco", "church", "yaya", "yaay", "dada",
  "dadaegeg", "dadadada", "abaa", "aaaabbbb", "a1a1"
)
matches4 <- str_extract_all(string = string4, pattern = "(..)\\1")
print(matches4)

## [[1]]
## character(0)
## 
## [[2]]
## [1] "cccc"
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "papa"
## 
## [[5]]
## character(0)
## 
## [[6]]
## [1] "coco"
## 
## [[7]]
## character(0)
## 
## [[8]]
## [1] "yaya"
## 
## [[9]]
## character(0)
## 
## [[10]]
## [1] "dada"
## 
## [[11]]
## [1] "dada" "egeg"
## 
## [[12]]
## [1] "dada" "dada"
## 
## [[13]]
## character(0)
## 
## [[14]]
## [1] "aaaa" "bbbb"
## 
## [[15]]
## [1] "a1a1"

The answer: The answer to each part follows.

1) (.)\1\1: The same character appearing three times in a row. E.g. “aaa”. 2) “(.)(.)\2\1”: A pair of characters followed by the same pair of characters in reversed order. E.g. “abba”. 3) (..)\1: Any two characters repeated. E.g. “a1a1”. 4) “(.).\1.\1”: A character followed by any character, the original character, any character, and then the original character again. E.g. “abaca”, “b8b.b”. 5) “(.)(.)(.).*\3\2\1”: Three characters followed by zero or more characters of any kind, followed by the same three characters in reverse order. E.g. “abcsgasgddsadgsdgcba” or “abccba” or “abc1cba”.

Question 4

#Construct regular expressions to match words that: #Start and end with the same character. #Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) #Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Test words for Question 4, stored as an unnamed list holding one
# single-string element per word.
Q_list <- as.list(c(
  "dad", "eye", "london", "papa", "arm", "newyork", "arizona",
  "church", "tomato", "baba", "tio", "rawr", "mom", "wow",
  "pop", "fun", "encourage", "remember", "noon", "sense",
  "banana", "octogon", "habibi", "kefir", "genes", "economic"
))

print(Q_list)

## [[1]]
## [1] "dad"
## 
## [[2]]
## [1] "eye"
## 
## [[3]]
## [1] "london"
## 
## [[4]]
## [1] "papa"
## 
## [[5]]
## [1] "arm"
## 
## [[6]]
## [1] "newyork"
## 
## [[7]]
## [1] "arizona"
## 
## [[8]]
## [1] "church"
## 
## [[9]]
## [1] "tomato"
## 
## [[10]]
## [1] "baba"
## 
## [[11]]
## [1] "tio"
## 
## [[12]]
## [1] "rawr"
## 
## [[13]]
## [1] "mom"
## 
## [[14]]
## [1] "wow"
## 
## [[15]]
## [1] "pop"
## 
## [[16]]
## [1] "fun"
## 
## [[17]]
## [1] "encourage"
## 
## [[18]]
## [1] "remember"
## 
## [[19]]
## [1] "noon"
## 
## [[20]]
## [1] "sense"
## 
## [[21]]
## [1] "banana"
## 
## [[22]]
## [1] "octogon"
## 
## [[23]]
## [1] "habibi"
## 
## [[24]]
## [1] "kefir"
## 
## [[25]]
## [1] "genes"
## 
## [[26]]
## [1] "economic"

Answer 1

# Words that start and end with the same character. The first alternative
# handles words of two or more characters; the second ("\\1?$") also accepts
# a one-character word or a doubled character.
print(str_view(Q_list, "^(.)((.*\\1$)|\\1?$)"))

##  [1] │ <dad>
##  [2] │ <eye>
##  [7] │ <arizona>
## [12] │ <rawr>
## [13] │ <mom>
## [14] │ <wow>
## [15] │ <pop>
## [17] │ <encourage>
## [18] │ <remember>
## [19] │ <noon>

Answer Question4_2

# Words containing a pair of letters that occurs again later in the word
# (e.g. "ch" in "church").
print(str_subset(string = Q_list, pattern = "([A-Za-z][A-Za-z]).*\\1"))

## [1] "london"   "papa"     "church"   "tomato"   "baba"     "remember" "sense"   
## [8] "banana"   "habibi"

AnswerQuestion4_3

# Words in which one lowercase letter appears in at least three places
# (e.g. the three "e"s in "remember").
print(str_subset(string = Q_list, pattern = "([a-z]).*\\1.*\\1"))

## [1] "remember" "banana"   "octogon"

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Data607-Assignment3_Z.O

Zainab.O

2023-09-16

Okay, I’m going to go back to the original and leave it as is for my answer to Question 2

R Markdown

Including Plots