R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)

Reading the college-majors data from GitHub into collage_major

# Download the "all-ages" college-majors CSV from the FiveThirtyEight GitHub
# repository and parse it into a tibble.
collage_major <- read_csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Major_code = col_double(),
##   Major = col_character(),
##   Major_category = col_character(),
##   Total = col_double(),
##   Employed = col_double(),
##   Employed_full_time_year_round = col_double(),
##   Unemployed = col_double(),
##   Unemployment_rate = col_double(),
##   Median = col_double(),
##   P25th = col_double(),
##   P75th = col_double()
## )
# Auto-print the tibble to preview its first rows and column types.
collage_major
## # A tibble: 173 x 11
##    Major_code Major   Major_category  Total Employed Employed_full_t~ Unemployed
##         <dbl> <chr>   <chr>           <dbl>    <dbl>            <dbl>      <dbl>
##  1       1100 GENERA~ Agriculture &~ 128148    90245            74078       2423
##  2       1101 AGRICU~ Agriculture &~  95326    76865            64240       2266
##  3       1102 AGRICU~ Agriculture &~  33955    26321            22810        821
##  4       1103 ANIMAL~ Agriculture &~ 103549    81177            64937       3619
##  5       1104 FOOD S~ Agriculture &~  24280    17281            12722        894
##  6       1105 PLANT ~ Agriculture &~  79409    63043            51077       2070
##  7       1106 SOIL S~ Agriculture &~   6586     4926             4042        264
##  8       1199 MISCEL~ Agriculture &~   8549     6392             5074        261
##  9       1301 ENVIRO~ Biology & Lif~ 106106    87602            65238       4736
## 10       1302 FOREST~ Agriculture &~  69447    48228            39613       2144
## # ... with 163 more rows, and 4 more variables: Unemployment_rate <dbl>,
## #   Median <dbl>, P25th <dbl>, P75th <dbl>

#1 Code that identifies the majors that contain either “DATA” or “STATISTICS”

library("stringr")

Using the str_subset function to find the matching strings

# Names of the majors whose title contains the text "DATA".
major1 <- str_subset(collage_major[["Major"]], pattern = "DATA")
major1
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"

Extract matching rows with str_detect

# Full rows (all columns) for the majors whose title contains "DATA".
major1 <- collage_major[
  str_detect(collage_major[["Major"]], pattern = "DATA"),
]
major1
## # A tibble: 1 x 11
##   Major_code Major    Major_category  Total Employed Employed_full_t~ Unemployed
##        <dbl> <chr>    <chr>           <dbl>    <dbl>            <dbl>      <dbl>
## 1       2101 COMPUTE~ Computers & Ma~ 29317    22828            18747       2265
## # ... with 4 more variables: Unemployment_rate <dbl>, Median <dbl>,
## #   P25th <dbl>, P75th <dbl>
# Names of the majors whose title contains the text "STATISTICS".
major2 <- str_subset(
  string = collage_major[["Major"]],
  pattern = "STATISTICS"
)
major2
## [1] "STATISTICS AND DECISION SCIENCE"              
## [2] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
# Full rows (all columns) for the majors whose title contains "STATISTICS".
major2 <- collage_major[
  str_detect(string = collage_major[["Major"]], pattern = "STATISTICS"),
]
major2
## # A tibble: 2 x 11
##   Major_code Major   Major_category   Total Employed Employed_full_t~ Unemployed
##        <dbl> <chr>   <chr>            <dbl>    <dbl>            <dbl>      <dbl>
## 1       3702 STATIS~ Computers & Ma~  24806    18808            14468       1138
## 2       6212 MANAGE~ Business        156673   134478           118249       6186
## # ... with 4 more variables: Unemployment_rate <dbl>, Median <dbl>,
## #   P25th <dbl>, P75th <dbl>

#2 Write code that transforms the data below:

# Create four vectors of fruit names, wrap every name in double quotes, and
# finally join all of them into one comma-separated string.

fruit_category_1 <- c("bell pepper", "bilberry", "blackberry", "blood orange")

fruit_category_2 <- c("blueberry", "cantaloupe", "chili pepper", "cloudberry")

fruit_category_3 <- c("elderberry", "lime", "lychee", "mulberry")

# Typo fixed: the fruit is "salal berry" (previously spelled "lsalal berry").
fruit_category_4 <- c("olive", "salal berry")

# Before quoting: writeLines() prints the raw strings, one per line.
writeLines(fruit_category_1)
## bell pepper
## bilberry
## blackberry
## blood orange

# str_c() is vectorised, so every element gets an opening and a closing quote.
fruit_category_1 <- str_c("\"", fruit_category_1, "\"")
writeLines(fruit_category_1)
## "bell pepper"
## "bilberry"
## "blackberry"
## "blood orange"
fruit_category_2 <- str_c("\"", fruit_category_2, "\"")
writeLines(fruit_category_2)
## "blueberry"
## "cantaloupe"
## "chili pepper"
## "cloudberry"
fruit_category_3 <- str_c("\"", fruit_category_3, "\"")
writeLines(fruit_category_3)
## "elderberry"
## "lime"
## "lychee"
## "mulberry"
fruit_category_4 <- str_c("\"", fruit_category_4, "\"")
writeLines(fruit_category_4)
## "olive"
## "salal berry"

# Concatenate the four quoted vectors and collapse them into a single string.
x <- str_c(
  c(fruit_category_1, fruit_category_2, fruit_category_3, fruit_category_4),
  collapse = ", "
)
writeLines(x)
## "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry"

#3 Describe, in words, what these expressions will match:

# "(.)\1\1": inside an R string literal a single-backslash "\1" is an octal
# escape for the control character U+0001, not a regex backreference. The
# pattern therefore means "any character followed by two U+0001 characters",
# which none of these test strings contain, so nothing is matched. A
# backreference has to be written "\\1" in R source.

str_view(c("abbbbc", "a.c", "a*c", "a c"), "(.)\1\1")
# "(.)(.)\\2\\1": two characters immediately followed by the same two
# characters in reverse order, i.e. a four-character palindrome such as "grrg".

str_view(c("grrgff", "a.ccc.c", "a*aa*c", "a c"), "(.)(.)\\2\\1")
# "(..)\1": again "\1" is the character U+0001 rather than a backreference, so
# this looks for any two characters followed by U+0001 and matches nothing in
# these strings.

str_view(c("grrrrgff", "a.ccc.c.c", "a*aaaa*c", "a c"), "(..)\1")
# "(.).\\1.\\1": a character, then any one character, then the first character
# again, then any one character, then the first character a third time
# (e.g. "c2c3c").
str_view(c("grrrrgff", "a.c2c3c.c.c", "a*adafaa*c"), "(.).\\1.\\1")
# "(.)(.)(.).*\\3\\2\\1": three characters, followed by zero or more of any
# characters, followed by the same three characters in reverse order.

str_view(c("lmnx*xmnlff", "a.uzmac2c3camzu.c.c", "a*adafaa*c"), "(.)(.)(.).*\\3\\2\\1")

#4 Construct regular expressions to match words that:

# Start and end with the same character.
# ^(.)(.*\\1)?$
#
# The ^ and $ anchors tie the captured character to the first and last
# positions of the whole string; without them the pattern would match any
# substring that merely begins and ends alike (e.g. "a*adafaa" inside
# "a*adafaa*c"). The optional group also accepts one-character words.

str_view(c("lmnx*xmnl", "a.uzmac2c3camzu.c.a", "a*adafaa*c"), "^(.)(.*\\1)?$")
# Contain a repeated pair of letters (e.g. "church" contains "ch" repeated twice.)
# (..).*\\1
#
# (..) captures the pair as one group and \\1 requires that same pair to
# appear again later; this is equivalent to (.)(.).*\\1\\2.

str_view(c("lmnx*xmnl", "a.uzmac2c3camzu.c.a", "churchac"), "(..).*\\1")
# Contain one letter repeated in at least three places (e.g. "eleven" contains three "e"s.)
# (.).*\\1.*\\1
#
# Each \\1 must occur exactly once, separated by any run of characters, so the
# captured letter has to appear three times. (Writing \\1* would allow zero
# occurrences and accept letters that appear only twice.)

str_view(c("lmnx*xmnl", "eleven", "churchac"), "(.).*\\1.*\\1")