library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(stringr)
library(htmltools)
Week 3 assignment Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to the problems below. You may work in a small group, but please submit separately with names of all group participants in your submission.
# Download the FiveThirtyEight majors list from GitHub and read it as CSV,
# keeping text columns as character vectors rather than factors.
df <- read.csv(
  url("https://github.com/fivethirtyeight/data/raw/e48bfdad04d909610cecb01d5a4ba2c99cb997f3/college-majors/majors-list.csv"),
  stringsAsFactors = FALSE
)
# Show the column names, types and first few values.
str(df)
## 'data.frame': 174 obs. of 3 variables:
## $ FOD1P : chr "1100" "1101" "1102" "1103" ...
## $ Major : chr "GENERAL AGRICULTURE" "AGRICULTURE PRODUCTION AND MANAGEMENT" "AGRICULTURAL ECONOMICS" "ANIMAL SCIENCES" ...
## $ Major_Category: chr "Agriculture & Natural Resources" "Agriculture & Natural Resources" "Agriculture & Natural Resources" "Agriculture & Natural Resources" ...
# Keep the major names that contain either "DATA" or "STATISTICS";
# value = TRUE returns the matching strings themselves instead of their indices.
data_statistics_major <- grep("DATA|STATISTICS", df$Major, value = TRUE)
data_statistics_major
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
Into a format like this:
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
# Raw console printout of a character vector, held as ONE multi-line string:
# the [n] index markers and the double quotes around each item are part of
# the text. Single quotes delimit the literal so the inner double quotes need
# no escaping.
fruits_veg <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# Print the raw string; embedded quotes and newlines are shown escaped.
fruits_veg
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n\n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n\n[13] \"olive\" \"salal berry\""
# Pull out every double-quoted item (quotes still attached) from the raw
# string. str_extract_all() returns a list with one element per input string,
# so it is flattened into a plain character vector.
fruits_veg_wo <- fruits_veg %>%
  str_extract_all(pattern = "\"([a-z]+.[a-z]+)\"") %>%
  unlist()
fruits_veg_wo
## [1] "\"bell pepper\"" "\"bilberry\"" "\"blackberry\"" "\"blood orange\""
## [5] "\"blueberry\"" "\"cantaloupe\"" "\"chili pepper\"" "\"cloudberry\""
## [9] "\"elderberry\"" "\"lime\"" "\"lychee\"" "\"mulberry\""
## [13] "\"olive\"" "\"salal berry\""
# Strip the surrounding double quotes from each extracted item. The result of
# str_remove_all() is already a plain character vector, so it is used as is.
fruits_veg_wo_cl <- fruits_veg_wo %>%
  str_remove_all(pattern = "\"")
fruits_veg_wo_cl
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
The two exercises below are taken from R for Data Science, 14.3.5.1 in the on-line version:
#3 Describe, in words, what these expressions will match:
# Sample word list for the regex exercises, stored as one multi-line string in
# the same "console printout" layout: [n] index markers plus double-quoted
# items. Note that "Banana" is the only entry starting with a capital letter.
words <- '[1] "bell pepper" "bilberry" "church" "pressure"
[5] "blueberry" "eleven" "chili pepper" "apple"
[9] "abcdfdfbcafgfdg" "papaya" "dad" "mulberry"
[13] "believe" "salal berry" "Banana" "coconut"'
# Extract the double-quoted items from the raw string and flatten the list
# returned by str_extract_all(). The pattern only accepts lowercase letters
# next to the quotes, so the capitalised entry "Banana" is not extracted.
words_wo <- words %>%
  str_extract_all(pattern = "\"([a-z]+.[a-z]+)\"") %>%
  unlist()
# Remove the surrounding quotes. The variable name (spelled "wrods") is kept
# unchanged because the exercises further down refer to it.
wrods_wo_cl <- words_wo %>%
  str_remove_all(pattern = "\"")
(.)\1\1
Matches any string that contains the same character three times in a row (e.g. "aaa"): the group captures one character and each \1 requires another copy of it.
# Regex (.)\1\1: one character followed by two more copies of itself.
# Inside an R string the backslash must be doubled; a single "\1" is parsed by
# R as an octal escape (the control character U+0001), so the regex engine
# would never see a backreference.
expression1 <- "(.)\\1\\1"
# None of the sample words contains a tripled character, so nothing is
# returned for this word list.
wrods_wo_cl %>%
  str_subset(expression1)
## character(0)
“(.)(.)\2\1”
Matches two characters immediately followed by the same two characters in reverse order, i.e. a four-character palindrome such as "abba" (for example "eppe" in "pepper").
# Regex (.)(.)\2\1: two characters followed by the same two in reverse order
# (an "abba" shape). Return the words that contain such a run.
expression2 <- "(.)(.)\\2\\1"
str_subset(wrods_wo_cl, pattern = expression2)
## [1] "bell pepper" "chili pepper"
(..)\1
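Matches any two characters that are immediately repeated, i.e. an "abab" shape (for example "coco" in "coconut"). It does not require the two characters to be equal to each other.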
# Regex (..)\1: any two characters immediately followed by the same two
# characters (e.g. "coco"). The backslash must be doubled inside an R string;
# a single "\1" is parsed by R as an octal escape (the control character
# U+0001) instead of a backreference, which made the search return nothing.
expression3 <- "(..)\\1"
wrods_wo_cl %>%
  str_subset(expression3)
## [1] "abcdfdfbcafgfdg" "papaya" "salal berry" "coconut"
“(.).\1.\1”
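Matches a character that occurs three times with exactly one arbitrary character between each occurrence, i.e. an "abaca" shape (for example "eleve" in "eleven").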
# Regex (.).\1.\1: the captured character, any character, the captured
# character again, any character, and the captured character a third time.
expression4 <- "(.).\\1.\\1"
str_subset(wrods_wo_cl, pattern = expression4)
## [1] "eleven" "papaya"
"(.)(.)(.).*\3\2\1"
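Matches any three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order, e.g. "abccba" or "abcdfdfdfcba". The three characters can occur anywhere in the string, not only at the start.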
# Regex (.)(.)(.).*\3\2\1: three captured characters, then anything, then the
# same three characters in reverse order.
expression5 <- "(.)(.)(.).*\\3\\2\\1"
str_subset(wrods_wo_cl, pattern = expression5)
## character(0)
#4 Construct regular expressions to match words that:
Start and end with the same character.
# Words that start and end with the same character.
# ^(.) captures the first character; the group (.*\1) then requires the word
# to finish with that same character. The group is optional so that a
# one-character word also matches, since its only character is both the first
# and the last one.
expression9 <- "^(.)(.*\\1)?$"
wrods_wo_cl %>%
  str_subset(expression9)
## [1] "dad"
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# Regex (..).*\1: a pair of characters that shows up again later in the same
# word, with any amount of text (possibly none) in between.
expression6 <- "(..).*\\1"
str_subset(wrods_wo_cl, pattern = expression6)
## [1] "bell pepper" "church" "pressure" "chili pepper"
## [5] "abcdfdfbcafgfdg" "papaya" "salal berry" "coconut"
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Regex (.).*\1.*\1: one character that occurs at least three times in the
# word, with anything (or nothing) between the occurrences.
expression7 <- "(.).*\\1.*\\1"
str_subset(wrods_wo_cl, pattern = expression7)
## [1] "bell pepper" "eleven" "chili pepper" "abcdfdfbcafgfdg"
## [5] "papaya" "believe"