library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringr)
Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to the problems below. You may work in a small group, but please submit separately with names of all group participants in your submission.
# Import the FiveThirtyEight college-majors list directly from GitHub.
majors_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
df <- read_csv(majors_url)
## Rows: 174 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): FOD1P, Major, Major_Category
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the imported tibble to inspect its first rows and column types.
df
## # A tibble: 174 × 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
## 7 1106 SOIL SCIENCE Agriculture & Natural Resources
## 8 1199 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources
## 9 1302 FORESTRY Agriculture & Natural Resources
## 10 1303 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources
## # … with 164 more rows
# Transposed overview: one line per column with its type and first values.
df %>% glimpse()
## Rows: 174
## Columns: 3
## $ FOD1P <chr> "1100", "1101", "1102", "1103", "1104", "1105", "1106",…
## $ Major <chr> "GENERAL AGRICULTURE", "AGRICULTURE PRODUCTION AND MANA…
## $ Major_Category <chr> "Agriculture & Natural Resources", "Agriculture & Natur…
# List the column names of the majors table.
names(df)
## [1] "FOD1P" "Major" "Major_Category"
# Auto-print the tibble once more before filtering it.
df
## # A tibble: 174 × 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
## 7 1106 SOIL SCIENCE Agriculture & Natural Resources
## 8 1199 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources
## 9 1302 FORESTRY Agriculture & Natural Resources
## 10 1303 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources
## # … with 164 more rows
# Keep only the majors whose name contains "DATA" or "STATISTICS".
is_data_or_stats <- grepl("DATA|STATISTICS", df$Major)
new_df <- df[is_data_or_stats, ]
new_df
## # A tibble: 3 × 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
Into a format like this:
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
# Raw console printout of the fruit vector, stored as ONE multi-line string.
# Single quotes delimit the literal so the embedded double quotes need no
# escaping; the "[n]" index markers and the newlines are part of the data.
string_list <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry" 
[9] "elderberry" "lime" "lychee" "mulberry" 
[13] "olive" "salal berry"'
string_list
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n[13] \"olive\" \"salal berry\""
# Pull every fruit name out of the printout. A name is one or more lowercase
# words separated by single spaces, so the "[n]" index markers, the quote
# characters and the newlines are never part of a match. The pattern sets no
# minimum length on a name and allows any number of words.
# str_extract_all() returns a list with one character vector per input string.
new_string_list <- str_extract_all(string_list, "[a-z]+(?: [a-z]+)*")
print(new_string_list)
## [[1]]
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
# Render each extracted vector as the text of an R `c(...)` call.
# The text is assembled explicitly instead of relying on an implicit
# list-to-character coercion: encodeString() wraps every name in double quotes
# (escaping where needed), the names are joined with ", ", and the result is
# wrapped in "c(" ... ")". vapply() yields one string per list element.
new_list <- vapply(
  new_string_list,
  function(fruits) {
    quoted <- encodeString(fruits, quote = "\"")
    paste0("c(", paste(quoted, collapse = ", "), ")")
  },
  character(1)
)
new_list
## [1] "c(\"bell pepper\", \"bilberry\", \"blackberry\", \"blood orange\", \"blueberry\", \"cantaloupe\", \"chili pepper\", \"cloudberry\", \"elderberry\", \"lime\", \"lychee\", \"mulberry\", \"olive\", \"salal berry\")"
# Inspect the class of the formatted result.
class(new_list)
## [1] "character"
# Collapse to a single string and write it with cat() so the quotes appear
# unescaped, exactly as the target format shows them.
formatted_output <- str_c(new_list, collapse = " , ")
cat(formatted_output)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
The two exercises below are taken from R for Data Science, 14.3.5.1 in the on-line version:
(.)\1\1
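As a regular expression, this matches any single character repeated three times in a row (e.g. “aaa”). Typed into R as the string "(.)\1\1", however, it does not throw an error: R reads each \1 as the control character U+0001 before the regex engine ever sees it, so the pattern looks for one character followed by two U+0001 characters and matches nothing in ordinary text. To use the backreference from R, the backslashes must be doubled: "(.)\\1\\1".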
# Sample strings used to probe each backreference pattern.
test_set <- c(
  "aaa", "bbbght", "abba", "1221", "xyzzyx", "zrzrzlk", "53575", "abaza",
  "1234djakfl;jdafkjad4321", "abccba", "$%^^%$#@*"
)
# The pattern is passed with single backslashes on purpose. Inside an R string
# literal "\1" is an octal escape for the control character U+0001, not a
# backreference, so none of the sample strings can match.
str_match(test_set, "(.)\1\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] NA NA
## [11,] NA NA
“(.)(.)\\2\\1”
This expression matches two characters followed by the same two characters in reverse order. An example is “abba”.
# Two captured characters followed by the same two in reverse order.
mirrored_pair <- "(.)(.)\\2\\1"
str_match(test_set, mirrored_pair)
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] "abba" "a" "b"
## [4,] "1221" "1" "2"
## [5,] "yzzy" "y" "z"
## [6,] NA NA NA
## [7,] NA NA NA
## [8,] NA NA NA
## [9,] NA NA NA
## [10,] "bccb" "b" "c"
## [11,] "%^^%" "%" "^"
(..)\1
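As a regular expression, this matches any two characters immediately followed by the same two characters (e.g. “abab”). Typed into R as the string "(..)\1", however, it does not throw an error: R reads \1 as the control character U+0001, so the pattern looks for two characters followed by U+0001 and matches nothing in ordinary text. To use the backreference from R, the backslash must be doubled: "(..)\\1".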
# Single backslash kept on purpose: in an R string "\1" is the control
# character U+0001, so this pattern contains no backreference.
str_match(string = test_set, pattern = "(..)\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] NA NA
## [11,] NA NA
“(.).\\1.\\1”
This expression matches a five-character span in which the same character appears in the first, third, and fifth positions, with any single character in between (e.g. “abaza” or “53575”).
# The captured character must recur at positions 3 and 5 of the match, with
# any single character between the occurrences.
alternating_repeat <- "(.).\\1.\\1"
str_match(test_set, alternating_repeat)
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] "zrzrz" "z"
## [7,] "53575" "5"
## [8,] "abaza" "a"
## [9,] NA NA
## [10,] NA NA
## [11,] NA NA
“(.)(.)(.).*\\3\\2\\1”
This expression matches three characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order (e.g. “abccba” or “abc123cba”).
# Three captured characters, any filler, then the same three reversed.
reversed_triple <- "(.)(.)(.).*\\3\\2\\1"
str_match(test_set, reversed_triple)
## [,1] [,2] [,3] [,4]
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] "xyzzyx" "x" "y" "z"
## [6,] NA NA NA NA
## [7,] NA NA NA NA
## [8,] NA NA NA NA
## [9,] "1234djakfl;jdafkjad4321" "1" "2" "3"
## [10,] "abccba" "a" "b" "c"
## [11,] "$%^^%$" "$" "%" "^"
Construct regular expressions to match words that: (1) start and end with the same character; (2) contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice); (3) contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s).
# Start and end with the same character.
# The first character is captured and must reappear at the very end. The tail
# "(.*\\1)?" is optional so that a one-character string, which trivially
# starts and ends with the same character, is matched as well.
variable_set <- c("test", "high", "stuff", "lawn", "abra")
str_extract(variable_set, "^(.)(.*\\1)?$")
## [1] "test" "high" NA NA "abra"
# Contain a repeated pair of letters (e.g. "church" contains "ch" repeated twice.)
# The captured pair is restricted to letters with [[:alpha:]]{2}, so repeated
# pairs of spaces, digits or punctuation are not reported. The match runs from
# the first occurrence of the pair to its last repetition.
testing_set <- c("trial", "church", "mile", "pepper")
str_match(testing_set, "([[:alpha:]]{2}).*\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] "church" "ch"
## [3,] NA NA
## [4,] "peppe" "pe"
# Contain one letter repeated in at least three places (e.g. "eleven" contains three "e"s.)
# A lowercase letter is captured and must occur twice more later in the
# string, with anything in between.
variable_test_set <- c("model", "eleven", "pepper", "train", "mississippi")
triple_letter <- "([a-z]).*\\1.*\\1"
str_extract(variable_test_set, triple_letter)
## [1] NA "eleve" "pepp" NA "ississippi"