Code that identifies majors that contain either “DATA” or “STATISTICS”, using the list of college majors from [FiveThirtyEight](https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/).
library(rvest)
library(xml2)
library(tidyverse)
# Address of the FiveThirtyEight college majors page
site <- "https://projects.fivethirtyeight.com/mid-levels/college-majors/index.html?v=3"
# Download and parse the page
majors_list <- read_html(site)
# Select the element(s) with id "example" and parse them as HTML tables.
# The scraped result comes back as a list of tables.
majors_df <- html_table(
  html_nodes(majors_list, xpath = '//*[@id="example"]'),
  fill = TRUE
)
# Convert the list to a data frame, then pull out the MAJOR column by itself
majors_df <- as.data.frame(majors_df)
mj <- majors_df$MAJOR
# Majors with "statistics" in the name (case-insensitive); match = TRUE keeps
# only the entries that match. Display them right away.
major.stats <- mj %>%
  str_view(regex("statistics", ignore_case = TRUE), match = TRUE)
major.stats
# Same search for majors with "data" in the name, displayed afterwards
major.data <- mj %>%
  str_view(regex("data", ignore_case = TRUE), match = TRUE)
major.data
Data transformation
library(RCurl)
library(dplyr)
# Download the raw fruit listing from GitHub as a single string
x <- getURL("https://raw.githubusercontent.com/ltcancel/Homework3_Data607_F20/master/fruit.txt")
# Split every line on double quotes so each quoted fruit name lands in its own
# column. stringsAsFactors = FALSE keeps the names as character vectors on
# every R version; before R 4.0 the default turned them into factors, whose
# integer codes can leak into later string operations.
fruit <- read.delim(text = x, header = FALSE, sep = '"', stringsAsFactors = FALSE)
fruit
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 [1] bell pepper NA bilberry NA blackberry NA blood orange NA
## 2 [5] blueberry NA cantaloupe NA chili pepper NA cloudberry NA
## 3 [9] elderberry NA lime NA lychee NA mulberry NA
## 4 [13] olive NA salal berry NA NA NA
# Keep only the columns that hold fruit names (positions 2, 4, 6 and 8);
# the remaining columns carry the "[n]" index prefix or are empty.
fruit.clean <- select(fruit, c(2, 4, 6, 8))
fruit.clean
## V2 V4 V6 V8
## 1 bell pepper bilberry blackberry blood orange
## 2 blueberry cantaloupe chili pepper cloudberry
## 3 elderberry lime lychee mulberry
## 4 olive salal berry
# Build the final string from the cleaned data frame.
# Cells are read row by row (left to right, top to bottom) so the fruit keep
# the order of the original listing. as.matrix() turns every column into
# character, so factor columns contribute their labels rather than their
# integer codes.
fruit.values <- trimws(as.vector(t(as.matrix(fruit.clean))))
# The last row is padded with empty cells; drop those (and any NA) so they do
# not leave stray commas in the result.
fruit.values <- fruit.values[!is.na(fruit.values) & nzchar(fruit.values)]
# Collapse everything into ONE string of the form "c(a,b,...)". Every fruit,
# including the first and the last, is included exactly once.
final.string <- paste0("c(", paste(fruit.values, collapse = ","), ")")
# Display new string
final.string
## [1] "c(bell pepper,bilberry,blackberry,blood orange,blueberry,cantaloupe,chili pepper,cloudberry,elderberry,lime,lychee,mulberry,olive,salal berry)"
Describe, in words, what these expressions will match:
Construct regular expressions to match words that:
Start and end with the same character.
# (.) captures the first character and \1 requires it again at the very end.
# The ".*\1" tail is optional so that one-character strings, which trivially
# start and end with the same character, are matched as well.
str_view(c("dad", "apple", "eye"), "^(.)(.*\\1)?$")
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# (..) captures any two consecutive characters and \1 requires the same pair
# to appear again later in the string; match = TRUE shows only matching strings.
str_view(
  string = c("church","house","pressure"),
  pattern = "(..).*\\1",
  match = TRUE
)
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# (.) captures one character; the two \1 back-references require that same
# character to occur at least twice more, with anything in between.
str_view(
  string = c("eleven","six","between"),
  pattern = "(.).*\\1.*\\1"
)