A demonstration of scraping a table with embedded URLs from a U.S. Department of Education web page

library(tidyverse)
library(rvest) # R package that does web scraping
# Record when the script starts so the total run time can be reported later.
start.time <- Sys.time()

A screenshot of the target web page and its HTML elements (press F12 in the browser to open the developer tools)

Sample Web Page from ED.gov

# Download and parse the OIG newsroom page.
ED2 <- read_html("https://www2.ed.gov/about/offices/list/oig/newsroom.html")

# Take every <a> element on the page and keep only its href attribute,
# i.e. the URL embedded in each link.
links <- html_attr(html_nodes(ED2, "a"), "href")

# Keep links 42 through 82. NOTE(review): these positions are hard-coded and
# presumably matched the release table when the page was scraped -- re-check
# them against the live page before reuse.
ED2_fraud <- links[seq(42, 82)]

# Preview the first few URLs.
head(ED2_fraud)
## [1] " https://oag.ca.gov/news/press-releases/attorney-general-bonta-announces-arrests-orange-county-connected-one-nations"        
## [2] "https://www.justice.gov/usao-ma/pr/former-georgetown-head-tennis-coach-agrees-plead-guilty-college-admissions-case"          
## [3] "https://www.justice.gov/usao-ma/pr/swampscott-financial-advisor-sentenced-stealing-former-clients-retirement-assets"         
## [4] "https://www.justice.gov/usao-wdmo/pr/sparta-man-sentenced-600000-tax-evasion-0"                                              
## [5] "https://www.justice.gov/usao-ndfl/pr/owners-florida-teacher-certification-exam-preparation-company-plead-guilty-racketeering"
## [6] "https://www.justice.gov/usao-hi/pr/kauai-woman-pleads-guilty-multiple-wire-fraud-schemes-and-aggravated-identity-theft"
# Confirm how many release links were kept.
length(ED2_fraud)
## [1] 41
# Extract the text of every <td> found under a <tr>, then split each cell's
# text into separate lines at the newline character.
titles <- strsplit(
  html_text(html_nodes(html_nodes(ED2, "tr"), "td")),
  "\n"
)
# Only the lines of the first cell are used. The 114:282 window was chosen by
# inspecting the output by hand, so it needs human supervision whenever the
# page layout changes. Surrounding white space is stripped in the same step.
titles_edit <- trimws(titles[[1]][114:282])
# Show the first 20 entries to observe the repeating pattern in the text.
titles_edit[seq_len(20)]
##  [1] "Subject"                                                                                                                                               
##  [2] "Title of Release"                                                                                                                                      
##  [3] "DateIssued"                                                                                                                                            
##  [4] "Release"                                                                                                                                               
##  [5] "Fraud"                                                                                                                                                 
##  [6] "Attorney General Bonta Announces Arrests in Orange County Connected to One of the Nation's Largest Student Loan Debt Relief Fraud Schemes (California)"
##  [7] "09/21/2021"                                                                                                                                            
##  [8] "READ"                                                                                                                                                  
##  [9] "Fraud"                                                                                                                                                 
## [10] "Former Georgetown Head Tennis Coach Agrees to Plead Guilty in College Admissions Case (Massachusetts)"                                                 
## [11] "09/15/2021"                                                                                                                                            
## [12] "READ"                                                                                                                                                  
## [13] "Fraud"                                                                                                                                                 
## [14] "Swampscott Financial Advisor Sentenced for Stealing Former Client's Retirement Assets (Massachusetts)"                                                 
## [15] "09/14/2021"                                                                                                                                            
## [16] "READ"                                                                                                                                                  
## [17] "Multiple Fraud (Including Student Aid Fraud)"                                                                                                          
## [18] "Sparta Man Sentenced for $600,000 Tax Evasion. Also Illegally Received Federal Benefits (Missouri)"                                                    
## [19] "08/4/2021"                                                                                                                                             
## [20] "READ"
# Discard the empty strings so that only real cell values remain.
titles_edit <- titles_edit[nzchar(titles_edit)]
# Four values per table row (header included): a whole number here means the
# vector can be reshaped into a 4-column table.
length(titles_edit) / 4
## [1] 42
# Reshape the flat vector of cell values into a 4-column table.
# The first 4 entries are the column names; everything after them is table
# content, stored row by row.
#
# Guard the shape up front: a length that is not a multiple of 4 would make
# matrix() recycle values and silently misalign every row.
stopifnot(
  "titles_edit must contain the 4 header cells" = length(titles_edit) >= 4,
  "titles_edit length must be a multiple of 4" = length(titles_edit) %% 4 == 0
)
# Negative indexing drops the header safely even when no data rows follow it
# (5:length(x) would count backwards in that case). The number of rows is
# inferred from the data instead of being hard-coded to 41.
frauddata <- matrix(
  data = titles_edit[-seq_len(4)],
  ncol = 4,
  byrow = TRUE
)
colnames(frauddata) <- titles_edit[seq_len(4)] # the first 4 items are column names
frauddata <- data.frame(frauddata)
dim(frauddata)
## [1] 41  4
# Overwrite the Release column with the scraped URLs. trimws() removes stray
# white space around a URL (the first link printed earlier starts with a
# blank).
frauddata[["Release"]] <- trimws(ED2_fraud)
# Display the assembled table as an interactive HTML widget.
library(DT)
datatable(data = frauddata)

Read the text from a list of URLs

# Drop the release hosted on www.myfloridalegal.com: in the first run this
# website triggered an error, so it is excluded. fixed = TRUE matches the host
# name literally instead of treating each "." as a regex wildcard.
frauddata <- frauddata[
  !grepl("www.myfloridalegal.com", frauddata$Release, fixed = TRUE),
]

# Download each release and collapse the text of all its <p> elements into a
# single string. lapply() handles an empty URL vector correctly (1:length(x)
# would iterate over c(1, 0)), and a page that fails to load now yields NA
# with a warning instead of aborting the whole scrape.
alltexts <- lapply(frauddata$Release, function(url) {
  tryCatch(
    paste(
      read_html(url) %>% html_nodes("p") %>% html_text2(),
      collapse = " "
    ),
    error = function(e) {
      warning(
        "Could not read ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )
})

# Flatten the list into a character vector with one element per release.
all_texts <- vapply(alltexts, identity, character(1), USE.NAMES = FALSE)
all_texts[2:3] # success!
## [1] "BOSTON – The former head coach of men and women’s tennis at Georgetown University has agreed to plead guilty in connection with soliciting and accepting bribes to facilitate the admission of prospective Georgetown applicants and failing to report a significant portion of those bribe payments on his federal income taxes. Gordon Ernst, 54, of Chevy Chase, Md. and Falmouth, Mass., will plead guilty to one count of conspiracy to commit federal programs bribery, three counts of federal programs bribery and one count of filing a false tax return. A plea hearing has not yet been scheduled. According to the terms of the plea agreement, the parties have agreed to a sentence of at least one year and up to four years in prison, two years of supervised release and forfeiture of $3,435,053. As set forth in the charging document, Ernst solicited and received bribe payments from William “Rick” Singer and prospective Georgetown applicants to facilitate their admission to Georgetown as student athletes. Ernst then failed to report a significant portion of those bribe payments on his federal income tax returns. The charge of federal programs bribery provides for a sentence of up to 10 years in prison, three years of supervised release and a fine of $250,000 or twice the gross gain or loss, whichever is greater. The charge of conspiracy to commit federal programs bribery provides for a sentence of up to five years in prison, three years of supervised release a fine of $250,000 or twice the gross gain or loss, whichever is greater. The charge of filing a false tax return provides for a sentence of up to three years in prison, one year of supervised release and a fine of $100,000. Sentences are imposed by a federal district court judge based upon the U.S. Sentencing Guidelines and other statutory factors. 
Case information, including the status of each defendant, charging documents, and plea agreements are available here: https://www.justice.gov/usao-ma/investigations-college-admissions-and-testing-bribery-scheme. Acting United States Attorney Nathaniel R. Mendell; Joseph R. Bonavolonta, Special Agent in Charge of the Federal Bureau of Investigation, Boston Division; Joleen D. Simpson, Special Agent in Charge of the Internal Revenue Service’s Criminal Investigations in Boston; and Mark Deckett, Resident Agent in Charge of the Department of Education, Office of Inspector General made the announcement today. Assistant U.S. Attorneys Leslie A. Wright, Kristen A. Kearney and Kriss Basil of Mendell’s Criminal Division are prosecuting the case. The details contained in the charging documents are allegations. The remaining defendants are presumed innocent until proven guilty beyond a reasonable doubt in a court of law.  Making sure victims of federal crimes are treated with compassion, fairness and respect.\n     Information on the Boston Marathon bombing cases \n"
## [2] "BOSTON – A Swampscott financial advisor was sentenced today in federal court in Boston for defrauding an elderly victim and her bank by stealing the victim’s retirement assets. Felix Gorovodsky, 29, was sentenced by U.S. District Court Judge Denise J. Casper to 33 months in prison and two years of supervised release. Gorovodsky was also ordered to pay restitution of $310,492. On May 11, 2021, Gorovodsky pleaded guilty to one count of bank fraud. Gorovodsky served as a financial advisor for the victim. In or about July 2019, the victim terminated that advisor relationship and revoked the power of attorney she had previously granted Gorovodsky. Approximately nine months later, Gorovodsky accessed and liquidated the victim’s bank account, transferring more than $250,000 into his own bank account. Gorovodsky then used the victim’s stolen retirement funds for personal expenses, including paying off more than $100,000 in federal student loans. As part of the scheme, Gorovodsky forged the victim’s signature on a purported “gift letter,” which he sent to the bank in an attempt to legitimize the fraudulent transfer. Acting United States Attorney Nathaniel R. Mendell and Joseph R. Bonavolonta, Special Agent in Charge of the Federal Bureau of Investigation, Boston Division made the announcement. The Department of Education, Office of Inspector General provided valuable assistance with the investigation. Assistant U.S. Attorneys Ian J. Stearns and Mackenzie A. Queenin of Mendell’s Securities, Financial & Cyber Fraud Unit prosecuted the case.  Making sure victims of federal crimes are treated with compassion, fairness and respect.\n     Information on the Boston Marathon bombing cases \n"
# Store the scraped article text as a new `text` column of the data frame.
frauddata[["text"]] <- all_texts
# Optional export to an Excel workbook (disabled):
#writexl::write_xlsx(list("ED_fraud_2020_2021" = frauddata), "ED_Fraud_.xlsx")
# Print the recorded start time and the current time to show how long the
# script took.
start.time
Sys.time()
## [1] "2021-09-27 01:00:28 EDT"
## [1] "2021-09-27 01:00:38 EDT"