The three text files were created in Notepad++ and uploaded to my GitHub page, and the URLs scraped below point to those files on GitHub. All the data frames look the same, but with JSON I was able to specify the variables ‘pubdate’ and ‘pages’ as integers in the file itself, before it was scraped. I like this because it removes a step when it is time to tidy the data.
# Fetch books.html from GitHub and parse its table(s).
# NOTE(review): despite its name, `url` holds whatever getURL() returns for
# the address (presumably the document text, if this is RCurl::getURL), and
# that value is what gets handed to the parser below.
url <- getURL(
  "https://raw.githubusercontent.com/smithchad17/Class607/master/books.html"
)

# Spell out TRUE/FALSE: `T` and `F` are ordinary variables that can be
# reassigned, so relying on them can silently flip these options.
html_books <- readHTMLTable(
  url,
  as.data.frame = TRUE,
  stringsAsFactors = FALSE
)

# Auto-print the parsed result for inspection.
html_books
## $`NULL`
## title authors
## 1 The Hunt for Red October Tom Clancy
## 2 Dead or Alive Tom Clancy, Grant Blackwood
## 3 Locked On Tom Clancy, Mark Greaney
## publisher pubdate pages
## 1 Navy Institute Press 1984 387
## 2 G.P. Putnam's Sons 2010 950
## 3 G.P. Putnam's Sons 2011 853
# Show the structure (container type and column types) of the HTML result.
str(object = html_books)
## List of 1
## $ NULL:'data.frame': 3 obs. of 5 variables:
## ..$ title : chr [1:3] "The Hunt for Red October" "Dead or Alive" "Locked On"
## ..$ authors : chr [1:3] "Tom Clancy" "Tom Clancy, Grant Blackwood" "Tom Clancy, Mark Greaney"
## ..$ publisher: chr [1:3] "Navy Institute Press" "G.P. Putnam's Sons" "G.P. Putnam's Sons"
## ..$ pubdate : chr [1:3] "1984" "2010" "2011"
## ..$ pages : chr [1:3] "387" "950" "853"
# Fetch books.xml from GitHub and convert it to a data frame.
# NOTE(review): `url` is reused here and ends up holding the value returned
# by getURL() (presumably the document text), not the address itself.
url <- getURL(
  "https://raw.githubusercontent.com/smithchad17/Class607/master/books.xml"
)

# Use FALSE rather than `F`: `F` is a reassignable variable, not a constant.
xml_books <- xmlToDataFrame(url, stringsAsFactors = FALSE)

# Inspect the column types of the XML-derived data frame.
str(xml_books)
## 'data.frame': 3 obs. of 5 variables:
## $ title : chr "The Hunt for Red October" "Dead or Alive" "Locked On"
## $ authors : chr "Tom Clancy" "Tom Clancy, Grant Blackwood" "Tom Clancy, Mark Greaney"
## $ publisher: chr "Navy Institute Press" "G.P. Putnam's Sons" "G.P. Putnam's Sons"
## $ pubdate : chr "1984" "2010" "2011"
## $ pages : chr "387" "950" "853"
# Fetch books.json from GitHub; `url` is overwritten with the value that
# getURL() returns for this address.
url <- getURL(
  "https://raw.githubusercontent.com/smithchad17/Class607/master/books.json"
)

# Parse the fetched JSON into an R object.
json_books <- fromJSON(url)

# Inspect the structure of the parsed JSON result.
str(json_books)
## List of 1
## $ books:'data.frame': 3 obs. of 5 variables:
## ..$ title : chr [1:3] "Hunt for Red October" "Dead or Alive" "Hunt for Red October"
## ..$ authors : chr [1:3] "Tom Clancy" "Tom Clancy, Grant Blackwood" "Tom Clancy, Mark Greaney"
## ..$ publishers: chr [1:3] "Navy Institute Press" "G.P. Putnam's Sons" "G.P. Putnam's Sons"
## ..$ pubdate : int [1:3] 1984 2010 2011
## ..$ pages : int [1:3] 387 950 853