library(devtools)
library(tidyverse)
library(RCurl)
library(XML)
library(jsonlite)
library(knitr)

Sources:

Favorite Books XML file

Favorite Books HTML Table file

Favorite Books HTML file

Favorite Books JSON file

Process the XML file

# Download the XML document from GitHub.
# NOTE: despite its name, `filename` holds the downloaded document text
# returned by getURL(), not a path on disk.
filename <- getURL("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA607%20Spring%202021/Week7/Homework7/Data/Fave_Hist_Books.xml")
# asText = TRUE states explicitly that the first argument is XML content,
# rather than leaving xmlParse() to infer that from the string.
fave_books_xml <- xmlParse(filename, asText = TRUE)
# Print the parsed document.
fave_books_xml
## <?xml version="1.0" encoding="UTF-8"?>
## <books>
##   <book pages="398" lang="eng">
##     <title>Faith of My Fathers</title>
##     <author_1>
##       <first_name>John</first_name>
##       <last_name>McCain</last_name>
##       <educ>US Naval Academy</educ>
##       <career>Republican Senator</career>
##       <misc>Naval Aviator</misc>
##     </author_1>
##     <isbn-13>978-0-3995-9089-4</isbn-13>
##     <publisher>Random House Publishing Group</publisher>
##     <pub_year>1998</pub_year>
##   </book>
##   <book pages="578" lang="eng">
##     <title>Indianapolis</title>
##     <author_1>
##       <first_name>Lynn</first_name>
##       <last_name>Vincent</last_name>
##       <educ>US Navy Veteran</educ>
##       <career>Journalist</career>
##       <misc>Amazon Best of 2018</misc>
##     </author_1>
##     <author_2>
##       <first_name>Sara</first_name>
##       <last_name>Vladic</last_name>
##       <educ>Pepperdine University</educ>
##       <career>Documentary Filmmaker</career>
##       <misc>San Diego Film Award</misc>
##     </author_2>
##     <isbn-13>978-1-5011-3594-1</isbn-13>
##     <publisher>Simon and Schuster Paperbacks</publisher>
##     <pub_year>2019</pub_year>
##   </book>
##   <book pages="480" lang="eng">
##     <title>We Were Soldiers Once...and Young</title>
##     <author_1>
##       <first_name>Hal</first_name>
##       <last_name>Moore</last_name>
##       <educ>West Point</educ>
##       <career>Lt General, US Army</career>
##       <misc>Vietnam Veteran</misc>
##     </author_1>
##     <author_2>
##       <first_name>Joseph</first_name>
##       <last_name>Galloway</last_name>
##       <educ>Bronze Star V-Device</educ>
##       <career>Author,Journalist</career>
##       <misc>Decorated non-combatant</misc>
##     </author_2>
##     <isbn-13>978-0-3454-7581-7</isbn-13>
##     <publisher>Random House Publishing Group</publisher>
##     <pub_year>2004</pub_year>
##   </book>
## </books>
## 
# Inspect the class of the document parsed with the default settings.
class(fave_books_xml)
## [1] "XMLInternalDocument" "XMLAbstractDocument"
# Convert the parsed document to a data frame and print it.
# NOTE(review): xmlToDataFrame() is documented for shallow XML; the nested
# <author_1>/<author_2> elements presumably collapse into single text
# cells — confirm against the printed result.
xml_books_df <- xmlToDataFrame(fave_books_xml)
xml_books_df
# Parse the same text again with useInternalNodes = FALSE to compare the
# resulting object. asText = TRUE makes explicit that `filename` holds the
# XML content itself rather than a path.
fave_books_xml_1 <- xmlParse(filename, asText = TRUE, useInternalNodes = FALSE)
fave_books_xml_1
## $doc
## $file
## [1] "<buffer>"
## 
## $version
## [1] "1.0"
## 
## $children
## $children$books
## <books>
##  <book pages="398" lang="eng">
##   <title>Faith of My Fathers</title>
##   <author_1>
##    <first_name>John</first_name>
##    <last_name>McCain</last_name>
##    <educ>US Naval Academy</educ>
##    <career>Republican Senator</career>
##    <misc>Naval Aviator</misc>
##   </author_1>
##   <isbn-13>978-0-3995-9089-4</isbn-13>
##   <publisher>Random House Publishing Group</publisher>
##   <pub_year>1998</pub_year>
##  </book>
##  <book pages="578" lang="eng">
##   <title>Indianapolis</title>
##   <author_1>
##    <first_name>Lynn</first_name>
##    <last_name>Vincent</last_name>
##    <educ>US Navy Veteran</educ>
##    <career>Journalist</career>
##    <misc>Amazon Best of 2018</misc>
##   </author_1>
##   <author_2>
##    <first_name>Sara</first_name>
##    <last_name>Vladic</last_name>
##    <educ>Pepperdine University</educ>
##    <career>Documentary Filmmaker</career>
##    <misc>San Diego Film Award</misc>
##   </author_2>
##   <isbn-13>978-1-5011-3594-1</isbn-13>
##   <publisher>Simon and Schuster Paperbacks</publisher>
##   <pub_year>2019</pub_year>
##  </book>
##  <book pages="480" lang="eng">
##   <title>We Were Soldiers Once...and Young</title>
##   <author_1>
##    <first_name>Hal</first_name>
##    <last_name>Moore</last_name>
##    <educ>West Point</educ>
##    <career>Lt General, US Army</career>
##    <misc>Vietnam Veteran</misc>
##   </author_1>
##   <author_2>
##    <first_name>Joseph</first_name>
##    <last_name>Galloway</last_name>
##    <educ>Bronze Star V-Device</educ>
##    <career>Author,Journalist</career>
##    <misc>Decorated non-combatant</misc>
##   </author_2>
##   <isbn-13>978-0-3454-7581-7</isbn-13>
##   <publisher>Random House Publishing Group</publisher>
##   <pub_year>2004</pub_year>
##  </book>
## </books>
## 
## 
## attr(,"class")
## [1] "XMLDocumentContent"
## 
## $dtd
## $external
## NULL
## 
## $internal
## NULL
## 
## attr(,"class")
## [1] "DTDList"
## 
## attr(,"class")
## [1] "XMLDocument"         "XMLAbstractDocument"
# Inspect the class of the tree built with useInternalNodes = FALSE
# (the object created and printed just above), not the first parse again.
class(fave_books_xml_1)
## [1] "XMLDocument"         "XMLAbstractDocument"

Process the HTML Table file

# Download the HTML-table version of the book list.
# NOTE: `filename` holds the downloaded HTML text, not a path on disk.
filename <- getURL("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA607%20Spring%202021/Week7/Homework7/Data/Fave_Hist_Books_tbl.html")
# Parse the text with the HTML parser; asText = TRUE makes explicit that the
# first argument is the content itself.
# NOTE(review): the parser message echoed below is presumably caused by an
# unescaped '&' in the source HTML — confirm.
fave_books_html_tbl <- xmlParse(filename, asText = TRUE, isHTML = TRUE)
## htmlParseEntityRef: no name
fave_books_html_tbl
## <!DOCTYPE html>
## <html><body><table style="witdh:100%">
## <tr>
## <h1>Favorite Books</h1>
##     <p>World War II</p>
## </tr>
## <tr>
## <td>Title: Faith of My Fathers</td>
##     <td>Author: John McCain</td>
##     <td>Education: US Naval Academy</td>
##     <td>Career: Republican Senator</td>
##     <td>Miscellaneous: Naval Aviator</td>
##     <td>ISBN-13: 978-0-3995-9089-4</td>
##     <td>Publisher: Random House Publishing Group</td>
##     <td>Publication Year: 1998</td>
##     <td>Pages: 398</td>
##     <td>Language: English</td>
## </tr>
## <tr>
## <td>Title: Indianapolis</td>
##     <td>Author: Lynn Vincent</td>
##     <td>Education: US Navy Veteran</td>
##     <td>Career: Journalist</td>
##     <td>Miscellaneous: Amazon Best of 2018</td>
##     <td>Author: Sara Vladic</td>
##     <td>Education: Pepperdine University</td>
##     <td>Career: Documentary Filmmaker</td>
##     <td>Miscellaneous: San Diego Film Award</td>
##     <td>ISBN-13: 978-1-5011-3594-1</td>
##     <td>Publisher: Simon &amp; Schuster Paperbacks</td>
##     <td>Publication Year: 2019</td>
## <td>
##     </td>
## <td>Pages: 578</td>
## <td>
##     </td>
## <td>Language: English</td>
## <td>
## </td>
## </tr>
## <tr>
## <td>Title: We Were Soldiers Once...and Young</td>
##     <td>Author: Hal Moore</td>
##     <td>Education: West Point</td>
##     <td>Career: Lt General, US Army</td>
##     <td>Miscellaneous: Vietname Veteran</td>
##     <td>Author: Joseph Galloway</td>
##     <td>Education: Bronze Star V-Device</td>
##     <td>Career: Author,Journalist</td>
##     <td>Miscellaneous: Decorated non-combatant</td>
##     <td>ISBN-13: 978-0-3454-7581-7</td>
##     <td>Publisher: Random House Publishing Group</td>
##     <td>Publication Year: 2004</td>
##     <td>Pages: 480</td>
##     <td>Language: English</td>
## </tr>
## </table></body></html>
## 
# Inspect the class of the document produced by xmlParse(isHTML = TRUE).
class(fave_books_html_tbl)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"
# Convert the parsed HTML document to a data frame and print it.
html_tbl_books_df <- xmlToDataFrame(fave_books_html_tbl)
html_tbl_books_df
# Parse the same text with htmlParse() for comparison. htmlParse() already
# defaults to isHTML = TRUE, so that argument is omitted; asText = TRUE makes
# explicit that `filename` holds the HTML content rather than a path.
fave_books_html_tbl_1 <- htmlParse(filename, asText = TRUE)
fave_books_html_tbl_1
## <!DOCTYPE html>
## <html><body><table style="witdh:100%">
## <tr>
## <h1>Favorite Books</h1>
##     <p>World War II</p>
## </tr>
## <tr>
## <td>Title: Faith of My Fathers</td>
##     <td>Author: John McCain</td>
##     <td>Education: US Naval Academy</td>
##     <td>Career: Republican Senator</td>
##     <td>Miscellaneous: Naval Aviator</td>
##     <td>ISBN-13: 978-0-3995-9089-4</td>
##     <td>Publisher: Random House Publishing Group</td>
##     <td>Publication Year: 1998</td>
##     <td>Pages: 398</td>
##     <td>Language: English</td>
## </tr>
## <tr>
## <td>Title: Indianapolis</td>
##     <td>Author: Lynn Vincent</td>
##     <td>Education: US Navy Veteran</td>
##     <td>Career: Journalist</td>
##     <td>Miscellaneous: Amazon Best of 2018</td>
##     <td>Author: Sara Vladic</td>
##     <td>Education: Pepperdine University</td>
##     <td>Career: Documentary Filmmaker</td>
##     <td>Miscellaneous: San Diego Film Award</td>
##     <td>ISBN-13: 978-1-5011-3594-1</td>
##     <td>Publisher: Simon &amp; Schuster Paperbacks</td>
##     <td>Publication Year: 2019</td>
## <td>
##     </td>
## <td>Pages: 578</td>
## <td>
##     </td>
## <td>Language: English</td>
## <td>
## </td>
## </tr>
## <tr>
## <td>Title: We Were Soldiers Once...and Young</td>
##     <td>Author: Hal Moore</td>
##     <td>Education: West Point</td>
##     <td>Career: Lt General, US Army</td>
##     <td>Miscellaneous: Vietname Veteran</td>
##     <td>Author: Joseph Galloway</td>
##     <td>Education: Bronze Star V-Device</td>
##     <td>Career: Author,Journalist</td>
##     <td>Miscellaneous: Decorated non-combatant</td>
##     <td>ISBN-13: 978-0-3454-7581-7</td>
##     <td>Publisher: Random House Publishing Group</td>
##     <td>Publication Year: 2004</td>
##     <td>Pages: 480</td>
##     <td>Language: English</td>
## </tr>
## </table></body></html>
## 
# Inspect the class of the htmlParse() result, for comparison with the
# class of the xmlParse(isHTML = TRUE) result shown earlier.
class(fave_books_html_tbl_1)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"

Process the HTML file

# Download the definition-list (<dl>) HTML version of the book list.
# NOTE: `filename` holds the downloaded HTML text, not a path on disk.
filename <- getURL("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA607%20Spring%202021/Week7/Homework7/Data/Fave_Hist_Books.html")
# Parse the text with the HTML parser; asText = TRUE makes explicit that the
# first argument is the content itself.
# NOTE(review): the parser message echoed below is presumably caused by an
# unescaped '&' in the source HTML — confirm.
fave_books_html <- xmlParse(filename, asText = TRUE, isHTML = TRUE)
## htmlParseEntityRef: no name
fave_books_html
## <!DOCTYPE html>
## <html><body>
## 
## <h1>Favorite Books</h1>
## <p>World War II</p>
## <dl>
## <dt>Title: Faith of My Fathers</dt>
##     <dd>Author: John McCain</dd>
##     <dd>Education: US Naval Academy</dd>
##     <dd>Career: Republican Senator</dd>
##     <dd>Miscellaneous: Naval Aviator</dd>
##     <dd>ISBN-13: 978-0-3995-9089-4</dd>
##     <dd>Publisher: Random House Publishing Group</dd>
##     <dd>Publication Year: 1998<dd>
##     <dd>Pages: 398<dd>
##     <dd>Language: English<dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dl>
## <dl>
## <dt>Title: Indianapolis</dt>
##     <dd>Author: Lynn Vincent</dd>
##     <dd>Education: US Navy Veteran</dd>
##     <dd>Career: Journalist</dd>
##     <dd>Miscellaneous: Amazon Best of 2018</dd>
##     <dd>Author: Sara Vladic</dd>
##     <dd>Education: Pepperdine University</dd>
##     <dd>Career: Documentary Filmmaker</dd>
##     <dd>Miscellaneous: San Diego Film Award</dd>
##     <dd>ISBN-13: 978-1-5011-3594-1</dd>
##     <dd>Publisher: Simon &amp; Schuster Paperbacks</dd>
##     <dd>Publication Year: 2019<dd>
##     <dd>Pages: 578<dd>
##     <dd>Language: English<dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dl>
## <dl>
## <dt>Title: We Were Soldiers Once...and Young</dt>
##     <dd>Author: Hal Moore</dd>
##     <dd>Education: West Point</dd>
##     <dd>Career: Lt General, US Army</dd>
##     <dd>Miscellaneous: Vietname Veteran</dd>
##     <dd>Author: Joseph Galloway</dd>
##     <dd>Education: Bronze Star V-Device</dd>
##     <dd>Career: Author,Journalist</dd>
##     <dd>Miscellaneous: Decorated non-combatant</dd>
##     <dd>ISBN-13: 978-0-3454-7581-7</dd>
##     <dd>Publisher: Random House Publishing Group</dd>
##     <dd>Publication Year: 2004</dd>
##     <dd>Pages: 480</dd>
##     <dd>Language: English</dd>
## </dl>
## </body></html>
## 
# Inspect the class of the document produced by xmlParse(isHTML = TRUE).
class(fave_books_html)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"
# Convert the parsed HTML document to a data frame and print it.
html_books_df <- xmlToDataFrame(fave_books_html)
html_books_df
# Parse the same text with htmlParse() for comparison. htmlParse() already
# defaults to isHTML = TRUE, so that argument is omitted; asText = TRUE makes
# explicit that `filename` holds the HTML content rather than a path.
fave_books_html_1 <- htmlParse(filename, asText = TRUE)
fave_books_html_1
## <!DOCTYPE html>
## <html><body>
## 
## <h1>Favorite Books</h1>
## <p>World War II</p>
## <dl>
## <dt>Title: Faith of My Fathers</dt>
##     <dd>Author: John McCain</dd>
##     <dd>Education: US Naval Academy</dd>
##     <dd>Career: Republican Senator</dd>
##     <dd>Miscellaneous: Naval Aviator</dd>
##     <dd>ISBN-13: 978-0-3995-9089-4</dd>
##     <dd>Publisher: Random House Publishing Group</dd>
##     <dd>Publication Year: 1998<dd>
##     <dd>Pages: 398<dd>
##     <dd>Language: English<dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dl>
## <dl>
## <dt>Title: Indianapolis</dt>
##     <dd>Author: Lynn Vincent</dd>
##     <dd>Education: US Navy Veteran</dd>
##     <dd>Career: Journalist</dd>
##     <dd>Miscellaneous: Amazon Best of 2018</dd>
##     <dd>Author: Sara Vladic</dd>
##     <dd>Education: Pepperdine University</dd>
##     <dd>Career: Documentary Filmmaker</dd>
##     <dd>Miscellaneous: San Diego Film Award</dd>
##     <dd>ISBN-13: 978-1-5011-3594-1</dd>
##     <dd>Publisher: Simon &amp; Schuster Paperbacks</dd>
##     <dd>Publication Year: 2019<dd>
##     <dd>Pages: 578<dd>
##     <dd>Language: English<dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dd>
## </dl>
## <dl>
## <dt>Title: We Were Soldiers Once...and Young</dt>
##     <dd>Author: Hal Moore</dd>
##     <dd>Education: West Point</dd>
##     <dd>Career: Lt General, US Army</dd>
##     <dd>Miscellaneous: Vietname Veteran</dd>
##     <dd>Author: Joseph Galloway</dd>
##     <dd>Education: Bronze Star V-Device</dd>
##     <dd>Career: Author,Journalist</dd>
##     <dd>Miscellaneous: Decorated non-combatant</dd>
##     <dd>ISBN-13: 978-0-3454-7581-7</dd>
##     <dd>Publisher: Random House Publishing Group</dd>
##     <dd>Publication Year: 2004</dd>
##     <dd>Pages: 480</dd>
##     <dd>Language: English</dd>
## </dl>
## </body></html>
## 
# Inspect the class of the htmlParse() result.
class(fave_books_html_1)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"
# Convert the htmlParse() document to a data frame, then print it.
html_books_df_1 <- xmlToDataFrame(fave_books_html_1)
html_books_df_1

Process the JSON file

# Download the JSON version of the book list.
# NOTE: `filename` holds the downloaded JSON text, not a path on disk.
filename <- getURL("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA607%20Spring%202021/Week7/Homework7/Data/Fave_Hist_Books.json")
# Parse the JSON text once and keep the result.
fave_books_json <- fromJSON(filename)
class(fave_books_json)
## [1] "list"
# Reuse the already-parsed list instead of parsing the same text again.
as.data.frame(fave_books_json)