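Assignment: pick three books on one subject; at least one of the books must have more than one author. For each book, record the title, the authors, and other attributes you find interesting (xx, xx, xx). Create three files that store the book information in .html, .xml, and .json formats.
In R, load the information from each of the three sources into a separate R data frame. Hand in the three source files and the .Rmd file.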
# Parse the HTML bookshelf file, keeping internal nodes so the document can be
# queried with XPath afterwards.
extractHTML <- htmlTreeParse(
  file = "books.html",
  useInternalNodes = TRUE
)
extractHTML # check inputs
## <!DOCTYPE html>
## <html>
## <head>
## <meta charset="UTF-8">
## <title>My Bookshelf</title>
## <style type="text/css">
## body {
## padding-left: 11em;
## font-family: Georgia, "Times New Roman", Times, serif;
## color: black;
## background-color: #d8da3d }
## ul.navbar {
## position: absolute;
## top: 2em;
## left: 1em;
## width: 9em
## background-image: url(https://images-na.ssl-images-amazon.com/images/G/01/gno/sprites/global-sprite_bluebeacon-v1._CB327533540_.png);}
## h1 {
## font-family: Helvetica, Geneva, Arial, SunSans-Regular, sans-serif;
## color: purple }
## ul.navbar li {
## background: lightgrey;
## margin: 0.5em 0;
## padding: 0.3em; }
## ul.navbar a {
## text-decoration: none }
## a:visited {
## color: blue }
## </style>
## </head>
## <body>
##
## <!-- Main content -->
## <h1>My Bookshelf</h1>
##
## <p>Welcome to read my favory books!
##
## <book id="1"><p><a href="https://www.amazon.com/Sherlock-Holmes-Missing-Shakespeare-Watson/dp/1546319247/ref=sr_1_7?ie=UTF8&qid=1507847849&sr=8-7&keywords=sherlock+holmes+books"><b>Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)</b></a>
## </p>
## <p></p>
## <title>Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)</title>
## <p><author>J.R. Rain, Chanel Smith</author></p>
## <p><genre>Crime Mystery</genre></p>
## <p><price>6.99</price></p>
## <p><isbn>9781546319245</isbn></p>
## <p><publisher>CreateSpace Independent Publishing Platform</publisher></p>
## <p><publish_date>2017-04-25</publish_date></p></book><book id="2"><p><a href="https://www.amazon.com/Kindaichi-Case-Files-Opera-Murders/dp/1591823544/ref=sr_1_1?s=books&ie=UTF8&qid=1507847970&sr=1-1&keywords=kindaichi+case+files"><b>The Kindaichi Case Files: The Opera House Murders</b></a>
## </p>
## <p></p>
## <title>The Kindaichi Case Files: The Opera House Murders</title>
## <p><author>Yozaburo Kanari, Fumiya Sato</author></p>
## <p><genre>Crime Mystery</genre></p>
## <p><price>14.96</price></p>
## <p><isbn>9781591823544</isbn></p>
## <p><publisher>TokyoPop</publisher></p>
## <p><publish_date>2003-06-10</publish_date></p></book><book id="3"><p><a href="https://www.amazon.com/NCIS-Los-Angeles-Jeff-Mariotte/dp/178329633X/ref=pd_sim_14_15?_encoding=UTF8&pd_rd_i=178329633X&pd_rd_r=TQJNXG0MHMWC7KNPXVFA&pd_rd_w=L5CS4&pd_rd_wg=CSZyP&psc=1&refRID=TQJNXG0MHMWC7KNPXVFA"><b>NCIS Los Angeles: Bolthole</b></a>
## </p>
## <p></p>
## <title>NCIS Los Angeles: Bolthole</title>
## <p><author>Jeff Mariotte</author></p>
## <p><genre>Crime Mystery</genre></p>
## <p><price>6.47</price></p>
## <p><isbn>9781783296330</isbn></p>
## <p><publisher>Titan Books</publisher></p>
## <p><publish_date>2016-11-29</publish_date></p></book></p>
## </body>
## </html>
##
# Collect the text of every <p> node in the parsed HTML document.
# NOTE(review): the slices below assume element 1 is the enclosing welcome
# paragraph and that each book contributes exactly 8 <p> values -- confirm
# against books.html if the file ever changes.
htmldf <- unlist(xpathApply(extractHTML, "//p", xmlValue))
# htmldf # check values
# content1 = gsub('\\n', ' ', content) # same as unlist()

# Build a data frame to store the data. Values are trimmed because the title
# text otherwise carries trailing "\r\n \t\t" padding from the HTML source.
book1 <- trimws(htmldf[2:9])
book2 <- trimws(htmldf[10:17])
book3 <- trimws(htmldf[18:25])
htmldat <- cbind(book1, book2, book3)

# Data cleaning: remove the empty row (second <p> of each book) from the matrix.
htmldat <- htmldat[-2, ]

# Transpose so each book is one row; keep the values as character strings
# instead of letting older R versions convert them to factors.
htmldat <- data.frame(t(htmldat), stringsAsFactors = FALSE)

# Give the table a header. The "Catogory" spelling is kept unchanged so any
# code that already refers to that column keeps working.
names(htmldat) <- c(
  "Titles", "Authors", "Catogory", "Price", "ISBN", "Publisher", "Pub_Date"
)
htmldat
## Titles
## book1 Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)\r\n \t\t
## book2 The Kindaichi Case Files: The Opera House Murders \t\r\n \t\t
## book3 NCIS Los Angeles: Bolthole \t \t\t\r\n \t\t
## Authors Catogory Price ISBN
## book1 J.R. Rain, Chanel Smith Crime Mystery 6.99 9781546319245
## book2 Yozaburo Kanari, Fumiya Sato Crime Mystery 14.96 9781591823544
## book3 Jeff Mariotte Crime Mystery 6.47 9781783296330
## Publisher Pub_Date
## book1 CreateSpace Independent Publishing Platform 2017-04-25
## book2 TokyoPop 2003-06-10
## book3 Titan Books 2016-11-29
#str(htmldat)
# Read books.xml into a parsed XML document.
extractXML <- xmlParse(file = "books.xml")
# extractXML # check inputs

# Convert the parsed document into a data frame and print it.
xmldat <- xmlToDataFrame(doc = extractXML)
xmldat # check values
## bookid
## 1 1
## 2 2
## 3 3
## title
## 1 Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)
## 2 The Kindaichi Case Files: The Opera House Murders
## 3 NCIS Los Angeles: Bolthole
## author genre price isbn
## 1 J.R. Rain, Chanel Smith Crime Mystery 6.99 9781546319245
## 2 Yozaburo Kanari,Fumiya Sato Crime Mystery 14.96 9781591823544
## 3 Jeff Mariotte Crime Mystery 6.47 9781783296330
## publisher publish_date
## 1 CreateSpace Independent Publishing Platform 2017-04-25
## 2 TokyoPop 2003-06-10
## 3 Titan Books 2016-11-29
# Read books.json into an R list.
extractJSON <- fromJSON(file = "books.json")
# extractJSON # check inputs

# Flatten each top-level element into a vector. NULL fields are replaced with
# NA first so that unlist() does not drop them and shorten the record.
jsondf <- lapply(extractJSON, function(x) {
  # vapply() always returns a logical vector, even for an empty record;
  # sapply() would return list() there and the subset assignment would fail.
  x[vapply(x, is.null, logical(1))] <- NA
  unlist(x)
})

# Stack the flattened records as rows and convert to a data frame, keeping the
# values as character strings rather than factors.
jsondat <- as.data.frame(do.call("rbind", jsondf), stringsAsFactors = FALSE)
jsondat
## bookid
## 1 1
## 2 2
## 3 3
## title
## 1 Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)
## 2 The Kindaichi Case Files: The Opera House Murders
## 3 NCIS Los Angeles: Bolthole
## author genre price isbn
## 1 J.R. Rain, Chanel Smith Crime Mystery 6.99 9781546319245
## 2 Yozaburo Kanari,Fumiya Sato Crime Mystery 14.96 9781591823544
## 3 Jeff Mariotte Crime Mystery 6.47 9781783296330
## publisher publish_date
## 1 CreateSpace Independent Publishing Platform 2017-04-25
## 2 TokyoPop 2003-06-10
## 3 Titan Books 2016-11-29