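Pick three books on one subject; at least one book must have more than one author. For each book, record the title, the authors, and three other attributes that you find interesting (xx, xx, xx). Create three files that store the book information in .html, .xml, and .json format.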

In R, load the information from each source into a separate R data frame. Hand in the three source files and the .Rmd file.

load data from HTML file to R

Read text from HTML file

# Parse books.html with internal nodes enabled; the resulting document is
# queried with XPath further down.
extractHTML <- htmlTreeParse(
  "books.html",
  useInternalNodes = TRUE
)
extractHTML # echo the parsed document to confirm the file was read

## <!DOCTYPE html>
## <html>
## <head>
## <meta charset="UTF-8">
## <title>My Bookshelf</title>
## <style type="text/css">
##          body {
##          padding-left: 11em;
##              font-family: Georgia, "Times New Roman", Times, serif;
##          color: black;
##                  background-color: #d8da3d }
##      ul.navbar {
##              position: absolute;
##              top: 2em;
##              left: 1em;
##              width: 9em 
##          background-image: url(https://images-na.ssl-images-amazon.com/images/G/01/gno/sprites/global-sprite_bluebeacon-v1._CB327533540_.png);}
##      h1 {
##              font-family: Helvetica, Geneva, Arial, SunSans-Regular, sans-serif;
##          color: purple }
##      ul.navbar li {
##              background: lightgrey;
##              margin: 0.5em 0;
##              padding: 0.3em; }
##          ul.navbar a {
##              text-decoration: none }
##          a:visited {
##              color: blue }
##         </style>
## </head>
## <body>
##  
##  <!-- Main content -->
##  <h1>My Bookshelf</h1>
## 
##  <p>Welcome to read my favory books!
## 
##  <book id="1"><p><a href="https://www.amazon.com/Sherlock-Holmes-Missing-Shakespeare-Watson/dp/1546319247/ref=sr_1_7?ie=UTF8&amp;qid=1507847849&amp;sr=8-7&amp;keywords=sherlock+holmes+books"><b>Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)</b></a>
##              </p>
## <p></p>
## <title>Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)</title>
## <p><author>J.R. Rain, Chanel Smith</author></p>
## <p><genre>Crime Mystery</genre></p>
## <p><price>6.99</price></p>
## <p><isbn>9781546319245</isbn></p>
## <p><publisher>CreateSpace Independent Publishing Platform</publisher></p>
## <p><publish_date>2017-04-25</publish_date></p></book><book id="2"><p><a href="https://www.amazon.com/Kindaichi-Case-Files-Opera-Murders/dp/1591823544/ref=sr_1_1?s=books&amp;ie=UTF8&amp;qid=1507847970&amp;sr=1-1&amp;keywords=kindaichi+case+files"><b>The Kindaichi Case Files: The Opera House Murders</b></a>       
##              </p>
## <p></p>
## <title>The Kindaichi Case Files: The Opera House Murders</title>
## <p><author>Yozaburo Kanari, Fumiya Sato</author></p>
## <p><genre>Crime Mystery</genre></p>
## <p><price>14.96</price></p>
## <p><isbn>9781591823544</isbn></p>
## <p><publisher>TokyoPop</publisher></p>
## <p><publish_date>2003-06-10</publish_date></p></book><book id="3"><p><a href="https://www.amazon.com/NCIS-Los-Angeles-Jeff-Mariotte/dp/178329633X/ref=pd_sim_14_15?_encoding=UTF8&amp;pd_rd_i=178329633X&amp;pd_rd_r=TQJNXG0MHMWC7KNPXVFA&amp;pd_rd_w=L5CS4&amp;pd_rd_wg=CSZyP&amp;psc=1&amp;refRID=TQJNXG0MHMWC7KNPXVFA"><b>NCIS Los Angeles: Bolthole</b></a>                      
##              </p>
## <p></p>
## <title>NCIS Los Angeles: Bolthole</title>
## <p><author>Jeff Mariotte</author></p>
## <p><genre>Crime Mystery</genre></p>
## <p><price>6.47</price></p>
## <p><isbn>9781783296330</isbn></p>
## <p><publisher>Titan Books</publisher></p>
## <p><publish_date>2016-11-29</publish_date></p></book></p>
## </body>
## </html>
##

# Collect the text of every <p> element in the parsed HTML document.
htmldf <- unlist(xpathApply(extractHTML, "//p", xmlValue))
# htmldf # check values

# Layout of htmldf: element 1 is the outer "Welcome" paragraph; after it each
# book contributes a run of 8 paragraphs (title link, an empty <p>, author,
# genre, price, ISBN, publisher, publish date).
fields_per_book <- 8L
book_values <- htmldf[-1]
stopifnot(
  length(book_values) > 0,
  length(book_values) %% fields_per_book == 0
)

# Build a matrix with one row per book instead of slicing by hard-coded index
# ranges, so the code keeps working when books are added or removed.
htmldat <- matrix(book_values, ncol = fields_per_book, byrow = TRUE)
rownames(htmldat) <- paste0("book", seq_len(nrow(htmldat)))

# Data cleaning: remove the column produced by the empty <p> element, then
# strip the "\r\n"/tab padding that the title paragraphs carry from the markup.
htmldat <- htmldat[, -2, drop = FALSE]
htmldat[] <- trimws(htmldat)
htmldat <- data.frame(htmldat)

# Give the table a header.
# NOTE(review): "Catogory" is a misspelling of "Category"; it is kept as-is so
# that any code referring to the existing column name keeps working.
names(htmldat) <- c(
  "Titles", "Authors", "Catogory", "Price", "ISBN", "Publisher", "Pub_Date"
)
htmldat

##                                                                                      Titles
## book1 Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)\r\n      \t\t
## book2                  The Kindaichi Case Files: The Opera House Murders      \t\r\n      \t\t
## book3                                 NCIS Los Angeles: Bolthole      \t      \t\t\r\n      \t\t
##                            Authors      Catogory Price          ISBN
## book1      J.R. Rain, Chanel Smith Crime Mystery  6.99 9781546319245
## book2 Yozaburo Kanari, Fumiya Sato Crime Mystery 14.96 9781591823544
## book3                Jeff Mariotte Crime Mystery  6.47 9781783296330
##                                         Publisher   Pub_Date
## book1 CreateSpace Independent Publishing Platform 2017-04-25
## book2                                    TokyoPop 2003-06-10
## book3                                 Titan Books 2016-11-29

#str(htmldat)

Read text from XML file

# Parse books.xml into an XML document object.
extractXML <- xmlParse("books.xml")
# extractXML # check inputs

# Convert the parsed document into a data frame and echo it.
xmldat <- xmlToDataFrame(extractXML)
xmldat # check values

##   bookid
## 1      1
## 2      2
## 3      3
##                                                                       title
## 1 Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)
## 2                         The Kindaichi Case Files: The Opera House Murders
## 3                                                NCIS Los Angeles: Bolthole
##                        author         genre price          isbn
## 1     J.R. Rain, Chanel Smith Crime Mystery  6.99 9781546319245
## 2 Yozaburo Kanari,Fumiya Sato Crime Mystery 14.96 9781591823544
## 3               Jeff Mariotte Crime Mystery  6.47 9781783296330
##                                     publisher publish_date
## 1 CreateSpace Independent Publishing Platform   2017-04-25
## 2                                    TokyoPop   2003-06-10
## 3                                 Titan Books   2016-11-29

Read text from JSON file

# Read books.json; the result is iterated below as a list of book records.
extractJSON <- fromJSON(file = "books.json")
# extractJSON # check inputs

# Flatten each record into a vector. NULL fields are replaced with NA first,
# because unlist() would otherwise drop them and shorten the record.
# vapply() is used instead of sapply() so the mask is always a logical vector:
# sapply() returns list() for an empty record, which is not a valid subscript.
jsondf <- lapply(extractJSON, function(x) {
  x[vapply(x, is.null, logical(1))] <- NA
  unlist(x)
})

# Stack the flattened records row-wise and convert to a data frame.
jsondat <- as.data.frame(do.call("rbind", jsondf))
jsondat

##   bookid
## 1      1
## 2      2
## 3      3
##                                                                       title
## 1 Sherlock Holmes and the Missing Shakespeare (The Watson Files) (Volume 1)
## 2                         The Kindaichi Case Files: The Opera House Murders
## 3                                                NCIS Los Angeles: Bolthole
##                        author         genre price          isbn
## 1     J.R. Rain, Chanel Smith Crime Mystery  6.99 9781546319245
## 2 Yozaburo Kanari,Fumiya Sato Crime Mystery 14.96 9781591823544
## 3               Jeff Mariotte Crime Mystery  6.47 9781783296330
##                                     publisher publish_date
## 1 CreateSpace Independent Publishing Platform   2017-04-25
## 2                                    TokyoPop   2003-06-10
## 3                                 Titan Books   2016-11-29

book

Chunmei Zhu

October 12, 2017

load data from HTML file to R

Read text from HTML file

Read text from XML file

Read text from JSON file