# Load the HTML version of the book list from GitHub.
html_file <- read_html(
  "https://raw.githubusercontent.com/javernw/JWCUNYAssignments/master/books.html"
)
# Parse the loaded page into a document object that can be queried below.
books_html <- htmlParse(html_file)
# Show the parsed document.
print(books_html)
## <!DOCTYPE html>
## <html>
## <head>
## <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
## <title>Subject Books</title>
## </head>
## <style>
## table {
## border-collapse: collapse;
## }
## table, th, td {
## border: 1px solid black;
## text-align: left;
## }
## td {
## height: 50px;
## }
## </style>
## <body>
## <table>
## <tr>
## <th width="30%">Title</th>
## <th width="30%">Author(s)</th>
## <th width="10%">Edition</th>
## <th width="15%">Publisher</th>
## <th width="15%">Date Published</th>
## </tr>
## <tr>
## <td>OpenIntro Statistics</td>
## <td>David M Diez, Christopher D Barr, Mine Çetinkaya-Rundel</td>
## <td>Third</td>
## <td>OpenIntro, Inc.</td>
## <td>07/02/2015</td>
## <td>
## </td>
## </tr>
## <tr>
## <td>R for Everyone: Advanced Analytics and Graphics</td>
## <td>Jared P. Lander</td>
## <td>Second</td>
## <td> Addison-Wesley Professional</td>
## <td>06/18/2017</td>
## </tr>
## <tr>
## <td>Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining</td>
## <td>Simon Munzert, Christian Rubba, Peter Meißner, Dominic Nyhuis</td>
## <td>First</td>
## <td>Wiley</td>
## <td>01/20/2015</td>
## </tr>
## </table>
## </body>
## </html>
##
# Extract the book table from the parsed HTML document as a data frame.
# The option is spelled `stringsAsFactors` (the misspelling `stringAsFactors`
# is not recognised as that option). It is set on both calls so that the
# final as.data.frame() cannot convert the text columns to factors again.
books_html_df <- books_html %>%
  readHTMLTable(header = TRUE, stringsAsFactors = FALSE) %>%
  as.data.frame(stringsAsFactors = FALSE)

# Clean table: drop the sixth column. It appears to come from the stray empty
# <td> at the end of the first data row of the page and carries no data.
books_html_df <- books_html_df[, -6]

# Rename columns to the display headings used for every format.
names(books_html_df) <- c(
  "Title", "Author(s)", "Edition", "Publisher", "Date Published"
)

# Render the result as a striped table.
kable(books_html_df) %>%
  kable_styling(bootstrap_options = "striped")
Title | Author(s) | Edition | Publisher | Date Published |
---|---|---|---|---|
OpenIntro Statistics | David M Diez, Christopher D Barr, Mine Çetinkaya-Rundel | Third | OpenIntro, Inc. | 07/02/2015 |
R for Everyone: Advanced Analytics and Graphics | Jared P. Lander | Second | Addison-Wesley Professional | 06/18/2017 |
Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining | Simon Munzert, Christian Rubba, Peter Meißner, Dominic Nyhuis | First | Wiley | 01/20/2015 |
# Download the XML version of the book list from GitHub.
xml_file <- getURL(
  "https://raw.githubusercontent.com/javernw/JWCUNYAssignments/master/books.xml"
)
# Parse the downloaded content into an XML document object.
books_xml <- xmlParse(xml_file)
# Show the parsed document.
print(books_xml)
## <?xml version="1.0" encoding="UTF-8"?>
## <fav_books>
## <books>
## <title>OpenIntro Statistics</title>
## <authors>David M Diez, Christopher D Barr, Mine Çetinkaya-Rundel</authors>
## <edition>Third</edition>
## <publisher>OpenIntro, Inc.</publisher>
## <date_published>07/02/2015</date_published>
## </books>
## <books>
## <title>R for Everyone: Advanced Analytics and Graphics</title>
## <authors>Jared P. Lander</authors>
## <edition>Second</edition>
## <publisher>Addison-Wesley Professional</publisher>
## <date_published>06/18/2017</date_published>
## </books>
## <books>
## <title>Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining</title>
## <authors>Simon Munzert, Christian Rubba, Peter Meißner, Dominic Nyhuis</authors>
## <edition>First</edition>
## <publisher>Wiley</publisher>
## <date_published>01/20/2015</date_published>
## </books>
## </fav_books>
##
# Extract the root node of the parsed XML document.
xroot <- xmlRoot(books_xml)

# Convert the records under the root node to a data frame, one row per record.
# stringsAsFactors = FALSE keeps the columns as character, matching the data
# frames built from the HTML and JSON versions of the same data.
books_xml_df <- xmlToDataFrame(xroot, stringsAsFactors = FALSE)

# Rename columns to the display headings used for every format.
names(books_xml_df) <- c(
  "Title", "Author(s)", "Edition", "Publisher", "Date Published"
)

# Render the result as a striped table.
kable(books_xml_df) %>%
  kable_styling(bootstrap_options = "striped")
Title | Author(s) | Edition | Publisher | Date Published |
---|---|---|---|---|
OpenIntro Statistics | David M Diez, Christopher D Barr, Mine Çetinkaya-Rundel | Third | OpenIntro, Inc. | 07/02/2015 |
R for Everyone: Advanced Analytics and Graphics | Jared P. Lander | Second | Addison-Wesley Professional | 06/18/2017 |
Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining | Simon Munzert, Christian Rubba, Peter Meißner, Dominic Nyhuis | First | Wiley | 01/20/2015 |
# Location of the JSON version of the book list.
json_file <-
  "https://raw.githubusercontent.com/javernw/JWCUNYAssignments/master/books.json"
# Read the JSON document at that location into R.
books_json <- fromJSON(file = json_file)
# Show the parsed structure.
print(books_json)
## $`favorite books`
## $`favorite books`[[1]]
## $`favorite books`[[1]]$title
## [1] "OpenIntro Statistics"
##
## $`favorite books`[[1]]$authors
## [1] "David M Diez, Christopher D Barr, Mine Cetinkaya-Rundel"
##
## $`favorite books`[[1]]$edition
## [1] "Third"
##
## $`favorite books`[[1]]$publisher
## [1] "OpenIntro, Inc."
##
## $`favorite books`[[1]]$date_published
## [1] "07/02/2015"
##
##
## $`favorite books`[[2]]
## $`favorite books`[[2]]$title
## [1] "R for Everyone: Advanced Analytics and Graphics"
##
## $`favorite books`[[2]]$authors
## [1] "Jared P. Lander"
##
## $`favorite books`[[2]]$edition
## [1] "Second"
##
## $`favorite books`[[2]]$publisher
## [1] "Addison-Wesley Professional"
##
## $`favorite books`[[2]]$date_published
## [1] "06/18/2017"
##
##
## $`favorite books`[[3]]
## $`favorite books`[[3]]$title
## [1] "Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining"
##
## $`favorite books`[[3]]$authors
## [1] "Simon Munzert, Christian Rubba, Peter Meibner, Dominic Nyhuis"
##
## $`favorite books`[[3]]$edition
## [1] "First"
##
## $`favorite books`[[3]]$publisher
## [1] "Wiley"
##
## $`favorite books`[[3]]$date_published
## [1] "01/20/2015"
# books_json is a nested list: its single top-level element holds one
# sub-list per book. Flatten it and reshape into one row per book.
# The row count is taken from the number of book records instead of being
# hard-coded, so the reshape stays correct if books are added or removed.
n_books <- length(books_json[[1L]])
books_json_df <- data.frame(
  matrix(unlist(books_json), nrow = n_books, byrow = TRUE),
  stringsAsFactors = FALSE
)

# Rename columns to the display headings used for every format.
names(books_json_df) <- c(
  "Title", "Author(s)", "Edition", "Publisher", "Date Published"
)

# Render the result as a striped table.
kable(books_json_df) %>%
  kable_styling(bootstrap_options = "striped")
Title | Author(s) | Edition | Publisher | Date Published |
---|---|---|---|---|
OpenIntro Statistics | David M Diez, Christopher D Barr, Mine Cetinkaya-Rundel | Third | OpenIntro, Inc. | 07/02/2015 |
R for Everyone: Advanced Analytics and Graphics | Jared P. Lander | Second | Addison-Wesley Professional | 06/18/2017 |
Automated Data Collection with R A Practical Guide to Web Scraping and Text Mining | Simon Munzert, Christian Rubba, Peter Meibner, Dominic Nyhuis | First | Wiley | 01/20/2015 |