# 607-week10-assignment.R

library(XML)
## Warning: package 'XML' was built under R version 3.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Scrape the first HTML table from the Super Bowl pool blog post. The table is
# read without a header row, and text is kept as character rather than factor.
theURL <- "http://www.jaredlander.com/2012/02/another-kind-of-super-bowl-pool/"
bowlPool <- readHTMLTable(
  theURL,
  which = 1,
  header = FALSE,
  stringsAsFactors = FALSE
)
print(bowlPool)
##                V1      V2        V3
## 1   Participant 1 Giant A Patriot Q
## 2   Participant 2 Giant B Patriot R
## 3   Participant 3 Giant C Patriot S
## 4   Participant 4 Giant D Patriot T
## 5   Participant 5 Giant E Patriot U
## 6   Participant 6 Giant F Patriot V
## 7   Participant 7 Giant G Patriot W
## 8   Participant 8 Giant H Patriot X
## 9   Participant 9 Giant I Patriot Y
## 10 Participant 10 Giant J Patriot Z
# 1. What type of data structure is bowlPool?
# str() reports the container type, its dimensions, and the type of each
# column in a single call.
str(bowlPool)
## 'data.frame':    10 obs. of  3 variables:
##  $ V1: chr  "Participant 1" "Participant 2" "Participant 3" "Participant 4" ...
##  $ V2: chr  "Giant A" "Giant B" "Giant C" "Giant D" ...
##  $ V3: chr  "Patriot Q" "Patriot R" "Patriot S" "Patriot T" ...
# class() gives the direct answer to question 1.
class(bowlPool)
## [1] "data.frame"
# 2. Suppose instead you call readHTMLTable() with just the URL argument,
# against the provided URL, as shown below
theURL <- "http://www.w3schools.com/html/html_tables.asp"
# Called with only the URL, readHTMLTable() returns a list rather than a
# single data frame (see the class() and str() results below).
hvalues <- readHTMLTable(theURL)
# What is the type of variable returned in hvalues?

# Class of each element. vapply() always yields a character vector, whereas
# sapply() changes its return type when the input is empty or ragged.
vapply(hvalues, class, character(1))
##         NULL         NULL         NULL         NULL         NULL 
## "data.frame"       "NULL"       "NULL"       "NULL"       "NULL" 
##         NULL 
## "data.frame"
# or
class(hvalues)
## [1] "list"
# or
str(hvalues)
## List of 6
##  $ NULL:'data.frame':    4 obs. of  4 variables:
##   ..$ Number    : Factor w/ 4 levels "1","2","3","4": 1 2 3 4
##   ..$ First Name: Factor w/ 4 levels "Adam","Eve","Jill",..: 2 4 1 3
##   ..$ Last Name : Factor w/ 4 levels "Doe","Jackson",..: 2 1 3 4
##   ..$ Points    : Factor w/ 4 levels "50","67","80",..: 4 3 2 1
##  $ NULL: NULL
##  $ NULL: NULL
##  $ NULL: NULL
##  $ NULL: NULL
##  $ NULL:'data.frame':    10 obs. of  2 variables:
##   ..$ Tag        : Factor w/ 10 levels "<caption>","<col>",..: 4 8 10 6 1 3 2 9 5 7
##   ..$ Description: Factor w/ 10 levels "Defines a cell in a table",..: 4 2 3 1 5 9 10 8 6 7
# 3. Write R code that shows how many HTML tables are represented in hvalues
# The number of elements in hvalues answers question 3.
length(hvalues)
## [1] 6
# Print every element in turn. seq_along() is used instead of 1:length() so
# the loop body is skipped entirely when hvalues is empty (1:0 would iterate
# over c(1, 0)).
for (i in seq_along(hvalues)) {
  print(hvalues[i])
}
## $`NULL`
##   Number First Name Last Name Points
## 1      1        Eve   Jackson     94
## 2      2       John       Doe     80
## 3      3       Adam   Johnson     67
## 4      4       Jill     Smith     50
## 
## $`NULL`
## NULL
## 
## $`NULL`
## NULL
## 
## $`NULL`
## NULL
## 
## $`NULL`
## NULL
## 
## $`NULL`
##           Tag
## 1     <table>
## 2        <th>
## 3        <tr>
## 4        <td>
## 5   <caption>
## 6  <colgroup>
## 7       <col>
## 8     <thead>
## 9     <tbody>
## 10    <tfoot>
##                                                                Description
## 1                                                          Defines a table
## 2                                         Defines a header cell in a table
## 3                                                 Defines a row in a table
## 4                                                Defines a cell in a table
## 5                                                  Defines a table caption
## 6       Specifies a group of one or more columns in a table for formatting
## 7  Specifies column properties for each column within a <colgroup> element
## 8                                     Groups the header content in a table
## 9                                       Groups the body content in a table
## 10                                    Groups the footer content in a table
# Every element carries the same name ("NULL"), so the elements cannot be told
# apart by name and are addressed by position below.
names(hvalues)
## [1] "NULL" "NULL" "NULL" "NULL" "NULL" "NULL"
# Single-bracket indexing returns a one-element list, which is why each result
# prints under a $`NULL` header rather than as a bare data frame.
hvalues[1]
## $`NULL`
##   Number First Name Last Name Points
## 1      1        Eve   Jackson     94
## 2      2       John       Doe     80
## 3      3       Adam   Johnson     67
## 4      4       Jill     Smith     50
# Elements 2 through 5 hold NULL rather than a data frame.
hvalues[2]
## $`NULL`
## NULL
hvalues[3]
## $`NULL`
## NULL
hvalues[4]
## $`NULL`
## NULL
hvalues[5]
## $`NULL`
## NULL
# Element 6 is the second data frame (the Tag / Description table).
hvalues[6]
## $`NULL`
##           Tag
## 1     <table>
## 2        <th>
## 3        <tr>
## 4        <td>
## 5   <caption>
## 6  <colgroup>
## 7       <col>
## 8     <thead>
## 9     <tbody>
## 10    <tfoot>
##                                                                Description
## 1                                                          Defines a table
## 2                                         Defines a header cell in a table
## 3                                                 Defines a row in a table
## 4                                                Defines a cell in a table
## 5                                                  Defines a table caption
## 6       Specifies a group of one or more columns in a table for formatting
## 7  Specifies column properties for each column within a <colgroup> element
## 8                                     Groups the header content in a table
## 9                                       Groups the body content in a table
## 10                                    Groups the footer content in a table
# 4. Modify the readHTMLTable code so that just the table with Number,
# First Name, Last Name, and Points is returned into a data frame.
# Parse the page once, collect every <table> node, and convert only the first
# node to a data frame. Uses `<-` for assignment, per R convention.
doc <- htmlParse(theURL)
tableNodes <- getNodeSet(doc, "//table")
tb <- readHTMLTable(tableNodes[[1]])
tb
##   Number First Name Last Name Points
## 1      1        Eve   Jackson     94
## 2      2       John       Doe     80
## 3      3       Adam   Johnson     67
## 4      4       Jill     Smith     50
# 5. Modify the returned data frame so only the Last Name and Points columns are shown.
  # Select by column name rather than position so the result does not depend
  # on the column order of the scraped table.
  select(tb, `Last Name`, Points)
##   Last Name Points
## 1   Jackson     94
## 2       Doe     80
## 3   Johnson     67
## 4     Smith     50
# 6 Identify another interesting page on the web with HTML table values.  
# This may be somewhat tricky, because while
# HTML tables are great for web-page scrapers, many HTML designers now prefer 
# creating tables using other methods (such as <div> tags or .png files).  

# 
# Read every HTML table on the Wikipedia population page into a list.
theURL <- "http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
html.tables <- readHTMLTable(theURL)
# 7 How many HTML tables does that page contain?
length(html.tables)
## [1] 5
# 8 Identify your web browser, and describe (in one or two sentences) 
# how you view HTML page source in your web browser.
# I use Internet Explorer. Right-click on the page and choose "View source" to view the HTML code.
# 9 (Optional challenge exercise)
# Instead of using readHTMLTable from the XML package, use the functionality in the rvest package to perform the same task.  
# library() stops with an error if rvest is not installed; require() would
# only return FALSE and let the script fail later at the first rvest call.
library(rvest)
## Warning: package 'rvest' was built under R version 3.1.3
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:XML':
## 
##     xml
url <- "http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
# read_html() replaces html(), which rvest deprecated and later removed.
html.rvest <- read_html(url)
#html.rvest  %>%
 # html_nodes("span class") %>%
  #.[[1]] %>%
 # html_table()

# I tried to work with rvest, but realised that I need to do more reading to answer this question.



# Which method do you prefer?  Why might one prefer one package over the other?