# 607-week10-assignment.R
library(XML)
## Warning: package 'XML' was built under R version 3.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Pull the first table from the Super Bowl pool post. The table is read
# without a header row, and its cells are kept as character strings rather
# than being converted to factors.
theURL <- "http://www.jaredlander.com/2012/02/another-kind-of-super-bowl-pool/"
bowlPool <- readHTMLTable(
  theURL,
  which = 1,
  header = FALSE,
  stringsAsFactors = FALSE
)
bowlPool
## V1 V2 V3
## 1 Participant 1 Giant A Patriot Q
## 2 Participant 2 Giant B Patriot R
## 3 Participant 3 Giant C Patriot S
## 4 Participant 4 Giant D Patriot T
## 5 Participant 5 Giant E Patriot U
## 6 Participant 6 Giant F Patriot V
## 7 Participant 7 Giant G Patriot W
## 8 Participant 8 Giant H Patriot X
## 9 Participant 9 Giant I Patriot Y
## 10 Participant 10 Giant J Patriot Z
# 1. What type of data structure is bowlPool?
# str() reports the class along with the type of every column.
str(bowlPool)
## 'data.frame': 10 obs. of 3 variables:
## $ V1: chr "Participant 1" "Participant 2" "Participant 3" "Participant 4" ...
## $ V2: chr "Giant A" "Giant B" "Giant C" "Giant D" ...
## $ V3: chr "Patriot Q" "Patriot R" "Patriot S" "Patriot T" ...
# class() returns just the class name: bowlPool is a data.frame.
class(bowlPool)
## [1] "data.frame"
# 2. Suppose instead you call readHTMLTable() with only the URL argument,
# against the URL given below.
theURL <- "http://www.w3schools.com/html/html_tables.asp"
hvalues <- readHTMLTable(theURL)
# What is the type of the value returned in hvalues?
# Ask every element of hvalues for its class.
sapply(X = hvalues, FUN = class)
## NULL NULL NULL NULL NULL
## "data.frame" "NULL" "NULL" "NULL" "NULL"
## NULL
## "data.frame"
# Alternatively, class() gives the type of the hvalues container itself.
class(hvalues)
## [1] "list"
# Or use str() to show the list together with the structure of each element.
str(hvalues)
## List of 6
## $ NULL:'data.frame': 4 obs. of 4 variables:
## ..$ Number : Factor w/ 4 levels "1","2","3","4": 1 2 3 4
## ..$ First Name: Factor w/ 4 levels "Adam","Eve","Jill",..: 2 4 1 3
## ..$ Last Name : Factor w/ 4 levels "Doe","Jackson",..: 2 1 3 4
## ..$ Points : Factor w/ 4 levels "50","67","80",..: 4 3 2 1
## $ NULL: NULL
## $ NULL: NULL
## $ NULL: NULL
## $ NULL: NULL
## $ NULL:'data.frame': 10 obs. of 2 variables:
## ..$ Tag : Factor w/ 10 levels "<caption>","<col>",..: 4 8 10 6 1 3 2 9 5 7
## ..$ Description: Factor w/ 10 levels "Defines a cell in a table",..: 4 2 3 1 5 9 10 8 6 7
# 3. Write R code that shows how many HTML tables are represented in hvalues
# readHTMLTable() returned a list, so its length is the number of tables it
# found on the page (some of them were parsed as NULL).
length(hvalues)
## [1] 6
# Print each element in turn. seq_along() is used instead of 1:length() so
# that the loop body is skipped, rather than run for i = 1 and i = 0, if
# hvalues is ever empty.
for (i in seq_along(hvalues)) {
  print(hvalues[i])
}
## $`NULL`
## Number First Name Last Name Points
## 1 1 Eve Jackson 94
## 2 2 John Doe 80
## 3 3 Adam Johnson 67
## 4 4 Jill Smith 50
##
## $`NULL`
## NULL
##
## $`NULL`
## NULL
##
## $`NULL`
## NULL
##
## $`NULL`
## NULL
##
## $`NULL`
## Tag
## 1 <table>
## 2 <th>
## 3 <tr>
## 4 <td>
## 5 <caption>
## 6 <colgroup>
## 7 <col>
## 8 <thead>
## 9 <tbody>
## 10 <tfoot>
## Description
## 1 Defines a table
## 2 Defines a header cell in a table
## 3 Defines a row in a table
## 4 Defines a cell in a table
## 5 Defines a table caption
## 6 Specifies a group of one or more columns in a table for formatting
## 7 Specifies column properties for each column within a <colgroup> element
## 8 Groups the header content in a table
## 9 Groups the body content in a table
## 10 Groups the footer content in a table
# Every element carries the literal name "NULL", so the tables can only be
# told apart by position.
names(hvalues)
## [1] "NULL" "NULL" "NULL" "NULL" "NULL" "NULL"
# Look at each element by position. Single-bracket indexing returns a
# one-element list, which is why each result is printed under a $`NULL` header.
hvalues[1]
## $`NULL`
## Number First Name Last Name Points
## 1 1 Eve Jackson 94
## 2 2 John Doe 80
## 3 3 Adam Johnson 67
## 4 4 Jill Smith 50
# Elements 2 to 5 were parsed as NULL.
hvalues[2]
## $`NULL`
## NULL
hvalues[3]
## $`NULL`
## NULL
hvalues[4]
## $`NULL`
## NULL
hvalues[5]
## $`NULL`
## NULL
# Element 6 is the Tag / Description reference table.
hvalues[6]
## $`NULL`
## Tag
## 1 <table>
## 2 <th>
## 3 <tr>
## 4 <td>
## 5 <caption>
## 6 <colgroup>
## 7 <col>
## 8 <thead>
## 9 <tbody>
## 10 <tfoot>
## Description
## 1 Defines a table
## 2 Defines a header cell in a table
## 3 Defines a row in a table
## 4 Defines a cell in a table
## 5 Defines a table caption
## 6 Specifies a group of one or more columns in a table for formatting
## 7 Specifies column properties for each column within a <colgroup> element
## 8 Groups the header content in a table
## 9 Groups the body content in a table
## 10 Groups the footer content in a table
# 4. Modify the readHTMLTable code so that just the table with Number,
# First Name, Last Name, and Points is returned into a data frame
# Parse the page once, collect every <table> node, and convert only the
# first node into a data frame.
doc <- htmlParse(theURL)
tableNodes <- getNodeSet(doc, "//table")
tb <- readHTMLTable(tableNodes[[1]])
tb
## Number First Name Last Name Points
## 1 1 Eve Jackson 94
## 2 2 John Doe 80
## 3 3 Adam Johnson 67
## 4 4 Jill Smith 50
# 5. Modify the returned data frame so only the Last Name and Points columns are shown.
# Last Name and Points are the third and fourth columns of tb.
tb %>% select(3, 4)
## Last Name Points
## 1 Jackson 94
## 2 Doe 80
## 3 Johnson 67
## 4 Smith 50
# 6 Identify another interesting page on the web with HTML table values.
# This may be somewhat tricky, because while
# HTML tables are great for web-page scrapers, many HTML designers now prefer
# creating tables using other methods (such as <div> tags or .png files).
#
theURL <- "http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
html.tables <- readHTMLTable(theURL)
# 7 How many HTML tables does that page contain?
# readHTMLTable() returns a list here, so count its elements.
length(html.tables)
## [1] 5
# 8 Identify your web browser, and describe (in one or two sentences)
# how you view HTML page source in your web browser.
# I use Internet Explorer. Right-click on the page, then click "View source" to view the HTML code.
# 9 (Optional challenge exercise)
# Instead of using readHTMLTable from the XML package, use the functionality in the rvest package to perform the same task.
# Attach rvest with library() so that a missing package stops the script
# here with an error; require() would only warn and return FALSE.
library(rvest)
## Loading required package: rvest
## Warning: package 'rvest' was built under R version 3.1.3
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:XML':
##
## xml
url <- "http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
# Parse the page with read_html(); rvest deprecated html() in its favour.
html.rvest <- read_html(url)
#html.rvest %>%
# html_nodes("span class") %>%
#.[[1]] %>%
# html_table()
# I tried to work with rvest, but realised that I need to do more reading to answer this question.
# Which method do you prefer? Why might one prefer one package over the other?