library(XML)
## Warning: package 'XML' was built under R version 3.1.3
# Read the first table on the Super Bowl pool post. The first row is not
# treated as a header, and text columns are kept as character vectors.
theURL <- "http://www.jaredlander.com/2012/02/another-kind-of-super-bowl-pool/"
bowlPool <- readHTMLTable(
  theURL,
  header = FALSE,
  which = 1,
  stringsAsFactors = FALSE
)
bowlPool
## V1 V2 V3
## 1 Participant 1 Giant A Patriot Q
## 2 Participant 2 Giant B Patriot R
## 3 Participant 3 Giant C Patriot S
## 4 Participant 4 Giant D Patriot T
## 5 Participant 5 Giant E Patriot U
## 6 Participant 6 Giant F Patriot V
## 7 Participant 7 Giant G Patriot W
## 8 Participant 8 Giant H Patriot X
## 9 Participant 9 Giant I Patriot Y
## 10 Participant 10 Giant J Patriot Z
# Inspect the class and column types of the scraped table.
utils::str(bowlPool)
## 'data.frame': 10 obs. of 3 variables:
## $ V1: chr "Participant 1" "Participant 2" "Participant 3" "Participant 4" ...
## $ V2: chr "Giant A" "Giant B" "Giant C" "Giant D" ...
## $ V3: chr "Patriot Q" "Patriot R" "Patriot S" "Patriot T" ...
bowlPool is a data.frame.
# Read the w3schools HTML-tables page without `which`, so every table
# readHTMLTable() finds on the page comes back together in one list.
theURL <- "http://www.w3schools.com/html/html_tables.asp"
hvalues <- readHTMLTable(doc = theURL)
hvalues
## $`NULL`
## Number First Name Last Name Points
## 1 1 Eve Jackson 94
## 2 2 John Doe 80
## 3 3 Adam Johnson 67
## 4 4 Jill Smith 50
##
## $`NULL`
## NULL
##
## $`NULL`
## NULL
##
## $`NULL`
## NULL
##
## $`NULL`
## NULL
##
## $`NULL`
## Tag
## 1 <table>
## 2 <th>
## 3 <tr>
## 4 <td>
## 5 <caption>
## 6 <colgroup>
## 7 <col>
## 8 <thead>
## 9 <tbody>
## 10 <tfoot>
## Description
## 1 Defines a table
## 2 Defines a header cell in a table
## 3 Defines a row in a table
## 4 Defines a cell in a table
## 5 Defines a table caption
## 6 Specifies a group of one or more columns in a table for formatting
## 7 Specifies column properties for each column within a <colgroup> element
## 8 Groups the header content in a table
## 9 Groups the body content in a table
## 10 Groups the footer content in a table
# Inspect the list: how many entries it has and the type of each column.
utils::str(hvalues)
## List of 6
## $ NULL:'data.frame': 4 obs. of 4 variables:
## ..$ Number : Factor w/ 4 levels "1","2","3","4": 1 2 3 4
## ..$ First Name: Factor w/ 4 levels "Adam","Eve","Jill",..: 2 4 1 3
## ..$ Last Name : Factor w/ 4 levels "Doe","Jackson",..: 2 1 3 4
## ..$ Points : Factor w/ 4 levels "50","67","80",..: 4 3 2 1
## $ NULL: NULL
## $ NULL: NULL
## $ NULL: NULL
## $ NULL: NULL
## $ NULL:'data.frame': 10 obs. of 2 variables:
## ..$ Tag : Factor w/ 10 levels "<caption>","<col>",..: 4 8 10 6 1 3 2 9 5 7
## ..$ Description: Factor w/ 10 levels "Defines a cell in a table",..: 4 2 3 1 5 9 10 8 6 7
The columns of the tables in hvalues are factors.
readHTMLTable() returned a list of 6 tables (four of the entries are NULL).
# Show the structure of hvalues once more before re-reading a single table.
utils::str(hvalues)
## List of 6
## $ NULL:'data.frame': 4 obs. of 4 variables:
## ..$ Number : Factor w/ 4 levels "1","2","3","4": 1 2 3 4
## ..$ First Name: Factor w/ 4 levels "Adam","Eve","Jill",..: 2 4 1 3
## ..$ Last Name : Factor w/ 4 levels "Doe","Jackson",..: 2 1 3 4
## ..$ Points : Factor w/ 4 levels "50","67","80",..: 4 3 2 1
## $ NULL: NULL
## $ NULL: NULL
## $ NULL: NULL
## $ NULL: NULL
## $ NULL:'data.frame': 10 obs. of 2 variables:
## ..$ Tag : Factor w/ 10 levels "<caption>","<col>",..: 4 8 10 6 1 3 2 9 5 7
## ..$ Description: Factor w/ 10 levels "Defines a cell in a table",..: 4 2 3 1 5 9 10 8 6 7
# Read the same page again, this time asking only for the first table so the
# result is a single data frame rather than a list.
theURL <- "http://www.w3schools.com/html/html_tables.asp"
hvalues.1 <- readHTMLTable(
  doc = theURL,
  which = 1
)
hvalues.1
## Number First Name Last Name Points
## 1 1 Eve Jackson 94
## 2 2 John Doe 80
## 3 3 Adam Johnson 67
## 4 4 Jill Smith 50
# Keep only the "Last Name" and "Points" columns; drop = FALSE guarantees the
# result stays a data frame.
keepCols <- c("Last Name", "Points")
hvalues.1mod <- hvalues.1[, keepCols, drop = FALSE]
hvalues.1mod
## Last Name Points
## 1 Jackson 94
## 2 Doe 80
## 3 Johnson 67
## 4 Smith 50
2 tables actually contain data (the other four entries in hvalues are NULL).
In Chrome, pressing Ctrl+U displays the page source.
I prefer rvest because it lets you be more specific with your selection criteria, although readHTMLTable() from the XML package is simpler and seems better suited to scraping less complicated pages.
library(rvest)
## Warning: package 'rvest' was built under R version 3.1.3
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:XML':
##
## xml
# Download and parse the WebElements list of elements by atomic number.
# read_html() is used instead of html(): html() is deprecated in rvest and
# is no longer available in current releases.
elements.1URL <- read_html("http://www.webelements.com/nexus/content/list-elements-atomic-number")

# Select the table by XPath and convert it to a data frame, using the first
# row as column names. html_nodes() can match several nodes, so the result is
# a list of tables (one element here).
elements.1 <- elements.1URL %>%
  html_nodes(xpath = '//*[@id="node-49"]/div[1]/table') %>%
  html_table(header = TRUE)
str(elements.1)
## List of 1
## $ :'data.frame': 118 obs. of 3 variables:
## ..$ Element name : chr [1:118] "Hydrogen" "Helium" "Lithium" "Beryllium" ...
## ..$ Element symbol: chr [1:118] "H" "He" "Li" "Be" ...
## ..$ Atomic number : int [1:118] 1 2 3 4 5 6 7 8 9 10 ...
# Print the full result: a one-element list holding the elements table.
print(x = elements.1)
## [[1]]
## Element name Element symbol Atomic number
## 1 Hydrogen H 1
## 2 Helium He 2
## 3 Lithium Li 3
## 4 Beryllium Be 4
## 5 Boron B 5
## 6 Carbon C 6
## 7 Nitrogen N 7
## 8 Oxygen O 8
## 9 Fluorine F 9
## 10 Neon Ne 10
## 11 Sodium Na 11
## 12 Magnesium Mg 12
## 13 Aluminium (aluminum) Al 13
## 14 Silicon Si 14
## 15 Phosphorus P 15
## 16 Sulfur (Sulphur) S 16
## 17 Chlorine Cl 17
## 18 Argon Ar 18
## 19 Potassium K 19
## 20 Calcium Ca 20
## 21 Scandium Sc 21
## 22 Titanium Ti 22
## 23 Vanadium V 23
## 24 Chromium Cr 24
## 25 Manganese Mn 25
## 26 Iron Fe 26
## 27 Cobalt Co 27
## 28 Nickel Ni 28
## 29 Copper Cu 29
## 30 Zinc Zn 30
## 31 Gallium Ga 31
## 32 Germanium Ge 32
## 33 Arsenic As 33
## 34 Selenium Se 34
## 35 Bromine Br 35
## 36 Krypton Kr 36
## 37 Rubidium Rb 37
## 38 Strontium Sr 38
## 39 Yttrium Y 39
## 40 Zirconium Zr 40
## 41 Niobium Nb 41
## 42 Molybdenum Mo 42
## 43 Technetium Tc 43
## 44 Ruthenium Ru 44
## 45 Rhodium Rh 45
## 46 Palladium Pd 46
## 47 Silver Ag 47
## 48 Cadmium Cd 48
## 49 Indium In 49
## 50 Tin Sn 50
## 51 Antimony Sb 51
## 52 Tellurium Te 52
## 53 Iodine I 53
## 54 Xenon Xe 54
## 55 Caesium (Cesium) Cs 55
## 56 Barium Ba 56
## 57 Lanthanum La 57
## 58 Cerium Ce 58
## 59 Praseodymium Pr 59
## 60 Neodymium Nd 60
## 61 Promethium Pm 61
## 62 Samarium Sm 62
## 63 Europium Eu 63
## 64 Gadolinium Gd 64
## 65 Terbium Tb 65
## 66 Dysprosium Dy 66
## 67 Holmium Ho 67
## 68 Erbium Er 68
## 69 Thulium Tm 69
## 70 Ytterbium Yb 70
## 71 Lutetium Lu 71
## 72 Hafnium Hf 72
## 73 Tantalum Ta 73
## 74 Tungsten W 74
## 75 Rhenium Re 75
## 76 Osmium Os 76
## 77 Iridium Ir 77
## 78 Platinum Pt 78
## 79 Gold Au 79
## 80 Mercury Hg 80
## 81 Thallium Tl 81
## 82 Lead Pb 82
## 83 Bismuth Bi 83
## 84 Polonium Po 84
## 85 Astatine At 85
## 86 Radon Rn 86
## 87 Francium Fr 87
## 88 Radium Ra 88
## 89 Actinium Ac 89
## 90 Thorium Th 90
## 91 Protactinium Pa 91
## 92 Uranium U 92
## 93 Neptunium Np 93
## 94 Plutonium Pu 94
## 95 Americium Am 95
## 96 Curium Cm 96
## 97 Berkelium Bk 97
## 98 Californium Cf 98
## 99 Einsteinium Es 99
## 100 Fermium Fm 100
## 101 Mendelevium Md 101
## 102 Nobelium No 102
## 103 Lawrencium Lr 103
## 104 Rutherfordium Rf 104
## 105 Dubnium Db 105
## 106 Seaborgium Sg 106
## 107 Bohrium Bh 107
## 108 Hassium Hs 108
## 109 Meitnerium Mt 109
## 110 Darmstadtium Ds 110
## 111 Roentgenium Rg 111
## 112 Copernicium Cp 112
## 113 Ununtrium Uut 113
## 114 Ununquadium Uuq 114
## 115 Ununpentium Uup 115
## 116 Ununhexium Uuh 116
## 117 Ununseptium Uus 117
## 118 Ununoctium Uuo 118