Sources of Lessons/Script:

Example 1:

#Load the rvest package, which provides read_html, html_nodes, html_text and html_attr (and the %>% pipe)
library(rvest)

#Parse the webpage as an HTML object
Example1 <- read_html('https://practicewebscrapingsite.wordpress.com/example-1/')
#Query this HTML version of the webpage using a CSS selector or XPath
Title <- html_nodes(Example1,'strong') %>% html_text()
Text <- html_nodes(Example1,'.Content') %>% html_text()
Images <- html_nodes(Example1,'#post-25 img') %>% html_attr('src')
#Creating a data frame with a column each for ID, Title, Text and image URL.
# The code ID = seq(1, length(Title)) is an automatic way of adding an ID variable that runs from 1
#to the number of Titles collected.
Example1DF <- data.frame(ID = seq(1, length(Title)), Title, Text, Images)
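
A quick aside on that ID column: base R also offers seq_along, which produces the same 1-to-n sequence and handles the empty-vector case more gracefully. A minimal illustration with a made-up Titles vector:

Titles <- c("First post", "Second post", "Third post")
seq(1, length(Titles))  #Returns 1 2 3
seq_along(Titles)       #Also returns 1 2 3, but returns integer(0) rather than c(1, 0) if Titles is empty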

Once you feel this makes sense, why not have a try at extracting the titles, main text and images from the page titled “Exercise 1”?
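
Before moving on: the code above collects the image URLs but not the images themselves. If you also want to save the files locally, base R's download.file function can fetch each one. A minimal sketch, assuming the Images vector from Example 1 and that saving into the working directory is acceptable:

for (url in Images){
  Sys.sleep(2)  #Pause between downloads, for the same politeness reasons discussed in Example 2 below
  #basename extracts the file name from the end of the URL, e.g. "photo.jpg"
  download.file(url, destfile = basename(url), mode = "wb")  #mode = "wb" keeps binary image files intact on Windows
}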

Example 2:

#Create empty vectors to store the information collected inside the loop
Title <- c()
Highlights <- c()
Maintext <- c()
Author <- c()
#Parse the webpage as an HTML object
Example2 <- read_html('https://practicewebscrapingsite.wordpress.com/example-2/')
#Extract the links
BlogPages <- html_nodes(Example2,'.Links a') %>% html_attr("href")
for (i in BlogPages){
  Sys.sleep(2) 
#The Sys.sleep function asks the system to wait (x seconds) before executing the rest of the script. This prevents sites being swamped by your web scraper!
  
#read_html is used here to request the webpage that i currently refers to and download its HTML so it can be queried below
  Example2 <- read_html(i)
  #Extracting information
  Heading <- html_nodes(Example2,'.entry-title') %>% html_text()
  High <- html_nodes(Example2,'.Highlights') %>% html_text()
  MainContent <- html_nodes(Example2,'.Content') %>% html_text()
  Aut <- html_nodes(Example2,'.Author') %>% html_text()
  
  #Storing information
  Title <- c(Title,Heading)
  Highlights <- c(Highlights,High)
  Maintext <- c(Maintext,MainContent)
  Author <- c(Author,Aut)
}
#Creating a data frame with a column each for ID, Title, Highlights, Main text and Author.

# The code ID = seq(1, length(Title)) is an automatic way of adding an ID variable that runs from 1 to the number of Titles collected.

Example2DF <- data.frame(ID = seq(1, length(Title)), Title, Highlights, Maintext, Author)

#Cleaning our workspace to leave just the data frame (rm stands for the remove function)

rm(Example2,Aut, Author,BlogPages,Heading,High,Highlights,i,MainContent,Maintext,Title)
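
One caution about the loop above: if any single page fails to load, read_html throws an error and the whole run stops. A minimal defensive sketch using base R's tryCatch, assuming the BlogPages links extracted above and that you are happy simply to skip pages that fail:

for (i in BlogPages){
  Sys.sleep(2)
  #tryCatch returns NULL instead of stopping the script if read_html errors
  Page <- tryCatch(read_html(i), error = function(e) NULL)
  if (is.null(Page)){
    print(paste("Skipping page that failed to load:", i))
    next  #Move straight on to the next link
  }
  #...extract and store information as in the loop above...
}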

Once you feel this makes sense, why not have a try at extracting the titles, highlights, main text and author from the page titled “Exercise 2”?

Example 3:

#Create empty vectors to store the information collected inside the loops
Title <- c()
Maintext <- c()
Author <- c()
#Parse the webpage as an HTML object
Example3 <- read_html('https://practicewebscrapingsite.wordpress.com/example-3/')

#Extract the links
AdditionalPages <- html_nodes(Example3,'p:nth-child(12) a') %>% html_attr("href")
for (i in AdditionalPages){
  Sys.sleep(2)
#The Sys.sleep function asks the system to wait (x seconds) before executing the rest of the script.
  
  MainPages <- read_html(i) 
#read_html(i) requests and parses the webpage that the link in i points to and stores the result in MainPages.
  
  BlogPages <- html_nodes(MainPages,'h2 a') %>% html_attr("href") #This line pulls out the links for each of the blog posts
  
  #Printing a marker and the current link so you can follow the outer loop's progress
  print('Level1')
## [1] "Level1"
  print(i)
## [1] "https://practicewebscrapingsite.wordpress.com/example-3-page-3/"
  for (t in BlogPages){
    Sys.sleep(2) 
    Blogs <- read_html(t) 
  #Downloading each of the blog post pages
  #Extracting information
    
  Heading <- html_nodes(Blogs,'.entry-title') %>% html_text()
  #Extracting the Title.
    
  MainContent <- html_nodes(Blogs,'.Content') %>% html_text()
  #Extracting the main text.
    
  Aut <- html_nodes(Blogs,'em') %>% html_text()
  #Extracting the author.
  
  print('Level2')  
  print(t)
    
  #Storing information by adding each bit of information extracted to the vector outside the loop so it does not get over written.
    Title <- c(Title,Heading)
    Maintext <- c(Maintext,MainContent)
    Author <- c(Author,Aut)
  }
}
## [1] "Level2"
## [1] "https://practicewebscrapingsite.wordpress.com/on-the-fence-over-birdexit/"
## [1] "Level2"
## [1] "https://practicewebscrapingsite.wordpress.com/operating-in-the-shadows/"
## [1] "Level2"
## [1] "https://practicewebscrapingsite.wordpress.com/birdexit-how-new-flood-controls-could-affect-youre-nest/"
#Creating a data frame with a column each for ID, Title, Main text and Author.

#The code ID = seq(1, length(Title)) is an automatic way of adding an ID variable that runs from 1 to the number of Titles collected.

Example3DF <- data.frame(ID = seq(1, length(Title)), Title, Maintext, Author)

#Cleaning our workspace to leave just the data frame (rm stands for the remove function)
rm(Blogs,Example3,MainPages,AdditionalPages,Aut, Author,BlogPages,Heading,t,i,MainContent,Maintext,Title)
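
The extraction step inside the inner loop is almost identical to Example 2, so you may prefer to pull it out into a small function and let lapply do the looping. A minimal sketch, assuming the same CSS selectors as above; the function name ScrapeBlogPost and the one-row data frame it returns are illustrative choices, not part of the original script:

ScrapeBlogPost <- function(url){
  Sys.sleep(2)
  Blog <- read_html(url)
  #Assumes each selector matches exactly once per blog post, as on this practice site
  data.frame(Title    = html_nodes(Blog, '.entry-title') %>% html_text(),
             Maintext = html_nodes(Blog, '.Content') %>% html_text(),
             Author   = html_nodes(Blog, 'em') %>% html_text())
}
#Apply the function to every blog link and bind the one-row data frames together
#Example3DF <- do.call(rbind, lapply(BlogPages, ScrapeBlogPost))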

Once you feel this makes sense, why not have a try at extracting the titles, main text and author from the page titled “Exercise 3”?

Example 4:

#Create empty vectors to store the information collected inside the loop
Title <- c()
Picture <- c()
Maintext <- c()

We are going to go over just the first two pages in this example.

You will notice the URLs are identical except for the page number at the end. In order to navigate across the pages, all we need to do is alter that final number and append it to the end of the URL. To do this we first need to generate a sequence of numbers.

We can generate a sequence of numbers using the seq function, which takes the arguments seq(first number, last number, increment).

The line below also includes the function as.character to convert these numbers from numeric to character type. This is important as it allows each number to be pasted together (using the paste function below) with the rest of the URL (https://practicewebscrapingsite.wordpress.com/example-4-page-).
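
To preview what these two steps produce before running the full loop, the sequence and one pasted URL can be run on their own:

seq(0, 1, 1)                #Returns 0 1
as.character(seq(0, 1, 1))  #Returns "0" "1"
paste("https://practicewebscrapingsite.wordpress.com/example-4-page-", "0", sep = "")
#Returns "https://practicewebscrapingsite.wordpress.com/example-4-page-0"
#paste0 is a common shorthand that behaves exactly like paste with sep = ""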

Pages <- as.character(seq(0,1,1))
for (i in Pages){
  Sys.sleep(2)  
#Allowing the system to pause for two seconds.

#This line of code generates a unique URL, which is then passed to the read_html function below to download and parse.
  
#The paste function takes two strings and pastes them together (in this case the URL and the character number we generated) with nothing in between, as specified by the sep argument being the empty string ("")
  
  WebpageURL <- paste("https://practicewebscrapingsite.wordpress.com/example-4-page-", i, sep = "")
  print(WebpageURL)
  #The print statement is included just so you can see the URL created
  
  #Download and parse the webpage as an HTML object
  Example4 <- read_html(WebpageURL)
  
  #Extract the information from each page
  Heading <- html_nodes(Example4,'.Title') %>% html_text()
  MainContent <- html_nodes(Example4,'.Content') %>% html_text()
  Pic <- html_nodes(Example4,'#main img') %>% html_attr("src")

  #Storing information
  Title <- c(Title,Heading)
  Maintext <- c(Maintext,MainContent)
  Picture <- c(Picture,Pic)
}
## [1] "https://practicewebscrapingsite.wordpress.com/example-4-page-0"
## [1] "https://practicewebscrapingsite.wordpress.com/example-4-page-1"
#Creating a data frame with a column each for ID, Title, Main text and Picture.

#The code ID = seq(1, length(Title)) is an automatic way of adding an ID variable that runs from 1 to the number of Titles collected.
Example4DF <- data.frame(ID = seq(1, length(Title)), Title, Maintext, Picture)

#Cleaning our workspace to leave just the data frame (rm stands for the remove function)
rm(Example4,Heading,i,MainContent,Maintext,Pages,Pic,Picture,Title,WebpageURL)
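
If you later want to scrape more pages, and assuming they follow the same numbering pattern, only the seq call needs to change; for example:

Pages <- as.character(seq(0, 4, 1))  #Generates "0" "1" "2" "3" "4" - adjust the final number to suit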

Once you feel this makes sense, why not have a try at modifying this script to get the following URLs as well: