The goal of this project is to scrape data from a website.
I reviewed regular expressions using the RegexOne website.
The deliverable is the final score screen showing all of the challenges complete:
In this section, we were supposed to follow video tutorials to capture movie data from IMDB.
These are the video tutorials:
Below is the finished code for the PHP scraper, based on the video tutorials.
<?php
function scrape_imdb($year_start, $year_end, $page_start, $page_end){
    $curl = curl_init();
    $all_data = array();
    for ($page = $page_start; $page <= $page_end; $page++){
        $url = "http://www.imdb.com/search/title?year=$year_start,$year_end&title_type=feature&sort=moviemeter,asc&page=$page&ref_=adv_nxt";
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        $result = curl_exec($curl);
        $movies = array();
        //match movie title
        preg_match_all('!<a href="\/title\/.*?\/\?ref_=adv_li_tt"\n>(.*?)<\/a>!',$result,$match);
        $movies['title'] = $match[1];
        //match year
        preg_match_all('!<span class="lister-item-year text-muted unbold">.*?\((\d{4})\)<\/span>!',$result,$match);
        $movies['year'] = $match[1];
        //match image url
        preg_match_all('!loadlate="(.*?)"!',$result,$match);
        $movies['image'] = $match[1];
        //match certificate, runtime, genre block
        preg_match_all("!<p class=\"text-muted\s\">(.*?)<\/p>!is",$result,$match);
        //match certificate, runtime, genre individually from the above block
        for ($i = 0; $i < count($match[1]); $i++){
            //match certificate
            if (preg_match('!<span class="certificate">(.*?)<\/span>!',$match[1][$i],$certificate)){
                $movies['certificate'][$i] = $certificate[1];
            }
            else {
                $movies['certificate'][$i] = '';
            }
            //match runtime
            if (preg_match('!<span class="runtime">(\d{2,3}) min<\/span>!',$match[1][$i],$runtime)){
                $movies['runtime'][$i] = $runtime[1];
            }
            else {
                $movies['runtime'][$i] = '';
            }
            //match genre
            if (preg_match('!<span class="genre">\n(.*?)\s*?<\/span>!',$match[1][$i],$genre)){
                $movies['genres'][$i] = $genre[1];
            }
            else {
                $movies['genres'][$i] = '';
            }
        }
        //match ratings bar block
        preg_match_all('!<div class="ratings-bar">(.*?)<\/span>!is',$result,$match);
        //match ratings individually
        for ($i = 0; $i < count($match[1]); $i++){
            if (preg_match("!data-value=\"(.*?)\"!i",$match[1][$i],$imdb_rating)){
                $movies['imdb_rating'][$i] = $imdb_rating[1];
            }
            else {
                $movies['imdb_rating'][$i] = '';
            }
        }
        //match the metascore and description together, make metascore optional
        preg_match_all('!(<div class="inline-block ratings-metascore">
<span class="metascore (favorable|mixed|unfavorable)">(.*?)\s*?<\/span>\s*?Metascore\s*?<\/div>\s*?<\/div>\n)?<p class="text-muted">(.*?)<\/p>!is',$result,$match);
        for ($i = 0; $i < count($match[0]); $i++){
            if (preg_match('!metascore (favorable|mixed|unfavorable)">(.*?)\s*?</span>!',$match[0][$i],$metascore)){
                $movies['metascore'][$i] = $metascore[2];
            }
            else {
                $movies['metascore'][$i] = '';
            }
            if (preg_match('!<p class="text-muted\s?">\n(.*?)</p>!i',$match[0][$i],$description)){
                $movies['description'][$i] = $description[1];
            }
            else {
                $movies['description'][$i] = '';
            }
        }
        //match directors and stars block
        preg_match_all('!<p class="">(.*?)<\/p>!is',$result,$match);
        for ($i = 0; $i < count($match[1]); $i++){
            if (preg_match('!Directors?:\n<a href="/name/.*?/?ref_=adv_li_dr_0"\n>(.*?)</a>\n!s',$match[1][$i],$directors)){
                //strip the leftover anchor tags and newlines from the director names
                $clean_directors = preg_replace('!(<a href="\/name\/.*?\/?ref_=adv_li_dr_\d"\n>|<\/a>|\n)!','',$directors[1]);
                $movies['directors'][$i] = $clean_directors;
            }
            else {
                $movies['directors'][$i] = '';
            }
            if (preg_match('!Stars?:\n(.*?)<\/a>\n!is',$match[1][$i],$stars)){
                preg_match_all('!>(.*?)<!',$stars[1],$all_stars);
                $movies['stars'][$i] = implode(', ',$all_stars[1]);
            }
            else {
                $movies['stars'][$i] = '';
            }
        }
        //match votes and gross block (either or both can be empty, so make both optional)
        $regex = '!(<p class="sort-num_votes-visible">
\s*?<span class="text-muted">Votes:<\/span>
\s*?<span name="nv" data-value="(\d*?)">.*?<\/span>)?.*?
(<span class="ghost">\|<\/span>\s*?<span class="text-muted">Gross:<\/span>
\s*?<span name="nv" data-value="(.*?)">.*?<\/span>
\s*?<\/p>)?(\s*?<\/div>\s*?<\/div>\s*?)(<div class="lister-item mode-advanced">|</div>\s*?<div class="nav">)!is';
        preg_match_all($regex,$result,$match);
        for ($i = 0; $i < count($match[0]); $i++){
            if (preg_match('!Votes:</span>\s*?<span name="nv" data-value="(\d*?)">!is',$match[0][$i],$votes)){
                $movies['votes'][$i] = $votes[1];
            }
            else {
                $movies['votes'][$i] = '';
            }
            if (preg_match('!Gross:</span>\s*?<span name="nv" data-value="(.*?)">!is',$match[0][$i],$gross)){
                $movies['gross'][$i] = $gross[1];
            }
            else {
                $movies['gross'][$i] = '';
            }
        }
        //save all data in a nicely formatted array (one row per movie)
        $data = array();
        foreach($movies as $key => $value) {
            for ($i = 0; $i < count($movies[$key]); $i++){
                $data[$i][$key] = $movies[$key][$i];
            }
        }
        $all_data = array_merge($data, $all_data);
    } //end main loop
    curl_close($curl);
    return $all_data;
}
?>
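To show that the scraper runs, the function only needs to be called with a year range and a page range. Below is a minimal usage sketch; the include file name, years, and page counts are placeholder examples, not part of the tutorial code.
<?php
//minimal usage sketch: the include path, years, and page range are placeholders
require 'scrape_imdb.php';
//scrape the first two result pages of feature films released in 2016
$movies = scrape_imdb(2016, 2016, 1, 2);
//dump the resulting array of movie records
print_r($movies);
?>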
The deliverable for this section is the complete scraped IMDB data (proof that the PHP scraper runs correctly):
In this section, we are supposed to scrape another website using the PHP scraper we created in the previous section. I will instead be completing this section in the R programming language to take advantage of the rvest package, which makes web scraping easier.
The website that I am going to scrape is backpage.com.
“Backpage is a classified advertising website launched in 2004. It offers classified listings for a wide variety of products and services including automotive, jobs listings, and real estate.” -https://en.wikipedia.org/wiki/Backpage
Backpage is used to list a lot of harmless products and services, like the ones mentioned on Wikipedia. It is, however, a hub for posting ads for prostitution. I'm a data science contractor for the Arizona Financial Crimes Task Force, and it's part of my job to scrape websites like these to collect data on illegal activity in hopes of catching the "pimps," or the facilitators of these crimes. I'm going to be scraping the phone numbers, social media links, and locations of the prostitution postings on Backpage.
Do not call any numbers gathered in this scrape, and do not visit any of the social media links.
The rvest library is going to be used to write the R scraper. rvest was designed after Python's Beautiful Soup and makes expressing common web scraping tasks easy.
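As a quick illustration of that workflow (the URL and selector below are placeholders, not part of this scrape), a typical rvest script reads a page once and then pulls pieces out of it with CSS selectors:
# minimal sketch of the usual rvest pattern (placeholder URL and selector)
library(rvest)
page = read_html("http://example.com")
# select nodes with a CSS selector, then extract their text
headings = html_text(html_nodes(page, "h1"))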
This is what the main page that I am going to scrape looks like: the page of prostitution listings for the state of Arizona. Each phone number is a link to an ad.
(fun fact: this website is not blocked by GCU)
# import the rvest library
library(rvest)
# read in html
bp.html = read_html("http://arizona.backpage.com/MenSeekWomen/")
The raw HTML is scraped from this page.
# display html object
bp.html
## {xml_document}
## <html lang="en-us">
## [1] <head>\n<title>Arizona - men seeking women personals - backpage.com< ...
## [2] <body id="Results">\r\n\r\n \r\n \r\n\r\n\r\n\r\n\r\n \r\n\r\ ...
The raw HTML is stored in an xml_document object.
# get all of the hyperlinks on the page
bp.ads = html_nodes(bp.html, "a")
# display sample of hyperlinks
head(bp.ads)
## {xml_nodeset (6)}
## [1] <a href="http://arizona.backpage.com/">Home</a>
## [2] <a href="https://my.backpage.com/classifieds/central/index">My Accou ...
## [3] <a href="https://my.backpage.com/classifieds/central/PurchaseCredits ...
## [4] <a href="#es-us" rel="nofollow" data-lang="es-us" data-key="MenSeekW ...
## [5] <a href="http://flagstaff.backpage.com/">flagstaff/sedona</a>
## [6] <a href="http://mohave.backpage.com/">mohave county</a>
This call returns essentially every hyperlink on the page; the ad links and phone numbers are then extracted from these with regular expressions.
# use regex to get all the ad links
links = gregexpr("http:\\/\\/arizona\\.backpage\\.com\\/MenSeekWomen\\/[0-9]{3}-[0-9]{3}-[0-9]{4}\\/[0-9]*",
                 as.character(bp.ads))
links = unlist(regmatches(as.character(bp.ads), links))
# use regex to get all the ad phones
phone = gregexpr("[0-9]{3}-[0-9]{3}-[0-9]{4}", links)
phone = unlist(regmatches(links, phone))
# create a single data frame with all the data
bp.ads = data.frame(ad_link = links, phone = phone)
bp.ads$ad_link = as.character(bp.ads$ad_link)
bp.ads$phone = as.character(bp.ads$phone)
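rvest can also pull these links without running a regex over the printed nodes; a small alternative sketch (not the approach I used above) takes the href attributes directly and filters them:
# alternative sketch: take hrefs straight from the anchor nodes,
# then keep only the ones that match the ad link pattern
hrefs = html_attr(html_nodes(bp.html, "a"), "href")
ad.links = grep("MenSeekWomen/[0-9]{3}-[0-9]{3}-[0-9]{4}/", hrefs, value = TRUE)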
From the extracted links and phone numbers I created a data frame.
# display a sample of the ads data frame
library(knitr)
kable(head(bp.ads))
| ad_link | phone |
|---|---|
| http://arizona.backpage.com/MenSeekWomen/480-799-0124/70667527 | 480-799-0124 |
| http://arizona.backpage.com/MenSeekWomen/480-799-0124/70667527 | 480-799-0124 |
| http://arizona.backpage.com/MenSeekWomen/480-729-1745/62584332 | 480-729-1745 |
| http://arizona.backpage.com/MenSeekWomen/480-729-1745/62584332 | 480-729-1745 |
| http://arizona.backpage.com/MenSeekWomen/480-729-1745/62584332 | 480-729-1745 |
| http://arizona.backpage.com/MenSeekWomen/480-729-1745/62584332 | 480-729-1745 |
This is a sample of the data frame I have so far. This data frame has 336 rows (336 ads).
locations = c()
dates = c()
socials = c()
# loop through each ad link
for (i in 1:nrow(bp.ads)) {
  # read in the html for the ad
  ad.html = read_html(bp.ads[i,]$ad_link)
  # scrape the location
  location = html_text(html_node(ad.html, ".metaInfoDisplay+ div"))
  # parse the part of location needed
  location = substr(location, 36, nchar(location) - 10)
  # add the location to the locations list
  locations = c(locations, location)
  # scrape the date
  date = html_text(html_node(ad.html, ".adInfo"))
  # parse the part of date needed
  date = substr(date, 32, nchar(date) - 6)
  # add the date to the dates list
  dates = c(dates, date)
  # scrape the social media link
  social = html_text(html_node(ad.html, ".posting br+ a"))
  # add the social media link to the social media links list
  socials = c(socials, social)
}
# add all this data to the ads data frame
bp.ads$location = locations
bp.ads$date_posted = dates
bp.ads$social_link = socials
From here I scrape each individual ad link to get the location, date posted, and social media link associated with each posting.
kable(head(bp.ads))
This is a sample of the data frame with all of the new information scraped from each ad.