library(xml2)
library(stringr)
#This code reads in the data from the webpage
thepage = readLines('https://www.boxofficemojo.com/alltime/world/')
#This code locates the lines we are intersted in which include our data
grep('</td></tr>', thepage)
## [1] 152 167 1295
thepage[167:177]
## [1] "<table border=\"0\" cellpadding=\"5\" cellspacing=\"1\" width=\"98%\"><tr bgcolor=\"#dcdcdc\"><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=rank&order=ASC&p=.htm\">Rank</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=title&order=ASC&p=.htm\">Title</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=studio&order=ASC&p=.htm\">Studio</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=wwgross&order=ASC&p=.htm\"><b>Worldwide</b></a></font></td><td align=\"center\" colspan=\"2\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=domgross&order=DESC&p=.htm\">Domestic</a> / </font><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=dompercent&order=DESC&p=.htm\">%</a></font></td><td align=\"center\" colspan=\"2\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=osgross&order=DESC&p=.htm\">Overseas</a> / </font><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=ospercent&order=DESC&p=.htm\">%</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=year&order=DESC&p=.htm\">Year^</a></font></td></tr><tr bgcolor=\"#ffffff\"><td align=\"center\"><font size=\"2\">1</font></td>"
## [2] "<td><font size=\"2\"><a href=\"/movies/?id=avatar.htm\"><b>Avatar</b></a></font></td>"
## [3] "<td><font size=\"2\">Fox</font></td>"
## [4] "<td align=\"right\"><font size=\"2\"><b>$2,788.0</b></font></td>"
## [5] "<td align=\"right\"><font size=\"2\">$760.5</font></td>"
## [6] "<td align=\"right\"><font size=\"2\">27.3%</font></td>"
## [7] "<td align=\"right\"><font size=\"2\">$2,027.5</font></td>"
## [8] "<td align=\"right\"><font size=\"2\">72.7%</font></td>"
## [9] "<td align=\"center\"><font size=\"2\">2009^</font></td>"
## [10] "</tr>"
## [11] ""
mypattern = '<td.+>.*<.+</td>'
datalines = grep(mypattern,thepage[168:1295],value=TRUE)
head(datalines,15)
## [1] "<td><font size=\"2\"><a href=\"/movies/?id=avatar.htm\"><b>Avatar</b></a></font></td>"
## [2] "<td><font size=\"2\">Fox</font></td>"
## [3] "<td align=\"right\"><font size=\"2\"><b>$2,788.0</b></font></td>"
## [4] "<td align=\"right\"><font size=\"2\">$760.5</font></td>"
## [5] "<td align=\"right\"><font size=\"2\">27.3%</font></td>"
## [6] "<td align=\"right\"><font size=\"2\">$2,027.5</font></td>"
## [7] "<td align=\"right\"><font size=\"2\">72.7%</font></td>"
## [8] "<td align=\"center\"><font size=\"2\">2009^</font></td>"
## [9] "<tr bgcolor=\"#f4f4ff\"><td align=\"center\"><font size=\"2\">2</font></td>"
## [10] "<td><font size=\"2\"><a href=\"/movies/?id=titanic.htm\"><b>Titanic</b></a></font></td>"
## [11] "<td><font size=\"2\">Par.</font></td>"
## [12] "<td align=\"right\"><font size=\"2\"><b>$2,187.5</b></font></td>"
## [13] "<td align=\"right\"><font size=\"2\">$659.4</font></td>"
## [14] "<td align=\"right\"><font size=\"2\">30.1%</font></td>"
## [15] "<td align=\"right\"><font size=\"2\">$1,528.1</font></td>"
#This compicated code parses out only the data we need - I am sure there is a more efficinet way of doing this - but since regular expressions are hard it is the best I could do.
getexpr = function(s,g)substring(s,g,g+attr(g,'match.length')-1)
gg = gregexpr('>(.+?)<',datalines)
matches = mapply(getexpr,datalines,gg)
result = gsub('>(.+?)<','\\1',matches)
names(result) = NULL
head(result)
## [1] "c(\"<font size=\\\"2\\\">\", \"<b>Avatar\", \"</a>\")"
## [2] "<font size=\"2\">Fox"
## [3] "c(\"<font size=\\\"2\\\">\", \"$2,788.0\", \"</font>\")"
## [4] "<font size=\"2\">$760.5"
## [5] "<font size=\"2\">27.3%"
## [6] "<font size=\"2\">$2,027.5"
result2 = str_extract(result, '>[:print:]*')
head(result2)
## [1] ">\", \"<b>Avatar\", \"</a>\")" ">Fox"
## [3] ">\", \"$2,788.0\", \"</font>\")" ">$760.5"
## [5] ">27.3%" ">$2,027.5"
result3 = str_replace(result2, '>*', '')
head(result3,10)
## [1] "\", \"<b>Avatar\", \"</a>\")" "Fox"
## [3] "\", \"$2,788.0\", \"</font>\")" "$760.5"
## [5] "27.3%" "$2,027.5"
## [7] "72.7%" "2009^"
## [9] "\", \"2\")" "\", \"<b>Titanic\", \"</a>\")"
result4 = str_replace(result3, '[:punct:]*[:blank:]+[:punct:]*', '')
head(result4,10)
## [1] "<b>Avatar\", \"</a>\")" "Fox"
## [3] "$2,788.0\", \"</font>\")" "$760.5"
## [5] "27.3%" "$2,027.5"
## [7] "72.7%" "2009^"
## [9] "2\")" "<b>Titanic\", \"</a>\")"
result5 = str_replace(result4, '<b>', '')
head(result5,10)
## [1] "Avatar\", \"</a>\")" "Fox"
## [3] "$2,788.0\", \"</font>\")" "$760.5"
## [5] "27.3%" "$2,027.5"
## [7] "72.7%" "2009^"
## [9] "2\")" "Titanic\", \"</a>\")"
result6 = str_replace(result5, '\\", \"<[:graph:]*', '')
head(result6,30)
## [1] "Avatar" "Fox"
## [3] "$2,788.0" "$760.5"
## [5] "27.3%" "$2,027.5"
## [7] "72.7%" "2009^"
## [9] "2\")" "Titanic"
## [11] "Par." "$2,187.5"
## [13] "$659.4" "30.1%"
## [15] "$1,528.1" "69.9%"
## [17] "1997^" "3\")"
## [19] "Star Wars: The Force Awakens" "BV"
## [21] "$2,068.2" "$936.7"
## [23] "45.3%" "$1,131.6"
## [25] "54.7%" "2015"
## [27] "4\")" "Avengers: Infinity War"
## [29] "BV" "$2,046.5"
result7 = str_replace(result6, '\\"[:punct:]{1}', '')
head(result7,30)
## [1] "Avatar" "Fox"
## [3] "$2,788.0" "$760.5"
## [5] "27.3%" "$2,027.5"
## [7] "72.7%" "2009^"
## [9] "2" "Titanic"
## [11] "Par." "$2,187.5"
## [13] "$659.4" "30.1%"
## [15] "$1,528.1" "69.9%"
## [17] "1997^" "3"
## [19] "Star Wars: The Force Awakens" "BV"
## [21] "$2,068.2" "$936.7"
## [23] "45.3%" "$1,131.6"
## [25] "54.7%" "2015"
## [27] "4" "Avengers: Infinity War"
## [29] "BV" "$2,046.5"
#This code reads our parsed data into a dataframe.
finalmov <- data.frame("Title" = result7[seq(1, 899, 9)], "Studio" = result7[seq(2, 899, 9)], "Total" = result7[seq(3, 899, 9)], "DomGross" = result7[seq(4, 899, 9)], "DomPerc" = result7[seq(5, 899, 9)], "IntlGross" = result7[seq(6, 899, 9)], "IntlPerc" = result7[seq(7, 899, 9)], "Year" = result7[seq(8, 899, 9)])
head(finalmov)
## Title Studio Total DomGross DomPerc IntlGross
## 1 Avatar Fox $2,788.0 $760.5 27.3% $2,027.5
## 2 Titanic Par. $2,187.5 $659.4 30.1% $1,528.1
## 3 Star Wars: The Force Awakens BV $2,068.2 $936.7 45.3% $1,131.6
## 4 Avengers: Infinity War BV $2,046.5 $678.8 33.2% $1,367.7
## 5 Jurassic World Uni. $1,671.7 $652.3 39.0% $1,019.4
## 6 Marvel's The Avengers BV $1,518.8 $623.4 41.0% $895.5
## IntlPerc Year
## 1 72.7% 2009^
## 2 69.9% 1997^
## 3 54.7% 2015
## 4 66.8% 2018
## 5 61.0% 2015
## 6 59.0% 2012