Project2Dataset3

library(xml2)
library(stringr)

#This code reads in the data from the webpage
thepage = readLines('https://www.boxofficemojo.com/alltime/world/')

#This code locates the lines we are intersted in which include our data
grep('</td></tr>', thepage)

## [1]  152  167 1295

thepage[167:177]

##  [1] "<table border=\"0\" cellpadding=\"5\" cellspacing=\"1\" width=\"98%\"><tr bgcolor=\"#dcdcdc\"><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=rank&order=ASC&p=.htm\">Rank</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=title&order=ASC&p=.htm\">Title</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=studio&order=ASC&p=.htm\">Studio</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=wwgross&order=ASC&p=.htm\"><b>Worldwide</b></a></font></td><td align=\"center\" colspan=\"2\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=domgross&order=DESC&p=.htm\">Domestic</a> / </font><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=dompercent&order=DESC&p=.htm\">%</a></font></td><td align=\"center\" colspan=\"2\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=osgross&order=DESC&p=.htm\">Overseas</a> / </font><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=ospercent&order=DESC&p=.htm\">%</a></font></td><td align=\"center\"><font size=\"2\"><a href=\"/alltime/world/?pagenum=1&sort=year&order=DESC&p=.htm\">Year^</a></font></td></tr><tr bgcolor=\"#ffffff\"><td align=\"center\"><font size=\"2\">1</font></td>"
##  [2] "<td><font size=\"2\"><a href=\"/movies/?id=avatar.htm\"><b>Avatar</b></a></font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
##  [3] "<td><font size=\"2\">Fox</font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [4] "<td align=\"right\"><font size=\"2\"><b>$2,788.0</b></font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [5] "<td align=\"right\"><font size=\"2\">$760.5</font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [6] "<td align=\"right\"><font size=\"2\">27.3%</font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
##  [7] "<td align=\"right\"><font size=\"2\">$2,027.5</font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
##  [8] "<td align=\"right\"><font size=\"2\">72.7%</font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
##  [9] "<td align=\"center\"><font size=\"2\">2009^</font></td>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## [10] "</tr>"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## [11] ""

mypattern = '<td.+>.*<.+</td>'
datalines = grep(mypattern,thepage[168:1295],value=TRUE)
head(datalines,15)

##  [1] "<td><font size=\"2\"><a href=\"/movies/?id=avatar.htm\"><b>Avatar</b></a></font></td>"  
##  [2] "<td><font size=\"2\">Fox</font></td>"                                                   
##  [3] "<td align=\"right\"><font size=\"2\"><b>$2,788.0</b></font></td>"                       
##  [4] "<td align=\"right\"><font size=\"2\">$760.5</font></td>"                                
##  [5] "<td align=\"right\"><font size=\"2\">27.3%</font></td>"                                 
##  [6] "<td align=\"right\"><font size=\"2\">$2,027.5</font></td>"                              
##  [7] "<td align=\"right\"><font size=\"2\">72.7%</font></td>"                                 
##  [8] "<td align=\"center\"><font size=\"2\">2009^</font></td>"                                
##  [9] "<tr bgcolor=\"#f4f4ff\"><td align=\"center\"><font size=\"2\">2</font></td>"            
## [10] "<td><font size=\"2\"><a href=\"/movies/?id=titanic.htm\"><b>Titanic</b></a></font></td>"
## [11] "<td><font size=\"2\">Par.</font></td>"                                                  
## [12] "<td align=\"right\"><font size=\"2\"><b>$2,187.5</b></font></td>"                       
## [13] "<td align=\"right\"><font size=\"2\">$659.4</font></td>"                                
## [14] "<td align=\"right\"><font size=\"2\">30.1%</font></td>"                                 
## [15] "<td align=\"right\"><font size=\"2\">$1,528.1</font></td>"

 #This compicated code parses out only the data we need - I am sure there is a more efficinet way of doing this - but since regular expressions are hard it is the best I could do.
getexpr = function(s,g)substring(s,g,g+attr(g,'match.length')-1)
 gg = gregexpr('>(.+?)<',datalines)
 matches = mapply(getexpr,datalines,gg)
 result = gsub('>(.+?)<','\\1',matches)
 names(result) = NULL
 head(result)

## [1] "c(\"<font size=\\\"2\\\">\", \"<b>Avatar\", \"</a>\")"  
## [2] "<font size=\"2\">Fox"                                   
## [3] "c(\"<font size=\\\"2\\\">\", \"$2,788.0\", \"</font>\")"
## [4] "<font size=\"2\">$760.5"                                
## [5] "<font size=\"2\">27.3%"                                 
## [6] "<font size=\"2\">$2,027.5"

   result2 = str_extract(result, '>[:print:]*')
 head(result2)

## [1] ">\", \"<b>Avatar\", \"</a>\")"   ">Fox"                           
## [3] ">\", \"$2,788.0\", \"</font>\")" ">$760.5"                        
## [5] ">27.3%"                          ">$2,027.5"

    result3 = str_replace(result2, '>*', '')
 head(result3,10)

##  [1] "\", \"<b>Avatar\", \"</a>\")"   "Fox"                           
##  [3] "\", \"$2,788.0\", \"</font>\")" "$760.5"                        
##  [5] "27.3%"                          "$2,027.5"                      
##  [7] "72.7%"                          "2009^"                         
##  [9] "\", \"2\")"                     "\", \"<b>Titanic\", \"</a>\")"

     result4 = str_replace(result3, '[:punct:]*[:blank:]+[:punct:]*', '')
 head(result4,10)

##  [1] "<b>Avatar\", \"</a>\")"   "Fox"                     
##  [3] "$2,788.0\", \"</font>\")" "$760.5"                  
##  [5] "27.3%"                    "$2,027.5"                
##  [7] "72.7%"                    "2009^"                   
##  [9] "2\")"                     "<b>Titanic\", \"</a>\")"

      result5 = str_replace(result4, '<b>', '')
 head(result5,10)

##  [1] "Avatar\", \"</a>\")"      "Fox"                     
##  [3] "$2,788.0\", \"</font>\")" "$760.5"                  
##  [5] "27.3%"                    "$2,027.5"                
##  [7] "72.7%"                    "2009^"                   
##  [9] "2\")"                     "Titanic\", \"</a>\")"

      result6 = str_replace(result5, '\\", \"<[:graph:]*', '')
 head(result6,30)

##  [1] "Avatar"                       "Fox"                         
##  [3] "$2,788.0"                     "$760.5"                      
##  [5] "27.3%"                        "$2,027.5"                    
##  [7] "72.7%"                        "2009^"                       
##  [9] "2\")"                         "Titanic"                     
## [11] "Par."                         "$2,187.5"                    
## [13] "$659.4"                       "30.1%"                       
## [15] "$1,528.1"                     "69.9%"                       
## [17] "1997^"                        "3\")"                        
## [19] "Star Wars: The Force Awakens" "BV"                          
## [21] "$2,068.2"                     "$936.7"                      
## [23] "45.3%"                        "$1,131.6"                    
## [25] "54.7%"                        "2015"                        
## [27] "4\")"                         "Avengers: Infinity War"      
## [29] "BV"                           "$2,046.5"

       result7 = str_replace(result6, '\\"[:punct:]{1}', '')
 head(result7,30)

##  [1] "Avatar"                       "Fox"                         
##  [3] "$2,788.0"                     "$760.5"                      
##  [5] "27.3%"                        "$2,027.5"                    
##  [7] "72.7%"                        "2009^"                       
##  [9] "2"                            "Titanic"                     
## [11] "Par."                         "$2,187.5"                    
## [13] "$659.4"                       "30.1%"                       
## [15] "$1,528.1"                     "69.9%"                       
## [17] "1997^"                        "3"                           
## [19] "Star Wars: The Force Awakens" "BV"                          
## [21] "$2,068.2"                     "$936.7"                      
## [23] "45.3%"                        "$1,131.6"                    
## [25] "54.7%"                        "2015"                        
## [27] "4"                            "Avengers: Infinity War"      
## [29] "BV"                           "$2,046.5"

 #This code reads our parsed data into a dataframe.
 
finalmov <- data.frame("Title" = result7[seq(1, 899, 9)], "Studio" = result7[seq(2, 899, 9)], "Total" = result7[seq(3, 899, 9)], "DomGross" = result7[seq(4, 899, 9)], "DomPerc" = result7[seq(5, 899, 9)], "IntlGross" = result7[seq(6, 899, 9)], "IntlPerc" = result7[seq(7, 899, 9)], "Year" = result7[seq(8, 899, 9)])

head(finalmov)

##                          Title Studio    Total DomGross DomPerc IntlGross
## 1                       Avatar    Fox $2,788.0   $760.5   27.3%  $2,027.5
## 2                      Titanic   Par. $2,187.5   $659.4   30.1%  $1,528.1
## 3 Star Wars: The Force Awakens     BV $2,068.2   $936.7   45.3%  $1,131.6
## 4       Avengers: Infinity War     BV $2,046.5   $678.8   33.2%  $1,367.7
## 5               Jurassic World   Uni. $1,671.7   $652.3   39.0%  $1,019.4
## 6        Marvel's The Avengers     BV $1,518.8   $623.4   41.0%    $895.5
##   IntlPerc  Year
## 1    72.7% 2009^
## 2    69.9% 1997^
## 3    54.7%  2015
## 4    66.8%  2018
## 5    61.0%  2015
## 6    59.0%  2012