Submitted By : Neeraj Khattar, CBA (Batch-7), Roll #: 71620042

Prerequisite

Load the required libraries, installing any that are not already present.

# Packages this analysis depends on.
required_packages <- c("rvest", "NLP", "openNLP", "ggmap", "rworldmap")

# Install whatever is missing before attaching anything.
# install.packages() returns NULL invisibly, so it must not be used as an
# operand of `||`; the availability check is done explicitly instead.
# The workspace is deliberately not cleared here: run the script in a fresh
# session if a clean environment is needed.
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# library() stops with an error if a package still cannot be loaded.
library(rvest)
library(NLP)
library(openNLP)
library(ggmap)
library(rworldmap)

Step 1 & 2 - Select one well-known firm from the list of the Fortune 500 firms. For the selected firm, scrape its Wikipedia page.

In this example, we are going to parse the General Electric page from Wikipedia.

# Download the General Electric article and keep the text of its <p> nodes.
page <- read_html("https://en.wikipedia.org/wiki/General_Electric")
text <- html_text(html_nodes(page, "p"))

# Drop empty paragraphs.
text <- text[text != ""]

# Strip numeric citation markers such as [7] or [101], whatever their length.
text <- gsub("\\[[0-9]+\\]", "", text)

# Collapse the paragraphs into one document and convert it to an NLP String,
# the form the annotators below operate on.
text <- as.String(paste(text, collapse = " "))

Step 3 - Using openNLP, find all the locations and persons mentioned in the Wikipedia page. It's good practice to set timers and report runtimes for heavy functions.

# Start the clock so the annotation runtime can be reported.
t1 <- Sys.time()

# Sentence and word token annotators, then one entity annotator per kind.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location")
person_annot <- Maxent_Entity_Annotator(kind = "person")

# Run the annotators over the document in the listed order.
annot_pipeline <- list(sent_annot, word_annot, loc_annot, person_annot)
annot.l1 <- NLP::annotate(text, annot_pipeline)

# Elapsed time of the annotation run.
difftime(Sys.time(), t1)
## Time difference of 7.736443 secs
# Entity kind of every annotation. Annotations whose features carry no "kind"
# entry get NA, so the result is always a plain character vector.
k <- vapply(
  annot.l1$features,
  function(feature) {
    kind <- feature[["kind"]]
    if (is.null(kind)) NA_character_ else as.character(kind)[1L]
  },
  character(1)
)

# %in% never yields NA, so the logical masks are safe to index with.
GE_locations <- text[annot.l1[k %in% "location"]]
GE_person <- text[annot.l1[k %in% "person"]]

# Keep each entity once, in order of first appearance.
all_places <- unique(GE_locations)
all_person <- unique(GE_person)

print(all_places)
##  [1] "New York"                   "Boston"                    
##  [3] "Massachusetts"              "Medical"                   
##  [5] "East Newark"                "New Jersey"                
##  [7] "Lynn"                       "Drexel"                    
##  [9] "Nela Park"                  "East Cleveland"            
## [11] "Ohio"                       "United States"             
## [13] "Thomson-Houston"            "Gaithersburg"              
## [15] "Maryland"                   "GXS."                      
## [17] "Richmond"                   "India"                     
## [19] "Oak Hill"                   "NBCUniversal"              
## [21] "Israel"                     "Citigroup Inc."            
## [23] "Mexico"                     "Alstom"                    
## [25] "France"                     "Canada"                    
## [27] "New York City"              "Lexington Avenue"          
## [29] "Fairfield"                  "GE"                        
## [31] "Japan"                      "Cisco"                     
## [33] "New York State"             "Waterford"                 
## [35] "Housatonic River"           "Rome"                      
## [37] "Georgia"                    "Plymouth"                  
## [39] "Hudson River"               "Clearwater"                
## [41] "Pittsfield"                 "Woods"                     
## [43] "Woods Pond"                 "Ecomagination"             
## [45] "Northern Ireland"           "South Carolina"            
## [47] "Toronto International Film" "Tribeca Film"              
## [49] "Sundance Film"              "Bridgeport"                
## [51] "Connecticut"                "Ilium"
# Show the distinct person names found in the article.
print(all_person)
##  [1] "Aviation"               "Healthcare"            
##  [3] "Thomas Edison"          "Drexel"                
##  [5] "Anthony J. Drexel"      "Charles Coffin"        
##  [7] "Owen D. Young"          "Ernst Alexanderson"    
##  [9] "Sanford Alexander Moss" "Burroughs"             
## [11] "Nelson Peltz"           "Brian Gladden"         
## [13] "Jeffrey Immelt"         "Jack Welch"            
## [15] "Bill Ruh"               "David Lucas"           
## [17] "Wolff Olins"            "Robert Abrams"         
## [19] "Pete Seeger"            "Jeff Immelt"           
## [21] "Mr. Immelt"             "Albert Maysles"        
## [23] "Jessica Yu"             "Leslie Iwerks"         
## [25] "Steve James"            "Alex Gibney"           
## [27] "Lixin Fan"              "Gary Hustwit"          
## [29] "Short Films"            "Ronald Reagan"         
## [31] "Jack Donaghy"           "Alec Baldwin"          
## [33] "Phil Dusenberry"        "Marty Schultz"         
## [35] "Mathew Brady"           "Victor Kalin"
# Look up longitude/latitude for every extracted place name.
# Names that cannot be resolved show up as NA rows in the printed result.
all_places_geocoded <- geocode(location = all_places)
print(all_places_geocoded)
##            lon       lat
## 1   -74.005941  40.71278
## 2   -71.058880  42.36008
## 3   -71.382437  42.40721
## 4   -87.665753  41.87055
## 5   -74.163631  40.74970
## 6   -74.405661  40.05832
## 7   -70.949494  42.46676
## 8   -94.608566  38.47946
## 9   -81.560982  41.54003
## 10  -81.579014  41.53311
## 11  -82.907123  40.41729
## 12  -95.712891  37.09024
## 13  -95.339371  29.58460
## 14  -77.201370  39.14344
## 15  -76.641271  39.04575
## 16   77.599443  12.97298
## 17  -77.436048  37.54072
## 18   78.962880  20.59368
## 19   -2.233407  53.77890
## 20          NA        NA
## 21   34.851612  31.04605
## 22          NA        NA
## 23 -102.552784  23.63450
## 24   28.083171 -26.05987
## 25    2.213749  46.22764
## 26 -106.346771  56.13037
## 27  -74.005941  40.71278
## 28  -73.985680  40.73822
## 29 -122.039966  38.24936
## 30   43.356892  42.31541
## 31  138.252924  36.20482
## 32  -98.979234  32.38819
## 33  -74.217933  43.29943
## 34   -7.110070  52.25932
## 35  -73.350467  41.49324
## 36   12.496365  41.90278
## 37  -82.900075  32.16562
## 38   -4.142657  50.37546
## 39  -73.886034  42.39770
## 40  -82.800103  27.96585
## 41  -73.245382  42.45008
## 42  -98.748117  36.71819
## 43  -70.734148  44.03149
## 44          NA        NA
## 45   -6.492314  54.78771
## 46  -81.163725  33.83608
## 47  -79.383184  43.65323
## 48          NA        NA
## 49          NA        NA
## 50  -73.195177  41.18655
## 51  -73.087749  41.60322
## 52   23.700804  38.03700

Step 4 - Plot all the extracted locations from the Wikipedia page on a map. You may want to see ‘NLP location extract and plot.R’ file on LMS for this.

# Open a fresh graphics device. windows() only exists on Windows builds of R,
# so other platforms fall back to the portable dev.new().
if (.Platform$OS.type == "windows") {
  windows()
} else {
  dev.new()
}

# Draw the world map as the base layer.
newmap <- getMap(resolution = "high")
plot(newmap, asp = 1)

# Overlay the geocoded places; points() omits rows with NA coordinates.
points(
  all_places_geocoded$lon,
  all_places_geocoded$lat,
  col = "red", cex = 1.2, pch = 19
)

Step 5 - Briefly describe your observations based on the map and persons extracted from Wikipedia page.

From the locations, it is clear that General Electric has a prominent presence in Europe and the US. The extraction also identified “Boston, Massachusetts”, where GE has its headquarters.

From the people list, “Jeffrey Immelt”, “Jeff Immelt” and “Mr. Immelt” all refer to the Chairman and CEO of General Electric. “Thomas Edison” and “Charles Coffin” were identified as founders, but two other names are missing from the list. “Aviation”, “Healthcare” and “Short Films” are keywords that can be skipped, as they are not names of people.