Load the required libraries, installing any that are not already present.
# Session setup: clear the workspace, install any missing packages, then
# attach everything the script uses.
# NOTE(review): rm(list = ls()) wipes every object in the global environment;
# kept to preserve the script's existing behaviour, but consider removing it.
rm(list = ls())

# install.packages() returns NULL invisibly, so the former
# `require(pkg) || install.packages(pkg)` idiom raised
# "invalid 'y' type in 'x || y'" immediately after installing a missing
# package. Check availability with requireNamespace() and install explicitly.
# local() keeps the helper variables out of the global environment.
local({
  needed <- c("rvest", "NLP", "openNLP", "ggmap", "rworldmap")
  available <- vapply(needed, requireNamespace, logical(1), quietly = TRUE)
  if (!all(available)) {
    install.packages(needed[!available])
  }
})

library(rvest)
library(NLP)
library(openNLP)
library(ggmap)
library(rworldmap)
In this example, we are going to parse the General Electric article from Wikipedia.
# Fetch the General Electric article and keep the text of its <p> nodes.
page <- read_html("https://en.wikipedia.org/wiki/General_Electric")
text <- page %>%
  html_nodes("p") %>%
  html_text()

# Drop empty paragraphs, then strip bracketed numeric reference markers of
# one to three digits (e.g. [7], [42], [101]).
text <- text[nzchar(text)]
text <- gsub(
  "\\[[0-9]]|\\[[0-9][0-9]]|\\[[0-9][0-9][0-9]]",
  "",
  text
)

# Join all paragraphs into one document and convert it to an NLP String.
text <- as.String(paste(text, collapse = " "))
# Record the start time so the cost of building and running the annotators
# can be reported below.
t1 <- Sys.time()

# Sentence and word tokenizers, plus entity annotators for the "location"
# and "person" kinds.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location")
person_annot <- Maxent_Entity_Annotator(kind = "person")

# Run all four annotators over the document, in the order listed.
annot.l1 <- NLP::annotate(
  s = text,
  f = list(sent_annot, word_annot, loc_annot, person_annot)
)

difftime(Sys.time(), t1) # elapsed time for the steps above
## Time difference of 7.736443 secs
# Collect the "kind" feature of every annotation as a plain character vector.
# Annotations that carry no "kind" feature yield NA, so the result is always
# character (sapply() returned a list in that case, and the later comparison
# relied on implicit list-to-character coercion).
k <- vapply(
  annot.l1$features,
  function(feature) {
    kind <- feature[["kind"]]
    if (length(kind) == 0L) NA_character_ else as.character(kind[[1L]])
  },
  character(1)
)

# %in% never returns NA, so annotations without a kind are simply excluded.
GE_locations <- text[annot.l1[k %in% "location"]]
GE_person <- text[annot.l1[k %in% "person"]]

# De-duplicate the extracted entity strings.
all_places <- unique(GE_locations) # view contents of this obj
all_person <- unique(GE_person)

print(all_places)
## [1] "New York" "Boston"
## [3] "Massachusetts" "Medical"
## [5] "East Newark" "New Jersey"
## [7] "Lynn" "Drexel"
## [9] "Nela Park" "East Cleveland"
## [11] "Ohio" "United States"
## [13] "Thomson-Houston" "Gaithersburg"
## [15] "Maryland" "GXS."
## [17] "Richmond" "India"
## [19] "Oak Hill" "NBCUniversal"
## [21] "Israel" "Citigroup Inc."
## [23] "Mexico" "Alstom"
## [25] "France" "Canada"
## [27] "New York City" "Lexington Avenue"
## [29] "Fairfield" "GE"
## [31] "Japan" "Cisco"
## [33] "New York State" "Waterford"
## [35] "Housatonic River" "Rome"
## [37] "Georgia" "Plymouth"
## [39] "Hudson River" "Clearwater"
## [41] "Pittsfield" "Woods"
## [43] "Woods Pond" "Ecomagination"
## [45] "Northern Ireland" "South Carolina"
## [47] "Toronto International Film" "Tribeca Film"
## [49] "Sundance Film" "Bridgeport"
## [51] "Connecticut" "Ilium"
# Show the unique person entities.
print(all_person)
## [1] "Aviation" "Healthcare"
## [3] "Thomas Edison" "Drexel"
## [5] "Anthony J. Drexel" "Charles Coffin"
## [7] "Owen D. Young" "Ernst Alexanderson"
## [9] "Sanford Alexander Moss" "Burroughs"
## [11] "Nelson Peltz" "Brian Gladden"
## [13] "Jeffrey Immelt" "Jack Welch"
## [15] "Bill Ruh" "David Lucas"
## [17] "Wolff Olins" "Robert Abrams"
## [19] "Pete Seeger" "Jeff Immelt"
## [21] "Mr. Immelt" "Albert Maysles"
## [23] "Jessica Yu" "Leslie Iwerks"
## [25] "Steve James" "Alex Gibney"
## [27] "Lixin Fan" "Gary Hustwit"
## [29] "Short Films" "Ronald Reagan"
## [31] "Jack Donaghy" "Alec Baldwin"
## [33] "Phil Dusenberry" "Marty Schultz"
## [35] "Mathew Brady" "Victor Kalin"
# Geocode every unique place name with geocode()
# (index all_places with [1:10] to try a smaller batch first).
all_places_geocoded <- geocode(location = all_places)
all_places_geocoded # inspect the resulting lon/lat table
## lon lat
## 1 -74.005941 40.71278
## 2 -71.058880 42.36008
## 3 -71.382437 42.40721
## 4 -87.665753 41.87055
## 5 -74.163631 40.74970
## 6 -74.405661 40.05832
## 7 -70.949494 42.46676
## 8 -94.608566 38.47946
## 9 -81.560982 41.54003
## 10 -81.579014 41.53311
## 11 -82.907123 40.41729
## 12 -95.712891 37.09024
## 13 -95.339371 29.58460
## 14 -77.201370 39.14344
## 15 -76.641271 39.04575
## 16 77.599443 12.97298
## 17 -77.436048 37.54072
## 18 78.962880 20.59368
## 19 -2.233407 53.77890
## 20 NA NA
## 21 34.851612 31.04605
## 22 NA NA
## 23 -102.552784 23.63450
## 24 28.083171 -26.05987
## 25 2.213749 46.22764
## 26 -106.346771 56.13037
## 27 -74.005941 40.71278
## 28 -73.985680 40.73822
## 29 -122.039966 38.24936
## 30 43.356892 42.31541
## 31 138.252924 36.20482
## 32 -98.979234 32.38819
## 33 -74.217933 43.29943
## 34 -7.110070 52.25932
## 35 -73.350467 41.49324
## 36 12.496365 41.90278
## 37 -82.900075 32.16562
## 38 -4.142657 50.37546
## 39 -73.886034 42.39770
## 40 -82.800103 27.96585
## 41 -73.245382 42.45008
## 42 -98.748117 36.71819
## 43 -70.734148 44.03149
## 44 NA NA
## 45 -6.492314 54.78771
## 46 -81.163725 33.83608
## 47 -79.383184 43.65323
## 48 NA NA
## 49 NA NA
## 50 -73.195177 41.18655
## 51 -73.087749 41.60322
## 52 23.700804 38.03700
# Plot the geocoded places as red dots on a world map.
# dev.new() opens the platform's default graphics device; the previous
# windows() call exists only on Windows builds of R and fails elsewhere.
dev.new()

# NOTE(review): resolution = "high" appears to rely on the rworldxtra
# package being installed -- confirm.
newmap <- getMap(resolution = "high")
plot(newmap, asp = 1)

# Overlay one filled red point (pch = 19) per geocoded location.
points(
  all_places_geocoded$lon,
  all_places_geocoded$lat,
  col = "red", cex = 1.2, pch = 19
)
From the locations, it is clear that General Electric has a prominent presence in Europe and the US. The annotator also identified “Boston, Massachusetts”, where GE has its headquarters.
From the people list, the annotator identified “Jeffrey Immelt”, “Jeff Immelt” and “Mr. Immelt”, all referring to the Chairman and CEO of General Electric. It also found “Thomas Edison” and “Charles Coffin” as founders, but missed two other founders' names. “Aviation”, “Healthcare” and “Short Films” are keywords that can be skipped, as they are not names of people.