Include libraries
library(rvest)
## Loading required package: xml2
library(NLP)
library(openNLP)
library(ggmap)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
# Download the Bank of America Wikipedia article and keep the text of every
# non-empty paragraph.
page <- read_html("https://en.wikipedia.org/wiki/Bank_of_America")
text <- html_text(html_nodes(page, "p"))
text <- text[text != ""]
# Remove numeric reference markers such as [7] or [101]; `+` accepts any
# number of digits instead of enumerating one-, two- and three-digit cases.
text <- gsub("\\[[0-9]+\\]", "", text)
# Make one complete document
text <- paste(text, collapse = " ")
text <- as.String(text)

# Annotators: sentences and words first, then location and person entities.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location") # annotate location
people_annot <- Maxent_Entity_Annotator(kind = "person") # annotate person
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, people_annot)
)

# Pull the "kind" feature of every annotation as a plain character vector.
# Annotations whose features have no "kind" entry (presumably the sentence
# and word annotations) become NA, so `k` is always character rather than a
# list containing NULLs.
k <- vapply(
  annot.l1$features,
  function(feat) {
    kind <- feat[["kind"]]
    if (is.null(kind)) NA_character_ else kind
  },
  character(1)
)
# `%in%` never yields NA, so the masks below are strictly TRUE/FALSE.
boa_locations <- text[annot.l1[k %in% "location"]]
boa_people <- text[annot.l1[k %in% "person"]]
# The extracted entities could be refined further, e.g. by editing the lists
# with external domain knowledge.
# Here the locations are geocoded so that a world map of the article's places
# can be drawn.
all_places <- unique(boa_locations) # distinct location strings
all_places_geocoded <- geocode(all_places) # [1:10]
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Charlotte&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=North%20Carolina&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=District%20of%20Columbia&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Italy&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=San%20Francisco&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Los%20Angeles&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=California&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=San%20Jose&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Seattle&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Washington&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Seafirst&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Seafirst"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Latin%20America&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Arizona&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Idaho&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Oregon&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Rainier%20Bank&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=West&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Nevada&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Chicago&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Oklahoma%20City&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Penn%20Square%20Bank&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=San%20Francisco%E2%80%93based&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20Mexico&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Chile&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=America&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Netherlands&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Detroit&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Illinois&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michigan&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Indiana&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Singapore&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Merrill&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Richmond&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Troubled&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Troubled"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Manhattan&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20Kingdom&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=France&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=China&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Germany&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Mexico&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Canada&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Australia&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20Zealand&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Avenue%20of%20the%20Americas&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Hong%20Kong&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Shanghai&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=London&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Uptown%20Charlotte&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Uptown
## Charlotte"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Massachusetts&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Miami&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Europe&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Iran&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Anchorage&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Alaska&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Providence&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Rhode%20Island&sensor=false
all_places_geocoded # print the lon/lat table returned by geocode()
## lon lat
## 1 -80.8431267 35.227087
## 2 -79.0192997 35.759573
## 3 -95.7128910 37.090240
## 4 -77.0368707 38.907192
## 5 12.5673800 41.871940
## 6 -122.4194155 37.774929
## 7 -118.2436849 34.052234
## 8 -119.4179324 36.778261
## 9 -121.8863286 37.338208
## 10 -122.3320708 47.606209
## 11 -77.0368707 38.907192
## 12 NA NA
## 13 -86.2519898 41.676355
## 14 -111.0937311 34.048928
## 15 -114.7420408 44.068202
## 16 -120.5542012 43.804133
## 17 -122.2472392 47.511260
## 18 4.8625133 52.373354
## 19 -116.4193890 38.802610
## 20 -87.6297982 41.878114
## 21 -97.5164276 35.467560
## 22 -76.3056373 40.037743
## 23 -122.4194155 37.774929
## 24 -105.8700901 34.519940
## 25 -51.9252800 -14.235004
## 26 -71.5429690 -35.675147
## 27 -95.7128910 37.090240
## 28 5.2912660 52.132633
## 29 -83.0457538 42.331427
## 30 -89.3985283 40.633125
## 31 -85.6023643 44.314844
## 32 -86.1349019 40.267194
## 33 103.8198360 1.352083
## 34 -89.6834590 45.180522
## 35 -77.4360481 37.540725
## 36 NA NA
## 37 -74.0059413 40.712784
## 38 -96.5716694 39.183608
## 39 -3.4359730 55.378051
## 40 2.2137490 46.227638
## 41 104.1953970 35.861660
## 42 10.4515260 51.165691
## 43 -102.5527840 23.634501
## 44 -106.3467710 56.130366
## 45 133.7751360 -25.274398
## 46 174.8859710 -40.900557
## 47 -74.0059413 40.712784
## 48 -73.9927978 40.742903
## 49 114.1094970 22.396428
## 50 121.4737010 31.230416
## 51 -0.1277583 51.507351
## 52 NA NA
## 53 -71.3824374 42.407211
## 54 -80.1917902 25.761680
## 55 15.2551187 54.525961
## 56 53.6880460 32.427908
## 57 -149.9002778 61.218056
## 58 -149.4936733 64.200841
## 59 -71.4128343 41.823989
## 60 -71.4774291 41.580095
# getMap(resolution = "high") needs the rworldxtra package; without it,
# rworldmap warns and falls back to the low-resolution map. Pick the
# resolution up front so the script runs without that warning either way.
map_resolution <- if (requireNamespace("rworldxtra", quietly = TRUE)) {
  "high"
} else {
  "low"
}
newmap <- getMap(resolution = map_resolution)
## Warning in getMap(resolution = "high"): for resolution='high' option you
## need to install package rworldxtra, using low resolution version for now
# Draw the base world map, then overlay every geocoded place as a filled
# red dot.
plot(newmap, asp = 1)
with(
  all_places_geocoded,
  points(lon, lat, pch = 19, cex = 1.2, col = "red")
)
# Show each distinct string that was tagged as a location
unique(boa_locations)
## [1] "Charlotte" "North Carolina"
## [3] "United States" "District of Columbia"
## [5] "Italy" "San Francisco"
## [7] "Los Angeles" "California"
## [9] "San Jose" "Seattle"
## [11] "Washington" "Seafirst"
## [13] "Latin America" "Arizona"
## [15] "Idaho" "Oregon"
## [17] "Rainier Bank" "West"
## [19] "Nevada" "Chicago"
## [21] "Oklahoma City" "Penn Square Bank"
## [23] "San Francisco<U+0096>based" "New Mexico"
## [25] "Brazil" "Chile"
## [27] "America" "Netherlands"
## [29] "Detroit" "Illinois"
## [31] "Michigan" "Indiana"
## [33] "Singapore" "Merrill"
## [35] "Richmond" "Troubled"
## [37] "New York" "Manhattan"
## [39] "United Kingdom" "France"
## [41] "China" "Germany"
## [43] "Mexico" "Canada"
## [45] "Australia" "New Zealand"
## [47] "New York City" "Avenue of the Americas"
## [49] "Hong Kong" "Shanghai"
## [51] "London" "Uptown Charlotte"
## [53] "Massachusetts" "Miami"
## [55] "Europe" "Iran"
## [57] "Anchorage" "Alaska"
## [59] "Providence" "Rhode Island"
# Show each distinct string that was tagged as a person
unique(boa_people)
## [1] "Citigroup" "Wells Fargo<U+0097>its"
## [3] "Orra" "Sam Armacost"
## [5] "Armacost" "Tom) Clausen"
## [7] "Charles Schwab" "Robertson Stephens"
## [9] "BancAmerica" "Hugh McColl"
## [11] "BancAmerica Robertson Stephens" "McColl"
## [13] "Ken Lewis" "Brian Moynihan"
## [15] "Eric Holder" "John Thain"
## [17] "Nelson Chai" "Tom Montag"
## [19] "Kenneth Lewis" "Lewis"
## [21] "Jeffrey Lacker" "Mr. Lewis"
## [23] "Andrew Cuomo" "David Rosenfeld"
## [25] "Arthur" "39,000"
## [27] "Douglas Campbell" "Jed Kolko"
## [29] "Robert Madsen" "Edward O'Donnell"
## [31] "Michelle Moore" "Peter Wannemacher"
## [33] "JPMorgan Chase" "Wells Fargo"
## [35] "Santander Serfin" "Syndicated Loans"
## [37] "China" "Banco Itaú"
## [39] "Joe L. Price" "Ann Minch"
## [41] "Julian Assange" "Nick Baumann"
## [43] "Daniel Domscheit-Berg" "Brian Penny"
## [45] "Betty Riess" "B. Atwood Building"
The output also includes Citigroup (it appears in the list of extracted people).
It correctly identifies the bank's main presence in Charlotte.
It also identifies the bank's competitors in that location that are widely talked about.
While all the locations where it has a presence are correct, there is room for a few improvements in capturing its presence in non-banking areas as well.