Include libraries
library(rvest)
## Loading required package: xml2
library(NLP)
library(openNLP)
library(ggmap)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(rworldmap)
## Warning: package 'rworldmap' was built under R version 3.3.2
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
# Download the Wikipedia article and keep the text of its paragraph (<p>) nodes.
page <- read_html("https://en.wikipedia.org/wiki/Berkshire_Hathaway")
text <- html_text(html_nodes(page, "p"))
# Drop empty paragraphs.
text <- text[text != ""]
# Remove numeric reference markers such as [7] or [101]; `+` matches any
# number of digits instead of enumerating the 1-, 2- and 3-digit cases.
text <- gsub("\\[[0-9]+\\]", "", text)
# Make one complete document
text <- paste(text, collapse = " ")
# Convert to an NLP String so the annotation spans computed later can be used
# to subset it directly.
text <- as.String(text)
# OpenNLP Maxent annotators: sentence and word tokenizers, plus entity
# annotators for locations and persons.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location")
people_annot <- Maxent_Entity_Annotator(kind = "person")

# Run the annotators in order over the document. The call is namespace-qualified
# because attaching ggplot2 masks NLP's annotate().
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, people_annot)
)

# Entity kind of every annotation. Annotations whose feature list has no
# "kind" entry get NA, so the result is always a plain character vector
# rather than a ragged list.
k <- vapply(
  annot.l1$features,
  function(feature) {
    kind <- feature[["kind"]]
    if (is.null(kind)) NA_character_ else as.character(kind)
  },
  character(1)
)

# %in% never yields NA, so the masks are safe to use as subscripts.
berk_locations <- text[annot.l1[k %in% "location"]]
berk_people <- text[annot.l1[k %in% "person"]]
# We could do much with this info, e.g., improve lists by editing them with external domain knowledge, etc.
# E.g., geocode the locations and create a map of the world of each article.
# Distinct location strings extracted from the article.
all_places <- unique(berk_locations)
# Look up longitude/latitude for every place name. The printed result shows NA
# coordinates for at least one entry that could not be resolved.
# NOTE(review): the original line carried a trailing "#[1:10]", presumably a
# leftover switch for geocoding only the first ten places.
all_places_geocoded <- geocode(location = all_places)
# Auto-print the coordinate table for inspection.
all_places_geocoded
## lon lat
## 1 -95.99799 41.25236
## 2 -99.90181 41.49254
## 3 -95.71289 37.09024
## 4 -122.16860 37.42270
## 5 -85.88751 39.24389
## 6 -81.97561 35.01251
## 7 -71.47743 41.58009
## 8 -95.71289 37.09024
## 9 -89.77553 44.01970
## 10 -71.38244 42.40721
## 11 -70.93420 41.63622
## 12 -76.18660 42.30424
## 13 NA NA
## 14 -84.27002 37.83933
## 15 -97.33077 32.75549
## 16 -99.90181 31.96860
## 17 -74.40566 40.05832
## 18 -106.34677 56.13037
## 19 -91.83183 37.96425
## 20 -83.92074 35.96064
## 21 -86.58045 35.51749
## 22 -79.38318 43.65323
## 23 -84.19161 39.75895
## 24 -73.87397 40.77693
## 25 -73.83308 40.76750
## 26 -74.00594 40.71278
## 27 -94.57857 39.09973
## 28 -96.82917 32.96179
## 29 -89.39853 40.63312
## 30 -119.41793 36.77826
## 31 -93.34995 44.88969
## 32 -94.68590 46.72955
## 33 -78.87837 42.88645
## 34 -93.09770 41.87800
## 35 -77.40398 37.64408
## 36 -79.79198 36.07264
## 37 -74.42293 39.36428
## 38 -80.19179 25.76168
## 39 -93.26501 44.97775
## 40 -82.90008 32.16562
## 41 -86.13490 40.26719
## 42 15.25512 54.52596
## 43 -51.92528 -14.23500
## 44 -87.34751 35.90090
# World map from rworldmap, requested at "high" resolution, as the base layer.
newmap <- getMap(resolution = "high")
plot(newmap, asp = 1)
# Overlay one filled red dot per geocoded place, using the lon/lat columns.
with(
  all_places_geocoded,
  points(lon, lat, col = "red", cex = 1.2, pch = 19)
)
# Show the distinct strings tagged as "person" entities. The listing that
# follows also contains company and brand names, so treat it as raw tagger
# output rather than a clean list of people.
unique(berk_people)
## [1] "Dairy Queen"
## [2] "Helzberg Diamonds"
## [3] "Warren Buffett"
## [4] "Charlie Munger"
## [5] "Oliver Chace"
## [6] "Samuel Slater"
## [7] "Seabury Stanton"
## [8] "Stanton"
## [9] "Buffett"
## [10] "Magazine"
## [11] "Lloyd Blankfein"
## [12] "David Gottesman"
## [13] "Franklin Otis Booth"
## [14] "Bill Gates"
## [15] "Arnold Schwarzenegger"
## [16] "Jamie Lee Curtis"
## [17] "Nicollette Sheridan"
## [18] "Walter Scott"
## [19] "Thomas S. Murphy"
## [20] "Howard Graham Buffett"
## [21] "Warren"
## [22] "Ronald Olson"
## [23] "Steve Burke"
## [24] "Susan Decker"
## [25] "Todd Combs"
## [26] "A++"
## [27] "Russell Corporation"
## [28] "Justin Brands"
## [29] "Justin Boots"
## [30] "Justin Original Workboots"
## [31] "Nocona Boots"
## [32] "Tony Lama Boots"
## [33] "Benjamin Moore"
## [34] "Moore"
## [35] "Inc. Shaw"
## [36] "Clayton Homes"
## [37] "Inc."
## [38] "Clayton"
## [39] "Ben Bridge Jeweler"
## [40] "Graham Holdings Company"
## [41] "Scott Fetzer Companies<U+0096>The Scott Fetzer Companies"
## [42] "Wayne Water Systems"
## [43] "Campbell Hausfeld"
## [44] "Scott Fetzer"
## [45] "AAI"
## [46] "Gymnastics"
## [47] "Lee Enterprises"
# Show the distinct strings tagged as "location" entities (the same expression
# that was used to build all_places before geocoding).
unique(berk_locations)
## [1] "Omaha" "Nebraska"
## [3] "United States" "Mars"
## [5] "Valley Falls Company" "Valley Falls"
## [7] "Rhode Island" "America"
## [9] "Adams" "Massachusetts"
## [11] "New Bedford" "Berkshire"
## [13] "Charlotte Guyman" "Kentucky"
## [15] "Fort Worth" "Texas"
## [17] "New Jersey" "Canada"
## [19] "Missouri" "Knoxville"
## [21] "Tennessee" "Toronto"
## [23] "Dayton" "LaGuardia Airport"
## [25] "Flushing" "New York"
## [27] "Kansas City" "Addison"
## [29] "Illinois" "California"
## [31] "Edina" "Minnesota"
## [33] "Buffalo" "Iowa"
## [35] "Richmond Times-Dispatch" "Greensboro"
## [37] "Atlantic City" "Miami"
## [39] "Minneapolis" "Georgia"
## [41] "Indiana" "Europe"
## [43] "Brazil" "Wrigley"
Maxent_Entity_Annotator - Generates an annotator which computes entity annotations using the Apache OpenNLP Maxent name finder
Observed that locations in the US and Europe are recognised better than those in other parts of the world (tried with different wiki links).
There are some inconsistencies in identifying person names.
It also wrongly identifies a few terms as person names, such as Inc., AAI, Gymnastics, Lee Enterprises and A++.