# Download the AT&T Wikipedia article and keep the text of its <p> elements.
page <- read_html("https://en.wikipedia.org/wiki/at&t")
text <- html_text(html_nodes(page, "p"))
# Drop paragraphs that came back empty.
text <- text[nzchar(text)]
# Strip inline Wikipedia markers so they do not stick to neighbouring words
# and end up inside extracted entities:
#   - numeric references such as [7] or [101] (any number of digits)
#   - "[citation needed]"-style tags
text <- gsub("\\[[0-9]+\\]|\\[citation[^]]*\\]", "", text)
# Make one complete document
text <- paste(text, collapse = " ")
# Convert to an NLP String so it can be subset by annotation spans.
text <- as.String(text)
# Named-entity recognition over the article text.
# The sentence and word annotators come first in the pipeline; the entity
# annotators follow and work on the annotations produced so far.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location")
person_annot <- Maxent_Entity_Annotator(kind = "person")

# Run the pipeline once with both entity annotators instead of re-running the
# sentence/word/location models a second time just to add persons.
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, person_annot)
)

# Entity kind of every annotation. Annotations whose features carry no "kind"
# entry get NA, which keeps the result a plain character vector.
k <- vapply(
  annot.l1$features,
  function(f) {
    kind <- f[["kind"]]
    if (is.null(kind)) NA_character_ else as.character(kind)[1L]
  },
  character(1)
)

# %in% never yields NA, so the logical index is safe to subset with.
at_t_locations <- text[annot.l1[k %in% "location"]]
at_tpersons <- text[annot.l1[k %in% "person"]]

all_persons <- unique(at_tpersons)
print(all_persons)
## [1] "Millward Brown Optimor" "Alexander Graham Bell"
## [3] "Robert" "Carlos Slim"
## [5] "George W. Bush" "Bill Leahy"
## [7] "Tim McKone" "AT&T"
## [9] "Mark Klein" "Mike McConnell"
## [11] "Keith Olbermann" "Barbara Popovic"
## [13] "Sue Buske" "Aaron Slator"
## [15] "Randall Stephenson"
# Collapse repeated location mentions to one entry each, then show them.
all_places <- unique(at_t_locations)
print(all_places)
## [1] "Dallas" "Texas"
## [3] "United States" "Telegraph"
## [5] "Canada" "Ma Bell."
## [7] "Latin America" "Connecticut"
## [9] "AT&T Mexico" "Wisconsin"
## [11] "New England" "West Virginia"
## [13] "San Antonio" "Downtown Dallas"
## [15] "St. Louis" "San Antonio.[citation"
## [17] "Redmond" "Washington"
## [19] "New Jersey" "Asia Pacific"
## [21] "Hong Kong" "Mexico"
## [23] "Los Angeles" "Atlanta"
## [25] "Houston" "AT&T"
## [27] "California" "San Francisco"
## [29] "Colombia" "Philippines"
# Geocode every unique place name and report how long the lookup took.
t1 <- Sys.time()
all_places_geocoded <- geocode(all_places) # use all_places[1:10] to try a subset
print(difftime(Sys.time(), t1, units = "sec"))
## Time difference of 22.23127 secs
all_places_geocoded # auto-print the geocoding result (lon / lat per place)
## lon lat
## 1 -96.79699 32.776664
## 2 -99.90181 31.968599
## 3 -95.71289 37.090240
## 4 -99.90618 30.327416
## 5 -106.34677 56.130366
## 6 -70.84135 42.612538
## 7 -86.25199 41.676355
## 8 -73.08775 41.603221
## 9 14.55007 47.516231
## 10 -88.78787 43.784440
## 11 -70.82265 43.965389
## 12 -80.45490 38.597626
## 13 -98.49363 29.424122
## 14 -96.80027 32.779091
## 15 -90.41252 38.610302
## 16 -98.52022 29.573371
## 17 -122.12151 47.673988
## 18 -77.03687 38.907192
## 19 -74.40566 40.058324
## 20 121.01259 14.675905
## 21 114.10950 22.396428
## 22 -102.55278 23.634501
## 23 -118.24368 34.052234
## 24 -84.38798 33.748995
## 25 -95.36980 29.760427
## 26 14.55007 47.516231
## 27 -119.41793 36.778261
## 28 -122.41942 37.774929
## 29 -74.29733 4.570868
## 30 121.77402 12.879721
# Draw the map returned by getMap() and overlay the geocoded places as red dots.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  # xlim = c(-20, 59), ylim = c(35, 71), # can select 'boxes' of lat-lon to focus on
  asp = 1
)
# Plot one point per geocoded place, using its lon / lat columns.
with(
  all_places_geocoded,
  points(lon, lat, col = "red", cex = 1.2, pch = 19)
)
