This is an R Markdown document that plots the locations and the persons mentioned on the Amazon Wikipedia page.
From the generated world map, I can see that most of the locations and persons mentioned on the Amazon Wikipedia page are in North America.
The locations mentioned on the page are spread across different countries around the world, but they are most densely concentrated in North America and Europe.
# Start from an empty workspace: remove every object in the current environment.
rm(list = ls())
library(rvest)
## Loading required package: xml2
library(NLP)
library(openNLP)
library(ggmap)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
# Download the Amazon.com Wikipedia article and keep the text of its <p> nodes.
page <- read_html("https://en.wikipedia.org/wiki/Amazon.com")
text <- html_text(html_nodes(page, "p"))

# Drop paragraphs that are empty strings.
text <- text[text != ""]

# Strip numeric citation markers such as [7] or [101]. A single "[0-9]+"
# pattern replaces the three fixed-width alternatives, so markers with four or
# more digits are removed as well.
text <- gsub("\\[[0-9]+\\]", "", text)

# Make one complete document: join the paragraphs with single spaces and
# convert the result to an NLP String object.
text <- paste(text, collapse = " ")
text <- as.String(text)
# Build the openNLP annotators: sentence and word tokenisers, followed by the
# location and person entity detectors.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location")
name_annot <- Maxent_Entity_Annotator(kind = "person")

# Call NLP::annotate() with its namespace: ggplot2 masks annotate() once it is
# attached (see the package start-up messages).
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, name_annot)
)

# Pull the "kind" feature out of every annotation. Annotations that carry no
# "kind" feature yield NA, so `k` is always a plain character vector; sapply()
# would return a list whenever some annotation has no such feature.
k <- vapply(
  annot.l1$features,
  function(feature) {
    kind <- feature[["kind"]]
    if (is.null(kind)) NA_character_ else as.character(kind)
  },
  character(1)
)

# %in% never returns NA, so the masks below are safe logical subscripts.
amazon_locations <- text[annot.l1[k %in% "location"]]
amazon_persons <- text[annot.l1[k %in% "person"]]
## +++ downstream analysis of the extracted entities +++ ##
##
# Much more could be done with these lists, e.g. cleaning them up by hand using
# external domain knowledge.
# Here the locations are geocoded so they can be drawn on a world map.
all_places <- unique(amazon_locations) # de-duplicated location strings
all_names <- unique(amazon_persons) # de-duplicated person strings
# Append [1:10] to all_places to geocode only a small sample.
all_places_geocoded <- geocode(location = all_places)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Amazon&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Seattle&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Washington&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Amazon%20Kindle&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Amazon
## Kindle"
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20Kingdom&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Ireland&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=France&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Canada&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Germany&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Italy&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Spain&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Netherlands&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Australia&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Japan&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=China&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Mexico&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Bellevue&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Delaware&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Amazon%E2%80%99s&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Los%20Angeles&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Dallas&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Houston&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20Orleans&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Phoenix&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Grand%20Haven&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michigan&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Beijing%20Century&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=University%20Village&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Pennsylvania&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Berlin&sensor=false
# Print the geocoding result for the locations: a table of lon/lat columns.
# Rows holding NA are lookups that failed (e.g. "Amazon Kindle" in the log above).
all_places_geocoded # view contents of this obj
## lon lat
## 1 -112.107222 46.32021
## 2 -122.332071 47.60621
## 3 -77.036871 38.90719
## 4 NA NA
## 5 -95.712891 37.09024
## 6 -3.435973 55.37805
## 7 -8.243890 53.41291
## 8 2.213749 46.22764
## 9 -106.346771 56.13037
## 10 10.451526 51.16569
## 11 12.567380 41.87194
## 12 -3.749220 40.46367
## 13 5.291266 52.13263
## 14 133.775136 -25.27440
## 15 -51.925280 -14.23500
## 16 138.252924 36.20482
## 17 104.195397 35.86166
## 18 78.962880 20.59368
## 19 -102.552784 23.63450
## 20 -122.200679 47.61038
## 21 -75.527670 38.91083
## 22 -112.107222 46.32021
## 23 -118.243685 34.05223
## 24 -74.005941 40.71278
## 25 -96.796988 32.77666
## 26 -95.369803 29.76043
## 27 -90.071532 29.95107
## 28 -112.074037 33.44838
## 29 -86.228386 43.06307
## 30 -85.602364 44.31484
## 31 116.560144 40.34027
## 32 -117.338993 33.97608
## 33 -77.194525 41.20332
## 34 13.404954 52.52001
# Geocode the person names as if they were addresses. Names with no matching
# address come back as NA; append [1:10] to all_names to try a small sample.
all_names_geocoded <- geocode(location = all_names)
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Jeff%20Bezos&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Jeff Bezos"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=John%20Ingram&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Fluid%20Concepts&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Fluid
## Concepts"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Barnes&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Noble&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Marks&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Spencer&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Green%20Lantern&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Audible%20Studios&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Audible
## Studios"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michael%20Snodgrass&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Michael
## Snodgrass"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Otis%20Chandler&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Elizabeth%20Chandler.&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Goodreads&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Goodreads"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michael%20Jackson&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Randall%20Sullivan&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Netscape%20Secure%20Commerce%20Server&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Netscape
## Secure Commerce Server"
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brad%20Stone&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Diana%20L%C3%B6bl&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Peter%20Onneken&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Peter
## Onneken"
# Print the geocoding result for the person names: a table of lon/lat columns.
# Rows holding NA are names the geocoder could not resolve (e.g. "Jeff Bezos").
all_names_geocoded # view contents of this obj
## lon lat
## 1 NA NA
## 2 -80.05966 40.46920
## 3 NA NA
## 4 -91.50802 46.33369
## 5 -97.39475 35.13924
## 6 -116.38937 33.73474
## 7 -95.14432 43.14507
## 8 -97.06893 27.82093
## 9 NA NA
## 10 NA NA
## 11 -73.04512 42.22175
## 12 -111.83030 33.29986
## 13 NA NA
## 14 -89.58001 37.33307
## 15 -86.14520 43.19472
## 16 NA NA
## 17 -4.28180 50.60861
## 18 -43.29986 -22.84098
## 19 NA NA
# Draw the base world map from rworldmap.
# NOTE(review): resolution = "high" is believed to need the rworldxtra package
# to be installed -- confirm against the rworldmap documentation.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  # xlim = c(-20, 59), ylim = c(35, 71), # can select 'boxes' of lat-lon to focus on
  asp = 1
)

# Overlay the geocoded locations (red) and geocoded person names (blue) as
# filled circles. The trailing commas that used to end each points() call have
# been removed so that no empty argument is passed on to the plotting code.
points(
  all_places_geocoded$lon,
  all_places_geocoded$lat,
  col = "red", cex = 1.2, pch = 19
)
points(
  all_names_geocoded$lon,
  all_names_geocoded$lat,
  col = "blue", cex = 1.2, pch = 19
)