R Markdown

This R Markdown document plots the locations and the persons mentioned on the Amazon.com Wikipedia page.

From the generated world map, I can see that most of the locations and persons mentioned on the Amazon Wikipedia page are from North America.

The locations are plotted in the various countries around the world that are mentioned on the page, but they are most densely concentrated in North America and Europe.

# NOTE: the former `rm(list = ls())` call was removed on purpose. Nothing in
# this script needs an empty workspace, and clearing the global environment
# silently deletes every object the user has defined when the script is
# sourced or run interactively.

library(rvest)
## Loading required package: xml2
library(NLP)
library(openNLP)
library(ggmap)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
# Download the Amazon.com Wikipedia article and reduce it to a single
# plain-text document for the annotators used further down.
page <- read_html("https://en.wikipedia.org/wiki/Amazon.com")

# Keep the text of the <p> nodes only and drop paragraphs that are empty.
text <- html_text(html_nodes(page, "p"))
text <- text[text != ""]

# Remove numeric reference markers such as [7], [42] or [101]. `[0-9]+`
# matches any number of digits, so markers with four or more digits are
# stripped too (the previous pattern only handled one to three digits).
text <- gsub("\\[[0-9]+\\]", "", text)

# Make one complete document.
text <- paste(text, collapse = " ")

# Convert to an NLP String so it can later be subset by annotation spans.
text <- as.String(text)

# Annotators: sentence and word tokenisers, followed by the entity
# annotators for locations and persons.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location")
name_annot <- Maxent_Entity_Annotator(kind = "person")

# `NLP::` is spelled out because ggplot2 (attached via ggmap) masks annotate().
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, name_annot)
)

# "kind" feature of every annotation as a plain character vector. Annotations
# that carry no "kind" feature yield NA instead of NULL, so the result is
# type-stable (sapply() would return a ragged list here).
k <- vapply(
  annot.l1$features,
  function(feature) {
    kind <- feature[["kind"]]
    if (is.null(kind)) NA_character_ else as.character(kind)[1L]
  },
  character(1)
)

# `%in%` never returns NA, so annotations without a kind are simply skipped.
amazon_locations <- text[annot.l1[k %in% "location"]]
amazon_persons <- text[annot.l1[k %in% "person"]]


## +++ some downstream analysis with above data +++ ##
##

# We could do much with this info, e.g., improve lists by editing them with external domain knowledge, etc. 
# E.g., geocode the locations and create a map of the world of each article. 

# Collapse repeated mentions so that each entity string appears only once.
all_places <- unique(amazon_locations)
all_names <- unique(amazon_persons)

# Look up longitude/latitude for every distinct place string.
all_places_geocoded <- geocode(location = all_places)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Amazon&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Seattle&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Washington&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Amazon%20Kindle&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Amazon
## Kindle"
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20Kingdom&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Ireland&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=France&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Canada&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Germany&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Italy&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Spain&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Netherlands&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Australia&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Japan&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=China&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Mexico&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Bellevue&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Delaware&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Amazon%E2%80%99s&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Los%20Angeles&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Dallas&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Houston&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20Orleans&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Phoenix&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Grand%20Haven&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michigan&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Beijing%20Century&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=University%20Village&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Pennsylvania&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Berlin&sensor=false
print(all_places_geocoded) # show the lon/lat found for each place
##            lon       lat
## 1  -112.107222  46.32021
## 2  -122.332071  47.60621
## 3   -77.036871  38.90719
## 4           NA        NA
## 5   -95.712891  37.09024
## 6    -3.435973  55.37805
## 7    -8.243890  53.41291
## 8     2.213749  46.22764
## 9  -106.346771  56.13037
## 10   10.451526  51.16569
## 11   12.567380  41.87194
## 12   -3.749220  40.46367
## 13    5.291266  52.13263
## 14  133.775136 -25.27440
## 15  -51.925280 -14.23500
## 16  138.252924  36.20482
## 17  104.195397  35.86166
## 18   78.962880  20.59368
## 19 -102.552784  23.63450
## 20 -122.200679  47.61038
## 21  -75.527670  38.91083
## 22 -112.107222  46.32021
## 23 -118.243685  34.05223
## 24  -74.005941  40.71278
## 25  -96.796988  32.77666
## 26  -95.369803  29.76043
## 27  -90.071532  29.95107
## 28 -112.074037  33.44838
## 29  -86.228386  43.06307
## 30  -85.602364  44.31484
## 31  116.560144  40.34027
## 32 -117.338993  33.97608
## 33  -77.194525  41.20332
## 34   13.404954  52.52001
# Geocode the person strings as if they were addresses. NOTE(review): the
# captured output shows that strings such as "Barnes" or "Spencer" resolve to
# places sharing the name, so these coordinates are not reliable evidence of
# where a person is from.
all_names_geocoded <- geocode(location = all_names)
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Jeff%20Bezos&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Jeff Bezos"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=John%20Ingram&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Fluid%20Concepts&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Fluid
## Concepts"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Barnes&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Noble&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Marks&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Spencer&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Green%20Lantern&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Audible%20Studios&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Audible
## Studios"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michael%20Snodgrass&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Michael
## Snodgrass"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Otis%20Chandler&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Elizabeth%20Chandler.&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Goodreads&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Goodreads"
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Michael%20Jackson&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Randall%20Sullivan&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Netscape%20Secure%20Commerce%20Server&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Netscape
## Secure Commerce Server"
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brad%20Stone&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Diana%20L%C3%B6bl&sensor=false
## .
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Peter%20Onneken&sensor=false
## Warning: geocode failed with status ZERO_RESULTS, location = "Peter
## Onneken"
print(all_names_geocoded) # show the lon/lat found for each name (NA = no match)
##           lon       lat
## 1          NA        NA
## 2   -80.05966  40.46920
## 3          NA        NA
## 4   -91.50802  46.33369
## 5   -97.39475  35.13924
## 6  -116.38937  33.73474
## 7   -95.14432  43.14507
## 8   -97.06893  27.82093
## 9          NA        NA
## 10         NA        NA
## 11  -73.04512  42.22175
## 12 -111.83030  33.29986
## 13         NA        NA
## 14  -89.58001  37.33307
## 15  -86.14520  43.19472
## 16         NA        NA
## 17   -4.28180  50.60861
## 18  -43.29986 -22.84098
## 19         NA        NA
# Draw a high-resolution world base map, then overlay the geocoded points.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  # xlim = c(-20, 59), ylim = c(35, 71),   # can select 'boxes' of lat-lon to focus on
  asp = 1
)

# Locations mentioned in the article: red dots.
# (The stray trailing comma after `pch = 19` was removed; it passed an empty
# argument into `...`.)
points(
  all_places_geocoded$lon,
  all_places_geocoded$lat,
  col = "red", cex = 1.2, pch = 19
)

# Geocoded person strings: blue dots.
points(
  all_names_geocoded$lon,
  all_names_geocoded$lat,
  col = "blue", cex = 1.2, pch = 19
)