rm(list = ls()) # clears workspace
# A recent Java version must be installed on this machine before you start
# (rJava needs a JVM to load).
#
# Install each NLP package if it is missing, then attach it. The previous
# `require(pkg) || install.packages(pkg)` form failed exactly when a package
# was missing: install.packages() returns NULL invisibly, so `FALSE || NULL`
# raised an error and the freshly installed package was never attached.
invisible(lapply(
  c("rJava", "NLP", "openNLP", "RWeka"),
  function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    # library() errors loudly if the package still cannot be loaded.
    library(pkg, character.only = TRUE)
  }
))
library(rvest)
library(ggmap)
library(rworldmap)
t1 <- Sys.time() # set timer
# Download the Walmart article and keep the text of every <p> element.
page <- read_html("https://en.wikipedia.org/wiki/Walmart")
paragraphs <- html_text(html_nodes(page, "p"))
paragraphs <- paragraphs[nzchar(paragraphs)] # drop empty paragraphs
# Remove numeric reference markers such as [7] or [101]. `[0-9]+` matches any
# number of digits, so markers with four or more digits are stripped as well.
paragraphs <- gsub("\\[[0-9]+\\]", "", paragraphs)
# Make one complete document: join the paragraphs with single spaces and wrap
# the result as an NLP String so it can be indexed by annotations later.
text <- as.String(paste(paragraphs, collapse = " "))
Sys.time() - t1 # how much time did the above take?
## Time difference of 2.182449 secs
t1 <- Sys.time() # restart the timer for the annotation step

# Tokenizers: sentence boundaries first, then words.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()

# One named-entity annotator per entity kind we want to pull out.
loc_annot <- Maxent_Entity_Annotator(kind = "location")
person_annot <- Maxent_Entity_Annotator(kind = "person")
org_annot <- Maxent_Entity_Annotator(kind = "organization")

# Run the whole pipeline over the document; the annotators are applied in
# the order listed (tokenizers before the entity annotators).
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, person_annot, org_annot)
)

Sys.time() - t1 # elapsed time for building the annotators and annotating
## Time difference of 12.7605 secs
# Extract Locations
# Entity kind ("location", "person", "organization", ...) of every annotation
# in annot.l1, as a plain character vector aligned with the annotations.
# Annotations whose feature list has no "kind" entry (presumably the sentence
# and word tokens) get "" so that comparisons such as `k == "location"` are
# always TRUE/FALSE and never NA. vapply() guarantees the character type;
# sapply() returned a list containing NULLs here, and the comparison only
# worked through implicit list-to-character coercion.
k <- vapply(
  annot.l1$features,
  function(feature) {
    kind <- feature[["kind"]]
    if (is.null(kind)) "" else as.character(kind)
  },
  character(1)
)
# Substrings of the document covered by the location annotations.
walmart_locations <- text[annot.l1[k == "location"]]
# Walmart Locations from Wikipedia
t1 <- Sys.time() # restart the timer for geocoding + plotting

# Distinct location strings found in the article.
all_places <- unique(walmart_locations)

# Look up longitude/latitude for each place.
# (Subset with all_places[1:10] for a quick trial run.)
all_places_geocoded <- geocode(all_places)

# Show the geocoded coordinates.
all_places_geocoded
## lon lat
## 1 -91.8318334 35.201050
## 2 -95.7128910 37.090240
## 3 -106.3467710 56.130366
## 4 -91.8803310 39.165044
## 5 -3.4359730 55.378051
## 6 138.2529240 36.204824
## 7 78.9628800 20.593684
## 8 -63.6166720 -38.416097
## 9 -51.9252800 -14.235004
## 10 -74.4056612 40.058324
## 11 -1.0872979 53.959965
## 12 -77.1945247 41.203322
## 13 -95.7128910 37.090240
## 14 -95.7128910 37.090240
## 15 104.1953970 35.861660
## 16 10.4515260 51.165691
## 17 127.7669220 35.907757
## 18 -78.8773898 42.876089
## 19 -94.2088172 36.372854
## 20 -91.8318334 37.964253
## 21 -97.0928770 35.007752
## 22 -98.4842465 39.011902
## 23 -91.9623327 30.984298
## 24 -86.5804473 35.517491
## 25 -84.2700179 37.839333
## 26 -99.9018131 31.968599
## 27 -79.8284258 32.832322
## 28 -77.0368707 38.907192
## 29 -119.4179324 36.778261
## 30 -102.5527840 23.634501
## 31 -72.5778415 44.558803
## 32 15.2551187 54.525961
## 33 -96.6988856 33.019843
## 34 -96.6388833 32.912624
## 35 -77.0368707 38.907192
## 36 -66.5901490 18.220833
## 37 -73.7562317 42.652579
## 38 -74.0059413 40.712784
## 39 -103.2310149 44.080543
## 40 -99.9018131 43.969515
## 41 -114.7420408 44.068202
## 42 -110.3625658 46.879682
## 43 -99.9018131 41.492537
## 44 -101.0020119 47.551493
## 45 -81.1637245 33.836081
## 46 -111.0937311 39.320980
## 47 -80.4549026 38.597626
## 48 -107.2902839 43.075968
## 49 -0.0116755 42.001533
## 50 -149.4936733 64.200841
## 51 -75.5276699 38.910832
## 52 -155.5827818 19.896766
## 53 -93.0977020 41.878003
## 54 -69.4454689 45.253783
## 55 -76.6412712 39.045755
## 56 -71.3824374 42.407211
## 57 -85.6023643 44.314844
## 58 -94.6858998 46.729553
## 59 -71.5723953 43.193852
## 60 -82.9071230 40.417287
## 61 -71.4774291 41.580095
## 62 -98.4104996 29.840551
## 63 -95.3698028 29.760427
## 64 -112.0740373 33.448377
## 65 -111.0937311 34.048928
## 66 -79.0192997 35.759573
## 67 -87.6297982 41.878114
## 68 -82.4571776 27.950575
## 69 -81.5157535 27.664827
## 70 -90.5067726 38.596040
## 71 -112.1072224 46.320207
## 72 -104.9902510 39.739236
## 73 -0.0100245 35.004115
## 74 -86.2519898 41.676355
## 75 -71.5429690 -35.675147
## 76 -83.7534280 9.748917
## 77 -88.8965300 13.794185
## 78 -90.2307590 15.783471
## 79 -86.2419050 15.199999
## 80 -85.2072290 12.865416
## 81 -49.2671370 -25.428954
## 82 -88.8965300 13.794185
## 83 -88.8965300 13.794185
## 84 -109.2730555 25.993056
## 85 120.3209373 16.615891
## 86 42.5509603 29.298528
## 87 34.5085230 -8.783195
## 88 22.9375060 -30.559482
## 89 -1.0231940 7.946527
## 90 37.9061930 -0.023559
## 91 28.2336080 -29.609988
## 92 34.3015250 -13.254308
## 93 35.5295620 -18.665695
## 94 18.4904100 -22.957640
## 95 8.6752770 9.081999
## 96 32.2902750 1.373333
## 97 -84.5613355 39.399501
## 98 -0.1277583 51.507351
## 99 -117.6508876 34.063344
## 100 -97.0035982 28.805267
## 101 -127.6476206 53.726668
## 102 28.0473051 -26.204103
## 103 -71.1097335 42.373616
## 104 100.6196553 34.047863
## 105 NA NA
## 106 -98.8721186 19.685267
## 107 NA NA
## 108 -48.6478069 -26.993517
## 109 -120.5542012 43.804133
## 110 -122.4194155 37.774929
## 111 -122.6764816 45.523062
## 112 121.4737010 31.230416
## 113 -122.0575434 37.387474
## 114 -0.1277583 51.507351
## 115 -122.4433597 41.956063
## 116 -91.6514459 41.924790
# Draw a high-resolution world map as the base layer.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  # xlim = c(-20, 59), ylim = c(35, 71), # can select 'boxes' of lat-lon to focus on
  asp = 1
)

# Overlay every geocoded place as a filled red dot.
with(
  all_places_geocoded,
  points(lon, lat, col = "red", cex = 1.2, pch = 19)
)

Sys.time() - t1 # elapsed time since the geocoding timer was started
## Time difference of 1.903601 mins
# Extract the persons
# Substrings of the document tagged as persons and as organizations.
walmart_person <- text[annot.l1[k == "person"]]
walmart_organization <- text[annot.l1[k == "organization"]]

# Keep each person name once, dropping any string that was also tagged as an
# organization or as a location.
walmart_person <- setdiff(
  unique(walmart_person),
  c(walmart_organization, walmart_locations)
)

# Render the remaining names as a table.
knitr::kable(walmart_person)
| Sam Walton |
| Ben Franklin |
| Walton |
| David Glass |
| Lee Scott |
| Kenneth Stone |
| Steven Horwitz |
| Greg Foran |
| Wendy |
| Auntie Annes |
| Burger King |
| Tim Horton |
| Tesoro Corporation |
| Bud |
| Mi Changomas |
| Todo Dia |
| Maxxi Atacado |
| Walmart Posto |
| Walmart Supercenter |
| Despensa Familiar |
| Don Juan |
| Maxi Pali |
| Mas X Menos |
| Superbodega Acuenta |
| Bodega Aurrera |
| Asda Living |
| Builders Warehouse |
| Dion Wired |
| Game Foodco |
| Neighborhood Market |
| Scott Price |
| Adam Hartung |
| Clubs |
| Wholesale Club“ |
| Doug McMillon |
| Gregory B. Penner |
| Jim Breyer |
| Michele Burns |
| James Cash |
| Roger Corbett |
| Douglas Daft |
| Marissa Mayer |
| Allen Questrom |
| Jim Walton |
| Christopher J. Williams |
| Linda S. Wolf |
| Hillary Clinton |
| Tom Coughlin |
| Clinton |
| Don Soderquist |
| George W. Bush |
| John Kerry |
| Alex Grigoryan |
| Betty Dukes |
| African-Americans |
| Wayne Pacelle |