I decided to use data from Libremap.net to fulfill this assignment. It uses data collected from community-run networks around the world. An agent collects data from the radio driver in the integrated circuit and reports it to this website as JSON. However, I know nothing about scraping, so I obtained this file by inspecting the page.
I used the jsonlite library to import the JSON. There are 45 rows in this document, which represent 45 different measuring devices in a single map box on the page.
require(jsonlite)
## Loading required package: jsonlite
# Parse the saved LibreMap response. flatten = TRUE turns nested objects into
# dotted columns (value.lat, value.attributes.submitter.name, ...). Only the
# "rows" element of the parsed document is kept.
dat <- fromJSON("libremap.json", flatten = TRUE)[["rows"]]
# Preview the first few rows to check the import.
head(dat)
## id bbox
## 1 d0282b3bb16a040a273941d7f5d646d6 -44.31, -22.29, -44.31, -22.29
## 2 c5e87212e309250939a9e9c3400d63b9 -44.31, -22.29, -44.31, -22.29
## 3 126bc5930c18d7700989652b9abc1814 -44.31, -22.29, -44.31, -22.29
## 4 8b2b6f0dc891dfa7b5fe165cd9b42213 -44.31, -22.29, -44.31, -22.29
## 5 c5e87212e309250939a9e9c3401192e2 -44.31, -22.29, -44.31, -22.29
## 6 460d3e496931ad2df4df6b7d74b1391f -44.31, -22.29, -44.31, -22.29
## geometry.type geometry.coordinates value._id
## 1 Point -44.31, -22.29 d0282b3bb16a040a273941d7f5d646d6
## 2 Point -44.31, -22.29 c5e87212e309250939a9e9c3400d63b9
## 3 Point -44.31, -22.29 126bc5930c18d7700989652b9abc1814
## 4 Point -44.31, -22.29 8b2b6f0dc891dfa7b5fe165cd9b42213
## 5 Point -44.31, -22.29 c5e87212e309250939a9e9c3401192e2
## 6 Point -44.31, -22.29 460d3e496931ad2df4df6b7d74b1391f
## value._rev value.lon value.type value.api_rev
## 1 6202-2bfd5fd4678d20963a89c6a681d93fe0 -44.31 router 1.0
## 2 3-1fe06eabad06fb00a23ff1423f83629c -44.31 router 1.0
## 3 2-18ac7219546f9f10996c9c2830aef3fc -44.31 router 1.0
## 4 1-65ced0a64c91dde6ee906701297e4184 -44.31 router 1.0
## 5 13-e6e35677162820ed38f25e99f68a450e -44.31 router 1.0
## 6 148-e1e76048f3f6b898eedf308e023a6cb8 -44.31 router 1.0
## value.community value.ctime value.lat value.hostname
## 1 fumaçaonline 2015-08-01T04:07:28.896Z -22.29 igrejinha
## 2 quilombocambury 2017-10-24T20:17:18.700Z -22.29 gateway_escolinha
## 3 quilombocamburi 2017-10-24T14:01:23.994Z -22.29 LiMe-fb6470
## 4 quilombocambury 2017-10-25T21:58:51.566Z -22.29 LiMe-fe56e6
## 5 quilombocambury 2017-10-24T20:21:16.210Z -22.29 gateway_escolinha
## 6 fumaçaonline 2017-04-16T05:03:03.591Z -22.29 Tatu
## value.elev value.mtime
## 1 700 2017-10-24T11:02:25.436Z
## 2 700 2017-10-24T20:29:38.847Z
## 3 700 2017-10-25T11:39:23.155Z
## 4 700 2017-10-25T21:58:51.566Z
## 5 700 2017-10-25T22:05:37.824Z
## 6 700 2017-11-09T19:45:37.871Z
## value.links
## 1 NULL
## 2 1A:A6:F7:05:36:92, 1A:A6:F7:05:36:92, 1E:A6:F7:05:36:92, 18:A6:F7:05:36:92, 18:A6:F7:05:36:92, 18:A6:F7:05:36:92, 18:A6:F7:05:36:92, wifi, wifi, wifi, wifi, wifi, wifi, wifi, 1, 0.25, 1, 0.25, 0.75, 0.25, 0.75, C4:42:02:2F:7C:58, C4:43:8F:B6:44:CA, DC:53:60:43:A7:C4, 84:16:F9:AE:38:94, 18:A6:F7:EB:39:88, 84:16:F9:93:1C:5A, 84:16:F9:7B:DC:62, 1A:A6:F7:05:36:92, 1A:A6:F7:05:36:92, 1E:A6:F7:05:36:92, 18:A6:F7:05:36:92, 18:A6:F7:05:36:92, 18:A6:F7:05:36:92, 18:A6:F7:05:36:92, C4:42:02:2F:7C:58, C4:43:8F:B6:44:CA, DC:53:60:43:A7:C4, 84:16:F9:AE:38:94, 18:A6:F7:EB:39:88, 84:16:F9:93:1C:5A, 84:16:F9:7B:DC:62, 11, 11, 11, 11, 11, 11, 11, -52, -85, -54, -84, -66, -82, -66, wlan0-ap, wlan0-ap, wlan0-apname, wlan0-adhoc, wlan0-adhoc, wlan0-adhoc, wlan0-adhoc
## 3 NULL
## 4 fe80::aa15:4dff:fefe:56e7, fe80::aa15:4dff:fefe:56e7, AA:15:4D:FE:56:E7, AA:15:4D:FE:56:E7, AA:15:4D:FE:56:E7, AE:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, bmx6, bmx6, wifi, wifi, wifi, wifi, wifi, wifi, 0.97, 0.95, 0.5, 1, 0.1, 0.75, 0.5, 0.5, fe80::8616:f9ff:fe7b:dc62, fe80::1aa6:f7ff:feeb:3988, 00:16:98:05:AC:58, 7C:4F:B5:4E:89:9A, C4:43:8F:B6:44:CA, 80:A5:89:9D:8E:1D, 84:16:F9:7B:DC:62, 18:A6:F7:EB:39:88, 97, 95, NA, NA, NA, NA, NA, NA, casadefarinha740n, casadefarinha, NA, NA, NA, NA, NA, NA, wlan0-adhoc_13, wlan0-adhoc_13, NA, NA, NA, NA, NA, NA, NA, NA, AA:15:4D:FE:56:E7, AA:15:4D:FE:56:E7, AA:15:4D:FE:56:E7, AE:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, NA, NA, 00:16:98:05:AC:58, 7C:4F:B5:4E:89:9A, C4:43:8F:B6:44:CA, 80:A5:89:9D:8E:1D, 84:16:F9:7B:DC:62, 18:A6:F7:EB:39:88, NA, NA, 11, 11, 11, 11, 11, 11, NA, NA, -78, -64, -91, -73, -75, -75, NA, NA, wlan0-ap, wlan0-ap, wlan0-ap, wlan0-apname, wlan0-adhoc, wlan0-adhoc
## 5 fe80::aa15:4dff:fefe:56e7, AA:15:4D:FE:56:E7, AA:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, bmx6, wifi, wifi, wifi, wifi, 1, 1, 1, 0.75, 0.5, fe80::1aa6:f7ff:feeb:3988, 7C:4F:B5:4E:89:9A, 80:A5:89:9D:8E:1D, 18:A6:F7:EB:39:88, 84:16:F9:7B:DC:62, 100, NA, NA, NA, NA, casadefarinha, NA, NA, NA, NA, wlan0-adhoc_13, NA, NA, NA, NA, NA, AA:15:4D:FE:56:E7, AA:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, NA, 7C:4F:B5:4E:89:9A, 80:A5:89:9D:8E:1D, 18:A6:F7:EB:39:88, 84:16:F9:7B:DC:62, NA, 11, 11, 11, 11, NA, -65, -63, -72, -74, NA, wlan0-ap, wlan0-ap, wlan0-adhoc, wlan0-adhoc
## 6 fe80::62e3:27ff:fe4a:7a8c, 62:E3:27:4A:7A:8C, 62:E3:27:4A:7A:8C, 60:E3:27:4A:7A:8C, bmx6, wifi, wifi, wifi, 0.18, 0.5, 0.5, 0.25, fe80::32b5:c2ff:feb8:67ac, 0C:41:3E:61:28:94, 48:49:C7:65:FC:20, 30:B5:C2:B8:67:AC, 18, NA, NA, NA, benedita, NA, NA, NA, wlan0-adhoc_13, NA, NA, NA, NA, 62:E3:27:4A:7A:8C, 62:E3:27:4A:7A:8C, 60:E3:27:4A:7A:8C, NA, 0C:41:3E:61:28:94, 48:49:C7:65:FC:20, 30:B5:C2:B8:67:AC, NA, 11, 11, 11, NA, -79, -77, -84, NA, wlan0-ap, wlan0-ap, wlan0-adhoc
## value.aliases
## 1 NULL
## 2 1A:A6:F7:05:36:92, 1E:A6:F7:05:36:92, 18:A6:F7:05:36:92, wifi, wifi, wifi
## 3 C2:61:18:FB:64:71, C6:61:18:FB:64:71, C0:61:18:FB:64:71, C2:61:18:FB:64:72, C6:61:18:FB:64:72, C0:61:18:FB:64:72, wifi, wifi, wifi, wifi, wifi, wifi
## 4 fe80::aa15:4dff:fefe:56e6, fe80::aa15:4dff:fefe:56e6, fe80::aa15:4dff:fefe:56e6, fe80::aa15:4dff:fefe:56e7, fe80::aa15:4dff:fefe:56e8, AA:15:4D:FE:56:E7, AE:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, AA:15:4D:FE:56:E8, AE:15:4D:FE:56:E8, A8:15:4D:FE:56:E8, bmx6, bmx6, bmx6, bmx6, bmx6, wifi, wifi, wifi, wifi, wifi, wifi
## 5 fe80::aa15:4dff:fefe:56e6, fe80::aa15:4dff:fefe:56e6, fe80::aa15:4dff:fefe:56e6, fe80::aa15:4dff:fefe:56e7, fe80::aa15:4dff:fefe:56e8, AA:15:4D:FE:56:E7, AE:15:4D:FE:56:E7, A8:15:4D:FE:56:E7, AA:15:4D:FE:56:E8, AE:15:4D:FE:56:E8, A8:15:4D:FE:56:E8, bmx6, bmx6, bmx6, bmx6, bmx6, wifi, wifi, wifi, wifi, wifi, wifi
## 6 fe80::62e3:27ff:fe4a:7a8b, fe80::62e3:27ff:fe4a:7a8b, fe80::62e3:27ff:fe4a:7a8b, fe80::62e3:27ff:fe4a:7a8c, fe80::62e3:27ff:fe4a:7a8d, 62:E3:27:4A:7A:8C, 66:E3:27:4A:7A:8C, 60:E3:27:4A:7A:8C, 62:E3:27:4A:7A:8D, 66:E3:27:4A:7A:8D, 60:E3:27:4A:7A:8D, bmx6, bmx6, bmx6, bmx6, bmx6, wifi, wifi, wifi, wifi, wifi, wifi
## value._conflicts value.attributes.submitter.plugins
## 1 NULL location, bmx6, wireless, system
## 2 NULL location, bmx6, wireless, system
## 3 NULL location, bmx6, wireless, system
## 4 NULL location, bmx6, wireless, system
## 5 NULL location, bmx6, wireless, system
## 6 NULL location, bmx6, wireless, system
## value.attributes.submitter.url
## 1 https://github.com/libremap/libremap-agent-openwrt
## 2 https://github.com/libremap/libremap-agent-openwrt
## 3 https://github.com/libremap/libremap-agent-openwrt
## 4 https://github.com/libremap/libremap-agent-openwrt
## 5 https://github.com/libremap/libremap-agent-openwrt
## 6 https://github.com/libremap/libremap-agent-openwrt
## value.attributes.submitter.name value.attributes.submitter.version
## 1 libremap-agent-openwrt 0.1.9
## 2 libremap-agent-openwrt 0.1.9
## 3 libremap-agent-openwrt 0.1.9
## 4 libremap-agent-openwrt 0.1.9
## 5 libremap-agent-openwrt 0.1.9
## 6 libremap-agent-openwrt 0.1.9
## value.attributes.system.memtotal value.attributes.system.name
## 1 NA <NA>
## 2 NA <NA>
## 3 NA <NA>
## 4 NA <NA>
## 5 NA <NA>
## 6 NA <NA>
## value.attributes.system.model
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
Some information was very easy to parse because it was already set up as a vector. Latitude, longitude, agent version and agent type were parsed as below.
# Per-device scalar fields: each is already a plain column of the flattened
# frame, so it is extracted directly by name.
lat <- dat[["value.lat"]]
lon <- dat[["value.lon"]]
# Note: agent.version holds the record's api_rev field.
agent.version <- dat[["value.api_rev"]]
type <- dat[["value.type"]]
However, in order for this data collection to scale and work over time, it will need to keep track of the bounding box that was measured. Since it was stored as a list, we have to flatten the data so that each direction has its own column.
# bbox is a list column holding one 4-element vector per device; split it
# into four lists, one per position.
# NOTE(review): in the rows printed above, positions 1 and 3 equal the
# longitude and positions 2 and 4 the latitude, so these look like
# west/south/east/north edges rather than nw/ne/sw/se corners. The names are
# kept because later code refers to them - confirm before renaming.
tmp <- dat[["bbox"]]
nw <- lapply(tmp, function(box) box[1])
ne <- lapply(tmp, function(box) box[2])
sw <- lapply(tmp, function(box) box[3])
se <- lapply(tmp, function(box) box[4])
Next, we collect data about the hardware and what data it has chosen to provide.
# Plugins reported by each device's agent: a list column with one character
# vector of plugin names per device.
agent.plugin <- dat$value.attributes.submitter.plugins

# Return one logical per device: TRUE when any of that device's plugin names
# contains `pattern` as a literal substring.
# Matching is done on the plugin names themselves. Calling grepl() directly on
# the list would instead search the deparsed 'c("location", ...)' text of each
# element. Devices with no plugin entry yield FALSE.
has.plugin <- function(plugins.by.device, pattern) {
  vapply(
    plugins.by.device,
    function(plugins) any(grepl(pattern, plugins, fixed = TRUE)),
    logical(1)
  )
}

agent.location <- has.plugin(agent.plugin, "location")
# "bmx" is a substring match on purpose: the reported plugin name is "bmx6".
agent.bmx <- has.plugin(agent.plugin, "bmx")
agent.wireless <- has.plugin(agent.plugin, "wireless")
agent.system <- has.plugin(agent.plugin, "system")

# Remaining per-device descriptive fields.
model <- dat$value.attributes.system.model
hostname <- dat$value.hostname
community <- dat$value.community
Finally, we bind all of these vectors together column-wise into one big matrix and look at it to make sure it’s going OK.
# `time` was never assigned, so the bare name used to resolve to the function
# stats::time and the first column printed as "?". Define it from the record
# timestamp before binding.
# NOTE(review): value.mtime is assumed to be the intended timestamp; use
# dat$value.ctime instead if the creation time was meant. The printed head()
# output recorded below predates this fix.
time <- dat$value.mtime

# Bind the per-device vectors column-wise. Because some inputs are lists
# (agent.plugin and the bbox pieces), the result is a list-mode matrix with
# one row per device.
big.one <- cbind(time, community, lat, lon, agent.version, type, nw, ne, sw, se, agent.plugin, agent.location, agent.bmx, agent.wireless, agent.system, model, hostname)
dim(big.one)
## [1] 45 17
head(big.one)
## time community lat lon agent.version type nw
## [1,] ? "fumaçaonline" -22.29 -44.31 "1.0" "router" -44.31
## [2,] ? "quilombocambury" -22.29 -44.31 "1.0" "router" -44.31
## [3,] ? "quilombocamburi" -22.29 -44.31 "1.0" "router" -44.31
## [4,] ? "quilombocambury" -22.29 -44.31 "1.0" "router" -44.31
## [5,] ? "quilombocambury" -22.29 -44.31 "1.0" "router" -44.31
## [6,] ? "fumaçaonline" -22.29 -44.31 "1.0" "router" -44.31
## ne sw se agent.plugin agent.location agent.bmx
## [1,] -22.29 -44.31 -22.29 Character,4 TRUE TRUE
## [2,] -22.29 -44.31 -22.29 Character,4 TRUE TRUE
## [3,] -22.29 -44.31 -22.29 Character,4 TRUE TRUE
## [4,] -22.29 -44.31 -22.29 Character,4 TRUE TRUE
## [5,] -22.29 -44.31 -22.29 Character,4 TRUE TRUE
## [6,] -22.29 -44.31 -22.29 Character,4 TRUE TRUE
## agent.wireless agent.system model hostname
## [1,] TRUE TRUE "NA" "igrejinha"
## [2,] TRUE TRUE "NA" "gateway_escolinha"
## [3,] TRUE TRUE "NA" "LiMe-fb6470"
## [4,] TRUE TRUE "NA" "LiMe-fe56e6"
## [5,] TRUE TRUE "NA" "gateway_escolinha"
## [6,] TRUE TRUE "NA" "Tatu"
The next section requires iterating through the nested scan data that exists for each row of the above. Because each scan datum is a list of a list, we have to repeat the above and fill out our table. Then, we can save it all as a csv.
# Expand every device into one CSV row per observed link and append those rows
# to dat.csv.
#
# The per-device vectors built earlier (community, lat, lon, ...) are only
# read here. The previous version overwrote them inside the loop with the
# repeated values, so every iteration after the first indexed into the wrong
# data.
links <- dat$value.links

# Extract one column of a device's link table. If the device did not report
# that column, return NAs so every CSV row keeps the same number of fields.
link.column <- function(link.df, name, size) {
  values <- link.df[[name]]
  if (is.null(values)) rep(NA, size) else values
}

# Iterate over all devices instead of a hard-coded 2:45 range.
for (index in seq_along(links)) {
  link.df <- links[[index]]
  size <- NROW(link.df)
  # Devices without any link data (NULL or empty) contribute no rows.
  if (size == 0) {
    next
  }

  # Per-link measurements for this device.
  signal <- link.column(link.df, "attributes.signal", size)
  mac.local <- link.column(link.df, "attributes.local_mac", size)
  mac.station <- link.column(link.df, "attributes.station_mac", size)
  channel <- link.column(link.df, "attributes.channel", size)

  # Repeat the device-level values once per link and bind them to the
  # per-link measurements. Column order matches the original CSV layout.
  data <- cbind(
    rep(community[index], size),
    rep(lat[index], size),
    rep(lon[index], size),
    rep(agent.version[index], size),
    rep(agent.plugin[index], size),
    rep(agent.bmx[index], size),
    rep(agent.wireless[index], size),
    rep(agent.system[index], size),
    rep(model[index], size),
    rep(hostname[index], size),
    signal,
    mac.local,
    mac.station,
    channel
  )

  # Append without a header. Re-running the script adds the rows again, so
  # delete dat.csv first when regenerating.
  write.table(data, file = "dat.csv", col.names = FALSE, append = TRUE)
}