new_tb_cases data to graph DBOur task for Project 4 was to migrate data from a relational database to a NoSQL database of our choosing. I have used MySQL and Neo4j, interfacing with both using the RMySQL and RNeo4j packages. My data are sourced from the World Health Organization’s Global Tuberculosis Database. NOTE: running the R code below will download the WHO’s Global TB datasets (~ 3.8 MB) into a folder named “tb” in your current working directory.
setwd("~/DATA 607 Data Acquisition and Management/Project4")
library(stringr)
library(dplyr)
library(tidyr)
library(RMySQL)
library(RNeo4j)
library(DT)
library(visNetwork)
if(!dir.exists("tb")) {
message("creating tb/ subdirectory")
dir.create("tb")
} else {
message("tb/ subdirectory already exists")
}
urls <- c("https://extranet.who.int/tme/generateCSV.asp?ds=dictionary",
"https://extranet.who.int/tme/generateCSV.asp?ds=estimates",
"https://extranet.who.int/tme/generateCSV.asp?ds=mdr_rr_estimates",
"https://extranet.who.int/tme/generateCSV.asp?ds=ltbi_estimates",
"https://extranet.who.int/tme/generateCSV.asp?ds=notifications",
"https://extranet.who.int/tme/generateCSV.asp?ds=labs",
"https://extranet.who.int/tme/generateCSV.asp?ds=tbhivnonroutinesurv",
"https://extranet.who.int/tme/generateCSV.asp?ds=outcomes",
"https://extranet.who.int/tme/generateCSV.asp?ds=policies",
"https://extranet.who.int/tme/generateCSV.asp?ds=community")
download_tb_data <- function(url) {
if(!file.exists(str_c("tb/", str_extract(url, "(?<=\\=).+$"), ".csv"))) {
message(str_c("downloading ", str_extract(url, "(?<=\\=).+$"), ".csv", " to tb/ subdirectory"))
download.file(url, file.path("tb", str_c(str_extract(url, "(?<=\\=).+$"), ".csv")),
method = "curl")
}
else {
message(str_c("tb/", str_extract(url, "(?<=\\=).+$"), ".csv", " already exists"))
}
}
for(i in 1:length(urls)) {
download_tb_data(urls[i])
}
con <- dbConnect(MySQL(), host = "localhost", username = username, password = password)
The CSV files containing WHO’s tuberculosis data were read into a list of data frames and then written into the MySQL database in order to preserve missing values as NULLs in MySQL. Writing the database tables directly from the CSV files using the dbWriteTable() function rendered missing values as zeroes.
dbSendQuery(con, "DROP SCHEMA IF EXISTS global_tb;")
dbSendQuery(con, "CREATE SCHEMA global_tb;")
dbSendQuery(con, "USE global_tb;")
dbSendQuery(con, "
DROP TABLE IF EXISTS
community,
dictionary,
estimates,
labs,
ltbi_estimates,
mdr_rr_estimates,
notifications,
outcomes,
policies,
tbhivnonroutinesurv;
")
files <- list.files(path = "tb", pattern = "\\.csv", full.names = TRUE)
tb_data <- lapply(files, function(x) read.csv(file = x, header = TRUE,
stringsAsFactors = FALSE,
encoding = "UTF-8"))
names(tb_data) <- str_extract(files, "(?<=/).+(?=\\.)")
write_db_tables <- function(list) {
for(i in 1:length(list)) {
dbWriteTable(con, names(tb_data)[i], tb_data[[i]],
header = TRUE, row.names = FALSE, overwrite = TRUE)
}
}
write_db_tables(tb_data)
reduced_tb <- tb_data$notifications
red_leq12 <- reduced_tb %>% filter(year <= 2012) %>%
group_by(country, iso3, iso_numeric, g_whoregion, year) %>%
summarize(m_child = new_sp_m014,
m_adult = sum(new_sp_m1524, new_sp_m2534, new_sp_m3544, new_sp_m4554, new_sp_m5564),
m_senior = new_sp_m65,
f_child = new_sp_f014,
f_adult = sum(new_sp_f1524, new_sp_f2534, new_sp_f3544, new_sp_f4554, new_sp_f5564),
f_senior = new_sp_f65)
red_geq13 <- reduced_tb %>% filter(year > 2012) %>%
group_by(country, iso3, iso_numeric, g_whoregion, year) %>%
summarize(m_child = newrel_m014,
m_adult = sum(newrel_m1524, newrel_m2534, newrel_m3544, newrel_m4554, newrel_m5564),
m_senior = newrel_m65,
f_child = newrel_f014,
f_adult = sum(newrel_f1524, newrel_f2534, newrel_f3544, newrel_f4554, newrel_f5564),
f_senior = newrel_f65)
reduced_tb <- bind_rows(red_leq12, red_geq13) %>% arrange(country, year)
reduced_tb <- reduced_tb %>% gather(category, new_cases, m_child:f_senior)
reduced_tb <- reduced_tb %>%
mutate(gender = ifelse(str_sub(category, 1, 1) == "m", "male", "female")) %>%
mutate(category = str_sub(category, 3, -1)) %>%
rename(age_group = category) %>%
select(country:year, gender, age_group, new_cases) %>% arrange(country, year, gender)
reduced_tb <- ungroup(reduced_tb)
dbSendQuery(con, "DROP TABLE IF EXISTS new_cases;")
dbWriteTable(con, "new_cases", reduced_tb,
header = TRUE, row.names = FALSE, overwrite = TRUE)
knitr::kable(dbGetQuery(con, "
SELECT * FROM new_cases
WHERE iso3 = 'USA'
AND year = 2015;
"))
| country | iso3 | iso_numeric | g_whoregion | year | gender | age_group | new_cases |
|---|---|---|---|---|---|---|---|
| United States of America | USA | 840 | AMR | 2015 | female | child | 222 |
| United States of America | USA | 840 | AMR | 2015 | female | adult | 2589 |
| United States of America | USA | 840 | AMR | 2015 | female | senior | 850 |
| United States of America | USA | 840 | AMR | 2015 | male | child | 215 |
| United States of America | USA | 840 | AMR | 2015 | male | adult | 3870 |
| United States of America | USA | 840 | AMR | 2015 | male | senior | 1307 |
# write CSV from MySQL
#dbSendQuery(con, "
#SELECT *
#INTO OUTFILE '/Users/Shared/Data/tb_new_cases.csv'
#FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'
#ESCAPED BY '\\\\'
#LINES TERMINATED BY '\\n'
#FROM (
# SELECT 'country', 'iso3', 'iso_numeric', 'g_whoregion',
# 'year', 'gender', 'age_group', 'new_cases'
# UNION ALL
# SELECT * FROM new_cases
#) csv_headers;
#")
# write CSV from R
write.csv(dbGetQuery(con, "SELECT * FROM new_cases"), "new_tb_cases.csv", row.names = FALSE)
dbSendQuery(con, "DROP SCHEMA global_tb;")
## <MySQLResult:0,0,40>
dbDisconnect(con)
## [1] TRUE
new_tb_cases <- read.csv("new_tb_cases.csv", header = TRUE, stringsAsFactors = FALSE)
new_tb_cases <- new_tb_cases[!is.na(new_tb_cases$new_cases), ]
graph <- startGraph("http://localhost:7474/db/data/", username = neo4j_user, password = neo4j_pw)
clear(graph, input = FALSE)
addConstraint(graph, "Region", "who_region")
addConstraint(graph, "Country", "country")
addConstraint(graph, "Year", "year")
create_query <- "
CREATE (new_tb_cases:Notification {count:TOINT({new_cases}), gender:{gender}, age_group:{age_group}})
MERGE (who_region:Region {region:{region}})
MERGE (country:Country {country:{country}, iso3:{iso3}, iso_numeric:TOINT({iso_numeric})})
MERGE (year:Year {year:TOINT({year})})
MERGE (who_region)-[:CONTAINS]->(country)
MERGE (country)-[:REPORTED]->(new_tb_cases)
MERGE (new_tb_cases)-[:IN]->(year);
"
new_tb_cases data to graph DBtx = newTransaction(graph)
for (i in 1:nrow(new_tb_cases)) {
# upload in blocks of 1000
if(i %% 1000 == 0) {
# commit current transaction
commit(tx)
message(paste("batch", i / 1000, "committed"))
# open new transaction
tx = newTransaction(graph)
}
# append paramaterized Cypher query to transaction
appendCypher(tx,
create_query,
gender = new_tb_cases$gender[i],
age_group = new_tb_cases$age_group[i],
new_cases = new_tb_cases$new_cases[i],
region = new_tb_cases$g_whoregion[i],
country = new_tb_cases$country[i],
iso3 = new_tb_cases$iso3[i],
iso_numeric = new_tb_cases$iso_numeric[i],
year = new_tb_cases$year[i]
)
}
# commit last transaction
commit(tx)
message("last batch committed")
mod_query <- "
MATCH (year:Year)
WITH year
ORDER BY year.year
WITH collect(year) AS years
FOREACH(i in RANGE(0, length(years) - 2) |
FOREACH(year1 in [years[i]] |
FOREACH(year2 in [years[i + 1]] |
CREATE UNIQUE (year1)-[:NEXT]->(year2))))
"
cypher(graph, mod_query)
After writing a query to retrieve distinct node labels and their respective relationship types, I discovered that calling summary() on a graph DB returns a very similar result set.
node_rel_query <- "
MATCH (n)-[r]->(m)
WITH LABELS(n) AS label1, TYPE(r) AS rel_types, LABELS(m) AS label2
UNWIND label1 AS node_label1
UNWIND label2 AS node_label2
RETURN DISTINCT node_label1, rel_types AS relationship_type, node_label2
ORDER BY node_label1;
"
knitr::kable(cypher(graph, node_rel_query))
| node_label1 | relationship_type | node_label2 |
|---|---|---|
| Country | REPORTED | Notification |
| Notification | IN | Year |
| Region | CONTAINS | Country |
| Year | NEXT | Year |
knitr::kable(summary(graph))
## This To That
## 1 Country REPORTED Notification
## 2 Region CONTAINS Country
## 3 Notification IN Year
## 4 Year NEXT Year
| This | To | That |
|---|---|---|
| Country | REPORTED | Notification |
| Region | CONTAINS | Country |
| Notification | IN | Year |
| Year | NEXT | Year |
usa_query <- "
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE country.iso3 = 'USA'
RETURN who_region.region as WHO_Region,
country.country as Country,
year.year as Year,
new_tb_cases.gender as Gender,
new_tb_cases.age_group as Age_Group,
new_tb_cases.count as New_TB_Cases
ORDER BY Year, Gender, Age_Group;
"
usa_tb <- cypher(graph, usa_query)
datatable(usa_tb, rownames = FALSE)
# downloading the image while knitting the R markdown produced an empty file
#download.file("https://github.com/juddanderman/cuny-data-607/blob/master/Project4/USA_TB.png?raw=true", "USA_TB.png", method = "curl", mode = "wb")
Neo4j Browser: USA TB Notifications
# open Neo4j broswer in viewer pane
browse(graph, viewer = TRUE)
# run Cypher query below in Neo4j browser to view subset of graph shown in screen shot above
#MATCH n = (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year)
#WHERE country.iso3 = 'USA'
#RETURN n
#ORDER BY year.year, new_tb_cases.gender, new_tb_cases.age_group;
usa_nodes <- "
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE country.iso3 = 'USA'
RETURN who_region.region as WHO_Region,
country.iso3 as Country,
year.year as Year,
sum(new_tb_cases.count) as Cases
ORDER BY Year;
"
nodes <- cypher(graph, usa_nodes)
nodes <- nodes %>%
gather(group, id, WHO_Region:Cases) %>%
mutate(label = id) %>%
select(id, label, group) %>% distinct()
usa_edges <- "
MATCH (who_region)-[r:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE country.iso3 = 'USA'
RETURN DISTINCT who_region.region as from,
country.iso3 as to,
TYPE(r) as label,
year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[r:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE country.iso3 = 'USA'
RETURN DISTINCT country.iso3 as from,
sum(new_tb_cases.count) as to,
TYPE(r) as label,
year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[r:IN]->(year)
WHERE country.iso3 = 'USA'
RETURN DISTINCT sum(new_tb_cases.count) as from,
year.year as to,
TYPE(r) as label,
country.iso3 as extra
"
edges <- cypher(graph, usa_edges)
edges <- edges %>% select(from, to, label) %>% distinct()
visNetwork(nodes, edges)
regions <- cypher(graph, "MATCH (who_region) RETURN DISTINCT (who_region.region);")
regions <- regions[!is.na(regions)]
reg_nodes <- "
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE who_region.region = '%1$s'
RETURN who_region.region as WHO_Region,
country.iso3 as Country,
year.year as Year,
sum(new_tb_cases.count) as Cases
ORDER BY Year;
"
reg_edges <- "
MATCH (who_region)-[r:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE who_region.region = '%1$s'
RETURN DISTINCT who_region.region as from,
country.iso3 as to,
TYPE(r) as label,
year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[r:REPORTED]->(new_tb_cases)-[:IN]->(year)
WHERE who_region.region = '%1$s'
RETURN DISTINCT country.iso3 as from,
sum(new_tb_cases.count) as to,
TYPE(r) as label,
year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[r:IN]->(year)
WHERE who_region.region = '%1$s'
RETURN DISTINCT sum(new_tb_cases.count) as from,
year.year as to,
TYPE(r) as label,
country.iso3 as extra
"
viz_net <- function(reg) {
nodes_reg <- cypher(graph, sprintf(reg_nodes, reg))
nodes_reg <- nodes_reg %>%
gather(group, id, WHO_Region:Cases) %>%
mutate(label = id) %>%
select(id, label, group) %>% distinct()
edges_reg <- cypher(graph, sprintf(reg_edges, reg))
edges_reg <- edges_reg %>% select(from, to, label) %>% distinct()
visNetwork(nodes_reg, edges_reg)
}
# visualize first WHO region, looping over length of vector returned no network plots
cat("Region: ", regions[1], "\n")
## Region: EMR
viz_net(regions[1])
The relative advantages of the relational database over the Neo4j are faster transaction time and more flexibility with a greater range of data types. In particular, Neo4j lacks date and time data types, although there are workarounds. Neo4j offers a very effective and compelling way to create, modify, and examine relationships among data objects, and provides great flexibility and relatively easy ways to represent complex heirarchical, sequential, and multidirectional relationships. The downside here is that modeling such relationships can be conceptually difficult and time-consuming in its own right, though the data modeling process surely improves one’s understanding of a given dataset.
RNeo4J author Nicole White’s many tutorials were very useful resources for getting a handle on how to build and interface with my Neo4J database in R. Her guides to uploading data using Neo4j’s transactional endpoint and to graph visualization using the visNetwork package were especially valuable for this project. The “Modeling Time Series Data with Neo4j” post on graphgrid.com was also a helpful reference as I worked through the data modeling process since I wanted to preserve and make use of the time-series relationships contained in the new case notifications subset of the WHO’s global tuberculosis data that I migrated to Neo4j.