Data 607 Project 4

NoSQL Migration

Our task for Project 4 was to migrate data from a relational database to a NoSQL database of our choosing. I have used MySQL and Neo4j, interfacing with both using the RMySQL and RNeo4j packages. My data are sourced from the World Health Organization’s Global Tuberculosis Database. NOTE: running the R code below will download the WHO’s Global TB datasets (~ 3.8 MB) into a folder named “tb” in your current working directory.

Setup

Load required libraries

setwd("~/DATA 607 Data Acquisition and Management/Project4")
library(stringr)
library(dplyr)
library(tidyr)
library(RMySQL)
library(RNeo4j)
library(DT)
library(visNetwork)

Create “tb” subdirectory in current working directory

if(!dir.exists("tb")) {
  message("creating tb/ subdirectory")
  dir.create("tb")
} else {
  message("tb/ subdirectory already exists")
}

Download WHO’s global tuberculosis data to “tb” subdirectory

urls <- c("https://extranet.who.int/tme/generateCSV.asp?ds=dictionary",
         "https://extranet.who.int/tme/generateCSV.asp?ds=estimates",
         "https://extranet.who.int/tme/generateCSV.asp?ds=mdr_rr_estimates",
         "https://extranet.who.int/tme/generateCSV.asp?ds=ltbi_estimates",
         "https://extranet.who.int/tme/generateCSV.asp?ds=notifications",
         "https://extranet.who.int/tme/generateCSV.asp?ds=labs",
         "https://extranet.who.int/tme/generateCSV.asp?ds=tbhivnonroutinesurv",
         "https://extranet.who.int/tme/generateCSV.asp?ds=outcomes",
         "https://extranet.who.int/tme/generateCSV.asp?ds=policies",
         "https://extranet.who.int/tme/generateCSV.asp?ds=community")

download_tb_data <- function(url) {
  if(!file.exists(str_c("tb/", str_extract(url, "(?<=\\=).+$"), ".csv"))) {
    message(str_c("downloading ", str_extract(url, "(?<=\\=).+$"), ".csv", " to tb/ subdirectory"))
    download.file(url, file.path("tb", str_c(str_extract(url, "(?<=\\=).+$"), ".csv")), 
                method = "curl")
  }
  else {
    message(str_c("tb/", str_extract(url, "(?<=\\=).+$"), ".csv", " already exists"))
  }
}

for(i in 1:length(urls)) {
  download_tb_data(urls[i])
}

MySQL

Connect to MySQL

con <- dbConnect(MySQL(), host = "localhost", username = username, password = password)

Create and populate WHO tuberculosis (“global_tb”) database

The CSV files containing WHO’s tuberculosis data were read into a list of data frames and then written into the MySQL database in order to preserve missing values as NULLs in MySQL. Writing the database tables directly from the CSV files using the dbWriteTable() function rendered missing values as zeroes.

dbSendQuery(con, "DROP SCHEMA IF EXISTS global_tb;")
dbSendQuery(con, "CREATE SCHEMA global_tb;")
dbSendQuery(con, "USE global_tb;")

dbSendQuery(con, "
DROP TABLE IF EXISTS 
  community,
  dictionary,
  estimates,
  labs,
  ltbi_estimates,
  mdr_rr_estimates,
  notifications,
  outcomes,
  policies,
  tbhivnonroutinesurv;
")

files <- list.files(path = "tb", pattern = "\\.csv", full.names = TRUE)

tb_data <- lapply(files, function(x) read.csv(file = x, header = TRUE, 
                                              stringsAsFactors = FALSE,
                                              encoding = "UTF-8"))

names(tb_data) <- str_extract(files, "(?<=/).+(?=\\.)")

write_db_tables <- function(list) {
  for(i in 1:length(list)) {
    dbWriteTable(con, names(tb_data)[i], tb_data[[i]], 
                 header = TRUE, row.names = FALSE, overwrite = TRUE)
  }
}

write_db_tables(tb_data)

Reduce and tidy tb notifications data frame and write new DB table

reduced_tb <- tb_data$notifications

red_leq12 <- reduced_tb %>% filter(year <= 2012) %>% 
  group_by(country, iso3, iso_numeric, g_whoregion, year) %>%
  summarize(m_child = new_sp_m014,
         m_adult = sum(new_sp_m1524, new_sp_m2534, new_sp_m3544, new_sp_m4554, new_sp_m5564),
         m_senior = new_sp_m65,
         f_child = new_sp_f014, 
         f_adult = sum(new_sp_f1524, new_sp_f2534, new_sp_f3544, new_sp_f4554, new_sp_f5564),
         f_senior = new_sp_f65)
  
red_geq13 <- reduced_tb %>% filter(year > 2012) %>% 
  group_by(country, iso3, iso_numeric, g_whoregion, year) %>%
  summarize(m_child = newrel_m014,
         m_adult = sum(newrel_m1524, newrel_m2534, newrel_m3544, newrel_m4554, newrel_m5564),
         m_senior = newrel_m65,
         f_child = newrel_f014, 
         f_adult = sum(newrel_f1524, newrel_f2534, newrel_f3544, newrel_f4554, newrel_f5564),
         f_senior = newrel_f65)
                        
reduced_tb <- bind_rows(red_leq12, red_geq13) %>% arrange(country, year)

reduced_tb <- reduced_tb %>% gather(category, new_cases, m_child:f_senior)

reduced_tb <- reduced_tb %>% 
  mutate(gender = ifelse(str_sub(category, 1, 1) == "m", "male", "female")) %>%
  mutate(category = str_sub(category, 3, -1)) %>% 
  rename(age_group = category) %>%
  select(country:year, gender, age_group, new_cases) %>% arrange(country, year, gender)

reduced_tb <- ungroup(reduced_tb)

dbSendQuery(con, "DROP TABLE IF EXISTS new_cases;")

dbWriteTable(con, "new_cases", reduced_tb, 
                 header = TRUE, row.names = FALSE, overwrite = TRUE)

Test query from “new_cases” table

knitr::kable(dbGetQuery(con, "
SELECT * FROM new_cases
WHERE iso3 = 'USA'
AND year = 2015;
"))

country	iso3	iso_numeric	g_whoregion	year	gender	age_group	new_cases
United States of America	USA	840	AMR	2015	female	child	222
United States of America	USA	840	AMR	2015	female	adult	2589
United States of America	USA	840	AMR	2015	female	senior	850
United States of America	USA	840	AMR	2015	male	child	215
United States of America	USA	840	AMR	2015	male	adult	3870
United States of America	USA	840	AMR	2015	male	senior	1307

Write “new_cases” table as CSV file in current working directory

# write CSV from MySQL
#dbSendQuery(con, "
#SELECT * 
#INTO OUTFILE '/Users/Shared/Data/tb_new_cases.csv'
#FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'
#ESCAPED BY '\\\\'
#LINES TERMINATED BY '\\n'
#FROM (
#    SELECT 'country', 'iso3', 'iso_numeric', 'g_whoregion', 
#      'year', 'gender', 'age_group', 'new_cases'
#    UNION ALL
#    SELECT * FROM new_cases
#) csv_headers;
#")

# write CSV from R
write.csv(dbGetQuery(con, "SELECT * FROM new_cases"), "new_tb_cases.csv", row.names = FALSE)

Drop “global_tb” DB and disconnect from MySQL

dbSendQuery(con, "DROP SCHEMA global_tb;")

## <MySQLResult:0,0,40>

dbDisconnect(con)

## [1] TRUE

Neo4j

Read-in “new_tb_cases.csv” and remove rows with missing data from data frame

new_tb_cases <- read.csv("new_tb_cases.csv", header = TRUE, stringsAsFactors = FALSE)
new_tb_cases <- new_tb_cases[!is.na(new_tb_cases$new_cases), ]

Connect to Neo4J database

graph <- startGraph("http://localhost:7474/db/data/", username = neo4j_user, password = neo4j_pw)
clear(graph, input = FALSE)

Add uniqueness constraints and define import query

addConstraint(graph, "Region", "who_region")
addConstraint(graph, "Country", "country")
addConstraint(graph, "Year", "year")

create_query <- "
CREATE (new_tb_cases:Notification {count:TOINT({new_cases}), gender:{gender}, age_group:{age_group}})

MERGE (who_region:Region {region:{region}})
MERGE (country:Country {country:{country}, iso3:{iso3}, iso_numeric:TOINT({iso_numeric})})
MERGE (year:Year {year:TOINT({year})})

MERGE (who_region)-[:CONTAINS]->(country)
MERGE (country)-[:REPORTED]->(new_tb_cases)
MERGE (new_tb_cases)-[:IN]->(year);
"

Use the transactional endpoint to upload `new_tb_cases` data to graph DB

tx = newTransaction(graph)

for (i in 1:nrow(new_tb_cases)) {
  # upload in blocks of 1000
  if(i %% 1000 == 0) {
    # commit current transaction
    commit(tx)
    message(paste("batch", i / 1000, "committed"))
    # open new transaction
    tx = newTransaction(graph)
  }
  
  # append paramaterized Cypher query to transaction
  appendCypher(tx,
               create_query,
               gender = new_tb_cases$gender[i],
               age_group = new_tb_cases$age_group[i],
               new_cases = new_tb_cases$new_cases[i],
               region = new_tb_cases$g_whoregion[i],
               country = new_tb_cases$country[i],
               iso3 = new_tb_cases$iso3[i],
               iso_numeric = new_tb_cases$iso_numeric[i],
               year = new_tb_cases$year[i]
               )
}

# commit last transaction
commit(tx)
message("last batch committed")

Add sequential relationships to “year” nodes in graph

mod_query <- "
MATCH (year:Year)
WITH year
ORDER BY year.year
WITH collect(year) AS years
FOREACH(i in RANGE(0, length(years) - 2) |
  FOREACH(year1 in [years[i]] |
    FOREACH(year2 in [years[i + 1]] |
      CREATE UNIQUE (year1)-[:NEXT]->(year2))))
"

cypher(graph, mod_query)

Query unique node labels and relationship types

After writing a query to retrieve distinct node labels and their respective relationship types, I discovered that calling summary() on a graph DB returns a very similar result set.

node_rel_query <- "
MATCH (n)-[r]->(m)
WITH LABELS(n) AS label1, TYPE(r) AS rel_types, LABELS(m) AS label2
UNWIND label1 AS node_label1
UNWIND label2 AS node_label2
RETURN DISTINCT node_label1, rel_types AS relationship_type, node_label2
ORDER BY node_label1;
"
knitr::kable(cypher(graph, node_rel_query))

node_label1	relationship_type	node_label2
Country	REPORTED	Notification
Notification	IN	Year
Region	CONTAINS	Country
Year	NEXT	Year

knitr::kable(summary(graph))

##           This       To         That
## 1      Country REPORTED Notification
## 2       Region CONTAINS      Country
## 3 Notification       IN         Year
## 4         Year     NEXT         Year

This	To	That
Country	REPORTED	Notification
Region	CONTAINS	Country
Notification	IN	Year
Year	NEXT	Year

Test query of new tb cases reported by USA

usa_query <- "
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE country.iso3 = 'USA' 
RETURN who_region.region as WHO_Region, 
  country.country as Country, 
  year.year as Year, 
  new_tb_cases.gender as Gender, 
  new_tb_cases.age_group as Age_Group, 
  new_tb_cases.count as New_TB_Cases
ORDER BY Year, Gender, Age_Group;
"
usa_tb <- cypher(graph, usa_query)
datatable(usa_tb, rownames = FALSE)

Visualize graph DB using visNetwork package

# downloading the image while knitting the R markdown produced an empty file
#download.file("https://github.com/juddanderman/cuny-data-607/blob/master/Project4/USA_TB.png?raw=true", "USA_TB.png", method = "curl", mode = "wb")

$Neo4j Browser: USA TB Notifications$

Neo4j Browser: USA TB Notifications

# open Neo4j broswer in viewer pane
browse(graph, viewer = TRUE)

# run Cypher query below in Neo4j browser to view subset of graph shown in screen shot above
#MATCH n = (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year) 
#WHERE country.iso3 = 'USA' 
#RETURN n
#ORDER BY year.year, new_tb_cases.gender, new_tb_cases.age_group;

USA new TB cases graph

usa_nodes <- "
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE country.iso3 = 'USA' 
RETURN who_region.region as WHO_Region, 
  country.iso3 as Country, 
  year.year as Year,  
  sum(new_tb_cases.count) as Cases
ORDER BY Year;
"
nodes <- cypher(graph, usa_nodes)
nodes <- nodes %>% 
  gather(group, id, WHO_Region:Cases) %>% 
  mutate(label = id) %>%
  select(id, label, group) %>% distinct()

usa_edges <- "
MATCH (who_region)-[r:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE country.iso3 = 'USA' 
RETURN DISTINCT who_region.region as from, 
  country.iso3 as to, 
  TYPE(r) as label,
  year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[r:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE country.iso3 = 'USA' 
RETURN DISTINCT country.iso3 as from, 
  sum(new_tb_cases.count) as to, 
  TYPE(r) as label, 
  year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[r:IN]->(year) 
WHERE country.iso3 = 'USA' 
RETURN DISTINCT sum(new_tb_cases.count) as from, 
  year.year as to, 
  TYPE(r) as label,
  country.iso3 as extra
"
edges <- cypher(graph, usa_edges)
edges <- edges %>% select(from, to, label) %>% distinct()

visNetwork(nodes, edges)

regions <- cypher(graph, "MATCH (who_region) RETURN DISTINCT (who_region.region);")
regions <- regions[!is.na(regions)]

reg_nodes <- "
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE who_region.region = '%1$s'
RETURN who_region.region as WHO_Region, 
  country.iso3 as Country, 
  year.year as Year,  
  sum(new_tb_cases.count) as Cases
ORDER BY Year;
"

reg_edges <- "
MATCH (who_region)-[r:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE who_region.region = '%1$s'
RETURN DISTINCT who_region.region as from, 
  country.iso3 as to, 
  TYPE(r) as label,
  year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[r:REPORTED]->(new_tb_cases)-[:IN]->(year) 
WHERE who_region.region = '%1$s'
RETURN DISTINCT country.iso3 as from, 
  sum(new_tb_cases.count) as to, 
  TYPE(r) as label, 
  year.year as extra
UNION
MATCH (who_region)-[:CONTAINS]->(country)-[:REPORTED]->(new_tb_cases)-[r:IN]->(year) 
WHERE who_region.region = '%1$s'
RETURN DISTINCT sum(new_tb_cases.count) as from, 
  year.year as to, 
  TYPE(r) as label,
  country.iso3 as extra
"

viz_net <- function(reg) {
  nodes_reg <- cypher(graph, sprintf(reg_nodes, reg))
  nodes_reg <- nodes_reg %>% 
    gather(group, id, WHO_Region:Cases) %>% 
    mutate(label = id) %>%
    select(id, label, group) %>% distinct()
  
  edges_reg <- cypher(graph, sprintf(reg_edges, reg))
  edges_reg <- edges_reg %>% select(from, to, label) %>% distinct()
  
  visNetwork(nodes_reg, edges_reg)
}

# visualize first WHO region, looping over length of vector returned no network plots
cat("Region: ", regions[1], "\n")

## Region:  EMR

viz_net(regions[1])

Data 607 Project 4

Judd Anderman

November 17, 2016

NoSQL Migration

Setup

Load required libraries

Create “tb” subdirectory in current working directory

Download WHO’s global tuberculosis data to “tb” subdirectory

MySQL

Connect to MySQL

Create and populate WHO tuberculosis (“global_tb”) database

Reduce and tidy tb notifications data frame and write new DB table

Test query from “new_cases” table

Write “new_cases” table as CSV file in current working directory

Drop “global_tb” DB and disconnect from MySQL

Neo4j

Read-in “new_tb_cases.csv” and remove rows with missing data from data frame

Connect to Neo4J database

Add uniqueness constraints and define import query

Use the transactional endpoint to upload `new_tb_cases` data to graph DB

Add sequential relationships to “year” nodes in graph

Query unique node labels and relationship types

Test query of new tb cases reported by USA

Visualize graph DB using visNetwork package

USA new TB cases graph

Discussion MySQL versus Neo4j

Sources

Data 607 Project 4

Judd Anderman

November 17, 2016

NoSQL Migration

Setup

Load required libraries

Create “tb” subdirectory in current working directory

Download WHO’s global tuberculosis data to “tb” subdirectory

MySQL

Connect to MySQL

Create and populate WHO tuberculosis (“global_tb”) database

Reduce and tidy tb notifications data frame and write new DB table

Test query from “new_cases” table

Write “new_cases” table as CSV file in current working directory

Drop “global_tb” DB and disconnect from MySQL

Neo4j

Read-in “new_tb_cases.csv” and remove rows with missing data from data frame

Connect to Neo4J database

Add uniqueness constraints and define import query

Use the transactional endpoint to upload new_tb_cases data to graph DB

Add sequential relationships to “year” nodes in graph

Query unique node labels and relationship types

Test query of new tb cases reported by USA

Visualize graph DB using visNetwork package

USA new TB cases graph

Discussion MySQL versus Neo4j

Sources

Use the transactional endpoint to upload `new_tb_cases` data to graph DB