Neo4J Query Lab

Part I - Load nodes and add relationships

library(RCurl)
## Loading required package: bitops
myfile_flight <- getURL('https://raw.githubusercontent.com/yuenwong2sps/Data607/master/Week12/neo4j-flight-lab-data.csv', 
                 ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)

myfile_airport <- getURL('https://raw.githubusercontent.com/yuenwong2sps/Data607/master/Week12/neo4j-airport-csv-raw.csv', 
                        ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)

mydat_airport <- read.csv(textConnection(myfile_airport), header=T)


mydat_flight <- read.csv(textConnection(myfile_flight), header=T)
library(RNeo4j)




graph = startGraph("http://localhost:7474/db/data/",
                   username = Neo4J_Username,
                   password = Neo4J_Password)

clear(graph, input = FALSE)
  1. Use Load CSV to create the airport nodes. Note that each airport node should have a label of “Airport” and aset of three attributes: code, city, and state.
#unqiue airport data
addConstraint(graph, "Airport","label")

for(i in 1:length(mydat_airport$label))
{
  createNode(graph, "Airport", 
             label = mydat_airport[i,]$label,
             code=mydat_airport[i,]$label, 
             city=mydat_airport[i,]$city,
             state = mydat_airport[i,]$state
             )
  
}
  1. Use Load CSV to create the flight nodes. Each flight node should have a label of “Flight” and should have three attributes: number, capacity, and airline.
for(i in 1:length(mydat_flight$flight))
{
  createNode(graph, "flights",
             
             flight = mydat_flight[i,]$flight,
             airline = mydat_flight[i,]$airline,
             capacity = mydat_flight[i,]$capacity
            
             )
  
}
  1. Use Load CSV to create the arrival relationships. Each arrival relationship should have a label of “Arrive” and should have a single attribute: landing. (This attribute gives the scheduled landing time.)
query = "match (a:Airport {code:{airport_code}}), (b:flights {flight:{flight_num}}) create (b) -[r:Arrive {landing:{landing_time}}]->(a)"

t = newTransaction(graph)
for(i in 1:length(mydat_flight$flight))
{
  v_airport_code = mydat_flight[i,]$arrive
  v_flight_num = mydat_flight[i,]$flight
  v_landing_time = mydat_flight[i,]$landing
  
  appendCypher(t,
               query,
               airport_code = v_airport_code,
               flight_num = v_flight_num,
               landing_time = v_landing_time
               )
  
}
commit(t)
  1. Use Load CSV to create the departure relationships. Each departure relationship should have a label of “Depart” and should have a single attribute: takeoff. (This attribute gives the scheduled takeoff time.)
query = "match (a:Airport {code:{airport_code}}), (b:flights {flight:{flight_num}}) create (a) -[r:Depart {takeoff:{takeoff_time}}]->(b)"

t = newTransaction(graph)
for(i in 1:length(mydat_flight$flight))
{
  v_airport_code = mydat_flight[i,]$depart
  v_flight_num = mydat_flight[i,]$flight
  v_takeoff_time = mydat_flight[i,]$takeoff
  
  appendCypher(t,
               query,
               airport_code = v_airport_code,
               flight_num = v_flight_num,
               takeoff_time = v_takeoff_time
               )
  
}
commit(t)
  1. Run the query match (n) return (n)
#we need to specific the field to query data instead of just "n"
query = "MATCH (d:Airport)-[t:Depart]->(f:flights)-[l:Arrive]->(a:Airport)
RETURN d.label as FromAirport,t.takeoff as TakeOff, f.airline as AirLine, f.flight as FlightNum,l.landing as Landing, a.label as ToAirport"

match_n_return_n = cypher(graph, query)

kable(match_n_return_n)
FromAirport TakeOff AirLine FlightNum Landing ToAirport
PIT 1100 Delta 44 1203 ATL
BOS 1315 American 1258 1455 ATL
DTW 1310 American 1257 1455 ATL
DTW 914 Delta 24 1123 ATL
BOS 816 Delta 38 1040 ATL
PIT 955 Delta 46 1103 BOS
DTW 945 Delta 27 1210 BOS
ATL 1201 Delta 37 1444 BOS
DTW 1300 American 1231 1520 BOS
ATL 1340 American 1292 1545 BOS
DTW 1600 Southwest 101 1810 BOS
BOS 1310 American 1277 1530 DTW
ATL 926 Delta 23 1109 DTW
BOS 1009 Delta 28 1228 DTW
BOS 1605 Southwest 102 1815 DTW
PIT 1117 Delta 36 1219 DTW
PIT 1620 Southwest 104 1735 DTW
ATL 1330 American 1291 1530 DTW
PIT 1400 American 1278 1505 DTW
BOS 744 Delta 45 855 PIT
DTW 1305 American 1232 1410 PIT
DTW 955 Delta 35 1106 PIT
DTW 1615 Southwest 103 1730 PIT
ATL 901 Delta 43 1001 PIT

Part II: Data Analysis

Now that we have some data loaded, let’s answer some basic questions. Write Cypher queries to answer each of the following:

  1. Return all flights that run from Detroit to Atlanta.
query = "MATCH (d:Airport)-->(f:flights)-->(a:Airport) where d.code = 'DTW' and a.code = 'ATL'
RETURN d.label as FromAirport, f.airline as AirLine, f.flight as FlightNum, a.label as ToAirport"

flights_From_DTW_To_ATL = cypher(graph, query)

kable(flights_From_DTW_To_ATL)
FromAirport AirLine FlightNum ToAirport
DTW American 1257 ATL
DTW Delta 24 ATL
  1. Return all flights that take off before 11 a.m.
query = "MATCH (d:Airport)-[t:Depart]->(f:flights)-->(a:Airport) where t.takeoff < 1100
RETURN d.label as FromAirport,t.takeoff as TakeOff, f.airline as AirLine, f.flight as FlightNum, a.label as ToAirport"

flights_Before11AM = cypher(graph, query)

kable(flights_Before11AM)
FromAirport TakeOff AirLine FlightNum ToAirport
ATL 901 Delta 43 PIT
ATL 926 Delta 23 DTW
BOS 744 Delta 45 PIT
BOS 816 Delta 38 ATL
BOS 1009 Delta 28 DTW
DTW 955 Delta 35 PIT
DTW 914 Delta 24 ATL
DTW 945 Delta 27 BOS
PIT 955 Delta 46 BOS
  1. Return all flights with a capacity greater than 150 passengers.
query = "MATCH (d:Airport)-[t:Depart]->(f:flights)-[l:Arrive]->(a:Airport) where f.capacity > 150
RETURN d.label as FromAirport,t.takeoff as TakeOff, f.airline as AirLine, f.flight as FlightNum,l.landing as Landing, a.label as ToAirport, f.capacity as Capacity"

flights_LargerThan150 = cypher(graph, query)

kable(flights_LargerThan150)
FromAirport TakeOff AirLine FlightNum Landing ToAirport Capacity
DTW 1305 American 1232 1410 PIT 160
BOS 1315 American 1258 1455 ATL 160
PIT 1100 Delta 44 1203 ATL 160
DTW 914 Delta 24 1123 ATL 160
BOS 1009 Delta 28 1228 DTW 160
BOS 816 Delta 38 1040 ATL 160
PIT 955 Delta 46 1103 BOS 160
ATL 926 Delta 23 1109 DTW 160
DTW 945 Delta 27 1210 BOS 160
ATL 1201 Delta 37 1444 BOS 160
ATL 901 Delta 43 1001 PIT 160
BOS 744 Delta 45 855 PIT 160
DTW 1300 American 1231 1520 BOS 160
BOS 1310 American 1277 1530 DTW 160
PIT 1400 American 1278 1505 DTW 160
ATL 1340 American 1292 1545 BOS 160
  1. Return all flights on Delta that arrive in Boston.
query = "MATCH (d:Airport)-[t:Depart]->(f:flights)-[l:Arrive]->(a:Airport) where f.airline ='Delta' and a.code = 'BOS'
RETURN d.label as FromAirport,t.takeoff as TakeOff, f.airline as AirLine, f.flight as FlightNum,l.landing as Landing, a.label as ToAirport"

flights_Delta_BOS = cypher(graph, query)

kable(flights_Delta_BOS)
FromAirport TakeOff AirLine FlightNum Landing ToAirport
PIT 955 Delta 46 1103 BOS
DTW 945 Delta 27 1210 BOS
ATL 1201 Delta 37 1444 BOS

Part III: Data Modification

Let’s make a few simple changes to our data. Perform the following tasks:

  1. Suppose the Pittsburgh airport designation code has been changed from PIT to PGH. Write a query to update the airport’s code.
 Pittsburge_node = getNodes(graph, "Match (a:Airport) where a.code = 'PIT' return a ")

 #updateProp expect a node, but getNodes return a list
 #https://tutel.me/c/programming/questions/35876340/cannot+use+method+39updateprop39+in+rneo4j+package
 Pittsburge_node = Pittsburge_node[[1]] #fix the issue

Before update:

 (Pittsburge_node)
## < Node > 
## Airport
## 
## $code
## [1] "PIT"
## 
## $city
## [1] "Pittsburgh"
## 
## $state
## [1] "Pennsylvania"
## 
## $label
## [1] "PIT"

After update:

 Pittsburge_node = updateProp(Pittsburge_node, code = 'PGH', label= 'PGH')
 
  
 (Pittsburge_node)
## < Node > 
## Airport
## 
## $code
## [1] "PGH"
## 
## $city
## [1] "Pittsburgh"
## 
## $state
## [1] "Pennsylvania"
## 
## $label
## [1] "PGH"
  1. Delta has decided to renumber some of its flights. Write a query to change flight 28 to flight 29.
 delta_node = getNodes(graph, "match (f:flights) where f.airline='Delta' and f.flight=28  return f")

delta_node = delta_node[[1]] #fix the issue

Before udpate:

 (delta_node)
## < Node > 
## flights
## 
## $flight
## [1] 28
## 
## $airline
## [1] "Delta"
## 
## $capacity
## [1] 160

After update:

 delta_node = updateProp(delta_node, flight = 29)
 

 (delta_node)
## < Node > 
## flights
## 
## $flight
## [1] 29
## 
## $airline
## [1] "Delta"
## 
## $capacity
## [1] 160
  1. Southwest has cancelled all flights between Detroit and Boston. Write a query that removes all such flights.
query = "match (fa:Airport)-[d:Depart]->(f:flights)-[l:Arrive]->(ta:Airport) where (fa.code = 'DTW' or fa.code = 'BOS') and (ta.code = 'BOS' or ta.code = 'DTW') and f.airline = 'Southwest' delete d, f, l"
 
 cypher(graph, query)

Re-run records to verify the result

Screenshoot summary

query = "MATCH (d:Airport)-[t:Depart]->(f:flights)-[l:Arrive]->(a:Airport)
RETURN d.label as FromAirport,t.takeoff as TakeOff, f.airline as AirLine, f.flight as FlightNum,l.landing as Landing, a.label as ToAirport"

match_n_return_n = cypher(graph, query)

kable(match_n_return_n)
FromAirport TakeOff AirLine FlightNum Landing ToAirport
PGH 1100 Delta 44 1203 ATL
BOS 1315 American 1258 1455 ATL
DTW 1310 American 1257 1455 ATL
DTW 914 Delta 24 1123 ATL
BOS 816 Delta 38 1040 ATL
PGH 955 Delta 46 1103 BOS
DTW 945 Delta 27 1210 BOS
ATL 1201 Delta 37 1444 BOS
DTW 1300 American 1231 1520 BOS
ATL 1340 American 1292 1545 BOS
BOS 1310 American 1277 1530 DTW
ATL 926 Delta 23 1109 DTW
BOS 1009 Delta 29 1228 DTW
PGH 1117 Delta 36 1219 DTW
PGH 1620 Southwest 104 1735 DTW
ATL 1330 American 1291 1530 DTW
PGH 1400 American 1278 1505 DTW
BOS 744 Delta 45 855 PGH
DTW 1305 American 1232 1410 PGH
DTW 955 Delta 35 1106 PGH
DTW 1615 Southwest 103 1730 PGH
ATL 901 Delta 43 1001 PGH

Part IV: Clean Up

Just run cypher to delete all records.

query = "match (a) optional match (a)-[r]-() delete a,r"
 
cypher(graph, query)

Relational DB V.S. NoSQL DB

Both offer advantage and disadvantage.

NoSQL database provides dynamaic data structure and schema-less design but it doesn’t now have stored procedures. Also, Relational database is a more matured technology that there are many expert developers/adminstorator. NoSQL is relatively new technology.