MINI MINI MANI MO
#! /usr/bin/env python
from jpype import *
import sys, os
import uuid
import random
import csv
"""
pyopg core functions
Copyright 2016, Oracle and/or its affiliates. All rights reserved.
"""
# Java class path for the property graph / PGX jars, taken from OPG_CP.
pyopg_cp = os.environ.get('OPG_CP', None)
if pyopg_cp is None:
    # Concatenating None onto the JVM option below would otherwise fail
    # with an opaque TypeError, so fail early with an actionable message.
    raise EnvironmentError("OPG_CP environment variable must be set to the Java class path")
#start the JVM
startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=" + pyopg_cp)
# Storage back ends; each flag is flipped to False below when the
# corresponding Java classes fail to load.
enabled_storage = {"nosql": True, "hbase": True, "rdbms": True}
# JDBC details, filled in by connectRDBMS and read by analyst_config.
jdbc_creds = {"jdbc": None, "user": None, "pass": None}
#define specific Java classes
HashMap = java.util.HashMap
# "except Exception" (rather than a bare except) keeps KeyboardInterrupt and
# SystemExit from being swallowed while the optional back ends are probed.
try:
    HBaseBytes = JClass('org.apache.hadoop.hbase.util.Bytes')
except Exception:
    # Loading failed: quietly mark the HBase back end as unavailable.
    enabled_storage["hbase"] = False
pgxf = JClass('oracle.pgx.config.PgxConfig$Field')
try:
    loader = JClass('oracle.pg.nosql.OraclePropertyGraphUtils')
except Exception:
    # Loading failed: quietly mark the NoSQL back end as unavailable.
    enabled_storage["nosql"] = False
if enabled_storage["hbase"]:
    hloader = JClass('oracle.pg.hbase.OraclePropertyGraphUtils')
try:
    rloader = JClass('oracle.pg.rdbms.OraclePropertyGraphUtils')
except Exception:
    print("WARN: RDBMS Storage Disabled!")
    enabled_storage["rdbms"] = False
# Bulk loaders are only resolved for the back ends that are still enabled.
if enabled_storage["nosql"]:
    bulkloader = JClass('oracle.pg.nosql.OraclePropertyGraphDataLoader')
if enabled_storage["hbase"]:
    hbulkloader = JClass('oracle.pg.hbase.OraclePropertyGraphDataLoader')
    hbasekeyfilter = JClass('oracle.pg.hbase.OracleKeyFilter')
if enabled_storage["rdbms"]:
    rbulkloader = JClass('oracle.pg.rdbms.OraclePropertyGraphDataLoader')
pstream = JClass('java.io.PrintStream')
pgx = JClass('oracle.pgx.api.Pgx')
analyst_class = JClass('oracle.pgx.api.Analyst')
BAOS = JClass('java.io.ByteArrayOutputStream')
print("Class loading done")
#define Java packages
# Handles that do not depend on a storage back end.
common = JPackage('oracle.pg.common')
text = JPackage('oracle.pg.text')
blueprints = JPackage('com.tinkerpop.blueprints')
pgx_config = JPackage('oracle.pgx.config')
pgx_types = JPackage('oracle.pgx.common.types')
pgx_control = JPackage('oracle.pgx.api')
fexp = JPackage('oracle.pgx.filter.expressions')
jutil = JPackage('java.util')
# Back-end specific handles, defined only while that back end is enabled.
if enabled_storage["nosql"]:
    nosql = JPackage('oracle.pg.nosql')
    nosql_index = JPackage('oracle.pg.nosql.index')
    kv = JPackage('oracle.kv')
if enabled_storage["hbase"]:
    hbase = JPackage('oracle.pg.hbase')
    hbase_index = JPackage('oracle.pg.hbase.index')
    hadoop_conf = JPackage('org.apache.hadoop.conf')
    hadoop_hbase = JPackage('org.apache.hadoop.hbase')
    hbase_client = JPackage('org.apache.hadoop.hbase.client')
    hbase_filter = JPackage('org.apache.hadoop.hbase.filter')
if enabled_storage["rdbms"]:
    rdbms = JPackage('oracle.pg.rdbms')
    rdbms_index = JPackage('oracle.pg.rdbms.index')
from janalyst import *
# PGX engine configuration; analyst() passes this map to startEngine().
# TODO: make this readable from a config file
# NOTE(review): is this still needed in PGX 1.1?
pgx_param_map = HashMap()
pgx_param_map.put(pgxf.ENABLE_GM_COMPILER, False)
#thisPgxConfig = pgx_config.PgxConfig().init()#.parse(pgx_param_map, False, None)
opg = None # context object: the connected property graph, set by the connect* functions
class OPGDALException(Exception):
    """
    Error raised by the data-access functions of this module
    (for example when an id is not an int or a long).

    value -- payload describing the error, usually a message string
    """
    def __init__(self, value):
        # Hand the payload to Exception as well, so .args, repr() and
        # pickling carry it instead of seeing an empty argument tuple.
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return repr(self.value)
class OPGAnalystException(Exception):
    """
    Error type for analyst-related failures.

    value -- payload describing the error, usually a message string
    """
    def __init__(self, value):
        # Hand the payload to Exception as well, so .args, repr() and
        # pickling carry it instead of seeing an empty argument tuple.
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return repr(self.value)
class OPGIterator:
    """
    Wraps a Java-style iterator (an object offering hasNext() and next())
    so it can be consumed with a Python for loop.

    jitter -- the wrapped Java iterator
    """
    def __init__(self, jitter):
        self.jitter = jitter

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next element, raising StopIteration once the wrapped
        iterator reports that nothing is left.
        """
        if not self.jitter.hasNext():
            raise StopIteration
        return self.jitter.next()

    # Python 3 looks for __next__ instead of next; the alias makes the
    # wrapper usable under both protocol spellings.
    __next__ = next
""" OPG DAL Commands begin here
This is the base set of commands available when working with the pyopg layer.
"""
def _checkIDType(iid):
"""
Checks to see if an id is convertable to Java PG types
"""
if not isinstance(iid,int) and not isinstance(iid,long) and not isinstance(iid, java.lang.Long):
raise OPGDALException("IDs must be ints or longs")
return False
return True
def _checkDBType():
    """
    Inspects the class name of the opg context object to determine the
    underlying store.

    Returns "nosql", "hbase" or "rdbms", or None when the class name is
    none of the three known OraclePropertyGraph classes.
    """
    store_by_class = {
        "oracle.pg.nosql.OraclePropertyGraph": "nosql",
        "oracle.pg.hbase.OraclePropertyGraph": "hbase",
        "oracle.pg.rdbms.OraclePropertyGraph": "rdbms",
    }
    return store_by_class.get(type(opg).__name__)
def connect(graph_name, store_name, hosts):
    """
    Connect to a property graph held in an Oracle NoSQL (KV) store and
    make it the module's opg context.

    graph_name -- name of the graph to open
    store_name -- name of the KV store
    hosts      -- a single host string or a list of host strings

    Returns the connected graph. This is the same operation as
    connectONDB, so it simply delegates instead of repeating the body.
    """
    return connectONDB(graph_name, store_name, hosts)
def connectONDB(graph_name, store_name, hosts):
    """
    Connect to a property graph held in an Oracle NoSQL (KV) store and
    make it the module's opg context.

    graph_name -- name of the graph to open
    store_name -- name of the KV store
    hosts      -- a single host string or a list of host strings

    Returns the connected graph.
    """
    global opg
    # A lone host is wrapped so KVStoreConfig always receives a list.
    host_list = hosts if isinstance(hosts, list) else [hosts]
    store_config = kv.KVStoreConfig(store_name, host_list)
    opg = nosql.OraclePropertyGraph.getInstance(store_config, graph_name)
    return opg
def connectHBase(graph_name, zk_quorum, client_port="2181"):
    """
    Connect to a property graph stored in HBase and make it the module's
    opg context.

    graph_name  -- name of the graph to open
    zk_quorum   -- value for hbase.zookeeper.quorum
    client_port -- ZooKeeper client port; a string or an int

    Returns the connected graph.
    """
    global opg
    hbconf = hadoop_hbase.HBaseConfiguration.create()
    hbconf.set("hbase.zookeeper.quorum", zk_quorum)
    # The configuration stores string values; str() lets callers pass the
    # port as a number without breaking the call.
    hbconf.set("hbase.zookeeper.property.clientPort", str(client_port))
    hbconf.set("hbase.client.scanner.timeout.period", "1000000")
    hbconf.set("zookeeper.session.timeout", "1000000")
    hconn = hbase_client.HConnectionManager.createConnection(hbconf)
    opg = hbase.OraclePropertyGraph.getInstance(hbconf, hconn, graph_name)
    return opg
def connectRDBMS(graph_name, jdbc_url, username, password):
    """
    Connect to a property graph stored in an Oracle RDBMS and make it the
    module's opg context.

    The JDBC URL, user name and password are also kept in the module-level
    jdbc_creds dict, where analyst_config reads them back.

    Returns the connected graph.
    """
    global opg, jdbc_creds
    oracle_handle = rdbms.Oracle(jdbc_url, username, password)
    remembered = {}
    remembered["jdbc"] = jdbc_url
    remembered["user"] = username
    remembered["pass"] = password
    jdbc_creds = remembered
    opg = rdbms.OraclePropertyGraph.getInstance(oracle_handle, graph_name)
    return opg
def getGraphName():
    """
    Returns the name of the currently connected graph.
    """
    graph_name = opg.getGraphName()
    return graph_name
def clearRepository():
    """
    Removes all vertices and edges from the connected property graph by
    calling its clearRepository() method.
    """
    graph = opg
    graph.clearRepository()
def dropAllIndicies():
    """
    Drops all existing indices on the connected graph.

    Note the spelling of this function's name; dropAllIndices in this
    module issues the same call.
    """
    graph = opg
    graph.dropAllIndices()
def getEdges(*keys):
    """
    Returns an OPGIterator over the edges the graph reports for the
    given property keys.
    """
    matching = opg.getEdges(keys)
    return OPGIterator(matching.iterator())
def addVertex(vid):
    """
    Adds a vertex with ID vid to the graph and returns it.

    vid must be a Python int (converted to long first) or a Python long;
    any other type raises OPGDALException.
    """
    if isinstance(vid, int):
        return opg.addVertex(long(vid))
    if isinstance(vid, long):
        return opg.addVertex(vid)
    raise OPGDALException("IDs must be ints or longs")
def addVertexByName(name):
    """
    Adds a new vertex carrying the property 'name' and returns it.

    The vertex ID is a random 32-bit number; no check is made here that
    the ID is not already in use.
    """
    new_id = random.getrandbits(32)
    vertex = opg.addVertex(new_id)
    vertex.setProperty("name", name, True)
    return vertex
def getVertexByName(name):
    """
    Retrieve the first vertex whose property 'name' matches name.

    Returns the vertex, or None when there is no match or the lookup
    raises an error.
    """
    try:
        return OPGIterator(opg.getVertices("name", name).iterator()).next()
    except Exception:
        # Covers StopIteration (no match) as well as lookup failures, but,
        # unlike a bare except, lets KeyboardInterrupt/SystemExit through.
        return None
def setVertexProperty(vid, label, value):
    """
    Convenience wrapper that sets property label to value on the vertex
    with ID vid (properties can also be set on a vertex object directly).

    Raises OPGDALException when vid is not an accepted id type.
    """
    _checkIDType(vid)
    opg.getVertex(long(vid)).setProperty(label, value, True)
def addEdge(eid, out_vid, in_vid, label):
    """
    Adds a labeled edge with ID eid from the vertex out_vid to the vertex
    in_vid and returns it.

    Raises OPGDALException when any of the three ids has an unsupported
    type.
    """
    for candidate in (eid, out_vid, in_vid):
        _checkIDType(candidate)
    return opg.addEdge(eid, opg.getVertex(out_vid), opg.getVertex(in_vid), label)
def addEdgeByName(ename, out_vname, in_vname):
    """
    Adds an edge labeled ename between the vertices named out_vname and
    in_vname, creating either vertex when it cannot be found by name.

    The edge ID is a random 32-bit number. Returns the new edge.
    """
    source = getVertexByName(out_vname)
    target = getVertexByName(in_vname)
    source = source if source else addVertexByName(out_vname)
    target = target if target else addVertexByName(in_vname)
    # IDs go through str() before long(), as in the rest of this module's
    # handling of Java id objects.
    source_id = long(str(source.getId()))
    target_id = long(str(target.getId()))
    return addEdge(random.getrandbits(32), source_id, target_id, ename)
def getEdgeByEndpoints(edge_label, outv_label, inv_label):
    """
    Fetches the edge labeled edge_label that leaves the vertex named
    outv_label and arrives at the vertex named inv_label.

    Returns the first such edge, or None when there is none (including
    when an AttributeError occurs, e.g. the out-vertex was not found).
    """
    source = getVertexByName(outv_label)
    target = getVertexByName(inv_label)
    try:
        # The label is wrapped in a list to satisfy the Java method signature.
        outgoing = source.getEdges(blueprints.Direction.OUT, [edge_label])
        for candidate in OPGIterator(outgoing.iterator()):
            if target == candidate.getInVertex():
                return candidate
    except AttributeError:
        pass
    return None
def setEdgeProperty(eid, label, value):
    """
    Convenience wrapper that sets property label to value on the edge
    with ID eid.

    Raises OPGDALException when eid is not an accepted id type.
    """
    _checkIDType(eid)
    opg.getEdge(long(eid)).setProperty(label, value, True)
def getAllVertexPropertyKeys(getTypes=False, sample=0.1):
    """
    Returns the set of property keys defined on vertices.

    With getTypes False every vertex is visited and the set holds the
    keys. With getTypes True each vertex is considered only when a random
    draw is <= sample, and the set holds (key, value type name) tuples.
    """
    found = set()
    for vertex in OPGIterator(opg.getVertices().iterator()):
        if not getTypes:
            found.update(vertex.getPropertyKeys())
        elif random.random() <= sample:
            names = list(OPGIterator(vertex.getPropertyKeys().iterator()))
            kinds = [type(vertex.getProperty(n)).__name__ for n in names]
            found.update(zip(names, kinds))
    return found
def getAllEdgePropertyKeys(getTypes=False, sample=0.1):
    """
    Returns the set of property keys defined on edges.

    With getTypes False every edge is visited and the set holds the keys.
    With getTypes True each edge is considered only when a random draw is
    <= sample, and the set holds (key, value type name) tuples.
    """
    found = set()
    for edge in OPGIterator(opg.getEdges().iterator()):
        if not getTypes:
            found.update(edge.getPropertyKeys())
        elif random.random() <= sample:
            names = list(OPGIterator(edge.getPropertyKeys().iterator()))
            kinds = [type(edge.getProperty(n)).__name__ for n in names]
            found.update(zip(names, kinds))
    return found
def getVertexDict(vid):
    """
    Returns the properties of the vertex with ID vid as a dict mapping
    property key to value.

    Raises OPGDALException when vid is not an accepted id type.
    """
    _checkIDType(vid)
    vertex = opg.getVertex(vid)
    return dict((key, vertex.getProperty(key)) for key in vertex.getPropertyKeys())
def getEdgeDict(eid):
    """
    Get the properties of the edge with ID eid as a dict mapping
    property key to value.

    Raises OPGDALException when eid is not an accepted id type.
    """
    d = {}
    _checkIDType(eid)
    e = opg.getEdge(long(eid))
    # Read from the fetched edge; the loop previously referred to an
    # undefined name v.
    for k in e.getPropertyKeys():
        d[k] = e.getProperty(k)
    return d
def getVertices(*keys):
    """
    Returns an OPGIterator over the vertices the graph reports for the
    given property keys.
    """
    matching = opg.getVertices(keys)
    return OPGIterator(matching.iterator())
def getVerticesByValue(key, value, wildcard=False):
    """
    Returns an OPGIterator over the vertices that have the given
    key/value property; wildcard is forwarded unchanged to the graph's
    getVertices call.
    """
    matching = opg.getVertices(key, value, wildcard)
    return OPGIterator(matching.iterator())
def V(vid):
    """
    Shorthand: fetch the vertex with ID vid from the connected graph.
    """
    vertex = opg.getVertex(vid)
    return vertex
def E(eid):
    """
    Shorthand: fetch the edge with ID eid from the connected graph.
    """
    edge = opg.getEdge(eid)
    return edge
def countV(*keys):
    """
    Returns the number of vertices obtained for the given keys, counted
    by iterating over them.
    """
    callback = opg.getVertexFilterCallback()
    # presumably restricts the fetch to vertex ids only -- TODO confirm
    opt_flag = common.OraclePropertyGraphBase.getJustVertexIdOptFlag()
    vertices = opg.getVertices(keys, callback, opt_flag)
    return sum(1 for _ in OPGIterator(vertices.iterator()))
def countE(*keys):
    """
    Returns the number of edges obtained for the given keys, counted by
    iterating over them.
    """
    callback = opg.getEdgeFilterCallback()
    # presumably restricts the fetch to edge ids only -- TODO confirm
    opt_flag = common.OraclePropertyGraphBase.getJustEdgeIdOptFlag()
    edges = opg.getEdges(keys, callback, opt_flag)
    return sum(1 for _ in OPGIterator(edges.iterator()))
def createVertexIndex(*index_keys):
    """
    Creates key indices on vertices for every key in index_keys.

    Fixed index parameters built with buildFS (ending in the directory
    "./lucene-index") are installed as the graph's defaults first.
    """
    params = text.OracleIndexParameters.buildFS(1, 4, 10000, 50000, True, "./lucene-index")
    opg.setDefaultIndexParameters(params)
    vertex_class = JClass("com.tinkerpop.blueprints.Vertex")
    opg.createKeyIndex(index_keys, vertex_class)
def createEdgeIndex(*index_keys):
    """
    Creates key indices on edges for every key in index_keys.

    Fixed index parameters built with buildFS (ending in the directory
    "./lucene-index") are installed as the graph's defaults first.
    """
    params = text.OracleIndexParameters.buildFS(1, 4, 10000, 50000, True, "./lucene-index")
    opg.setDefaultIndexParameters(params)
    edge_class = JClass("com.tinkerpop.blueprints.Edge")
    opg.createKeyIndex(index_keys, edge_class)
def searchVertexIndex(key, term):
    """
    Searches vertices on key for term (the graph lookup is made with its
    third argument set to True) and returns an OPGIterator over the hits.
    """
    hits = opg.getVertices(key, term, True)
    return OPGIterator(hits.iterator())
def searchEdgeIndex(key, term):
    """
    Searches edges on key for term (the graph lookup is made with its
    third argument set to True) and returns an OPGIterator over the hits.
    """
    hits = opg.getEdges(key, term, True)
    return OPGIterator(hits.iterator())
def dropAllIndices():
    """
    Drops every index on the connected graph.
    """
    graph = opg
    graph.dropAllIndices()
def dropVertexIndex(index_key):
    """
    Drops the vertex key index for index_key.
    """
    vertex_class = JClass("com.tinkerpop.blueprints.Vertex")
    opg.dropKeyIndex(index_key, vertex_class)
def dropEdgeIndex(index_key):
    """
    Drops the edge key index for index_key.
    """
    edge_class = JClass("com.tinkerpop.blueprints.Edge")
    opg.dropKeyIndex(index_key, edge_class)
def importGML(filename):
    """
    Load a GML file into the database.

    filename -- path of the GML file to read

    The utility class of the connected back end is used; nothing happens
    when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    # Output written to the stream is discarded; the stream is closed
    # afterwards so the handle on /dev/null is not leaked.
    sink = pstream("/dev/null")
    try:
        utils.importGML(opg, filename, sink)
    finally:
        sink.close()
def importGraphSON(filename):
    """
    Load a GraphSON file into the database.

    filename -- path of the GraphSON file to read

    The utility class of the connected back end is used; nothing happens
    when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    # Output written to the stream is discarded; the stream is closed
    # afterwards so the handle on /dev/null is not leaked.
    sink = pstream("/dev/null")
    try:
        utils.importGraphSON(opg, filename, sink)
    finally:
        sink.close()
def importGraphML(filename):
    """
    Load a GraphML file into the database.

    filename -- path of the GraphML file to read

    The utility class of the connected back end is used; nothing happens
    when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    # Output written to the stream is discarded; the stream is closed
    # afterwards so the handle on /dev/null is not leaked.
    sink = pstream("/dev/null")
    try:
        utils.importGraphML(opg, filename, sink)
    finally:
        sink.close()
def exportGML(filename):
    """
    Write the current graph as a GML file.

    filename -- path of the GML file to write

    The utility class of the connected back end is used; nothing happens
    when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    # Output written to the stream is discarded; the stream is closed
    # afterwards so the handle on /dev/null is not leaked.
    sink = pstream("/dev/null")
    try:
        utils.exportGML(opg, filename, sink)
    finally:
        sink.close()
def exportGraphSON(filename):
    """
    Write the current graph as a GraphSON file.

    filename -- path of the GraphSON file to write

    The utility class of the connected back end is used; nothing happens
    when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    # Output written to the stream is discarded; the stream is closed
    # afterwards so the handle on /dev/null is not leaked.
    sink = pstream("/dev/null")
    try:
        utils.exportGraphSON(opg, filename, sink)
    finally:
        sink.close()
def exportGraphML(filename):
    """
    Write the current graph as a GraphML file.

    filename -- path of the GraphML file to write

    The utility class of the connected back end is used; nothing happens
    when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    # Output written to the stream is discarded; the stream is closed
    # afterwards so the handle on /dev/null is not leaked.
    sink = pstream("/dev/null")
    try:
        utils.exportGraphML(opg, filename, sink)
    finally:
        sink.close()
def exportFlatFiles(v_filename, e_filename):
    """
    Writes the current graph as a vertex file and an edge file in the
    flat format used for bulk loading.

    v_filename -- path of the vertex file to write
    e_filename -- path of the edge file to write

    Nothing happens when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "nosql":
        utils = loader
    elif dbtype == "hbase":
        utils = hloader
    elif dbtype == "rdbms":
        utils = rloader
    else:
        return
    utils.exportFlatFiles(opg, v_filename, e_filename, False)
def importFlatFiles(v_filename, e_filename, dop=4):
    """
    Bulk loads a graph from a vertex file and an edge file.

    v_filename -- path of the vertex file
    e_filename -- path of the edge file
    dop        -- number of JVM threads used for loading

    Nothing happens when the back end is not one of nosql, hbase or rdbms.
    """
    dbtype = _checkDBType()
    if dbtype == "rdbms":
        # The RDBMS loader is called with three extra fixed arguments.
        rbulkloader.getInstance().loadData(opg, v_filename, e_filename, dop, 1000, True, None)
        return
    if dbtype == "nosql":
        engine = bulkloader.getInstance()
    elif dbtype == "hbase":
        engine = hbulkloader.getInstance()
    else:
        return
    engine.loadData(opg, v_filename, e_filename, dop)
def unparseMetric(c):
    """
    Converts one (Java Long id, metric) entry into a Python pair.

    c -- an entry offering getKey() (vertex id) and getValue() (metric)

    Returns a tuple of the vertex fetched for that id and the metric as a
    Python float (parsed from the metric's toString()).
    """
    key, metric = c.getKey(), c.getValue()
    return opg.getVertex(key), float(metric.toString())
def analyst_config(props = []):
    """
    Create an in-memory analyst configuration on this graph for
    complex analyses

    props -- names of the vertex/edge properties to register with the
             configuration

    Builds a PGX graph config for the connected back end, re-opens the
    graph from that config, stores it as the module's opg context and
    returns it.
    """
    global opg
    # Sampled sets of (property key, Java type name) tuples.
    vertexProps = getAllVertexPropertyKeys(True)
    edgeProps = getAllEdgePropertyKeys(True)
    dbtype = _checkDBType()
    # Stays None when the back end is not recognised; the calls on it
    # below would then fail.
    analyst_cfg = None
    if dbtype == "nosql":
        kvcfg = opg.kVStoreConfig
        analyst_cfg = pgx_config.PgNosqlGraphConfigBuilder().setName(opg.getGraphName()).setHosts(jutil.Arrays.asList(kvcfg.helperHosts)).setStoreName(kvcfg.storeName)
        #analyst_cfg = pgx_config.GraphConfigBuilder.forNosql().setName(opg.getGraphName()).setHosts(jutil.Arrays.asList(kvcfg.helperHosts)).setStoreName(kvcfg.storeName)
    elif dbtype == "hbase":
        quorum = opg.getConfiguration().get("hbase.zookeeper.quorum")
        clientPort = int(opg.getConfiguration().get("hbase.zookeeper.property.clientPort"))
        analyst_cfg = pgx_config.PgHbaseGraphConfigBuilder().setName(opg.getGraphName()).setZkQuorum(quorum).setZkClientPort(clientPort).setSplitsPerRegion(opg.numSplitsPerRegion).setZkSessionTimeout(1000000)
    elif dbtype == "rdbms":
        # Credentials were remembered by connectRDBMS.
        global jdbc_creds
        analyst_cfg = pgx_config.PgRdbmsGraphConfigBuilder().setName(opg.getGraphName()).setJdbcUrl(jdbc_creds["jdbc"]).setUsername(jdbc_creds["user"]).setPassword(jdbc_creds["pass"]).setMaxNumConnections(2)
    # Register the requested vertex properties. Anything that is not an
    # Integer/Double/Float/Long is declared as STRING with the property
    # name as its default value.
    # NOTE(review): the nesting of addNodeProperty under "if vp[0] in props"
    # is assumed (only named properties are loaded) -- confirm.
    for vp in vertexProps:
        ptype = pgx_types.PropertyType.STRING
        defaultval = vp[0]
        if vp[0] in props:
            if vp[1] == "java.lang.Integer":
                ptype = pgx_types.PropertyType.INTEGER
                defaultval = 0
            elif vp[1] == "java.lang.Double":
                ptype = pgx_types.PropertyType.DOUBLE
                defaultval = 1.0
            elif vp[1] == "java.lang.Float":
                ptype = pgx_types.PropertyType.FLOAT
                defaultval = 1.0
            elif vp[1] == "java.lang.Long":
                ptype = pgx_types.PropertyType.LONG
                defaultval = 0L
            analyst_cfg.addNodeProperty(vp[0], ptype, defaultval)
    # Same for edge properties.
    # NOTE(review): the Float default here is 0.0 while the vertex loop
    # uses 1.0 -- confirm whether the difference is intended.
    # NOTE(review): nesting of addEdgeProperty assumed as above -- confirm.
    for ep in edgeProps:
        ptype = pgx_types.PropertyType.STRING
        defaultval = ep[0]
        if ep[0] in props:
            if ep[1] == "java.lang.Integer":
                ptype = pgx_types.PropertyType.INTEGER
                defaultval = 0
            elif ep[1] == "java.lang.Double":
                ptype = pgx_types.PropertyType.DOUBLE
                defaultval = 1.0
            elif ep[1] == "java.lang.Float":
                ptype = pgx_types.PropertyType.FLOAT
                defaultval = 0.0
            elif ep[1] == "java.lang.Long":
                ptype = pgx_types.PropertyType.LONG
                defaultval = 0L
            analyst_cfg.addEdgeProperty(ep[0], ptype, defaultval)
    analyst_cfg = analyst_cfg.build()
    # Re-open the graph from the built config so the context carries it.
    if dbtype == "nosql":
        opg = nosql.OraclePropertyGraph.getInstance(analyst_cfg)
    elif dbtype == "hbase":
        opg = hbase.OraclePropertyGraph.getInstance(analyst_cfg)
    elif dbtype == "rdbms":
        opg = rdbms.OraclePropertyGraph.getInstance(analyst_cfg)
    return opg
def analyst(url=None, sessionLabel=None, *properties):
    """
    Creates an in-memory analyst for the graph and returns it wrapped in
    a Janalyst.

    url          -- PGX instance to use; when falsy the default instance
                    is used, and when None its engine is also started
    sessionLabel -- session name; "my-session" when falsy
    properties   -- names of the properties to load into the analyst,
                    e.g. a = analyst(None, "my-label", "name", "weight")
    """
    global opg
    opg = analyst_config(properties)
    instance = pgx.getInstance(url) if url else pgx.getInstance()
    if url is None:
        instance.startEngine(pgx_param_map)
    session = instance.createSession(sessionLabel if sessionLabel else "my-session")
    acfg = opg.getConfig()
    print(acfg)
    pgx_graph = session.readGraphWithProperties(acfg, True)
    return Janalyst(pgx_graph, session.createAnalyst())
def subgraph_from_filter(analyst_obj, filter_exp):
    """
    Create a subgraph of the given graph analyst using a PGX filter
    expression.

    analyst_obj -- the Janalyst whose graph is filtered
    filter_exp  -- the PGX filter expression to apply

    Returns a new Janalyst over the filtered graph that reuses
    analyst_obj's analyst context.
    """
    # Use the parameter; the body previously referred to an undefined
    # name a and only worked when a global of that name happened to exist.
    subgraph = analyst_obj.graph.filter(filter_exp)
    return Janalyst(subgraph, analyst_obj.analyst_context)
def saveAnalyst(analyst_obj, filename, overwrite=True):
    """
    Saves the analyst's graph, with its properties, as a GraphML file.

    analyst_obj -- analyst whose analyst_context identifies session and graph
    filename    -- destination path
    overwrite   -- forwarded to the PGX core's store call
    """
    core = pgx.getCore()
    context = analyst_obj.analyst_context
    session_id = context.getSessionId()
    graph_name = context.getGraphName()
    core.storeGraphWithProperties(session_id, graph_name, pgx_config.Format.GRAPHML, filename, overwrite)
def shutdown():
    """
    Shuts down the connected graph context, if there is one.

    The JVM itself is left running (shutdownJVM is not called).
    """
    if opg is None:
        return
    opg.shutdown()
# Demo / smoke test, run as: <script> test | testclean
# "testclean" additionally wipes the graph and reloads it from hero-network.csv.
# NOTE(review): the nesting of the load/index steps under "testclean" is
# assumed -- confirm against the intended flow.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "test" or sys.argv[1] == "testclean":
            print "entering test"
            connectONDB("marvel", "kvstore", ["localhost:5000"])
            #connectHBase("marvel", "localhost")
            if sys.argv[1] == "testclean":
                print "entering testclean"
                dropVertexIndex("name")
                dropEdgeIndex("appearedWith")
                clearRepository()
                # Each CSV row is a pair of hero names; repeated pairs bump
                # the "weight" of the existing appearedWith edge.
                marvel_1 = csv.reader(open("hero-network.csv"), delimiter=',', quotechar='"')
                for hero in marvel_1:
                    hedge = getEdgeByEndpoints("appearedWith", hero[0], hero[1])
                    if hedge == None:
                        hedge = addEdgeByName("appearedWith", hero[0], hero[1])
                        hedge.setProperty("weight", 1.0)
                    else:
                        weight = hedge.getProperty("weight")
                        weight = weight.floatValue() + 1.0
                        hedge.setProperty("weight", weight)
                opg.commit()
                createVertexIndex("name")
                createEdgeIndex("appearedWith")
            print "edges", countE()
            print "searching index"
            for v in searchVertexIndex("name", "IRON*"):
                print v
            # Load "name" and "weight" into an in-memory analyst.
            a = analyst(None, "marvel-analysis", "name", "weight")
            #example against a jetty-hosted PGX instance
            #a = analyst("http://scott:tiger@localhost:8080/pgx", "marvel-analysis", "name", "weight")
            print a.countTriangles(), "triangles"
            print "in degree centrality"
            ind = a.inDegreeCentrality().topK(10)
            for ddist in ind:
                # Map the analyst's vertex id back to the stored vertex.
                v = opg.getVertex(ddist.getKey().getId())
                print v.getProperty("name"), ddist.getValue()
            print "communities"
            graph_communities = [{"compName":i.collectionName, "size":i.size()} for i in a.communities().components()]
            print "\n".join(map(str,graph_communities))
            print "10 largest PR values"
            for tk in a.pagerank().getTopKValues(10):
                v = opg.getVertex(tk.getKey().getId())
                print v.getProperty("name"), tk.getValue()
            print "triangles before subgraph", a.countTriangles()
            # Keep only edges whose weight exceeds 1.0, then recount.
            a = subgraph_from_filter(a, fexp.EdgeFilter("edge.weight > 1.0"))
            print "triangles after subgraph", a.countTriangles()
            shutdown()
OHA YOOOO