#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Transforms gml files to the tgf format

Author: Simon Willnauer (simonw@apache.org)
"""
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import io
import os
import re
import sys

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

__author__ = 'Simon Willnauer (simonw@apache.org)'
__version__ = '1.0'

# Matches a named HTML entity such as "&amp;"; group 1 is the entity name.
# Compiled once because the alternation covers every known entity name.
_ENTITY_PATTERN = re.compile('&(%s);' % '|'.join(name2codepoint))

# Labels are captured with [^"\n]* rather than a greedy ".+" so that a match
# always stops at the label's own closing quote, even when several GML
# elements share one physical line, and so that empty labels are accepted.
_NODE_PATTERN = re.compile(
    r'node\s*[[]\s*id\s*(\d+)\s*label\s*["]([^"\n]*)["]')
_EDGE_PATTERN = re.compile(
    r'edge\s*[[]\s*source\s*(\d+)\s*target\s*(\d+)'
    r'(\s*label\s*["]([^"\n]*)["])?')


def decodeHtml(s):
    """Replace named HTML entities in ``s`` with their characters.

    Only named entities listed in ``name2codepoint`` that are terminated by
    a semicolon (e.g. ``&amp;``) are replaced; any other text, including
    unknown entity names, is returned unchanged.
    """
    return _ENTITY_PATTERN.sub(
        lambda m: _unichr(name2codepoint[m.group(1)]), s)


def transform(input, output):
    """Transforms the input from the gml format into tgf

    Reads a GML (http://en.wikipedia.org/wiki/Graph_Modelling_Language)
    file from the input file and transforms the input graph into a TGF
    (http://en.wikipedia.org/wiki/Trivial_Graph_Format) file given as the
    output parameter.

    One ``<id> <label>`` line is written per node, then a ``#`` separator
    line, then one ``<source> <target> <label>`` line per edge (the label
    is empty when the edge has none). HTML entities in labels are decoded.

    Argument:
      input   the input file stream to read the gml format from
      output  the output file stream to write the tgf format to
    """
    source = input.read()

    for match in _NODE_PATTERN.finditer(source):
        node_id, label = match.group(1), match.group(2)
        output.write("%s %s\n" % (node_id, decodeHtml(label)))

    # The u prefix keeps this line writable to io text streams on Python 2.
    output.write(u"#\n")

    # Edges are searched over the whole text, not only after the last node,
    # so edges declared between node definitions are not lost.
    for match in _EDGE_PATTERN.finditer(source):
        src, dest = match.group(1), match.group(2)
        label = match.group(4) if match.group(3) else ""
        output.write("%s %s %s\n" % (src, dest, decodeHtml(label)))


def main(argv=None):
    """Command line entry point: ``INPUTFILE [OUTPUTFILE]``.

    When OUTPUTFILE is omitted, the output is written to the current
    directory under the input's base name with a ``.tgf`` extension.
    Returns the process exit status.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Usage: %s INPUTFILE [OUTPUTFILE]" % os.path.basename(argv[0]))
        return -1
    if len(argv) < 3:
        base_name = os.path.basename(argv[1])
        out_file_name = "%s.tgf" % os.path.splitext(base_name)[0]
    else:
        out_file_name = argv[2]
    # io.open gives explicit UTF-8 text streams on both Python 2 and 3, so
    # decoded non-ASCII entities can always be written.
    with io.open(argv[1], encoding="utf-8") as gml_stream, \
            io.open(out_file_name, "w", encoding="utf-8") as tgf_stream:
        transform(gml_stream, tgf_stream)
    return 0


if __name__ == "__main__":
    sys.exit(main())