import shlex

import numpy as np
import pandas as pd

# crude performance test:
#
# import timeit
# exec(open('../python/read_omnetpp.py').read())
# t = timeit.Timer(lambda: read_omnetpp('PureAlohaExperiment-numHosts=10,mean=1-#0.sca'))
# t.timeit(number=1000)


class Context:
    """Mutable parser state for one OMNeT++ result file.

    Tracks the result item currently being parsed (run, scalar, vector,
    statistic or histogram) together with the attributes, iteration
    variables, parameters, statistic fields and histogram bins collected
    for it.  flush_context() converts this state into records and resets
    the per-item collections.
    """

    # constants for 'type'
    RUN = 1
    SCALAR = 2
    VECTOR = 3
    STATISTICS = 4
    HISTOGRAM = 5

    def __init__(self):
        # BUGFIX: these used to be mutable class-level attributes, so the
        # dicts/lists were shared by every Context instance and leaked state
        # between successive read_omnetpp() calls; now created per instance.
        self.type = None            # one of the constants above, or None
        self.run = None             # run id from the current 'run' line
        self.run_id = None          # kept for backward compatibility; unused
        self.module_name = None
        self.result_name = None
        self.scalar_value = None
        self.vector_id = None
        self.vector_columns = None
        self.attrs = {}             # attributes of the current result item
        self.itervars = {}          # iteration variables of the current run
        self.params = {}            # parameter assignments of the current run
        self.fields = {}            # statistic fields (count, mean, ...)
        self.binedges = []          # histogram bin edges (string tokens)
        self.binvalues = []         # histogram bin values (string tokens)
        self.veccols = {}           # maps vector id to either "TV" or "ETV"
        # these map vector ids to lists of raw string tokens
        self.vecevents = {}
        self.vectimes = {}
        self.vecvalues = {}


def read_omnetpp(filename):
    """Parse an OMNeT++ result file (.sca/.vec, file format version 2).

    Returns a pandas DataFrame with one row per record: run attributes,
    itervars, params, scalars, vectors, statistics, histograms, and
    result-item attributes.  Columns that end up entirely empty are dropped.
    Values are kept as the original string tokens (except statistic fields,
    which are converted to float).

    Performance notes:
    (1) most CPU cycles are burnt in splitting the line to tokens (you can
        verify this by strategically placing 'continue' statements below)
    (2) creating an empty DataFrame (such as at the end of this method,
        with records=[]) takes surprisingly long time
    """
    records = []
    ctx = Context()
    with open(filename) as f:
        for line in f:
            line = line.lstrip()
            # continue  # (uncomment for profiling)
            if len(line) == 0 or line[0] == '#' or line.isspace():
                continue

            # tokenize the line. Note: this line is THE performance bottleneck.
            # shlex.split() is extra slow, but even str.split() doesn't cut it
            tokens = line.split() if line.find('"') == -1 else shlex.split(line)
            # continue  # (uncomment for profiling)

            # process the line; the order of the checks is important for performance
            line_type = tokens[0]
            if line_type.isdigit():
                # looks like a vector data line: "<vectorid> [<event#>] <time> <value>"
                vector_id = tokens[0]
                if ctx.veccols[vector_id] == "TV":
                    ctx.vectimes[vector_id].append(tokens[1])
                    ctx.vecvalues[vector_id].append(tokens[2])
                else:  # ETV
                    # ctx.vecevents[vector_id].append(tokens[1])
                    ctx.vectimes[vector_id].append(tokens[2])
                    ctx.vecvalues[vector_id].append(tokens[3])
            elif line_type == 'attr':
                # syntax: "attr <name> <value>"
                assert ctx.type is not None, "stray 'attr' line"
                assert len(tokens) == 3, "incorrect 'attr' line -- attr <name> <value> expected"
                ctx.attrs[tokens[1]] = tokens[2]
            elif line_type == 'scalar':
                flush_context(ctx, records)
                # syntax: "scalar <module> <name> <value>"
                assert ctx.type is not None, "stray 'scalar' line, must be under a 'run'"
                assert len(tokens) == 4, "incorrect 'scalar' line -- scalar <module> <name> <value> expected"
                ctx.type = Context.SCALAR
                ctx.module_name = tokens[1]
                ctx.result_name = tokens[2]
                ctx.scalar_value = tokens[3]
            elif line_type == 'run':
                # flush last result item in previous run
                flush_context(ctx, records)
                # syntax: "run <runid>"
                assert len(tokens) == 2, "incorrect 'run' line -- run <runid> expected"
                ctx.type = Context.RUN
                ctx.run = tokens[1]
            elif line_type == 'vector':
                flush_context(ctx, records)
                # syntax: "vector <id> <module> <name> [<columns>]"
                assert ctx.type is not None, "stray 'vector' line, must be under a 'run'"
                assert len(tokens) in (4, 5), \
                    "incorrect 'vector' line -- vector <id> <module> <name> [<columns>] expected"
                ctx.type = Context.VECTOR
                ctx.vector_id = tokens[1]
                ctx.module_name = tokens[2]
                ctx.result_name = tokens[3]
                # a numeric 5th token is treated as a count, not a column spec
                ctx.vector_columns = "TV" if (len(tokens) < 5 or tokens[4].isnumeric()) else tokens[4]
                if ctx.vector_columns != "TV" and ctx.vector_columns != "ETV":
                    raise RuntimeError("unsupported vector column specification: '" + ctx.vector_columns
                                       + "' -- only 'TV' and 'ETV' are implemented")
                ctx.veccols[ctx.vector_id] = ctx.vector_columns
                ctx.vecevents[ctx.vector_id] = []
                ctx.vectimes[ctx.vector_id] = []
                ctx.vecvalues[ctx.vector_id] = []
            elif line_type == 'statistic':
                flush_context(ctx, records)
                # syntax: "statistic <module> <name>"
                assert ctx.type is not None, "stray 'statistic' line, must be under a 'run'"
                assert len(tokens) == 3, "incorrect 'statistic' line -- statistic <module> <name> expected"
                ctx.type = Context.STATISTICS
                ctx.module_name = tokens[1]
                ctx.result_name = tokens[2]
            elif line_type == 'field':
                # syntax: "field <name> <value>"
                assert ctx.type == Context.STATISTICS, "stray 'field' line, must be under a 'statistic'"
                assert len(tokens) == 3, "incorrect 'field' line -- field <name> <value> expected"
                assert tokens[1] in ['count', 'mean', 'sumweights', 'min', 'max', 'stddev', 'sum', 'sqrsum'], \
                    "field name '" + tokens[1] + "' not accepted"  # TODO weighted fields
                ctx.fields[tokens[1]] = float(tokens[2])
            elif line_type == 'bin':
                # the first 'bin' line turns the current statistic into a histogram
                if ctx.type == Context.STATISTICS:
                    ctx.type = Context.HISTOGRAM
                if ctx.type != Context.HISTOGRAM:
                    raise RuntimeError("stray 'bin' line, must be under a 'histogram'")
                assert len(tokens) == 3, "incorrect 'bin' line -- bin <lower_edge> <value> expected"
                ctx.binedges.append(tokens[1])
                ctx.binvalues.append(tokens[2])
            elif line_type == 'param':
                # syntax: "param <name> <value>"
                assert ctx.type == Context.RUN, "stray 'param' line, must be under a 'run' line"
                assert len(tokens) == 3, "incorrect 'param' line -- param <name> <value> expected"
                ctx.params[tokens[1]] = tokens[2]
            elif line_type == 'itervar':
                # syntax: "itervar <name> <value>"
                assert ctx.type == Context.RUN, "stray 'itervar' line, must be under a 'run' line"
                assert len(tokens) == 3, "incorrect 'itervar' line -- itervar <name> <value> expected"
                ctx.itervars[tokens[1]] = tokens[2]
            elif line_type == 'version':
                assert len(tokens) == 2, "incorrect 'version' line -- version <number> expected"
                assert tokens[1] == '2', "unsupported file version -- version 2 expected"
            else:
                raise RuntimeError('unrecognized line type: ' + line_type)

    # BUGFIX: flush the last result item of the file; previously no flush
    # happened after the loop, so the final scalar/vector/statistic/histogram
    # was silently dropped from the output
    flush_context(ctx, records)

    dataframe = pd.DataFrame(data=records,
                             columns=['run', 'type', 'module', 'name', 'attrname', 'value',
                                      'count', 'sumweights', 'mean', 'stddev', 'min', 'max',
                                      'vectime', 'vecvalue', 'binedges', 'binvalues'])
    dataframe = dataframe.dropna(axis=1, how='all')
    return dataframe


def _flush_attr_records(ctx, records):
    # one 'attr' record per attribute of the current (non-run) result item
    for key, value in ctx.attrs.items():
        records.append({'run': ctx.run, 'type': 'attr', 'module': ctx.module_name,
                        'name': ctx.result_name, 'attrname': key, 'value': value})


def flush_context(ctx, records):
    """Convert the result item accumulated in ctx into records.

    Appends one record per result item (plus one record per attribute,
    itervar and param) to 'records', then resets the per-item collections
    in ctx so parsing of the next item can begin.  Does nothing when
    ctx.type is None.
    """
    if ctx.type == Context.RUN:
        for key, value in ctx.attrs.items():
            records.append({'run': ctx.run, 'type': 'runattr', 'attrname': key, 'value': value})
        for key, value in ctx.itervars.items():
            records.append({'run': ctx.run, 'type': 'itervar', 'attrname': key, 'value': value})
        for key, value in ctx.params.items():
            records.append({'run': ctx.run, 'type': 'param', 'attrname': key, 'value': value})
    if ctx.type == Context.SCALAR:
        records.append({'run': ctx.run, 'type': 'scalar', 'module': ctx.module_name,
                        'name': ctx.result_name, 'value': ctx.scalar_value})
        _flush_attr_records(ctx, records)
    if ctx.type == Context.VECTOR:
        records.append({'run': ctx.run, 'type': 'vector', 'module': ctx.module_name,
                        'name': ctx.result_name,
                        'vectime': ctx.vectimes[ctx.vector_id],
                        'vecvalue': ctx.vecvalues[ctx.vector_id]})
        _flush_attr_records(ctx, records)
    if ctx.type == Context.STATISTICS:
        records.append({'run': ctx.run, 'type': 'statistic', 'module': ctx.module_name,
                        'name': ctx.result_name, **ctx.fields})
        _flush_attr_records(ctx, records)
    if ctx.type == Context.HISTOGRAM:
        # NOTE: bin edges/values are string tokens, so np.array() yields a
        # string dtype; use .astype(float) downstream if numeric bins are needed
        records.append({'run': ctx.run, 'type': 'histogram', 'module': ctx.module_name,
                        'name': ctx.result_name, **ctx.fields,
                        'binedges': np.array(ctx.binedges),
                        'binvalues': np.array(ctx.binvalues)})
        _flush_attr_records(ctx, records)
    ctx.attrs = {}
    ctx.itervars = {}
    ctx.params = {}
    ctx.fields = {}
    ctx.binedges = []
    ctx.binvalues = []