initial revision of pycol
[pycol.git] / expparser.py
1 #!/usr/bin/python3
2
3 # todo
4 # - better checks before starting to parse (ex: no double spaces ?)
5 # - better error messages
6 # - document expressions
7 # - unit tests
8 # - reduce binary tree in tree
9 # - handle quotes?
10 # - better comments and names (operator is valid for '(' ?)
11 # - ignore chars
12 # - pylint
13 # - tests only if main
14 # - "not" operator? useful?
15 # - we could completely remove all_ops
16
17 DEBUG = False
18
19 def debug(*args, **kvargs):
20     if DEBUG:
21         print(*args, **kvargs)
22
23 class PycolExprParser(object):
24     def __init__(self, binary_ops = None, unary_ops = None,
25                  group_ops = None, var_chars = None):
26         if binary_ops == None:
27             # ordered by priority
28             self.binary_ops = [ "|", "/", " " ]
29         else:
30             self.binary_ops = binary_ops
31         if unary_ops == None:
32             self.unary_ops = [ ]
33         else:
34             self.unary_ops = unary_ops
35         if group_ops == None:
36             self.group_ops = { "(" : ")", "[" : "]" }
37         else:
38             self.group_ops = group_ops
39         if var_chars == None:
40             self.var_chars = "<>-_abcdefghijklmnopqrstuvwxyz" + \
41                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
42         else:
43             self.var_chars = var_chars
44
45         self.all_ops = []
46         self.all_ops += self.binary_ops
47         self.all_ops += self.unary_ops
48         self.all_ops += self.group_ops.keys()
49         self.all_ops += self.group_ops.values()
50
51     # return operator priority (high value = high priority)
52     def op_prio(self, op):
53         assert(isinstance(op, str))
54         if not op in self.binary_ops:
55             return 0
56         return self.binary_ops.index(op) + 1
57
58     # The argument tokens is an list of tokens that must start with an
59     # opening group operator, for instance an opening bracket '('. This
60     # function returns the list of tokens representing the inbound
61     # expression without the first opening operator and its corresponding
62     # closing operator
63     def get_group(self, tokens):
64         debug(tokens, self.group_ops.keys())
65         assert(tokens[0] in self.group_ops.keys())
66         stack = []
67
68         for i, t in enumerate(tokens):
69             debug((i,t))
70             if t in self.group_ops.keys():
71                 stack.append(t)
72             elif t == self.group_ops[stack[-1]]:
73                 stack.pop()
74                 if len(stack) == 0:
75                     break
76         assert(len(stack) == 0)
77         debug(tokens[1:i])
78         return tokens[1:i]
79
80     # convert a string into a list of tokens
81     def tokenize(self, s):
82         debug("tokenize(%s)"%(s))
83         tokens = []
84         while s != "":
85             # try to parse an operator
86             op_found = False
87             for op in self.all_ops:
88                 if s.startswith(op):
89                     tokens.append(op)
90                     s = s[len(op):]
91                     op_found = True
92                     break
93             if op_found:
94                 continue
95             # try to parse a variable name
96             length = 0
97             for c in s:
98                 if c in self.var_chars:
99                     length += 1
100                 else:
101                     break
102             if length == 0:
103                 debug(tokens)
104                 debug("invalid token")
105                 excp()
106             tokens.append(s[:length])
107             s = s[length:]
108         debug("tokenize() return %s"%(tokens))
109         return tokens
110
111     def binary_parse(self, tokens):
112         top = None
113         exp = None
114         debug(tokens)
115         while len(tokens) > 0:
116             t = tokens.pop(0)
117             exp = PycolBinExpr(t, self)
118             debug("token '%s'"%t)
119
120             if t in self.binary_ops:
121                 # if it is the first node, error
122                 if top == None:
123                     debug("expression starts with a binary operator")
124                     exce()
125
126                 # the rightest leaf must be a variable
127                 tmp = top
128                 while tmp.right != None:
129                     tmp = tmp.right
130                 if not tmp.is_var():
131                     debug("the rightest leaf must be a variable")
132                     exce()
133
134                 # if new node priority is higher than root node, it
135                 # becomes the new root
136                 if self.op_prio(t) > self.op_prio(top.op):
137                     exp.left = top
138                     top = exp
139                 # else, exp is placed at the "rightest" leaf. Check
140                 # that blabla XXX
141                 else:
142                     tmp = top
143                     while tmp.right != None and \
144                           self.op_prio(exp.op) < self.op_prio(tmp.right.op):
145                         tmp = tmp.right
146
147                     if tmp.right == None:
148                         debug("invalid right node")
149                         exce()
150
151                     exp.left = tmp.right
152                     tmp.right = exp
153
154             elif t in self.group_ops:
155                 # if operator is an opening bracket, recursively call
156                 # ourself with the sub-expression
157                 gr = self.get_group([t] + tokens)
158                 exp.right = self.binary_parse(gr[:])
159                 if exp.right == None:
160                     exce()
161                 tokens = tokens[len(gr) + 1:]
162                 debug("subexp parsed, tokens=%s"%tokens)
163
164                 # if it is the first node
165                 if top == None:
166                     top = exp;
167                     continue
168
169                 # exp is placed at the "rightest" leaf. Check
170                 # that our parent is not a variable
171                 tmp = top
172                 while tmp.right != None:
173                     tmp = tmp.right
174                 if tmp.is_var():
175                     debug("the rightest leaf must not be a variable")
176                     exce()
177
178                 tmp.right = exp
179
180             else: # unary operator or variable
181                 debug("unary operator or variable")
182
183                 # if it is the first node
184                 if top == None:
185                     top = exp;
186                     continue
187
188                 # exp is placed at the "rightest" leaf. Check
189                 # that our parent is not a variable
190                 tmp = top
191                 while tmp.right != None:
192                     tmp = tmp.right
193                 if tmp.is_var():
194                     debug("the rightest leaf must not be a variable")
195                     exce()
196
197                 tmp.right = exp
198
199         if top == None:
200             debug("no top node")
201             exce()
202
203         # the rightest leaf must be a variable
204         tmp = top
205         while tmp.right != None:
206             tmp = tmp.right
207         if not tmp.is_var():
208             debug("the rightest leaf must be a variable")
209             exce()
210
211         return top
212
213     def parse(self, tokens):
214         bin_expr = self.binary_parse(tokens)
215         expr = PycolExpr(bin_expr)
216         expr.reduce()
217         return expr
218
219 class PycolBinExpr(object):
220     def __init__(self, op, parser):
221         debug("PycolBinExpr(%s)"%op)
222         self.op = op
223         self.parser = parser
224         self.left = None
225         self.right = None
226
227     def dump(self, level=0):
228         print("vertice %d %s@0x%x"%(level, self.op, self.__hash__()))
229         if self.left:
230             self.left.dump(level+1)
231         if self.right:
232             self.right.dump(level+1)
233
234     def to_str(self):
235         # it's a variable name
236         if self.is_var():
237             return self.op
238         s = ""
239         # dump left expression if it's a binary operator (in case of
240         # unary operator, we only use the right child)
241         if self.op in self.parser.binary_ops:
242             return "%s%s%s"%(self.left.to_str(), self.op, self.right.to_str())
243
244         # group operator
245         if self.op in self.parser.group_ops.keys():
246             return "%s%s%s"%(self.op, self.right.to_str(),
247                              self.parser.group_ops[self.op])
248
249         # unary
250         return "%s%s"%(self.op, self.right)
251
252     def is_var(self):
253         return not self.op in self.parser.all_ops
254
255 class PycolExpr(object):
256     def __init__(self, bin_expr):
257         self.op = bin_expr.op
258         self.parser = bin_expr.parser
259         self.children = []
260         if bin_expr.left != None:
261             self.children.append(PycolExpr(bin_expr.left))
262         if bin_expr.right != None:
263             self.children.append(PycolExpr(bin_expr.right))
264
265     def dump(self, level=0):
266         print("vertice %d %s@0x%x"%(level, self.op, self.__hash__()))
267         for c in self.children:
268             c.dump(level+1)
269
270     def to_str(self):
271         # it's a variable name
272         if self.is_var():
273             return self.op
274         s = ""
275
276         # binary operators
277         if self.op in self.parser.binary_ops:
278             for i, c in enumerate(self.children):
279                 if i == 0:
280                     s += "%s"%(c.to_str())
281                 else:
282                     s += "%s%s"%(self.op, c.to_str())
283             return s
284
285         # group operator
286         if self.op in self.parser.group_ops.keys():
287             return "%s%s%s"%(self.op, self.children[0].to_str(),
288                              self.parser.group_ops[self.op])
289
290         # unary
291         return "%s%s"%(self.op, self.children[0])
292
293     def is_var(self):
294         return not self.op in self.parser.all_ops
295
296     def reduce(self):
297         for c in self.children:
298             c.reduce()
299         children = []
300         for c in self.children:
301             if c.op == self.op:
302                 children += c.children
303             else:
304                 children += [c]
305         self.children = children
306
307 if __name__ == '__main__':
308     s = "[opt1/opt2] (<desdes>|a|(b|c)|[x]) (no bidule)|coin"
309     #s = "x|y"
310     #tokenize(s)
311
312     parser = PycolExprParser()
313     tokens = parser.tokenize(s)
314     e = parser.parse(tokens)
315     print(e.dump())
316     print(e.to_str())
317     e.reduce()
318     print(e.dump())
319     print(e.to_str())
320
321     #print(get_group(tokenize("(fr fr (fr<3>)(xe)) sss")))
322     #print(get_group(tokenize("<(fr fr> (fr<3>)(xe))> sss")))
323