984ec3f15307ff6da37238eac30f5699f09442aa
[protos/libecoli.git] / lib / ecoli_tk_re_lex.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <stdbool.h>
4 #include <string.h>
5 #include <regex.h>
6 #include <errno.h>
7
8 #include <ecoli_malloc.h>
9 #include <ecoli_log.h>
10 #include <ecoli_test.h>
11 #include <ecoli_strvec.h>
12 #include <ecoli_tk.h>
13 #include <ecoli_tk_many.h>
14 #include <ecoli_tk_or.h>
15 #include <ecoli_tk_str.h>
16 #include <ecoli_tk_int.h>
17 #include <ecoli_tk_re_lex.h>
18
/*
 * One lexer rule: a regular expression together with what to do with
 * the text it matches.
 */
struct regexp_pattern {
        /* private copy of the pattern source text */
        char *pattern;
        /* compiled form of the pattern, filled in by regcomp() */
        regex_t r;
        /* when false, text matched by this rule is skipped, not emitted */
        bool keep;
};
24
25 struct ec_tk_re_lex {
26         struct ec_tk gen;
27         struct ec_tk *child;
28         struct regexp_pattern *table;
29         size_t len;
30 };
31
32 static struct ec_strvec *
33 tokenize(struct regexp_pattern *table, size_t table_len, const char *str)
34 {
35         struct ec_strvec *strvec = NULL;
36         char *dup = NULL;
37         char c;
38         size_t len, off = 0;
39         size_t i;
40         int ret;
41         regmatch_t pos;
42
43         dup = ec_strdup(str);
44         if (dup == NULL)
45                 goto fail;
46
47         strvec = ec_strvec_new();
48         if (strvec == NULL)
49                 goto fail;
50
51         len = strlen(dup);
52         while (off < len) {
53                 for (i = 0; i < table_len; i++) {
54                         ret = regexec(&table[i].r, &dup[off], 1, &pos, 0);
55                         if (ret != 0)
56                                 continue;
57                         if (pos.rm_so != 0 || pos.rm_eo == 0) {
58                                 ret = -1;
59                                 continue;
60                         }
61
62                         if (table[i].keep == 0)
63                                 break;
64
65                         c = dup[pos.rm_eo + off];
66                         dup[pos.rm_eo + off] = '\0';
67                         ec_log(EC_LOG_DEBUG, "re_lex match <%s>\n", &dup[off]);
68                         if (ec_strvec_add(strvec, &dup[off]) < 0)
69                                 goto fail;
70
71                         dup[pos.rm_eo + off] = c;
72                         break;
73                 }
74
75                 if (ret != 0)
76                         goto fail;
77
78                 off += pos.rm_eo;
79         }
80
81         ec_free(dup);
82         return strvec;
83
84 fail:
85         ec_free(dup);
86         ec_strvec_free(strvec);
87         return NULL;
88 }
89
90 static struct ec_parsed_tk *ec_tk_re_lex_parse(const struct ec_tk *gen_tk,
91         const struct ec_strvec *strvec)
92 {
93         struct ec_tk_re_lex *tk = (struct ec_tk_re_lex *)gen_tk;
94         struct ec_strvec *new_vec = NULL, *match_strvec;
95         struct ec_parsed_tk *parsed_tk = NULL, *child_parsed_tk;
96         const char *str;
97
98         parsed_tk = ec_parsed_tk_new();
99         if (parsed_tk == NULL)
100                 return NULL;
101
102         if (ec_strvec_len(strvec) == 0)
103                 return parsed_tk;
104
105         str = ec_strvec_val(strvec, 0);
106         new_vec = tokenize(tk->table, tk->len, str);
107         if (new_vec == NULL)
108                 goto fail;
109
110         printf("--------\n");
111         ec_strvec_dump(stdout, new_vec);
112         child_parsed_tk = ec_tk_parse_tokens(tk->child, new_vec);
113         if (child_parsed_tk == NULL)
114                 goto fail;
115
116         if (!ec_parsed_tk_matches(child_parsed_tk) ||
117                         ec_parsed_tk_len(child_parsed_tk) !=
118                                 ec_strvec_len(new_vec)) {
119                 ec_strvec_free(new_vec);
120                 ec_parsed_tk_free(child_parsed_tk);
121                 return parsed_tk;
122         }
123         ec_strvec_free(new_vec);
124         new_vec = NULL;
125
126         ec_parsed_tk_add_child(parsed_tk, child_parsed_tk);
127         match_strvec = ec_strvec_ndup(strvec, 0, 1);
128         if (match_strvec == NULL)
129                 goto fail;
130         ec_parsed_tk_set_match(parsed_tk, gen_tk, match_strvec);
131
132         return parsed_tk;
133
134  fail:
135         ec_strvec_free(new_vec);
136         ec_parsed_tk_free(parsed_tk);
137
138         return NULL;
139 }
140
141 static void ec_tk_re_lex_free_priv(struct ec_tk *gen_tk)
142 {
143         struct ec_tk_re_lex *tk = (struct ec_tk_re_lex *)gen_tk;
144         unsigned int i;
145
146         for (i = 0; i < tk->len; i++) {
147                 ec_free(tk->table[i].pattern);
148                 regfree(&tk->table[i].r);
149         }
150
151         ec_free(tk->table);
152         ec_tk_free(tk->child);
153 }
154
155 static struct ec_tk_type ec_tk_re_lex_type = {
156         .name = "re_lex",
157         .parse = ec_tk_re_lex_parse,
158         //.complete = ec_tk_re_lex_complete, //XXX
159         .size = sizeof(struct ec_tk_re_lex),
160         .free_priv = ec_tk_re_lex_free_priv,
161 };
162
163 EC_TK_TYPE_REGISTER(ec_tk_re_lex_type);
164
165 int ec_tk_re_lex_add(struct ec_tk *gen_tk, const char *pattern, int keep)
166 {
167         struct ec_tk_re_lex *tk = (struct ec_tk_re_lex *)gen_tk;
168         struct regexp_pattern *table;
169         int ret;
170         char *pat_dup = NULL;
171
172         ret = -ENOMEM;
173         pat_dup = ec_strdup(pattern);
174         if (pat_dup == NULL)
175                 goto fail;
176
177         ret = -ENOMEM;
178         table = ec_realloc(tk->table, sizeof(*table) * (tk->len + 1));
179         if (table == NULL)
180                 goto fail;
181
182         ret = regcomp(&table[tk->len].r, pattern, REG_EXTENDED);
183         if (ret != 0) {
184                 ec_log(EC_LOG_ERR,
185                         "Regular expression <%s> compilation failed: %d\n",
186                         pattern, ret);
187                 if (ret == REG_ESPACE)
188                         ret = -ENOMEM;
189                 else
190                         ret = -EINVAL;
191
192                 goto fail;
193         }
194
195         table[tk->len].pattern = pat_dup;
196         table[tk->len].keep = keep;
197         tk->len++;
198         tk->table = table;
199
200         return 0;
201
202 fail:
203         ec_free(pat_dup);
204         return ret;
205 }
206
207 struct ec_tk *ec_tk_re_lex(const char *id, struct ec_tk *child)
208 {
209         struct ec_tk_re_lex *tk = NULL;
210
211         if (child == NULL)
212                 return NULL;
213
214         tk = (struct ec_tk_re_lex *)__ec_tk_new(&ec_tk_re_lex_type, id);
215         if (tk == NULL) {
216                 ec_tk_free(child);
217                 return NULL;
218         }
219
220         tk->child = child;
221
222         return &tk->gen;
223 }
224
225
226 static int ec_tk_re_lex_testcase(void)
227 {
228         struct ec_tk *tk;
229         int ret = 0;
230
231         tk = ec_tk_re_lex(NULL,
232                 ec_tk_many(NULL,
233                         EC_TK_OR(NULL,
234                                 ec_tk_str(NULL, "foo"),
235                                 ec_tk_str(NULL, "bar"),
236                                 ec_tk_int(NULL, 0, 1000, 0)
237                         ), 0, 0
238                 )
239         );
240         if (tk == NULL) {
241                 ec_log(EC_LOG_ERR, "cannot create tk\n");
242                 return -1;
243         }
244
245         /* XXX add ^ automatically ? */
246         ret |= ec_tk_re_lex_add(tk, "[a-zA-Z]+", 1);
247         ret |= ec_tk_re_lex_add(tk, "[0-9]+", 1);
248         ret |= ec_tk_re_lex_add(tk, "=", 1);
249         ret |= ec_tk_re_lex_add(tk, "-", 1);
250         ret |= ec_tk_re_lex_add(tk, "\\+", 1);
251         ret |= ec_tk_re_lex_add(tk, "[  ]+", 0);
252         if (ret != 0) {
253                 ec_log(EC_LOG_ERR, "cannot add regexp to token\n");
254                 ec_tk_free(tk);
255                 return -1;
256         }
257
258         ret |= EC_TEST_CHECK_TK_PARSE(tk, 1, "  foo bar  324 bar234");
259         ret |= EC_TEST_CHECK_TK_PARSE(tk, 1, "foo bar324");
260         ret |= EC_TEST_CHECK_TK_PARSE(tk, 1, "");
261         ret |= EC_TEST_CHECK_TK_PARSE(tk, -1, "foobar");
262
263         ec_tk_free(tk);
264
265         return ret;
266 }
267
268 static struct ec_test ec_tk_re_lex_test = {
269         .name = "tk_re_lex",
270         .test = ec_tk_re_lex_testcase,
271 };
272
273 EC_TEST_REGISTER(ec_tk_re_lex_test);