regexp-gen accepts a quasi regular expression as a string and returns a list-implemented set of strings from the language associated with the regular expression.
A regular expression is a sequence of items. For parsing convenience, adjacent items have at least one space between them; for further convenience, the regular-expression operators are used in prefix form.
Items otherwise unidentified in the syntax are copied as is:
guile> (regexp-gen "a b c") ("a b c") guile>
An option item is a question mark followed by an item; it indicates that the item may or may not appear in strings:
guile> (regexp-gen "a ? 1 b") ("a 1 b" "a b") guile>
A choice item is a bar followed by two items; it indicates that exactly one of the two items may appear in strings:
guile> (regexp-gen "a | 1 2 c") ("a 2 c" "a 1 c") guile>
A repetition item is a star followed by an item; it indicates that zero, one or two copies of the item in succession may appear in strings:
guile> (regexp-gen "a * 1 b") ("a 1 b" "a 1 1 b" "a b") guile>
Normally repetition generates infinitely many strings when applied to a non-empty item; quasi regular expressions restrict repetition to the three shortest strings.
The list-repetition item '*,' is followed by two items; it indicates that strings may contain zero, one, or two copies of the second item with successive occurrences of the second item separated by the first item:
guile> (regexp-gen "[ f *, , a ]") ("[ f a ]" "[ f a , a ]" "[ f ]") guile>
Parentheses group a sequence of items into a single item:
guile> (regexp-gen "a ( b ( c ) ) d") ("a b c d") guile> (regexp-gen "a ? ( b c ) d") ("a b c d" "a d") guile> (regexp-gen "a * ( b c ) d") ("a b c d" "a b c b c d" "a d") guile> (regexp-gen "a | b ( c d )") ("a c d" "a b") guile>
Empty parenthesis and choice make option redundant:
guile> (regexp-gen "a | b ( )") ("a" "a b") guile>
Option and parenthesis makes list repetition redundant:
guile> (regexp-gen "[ f ? ( a ? ( , a ) ) ]") ("[ f a ]" "[ f a , a ]" "[ f ]") guile>
I wrote this code to provide a quick and simple source of strings for regression testing a compiler I'm writing. (Actually, I wrote this code to prototype the algorithm; the code used to test the compiler is written in awk.)
(use-modules ((srfi srfi-13) :renamer (symbol-prefix-proc 'srfi13:))) (define group-token "(") (define choice-token "|") (define repetition-token "*") (define option-token "?") (define list-repetition-token "*,") (define (cross-concat prefixes suffixes) ; Return a list consisting of all strings which are a string from the first ; given list followed by a string from the given second list. Constituent ; strings are separted by a space. (apply append (map (lambda (pfx) (map (lambda (sfx) (string-append pfx (if (or (string-null? pfx) (string-null? sfx)) "" " ") sfx)) suffixes)) prefixes))) (define (do-a-regexp regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output with the language generated by the ; regular expression at the head of the given regular expression. (let ((f (or (assoc-ref function-table (car regexp)) do-item))) (f regexp output))) (define (do-choice regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output and the language associated with the ; head of the given regular expression, which is assumed to exist and be a ; choice. (regexp-head-ok regexp choice-token "choice") (let* ((p1 (do-a-regexp (cdr regexp) output)) (p2 (do-a-regexp (car p1) output))) (cons (car p2) (union (cdr p1) (cdr p2))))) (define (do-group regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output and the language associated with the ; head of the given regular expression, which is assumed to exist and be a ; group. (regexp-head-ok regexp group-token "group") (let loop ((re (cdr regexp)) (o output)) (cond ((null? re) (error "missing close parens")) ((string=? (car re) ")") (cons (cdr re) o)) (t (let ((p (do-a-regexp re '("")))) (loop (car p) (cross-concat o (cdr p)))))))) (define (do-item regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output and the language associated with the ; head of the given regular expression, which is assumed to exist and be an ; item. (cons (cdr regexp) (cross-concat output (list (car regexp))))) (define (do-list-repetition regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output and the language associated with the ; head of the given regular expression, which is assumed to exist and be a ; list repetition. (regexp-head-ok regexp list-repetition-token "list-repetition") (let* ((p1 (do-a-regexp (cdr regexp) '(""))) (p2 (do-a-regexp (car p1) '(""))) (rep (cdr p2)) (o (cross-concat output rep))) (cons (car p2) (union output (union o (cross-concat (cross-concat o (cdr p1)) rep)))))) (define (do-option regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output and the language associated with the ; head of the given regular expression, which is assumed to exist and be an ; option. (regexp-head-ok regexp option-token "option") (let* ((re (cdr regexp)) (p (do-a-regexp re '("")))) (cons (car p) (union output (cross-concat output (cdr p)))))) (define (do-repetition regexp output) ; Return a pair containing the tail of the given regular expression and the ; cross-catenation of the given output and the language associated with the ; head of the given regular expression, which is assumed to exist and be a ; repetition. (regexp-head-ok regexp repetition-token "repetition") (let* ((p (do-a-regexp (cdr regexp) '(""))) (rep (cd r p)) (o (cross-concat output rep))) (cons (car p) (union output (union o (cross-concat o rep)))))) (define (regexp-gen regexp) ; Return a list of all the strings in the language associated with the given ; regular expression. (cdr (do-group (srfi13:string-tokenize (string-append "( " regexp " )")) '("")))) (define (regexp-head-ok regexp token what) ; Do nothing if the given regular expression starts with the given token; ; otherwise die with an apropriate error message from the given caller. (if (null? regexp) (error (string-append "empty regular expression passed to do-" what))) (let ((hd (car regexp))) (if (not (string=? hd token)) (error (string-append "regular-expression head is '" hd "', not '" token "' in do-" what))))) (define (union s1 s2) ; Return a list-implemented set that's the union of the elements in the given ; list-implemented sets. (let ((ht (make-hash-table (length s1)))) ; load the hash table with strings from the first list. (let loop ((l s1)) (cond ((not (null? l)) (hash-set! ht (car l) 1) (loop (cdr l))))) ; Add to the first list all strings from the second list not in the hash ; table. (let loop ((l s2) (u s1)) (if (null? l) u (let ((s (car l))) (loop (cdr l) (if (not (hash-ref ht s)) (cons s u) u))))))) (define function-table (acons group-token do-group (acons choice-token do-choice (acons repetition-token do-repetition (acons option-token do-option (acons list-repetition-token do-list-repetition '()))))))