regular-expression-language


regexp-gen accepts a quasi regular expression as a string and returns a list-implemented set of strings from the language associated with the regular expression.

A regular expression is a sequence of items. For parsing convenience, adjacent items have at least one space between them; for further convenience, the regular-expression operators are used in prefix form.

Items otherwise unidentified in the syntax are copied as is:

  guile> (regexp-gen "a b c")
  ("a b c")
  guile> 

An option item is a question mark followed by an item; it indicates that the item may or may not appear in strings:

  guile> (regexp-gen "a ? 1 b")
  ("a 1 b" "a b")
  guile> 

A choice item is a bar followed by two items; it indicates that exactly one of the two items may appear in strings:

  guile> (regexp-gen "a | 1 2 c")
  ("a 2 c" "a 1 c")
  guile>

A repetition item is a star followed by an item; it indicates that zero, one or two copies of the item in succession may appear in strings:

  guile> (regexp-gen "a * 1 b")
  ("a 1 b" "a 1 1 b" "a b")
  guile>

Normally repetition generates infinitely many strings when applied to a non-empty item; quasi regular expressions restrict repetition to the three shortest strings.

The list-repetition item '*,' is followed by two items; it indicates that strings may contain zero, one, or two copies of the second item with successive occurrences of the second item separated by the first item:

  guile> (regexp-gen "[ f *, , a ]")
  ("[ f a ]" "[ f a , a ]" "[ f ]")
  guile>

Parentheses group a sequence of items into a single item:

  guile> (regexp-gen "a ( b ( c ) ) d")
  ("a b c d")
  guile> (regexp-gen "a ? ( b c ) d")
  ("a b c d" "a d")
  guile> (regexp-gen "a * ( b c ) d")
  ("a b c d" "a b c b c d" "a d")
  guile> (regexp-gen "a | b ( c d )")
  ("a c d" "a b")
  guile> 

Empty parenthesis and choice make option redundant:

  guile> (regexp-gen "a | b ( )")
  ("a" "a b")
  guile>

Option and parenthesis makes list repetition redundant:

  guile> (regexp-gen "[ f ? ( a ? ( , a ) ) ]")
  ("[ f a ]" "[ f a , a ]" "[ f ]")
  guile>

I wrote this code to provide a quick and simple source of strings for regression testing a compiler I'm writing. (Actually, I wrote this code to prototype the algorithm; the code used to test the compiler is written in awk.)

 (use-modules ((srfi srfi-13) :renamer (symbol-prefix-proc 'srfi13:))) 
  
  
 (define group-token "(") 
 (define choice-token "|") 
 (define repetition-token "*") 
 (define option-token "?") 
 (define list-repetition-token "*,") 
  
  
 (define (cross-concat prefixes suffixes) 
  
   ; Return a list consisting of all strings which are a string from the first 
   ; given list followed by a string from the given second list.  Constituent 
   ; strings are separted by a space. 
  
   (apply 
      append 
      (map 
         (lambda (pfx) 
            (map 
               (lambda (sfx) 
                 (string-append 
                    pfx 
                    (if (or (string-null? pfx) (string-null? sfx)) 
                       "" 
                       " ") 
                    sfx)) 
               suffixes)) 
         prefixes))) 
  
  
 (define (do-a-regexp regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output with the language generated by the 
   ; regular expression at the head of the given regular expression. 
  
   (let 
  
      ((f (or (assoc-ref function-table (car regexp)) do-item))) 
  
      (f regexp output))) 
  
  
 (define (do-choice regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output and the language associated with the 
   ; head of the given regular expression, which is assumed to exist and be a 
   ; choice. 
  
   (regexp-head-ok regexp choice-token "choice") 
  
   (let* 
  
     ((p1 (do-a-regexp (cdr regexp) output)) 
      (p2 (do-a-regexp (car p1) output))) 
  
     (cons 
       (car p2) 
       (union (cdr p1) (cdr p2))))) 
  
  
 (define (do-group regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output and the language associated with the 
   ; head of the given regular expression, which is assumed to exist and be a 
   ; group. 
  
   (regexp-head-ok regexp group-token "group") 
  
   (let 
  
     loop ((re (cdr regexp)) (o output)) 
  
     (cond 
        ((null? re) 
            (error "missing close parens")) 
  
        ((string=? (car re) ")") 
            (cons (cdr re) o)) 
  
        (t 
            (let ((p (do-a-regexp re '("")))) 
              (loop (car p) (cross-concat o (cdr p)))))))) 
  
  
 (define (do-item regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output and the language associated with the 
   ; head of the given regular expression, which is assumed to exist and be an 
   ; item. 
  
   (cons (cdr regexp) (cross-concat output (list (car regexp))))) 
  
  
 (define (do-list-repetition regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output and the language associated with the 
   ; head of the given regular expression, which is assumed to exist and be a 
   ; list repetition. 
  
   (regexp-head-ok regexp list-repetition-token "list-repetition") 
  
   (let* 
  
      ((p1 (do-a-regexp (cdr regexp) '(""))) 
       (p2 (do-a-regexp (car p1) '(""))) 
       (rep (cdr p2)) 
       (o (cross-concat output rep))) 
  
      (cons 
         (car p2) 
         (union 
            output 
            (union 
               o 
               (cross-concat (cross-concat o (cdr p1)) rep)))))) 
  
  
 (define (do-option regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output and the language associated with the 
   ; head of the given regular expression, which is assumed to exist and be an 
   ; option. 
  
   (regexp-head-ok regexp option-token "option") 
  
   (let* 
  
      ((re (cdr regexp)) 
       (p (do-a-regexp re '("")))) 
  
      (cons 
         (car p) 
         (union 
            output 
            (cross-concat output (cdr p)))))) 
  
  
 (define (do-repetition regexp output) 
  
   ; Return a pair containing the tail of the given regular expression and the 
   ; cross-catenation of the given output and the language associated with the 
   ; head of the given regular expression, which is assumed to exist and be a 
   ; repetition. 
  
   (regexp-head-ok regexp repetition-token "repetition") 
  
   (let* 
  
      ((p (do-a-regexp (cdr regexp) '(""))) 
       (rep (cd r p)) 
       (o (cross-concat output rep))) 
  
     (cons 
         (car p) 
         (union 
            output 
            (union o (cross-concat o rep)))))) 
  
  
 (define (regexp-gen regexp) 
  
   ; Return a list of all the strings in the language associated with the given 
   ; regular expression. 
  
   (cdr 
      (do-group 
         (srfi13:string-tokenize (string-append "( " regexp " )")) 
         '("")))) 
  
  
 (define (regexp-head-ok regexp token what) 
  
   ; Do nothing if the given regular expression starts with the given token; 
   ; otherwise die with an apropriate error message from the given caller. 
  
   (if (null? regexp) 
      (error (string-append "empty regular expression passed to do-" what))) 
   (let ((hd (car regexp))) 
     (if (not (string=? hd token)) 
        (error (string-append "regular-expression head is '" hd "', not '" 
                              token "' in do-" what))))) 
  
  
 (define (union s1 s2) 
  
   ; Return a list-implemented set that's the union of the elements in the given 
   ; list-implemented sets. 
  
   (let 
  
     ((ht (make-hash-table (length s1)))) 
  
     ; load the hash table with strings from the first list. 
  
     (let 
  
       loop ((l s1)) 
  
       (cond ((not (null? l)) 
                 (hash-set! ht (car l) 1) 
                 (loop (cdr l))))) 
  
     ; Add to the first list all strings from the second list not in the hash 
     ; table. 
  
     (let 
  
        loop ((l s2) (u s1)) 
  
        (if (null? l) 
           u 
           (let ((s (car l))) 
              (loop 
                 (cdr l) 
                 (if (not (hash-ref ht s)) 
                    (cons s u) 
                    u))))))) 
  
  
 (define function-table 
   (acons group-token do-group 
   (acons choice-token do-choice 
   (acons repetition-token do-repetition 
   (acons option-token do-option 
   (acons list-repetition-token do-list-repetition '())))))) 

category-code