enormous file words


Write a function that accepts a file of words and returns a list of the words used in the file. Assume the file is many times larger than available primary store.

 $ cat dfw.rkt 
 #lang racket 
  
 ; This is a multi-pass algorithm.  Each pass is the same and 
 ; consists of two phases. 
 ; 
 ; This code uses lists to represent files.  Inserting a word 
 ; into primary store may probabilistically fail to represent 
 ; no more space available. 
  
 (define (distinct-file-words file) 
  
   (define (phase-1 input-file output-file) 
  
     ; phase 1 - read as many words from the input file as 
     ; possible.  Once primary store is full, it contains a 
     ; list of distinct words from some prefix of the input 
     ; file. 
  
     (let loop ((word-set (new-word-set)) (input-file input-file)) 
       (if (null? input-file) 
         (word-set 'copy output-file) 
         (let ((word (car input-file))) 
           (if (word-set 'add word) 
             (loop word-set (cdr input-file)) 
             (phase-2 
               word-set input-file (word-set 'copy output-file))))))) 
  
  
   (define (phase-2 word-set input-file output-file) 
  
     ; phase 2 - filter the words remaining in the input file 
     ; through the words in primary store to remove 
     ; duplicates, write the surviving words to a new input 
     ; file for the next pass. 
  
     (phase-1  
       (foldl 
         (lambda (word filtered-file) 
           (if (word-set 'contains word) 
             filtered-file 
             (cons word filtered-file))) 
         '() input-file) 
       output-file)) 
    
   (phase-1 file '())) 
  
  
 (define (new-word-set) 
  
   (let 
  
     ((words '())) 
  
     (lambda (cmd arg) 
       (case (list 'quote cmd) 
  
         ; Flip a biased coin.  If it comes up heads or if 
         ; the set is empty, add the given word to the set and 
         ; return true.  If the biased coin comes up tails, 
         ; pretend primary store is full, do nothing to the 
         ; word set and return false. 
  
         (('add) 
            (cond 
              ((null? words) 
                 (set! words (cons arg words)) 
                 #t) 
              ((member arg words) 
                 #t) 
              ((< (random 10) 8) 
                 (set! words (cons arg words)) 
                 #t) 
              (#t 
                 #f))) 
  
         ; Return true iff the given word is in the word set. 
          
         (('contains) 
            (member arg words)) 
  
  
         ; Add the words in the word set to the given list; 
         ; return the augmented list. 
  
         (('copy) 
            (append arg words)) 
  
  
         ; oops 
          
         (else 
           (raise-user-error 'new-word-set 
             "Don't recognize ~a as a command" cmd)))))) 
  
  
 (require rackunit) 
  
 (check-eq? (distinct-file-words '()) '()) 
 (check-equal? (distinct-file-words '("a")) '("a")) 
 (check-equal? (distinct-file-words '("a" "a" "a")) '("a")) 
 (let ((l (range 1 2000))) 
   (check-equal? (sort (distinct-file-words l) <) l) 
   (check-equal? (sort (distinct-file-words (append l l)) <) l)) 
  
 $ mzscheme dfw.rkt 
  
 $