enormous file words

Write a function that accepts a file of words and returns a list of the words used in the file. Assume the file is many times larger than available primary store.

 $ cat dfw.rkt 
 #lang racket 
 ; This is a multi-pass algorithm.  Each pass is the same and 
 ; consists of two phases. 
 ; This code uses lists to represent files.  Inserting a word 
 ; into primary store may probabilistically fail to represent 
 ; no more space available. 
 (define (distinct-file-words file) 
   (define (phase-1 input-file output-file) 
     ; phase 1 - read as many words from the input file as 
     ; possible.  Once primary store is full, it contains a 
     ; list of distinct words from some prefix of the input 
     ; file. 
     (let loop ((word-set (new-word-set)) (input-file input-file)) 
       (if (null? input-file) 
         (word-set 'copy output-file) 
         (let ((word (car input-file))) 
           (if (word-set 'add word) 
             (loop word-set (cdr input-file)) 
               word-set input-file (word-set 'copy output-file))))))) 
   (define (phase-2 word-set input-file output-file) 
     ; phase 2 - filter the words remaining in the input file 
     ; through the words in primary store to remove 
     ; duplicates, write the surviving words to a new input 
     ; file for the next pass. 
         (lambda (word filtered-file) 
           (if (word-set 'contains word) 
             (cons word filtered-file))) 
         '() input-file) 
   (phase-1 file '())) 
 (define (new-word-set) 
     ((words '())) 
     (lambda (cmd arg) 
       (case (list 'quote cmd) 
         ; Flip a biased coin.  If it comes up heads or if 
         ; the set is empty, add the given word to the set and 
         ; return true.  If the biased coin comes up tails, 
         ; pretend primary store is full, do nothing to the 
         ; word set and return false. 
              ((null? words) 
                 (set! words (cons arg words)) 
              ((member arg words) 
              ((< (random 10) 8) 
                 (set! words (cons arg words)) 
         ; Return true iff the given word is in the word set. 
            (member arg words)) 
         ; Add the words in the word set to the given list; 
         ; return the augmented list. 
            (append arg words)) 
         ; oops 
           (raise-user-error 'new-word-set 
             "Don't recognize ~a as a command" cmd)))))) 
 (require rackunit) 
 (check-eq? (distinct-file-words '()) '()) 
 (check-equal? (distinct-file-words '("a")) '("a")) 
 (check-equal? (distinct-file-words '("a" "a" "a")) '("a")) 
 (let ((l (range 1 2000))) 
   (check-equal? (sort (distinct-file-words l) <) l) 
   (check-equal? (sort (distinct-file-words (append l l)) <) l)) 
 $ mzscheme dfw.rkt