Write a function that accepts a file of words and returns a list of the words used in the file. Assume the file is many times larger than available primary store.
$ cat dfw.rkt #lang racket ; This is a multi-pass algorithm. Each pass is the same and ; consists of two phases. ; ; This code uses lists to represent files. Inserting a word ; into primary store may probabilistically fail to represent ; no more space available. (define (distinct-file-words file) (define (phase-1 input-file output-file) ; phase 1 - read as many words from the input file as ; possible. Once primary store is full, it contains a ; list of distinct words from some prefix of the input ; file. (let loop ((word-set (new-word-set)) (input-file input-file)) (if (null? input-file) (word-set 'copy output-file) (let ((word (car input-file))) (if (word-set 'add word) (loop word-set (cdr input-file)) (phase-2 word-set input-file (word-set 'copy output-file))))))) (define (phase-2 word-set input-file output-file) ; phase 2 - filter the words remaining in the input file ; through the words in primary store to remove ; duplicates, write the surviving words to a new input ; file for the next pass. (phase-1 (foldl (lambda (word filtered-file) (if (word-set 'contains word) filtered-file (cons word filtered-file))) '() input-file) output-file)) (phase-1 file '())) (define (new-word-set) (let ((words '())) (lambda (cmd arg) (case (list 'quote cmd) ; Flip a biased coin. If it comes up heads or if ; the set is empty, add the given word to the set and ; return true. If the biased coin comes up tails, ; pretend primary store is full, do nothing to the ; word set and return false. (('add) (cond ((null? words) (set! words (cons arg words)) #t) ((member arg words) #t) ((< (random 10) 8) (set! words (cons arg words)) #t) (#t #f))) ; Return true iff the given word is in the word set. (('contains) (member arg words)) ; Add the words in the word set to the given list; ; return the augmented list. (('copy) (append arg words)) ; oops (else (raise-user-error 'new-word-set "Don't recognize ~a as a command" cmd)))))) (require rackunit) (check-eq? (distinct-file-words '()) '()) (check-equal? (distinct-file-words '("a")) '("a")) (check-equal? (distinct-file-words '("a" "a" "a")) '("a")) (let ((l (range 1 2000))) (check-equal? (sort (distinct-file-words l) <) l) (check-equal? (sort (distinct-file-words (append l l)) <) l)) $ mzscheme dfw.rkt $