Find the n most frequently occurring objects in an object stream a couple of ways.
The choice of uniformly distributed random data is dumb for this problem. For example, it can cause the Misra-Gries algorithm to return fewer than the n most frequently occurring objects.
$ cat tn.scm (load-from-path "utils") (define (data-counts n) ; Return an item-frequency counter capable of keeping track of ; n items. (let* ((data (make-hash-table n)) (data-count 0) (pick-entry (lambda () ; Return an arbitrary entry in the table as a (key ; . value) pair. Assumes the table is not empty. (catch 'out (lambda () (hash-fold (lambda (key val acc) (throw 'out (cons key val))) '() data) (error "item set shouldn't be empty, but it is")) (lambda (tag p) p)))) (minimum-entry (lambda () ; Return a (key . value) pair representing an entry with ; the minimum count. Assumes the table is not empty. (let ((f (lambda (key val min) (if (< val (cdr min)) (cons key val) min)))) (hash-fold f (pick-entry) data))))) (lambda (cmd . args) (cond ; Add the given item to the table with a count of 1. ; Assumes the item is not in the table. ((eq? 'add-element cmd) (if (= data-count n) (error "adding an item to a full table")) (hash-set! data (car args) 1) (set! data-count (+ data-count 1))) ; Remove a item with the lowest count. Assumes the ; table's not empty. ((eq? 'delete-minimum cmd) (hash-remove! data (car (minimum-entry))) (set! data-count (- data-count 1))) ; If the given item is not in the table, return false, ; otherwise increase the item's count by one and return ; true. ((eq? 'increase-if-included cmd) (let* ((key (car args)) (val (hash-ref data key))) (when val (hash-set! data key (+ val 1))) val)) ; If the given item is not in the table, return false, ; otherwise increase the item's count by one and return ; true. ((eq? 'print cmd) (let* ((f (lambda (k v acc) (cons (cons k (car acc)) (cons v (cdr acc))))) (p (lambda (lst) (let loop ((l lst)) (unless (null? l) (format #t "~4d" (car l)) (loop (cdr l)))) (display #\newline))) (ht (hash-fold f (cons '() '()) data))) (p (car ht)) (p (cdr ht)))) ; Reduce the item counts by one, throwing away items that ; have non-positive counts. Return true iff items were ; thrown away. ((eq? 'reduce-counts cmd) (let ((old-data-count data-count) (f (lambda (key val acc) (when (> val 1) (hash-set! acc key (- val 1)) (set! data-count (+ data-count 1))) acc))) (set! data-count 0) (set! data (hash-fold f (make-hash-table n) data)) (< data-count old-data-count))) ; Return true iff there's space for more items ((eq? 'space-available cmd) (< data-count n)) (#t (error "unexpected data-counts command: " cmd)))))) (define (maa-top-n data counts) (define (place e) (unless (counts 'increase-if-included e) (unless (counts 'space-available) (counts 'delete-minimum)) (counts 'add-element e))) (for-each place data) counts) (define (mg-top-n data counts) (define (place e) (unless (counts 'increase-if-included e) (when (or (counts 'space-available) (counts 'reduce-counts)) (counts 'add-element e)))) (for-each place data) counts) (define (test-data n t c) (define (mvec n t) (let loopo ((k 0) (lo '())) (if (>= k t) (vector->list (randomly-permute-vector (list->vector lo))) (let loopm ((i 0) (lm lo)) (if (>= i n) (loopo (+ k 1) lm) (let loopi ((j 0) (li lm)) (if (> j i) (loopm (+ i 1) li) (loopi (+ j 1) (cons i li))))))))) (let ((l (mvec n t))) ((mg-top-n l (data-counts c)) 'print) (display #\newline) ((maa-top-n l (data-counts c)) 'print))) $ guile scheme@(guile-user)> (load "tn.scm") scheme@(guile-user)> (test-data 10 1 10) 9 8 7 6 5 4 3 2 1 0 10 9 8 7 6 5 4 3 2 1 9 8 7 6 5 4 3 2 1 0 10 9 8 7 6 5 4 3 2 1 scheme@(guile-user)> (test-data 10 1 9) 9 8 7 6 5 4 3 2 1 8 7 6 5 4 3 2 1 1 9 8 7 6 5 4 3 2 1 10 9 8 7 6 5 4 3 1 scheme@(guile-user)> (test-data 10 1 5) 9 8 6 5 3 3 2 2 2 1 8 7 6 5 4 9 8 7 1 5 scheme@(guile-user)> (test-data 10 2 5) 8 7 4 3 2 1 1 1 9 8 7 6 4 20 18 1 14 7 scheme@(guile-user)> $