streaming popularity


Find the n most frequently occurring objects in an object stream a couple of ways.

The choice of uniformly distributed random data is dumb for this problem. For example, it can cause the Misra-Gries algorithm to return fewer than the n most frequently occurring objects.

 $ cat tn.scm 
 (load-from-path "utils") 
  
 (define (data-counts n) 
  
   ; Return an item-frequency counter capable of keeping track of 
   ; n items. 
    
   (let* 
     ((data (make-hash-table n)) 
      (data-count 0) 
  
  
      (pick-entry (lambda () 
  
        ; Return an arbitrary entry in the table as a (key 
        ; . value) pair.  Assumes the table is not empty. 
  
        (catch 'out 
          (lambda () 
            (hash-fold 
             (lambda (key val acc) (throw 'out (cons key val))) 
             '() data) 
            (error "item set shouldn't be empty, but it is")) 
          (lambda (tag p) p)))) 
       
  
      (minimum-entry (lambda () 
  
        ; Return a (key . value) pair representing an entry with 
        ; the minimum count.  Assumes the table is not empty. 
   
        (let 
  
          ((f (lambda (key val min) 
                (if (< val (cdr min)) (cons key val) min)))) 
  
          (hash-fold f (pick-entry) data))))) 
  
  
     (lambda (cmd . args) 
       (cond 
  
        ; Add the given item to the table with a count of 1. 
        ; Assumes the item is not in the table. 
         
        ((eq? 'add-element cmd) 
           (if (= data-count n) 
             (error "adding an item to a full table")) 
           (hash-set! data (car args) 1) 
           (set! data-count (+ data-count 1))) 
  
  
        ; Remove a item with the lowest count.  Assumes the 
        ; table's not empty. 
  
        ((eq? 'delete-minimum cmd) 
          (hash-remove! data (car (minimum-entry))) 
        (set! data-count (- data-count 1))) 
  
  
        ; If the given item is not in the table, return false, 
        ; otherwise increase the item's count by one and return 
        ; true. 
  
        ((eq? 'increase-if-included cmd) 
           (let* 
             ((key (car args)) 
              (val (hash-ref data key))) 
  
             (when val 
               (hash-set! data key (+ val 1))) 
  
             val)) 
  
  
        ; If the given item is not in the table, return false, 
        ; otherwise increase the item's count by one and return 
        ; true. 
  
        ((eq? 'print cmd) 
         (let* 
           ((f (lambda (k v acc) 
                 (cons (cons k (car acc)) (cons v (cdr acc))))) 
  
            (p (lambda (lst) 
                 (let loop ((l lst)) 
                   (unless (null? l) 
                     (format #t "~4d" (car l)) 
                     (loop (cdr l)))) 
                 (display #\newline))) 
  
            (ht (hash-fold f (cons '() '()) data))) 
  
           (p (car ht)) 
           (p (cdr ht)))) 
  
  
        ; Reduce the item counts by one, throwing away items that 
        ; have non-positive counts.  Return true iff items were 
        ; thrown away. 
  
        ((eq? 'reduce-counts cmd) 
           (let 
             ((old-data-count data-count) 
              (f (lambda (key val acc) 
                   (when (> val 1) 
                     (hash-set! acc key (- val 1)) 
                     (set! data-count (+ data-count 1))) 
                   acc))) 
  
             (set! data-count 0) 
             (set! data (hash-fold f (make-hash-table n) data)) 
             (< data-count old-data-count))) 
  
  
        ; Return true iff there's space for more items 
   
        ((eq? 'space-available cmd) 
           (< data-count n)) 
  
        (#t 
           (error "unexpected data-counts command: " cmd)))))) 
  
  
 (define (maa-top-n data counts) 
  
   (define (place e) 
     (unless (counts 'increase-if-included e) 
       (unless (counts 'space-available) 
       (counts 'delete-minimum)) 
       (counts 'add-element e))) 
  
   (for-each place data) 
   counts) 
  
  
 (define (mg-top-n data counts) 
  
   (define (place e) 
     (unless (counts 'increase-if-included e) 
       (when (or (counts 'space-available) (counts 'reduce-counts)) 
  (counts 'add-element e)))) 
  
   (for-each place data) 
   counts) 
  
  
 (define (test-data n t c) 
  
   (define (mvec n t) 
  
     (let loopo ((k 0) (lo '())) 
       (if (>= k t) 
         (vector->list (randomly-permute-vector (list->vector lo))) 
         (let loopm ((i 0) (lm lo)) 
           (if (>= i n) 
             (loopo (+ k 1) lm) 
             (let loopi ((j 0) (li lm)) 
               (if (> j i) 
                 (loopm (+ i 1) li) 
                 (loopi (+ j 1) (cons i li))))))))) 
  
   (let ((l (mvec n t))) 
     ((mg-top-n l (data-counts c)) 'print) 
     (display #\newline) 
     ((maa-top-n l (data-counts c)) 'print))) 
  
 $ guile 
 scheme@(guile-user)> (load "tn.scm") 
 scheme@(guile-user)> (test-data 10 1 10) 
    9   8   7   6   5   4   3   2   1   0 
   10   9   8   7   6   5   4   3   2   1 
  
    9   8   7   6   5   4   3   2   1   0 
   10   9   8   7   6   5   4   3   2   1 
 scheme@(guile-user)> (test-data 10 1 9) 
    9   8   7   6   5   4   3   2   1 
    8   7   6   5   4   3   2   1   1 
  
    9   8   7   6   5   4   3   2   1 
   10   9   8   7   6   5   4   3   1 
 scheme@(guile-user)> (test-data 10 1 5) 
    9   8   6   5   3 
    3   2   2   2   1 
  
    8   7   6   5   4 
    9   8   7   1   5 
 scheme@(guile-user)> (test-data 10 2 5) 
    8   7   4   3 
    2   1   1   1 
  
    9   8   7   6   4 
   20  18   1  14   7 
 scheme@(guile-user)>  
  
 $