deduplifying lists


Define a function removing duplicate elements from a list, preserving the relative order of the remaining elements.

Pedant trigger warning: yes, some of the worst-case behavior claims below are actually high-probability claims for average-case behavior. Oh well.

Start with the obvious O(n^2) worst-case behavior function.

 $ cat ldd.scm 
 (use-modules (srfi srfi-1) (ice-9 format)) 
 (include-from-path "srfi-78/srfi-78.scm") 
  
  
 (define (dedupify-nsq l) 
   (let loop ((in l) (out '())) 
     (if (null? in) 
       (reverse out) 
       (let ((e (car in))) 
         (loop (cdr in) (if (member e out) out (cons e out))))))) 

A ridiculous O(n log n) worst-case behavior function: pair each element with its index (O(n)), sort the pairs lexicographically (O(n log n) to group equivalent elements while preserving their original relative order, remove pairs duplicated on their first coordinate (O(n)), sort the remaining pairs by their second coordinate (O(n log n)) to restore the original relative order, and replace each pair with its first coordinate (O(n)).

 (define (dedupify-nln l) 
  
   (define (numberate l) 
     (let loop ((in l) (i 0) (out '())) 
       (if (null? in) 
         out 
         (loop (cdr in) (+ i 1) (cons (cons (car in) i) out))))) 
  
   (define (pair-sort l) 
     (sort l 
       (lambda (p1 p2) 
         (let ((f1 (car p1)) 
               (f2 (car p2))) 
           (or (< f1 f2) 
               (and (= f1 f2) (< (cdr p1) (cdr p2)))))))) 
  
   (define (dedupify l) 
     (if (null? l) 
       l 
       (let ((p (car l))) 
         (let loop ((in (cdr l)) (e (car p)) (out (list p))) 
           (if (null? in) 
             out 
             (let* ((p (car in)) 
                    (e' (car p)) 
                    (in' (cdr in))) 
               (if (= e e') 
                 (loop in' e out) 
                 (loop in' e' (cons p out))))))))) 
  
   (define (cdr-sort l) 
     (sort l (lambda (p1 p2) (> (cdr p1) (cdr p2))))) 
  
   (define (denumberate l) 
     (let loop ((in l) (out '())) 
       (if (null? in) 
         out 
         (loop (cdr in)  (cons (caar in) out))))) 
  
   (denumberate (cdr-sort (dedupify (pair-sort (numberate l)))))) 

An obvious O(n) worst-case function with a hash table.

 (define (dedupify-n l) 
   (let ((roster (make-hash-table 1024))) 
     (let loop ((in l) (out '())) 
       (if (null? in) 
           (reverse out) 
           (let ((e (car in)) 
                 (l (cdr in))) 
             (if (hashq-ref roster e) 
               (loop l out) 
               (begin 
                 (hashq-set! roster e #t) 
                 (loop l (cons e out))))))))) 

Another obvious O(n) worst-case function with a hash table, betraying my love of named lets and tail-recursive loops.

Pro tip: dedupifying as specified is not associative (i.e., fold-right fail).

 (define (dedupify-n-fold l) 
   (let ((roster (make-hash-table 1024))) 
     (reverse 
       (fold 
         (lambda (e l) 
           (if (hashq-ref roster e) 
               l 
               (begin 
                 (hashq-set! roster e #t) 
                 (cons e l)))) 
         '() l)))) 

Is it correct?

  
 (define (check-it f n) 
   (format #t "\n~a\n" n) 
   (for-each 
     (lambda (p) (check (f (car p)) => (cdr p))) 
    '((() . ()) 
      ((1) . (1)) 
      ((1 1) . (1)) 
      ((1 1 1) . (1)) 
      ((1 1 2) . (1 2)) 
      ((1 2 1) . (1 2)) 
      ((2 1 1) . (2 1))))) 
  
 (check-it dedupify-n "dedupify-n") 
 (check-it dedupify-n-fold "dedupify-n-fold") 
 (check-it dedupify-nln "dedupify-nln") 
 (check-it dedupify-nsq "dedupify-nsq") 

How fast is it?

 (define (elapsed-time t) 
   (let ((start-time (get-internal-real-time))) 
     (t) 
     (- (get-internal-real-time) start-time))) 
  
 (define (time-it f n s) 
  
   (define (make-list n) 
     (let loop ((i 0) (l '())) 
       (if (< i n) 
         (loop (+ i 1) (cons (random n) l)) 
         l))) 
  
   (let* 
     ((l (make-list s)) 
      (thunk (lambda () (f l)))) 
  
     (let loop ((i n) (t 0)) 
       (if (= i 0) 
         (inexact->exact (round (exact->inexact (/ t n)))) 
         (loop (- i 1) (+ t (elapsed-time thunk))))))) 
  
 (define (timing-results f n) 
   (format #t "\n~a\n" n) 
   (for-each 
     (lambda (l) (format #t "  n = ~d, t = ~d\n" l (time-it f 5 l))) 
    '(1000 2000 3000 4000 5000 6000 7000 8000))) 
  
 (timing-results dedupify-n "dedupify-n") 
 (timing-results dedupify-n-fold "dedupify-n-fold") 
 (timing-results dedupify-nln "dedupify-nln") 
 (timing-results dedupify-nsq "dedupify-nsq") 
  
 $ 

And, in the end,

 $ guile ldd.scm 
  
 dedupify-n 
  
 (f (car p)) => () ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (2 1) ; correct 
  
 dedupify-n-fold 
  
 (f (car p)) => () ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (2 1) ; correct 
  
 dedupify-nln 
  
 (f (car p)) => () ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (2 1) ; correct 
  
 dedupify-nsq 
  
 (f (car p)) => () ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (1 2) ; correct 
 (f (car p)) => (2 1) ; correct 
  
 dedupify-n 
   n = 1000, t = 1 
   n = 2000, t = 2 
   n = 3000, t = 15 
   n = 4000, t = 13 
   n = 5000, t = 15 
   n = 6000, t = 29 
   n = 7000, t = 21 
   n = 8000, t = 24 
  
 dedupify-n-fold 
   n = 1000, t = 14 
   n = 2000, t = 4 
   n = 3000, t = 11 
   n = 4000, t = 14 
   n = 5000, t = 14 
   n = 6000, t = 30 
   n = 7000, t = 22 
   n = 8000, t = 36 
  
 dedupify-nln 
   n = 1000, t = 19 
   n = 2000, t = 41 
   n = 3000, t = 64 
   n = 4000, t = 88 
   n = 5000, t = 114 
   n = 6000, t = 148 
   n = 7000, t = 164 
   n = 8000, t = 115 
  
 }dedupify-nsq 
   n = 1000, t = 172 
   n = 2000, t = 627 
   n = 3000, t = 1514 
   n = 4000, t = 2976 
   n = 5000, t = 4491 
   n = 6000, t = 6238 
   n = 7000, t = 8783 
   n = 8000, t = 12199 
  
 $ 

Pedant trigger warning: yes, different functions are running with different data to make corresponding measurements. Oh well.