Numpy grouping using itertools.groupby performance


Question

I have many large (>35,000,000) lists of integers that will contain duplicates. I need to get a count for each integer in a list. The following code works, but seems slow. Can anyone else better the benchmark using Python and preferably Numpy?

def group():
    import numpy as np
    from itertools import groupby
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    groups = ((k,len(list(g))) for k,g in groupby(values))
    index = np.fromiter(groups,dtype='u4,u2')

if __name__=='__main__':
    from timeit import Timer
    t = Timer("group()","from __main__ import group")
    print t.timeit(number=1)

which returns:

$ python bench.py 
111.377498865

Cheers!

Edit based on responses:

def group_original():
    import numpy as np
    from itertools import groupby
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    groups = ((k,len(list(g))) for k,g in groupby(values))
    index = np.fromiter(groups,dtype='u4,u2')

def group_gnibbler():
    import numpy as np
    from itertools import groupby
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    groups = ((k,sum(1 for i in g)) for k,g in groupby(values))
    index = np.fromiter(groups,dtype='u4,u2')

def group_christophe():
    import numpy as np
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    counts=values.searchsorted(values, side='right') - values.searchsorted(values, side='left')
    index = np.zeros(len(values),dtype='u4,u2')
    index['f0']=values
    index['f1']=counts
    #Erroneous result!

def group_paul():
    import numpy as np
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    diff = np.concatenate(([1],np.diff(values)))
    idx = np.concatenate((np.where(diff)[0],[len(values)]))
    index = np.empty(len(idx)-1,dtype='u4,u2')
    index['f0']=values[idx[:-1]]
    index['f1']=np.diff(idx)

if __name__=='__main__':
    from timeit import Timer
    timings=[
                ("group_original","Original"),
                ("group_gnibbler","Gnibbler"),
                ("group_christophe","Christophe"),
                ("group_paul","Paul"),
            ]
    for method,title in timings:
        t = Timer("%s()"%method,"from __main__ import %s"%method)
        print "%s: %s secs"%(title,t.timeit(number=1))

which returns:

$ python bench.py 
Original: 113.385262966 secs
Gnibbler: 71.7464978695 secs
Christophe: 27.1690568924 secs
Paul: 9.06268405914 secs

Although Christophe gives incorrect results currently

1
26
4/22/2011 4:25:45 AM

Accepted Answer

i get a 3x improvement doing something like this:

def group():
    import numpy as np
    values = np.array(np.random.randint(0,3298,size=35000000),dtype='u4')
    values.sort()
    dif = np.ones(values.shape,values.dtype)
    dif[1:] = np.diff(values)
    idx = np.where(dif>0)
    vals = values[idx]
    count = np.diff(idx)
31
1/10/2011 10:31:53 PM

More than 5 years have passed since Paul's answer was accepted. Interestingly, the sort() is still the bottleneck in the accepted solution.

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     3                                           @profile
     4                                           def group_paul():
     5         1        99040  99040.0      2.4      import numpy as np
     6         1       305651 305651.0      7.4      values = np.array(np.random.randint(0, 2**32,size=35000000),dtype='u4')
     7         1      2928204 2928204.0    71.3      values.sort()
     8         1        78268  78268.0      1.9      diff = np.concatenate(([1],np.diff(values)))
     9         1       215774 215774.0      5.3      idx = np.concatenate((np.where(diff)[0],[len(values)]))
    10         1           95     95.0      0.0      index = np.empty(len(idx)-1,dtype='u4,u2')
    11         1       386673 386673.0      9.4      index['f0'] = values[idx[:-1]]
    12         1        91492  91492.0      2.2      index['f1'] = np.diff(idx)

The accepted solution runs for 4.0 s on my machine, with radix sort it drops down to 1.7 s.

Just by switching to radix sort, I get an overall 2.35x speedup. The radix sort is more than 4x faster than quicksort in this case.

See How to sort an array of integers faster than quicksort? that was motivated by your question.


Licensed under: CC-BY-SA with attribution
Not affiliated with: Stack Overflow
Icon