from itertools import tee

# The code below runs unchanged on Python 2.7 and Python 3.  Python 3 removed
# the three-way comparison builtin, so supply an equivalent when it is absent;
# 'itersorted' uses it as the default for its 'cmp' parameter.
try:
    cmp
except NameError:
    def cmp(a, b):
        """Return -1, 0 or 1 as 'a' is less than, equal to or greater than 'b'."""
        return (a > b) - (a < b)


def itersorted(iterable, cmp = cmp, key = lambda x: x, reverse = False):
    """
    Return a generator that lazily yields the items of 'iterable' in sorted
    order.

    The sort is most useful in two cases:
        1) When you only need the first few values in the sorted data.
        2) When you want to amortize the cost of the sort over the time you
           use the data.

    It is a 'stable sort': values with equivalent sorting criteria keep the
    relative order they have in the input, also when 'reverse' is true.

    Parameters:
        iterable - any iterable; it is traversed once and buffered through
                   itertools.tee.
        cmp      - three-way comparison applied to the *key* values.  It MUST
                   return values in [-1,0,1].  Otherwise, behavior is
                   undefined, and will most likely be very incorrect.
        key      - function computing the sort key of one item (identity by
                   default).
        reverse  - when true, yield the items in descending order.

    An empty 'iterable' produces an empty generator.
    """

    # Notes:
    # Understanding the concepts of 'left' and 'right' here is important.
    # 'left' values are those that are yielded earlier in the sort.  So
    # each subsequent value yielded is 'to the right' of the previous one.

    # A stack is used to maintain sets of values who share the same key
    # value.  Each layer corresponds to one key.  During the traversals of
    # the input data, values are added to each layer in such a way that
    # they maintain their relative position (to others in the same layer)
    # from the original data.  This ensures a 'stable sort'.

    # Use this to easily switch the direction of the sort.
    rev = -1 if reverse else 1

    # Unique marker returned by next() once an iterator is exhausted.  It
    # avoids letting StopIteration escape from the generator body, which
    # Python 3.7+ turns into a RuntimeError (PEP 479).
    exhausted = object()

    stack = []      # Stack of layers [key, data iterator, [values]];
                    # 'left' value layers are above 'right' ones.
    init = True     # True during the first pass through the data, while
                    # 'lLimit' has no meaningful value yet.
    lLimit = None   # Key of the most recently yielded layer.

    # Create the base iterator that will track our main progress through
    # the data.
    a = ((key(x), x) for x in iterable)

    while True:
        # If the stack is empty, we must now seed it.  Advance the base
        # iterator until we find a value 'to the right' of anything we've
        # yielded so far.  (All values 'to the left' have already been
        # yielded.)  On the first pass the very first value is the seed.
        if not stack:
            entry = next(a, exhausted)
            while (entry is not exhausted and not init
                   and cmp(entry[0], lLimit) != rev):
                entry = next(a, exhausted)
            if entry is exhausted:
                # Nothing is left 'to the right': the sort is complete.
                return
            k, val = entry
            # Place the found value as the initial stack value (and store
            # its iteration progress as well).
            a, b = tee(a)
            stack.append([k, b, [val]])

        # We now iterate through the data, starting where the layer at the
        # top of the stack left off.
        layer = stack[-1]
        b = layer[1]
        while True:
            # Advance with next() on the *current* 'b' instead of a 'for'
            # loop: after tee() has split an iterator only the iterators it
            # returned may be used, and 'b' is rebound to one of them below.
            entry = next(b, exhausted)
            if entry is exhausted:
                break
            k, val = entry
            order = cmp(k, layer[0])
            # Keep the element only if it is 'to the left' of (or equal to)
            # the top of the stack and 'to the right' of the last element
            # yielded.
            if order != rev and (init or cmp(k, lLimit) == rev):
                if order == -rev:
                    # Strictly 'to the left' of the current layer: remember
                    # where the current layer must resume, then open a new
                    # layer on top of the stack.
                    b, layer[1] = tee(b)
                    layer = [k, b, []]
                    stack.append(layer)
                # Equivalent keys (and the first value of a new layer) are
                # appended in discovery order, which keeps the sort stable.
                layer[2].append(val)

        # Remove the initialization condition to enable lLimit checking.
        init = False

        # Whatever values are on the top layer at this point are the
        # 'left-most' we've found that we haven't yet yielded.  Yield them
        # in the order that we discovered them in the source data.
        # lLimit becomes the right-most key already yielded, which lets
        # later passes ignore those values.
        lLimit, _, vals = stack.pop()
        for val in vals:
            yield val


if __debug__:
    def P(i):
        """Print the items of the reversible sequence 'i', last item first."""
        for x in reversed(i):
            print(x)

    def test(data = None):
        """
        Print a quick visual comparison of sorted() and itersorted().

        'data' is a list of tuples sorted by their first element; when it is
        omitted, twenty random pairs of small integers are generated.
        """
        import random
        from itertools import islice
        if data is None:
            data = [(random.randint(0, 9), random.randint(0, 9))
                    for _ in range(20)]
        control = sorted(data, key = lambda x: x[0])
        variable = itersorted(data, key = lambda x: x[0])
        print(control[:10] == [x for x in islice(variable, 10)])
        print(data)
        print(control)
        variable = itersorted(data, key = lambda x: x[0])
        print([x for x in islice(variable, 10)])

    from unittest import TestCase, main
    from random import shuffle
    from itertools import islice
    from functools import cmp_to_key

    def _descending(a, b):
        """Comparison function that inverts the natural ordering."""
        return -cmp(a, b)

    class LazySortTest(TestCase):
        """
        Compare itersorted() against the builtin sorted() for every
        combination of the 'cmp', 'key' and 'reverse' arguments.

        Run these tests with:
        > python LazySort.py
        """
        TESTLEN = 10
        RANGELEN = max(TESTLEN, 10)
        # Every value appears twice so that equal keys exercise stability.
        a = list(range(RANGELEN // 2)) * 2
        b = list(range(RANGELEN // 2)) * 2
        shuffle(a)
        shuffle(b)
        DATA = list(zip(a, b))
        shuffle(DATA)
        del a
        del b

        def _control(self, **kwargs):
            """
            Return sorted(self.DATA, **kwargs), emulating the 'cmp' argument
            (which sorted() lost in Python 3) through functools.cmp_to_key.
            As with Python 2's sorted(), 'cmp' is applied to the key values.
            """
            compare = kwargs.pop('cmp', None)
            if compare is not None:
                key = kwargs.pop('key', lambda x: x)
                wrap = cmp_to_key(compare)
                kwargs['key'] = lambda x: wrap(key(x))
            return sorted(self.DATA, **kwargs)

        def _check(self, **kwargs):
            """Assert that the first TESTLEN items match the reference sort."""
            control = self._control(**kwargs)
            variable = itersorted(self.DATA, **kwargs)
            self.assertEqual(control[:self.TESTLEN],
                             list(islice(variable, self.TESTLEN)))

        def testRange(self):
            self._check()

        def testRangeCompare(self):
            self._check(cmp = _descending)

        def testRangeKey(self):
            self._check(key = lambda x: x[0])

        def testRangeReverse(self):
            self._check(reverse = True)

        def testRangeCompareKey(self):
            self._check(cmp = _descending, key = lambda x: x[0])

        def testRangeCompareReverse(self):
            self._check(cmp = _descending, reverse = True)

        def testRangeKeyReverse(self):
            self._check(key = lambda x: x[0], reverse = True)
            self._check(key = lambda x: x[1], reverse = True)

        def testRangeCompareKeyReverse(self):
            self._check(cmp = _descending, key = lambda x: x[0],
                        reverse = True)

    # 'main' is only imported when __debug__ is set, so the entry point
    # lives inside the same guard.
    if __name__ == '__main__':
        main() # unittest.main