Most efficient way to exhaust a list of lists
I have a list of lists, built from a 2d array:
I need to popleft a value from each list, in order, until all lists have been exhausted. For example:
4,3,4,9,2,82,5,4,23,3,56,7 for the lists above
Because this list can become enormous, I want to skip over empty queues, e.g.:
my solution is to also have a deque of indexes of the queues that still have values with a loop like this:
Loop continuously until we no longer have any deques in our queue of indexes to pop from. This will leave the entire list with an empty deque when it's done. The alternative is to simply iterate through the list of deques, and remove[index_of_empty_queue] when each one is done. Deleting items in a very large list is extremely slow, especially towards the start of the list.
Any thoughts on my approach and if there is a better one? I know popleft and append are O(1) for deque; I just still don't know the overall performance implications of using this method to essentially iterate through a list, with the ability to also remove items in the "list" (deque) in O(1) as well.
Solution 1:[1]
Benchmark with some solutions:
Best three from ten runs:
145 ms 147 ms 148 ms columns
202 ms 203 ms 204 ms chained_removers
219 ms 220 ms 221 ms like_interleave_longest
302 ms 303 ms 304 ms with_roundrobin
313 ms 314 ms 314 ms iterators
330 ms 331 ms 333 ms iterators3
336 ms 338 ms 338 ms iterators2
366 ms 368 ms 369 ms queues_optimized
471 ms 474 ms 475 ms queues_clean
737 ms 741 ms 746 ms queues
Input was 1000 lists with random lengths from 1000 to 2000.
`queues` is your original solution (edit: which you had in your question but now deleted). `queues_clean` is the same, but without indices, and normal truth value tests instead of length checks. `queues_optimized` is an optimized version of `queues_clean`. `iterators` is like `queues_optimized` but with iterators instead of queues. `iterators2` and `iterators3` are some other versions with iterators I tried, replacing the outer deque with something else. `columns` is a different approach. Think of the input data as rows. What you want is the concatenated columns. So prepare one list for each needed column, and then spread every input row across the columns. Finish by collecting by columns. `chained_removers` mainly `zip`s all the lists. But it chains little remover iterables behind them, which remove their exhausted iterator and yield a marker which then also gets removed (from the values of the current "column"). Also uses an `OrderedDict` for its doubly-linked list, allowing O(1) time deletions and subsequent O(length) time iteration. `with_roundrobin` uses the roundrobin itertools recipe. Not sure it counts, as it "skips" exhausted iterators at a potentially very high cost, see below. `like_interleave_longest` is like `more_itertools.interleave_longest`, but optimized for producing a list. It doesn't skip exhausted inner lists, but I include it in the benchmark out of curiosity.
I had originally discarded the roundrobin solution because your question made it look like you had many very short (even empty) inner lists. And there it's terrible, for example for 10000 lists with random lengths from 1 to 5:
3 ms 3 ms 3 ms like_interleave_longest
5 ms 6 ms 6 ms columns
8 ms 8 ms 8 ms iterators
8 ms 8 ms 8 ms iterators2
8 ms 8 ms 8 ms iterators3
9 ms 9 ms 10 ms queues_optimized
12 ms 12 ms 13 ms queues_clean
18 ms 18 ms 19 ms queues
26 ms 27 ms 29 ms chained_removers
3642 ms 3750 ms 3812 ms with_roundrobin
Full code (Try it online!):
def queues(data):
    # The asker's original solution, kept verbatim as the benchmark
    # baseline: one deque per row plus a deque holding the indices of
    # the rows that still contain values.
    # NOTE(review): assumes every inner list is non-empty -- an empty
    # row's index would be queued and then popleft() from an empty
    # deque, raising IndexError.
    data_q = [deque(i) for i in data ]
    data_i = deque([i for i in range(len(data_q))])
    return_list = []
    while len(data_i) > 0:
        index = data_i.popleft()
        return_list.append(data_q[index].popleft())
        # Re-queue the index only while its row still has values, so
        # exhausted rows are dropped from the rotation in O(1).
        if len(data_q[index]) != 0:
            data_i.append(index)
    return return_list
def queues_clean(data):
    """Round-robin drain of all rows, one value per visit.

    Same idea as `queues`, but the outer deque holds the inner deques
    directly (no index bookkeeping) and emptiness is tested via
    truthiness rather than explicit length checks.
    """
    pending = deque(map(deque, data))
    out = []
    while pending:
        current = pending.popleft()
        out.append(current.popleft())
        if current:
            pending.append(current)
    return out
def queues_optimized(data):
    """`queues_clean` with the hot attribute lookups hoisted into
    locals (bound methods) -- the classic CPython inner-loop tweak."""
    pending = deque(map(deque, data))
    take = pending.popleft
    requeue = pending.append
    out = []
    emit = out.append
    while pending:
        current = take()
        emit(current.popleft())
        if current:
            requeue(current)
    return out
def iterators(data):
    """Like `queues_optimized`, but each row is consumed through a
    plain iterator; StopIteration (rather than an emptiness test)
    drops an exhausted row from the rotation."""
    pending = deque(map(iter, data))
    take = pending.popleft
    requeue = pending.append
    out = []
    emit = out.append
    advance = next
    while pending:
        it = take()
        try:
            emit(advance(it))
            # Only reached when advance() succeeded, so exhausted
            # iterators are never re-queued.
            requeue(it)
        except StopIteration:
            pass
    return out
def iterators2(data):
    """Iterator variant that rebuilds the survivor list once per
    round-robin pass instead of cycling an outer deque."""
    pending = list(map(iter, data))
    out = []
    emit = out.append
    advance = next
    while pending:
        survivors = []
        retain = survivors.append
        for it in pending:
            try:
                emit(advance(it))
                retain(it)
            except StopIteration:
                pass
        pending = survivors
    return out
def iterators3(data):
    """Iterator variant that compacts the survivor list in place each
    pass: live iterators are written back from the front, then the
    dead tail is truncated."""
    pending = list(map(iter, data))
    out = []
    emit = out.append
    advance = next
    while pending:
        live = 0
        for it in pending:
            try:
                emit(advance(it))
                pending[live] = it
                live += 1
            except StopIteration:
                pass
        del pending[live:]
    return out
def columns(data):
    """Transpose-and-concatenate approach.

    Treat the input rows as a ragged matrix: prepare one bucket per
    needed column, spread every row across the buckets, then
    concatenate the buckets in column order.

    The `deque(..., 0)` is a zero-length deque used only to drain the
    map object at C speed; `map` stops at the shorter argument, so a
    short row simply never touches the later columns.
    """
    # default=0 makes empty input return [] (matching the other
    # variants) instead of raising ValueError from max() on an empty
    # sequence.
    buckets = [[] for _ in range(max(map(len, data), default=0))]
    for row in data:
        deque(map(list.append, buckets, row), 0)
    result = []
    for bucket in buckets:
        result += bucket
    return result
def chained_removers(data):
    # Sentinel distinguishing values produced by a remover generator
    # from real data in a zipped "column".
    marker = object()
    def remover(i):
        # Runs exactly once, when chained row i is exhausted: delete
        # that row's entry from the OrderedDict (O(1) via its
        # doubly-linked list) and yield a single marker value.
        del iterators[i]
        yield marker
    # Each row is chained with its own remover, so a row self-removes
    # from the dict the moment zip steps past its last real value.
    iterators = OrderedDict()
    for i, d in enumerate(data):
        iterators[i] = chain(d, remover(i))
    result = []
    while alive := len(iterators):
        for values in zip(*iterators.values()):
            if len(iterators) < alive:
                # At least one row died while producing this column:
                # keep only the non-marker values, then break to
                # restart zip over the surviving iterators.
                result += compress(values, map(is_not, values, repeat(marker)))
                break
            result += values
    return result
def roundrobin(*iterables):
    """roundrobin('ABC', 'D', 'EF') --> A D E B F C

    Yield one item from each iterable in turn until all are exhausted.
    Exhausted iterables are removed from the cycle by rebuilding it
    with islice, which is where the high cost noted in the benchmark
    comes from.
    """
    # Recipe credited to George Sakkis (classic itertools docs recipe).
    num_active = len(iterables)
    nexts = cycle(iter(it).__next__ for it in iterables)
    while num_active:
        try:
            # Renamed from `next` to avoid shadowing the builtin.
            for next_func in nexts:
                yield next_func()
        except StopIteration:
            # Remove the iterator we just exhausted from the cycle.
            num_active -= 1
            nexts = cycle(islice(nexts, num_active))
def with_roundrobin(data):
    """Adapter giving the roundrobin generator the benchmark's
    list-in/list-out interface."""
    return [*roundrobin(*data)]
def like_interleave_longest(data):
    """Like more_itertools.interleave_longest, tuned for list output:
    zip all rows padded with a sentinel, flatten column by column, and
    drop the padding. Exhausted rows are not skipped -- short rows
    still contribute (and pay for) sentinel values in later columns."""
    pad = object()
    return [value
            for column in zip_longest(*data, fillvalue=pad)
            for value in column
            if value is not pad]
# All candidate implementations. funcs[0] (before shuffling) is used
# as the reference whose output the others are checked against.
funcs = [
    queues,
    queues_clean,
    queues_optimized,
    iterators,
    iterators2,
    iterators3,
    columns,
    chained_removers,
    with_roundrobin,
    like_interleave_longest,
]
from timeit import default_timer as time
from random import randint, shuffle
from bisect import insort
from collections import deque, OrderedDict
from itertools import cycle, islice, chain, compress, repeat, zip_longest
from operator import is_not
# Scenario A: 10000 tiny rows with lengths 1..5. NOTE(review): this
# assignment is immediately shadowed by scenario B below; swap or
# comment one out to switch scenarios.
data = [list(range(1000, 1000 + randint(1, 5)))
        for _ in range(10000)]
# Scenario B (the one actually measured): 1000 rows, lengths 1000..2000.
data = [list(range(1000, 1000 + randint(1000, 2000)))
        for _ in range(1000)]
# Sanity check: every implementation must agree with the first one.
expect = funcs[0](data)
for func in funcs:
    assert func(data) == expect
times = {func: [] for func in funcs}
for _ in range(10):
    # Shuffle run order each round to spread warm-up/cache bias.
    shuffle(funcs)
    for func in funcs:
        t0 = time()
        func(data)
        t1 = time()
        # insort keeps each func's times sorted, so [:3] below is the
        # best three of the ten runs.
        insort(times[func], t1 - t0)
# Report best-three times in ms, fastest function first.
for func in sorted(funcs, key=times.get):
    print(*('%4d ms ' % (t * 1e3) for t in times[func][:3]), func.__name__)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 |
