'Finding the best supersets in a list based on intersections

I have a file including lines as follows,

finalInjectionList is input file: [0, 2, 3] [0, 2, 3, 4] [0, 3] [1, 2, 4] [2, 3] [2, 3, 4]

Here [0, 2, 3, 4] and [1, 2, 4] are the best supersets for my problem and I want to write them to an outputfile. Because those are supersets of some other elements and NOT subsets of any line.

my code:

import ast
import itertools

def get_data(filename):
    with open(filename, 'r') as fi:
        data = fi.readlines()
    return data

def get_ast_set(line):
    return set(ast.literal_eval(line))

def check_infile(datafile, savefile):
    list1 = [get_ast_set(row) for row in get_data(datafile)]
    print(list1)
    outlist = []

    #for i in range(len(list1)):
    for a, b in itertools.combinations(list1, 2):
       if a.issuperset(b):
            with open(savefile, 'a') as fo:
                  fo.writelines(str(a))

if __name__ == "__main__":
    datafile = str("./finalInjectionList")
    savefile = str("./filteredSets" )
    check_infile(datafile, savefile)

My code writes all supersets, e.g {2, 3, 4} also. But {0, 2, 3, 4} covers {2, 3, 4} already, so I do not want to write {2, 3, 4} to output file.

Is there any suggestion?



Solution 1:[1]

Solved by modifying routine check_infile

import ast
import itertools

# A union by rank and path compression based  
# program to detect cycle in a graph 
from collections import defaultdict 

def findparent(d, node):
  """Goes through chain of parents, until we reach node which is its own parent
     Meaning, no node has it has a subset"""
  if d[node] == node:
    return node
  else:
    return findparent(d, d[node])

def get_data(filename):
    with open(filename, 'r') as fi:
        data = fi.readlines()
    return data

def get_ast_set(line):
    return set(ast.literal_eval(line))

def check_infile(datafile, savefile):
  """Find minimum number of supersets as follows:
    1) identify superset of each set
    2) Go through superset chains (findparents) to find set of nodes which are supersets (roots) """

    list1 = [get_ast_set(row) for row in get_data(datafile)]
    print(list1)
    outlist = []

    n = len(list1)

    # Initially each node is its own parent (i.e. include self as superset)
    # Here parent means superset
    parents = {u:u for u in range(n)}

    for u in range(n):
      a = list1[u]
      for v in range(u+1, n):
        b = list1[v]

        if a.issuperset(b):
          parents[v] = u   # index u is superset of v
        elif b.issuperset(a):
          parents[u] = v   # index v is superset of u

    # Print root nodes
    roots = set()
    for u in range(n):
      roots.add(findparent(parents, u))

    with open(savefile, 'w') as fo:
      for i in roots:
        fo.write(str(list1[i]))
        fo.write('\n')


if __name__ == "__main__":
    datafile = str("./finalInjectionList.txt")
    savefile = str("./filteredSets.txt" )
    check_infile(datafile, savefile)

Test File (finalInjectionList.txt)

[0, 2, 3]
[0, 2, 3, 4]
[0, 3]
[1, 2, 4]
[2, 3]
[2, 3, 4]

Output File (filteredSets.txt)

{0, 2, 3, 4}
{1, 2, 4}

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1