Find the important keyword in a list of strings

I have a function that organizes the files in a particular directory on the basis of their names. It splits each file name into words, creates a folder for each keyword, and then moves every file whose name contains that keyword into the matching folder. For example, if there are two files, wave.png and wave-edited.png, it creates a folder named wave and, because both files contain the keyword wave, moves them into that folder. I am stuck figuring out how to get the keyword.

List of file_names = ['ghosts-edited.png', 'ghosts.png', 'wave.png', 'wave-edited.png', '10-14-day', '12-11-day']

Expected output

['ghosts', 'wave', 'day']

Code:

def name_category():
    """Sort the files in ``folder_to_track`` into sub-folders named by keyword.

    ``folder_to_track`` is read from module scope. For every regular file
    directly inside it, the keyword is the longest run of ASCII letters in the
    lower-cased file name (the first such run wins a tie). A file is moved into
    the folder of the first keyword (in alphabetical order) that equals one of
    the delimiter-separated tokens of its lower-cased name. Moved files keep
    their lower-cased name; if that name is already taken in the destination,
    a counter (2, 3, ...) is inserted before the extension.

    Sub-directories are never moved, names without any letters are left in
    place, and a keyword whose folder path is occupied by a regular file is
    skipped. Filesystem errors are printed and end the run early instead of
    propagating to the caller.
    """
    delimiters = ['.', ',', '!', ' ', '-', ';', '?', '*', '!', '@', '#', '$', '%', '^', '&', '(', ')', '_', '/', '|', '<', '>']
    # Compile once: the same splitter is applied to every file name.
    token_splitter = re.compile('|'.join(map(re.escape, delimiters)))
    try:
        pending = {}      # on-disk name -> lower-cased name, for files not moved yet
        keywords = set()
        for entry in os.listdir(folder_to_track):
            if not os.path.isfile(os.path.join(folder_to_track, entry)):
                continue  # only regular files are organised
            lowered = entry.lower()
            pending[entry] = lowered
            words = re.findall(r'[A-Za-z]+', lowered)
            if words:  # a name such as "123" has no letters and yields no keyword
                keywords.add(max(words, key=len))

        # Sorted so the outcome does not depend on set iteration order.
        for keyword in sorted(keywords):
            matches = [
                entry for entry, lowered in pending.items()
                if keyword in token_splitter.split(lowered)
            ]
            if not matches:
                continue  # nothing left for this keyword; do not create an empty folder

            folder_path = os.path.join(folder_to_track, keyword)
            if os.path.exists(folder_path) and not os.path.isdir(folder_path):
                continue  # a regular file already occupies the folder's path
            os.makedirs(folder_path, exist_ok=True)

            for entry in matches:
                # Removing the entry guarantees a file is moved at most once,
                # even when its name contains several keywords.
                lowered = pending.pop(entry)
                stem, extension = os.path.splitext(lowered)
                new_name = lowered
                counter = 1
                while os.path.exists(os.path.join(folder_path, new_name)):
                    counter += 1
                    new_name = stem + str(counter) + extension
                os.rename(
                    os.path.join(folder_to_track, entry),
                    os.path.join(folder_path, new_name),
                )
    except OSError as e:
        print(e)

sub_file_names when printed:

['ghosts', 'wave', 'edited']

Right now I pick the keyword for each file by taking the longest alphabetic word in its filename; the results are collected in sub_file_names.

Update

This is the directory containing the files

enter image description here

This is the expected output

enter image description here

All files that share a keyword are grouped together: for example, wave.png and wave-edited.png both contain the keyword wave, so a folder named wave is created and those files are moved into it. Files whose keywords do not repeat, such as ant-dark.png and nature-aesthetic.png, are moved into the folder named other.



Solution 1:[1]

IIUC, you would like a method that could obtain the names before the first non-word character.

Code

import re
from collections import Counter

def find_prefixes(strings):
    '''
       Find shared name prefixes and map every string to one of them.

       Each string is split on non-alphanumeric characters. The last piece is
       treated as the file suffix and ignored; empty pieces (produced by
       leading or consecutive delimiters) and all-digit pieces are never
       counted as prefix candidates.

       Processing
           - Count each remaining piece by (value, position)
           - Keep the (value, position) pairs seen in 2 or more strings
           - A value becomes a prefix when it is the first kept pair of at
             least one string
           - Map each string to the first of its pieces that is a prefix,
             or to 'other' when none is

       Parameters
           strings: iterable of names; it is read once and copied, so a
                    one-shot iterator is accepted

       Returns
           (prefixes, mapping): the set of prefix values and a dict mapping
           every input string to its prefix or to 'other'
    '''
    pattern = re.compile('[^a-zA-Z0-9]')      # pattern to detect non-alphanumeric character

    # Split every string once; [:-1] drops the trailing suffix piece.
    tokenized = [(string, pattern.split(string)[:-1]) for string in strings]

    # Tuple keys keep value and position apart without any string parsing,
    # so an unusual piece can never corrupt the key.
    occurrences = Counter(
        (token, pos)
        for _, tokens in tokenized
        for pos, token in enumerate(tokens)
        if token and not token.isdigit()
    )
    candidates = {key for key, count in occurrences.items() if count > 1}

    # Only the left-most candidate of a string counts as "used"; this is what
    # discards shared trailing words such as 'edited'.
    prefixes = set()
    for _, tokens in tokenized:
        for pos, token in enumerate(tokens):
            if (token, pos) in candidates:
                prefixes.add(token)
                break

    # Generate string to prefix mapping
    mapping = {}
    for string, tokens in tokenized:
        for token in tokens:
            if token in prefixes:
                mapping[string] = token
                break
        else:
            mapping[string] = 'other'

    return prefixes, mapping
          

Test

# Using names from folder in posted image  
file_names = [
    '10-14-day.jpg',
    '10-14-night-6k.jpg',
    'ant-dark.jpg',
    'garuda-desert.png',
    'garuda-desert-edited.png',
    'ghosts.jpg',
    'ghosts-edited.png',
    'nature-aesthetic.jpg',
    'predator-black.png',
    'predator-blue.jpg',
    'predator-red.jpg',
    'wave.png',
    'wave-edited.png',
]

# Derive the folder names and the folder assigned to each file.
folders, file_mapping = find_prefixes(file_names)

print("Folder names:", list(folders))
print("\nFile to folder name mapping:")
for name, folder in file_mapping.items():
    # Right-align every file name in a 25-character column.
    print(f"\t{name:>25} --> {folder}")

Output

Folder names: ['wave', 'garuda', 'predator', 'ghosts']

File to folder name mapping:
                10-14-day.jpg --> other
           10-14-night-6k.jpg --> other
                 ant-dark.jpg --> other
            garuda-desert.png --> garuda
     garuda-desert-edited.png --> garuda
                   ghosts.jpg --> ghosts
            ghosts-edited.png --> ghosts
         nature-aesthetic.jpg --> other
           predator-black.png --> predator
            predator-blue.jpg --> predator
             predator-red.jpg --> predator
                     wave.png --> wave
              wave-edited.png --> wave

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1