Python docx Replace string in paragraph while keeping style
I need help replacing a string in a word document while keeping the formatting of the entire document.
I'm using python-docx. After reading the documentation, it seems to work with entire paragraphs, so I lose formatting such as words that are in bold or italics. The text to replace is itself in bold, and I would like to keep it that way. I'm using this code:
from docx import Document
def replace_string2(filename):
    """Replace a fixed phrase in every paragraph of a Word document.

    This is the paragraph-level approach from the question: the whole
    paragraph text is reassigned, so character formatting inside the
    paragraph (bold, italics, ...) is lost. Only the paragraph style is
    carried over.

    :param filename: path of the .docx file to read
    :return: always 1
    """
    doc = Document(filename)
    for p in doc.paragraphs:
        if 'Text to find and replace' in p.text:
            print('SEARCH FOUND!!')
            text = p.text.replace('Text to find and replace', 'new text')
            # Remember the paragraph style and re-apply it after the text
            # has been reassigned.
            style = p.style
            p.text = text
            p.style = style
    # The result is written to a separate file rather than back to
    # ``filename``, so the input document is left untouched.
    doc.save('test.docx')
    return 1
So if I implement it and want something like (the paragraph containing the string to be replaced loses its formatting):
This is paragraph 1, and this is a text in bold.
This is paragraph 2, and I will replace old text
The current result is:
This is paragraph 1, and this is a text in bold.
This is paragraph 2, and I will replace new text
Solution 1:[1]
I posted this question (even though I saw a few identical ones on here) because none of those, to my knowledge, solved the issue. One of them used the oodocx library, which I tried, but it did not work. So I found a workaround.
The code is very similar, but the logic is: when I find the paragraph that contains the string I wish to replace, add another loop using runs. (this will only work if the string I wish to replace has the same formatting).
def replace_string(filename, old='old text', new='new text', dest='dest1.docx'):
    """Replace ``old`` with ``new`` run by run, then save the document.

    Working on runs instead of whole paragraphs keeps each run's
    formatting. The replacement only happens when ``old`` is contained
    entirely within a single run, i.e. when the searched text shares one
    formatting.

    :param filename: path of the .docx file to read
    :param old: text to search for
    :param new: text to put in its place
    :param dest: path the modified document is saved to
    :return: always 1
    """
    doc = Document(filename)
    for p in doc.paragraphs:
        if old not in p.text:
            continue
        # Work with runs (strings that share the same style).
        for run in p.runs:
            if old in run.text:
                run.text = run.text.replace(old, new)
        print(p.text)

    doc.save(dest)
    return 1
Solution 2:[2]
This is what works for me to retain the text style when replacing text.
Based on Alo's answer and the fact the search text can be split over several runs, here's what worked for me to replace placeholder text in a template docx file. It checks all the document paragraphs and any table cell contents for the placeholders.
Once the search text is found in a paragraph, it loops through its runs to identify which runs contain part of the search text. It then inserts the replacement text in the first of those runs and blanks out the remaining search-text characters in the other runs.
I hope this helps someone. Here's the gist if anyone wants to improve it
Edit:
I have subsequently discovered python-docx-template which allows jinja2 style templating within a docx template. Here's a link to the documentation
python3 python-docx python-docx-template
def _replace_across_runs(runs, needle, replacement):
    """Replace every occurrence of ``needle`` in the concatenated run text.

    The match may start in one run and end in another. The replacement is
    written into the run that holds the first matched character; the
    matched characters in all following runs are removed. Text outside the
    match is left where it is, so each run keeps its own content.

    :param runs: sequence of objects with a readable/writable ``text``
    :param needle: text to search for
    :param replacement: text that takes its place
    """
    if not needle:
        return
    search_from = 0
    while True:
        texts = [run.text for run in runs]
        start = ''.join(texts).find(needle, search_from)
        if start < 0:
            return
        end = start + len(needle)
        offset = 0
        is_first = True
        for run, text in zip(runs, texts):
            run_start, run_end = offset, offset + len(text)
            offset = run_end
            if run_end <= start or run_start >= end:
                continue  # this run holds no part of the match
            lo = max(start, run_start) - run_start
            hi = min(end, run_end) - run_start
            updated = text[:lo] + (replacement if is_first else '') + text[hi:]
            is_first = False
            # Only assign when something changed so that runs which are
            # merely positioned inside the match are not rewritten.
            if updated != text:
                run.text = updated
        # Continue after the inserted text; this also keeps a replacement
        # that itself contains ``needle`` from being matched again.
        search_from = start + len(replacement)


def docx_replace(doc, data):
    """Fill ``${Name}`` placeholders in a document while keeping run styles.

    Every paragraph of the document body and every paragraph inside the
    cells of the document's tables is processed. A placeholder may be
    split over several runs.

    :param doc: document exposing ``paragraphs`` and ``tables``
    :param data: mapping of placeholder name (without ``${}``) to the
        replacement value; values are converted with ``str``
    """
    paragraphs = list(doc.paragraphs)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                paragraphs.extend(cell.paragraphs)

    for paragraph in paragraphs:
        for key, val in data.items():
            # Placeholders have the form ${PlaceholderName}.
            key_name = '${{{}}}'.format(key)
            _replace_across_runs(paragraph.runs, key_name, str(val))
# usage
import docx

# Load the template, fill in the placeholders and save the result under a new name.
doc = docx.Document('path/to/template.docx')
docx_replace(doc, dict(ItemOne='replacement text', ItemTwo="Some replacement text\nand some more"))
doc.save('path/to/destination.docx')
Solution 3:[3]
from docx import Document
# Open the document whose placeholders are to be filled in.
document = Document('old.docx')

# Placeholder -> text that replaces it.
dic = {
    '{{FULLNAME}}': 'First Last',
    '{{FIRST}}': 'First',
    '{{LAST}}': 'Last',
}

# Replace inside each run so the run keeps its formatting; a placeholder
# is only found when it lies completely within one run.
for paragraph in document.paragraphs:
    for run in paragraph.runs:
        updated = run.text
        for placeholder, replacement in dic.items():
            if placeholder in updated:
                updated = updated.replace(placeholder, replacement)
                run.text = updated

document.save('new.docx')
Solution 4:[4]
According to the architecture of the DOCX document:
- Text: docx>Paragraphs>runs
- Text table: docx>tables>rows>cells>Paragraphs>runs
- Header: docx>sections>header>Paragraphs>runs
- Header tables: docx>sections>header>tables>row>cells>Paragraphs>runs
The footer has the same structure as the header. We could traverse the paragraphs directly to find and replace our keywords, but that resets the text formatting, so we have to traverse the text in the runs and replace it there. However, a keyword may extend beyond a single run, in which case a run-by-run replacement does not find it.
Therefore, I provide an idea here: firstly, take paragraph as unit, and mark the position of every character in paragraph through list; then, mark the position of every character in run through list; find keywords in paragraph, delete and replace them by character as unit by corresponding relation.
'''
-*- coding: utf-8 -*-
@Time : 2021/4/19 13:13
@Author : ZCG
@Site :
@File : Batch DOCX document keyword replacement.py
@Software: PyCharm
'''
from docx import Document
import os
def get_docx_list(dir_path):
    '''
    Collect the docx files below a directory, including its subdirectories.

    Files whose name starts with "~" are skipped (temporary files). The
    extension is matched as ".docx" regardless of case.

    :param dir_path: directory to search
    :return: List of paths of the docx files that were found
    '''
    file_list = []
    for root, _dirs, files in os.walk(dir_path):
        for name in files:
            # Locate the docx document and exclude temporary files
            if name.lower().endswith(".docx") and not name.startswith("~"):
                # os.path.join uses the separator of the running platform.
                file_list.append(os.path.join(root, name))
    print("The directory found a total of {0} related files!".format(len(file_list)))
    return file_list
class ParagraphsKeyWordsReplace:
    '''
    Keyword replacement on the runs of a paragraph.

    The methods are written to be called through the class with a
    paragraph passed as ``self``, e.g.
    ``ParagraphsKeyWordsReplace.p_replace(paragraph, x, key, value)``.
    ``self`` therefore only needs a ``runs`` sequence whose items have a
    readable and writable ``text``.
    '''

    def __init__(self):
        self.text = None
        self.runs = None

    def p_replace(self, x, key, value):
        '''
        Replace every occurrence of ``key`` in the paragraph with ``value``.

        The text of the paragraph is not replaced directly because that
        would change the original format. Replacing the text in the runs
        does not change the original format.

        :param x: paragraph number (used for the progress message only)
        :param key: keyword to replace
        :param value: text that replaces the keyword
        :return:
        '''
        if not key:
            # An empty keyword "matches" at every index; nothing to replace.
            return

        # Coordinates of every character of the paragraph: which run it is
        # in and at which index inside that run.
        paragraph_positions = [
            {"run": run_index, "char": char_index}
            for run_index, run in enumerate(self.runs)
            for char_index in range(len(run.text))
        ]

        # Search the concatenated run text so that the match indexes line
        # up with paragraph_positions, which is built from the same runs.
        text = "".join(run.text for run in self.runs)

        # Start index of each non-overlapping occurrence, left to right.
        # The text is searched once up front, so a value that contains the
        # key (such as {"ab": "abc"}) cannot cause an endless loop.
        key_indexs = []
        found = text.find(key)
        while found != -1:
            key_indexs.append(found)
            found = text.find(key, found + len(key))

        # Work from the last occurrence to the first so the recorded
        # positions of earlier occurrences remain valid.
        for i, start_i in enumerate(reversed(key_indexs), start=1):
            end_i = start_i + len(key)
            key_maps = paragraph_positions[start_i:end_i]
            # Called through the class because ``self`` is normally a
            # paragraph object, not an instance of this class.
            ParagraphsKeyWordsReplace.c_replace(self, key_maps, value)
            print(f"\tSuccessfully replaced segment {x+1}, object {i}?{key}===>{value}")

    def c_replace(self, key_maps, value):
        '''
        Remove the characters listed in ``key_maps`` and put ``value`` where
        the first of them was.

        :param key_maps: List of index dictionaries ({"run", "char"}) of the
            characters that make up the keyword
        :param value: replaced new word
        :return:
        '''
        if not key_maps:
            return
        first_run, first_char = key_maps[0]["run"], key_maps[0]["char"]

        # Group the character indexes by run so each run is rewritten once.
        chars_by_run = {}
        for position in key_maps:
            chars_by_run.setdefault(position["run"], set()).add(position["char"])

        for run_index, char_indexes in chars_by_run.items():
            run = self.runs[run_index]
            chars = list(run.text)
            # Delete from back to front so the remaining indexes stay valid.
            for z in sorted(char_indexes, reverse=True):
                if run_index == first_run and z == first_char:
                    # Substitute by position. str.replace is not used here
                    # because it would also change every other occurrence
                    # of this character in the run.
                    chars[z] = value
                else:
                    del chars[z]
            run.text = "".join(chars)
class DocxKeyWordsReplace:
    '''
    Keyword replacement for the different parts of a docx document.

    The methods are meant to be called through the class with the document
    passed as ``self``, e.g. ``DocxKeyWordsReplace.content(docx, replace_dict)``.
    They read ``paragraphs``, ``tables`` and ``sections`` from it.
    '''

    def __init__(self):
        self.paragraphs = None
        self.tables = None
        self.sections = None

    @staticmethod
    def _swap_in_paragraphs(paragraphs, key, value):
        # Apply one key/value pair to each paragraph, passing its index along.
        for number, paragraph in enumerate(paragraphs):
            ParagraphsKeyWordsReplace.p_replace(paragraph, number, key, value)

    @staticmethod
    def _swap_in_tables(tables, key, value):
        # Apply one key/value pair to the paragraphs of every cell of every table.
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    DocxKeyWordsReplace._swap_in_paragraphs(cell.paragraphs, key, value)

    def content(self, replace_dict):
        '''Replace the keywords in the body paragraphs.'''
        print("(1)Processing keywords in body text...")
        for key, value in replace_dict.items():
            DocxKeyWordsReplace._swap_in_paragraphs(self.paragraphs, key, value)
        print("\tText keyword replacement completed!")

    def tables(self, replace_dict):
        '''Replace the keywords in the tables of the body.'''
        print("(2)Processing keywords in table...")
        for key, value in replace_dict.items():
            DocxKeyWordsReplace._swap_in_tables(self.tables, key, value)
        print("\tTable keyword replacement completed!")

    def header_content(self, replace_dict):
        '''Replace the keywords in the header paragraphs of every section.'''
        print("(3)Processing keywords in header...")
        for key, value in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap_in_paragraphs(section.header.paragraphs, key, value)
        print("\tContent header keyword replacement completed!")

    def header_tables(self, replace_dict):
        '''Replace the keywords in the header tables of every section.'''
        print("(4)Processing keywords in header table...")
        for key, value in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap_in_tables(section.header.tables, key, value)
        print("\tHeader table keyword replacement completed!")

    def footer_content(self, replace_dict):
        '''Replace the keywords in the footer paragraphs of every section.'''
        print("(6)Processing keywords in footer...")
        for key, value in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap_in_paragraphs(section.footer.paragraphs, key, value)
        print("\tFooter keyword replacement completed!")

    def footer_tables(self, replace_dict):
        '''Replace the keywords in the footer tables of every section.'''
        print("(7)Processing keywords in footer table...")
        for key, value in replace_dict.items():
            for section in self.sections:
                DocxKeyWordsReplace._swap_in_tables(section.footer.tables, key, value)
        print("\tFooter table keyword replacement completed!")
def main():
    '''
    Replace keywords in every docx file found below a directory.

    How to use: modify the values of replace_dict and file_dir.
    replace_dict: each key is the content to be replaced, its value is the new content
    file_dir: the directory where the docx files are stored; subdirectories are included
    '''
    # input section
    replace_dict = {
        "MG life technology (shenzhen) co., LTD": "Shenzhen YW medical technology co., LTD",
        "MG-": "YW-",
        "2017-": "2020-",
        "Z18": "Z20",
    }
    file_dir = r"E:\docxfiles"

    # The document parts are processed in this order for every file.
    steps = (
        DocxKeyWordsReplace.content,
        DocxKeyWordsReplace.tables,
        DocxKeyWordsReplace.header_content,
        DocxKeyWordsReplace.header_tables,
        DocxKeyWordsReplace.footer_content,
        DocxKeyWordsReplace.footer_tables,
    )

    # call processing part
    for number, path in enumerate(get_docx_list(file_dir), start=1):
        print(f"{number}?file being processed:{path}")
        document = Document(path)
        for step in steps:
            step(document, replace_dict=replace_dict)
        # The file is overwritten in place.
        document.save(path)
        print(f'"{path}"Document processing complete!\n')


if __name__ == "__main__":
    main()
    print("All complete processing!")
Solution 5:[5]
https://gist.github.com/heimoshuiyu/671a4dfbd13f7c279e85224a5b6726c0
This uses a "shuttle" (a sliding window of runs) so it can find a key that crosses multiple runs. This is similar to the "Replace All" behavior in MS Word.
def shuttle_text(shuttle):
    """Return the concatenated ``text`` of all runs in ``shuttle``.

    :param shuttle: iterable of objects that have a ``text`` string
    :return: the texts joined in order; '' for an empty ``shuttle``
    """
    return ''.join(run.text for run in shuttle)
def _replace_key_in_runs(runs, key, replacement):
    """Replace every occurrence of ``key`` in the concatenated run text.

    ``key`` may cross several runs. The replacement is written into the run
    holding the first matched character, the matched part of every
    following run is removed, and text outside the match stays in its run.

    :param runs: sequence of objects with a readable/writable ``text``
    :param key: text to search for
    :param replacement: text that takes its place
    """
    if not key:
        return
    search_from = 0
    while True:
        texts = [run.text for run in runs]
        begin = ''.join(texts).find(key, search_from)
        if begin < 0:
            return
        end = begin + len(key)
        offset = 0
        is_first = True
        for run, text in zip(runs, texts):
            run_begin, run_end = offset, offset + len(text)
            offset = run_end
            if run_end <= begin or run_begin >= end:
                continue  # no part of the match lies in this run
            lo = max(begin, run_begin) - run_begin
            hi = min(end, run_end) - run_begin
            updated = text[:lo] + (replacement if is_first else '') + text[hi:]
            is_first = False
            # Leave runs alone when their text would not change.
            if updated != text:
                run.text = updated
        # Resume after the inserted text so a replacement that contains
        # ``key`` is not replaced again.
        search_from = begin + len(replacement)


def docx_replace(doc, data):
    """Replace each key of ``data`` with its value throughout the document.

    Both the body paragraphs and the paragraphs inside table cells are
    handled run by run, so the formatting of the runs is kept. A key may
    cross multiple runs, similar to "Replace All" in MS Word.

    :param doc: document exposing ``paragraphs`` and ``tables``
    :param data: mapping of search text to replacement text
    """
    for key in data:
        replacement = data[key]

        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    # Work on the cell's runs instead of assigning cell.text,
                    # so text in tables keeps its formatting as well.
                    for paragraph in cell.paragraphs:
                        _replace_key_in_runs(paragraph.runs, key, replacement)

        for paragraph in doc.paragraphs:
            _replace_key_in_runs(paragraph.runs, key, replacement)
# usage
import docx

# Load the template, replace the keys and save the result under a new name.
doc = docx.Document('path/to/template.docx')
docx_replace(doc, dict(ItemOne='replacement text', ItemTwo="Some replacement text\nand some more"))
doc.save('path/to/destination.docx')
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Alo |
| Solution 2 | |
| Solution 3 | Jothan Kelepolo |
| Solution 4 | |
| Solution 5 | |
