Extract specific pages of a PDF and save them with Python
I have some source PDFs and I am trying to write code that extracts certain pages from them and saves those pages as new PDF files. I have a list that looks like this:
information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].
This is my code.
from PyPDF2 import PdfFileReader, PdfFileWriter
# Open the one source PDF that every output file below is built from.
reader = PdfFileReader("example.pdf")
# NOTE(review): the loop index is derived from the PDF's page count (minus
# one) but is then used to index `information` -- confirm this is intended
# rather than iterating over the entries of `information`.
for page in range(reader.getNumPages() - 1):
    # A fresh writer per entry, so each output file holds only its own pages.
    writer = PdfFileWriter()
    start = information[page][1]
    end = information[page][2]
    # Adds pages start .. end-1: the page at index `end` is never added, and
    # nothing is added at all when start == end.
    # NOTE(review): `start` is passed to getPage() unchanged -- confirm
    # whether the page numbers in `information` are 0-based or 1-based.
    while start < end:
        writer.addPage(reader.getPage(start))
        start += 1
    # Output name is built from the entry's filename, start page and end page.
    output_filename = "{}_{}_page_{}.pdf".format(
        information[page][0], information[page][1], information[page][2]
    )
    with open(output_filename, "wb") as out:
        writer.write(out)
But the output is weird: some files have nothing inside and some have just one page in them. How can I correct this?
Solution 1:[1]
Here is the full code. I modified SSS's answer to be portable, flexible, and able to process multiple source PDFs concurrently. I couldn't test the performance difference between ThreadPoolExecutor and ProcessPoolExecutor, but I assumed that the extraction process is bound by the reading and writing of PDFs rather than by getPage and addPage.
import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_extract(pdf, segments):
    """Write each requested page range of ``pdf`` to its own PDF file.

    Every segment produces a file named ``<stem>_pages_<start>-<end><suffix>``
    next to the source file (same directory, same extension).

    pdf: str | Path
        Path of the source PDF.
    segments: [(start, end), {'start': int, 'end': int}]
        Page ranges to extract.  Each item is either a ``(start, end)`` pair
        or a mapping with ``'start'`` and ``'end'`` keys.  ``start`` is
        decremented before being handed to ``getPage`` and ``end`` is
        inclusive, so ``(3, 5)`` extracts three pages.
    """
    # Loop-invariant: the source path is only needed to derive output names.
    source = Path(pdf)
    with open(source, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                # Tuples/lists reject string indices; unpack them positionally.
                start_page, end_page = segment

            pdf_writer = PdfFileWriter()
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))

            # with_name() replaces only the final path component, so the
            # output lands beside the source.  Joining source.parent onto a
            # path that already contains the parent would duplicate the
            # directory for relative inputs (docs/docs/long_pages_1-1.pdf).
            output = source.with_name(
                f'{source.stem}_pages_{start_page}-{end_page}{source.suffix}'
            )
            with open(output, 'wb') as out:
                pdf_writer.write(out)
def __pdf_extract(pair):
    """Expand one ``(pdf, segments)`` item into a ``pdf_extract`` call.

    Lets a single-argument mapping function drive ``pdf_extract``, which
    takes two positional arguments.
    """
    outcome = pdf_extract(*pair)
    return outcome
def pdf_extract_batch(pdfs, workers=20):
    """Run ``pdf_extract`` for several source PDFs on a thread pool.

    pdfs = {pdf_name: [(1, 1), ...], ...}
        Mapping of source PDF path to the segments to extract from it.
    workers: int
        Upper bound on the number of threads; no more threads are started
        than there are PDFs to process.

    Returns None.  If an extraction fails, its exception is re-raised here
    (the first failure in mapping order) instead of being dropped.
    """
    if not pdfs:
        # Nothing to do; avoid spinning up a pool for an empty batch.
        return

    pool_size = min(workers, len(pdfs))
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
        # Executor.map() only raises a worker's exception when its result is
        # retrieved, so the results must be consumed for failures to surface.
        for _ in executor.map(__pdf_extract, pdfs.items()):
            pass
if __name__ == '__main__':
    # freeze_support() only has an effect in frozen Windows executables that
    # use multiprocessing; it is a no-op otherwise.
    freeze_support()

    # Demo inputs: one source file and two segments, one in each supported
    # form (tuple and dict).
    pdf_name = r'C:\Users\maste\Documents\long.pdf'
    segments = [(1, 1), {'start': 3, 'end': 5}]
    pdfs = {pdf_name: segments}

    # Single
    pdf_extract(pdf_name, segments)

    # Batched (Concurrent) -- uncomment to run the same job through the pool:
    # pdf_extract_batch(pdfs)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Stack Overflow |
