OCRmyPDF Tutorial: Convert Scanned Documents into Searchable PDF/A Files with Sidecar Text Extraction and Batch Processing

def _purge(*prefixes): for the identify of [m for m in list(sys.modules)
if any(m == p or m.startswith(p + “.”) for p in prefixes)]: del sys.modules[name]
def _load_ocrmypdf():
    """Import ``ocrmypdf`` afresh and return the module.

    Cached ``PIL`` and ``ocrmypdf`` modules are purged first so that a Pillow
    build reinstalled during this session is picked up by the import.
    """
    _purge("PIL", "ocrmypdf")
    import ocrmypdf
    return ocrmypdf


# Import ocrmypdf, repairing an incompatible Pillow in place when possible.
try:
    ocrmypdf = _load_ocrmypdf()
except ImportError as e:
    if "_Ink" in str(e) or "PIL" in str(e):
        print("Repairing incompatible pillow (reinstalling pillow<12)...")
        sh(f'"{sys.executable}" -m pip install -q --force-reinstall "pillow<12"')
        try:
            ocrmypdf = _load_ocrmypdf()
            print("Pillow repaired — continuing without a restart.")
        except Exception as err:
            raise RuntimeError(
                "Pillow is still incompatible in this session. Use the Colab menu: "
                "Runtime > Restart session, then run this cell once more."
            ) from err
    else:
        # Not a Pillow problem: surface the original ImportError unchanged.
        raise

from ocrmypdf.exceptions import (
    ExitCode,
    PriorOcrFoundError,
    EncryptedPdfError,
    MissingDependencyError,
    TaggedPDFError,
    DigitalSignatureError,
    DpiError,
    InputFileError,
    UnsupportedImageFormatError,
)
from ocrmypdf.helpers import check_pdf
from ocrmypdf.pdfa import file_claims_pdfa
import img2pdf
from PIL import Image, ImageDraw, ImageFont, ImageFilter

# Keep the notebook output quiet: warnings and above only, and errors only
# for pdfminer.
logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("PIL").setLevel(logging.WARNING)

# One string per synthetic page; each is rendered to an image and OCR'd.
SAMPLE_TEXT_PAGES = [
    "Optical Character Recognition, commonly abbreviated as OCR, is the "
    "process of converting images of typed or printed text into machine "
    "encoded text. This page was generated as a synthetic scan so that the "
    "OCRmyPDF pipeline has something realistic to recognize and search.",
    "On 14 March 2026 the archive contained 1,482 pages across 37 folders. "
    "Roughly 92 percent of those pages were scanned at 200 to 300 dots per "
    "inch. The remaining 8 percent were skewed and required deskewing before "
    "any reliable recognition was possible.",
    "After OCRmyPDF finishes, the output is a searchable PDF/A file. You can "
    "select text, copy it, and run full text search across thousands of "
    "documents. The original image resolution is preserved while a hidden "
    "text layer is placed accurately underneath the page image.",
]
def _find_font():
    """Return the path of the first candidate TrueType font that exists, else None."""
    for cand in (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    ):
        if os.path.exists(cand):
            return cand
    return None


_FONT_PATH = _find_font()
# Fall back to Pillow's built-in font when no candidate font file exists.
FONT = ImageFont.truetype(_FONT_PATH, 40) if _FONT_PATH else ImageFont.load_default()


def _add_speckle(img, n=6000, darkish=60):
    """Sprinkle random dark speckles on *img* to imitate scanner noise.

    Args:
        img: Image to modify in place.
        n: Number of pixels to overwrite (positions may repeat).
        darkish: Upper bound (inclusive) of the random value written.

    Returns:
        The same image object, for call chaining.
    """
    import random

    px = img.load()
    w, h = img.size
    for _ in range(n):
        px[random.randint(0, w - 1), random.randint(0, h - 1)] = random.randint(0, darkish)
    return img


def render_page(text, skew=False):
    """Render one A4 page (1654×2339 px ≈ 200 DPI) with dark text on white.

    Args:
        text: Page text; it is wrapped to 58 columns before drawing.
        skew: When true, rotate the page by 6 degrees to simulate a crooked scan.

    Returns:
        A grayscale ("L" mode) image, lightly blurred and speckled.
    """
    W, H = 1654, 2339
    img = Image.new("L", (W, H), 255)
    draw = ImageDraw.Draw(img)
    draw.multiline_text(
        (150, 180),
        textwrap.fill(text, width=58),
        fill=25,
        font=FONT,
        spacing=18,
    )
    if skew:
        # expand=False keeps the page size; corners exposed by the rotation
        # are filled with white.
        img = img.rotate(6, resample=Image.BICUBIC, expand=False, fillcolor=255)
    img = img.filter(ImageFilter.GaussianBlur(0.6))
    img = _add_speckle(img)
    return img


def build_scanned_pdf(pdf_path: Path, pages_text, skew_index=1):
    """Render each page to PNG and wrap the PNGs into an image-only PDF.

    Args:
        pdf_path: Destination PDF; temporary PNGs are written next to it.
        pages_text: Iterable of page texts, one per page.
        skew_index: Zero-based index of the page to render skewed.

    Returns:
        ``pdf_path``, after the PDF has been written and the PNGs removed.
    """
    pngs = []
    for i, text in enumerate(pages_text):
        img = render_page(text, skew=(i == skew_index))
        p = pdf_path.parent / f"_pg_{pdf_path.stem}_{i}.png"
        img.save(p, format="PNG", dpi=(200, 200))
        pngs.append(str(p))
    with open(pdf_path, "wb") as f:
        f.write(img2pdf.convert(pngs))
    for p in pngs:
        os.remove(p)
    return pdf_path


def do_ocr(input_file, output_file, **kw):
    """Call ``ocrmypdf.ocr`` with the progress bar off by default and time it.

    Args:
        input_file: Input path passed through to ``ocrmypdf.ocr``.
        output_file: Output path passed through to ``ocrmypdf.ocr``.
        **kw: Extra keyword options forwarded unchanged.

    Returns:
        A ``(return_code, elapsed_seconds)`` tuple.
    """
    kw.setdefault("progress_bar", False)
    t0 = time.perf_counter()
    rc = ocrmypdf.ocr(input_file, output_file, **kw)
    return rc, time.perf_counter() - t0


def tokens(s: str):
    """Return the lowercase runs of ``a-z`` / ``0-9`` characters found in *s*."""
    return re.findall(r"[a-z0-9]+", s.lower())


def kb(path) -> str:
    """Return the size of the file at *path* formatted as ``"1,234.5 KB"``."""
    return f"{Path(path).stat().st_size / 1024:,.1f} KB"


def banner(title: str):
    """Print *title* framed above and below by a 74-character rule."""
    line = "─" * 74
    print(f"\n{line}\n {title}\n{line}")


# The definition in this file is spelled ``Banner``; keep that name callable.
Banner = banner

OCRmyPDF Tutorial: Convert Scanned Documents into Searchable PDF/A Files with Sidecar Text Extraction and Batch Processing

Leave a Reply Cancel reply

Follow US

Popular News

Hayley Williams Joined Jack Antonoff’s Ally Coalition Show

15 Best Tablets (2025), Tested and Reviewed

9 Must Try Grocery Store Treats To Snack On

How Working in Retirement May Affect Your Social Security and Medicare

Petit Planet preview: HoYoverse’s first foray into family-friendly games is very careful to colour within the lines

Categories

About US

Quick Links

Important Links

Subscribe US