pdfgen/check_pdf_fonts.py
2025-08-16 07:28:01 +00:00

50 lines
2.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz # PyMuPDF
import sys
def _summarize_line(line):
    """Collapse the spans of one text line into a single summary.

    Args:
        line: A line dict whose ``"spans"`` list holds span dicts with the
            keys ``"text"``, ``"font"``, ``"size"`` and ``"bbox"``.

    Returns:
        A ``(text, fonts, size, bbox)`` tuple where ``text`` is the
        concatenated span text, ``fonts`` lists the distinct font names in
        order of first appearance, ``size`` is the size of the last span
        (``None`` when there are no spans) and ``bbox`` is the
        ``(x0, y0, x1, y1)`` union of all span boxes (``None`` when there
        are no spans).
    """
    spans = line["spans"]
    text = "".join(span["text"] for span in spans)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # printed font list is stable between runs (a plain set is not).
    fonts = list(dict.fromkeys(span["font"] for span in spans))
    if not spans:
        return text, fonts, None, None
    # Only the last span's size is reported for the whole line.
    size = spans[-1]["size"]
    boxes = [span["bbox"] for span in spans]
    # Grow the box so that it encloses every span of the line.
    bbox = (
        min(box[0] for box in boxes),  # x0
        min(box[1] for box in boxes),  # y0
        max(box[2] for box in boxes),  # x1
        max(box[3] for box in boxes),  # y1
    )
    return text, fonts, size, bbox


def extract_text_info(pdf_path):
    """Print text, fonts, font size and bounding box for each line of a PDF.

    For every page a ``Page N`` header is printed, followed by one
    ``Line: ... | Fonts: ... | Size: ... | BBox: (...)`` row per text line
    that is not empty after stripping whitespace.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        None. If the file cannot be opened, an error message is printed and
        the function returns without raising.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Ошибка при открытии файла: {e}")
        return
    try:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("dict")["blocks"]
            print(f"Page {page_num}")
            for block in blocks:
                # Blocks without a "lines" entry carry no text to report.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    line_text, line_fonts, font_size, bbox = _summarize_line(line)
                    if line_text.strip():
                        print(f"Line: {line_text.strip()} | Fonts: {', '.join(line_fonts)} | Size: {font_size} | BBox: ({bbox[0]:.2f}, {bbox[1]:.2f}, {bbox[2]:.2f}, {bbox[3]:.2f})")
    finally:
        # Release the document even when extraction fails part-way through.
        doc.close()
def main(argv=None):
    """Command-line entry point: dump font information for one PDF.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv`` when omitted.

    Returns:
        The process exit status: 1 when the argument count is wrong (after
        printing the usage message), otherwise 0.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        print("Использование: python3 pdf_fonts.py <путь_к_pdf>")
        return 1
    extract_text_info(args[1])
    return 0


# Run only when executed as a script, so importing this module has no side
# effects such as parsing the importer's argv or exiting the interpreter.
if __name__ == "__main__":
    sys.exit(main())