50 lines
2.1 KiB
Python
50 lines
2.1 KiB
Python
import fitz # PyMuPDF
|
||
import sys
|
||
|
||
def extract_text_info(pdf_path):
    """Print text, fonts, font size and bounding box for each line of a PDF.

    The document is opened with ``fitz.open``. For every page a
    ``Page <n>`` header is printed (1-based), followed by one row per
    non-blank text line in the form
    ``Line: <text> | Fonts: <names> | Size: <size> | BBox: (x0, y0, x1, y1)``.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        None. If the document cannot be opened, an error message is printed
        and the function returns without raising.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Ошибка при открытии файла: {e}")
        return

    # try/finally guarantees the document is closed even when extraction
    # of some page raises part-way through.
    try:
        for page_number, page in enumerate(doc, start=1):
            blocks = page.get_text("dict")["blocks"]

            print(f"Page {page_number}")
            for block in blocks:
                # Only blocks that carry a "lines" entry are reported.
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    spans = line["spans"]
                    text = "".join(span["text"] for span in spans).strip()
                    if not text:
                        # Blank line (or a line with no spans): nothing to show.
                        continue

                    # Sorted so that the output is stable between runs;
                    # plain set iteration order is not.
                    fonts = sorted({span["font"] for span in spans})
                    # The reported size is the size of the last span in the line.
                    font_size = spans[-1]["size"]

                    # The line box is the union of all span boxes (x0, y0, x1, y1).
                    x0 = min(span["bbox"][0] for span in spans)
                    y0 = min(span["bbox"][1] for span in spans)
                    x1 = max(span["bbox"][2] for span in spans)
                    y1 = max(span["bbox"][3] for span in spans)

                    print(f"Line: {text} | Fonts: {', '.join(fonts)} | Size: {font_size} | BBox: ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
    finally:
        doc.close()
|
||
|
||
# Command-line entry point: exactly one argument (the PDF path) is required;
# anything else prints the usage message and exits with status 1.
if len(sys.argv) == 2:
    pdf_path = sys.argv[1]
    extract_text_info(pdf_path)
else:
    print("Использование: python3 pdf_fonts.py <путь_к_pdf>")
    sys.exit(1)