Surya OCR

2024-07-08 PV:

环境准备

版本：

Python 3.9 + surya-ocr 0.4.15

模型准备：

检测模型：surya_det3

识别模型：surya_rec

版面模型：surya_layout3

源码修改

首次使用时自动下载模型会因网络被墙而失败，因此需提前将模型下载并放入本地模型文件夹，再修改源码中的模型路径（checkpoint）配置：

(源码位置：...Python39/Lib/site-packages/surya/settings.py)

from typing import Dict, Optional

from dotenv import find_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os


class Settings(BaseSettings):
    """Settings for surya, patched so model checkpoints point at local directories.

    The class is a pydantic-settings ``BaseSettings``; ``Config`` points it at a
    ``local.env`` file located with ``find_dotenv`` and sets ``extra = "ignore"``.
    """

    # General
    TORCH_DEVICE: Optional[str] = None
    IMAGE_DPI: int = 96
    IN_STREAMLIT: bool = False # Whether we're running in streamlit

    # Paths
    DATA_DIR: str = "data"
    RESULT_DIR: str = "results"
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Return the torch device name to use.

        An explicitly configured ``TORCH_DEVICE`` wins; otherwise "cuda" is
        preferred over "mps", with "cpu" as the fallback.
        """
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    # Text detection
    DETECTOR_BATCH_SIZE: Optional[int] = None # Defaults to 2 for CPU/MPS, 32 otherwise
    # Local directory holding the detection model (patched from the stock value).
    DETECTOR_MODEL_CHECKPOINT: str = r"D:\pycharmproject_2\translate_plat\surya_ocr\models\surya_det3"
    DETECTOR_BENCH_DATASET_NAME: str = "vikp/doclaynet_bench"
    DETECTOR_IMAGE_CHUNK_HEIGHT: int = 1400 # Height at which to slice images vertically
    DETECTOR_TEXT_THRESHOLD: float = 0.6 # Threshold for text detection (above this is considered text)
    DETECTOR_BLANK_THRESHOLD: float = 0.35 # Threshold for blank space (below this is considered blank)
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker instead of failing in min().
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = min(8, os.cpu_count() or 1) # Number of workers for postprocessing
    DETECTOR_MIN_PARALLEL_THRESH: int = 3 # Minimum number of images before we parallelize

    # Text recognition
    # Local directory holding the recognition model (patched from the stock value).
    RECOGNITION_MODEL_CHECKPOINT: str = r"D:\pycharmproject_2\translate_plat\surya_ocr\models\surya_rec"
    RECOGNITION_MAX_TOKENS: int = 175
    RECOGNITION_BATCH_SIZE: Optional[int] = None # Defaults to 8 for CPU/MPS, 256 otherwise
    RECOGNITION_IMAGE_SIZE: Dict = {"height": 196, "width": 896}
    # Font file used per language key; "all" is the non-CJK entry.
    RECOGNITION_RENDER_FONTS: Dict[str, str] = {
        "all": os.path.join(FONT_DIR, "GoNotoCurrent-Regular.ttf"),
        "zh": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
        "ja": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
        "ko": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
    }
    RECOGNITION_FONT_DL_BASE: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
    RECOGNITION_BENCH_DATASET_NAME: str = "vikp/rec_bench"
    RECOGNITION_PAD_VALUE: int = 255 # Should be 0 or 255
    RECOGNITION_STATIC_CACHE: bool = False # Static cache for torch compile
    RECOGNITION_MAX_LANGS: int = 4

    # Layout
    # Local directory holding the layout model (patched from the stock value).
    LAYOUT_MODEL_CHECKPOINT: str = r"D:\pycharmproject_2\translate_plat\surya_ocr\models\surya_layout3"
    LAYOUT_BENCH_DATASET_NAME: str = "vikp/publaynet_bench"

    # Ordering
    # NOTE(review): unlike the checkpoints above, this one is not a local
    # directory path - confirm whether the ordering model is needed offline.
    ORDER_MODEL_CHECKPOINT: str = "vikp/surya_order"
    ORDER_IMAGE_SIZE: Dict = {"height": 1024, "width": 1024}
    ORDER_MAX_BOXES: int = 256
    ORDER_BATCH_SIZE: Optional[int] = None  # Defaults to 4 for CPU/MPS, 32 otherwise
    ORDER_BENCH_DATASET_NAME: str = "vikp/order_bench"

    # Tesseract (for benchmarks only)
    TESSDATA_PREFIX: Optional[str] = None

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        """Return float32 when running on the CPU and float16 on any other device."""
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        # Settings file located via find_dotenv; keys without a matching field are ignored.
        env_file = find_dotenv("local.env")
        extra = "ignore"


# Module-level Settings instance, created once when this module is imported.
settings = Settings()

使用

小语种OCR识别

import cv2
from PIL import Image
from surya.detection import batch_text_detection
from surya.layout import batch_layout_detection
from surya.model.detection.model import load_model, load_processor
from surya.settings import settings

IMAGE_PATH = './test/7.png'  # Path of the image to analyze
DET_MODEL_PATH = './models/surya_det3'  # Directory where the detection model weights are stored
LAYOUT_MODEL_PATH = './models/surya_layout3'  # Directory where the layout model weights are stored
# NOTE(review): DET_MODEL_PATH and LAYOUT_MODEL_PATH are not referenced by the
# statements below; the layout loader is given settings.LAYOUT_MODEL_CHECKPOINT
# and the detection loader is called without arguments.

image = Image.open(IMAGE_PATH)
# The layout model and its processor are loaded from the same checkpoint setting.
surya_layout_model = load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
surya_processor = load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
# No checkpoint argument here - presumably the loaders fall back to the
# detection checkpoint configured in surya.settings; TODO confirm.
surya_det_model = load_model()
surya_det_processor = load_processor()

# Text lines are detected first and then handed to the layout detector.
# Both calls receive a one-element image list; element 0 of the layout result is used later.
line_predictions = batch_text_detection([image], surya_det_model, surya_det_processor)
layout_predictions = batch_layout_detection([image], surya_layout_model, surya_processor, line_predictions)

# Rebind `image` from the PIL image to the OpenCV read of the same file;
# this is the object that gets sliced when cropping layout regions.
image = cv2.imread(IMAGE_PATH)


def surya_layout2paddle_structure(surya_layout_res, im):
    """Convert the layout boxes of the first image into paddle-structure-style dicts.

    Args:
        surya_layout_res: Sequence of per-image layout results; only element 0
            is read. Each entry of its ``bboxes`` must be convertible with
            ``dict()`` and provide ``'polygon'`` (a list of ``[x, y]`` points)
            and ``'label'``.
        im: Image array indexed as ``im[rows, cols]``.

    Returns:
        A list with one dict per layout box, holding ``'bbox'`` as
        ``[x0, y0, x1, y1]`` taken from polygon points 0 and 2, ``'type'``
        (the box label) and ``'img'`` (the crop of ``im`` for that box).
        An empty list is returned when ``surya_layout_res`` is empty.
    """
    if not surya_layout_res:
        return []

    paddle_structure_res = []
    for item in surya_layout_res[0].bboxes:
        box = dict(item)
        # Assumes polygon point 0 is the top-left corner and point 2 the
        # bottom-right one, as the original indexing did.
        first_corner, opposite_corner = box['polygon'][0], box['polygon'][2]
        x0, y0 = first_corner[0], first_corner[1]
        x1, y1 = opposite_corner[0], opposite_corner[1]

        # Slice bounds must be integers (polygon values may be floats), and a
        # negative bound would wrap around to the other side of the image.
        row_start, row_stop = max(0, int(y0)), max(0, int(y1))
        col_start, col_stop = max(0, int(x0)), max(0, int(x1))

        paddle_structure_res.append({
            'bbox': [x0, y0, x1, y1],
            'type': box['label'],
            'img': im[row_start:row_stop, col_start:col_stop],
        })
    return paddle_structure_res


# Print the converted, paddle-structure-style result for the image.
print(surya_layout2paddle_structure(layout_predictions, image))

# Dump the raw layout boxes of the first image as plain dicts.
res = [dict(box) for box in layout_predictions[0].bboxes]
print(res)

识别结果（注意：以下输出来自文字识别流程，即加载识别模型后得到的 OCRResult，而非上方版面检测脚本的打印结果）：

Loaded detection model ./models/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model ./models/surya_rec on device cpu with dtype torch.float32
ok
Detecting bboxes: 100%|██████████| 1/1 [00:05<00:00,  5.15s/it]
Recognizing Text: 100%|██████████| 1/1 [00:18<00:00, 18.12s/it]

[OCRResult(text_lines=[TextLine(polygon=[[65.0, 12.0], [424.0, 11.0], [425.0, 49.0], [66.0, 50.0]], confidence=0.8231586813926697, text='Đ Phong Cách', bbox=[65.0, 12.0, 424.0, 49.0]), TextLine(polygon=[[146.0, 60.0], [345.0, 60.0], [345.0, 95.0], [146.0, 95.0]], confidence=0.9834306836128235, text='Khác Biêt', bbox=[146.0, 60.0, 345.0, 95.0]), TextLine(polygon=[[9.0, 114.0], [482.0, 114.0], [482.0, 126.0], [9.0, 126.0]], confidence=0.9684180021286011, text='Trên tay ché tác nquyên khői đẫn đầu xu hướng với thiết kế thân máy lièn mach, đô mòng ấn tương 8.5mm cùng', bbox=[9.0, 114.0, 482.0, 126.0]), TextLine(polygon=[[0.0, 132.0], [490.0, 132.0], [490.0, 143.0], [0.0, 143.0]], confidence=0.9850525856018066, text='kiểu dáng măt kính bóng mươt, sang trong từ Galaxy M30. Vừa văn hoàn hảo trong lòng bàn tay, thoả thích thể hiện', bbox=[0.0, 132.0, 490.0, 143.0]), TextLine(polygon=[[95.0, 149.0], [393.0, 149.0], [394.0, 160.0], [95.0, 160.0]], confidence=0.9769484400749207, text='phong cach thời thượng với hai phiên bản mảu Đen hoāc Xanh cá tính.', bbox=[95.0, 149.0, 393.0, 160.0])], languages=['en'], image_bbox=[0.0, 0.0, 494.0, 182.0])]
['Đ Phong Cách', 'Khác Biêt', 'Trên tay ché tác nquyên khői đẫn đầu xu hướng với thiết kế thân máy lièn mach, đô mòng ấn tương 8.5mm cùng', 'kiểu dáng măt kính bóng mươt, sang trong từ Galaxy M30. Vừa văn hoàn hảo trong lòng bàn tay, thoả thích thể hiện', 'phong cach thời thượng với hai phiên bản mảu Đen hoāc Xanh cá tính.'] 5
[[[65.0, 12.0], [424.0, 11.0], [425.0, 49.0], [66.0, 50.0]], [[146.0, 60.0], [345.0, 60.0], [345.0, 95.0], [146.0, 95.0]], [[9.0, 114.0], [482.0, 114.0], [482.0, 126.0], [9.0, 126.0]], [[0.0, 132.0], [490.0, 132.0], [490.0, 143.0], [0.0, 143.0]], [[95.0, 149.0], [393.0, 149.0], [394.0, 160.0], [95.0, 160.0]]] 5
[0.8231586813926697, 0.9834306836128235, 0.9684180021286011, 0.9850525856018066, 0.9769484400749207] 5

OCR版面分析

未完待续