Surya OCR

环境准备

版本:

Python 3.9 + surya-ocr 0.4.15

模型准备:

检测模型:surya_det3

识别模型:surya_rec

版面模型:surya_layout3

源码修改

首次使用时 surya 会自动联网下载模型,但下载地址被墙无法访问,因此需提前将模型下载到本地模型文件夹,并修改源码中的模型路径配置:

(源码位置:...Python39/Lib/site-packages/surya/settings.py)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from typing import Dict, Optional

from dotenv import find_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os


class Settings(BaseSettings):
    """Configuration values for surya, with model checkpoints set to local folders.

    The detection, recognition and layout checkpoints below are absolute local
    paths instead of hub identifiers, so those models are loaded from disk.
    ``Config.env_file`` points at the ``local.env`` file located by
    ``find_dotenv``.
    """

    # General
    TORCH_DEVICE: Optional[str] = None
    IMAGE_DPI: int = 96
    IN_STREAMLIT: bool = False  # Whether we're running in streamlit

    # Paths
    DATA_DIR: str = "data"
    RESULT_DIR: str = "results"
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Return the torch device name to run models on.

        An explicitly configured ``TORCH_DEVICE`` wins; otherwise the first
        available backend among CUDA and MPS is used, falling back to CPU.
        """
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    # Text detection
    DETECTOR_BATCH_SIZE: Optional[int] = None  # Defaults to 2 for CPU/MPS, 32 otherwise
    DETECTOR_MODEL_CHECKPOINT: str = r"D:\pycharmproject_2\translate_plat\surya_ocr\models\surya_det3"
    DETECTOR_BENCH_DATASET_NAME: str = "vikp/doclaynet_bench"
    DETECTOR_IMAGE_CHUNK_HEIGHT: int = 1400  # Height at which to slice images vertically
    DETECTOR_TEXT_THRESHOLD: float = 0.6  # Threshold for text detection (above this is considered text)
    DETECTOR_BLANK_THRESHOLD: float = 0.35  # Threshold for blank space (below this is considered blank)
    # Number of workers for postprocessing. os.cpu_count() may return None when
    # the CPU count cannot be determined, so fall back to a single worker.
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = min(8, os.cpu_count() or 1)
    DETECTOR_MIN_PARALLEL_THRESH: int = 3  # Minimum number of images before we parallelize

    # Text recognition
    RECOGNITION_MODEL_CHECKPOINT: str = r"D:\pycharmproject_2\translate_plat\surya_ocr\models\surya_rec"
    RECOGNITION_MAX_TOKENS: int = 175
    RECOGNITION_BATCH_SIZE: Optional[int] = None  # Defaults to 8 for CPU/MPS, 256 otherwise
    RECOGNITION_IMAGE_SIZE: Dict = {"height": 196, "width": 896}
    RECOGNITION_RENDER_FONTS: Dict[str, str] = {
        "all": os.path.join(FONT_DIR, "GoNotoCurrent-Regular.ttf"),
        "zh": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
        "ja": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
        "ko": os.path.join(FONT_DIR, "GoNotoCJKCore.ttf"),
    }
    RECOGNITION_FONT_DL_BASE: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
    RECOGNITION_BENCH_DATASET_NAME: str = "vikp/rec_bench"
    RECOGNITION_PAD_VALUE: int = 255  # Should be 0 or 255
    RECOGNITION_STATIC_CACHE: bool = False  # Static cache for torch compile
    RECOGNITION_MAX_LANGS: int = 4

    # Layout
    LAYOUT_MODEL_CHECKPOINT: str = r"D:\pycharmproject_2\translate_plat\surya_ocr\models\surya_layout3"
    LAYOUT_BENCH_DATASET_NAME: str = "vikp/publaynet_bench"

    # Ordering
    ORDER_MODEL_CHECKPOINT: str = "vikp/surya_order"
    ORDER_IMAGE_SIZE: Dict = {"height": 1024, "width": 1024}
    ORDER_MAX_BOXES: int = 256
    ORDER_BATCH_SIZE: Optional[int] = None  # Defaults to 4 for CPU/MPS, 32 otherwise
    ORDER_BENCH_DATASET_NAME: str = "vikp/order_bench"

    # Tesseract (for benchmarks only)
    TESSDATA_PREFIX: Optional[str] = None

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        """Return float32 when running on CPU and float16 on any other device."""
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"


# Module-level instance imported by the rest of the package.
settings = Settings()

使用

小语种OCR识别

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import cv2
from PIL import Image
from surya.detection import batch_text_detection
from surya.layout import batch_layout_detection
from surya.model.detection.model import load_model, load_processor
from surya.settings import settings

# Inputs for the layout-analysis demo.
IMAGE_PATH = './test/7.png' ## Path of the image to analyze
# NOTE(review): the two model paths below are not referenced anywhere in the
# visible script; the checkpoints actually used come from `settings`.
DET_MODEL_PATH = './models/surya_det3' ## Path where the detection model weights are stored
LAYOUT_MODEL_PATH = './models/surya_layout3' ## Path where the layout model weights are stored

# PIL image handed to the surya detection/layout calls below.
image = Image.open(IMAGE_PATH)
# The layout model and its processor are loaded from the layout checkpoint
# configured in settings.
surya_layout_model = load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
surya_processor = load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
# Called without a checkpoint, so the library default is used -- presumably
# settings.DETECTOR_MODEL_CHECKPOINT; verify against surya's load_model.
surya_det_model = load_model()
surya_det_processor = load_processor()

# Text lines are detected first and then passed to the layout detection.
# NOTE(review): the original comment called layout_predictions "a list of
# dicts, one per image", but the code below reads layout_predictions[0].bboxes,
# i.e. each element is accessed as an object with a `bboxes` attribute.
line_predictions = batch_text_detection([image], surya_det_model, surya_det_processor)
layout_predictions = batch_layout_detection([image], surya_layout_model, surya_processor, line_predictions)

# Re-read the same file with OpenCV: from here on `image` is the value returned
# by cv2.imread (replacing the PIL image) and is sliced per region below.
image = cv2.imread(IMAGE_PATH)


def surya_layout2paddle_structure(surya_layout_res, im):
    """Convert the surya layout result of the first image into PaddleOCR-structure-style dicts.

    Args:
        surya_layout_res: Sequence of per-image layout results. Only element 0
            is read; it must expose ``bboxes``, whose items convert with
            ``dict(item)`` into a mapping holding ``'polygon'`` (a list of
            ``[x, y]`` points) and ``'label'``.
        im: Image array that supports ``im[y0:y1, x0:x1]`` slicing, such as
            the value returned by ``cv2.imread``.

    Returns:
        A list with one dict per layout box containing ``'bbox'``
        (``[x0, y0, x1, y1]`` taken from polygon points 0 and 2, values left
        as provided), ``'type'`` (the box label) and ``'img'`` (the matching
        crop of ``im``). An empty list is returned when ``surya_layout_res``
        is empty.
    """
    if not surya_layout_res:
        return []

    paddle_structure_res = []
    for item in surya_layout_res[0].bboxes:
        box = dict(item)
        first_point, third_point = box['polygon'][0], box['polygon'][2]
        x0, y0 = first_point[0], first_point[1]
        x1, y1 = third_point[0], third_point[1]

        # Slice bounds must be integers (float coordinates raise TypeError on
        # array slicing) and non-negative (a negative bound would count from
        # the end of the array and produce the wrong crop).
        top, bottom = max(0, int(y0)), max(0, int(y1))
        left, right = max(0, int(x0)), max(0, int(x1))

        paddle_structure_res.append({
            'bbox': [x0, y0, x1, y1],
            'type': box['label'],
            'img': im[top:bottom, left:right],
        })
    return paddle_structure_res


# Print the layout regions converted to PaddleOCR-structure-style dicts.
print(surya_layout2paddle_structure(layout_predictions, image))

# Print the layout boxes of the first image, each converted to a plain dict.
res = [dict(layout_box) for layout_box in layout_predictions[0].bboxes]
print(res)

识别结果(注:以下为文本检测 + 文本识别流程的输出,日志中包含 recognition 模型的加载与 OCRResult,并非上方版面分析脚本的直接输出):

1
2
3
4
5
6
7
8
9
10
Loaded detection model ./models/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model ./models/surya_rec on device cpu with dtype torch.float32
ok
Detecting bboxes: 100%|██████████| 1/1 [00:05<00:00, 5.15s/it]
Recognizing Text: 100%|██████████| 1/1 [00:18<00:00, 18.12s/it]

[OCRResult(text_lines=[TextLine(polygon=[[65.0, 12.0], [424.0, 11.0], [425.0, 49.0], [66.0, 50.0]], confidence=0.8231586813926697, text='Đ Phong Cách', bbox=[65.0, 12.0, 424.0, 49.0]), TextLine(polygon=[[146.0, 60.0], [345.0, 60.0], [345.0, 95.0], [146.0, 95.0]], confidence=0.9834306836128235, text='Khác Biêt', bbox=[146.0, 60.0, 345.0, 95.0]), TextLine(polygon=[[9.0, 114.0], [482.0, 114.0], [482.0, 126.0], [9.0, 126.0]], confidence=0.9684180021286011, text='Trên tay ché tác nquyên khői đẫn đầu xu hướng với thiết kế thân máy lièn mach, đô mòng ấn tương 8.5mm cùng', bbox=[9.0, 114.0, 482.0, 126.0]), TextLine(polygon=[[0.0, 132.0], [490.0, 132.0], [490.0, 143.0], [0.0, 143.0]], confidence=0.9850525856018066, text='kiểu dáng măt kính bóng mươt, sang trong từ Galaxy M30. Vừa văn hoàn hảo trong lòng bàn tay, thoả thích thể hiện', bbox=[0.0, 132.0, 490.0, 143.0]), TextLine(polygon=[[95.0, 149.0], [393.0, 149.0], [394.0, 160.0], [95.0, 160.0]], confidence=0.9769484400749207, text='phong cach thời thượng với hai phiên bản mảu Đen hoāc Xanh cá tính.', bbox=[95.0, 149.0, 393.0, 160.0])], languages=['en'], image_bbox=[0.0, 0.0, 494.0, 182.0])]
['Đ Phong Cách', 'Khác Biêt', 'Trên tay ché tác nquyên khői đẫn đầu xu hướng với thiết kế thân máy lièn mach, đô mòng ấn tương 8.5mm cùng', 'kiểu dáng măt kính bóng mươt, sang trong từ Galaxy M30. Vừa văn hoàn hảo trong lòng bàn tay, thoả thích thể hiện', 'phong cach thời thượng với hai phiên bản mảu Đen hoāc Xanh cá tính.'] 5
[[[65.0, 12.0], [424.0, 11.0], [425.0, 49.0], [66.0, 50.0]], [[146.0, 60.0], [345.0, 60.0], [345.0, 95.0], [146.0, 95.0]], [[9.0, 114.0], [482.0, 114.0], [482.0, 126.0], [9.0, 126.0]], [[0.0, 132.0], [490.0, 132.0], [490.0, 143.0], [0.0, 143.0]], [[95.0, 149.0], [393.0, 149.0], [394.0, 160.0], [95.0, 160.0]]] 5
[0.8231586813926697, 0.9834306836128235, 0.9684180021286011, 0.9850525856018066, 0.9769484400749207] 5

OCR版面分析

未完待续

Powered by Hexo and Hexo-theme-hiker

Copyright © 2017 - 2024 青域 All Rights Reserved.

UV : | PV :