import cv2 as cv
import numpy as np
import gradio as gr
from huggingface_hub import hf_hub_download
from ppocr_det import PPOCRDet
from crnn import CRNN
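# PPOCRDet and CRNN are local wrapper modules (as in the OpenCV Zoo demos,
# ppocr_det.py and crnn.py), expected to sit alongside this script.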

# Download model files from Hugging Face
det_model_path = hf_hub_download(
    repo_id="opencv/text_detection_ppocr",
    filename="text_detection_en_ppocrv3_2023may.onnx"
)
rec_model_path = hf_hub_download(
    repo_id="opencv/text_recognition_crnn",
    filename="text_recognition_CRNN_EN_2021sep.onnx"
)

# DNN backend and target
backend_id = cv.dnn.DNN_BACKEND_OPENCV
target_id = cv.dnn.DNN_TARGET_CPU
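# A hypothetical GPU setup (requires an OpenCV build with CUDA support)
# would swap in these constants instead:
#   backend_id = cv.dnn.DNN_BACKEND_CUDA
#   target_id = cv.dnn.DNN_TARGET_CUDA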

# Detector and recognizer setup
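# DB postprocessing parameters: binaryThreshold thresholds the probability
# map, polygonThreshold filters candidate polygons by score, and unclipRatio
# controls how far each detected region is expanded.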
detector = PPOCRDet(
    modelPath=det_model_path,
    inputSize=[736, 736],
    binaryThreshold=0.3,
    polygonThreshold=0.5,
    maxCandidates=200,
    unclipRatio=2.0,
    backendId=backend_id,
    targetId=target_id
)

recognizer = CRNN(
    modelPath=rec_model_path,
    backendId=backend_id,
    targetId=target_id
)

def detect_and_recognize(input_image):
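    """Run PPOCR detection + CRNN recognition on an RGB image from Gradio.

    Returns the original image with detected boxes (left) concatenated with
    a white board carrying the recognized text (right).
    """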
    # Guard against "Submit" with no image loaded
    if input_image is None:
        return None

    bgr = cv.cvtColor(input_image, cv.COLOR_RGB2BGR)
    h_orig, w_orig = input_image.shape[:2]
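    # The detector runs at its fixed 736x736 input size; keep the scale
    # factors so detected boxes can be mapped back to original coordinates.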
    resized = cv.resize(bgr, (736, 736))
    scale_w = w_orig / 736
    scale_h = h_orig / 736

    # Detect & recognize
    det_results, _ = detector.infer(resized)
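    # Each detected quad is flattened to (x1, y1, ..., x4, y4); the CRNN
    # wrapper crops and rectifies that region before recognition.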
    texts = [recognizer.infer(resized, box.reshape(8)) for box in det_results]

    # Prepare side-by-side canvases: original image on the left,
    # white board for the recognized text on the right
    left = input_image.copy()
    right = np.full_like(input_image, 255)

    for box_raw, text in zip(det_results, texts):
        # Rescale box to original image coords
        box = np.int32([[pt[0] * scale_w, pt[1] * scale_h] for pt in box_raw])

        # Compute box dimensions
        xs = box[:, 0]
        box_w = xs.max() - xs.min()
        # Box height: average the two vertical edges of the quad
        h1 = np.linalg.norm(box[1] - box[0])
        h2 = np.linalg.norm(box[2] - box[3])
        box_h = (h1 + h2) / 2.0

        # Initial font scale so text height ≈ 80% of box height
        (_, th0), _ = cv.getTextSize(text, cv.FONT_HERSHEY_SIMPLEX, 1.0, 1)
        font_scale = (box_h * 0.8) / th0 if th0 > 0 else 1.0
        font_thickness = max(1, int(font_scale))

        # Re-measure text size with this scale
        (tw, th), _ = cv.getTextSize(text, cv.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)

        # If the text is wider or taller than the box, scale down to fit
        scale_x = box_w / tw if tw > 0 else 1.0
        scale_y = (box_h * 0.8) / th if th > 0 else 1.0
        font_scale *= min(1.0, scale_x, scale_y)
        font_thickness = max(1, int(np.floor(font_scale)))

        # Draw boxes on both panels; the canvases are RGB (Gradio convention),
        # so red is (255, 0, 0) rather than OpenCV's BGR (0, 0, 255)
        cv.polylines(left, [box], isClosed=True, color=(255, 0, 0), thickness=2)
        cv.polylines(right, [box], isClosed=True, color=(255, 0, 0), thickness=2)

        # Draw the recognized text on the whiteboard, anchored just above
        # the box's first corner
        x0, y0 = box[0]
        y_text = max(0, int(y0 - 5))
        cv.putText(
            right, text, (int(x0), y_text),
            cv.FONT_HERSHEY_SIMPLEX,
            font_scale, (0, 0, 0), font_thickness
        )

    combined = cv.hconcat([left, right])
    return combined

with gr.Blocks(title="Scene Text Detection + Recognition (PPOCR + CRNN)") as demo:
    gr.Markdown("## Scene Text Detection and Recognition (PPOCR + CRNN)")
    gr.Markdown("Upload an image with scene text to detect text regions and recognize text using OpenCV DNN with PPOCR + CRNN models.")

    input_img = gr.Image(type="numpy", label="Upload Image")
    output_img = gr.Image(type="numpy", label="Detected Text Image")

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")

    submit_btn.click(
        fn=detect_and_recognize,
        inputs=input_img,
        outputs=output_img
    )

    clear_btn.click(
        fn=lambda: (None, None),
        inputs=[],
        outputs=[input_img, output_img]
    )

    gr.Markdown("**Note**: Left side of output shows detected regions, right side shows recognized text.")

if __name__ == "__main__":
    demo.launch()