codic committed
Commit fd45fbc · verified · 1 parent: c66181c

Update app.py

Files changed (1)
  1. app.py +194 -160
app.py CHANGED
@@ -1,181 +1,215 @@
 
from paddleocr import PaddleOCR
from gliner import GLiNER
from PIL import Image
import gradio as gr
import numpy as np
import logging
import tempfile
import pandas as pd
import re
import traceback
- import zxingcpp

- # --------------------------
- # Configuration & Constants
- # --------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

- COUNTRY_CODES = {
-     'SAUDI': {'code': '+966', 'pattern': r'^(\+9665\d{8}|05\d{8})$'},
-     'UAE': {'code': '+971', 'pattern': r'^(\+9715\d{8}|05\d{8})$'}
- }
-
- VALIDATION_PATTERNS = {
-     'email': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', re.IGNORECASE),
-     'website': re.compile(r'(?:https?://)?(?:www\.)?([A-Za-z0-9-]+\.[A-Za-z]{2,})'),
-     'name': re.compile(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$')
- }
-
- # --------------------------
- # Core Processing Functions
- # --------------------------
-
- def process_phone_number(raw_number: str) -> str:
-     """Validate and standardize phone numbers for supported countries"""
-     cleaned = re.sub(r'[^\d+]', '', raw_number)
-
-     for country, config in COUNTRY_CODES.items():
-         if re.match(config['pattern'], cleaned):
-             if cleaned.startswith('0'):
-                 return f"{config['code']}{cleaned[1:]}"
-             if cleaned.startswith('5'):
-                 return f"{config['code']}{cleaned}"
-             return cleaned
    return None

- def extract_contact_info(text: str) -> dict:
-     """Extract and validate all contact information from text"""
-     contacts = {
-         'phones': set(),
-         'emails': set(),
-         'websites': set()
-     }
-
-     # Phone number extraction
-     for match in re.finditer(r'(\+?\d{10,13}|05\d{8})', text):
-         if processed := process_phone_number(match.group()):
-             contacts['phones'].add(processed)
-
-     # Email validation
-     contacts['emails'].update(
-         email.lower() for email in VALIDATION_PATTERNS['email'].findall(text)
-     )
-
-     # Website normalization
-     for match in VALIDATION_PATTERNS['website'].finditer(text):
-         domain = match.group(1).lower()
-         if '.' in domain:
-             contacts['websites'].add(f"www.{domain.split('/')[0]}")
-
-     return {k: list(v) for k, v in contacts.items() if v}
-
- def process_entities(entities: list, ocr_text: list) -> dict:
-     """Process GLiNER entities with validation and fallbacks"""
-     result = {
-         'name': None,
-         'company': None,
-         'title': None,
-         'address': None
-     }
-
-     # Entity extraction
-     for entity in entities:
-         label = entity['label'].lower()
-         text = entity['text'].strip()
-
-         if label == 'person name' and VALIDATION_PATTERNS['name'].match(text):
-             result['name'] = text.title()
-         elif label == 'company name':
-             result['company'] = text
-         elif label == 'job title':
-             result['title'] = text.title()
-         elif label == 'address':
-             result['address'] = text
-
-     # Name fallback from OCR text
-     if not result['name']:
-         for text in ocr_text:
-             if VALIDATION_PATTERNS['name'].match(text):
-                 result['name'] = text.title()
-                 break
-
-     return result
-
- # --------------------------
- # Main Processing Pipeline
- # --------------------------
-
- def process_business_card(img: Image.Image, confidence: float) -> tuple:
-     """Full processing pipeline for business card images"""
    try:
-         # Initialize OCR
-         ocr_engine = PaddleOCR(lang='en', use_gpu=False)
-
-         # OCR Processing
-         ocr_result = ocr_engine.ocr(np.array(img), cls=True)
-         ocr_text = [line[1][0] for line in ocr_result[0]]
-         full_text = " ".join(ocr_text)
-
-         # Entity Recognition
-         labels = ["person name", "company name", "job title",
-                   "phone number", "email address", "address",
-                   "website"]
-         entities = gliner_model.predict_entities(full_text, labels, threshold=confidence)
-
-         # Data Extraction
-         contacts = extract_contact_info(full_text)
-         entity_data = process_entities(entities, ocr_text)
-         qr_data = zxingcpp.read_barcodes(np.array(img.convert('RGB')))
-
-         # Compile Final Results
-         results = {
-             'Person Name': entity_data['name'],
-             'Company Name': entity_data['company'] or (
-                 contacts['emails'][0].split('@')[1].split('.')[0].title()
-                 if contacts['emails'] else None
-             ),
-             'Job Title': entity_data['title'],
-             'Phone Numbers': contacts['phones'],
-             'Email Addresses': contacts['emails'],
-             'Address': entity_data['address'] or next(
-                 (t for t in ocr_text if any(kw in t.lower()
-                  for kw in {'street', 'ave', 'road'})), None
-             ),
-             'Website': contacts['websites'][0] if contacts['websites'] else None,
-             'QR Code': qr_data[0].text if qr_data else None
-         }
-
-         # Generate CSV Output
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=False, mode='w') as f:
-             pd.DataFrame([results]).to_csv(f)
            csv_path = f.name
-
-         return full_text, results, csv_path, ""
-
-     except Exception as e:
-         logger.error(f"Processing Error: {traceback.format_exc()}")
-         return "", {}, None, f"Error: {str(e)}"
-
- # --------------------------
- # Gradio Interface
- # --------------------------
-
- interface = gr.Interface(
-     fn=process_business_card,
-     inputs=[
-         gr.Image(type='pil', label='Upload Business Card'),
-         gr.Slider(0.1, 1.0, value=0.4, label='Confidence Threshold')
-     ],
-     outputs=[
-         gr.Textbox(label='OCR Result'),
-         gr.JSON(label='Structured Data'),
-         gr.File(label='Download CSV'),
-         gr.Textbox(label='Error Log')
-     ],
-     title='Enterprise Business Card Parser',
-     description='Multi-country support with comprehensive validation'
- )

if __name__ == '__main__':
-     interface.launch()

from paddleocr import PaddleOCR
from gliner import GLiNER
from PIL import Image
import gradio as gr
import numpy as np
+ import cv2
import logging
+ import os
import tempfile
import pandas as pd
import re
import traceback
+ import zxingcpp  # QR decoding

+ # Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+ # Environment setup
+ os.environ['GLINER_HOME'] = './gliner_models'
+
+ # Load GLiNER model
+ try:
+     logger.info("Loading GLiNER model...")
+     gliner_model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")
+ except Exception:
+     logger.exception("Failed to load GLiNER model")
+     raise
+
+ # Regex patterns
+ EMAIL_REGEX = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+ WEBSITE_REGEX = re.compile(r"^(?:https?://)?(?:www\.)?([A-Za-z0-9-]+\.[A-Za-z]{2,})(?:/\S*)?$")
+ # Phone number constants and regex
+ SAUDI_CODE = '+966'
+ UAE_CODE = '+971'
+ PHONE_REGEX = re.compile(r'^(?:\+9665\d{8}|\+9715\d{8}|05\d{8}|5\d{8})$')
+
+ # Utility functions
+ def extract_emails(text: str) -> list[str]:
+     return [e.lower() for e in EMAIL_REGEX.findall(text)]
+
+ def extract_websites(text: str) -> list[str]:
+     return [m.lower() for m in WEBSITE_REGEX.findall(text)]
+
+ def clean_phone_number(phone: str) -> str | None:
+     cleaned = re.sub(r"[^\d+]", "", phone)
+     # International formats
+     if cleaned.startswith(SAUDI_CODE + '5') and len(cleaned) == 13:
+         return cleaned
+     if cleaned.startswith(UAE_CODE + '5') and len(cleaned) == 13:
+         return cleaned
+     # Local to international
+     if cleaned.startswith('05') and len(cleaned) == 10:
+         return f"{UAE_CODE}{cleaned[1:]}"
+     if cleaned.startswith('5') and len(cleaned) == 9:
+         return f"{UAE_CODE}{cleaned}"
+     if cleaned.startswith('9665') and len(cleaned) == 12:
+         return f"+{cleaned}"
    return None

+ def process_phone_numbers(text: str) -> list[str]:
+     found = []
+     for match in re.finditer(r'(?:\+?\d{8,13}|05\d{8})', text):
+         raw = match.group().strip()
+         if (c := clean_phone_number(raw)):
+             found.append(c)
+     return list(set(found))
+
+ def normalize_website(url: str) -> str | None:
+     u = url.lower().replace('www.', '').split('/')[0]
+     if re.match(r"^[a-z0-9-]+\.[a-z]{2,}$", u):
+         return f"www.{u}"
+     return None
+
+ def extract_address(ocr_texts: list[str]) -> str | None:
+     keywords = ["block", "street", "ave", "area", "industrial", "road"]
+     parts = [t for t in ocr_texts if any(kw in t.lower() for kw in keywords)]
+     return " ".join(parts) if parts else None
+
+ # QR scanning
+ def scan_qr_code(image: Image.Image) -> str | None:
    try:
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+             image.save(tmp, format="PNG")
+             path = tmp.name
+         img_cv = cv2.imread(path)
+         # Direct decode
+         try:
+             res = zxingcpp.read_barcodes(img_cv)
+             if res and res[0].text:
+                 return res[0].text.strip()
+         except Exception:
+             logger.warning("Direct ZXing decode failed")
+         # Fallback recolor
+         default_color = (0, 0, 0)
+         tol = 50
+         pix = list(image.convert('RGB').getdata())
+         new_pix = [default_color if all(abs(p[i] - default_color[i]) <= tol for i in range(3)) else (255, 255, 255) for p in pix]
+         img_conv = Image.new('RGB', image.size)
+         img_conv.putdata(new_pix)
+         cv2.imwrite(path + '_conv.png', cv2.cvtColor(np.array(img_conv), cv2.COLOR_RGB2BGR))
+         res = zxingcpp.read_barcodes(cv2.imread(path + '_conv.png'))
+         if res and res[0].text:
+             return res[0].text.strip()
+     except Exception:
+         logger.exception("QR scan error")
+     return None
+
+ # Deduplication
+ def deduplicate_data(results: dict[str, list[str]]) -> None:
+     def clean_list(items, normalizer=lambda x: x):
+         seen = set(); out = []
+         for raw in items:
+             for part in re.split(r'[;,]\s*', raw):
+                 p = part.strip()
+                 if not p: continue
+                 norm = normalizer(p)
+                 if norm and norm not in seen:
+                     seen.add(norm); out.append(norm)
+         return out
+     # Normalize lists
+     results['Email Address'] = clean_list(results['Email Address'], lambda e: e.lower())
+     results['Website'] = clean_list(results['Website'], normalize_website)
+     results['Phone Number'] = clean_list(results['Phone Number'], clean_phone_number)
+     # Others: simple dedupe
+     for key in ['Person Name', 'Company Name', 'Job Title', 'Address', 'QR Code']:
+         seen = set(); out = []
+         for v in results.get(key, []):
+             vv = v.strip()
+             if vv and vv not in seen:
+                 seen.add(vv); out.append(vv)
+         results[key] = out
+
+ # Inference pipeline
+ def inference(img: Image.Image, confidence: float):
+     try:
+         ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False,
+                         det_model_dir='./models/det/en',
+                         cls_model_dir='./models/cls/en',
+                         rec_model_dir='./models/rec/en')
+         arr = np.array(img)
+         raw = ocr.ocr(arr, cls=True)[0]
+         ocr_texts = [ln[1][0] for ln in raw]
+         full_text = ' '.join(ocr_texts)
+
+         labels = ['person name', 'company name', 'job title', 'phone number', 'email address', 'address', 'website']
+         entities = gliner_model.predict_entities(full_text, labels, threshold=confidence, flat_ner=True)
+
+         results = {k: [] for k in ['Person Name', 'Company Name', 'Job Title', 'Phone Number', 'Email Address', 'Address', 'Website', 'QR Code']}
+         # Entity processing
+         for ent in entities:
+             txt, lbl = ent['text'].strip(), ent['label'].lower()
+             if lbl == 'person name': results['Person Name'].append(txt)
+             elif lbl == 'company name': results['Company Name'].append(txt)
+             elif lbl == 'job title': results['Job Title'].append(txt.title())
+             elif lbl == 'phone number':
+                 if (c := clean_phone_number(txt)): results['Phone Number'].append(c)
+             elif lbl == 'email address' and EMAIL_REGEX.fullmatch(txt):
+                 results['Email Address'].append(txt.lower())
+             elif lbl == 'website' and WEBSITE_REGEX.fullmatch(txt):
+                 if (n := normalize_website(txt)): results['Website'].append(n)
+             elif lbl == 'address': results['Address'].append(txt)
+         # Regex fallbacks
+         results['Email Address'] += extract_emails(full_text)
+         results['Website'] += extract_websites(full_text)
+         # Phone regex fallback
+         results['Phone Number'] += process_phone_numbers(full_text)
+         # QR
+         if qr := scan_qr_code(img): results['QR Code'].append(qr)
+         # Address fallback
+         if not results['Address']:
+             if addr := extract_address(ocr_texts): results['Address'].append(addr)
+         # Dedupe
+         deduplicate_data(results)
+         # Company fallback
+         if not results['Company Name']:
+             if results['Email Address']:
+                 dom = results['Email Address'][0].split('@')[-1].split('.')[0]
+                 results['Company Name'].append(dom.title())
+             elif results['Website']:
+                 dom = results['Website'][0].split('.')[1]
+                 results['Company Name'].append(dom.title())
+         # Name fallback
+         if not results['Person Name']:
+             for t in ocr_texts:
+                 if re.match(r'^(?:[A-Z][a-z]+\s?){2,}$', t):
+                     results['Person Name'].append(t); break
+         # CSV
+         csv_map = {k: '; '.join(v) for k, v in results.items() if v}
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=False, mode='w') as f:
+             pd.DataFrame([csv_map]).to_csv(f, index=False)
            csv_path = f.name
+         return full_text, csv_map, csv_path, ''
+     except Exception:
+         err = traceback.format_exc()
+         logger.error(f"Processing failed: {err}")
+         return '', {}, None, f"Error:\n{err}"

+ # Gradio Interface
if __name__ == '__main__':
+     demo = gr.Interface(
+         inference,
+         [gr.Image(type='pil', label='Upload Business Card'),
+          gr.Slider(0.1, 1, 0.4, step=0.1, label='Confidence Threshold')],
+         [gr.Textbox(label="OCR Result"),
+          gr.JSON(label="Structured Data"),
+          gr.File(label="Download CSV"),
+          gr.Textbox(label="Error Log")],
+         title='Enhanced Business Card Parser',
+         description='Accurate entity extraction with combined AI and regex validation (with Saudi/UAE support)',
+         css=".gr-interface {max-width: 800px !important;}"
+     )
+     demo.launch()
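
For reference, a minimal sketch of how the new helper functions behave on their own, assuming `app.py` is importable from the Space's working directory (importing it runs the module-level GLiNER load); the phone numbers, domain, and address lines below are made-up illustrations, not data from the Space:

```python
# Sketch: exercising the pure text helpers from the new app.py.
# Importing app also loads the GLiNER model declared at module level,
# which is a sizeable one-off download.
from app import clean_phone_number, normalize_website, extract_emails, extract_address

# Phone normalization: local 05XXXXXXXX numbers gain an international prefix,
# already-international Saudi/UAE numbers pass through, everything else is rejected.
print(clean_phone_number("+966 51 234 5678"))   # -> "+966512345678"
print(clean_phone_number("0501234567"))         # -> "+971501234567"
print(clean_phone_number("12345"))              # -> None

# Website normalization strips "www." and any path, then re-adds the "www." prefix.
print(normalize_website("WWW.Example.COM/contact"))   # -> "www.example.com"

# Email extraction lower-cases every regex hit found in the text.
print(extract_emails("Sales: John.Doe@Example.com"))  # -> ["john.doe@example.com"]

# The address fallback keeps OCR lines that contain street/area keywords.
print(extract_address(["Acme Trading LLC", "Office 12, Industrial Area 3", "Dubai"]))
# -> "Office 12, Industrial Area 3"
```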
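
Because the Gradio app only wraps `inference`, the same pipeline can also be driven programmatically; a sketch under the assumption that a local test image exists (the `card.png` path is a placeholder) and that the PaddleOCR model directories referenced in `inference` are present or downloadable on first use:

```python
# Sketch: calling the pipeline without the Gradio UI.
# "card.png" is a placeholder path for a business-card image.
from PIL import Image
from app import inference

full_text, data, csv_path, error = inference(Image.open("card.png"), confidence=0.4)
if error:
    print(error)       # traceback text returned by the pipeline on failure
else:
    print(full_text)   # raw OCR text joined into one string
    print(data)        # dict of '; '-joined fields (Person Name, Phone Number, ...)
    print(csv_path)    # one-row CSV written to a temporary file
```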