From 73b53d66b166f06715edfbe920754e53a234cfeb Mon Sep 17 00:00:00 2001
From: root <root@JBOOK25.localdomain>
Date: Thu, 2 Apr 2026 11:41:05 +0200
Subject: [PATCH] Initial commit: OCR receipt extractor

---
 .gitignore         |   6 ++
 Dockerfile         |  23 +++++
 docker-compose.yml |  15 +++
 extract_receipt.py | 232 +++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt   |   4 +
 5 files changed, 280 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml
 create mode 100644 extract_receipt.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8ab7f39
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.pyc
+.env
+*.jpg
+*.jpeg
+*.png
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..0f8f894
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.12-slim
+
+# Install tesseract-ocr with Italian and English language data, plus libgl1 and curl (curl is used by the compose healthcheck)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        tesseract-ocr \
+        tesseract-ocr-ita \
+        tesseract-ocr-eng \
+        libgl1 \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY extract_receipt.py .
+
+EXPOSE 5000
+
+# Start with gunicorn in production (serves the module-level `app` of extract_receipt.py)
+CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "120", "extract_receipt:app"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..b25e10a
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,15 @@
+services:
+  ocr-extractor:
+    build: .
+    container_name: ocr-receipt-extractor
+    ports:
+      - "5000:5000"  # same port gunicorn binds to in the Dockerfile CMD
+    restart: unless-stopped
+    environment:
+      - PYTHONUNBUFFERED=1
+    healthcheck:  # polls the app's GET /health route; curl is installed by the Dockerfile
+      test: ["CMD", "curl", "-f", "http://localhost:5000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
diff --git a/extract_receipt.py b/extract_receipt.py
new file mode 100644
index 0000000..6efef08
--- /dev/null
+++ b/extract_receipt.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+"""
+Extract the amount from photos of receipts/invoices.
+Designed for integration with n8n (Execute Command node or HTTP endpoint).
+
+CLI usage:
+    python3 extract_receipt.py /path/to/receipt.jpg
+
+HTTP usage (for the n8n HTTP Request node):
+    python3 extract_receipt.py --serve --port 5000
+    POST /extract  multipart form-data ("file" field) or JSON with a base64 "image" field
+"""
+
+import argparse
+import base64
+import json
+import re
+import sys
+import os
+import tempfile
+from pathlib import Path
+
+try:
+    from PIL import Image, ImageEnhance, ImageFilter
+except ImportError:
+    sys.exit("Errore: installa Pillow → pip install Pillow")
+
+try:
+    import pytesseract
+except ImportError:
+    sys.exit("Errore: installa pytesseract → pip install pytesseract (e tesseract-ocr sul sistema)")
+
+
+# ---------- Regex patterns for amounts on Italian receipts (earlier entries are preferred) ----------
+AMOUNT_PATTERNS = [
+    # IMPORTO EUR 90,93  /  IMPORTO: € 12,50
+    r'IMPORTO\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # TOTALE EUR 90,93  /  TOTALE: 12,50  /  TOTALE EURO 12.50  (\b keeps SUBTOTALE out)
+    r'\bTOTALE\s*(?:COMPLESSIVO|GENERALE|EURO|EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # TOT. 12,50  /  TOT 12.50
+    r'\bTOT\.?\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # TOTAL 12,50 (EN)
+    r'\bTOTAL\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # AMOUNT / AMOUNT EUR
+    r'\bAMOUNT\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # €90,93  /  EUR 90,93  (standalone, last line)
+    r'(?:EUR|€)\s*(\d{1,6}[.,]\d{2})',
+    # DOVUTO 12,50
+    r'DOVUTO\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # PAGAMENTO 12,50
+    r'PAGAMENTO\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+]
+
+
+def preprocess_image(image: Image.Image) -> Image.Image:
+    """Prepare an image for OCR: grayscale, 2x contrast, 2x sharpness, 3px median filter."""
+    gray = image.convert("L")
+    contrasted = ImageEnhance.Contrast(gray).enhance(2.0)
+    sharpened = ImageEnhance.Sharpness(contrasted).enhance(2.0)
+    # The median filter runs last, on the already contrast/sharpness-enhanced image.
+    return sharpened.filter(ImageFilter.MedianFilter(size=3))
+
+
+def ocr_image(image_path: str) -> str:
+    """Run OCR on the image at ``image_path`` and return the extracted text."""
+    # Image.open is lazy and keeps the file open; the context manager closes it
+    # once preprocessing has produced an independent, fully loaded copy.
+    with Image.open(image_path) as img:
+        processed = preprocess_image(img)
+    custom_config = r'--oem 3 --psm 6 -l ita+eng'  # Tesseract: Italian + English
+    return pytesseract.image_to_string(processed, config=custom_config)
+
+
+def parse_amount(text: str) -> dict:
+    """Find the receipt amount in OCR text using the regexes in ``AMOUNT_PATTERNS``.
+
+    Matching is done on an upper-cased copy of ``text``; every hit of every
+    pattern is collected, then a single best hit is chosen.
+
+    Returns a dict with the keys ``success``, ``amount``, ``currency``,
+    ``raw_match``, ``confidence`` ("high", "medium" or "none"),
+    ``all_matches`` and ``ocr_text`` (the original, non-upper-cased text).
+    """
+    haystack = text.upper()
+    candidates = [
+        {
+            "raw": hit.group(1),
+            # Decimal comma -> dot so that float() can parse the value
+            "amount": float(hit.group(1).replace(",", ".")),
+            "currency": "EUR",
+            "pattern_index": idx,
+            "position": hit.start(),
+        }
+        for idx, regex in enumerate(AMOUNT_PATTERNS)
+        for hit in re.finditer(regex, haystack)
+    ]
+
+    found = bool(candidates)
+    winner = None
+    level = "none"
+    if found:
+        # Lowest pattern index first; within one pattern, the match furthest into the text
+        winner = min(candidates, key=lambda c: (c["pattern_index"], -c["position"]))
+        # Only the first three patterns (indexes 0-2) are reported as "high"
+        level = "high" if winner["pattern_index"] <= 2 else "medium"
+
+    return {
+        "success": found,
+        "amount": winner["amount"] if found else None,
+        "currency": winner["currency"] if found else None,
+        "raw_match": winner["raw"] if found else None,
+        "confidence": level,
+        "all_matches": [{"amount": c["amount"], "raw": c["raw"]} for c in candidates],
+        "ocr_text": text,
+    }
+
+
+def extract_receipt(image_path: str) -> dict:
+    """Full pipeline: OCR + amount parsing; a missing file or an OCR failure yields an error dict."""
+    if not os.path.isfile(image_path):
+        return {"success": False, "error": f"File non trovato: {image_path}"}
+    try:
+        text = ocr_image(image_path)
+    except (OSError, RuntimeError) as exc:  # PIL decode errors are OSError; TesseractError is RuntimeError
+        return {"success": False, "error": f"OCR fallito: {exc}"}
+    return {**parse_amount(text), "file": os.path.basename(image_path)}
+
+
+# ---------- HTTP mode (Flask) for n8n ----------
+def create_app():
+    """Flask app factory, used both by gunicorn and by ``run_server``.
+
+    Routes: ``GET /health`` and ``POST /extract``, which takes JSON with a base64
+    image (key ``image``, ``file`` or ``data``) or multipart form-data (``file``).
+    """
+    from flask import Flask, request, jsonify
+    from werkzeug.utils import secure_filename
+
+    app = Flask(__name__)
+
+    ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "bmp", "tiff", "webp"}
+
+    def allowed_file(filename):
+        return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
+
+    def respond_via_tempfile(suffix, write):
+        """Store the upload with ``write(tmp)``, run the pipeline, always delete the file."""
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+            write(tmp)
+            tmp_path = tmp.name
+        try:
+            return jsonify(extract_receipt(tmp_path))
+        finally:
+            os.unlink(tmp_path)
+
+    @app.route("/extract", methods=["POST"])
+    def extract():
+        # --- Mode 1: JSON with base64 (ideal for n8n + Telegram) ---
+        if request.is_json:
+            data = request.get_json()
+            # Non-object bodies / non-string values get a 400 here, not an AttributeError/TypeError later
+            fields = data if isinstance(data, dict) else {}
+            image_b64 = fields.get("image") or fields.get("file") or fields.get("data")
+            if not image_b64 or not isinstance(image_b64, str):
+                return jsonify({"success": False, "error": "Campo 'image' (base64) mancante nel JSON"}), 400
+
+            # Drop an optional "data:image/...;base64," header
+            if "," in image_b64:
+                image_b64 = image_b64.split(",", 1)[1]
+
+            try:
+                img_bytes = base64.b64decode(image_b64)
+            except ValueError:  # binascii.Error (bad padding) is a ValueError subclass
+                return jsonify({"success": False, "error": "Base64 non valido"}), 400
+
+            return respond_via_tempfile(".jpg", lambda tmp: tmp.write(img_bytes))
+
+        # --- Mode 2: multipart form-data (file upload) ---
+        if "file" not in request.files:
+            return jsonify({"success": False, "error": "Nessun file inviato (campo 'file') e nessun JSON con base64"}), 400
+
+        file = request.files["file"]
+        if file.filename == "":
+            return jsonify({"success": False, "error": "Filename vuoto"}), 400
+
+        if not allowed_file(file.filename):
+            return jsonify({"success": False, "error": f"Formato non supportato. Usa: {sorted(ALLOWED_EXTENSIONS)}"}), 400
+
+        suffix = Path(secure_filename(file.filename)).suffix
+        return respond_via_tempfile(suffix, file.save)  # FileStorage.save accepts an open file object
+
+    @app.route("/health", methods=["GET"])
+    def health():
+        return jsonify({"status": "ok"})
+
+    return app
+
+
+def run_server(host: str, port: int):
+    """Build the app, print a short startup banner, then call its ``run`` on host/port."""
+    server = create_app()
+    print(f"Server avviato su http://{host}:{port}")
+    print("  POST /extract  (multipart form-data o JSON base64)", "  GET  /health", sep="\n")
+    server.run(host=host, port=port)
+
+
+# WSGI app for gunicorn (`gunicorn extract_receipt:app`); built at import time, so importing this module needs Flask
+app = create_app()
+
+
+# ---------- Main ----------
+def main():
+    """CLI entry point: serve over HTTP with --serve, otherwise extract from one image path."""
+    parser = argparse.ArgumentParser(description="Estrai importo da scontrino/ricevuta")
+    parser.add_argument("image", nargs="?", help="Percorso immagine dello scontrino")
+    parser.add_argument("--serve", action="store_true", help="Avvia server HTTP (per n8n)")
+    parser.add_argument("--host", default="0.0.0.0", help="Host server (default: 0.0.0.0)")
+    parser.add_argument("--port", type=int, default=5000, help="Porta server (default: 5000)")
+    opts = parser.parse_args()
+
+    if opts.serve:
+        return run_server(opts.host, opts.port)
+    if not opts.image:
+        parser.print_help()
+        sys.exit(1)
+    outcome = extract_receipt(opts.image)
+    print(json.dumps(outcome, ensure_ascii=False, indent=2))
+    sys.exit(0 if outcome.get("success") else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e25524b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pytesseract==0.3.13
+Pillow==11.1.0
+flask==3.1.0
+gunicorn==23.0.0