From 73b53d66b166f06715edfbe920754e53a234cfeb Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 2 Apr 2026 11:41:05 +0200
Subject: [PATCH] Initial commit: OCR receipt extractor

---
 .gitignore         |   6 ++
 Dockerfile         |  23 +++++
 docker-compose.yml |  15 +++
 extract_receipt.py | 250 +++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt   |   4 +
 5 files changed, 298 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml
 create mode 100644 extract_receipt.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8ab7f39
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.pyc
+.env
+*.jpg
+*.jpeg
+*.png
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..0f8f894
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.12-slim
+
+# Install tesseract-ocr with Italian and English language data
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        tesseract-ocr \
+        tesseract-ocr-ita \
+        tesseract-ocr-eng \
+        libgl1 \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY extract_receipt.py .
+
+EXPOSE 5000
+
+# Run with gunicorn in production
+CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "120", "extract_receipt:app"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..b25e10a
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,15 @@
+services:
+  ocr-extractor:
+    build: .
+    container_name: ocr-receipt-extractor
+    ports:
+      - "5000:5000"
+    restart: unless-stopped
+    environment:
+      - PYTHONUNBUFFERED=1
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:5000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
diff --git a/extract_receipt.py b/extract_receipt.py
new file mode 100644
index 0000000..6efef08
--- /dev/null
+++ b/extract_receipt.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+Extract the total amount from photos of receipts.
+
+Designed for integration with n8n (Execute Command node or HTTP endpoint).
+
+CLI usage:
+    python3 extract_receipt.py /path/to/receipt.jpg
+
+HTTP usage (for the n8n HTTP Request node):
+    python3 extract_receipt.py --serve --port 5000
+    POST /extract with multipart form-data (field "file") or JSON base64
+"""
+
+import argparse
+import base64
+import json
+import re
+import sys
+import os
+import tempfile
+from pathlib import Path
+
+try:
+    from PIL import Image, ImageEnhance, ImageFilter, UnidentifiedImageError
+except ImportError:
+    sys.exit("Errore: installa Pillow → pip install Pillow")
+
+try:
+    import pytesseract
+except ImportError:
+    sys.exit("Errore: installa pytesseract → pip install pytesseract (e tesseract-ocr sul sistema)")
+
+
+# ---------- Regex patterns for amounts on Italian receipts ----------
+# Order matters: parse_amount() prefers the match from the lowest index.
+AMOUNT_PATTERNS = [
+    # IMPORTO EUR 90,93 / IMPORTO: € 12,50
+    r'IMPORTO\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # TOTALE EUR 90,93 / TOTALE: 12,50 / TOTALE EURO 12.50
+    r'TOTALE\s*(?:COMPLESSIVO|GENERALE|EURO|EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # TOT. 12,50 / TOT 12.50
+    r'\bTOT\.?\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # TOTAL 12,50 (EN)
+    r'\bTOTAL\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # AMOUNT / AMOUNT EUR
+    r'\bAMOUNT\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # €90,93 / EUR 90,93 (standalone, last line)
+    r'(?:EUR|€)\s*(\d{1,6}[.,]\d{2})',
+    # DOVUTO 12,50
+    r'DOVUTO\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+    # PAGAMENTO 12,50
+    r'PAGAMENTO\s*(?:EUR|€)?\s*[:.]?\s*(\d{1,6}[.,]\d{2})',
+]
+
+
+def preprocess_image(image: Image.Image) -> Image.Image:
+    """Prepare an image for OCR: grayscale, contrast, sharpness, median filter."""
+    img = image.convert("L")
+    img = ImageEnhance.Contrast(img).enhance(2.0)
+    img = ImageEnhance.Sharpness(img).enhance(2.0)
+    img = img.filter(ImageFilter.MedianFilter(size=3))
+    return img
+
+
+def ocr_image(image_path: str) -> str:
+    """Run OCR on the image at image_path and return the extracted text."""
+    # Tesseract with Italian + English language data
+    custom_config = r'--oem 3 --psm 6 -l ita+eng'
+    # The context manager closes the file handle Pillow opened for the image.
+    with Image.open(image_path) as img:
+        processed = preprocess_image(img)
+    return pytesseract.image_to_string(processed, config=custom_config)
+
+
+def parse_amount(text: str) -> dict:
+    """Search the OCR text for an amount using AMOUNT_PATTERNS.
+
+    Returns a dict with "success", "amount", "currency", "raw_match",
+    "confidence", "all_matches" and "ocr_text"; when nothing matches,
+    "success" is False and the amount fields are None.
+    """
+    text_upper = text.upper()
+    matches = []
+
+    for i, pattern in enumerate(AMOUNT_PATTERNS):
+        for m in re.finditer(pattern, text_upper):
+            raw = m.group(1)
+            # Normalize the decimal separator: comma -> dot
+            value = float(raw.replace(",", "."))
+            matches.append({
+                "raw": raw,
+                "amount": value,
+                "currency": "EUR",
+                "pattern_index": i,
+                "position": m.start(),
+            })
+
+    if not matches:
+        return {
+            "success": False,
+            "amount": None,
+            "currency": None,
+            "raw_match": None,
+            "confidence": "none",
+            "all_matches": [],
+            "ocr_text": text,
+        }
+
+    # Priority: lowest pattern index (most specific); ties go to the match
+    # that appears latest in the text.
+    best = min(matches, key=lambda m: (m["pattern_index"], -m["position"]))
+    confidence = "high" if best["pattern_index"] <= 2 else "medium"
+
+    return {
+        "success": True,
+        "amount": best["amount"],
+        "currency": best["currency"],
+        "raw_match": best["raw"],
+        "confidence": confidence,
+        "all_matches": [{"amount": m["amount"], "raw": m["raw"]} for m in matches],
+        "ocr_text": text,
+    }
+
+
+def extract_receipt(image_path: str) -> dict:
+    """Full pipeline: OCR the image, then parse the amount from its text."""
+    if not os.path.isfile(image_path):
+        return {"success": False, "error": f"File non trovato: {image_path}"}
+
+    text = ocr_image(image_path)
+    result = parse_amount(text)
+    result["file"] = os.path.basename(image_path)
+    return result
+
+
+# ---------- HTTP mode (Flask) for n8n ----------
+def create_app():
+    """Flask app factory, used both by gunicorn and by run_server."""
+    from flask import Flask, request, jsonify
+    from werkzeug.utils import secure_filename
+
+    app = Flask(__name__)
+
+    ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "bmp", "tiff", "webp"}
+
+    def allowed_file(filename):
+        return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
+
+    def error(message, status=400):
+        """Build the JSON error response shared by every failure path."""
+        return jsonify({"success": False, "error": message}), status
+
+    def run_ocr(tmp_path):
+        """Run the pipeline on a temporary file and always delete it."""
+        try:
+            return jsonify(extract_receipt(tmp_path))
+        except UnidentifiedImageError:
+            # The payload is not an image Pillow can decode: client error.
+            return error("Immagine non valida o formato non riconosciuto")
+        finally:
+            os.unlink(tmp_path)
+
+    @app.route("/extract", methods=["POST"])
+    def extract():
+        # --- Mode 1: JSON with base64 (ideal for n8n + Telegram) ---
+        if request.is_json:
+            # silent=True: malformed JSON gives None, handled as a JSON error
+            data = request.get_json(silent=True)
+            if not isinstance(data, dict):
+                return error("Corpo JSON non valido: atteso un oggetto")
+
+            image_b64 = data.get("image") or data.get("file") or data.get("data")
+            if not image_b64:
+                return error("Campo 'image' (base64) mancante nel JSON")
+            if not isinstance(image_b64, str):
+                return error("Base64 non valido")
+
+            # Strip an optional "data:image/...;base64," header
+            if "," in image_b64:
+                image_b64 = image_b64.split(",", 1)[1]
+
+            try:
+                img_bytes = base64.b64decode(image_b64)
+            except ValueError:  # binascii.Error is a ValueError subclass
+                return error("Base64 non valido")
+
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+                tmp.write(img_bytes)
+            return run_ocr(tmp.name)
+
+        # --- Mode 2: multipart form-data (file upload) ---
+        if "file" not in request.files:
+            return error("Nessun file inviato (campo 'file') e nessun JSON con base64")
+
+        file = request.files["file"]
+        if file.filename == "":
+            return error("Filename vuoto")
+
+        if not allowed_file(file.filename):
+            return error(f"Formato non supportato. Usa: {ALLOWED_EXTENSIONS}")
+
+        filename = secure_filename(file.filename)
+        with tempfile.NamedTemporaryFile(suffix=Path(filename).suffix, delete=False) as tmp:
+            file.save(tmp)
+        return run_ocr(tmp.name)
+
+    @app.route("/health", methods=["GET"])
+    def health():
+        return jsonify({"status": "ok"})
+
+    return app
+
+
+def run_server(host: str, port: int):
+    """Start the Flask development server on host:port."""
+    app = create_app()
+    print(f"Server avviato su http://{host}:{port}")
+    print("  POST /extract  (multipart form-data o JSON base64)")
+    print("  GET  /health")
+    app.run(host=host, port=port)
+
+
+# WSGI app for gunicorn: gunicorn extract_receipt:app
+app = create_app()
+
+
+# ---------- Main ----------
+def main():
+    """CLI entry point: serve over HTTP or extract from a single image."""
+    parser = argparse.ArgumentParser(description="Estrai importo da scontrino/ricevuta")
+    parser.add_argument("image", nargs="?", help="Percorso immagine dello scontrino")
+    parser.add_argument("--serve", action="store_true", help="Avvia server HTTP (per n8n)")
+    parser.add_argument("--host", default="0.0.0.0", help="Host server (default: 0.0.0.0)")
+    parser.add_argument("--port", type=int, default=5000, help="Porta server (default: 5000)")
+    args = parser.parse_args()
+
+    if args.serve:
+        run_server(args.host, args.port)
+    elif args.image:
+        result = extract_receipt(args.image)
+        print(json.dumps(result, ensure_ascii=False, indent=2))
+        sys.exit(0 if result.get("success") else 1)
+    else:
+        parser.print_help()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e25524b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pytesseract==0.3.13
+Pillow==11.1.0
+flask==3.1.0
+gunicorn==23.0.0