# crawler-seo/send_report.py

import sqlite3
import json
import glob
import os
import html
import requests
from urllib.parse import urlparse

def load_config():
    """Read ``config.json`` from the current working directory.

    Returns:
        A dict with the file's top-level entries, where surrounding
        whitespace has been stripped from every key. If the file cannot be
        opened, parsed, or processed, the error is printed and an empty dict
        is returned instead.
    """
    try:
        with open("config.json", "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
        # Keys are normalised so that stray spaces in the JSON do not break lookups.
        cleaned = {}
        for key, value in loaded.items():
            cleaned[key.strip()] = value
        return cleaned
    except Exception as exc:
        print(f"[!] Błąd wczytywania config.json: {exc}")
        return {}

def send_telegram(token, chat_id, message):
    """Post *message* to a Telegram chat through the Bot API ``sendMessage`` call.

    The message is sent with ``parse_mode`` set to ``HTML`` and a 15 second
    timeout. Failures are reported on stdout rather than raised.

    Args:
        token: Bot token; it becomes part of the request URL.
        chat_id: Target chat identifier.
        message: Message text (HTML markup).

    Returns:
        True when the API answered with HTTP 200, False otherwise.
    """
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": message, "parse_mode": "HTML"}
    try:
        r = requests.post(url, json=payload, timeout=15)
    except Exception as e:
        # The request URL contains the bot token and connection errors often
        # quote that URL, so the token is masked before the error is printed.
        detail = str(e)
        if token:
            detail = detail.replace(str(token), "***")
        print(f"[!] Błąd połączenia z Telegramem: {detail}")
        return False

    if r.status_code != 200:
        print(f"[!] Błąd wysyłania (HTTP {r.status_code}): {r.text}")
        return False

    print("[*] Wiadomość wysłana pomyślnie.")
    return True

# Maximum text length accepted by Telegram's sendMessage.
_TELEGRAM_MAX_LEN = 4096

# Domain shown in the report header when the scan has no usable URL.
_DEFAULT_DOMAIN = "fluo.dog"


def _find_latest_db(pattern="scans/crawler_v*.db"):
    """Return the most recently modified file matching *pattern*, or None."""
    candidates = glob.glob(pattern)
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)


def _collect_issues(db_path):
    """Read the report data from the crawl database at *db_path*.

    Returns:
        A tuple ``(domain, critical_errors, schema_errors, translation_issues)``:
        the raw (unescaped) host of the first stored URL, a list of
        ``(url, error_label)`` pairs (at most 15), a list of ``(url, count)``
        pairs (at most 10) and a list of ready-made translation messages
        (at most 10).

    Raises:
        sqlite3.Error: when the database or the ``pages`` table cannot be read.
    """
    critical_errors = []
    schema_errors = []
    translation_issues = []
    domain = _DEFAULT_DOMAIN

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # The domain is taken from the first stored page URL.
        cursor.execute("SELECT url FROM pages LIMIT 1")
        row = cursor.fetchone()
        if row and row[0]:
            # A URL without a host keeps the default instead of an empty header.
            domain = urlparse(row[0]).netloc or _DEFAULT_DOMAIN

        # Critical errors: 404, 5xx and pages reported as blocked.
        cursor.execute("SELECT url, status, google_access FROM pages WHERE status = 404 OR status >= 500 OR google_access = 'Blocked' LIMIT 15")
        for url, status, access in cursor.fetchall():
            if access == 'Blocked':
                err_type = "ROBOTS.TXT BLOCK"
            elif status == 404:
                err_type = "404"
            else:
                err_type = f"ERR {status}"
            critical_errors.append((url, err_type))

        # Pages with a non-zero schema_critical counter.
        cursor.execute("SELECT url, schema_critical FROM pages WHERE schema_critical > 0 LIMIT 10")
        schema_errors.extend(cursor.fetchall())

        # Translation audit rows; this query is best-effort and its failure
        # must not abort the rest of the report.
        try:
            cursor.execute("SELECT sku, lang1, lang2, field FROM translation_audit LIMIT 10")
            for sku, lang1, lang2, field in cursor.fetchall():
                translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}")
        except sqlite3.OperationalError as e:
            # Typically the table does not exist yet; say so instead of hiding it.
            print(f"[*] Pominięto audyt tłumaczeń (brak tabeli translation_audit?): {e}")
    finally:
        # Close the connection on the error path as well.
        conn.close()

    return domain, critical_errors, schema_errors, translation_issues


def _build_report(domain, critical_errors, schema_errors, translation_issues):
    """Render the report as two HTML fragments: ``(summary_html, prompt_html)``.

    ``summary_html`` is the header plus the bulleted issue lists;
    ``prompt_html`` is the ready-to-copy AI prompt wrapped in ``<pre><code>``.
    Concatenating them gives the complete single-message report. All
    database-derived text is HTML-escaped.
    """
    summary_parts = [f"🚨 <b>BŁĘDY: {html.escape(domain)} (Wysłane ręcznie)</b>\n\n"]
    prompt_parts = [
        "Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. "
        "Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. "
        "WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. "
        "Crawler nie sprawdzał bazy danych – może być tak, że w bazie dane są w pełni poprawne, "
        "a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n"
        "Oto zestawienie błędów do przeanalizowania:\n\n"
    ]

    # Each section: (HTML heading, prompt heading, plain-text lines).
    sections = (
        (
            "❌ <b>KRYTYCZNE:</b>\n",
            "BŁĘDY KRYTYCZNE:\n",
            [f"{err}: {url}" for url, err in critical_errors],
        ),
        (
            "🛠 <b>SCHEMA.ORG:</b>\n",
            "BŁĘDY SCHEMA.ORG:\n",
            [f"Brak {count} pól: {url}" for url, count in schema_errors],
        ),
        (
            "🌐 <b>TŁUMACZENIA:</b>\n",
            "BŁĘDY TŁUMACZEŃ:\n",
            list(translation_issues),
        ),
    )
    for html_heading, prompt_heading, lines in sections:
        if not lines:
            continue
        summary_parts.append(html_heading)
        prompt_parts.append(prompt_heading)
        for line in lines:
            # The whole line is escaped, so status/count values are covered too.
            summary_parts.append(f"• {html.escape(line)}\n")
            prompt_parts.append(f"- {line}\n")
        summary_parts.append("\n")
        prompt_parts.append("\n")

    prompt_text = "".join(prompt_parts)
    prompt_html = (
        "🤖 <b>Gotowy prompt dla Agenta AI:</b>\n"
        f"<pre><code class=\"language-text\">{html.escape(prompt_text.strip())}</code></pre>"
    )
    return "".join(summary_parts), prompt_html


def main():
    """Send the issues found in the newest crawl database to Telegram.

    Steps: load the Telegram settings from the configuration, pick the most
    recently modified ``scans/crawler_v*.db`` file, collect critical, schema
    and translation issues from it, and post an HTML report that ends with a
    ready-made prompt for an AI agent. Every failure is printed and ends the
    run early; nothing is raised for missing configuration, a missing
    database or a database read error.
    """
    config = load_config()
    token = config.get("telegram_token")
    # The info channel serves as a fallback when no errors channel is configured.
    chat_id_errors = config.get("telegram_chat_id_errors") or config.get("telegram_chat_id_info")

    if not token or not chat_id_errors:
        print("[!] Brak poprawnej konfiguracji Telegram (token lub chat_id_errors w config.json).")
        return

    latest_db = _find_latest_db()
    if latest_db is None:
        print("[!] Nie znaleziono żadnej bazy danych crawler_v*.db w podfolderze scans.")
        return
    print(f"[*] Odczytuję dane z bazy: {latest_db}")

    try:
        domain, critical_errors, schema_errors, translation_issues = _collect_issues(latest_db)
    except Exception as e:
        print(f"[!] Błąd odczytu bazy danych: {e}")
        return

    if not (critical_errors or schema_errors or translation_issues):
        print("[*] Brak błędów do zaraportowania w ostatnim skanie.")
        return

    summary_html, prompt_html = _build_report(
        domain, critical_errors, schema_errors, translation_issues
    )

    print("[*] Wysyłanie raportu na kanał Errors...")
    full_message = summary_html + prompt_html
    # The markup length is used as an approximation of what Telegram counts.
    if len(full_message) <= _TELEGRAM_MAX_LEN:
        send_telegram(token, chat_id_errors, full_message)
        return

    # The issue list appears twice (summary + prompt), so a long scan can go
    # over the limit and be rejected as a whole; two messages roughly halve it.
    print("[*] Raport przekracza limit długości wiadomości Telegrama – wysyłam w dwóch częściach.")
    send_telegram(token, chat_id_errors, summary_html.rstrip())
    send_telegram(token, chat_id_errors, prompt_html)

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()