Initial commit - Crawler SEO (with AI Agent prompt)

2026-05-09 11:10:06 +02:00
commit 8411593c55
5 changed files with 1193 additions and 0 deletions
@@ -0,0 +1,144 @@
+import sqlite3
+import json
+import glob
+import os
+import html
+import requests
+from urllib.parse import urlparse
+
+def load_config(path="config.json"):
+    """Load the JSON configuration file and return it as a dict.
+
+    Surrounding whitespace is stripped from every top-level key, so a key
+    written as ``" telegram_token "`` is still found by ``config.get``.
+
+    Args:
+        path: Configuration file to read. Defaults to ``config.json`` in the
+            current working directory.
+
+    Returns:
+        dict: The configuration, or an empty dict when the file cannot be
+        read, is not valid JSON, or does not hold a JSON object at the top
+        level. The reason is printed to stdout.
+    """
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            raw_config = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        print(f"[!] Błąd wczytywania {path}: {e}")
+        return {}
+
+    # A top-level list/string/number has no .items(); report that explicitly
+    # instead of letting it surface as an AttributeError.
+    if not isinstance(raw_config, dict):
+        found = type(raw_config).__name__
+        print(f"[!] Błąd wczytywania {path}: oczekiwano obiektu JSON, otrzymano {found}")
+        return {}
+
+    return {k.strip(): v for k, v in raw_config.items()}
+
+def send_telegram(token, chat_id, message):
+    """Send an HTML-formatted message to a Telegram chat via the Bot API.
+
+    Args:
+        token: Bot token; it becomes part of the request URL.
+        chat_id: Identifier of the target chat.
+        message: Message text, sent with ``parse_mode`` set to ``HTML``.
+
+    Returns:
+        bool: True when the API answered with HTTP 200, False when the
+        request failed or any other status was returned. Both outcomes are
+        reported on stdout.
+    """
+    url = f"https://api.telegram.org/bot{token}/sendMessage"
+    payload = {"chat_id": chat_id, "text": message, "parse_mode": "HTML"}
+    try:
+        r = requests.post(url, json=payload, timeout=15)
+    except requests.RequestException as e:
+        # The exception text may include the request URL, which contains the
+        # bot token; redact it so the secret does not end up in logs.
+        details = str(e)
+        if token:
+            details = details.replace(str(token), "<token>")
+        print(f"[!] Błąd połączenia z Telegramem: {details}")
+        return False
+
+    if r.status_code != 200:
+        print(f"[!] Błąd wysyłania (HTTP {r.status_code}): {r.text}")
+        return False
+
+    print("[*] Wiadomość wysłana pomyślnie.")
+    return True
+
+def _collect_issues(db_path, default_domain):
+    """Read the crawl findings from the SQLite database at *db_path*.
+
+    Args:
+        db_path: Path of the crawl database to query.
+        default_domain: Domain to report when the ``pages`` table yields no
+            usable URL.
+
+    Returns:
+        tuple: ``(domain, critical_errors, schema_errors,
+        translation_issues)`` where ``domain`` is already HTML-escaped,
+        ``critical_errors`` is a list of ``(url, label)`` pairs,
+        ``schema_errors`` a list of ``(url, count)`` pairs and
+        ``translation_issues`` a list of ready-to-print strings.
+
+    Raises:
+        Whatever ``sqlite3`` or URL parsing raises; the connection is closed
+        before the exception propagates.
+    """
+    critical_errors = []
+    schema_errors = []
+    translation_issues = []
+    domain = default_domain
+
+    conn = sqlite3.connect(db_path)
+    try:
+        cursor = conn.cursor()
+
+        # The domain shown in the report header is taken from the first row.
+        cursor.execute("SELECT url FROM pages LIMIT 1")
+        row = cursor.fetchone()
+        if row and row[0]:
+            domain = html.escape(urlparse(row[0]).netloc)
+
+        # Critical errors: 404, 5xx and pages reported as blocked.
+        cursor.execute("SELECT url, status, google_access FROM pages WHERE status = 404 OR status >= 500 OR google_access = 'Blocked' LIMIT 15")
+        for url, status, access in cursor.fetchall():
+            if access == 'Blocked':
+                err_type = "ROBOTS.TXT BLOCK"
+            elif status == 404:
+                err_type = "404"
+            else:
+                err_type = f"ERR {status}"
+            critical_errors.append((url, err_type))
+
+        # Pages with a non-zero schema_critical counter.
+        cursor.execute("SELECT url, schema_critical FROM pages WHERE schema_critical > 0 LIMIT 10")
+        schema_errors.extend(cursor.fetchall())
+
+        # Translation findings; this table is optional.
+        try:
+            cursor.execute("SELECT sku, lang1, lang2, field FROM translation_audit LIMIT 10")
+            for sku, lang1, lang2, field in cursor.fetchall():
+                translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}")
+        except sqlite3.OperationalError:
+            # Deliberate best effort: the table may not exist yet.
+            pass
+    finally:
+        # Close the connection even when one of the queries fails.
+        conn.close()
+
+    return domain, critical_errors, schema_errors, translation_issues
+
+
+def _build_report(domain, critical_errors, schema_errors, translation_issues):
+    """Compose the HTML Telegram message, including the ready-made AI prompt.
+
+    URLs and issue texts are HTML-escaped in the visible summary. The prompt
+    is assembled from the raw values and escaped once, as a whole, before it
+    is placed inside the ``<pre><code>`` element.
+
+    Returns:
+        str: The complete message text.
+    """
+    message = [f"🚨 <b>BŁĘDY: {domain} (Wysłane ręcznie)</b>\n\n"]
+    prompt = [
+        "Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. "
+        "Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. "
+        "WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. "
+        "Crawler nie sprawdzał bazy danych – może być tak, że w bazie dane są w pełni poprawne, "
+        "a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n"
+        "Oto zestawienie błędów do przeanalizowania:\n\n"
+    ]
+
+    if critical_errors:
+        message.append("❌ <b>KRYTYCZNE:</b>\n")
+        prompt.append("BŁĘDY KRYTYCZNE:\n")
+        for url, err in critical_errors:
+            # str() keeps a NULL url from crashing html.escape.
+            message.append(f"• {err}: {html.escape(str(url))}\n")
+            prompt.append(f"- {err}: {url}\n")
+        message.append("\n")
+        prompt.append("\n")
+
+    if schema_errors:
+        message.append("🛠 <b>SCHEMA.ORG:</b>\n")
+        prompt.append("BŁĘDY SCHEMA.ORG:\n")
+        for url, count in schema_errors:
+            message.append(f"• Brak {count} pól: {html.escape(str(url))}\n")
+            prompt.append(f"- Brak {count} pól: {url}\n")
+        message.append("\n")
+        prompt.append("\n")
+
+    if translation_issues:
+        message.append("🌐 <b>TŁUMACZENIA:</b>\n")
+        prompt.append("BŁĘDY TŁUMACZEŃ:\n")
+        for issue in translation_issues:
+            message.append(f"• {html.escape(issue)}\n")
+            prompt.append(f"- {issue}\n")
+        message.append("\n")
+        prompt.append("\n")
+
+    prompt_text = "".join(prompt).strip()
+    message.append("🤖 <b>Gotowy prompt dla Agenta AI:</b>\n")
+    message.append(f"<pre><code class=\"language-text\">{html.escape(prompt_text)}</code></pre>")
+    return "".join(message)
+
+
+def main():
+    """Report the issues of the most recent crawl to the Telegram errors chat.
+
+    Steps: load the configuration, pick the most recently modified
+    ``scans/crawler_v*.db`` file, read critical, schema and translation
+    findings from it, and send one HTML message that ends with a ready-made
+    prompt for an AI agent. Each failure path prints a message and returns
+    without sending anything.
+    """
+    config = load_config()
+    token = config.get("telegram_token")
+    # Fall back to the info chat when no dedicated errors chat is configured.
+    chat_id_errors = config.get("telegram_chat_id_errors") or config.get("telegram_chat_id_info")
+
+    if not token or not chat_id_errors:
+        print("[!] Brak poprawnej konfiguracji Telegram (token lub chat_id_errors w config.json).")
+        return
+
+    # Find the newest crawl database in the scans subfolder.
+    dbs = glob.glob("scans/crawler_v*.db")
+    if not dbs:
+        print("[!] Nie znaleziono żadnej bazy danych crawler_v*.db w podfolderze scans.")
+        return
+
+    latest_db = max(dbs, key=os.path.getmtime)
+    print(f"[*] Odczytuję dane z bazy: {latest_db}")
+
+    try:
+        domain, critical_errors, schema_errors, translation_issues = _collect_issues(latest_db, "fluo.dog")
+    except Exception as e:
+        # Script-level boundary: any read problem ends the run with a message.
+        print(f"[!] Błąd odczytu bazy danych: {e}")
+        return
+
+    if not (critical_errors or schema_errors or translation_issues):
+        print("[*] Brak błędów do zaraportowania w ostatnim skanie.")
+        return
+
+    # NOTE(review): the Bot API caps message text at 4096 characters; a long
+    # report is rejected and send_telegram prints the HTTP error - confirm
+    # whether splitting the message is wanted.
+    err_msg = _build_report(domain, critical_errors, schema_errors, translation_issues)
+
+    print("[*] Wysyłanie raportu na kanał Errors...")
+    send_telegram(token, chat_id_errors, err_msg)
+
+# Run the report only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()