From 8411593c55c31b490be2c1386ae4cb5e92d9746d Mon Sep 17 00:00:00 2001 From: Lukasz Date: Sat, 9 May 2026 11:10:06 +0200 Subject: [PATCH] Initial commit - Crawler SEO (with AI Agent prompt) --- .gitignore | 27 +++ crawler.py | 443 ++++++++++++++++++++++++++++++++++++ dashboard_api.py | 575 +++++++++++++++++++++++++++++++++++++++++++++++ run_audit.bat | 4 + send_report.py | 144 ++++++++++++ 5 files changed, 1193 insertions(+) create mode 100644 .gitignore create mode 100644 crawler.py create mode 100644 dashboard_api.py create mode 100644 run_audit.bat create mode 100644 send_report.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..acc62ba --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# Wyniki skanowania (duże pliki binarne) +scans/*.db +scans/*.tmp +*.db +*.db-journal +*.csv + +# Konfiguracja (zawiera tokeny) +# Jeśli chcesz współdzielić config przez Git, zakomentuj poniższą linię +config.json + +# Python +__pycache__/ +*.py[cod] +*$py.class +.venv/ +env/ +venv/ +ENV/ + +# Systemowe / Syncthing +.sync/ +.stfolder/ +.stignore +*~sync* +.DS_Store +Thumbs.db diff --git a/crawler.py b/crawler.py new file mode 100644 index 0000000..45daa8a --- /dev/null +++ b/crawler.py @@ -0,0 +1,443 @@ +import requests +from bs4 import BeautifulSoup +import time +import sys +import json +import sqlite3 +import argparse +import glob +import html +from datetime import datetime +from urllib.parse import urljoin, urlparse +from urllib.robotparser import RobotFileParser +import threading +import queue +import os + +if os.name == 'nt': os.system('color') + +GOOGLEBOT_UA = "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + +class TelegramNotifier: + def __init__(self, token, chat_id_info, chat_id_errors): + self.token = token + self.chat_id_info = chat_id_info + self.chat_id_errors = chat_id_errors or chat_id_info + self.enabled = True if token and chat_id_info else False + self.critical_errors = [] + self.schema_errors = [] + self.translation_issues = [] + self.lock = threading.Lock() + + def send(self, message, target='info'): + if not self.enabled: return + cid = self.chat_id_info if target == 'info' else self.chat_id_errors + url = f"https://api.telegram.org/bot{self.token}/sendMessage" + try: + r = requests.post(url, json={"chat_id": cid, "text": message, "parse_mode": "HTML"}, timeout=15) + if r.status_code != 200: + print(f"\n[!] Telegram Error ({target}): {r.text}") + except Exception as e: + print(f"\n[!] 
Connection Error (Telegram): {e}") + + def add_critical(self, url, msg): + with self.lock: + if len(self.critical_errors) < 30: self.critical_errors.append((url, msg)) + + def add_schema(self, url, count): + with self.lock: + if len(self.schema_errors) < 15: self.schema_errors.append((url, count)) + + def add_translation_issue(self, sku, lang1, lang2, field): + with self.lock: + if len(self.translation_issues) < 15: + self.translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}") + + def get_prev_404_count(self, current_db): + # Szukamy baz danych w podkatalogu scans, sortujemy po czasie modyfikacji + dbs = glob.glob("scans/crawler_v*.db") + dbs.sort(key=os.path.getmtime, reverse=True) + + prev_db = None + for d in dbs: + if os.path.basename(d) != os.path.basename(current_db): + prev_db = d + break + if not prev_db: return None + try: + conn = sqlite3.connect(prev_db) + count = conn.execute("SELECT COUNT(*) FROM pages WHERE status = 404").fetchone()[0] + conn.close() + return count + except: return None + + def send_final_report(self, start_url, total, errors, db_file, search_results=-1): + if not self.enabled: + print("\n[!] Powiadomienia Telegram są wyłączone (brak konfiguracji).") + return + + # Analiza 404 i innych błędów + current_404 = 0 + schema_errs = 0 + transl_errs = 0 + try: + conn = sqlite3.connect(db_file) + current_404 = conn.execute("SELECT COUNT(*) FROM pages WHERE status = 404").fetchone()[0] + schema_errs = conn.execute("SELECT COUNT(*) FROM pages WHERE schema_critical > 0").fetchone()[0] + transl_errs = conn.execute("SELECT COUNT(*) FROM translation_audit").fetchone()[0] + conn.close() + except: pass + + prev_404 = self.get_prev_404_count(db_file) + regression_str = "" + if prev_404 is not None: + diff = current_404 - prev_404 + if diff > 0: regression_str = f" (+{diff} NOWE! ⚠️)" + elif diff < 0: regression_str = f" ({diff} naprawione)" + else: regression_str = " (bez zmian)" + + # 1. RAPORT INFO + domain = html.escape(urlparse(start_url).netloc) + + total_icon = "✅" + http_icon = "✅" if errors == 0 else "❌" + err404_icon = "✅" if current_404 == 0 else "❌" + schema_icon = "✅" if schema_errs == 0 else "❌" + transl_icon = "✅" if transl_errs == 0 else "❌" + search_icon = "✅" if search_results > 0 else "❌" + + schema_text = "poprawne" if schema_errs == 0 else f"{schema_errs} błędów" + transl_text = "poprawne" if transl_errs == 0 else f"{transl_errs} błędów" + search_text = f"{search_results}" if search_results >= 0 else "BŁĄD" + + info_msg = f"🏁 AUDYT ZAKOŃCZONY: {domain}\n\n" + info_msg += f"{total_icon} Przeskanowano: {total}\n" + info_msg += f"{http_icon} Błędy HTTP: {errors}\n" + info_msg += f"{err404_icon} Błędy 404: {current_404}{regression_str}\n" + info_msg += f"{schema_icon} Dane strukturalne: {schema_text}\n" + info_msg += f"{transl_icon} Tłumaczenia: {transl_text}\n" + info_msg += f"{search_icon} Wyszukiwarka: {search_text}\n\n" + + if self.critical_errors or self.schema_errors or self.translation_issues: + info_msg += f"🚨 Wykryto błędy. Szczegóły na kanale ERRORS." + else: + info_msg += f"✅ Brak krytycznych błędów." + self.send(info_msg, target='info') + + # 2. RAPORT ERRORS + if self.critical_errors or self.schema_errors or self.translation_issues: + err_msg = f"🚨 BŁĘDY: {domain}\n\n" + + prompt_text = ( + "Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. " + "Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. 
" + "WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. " + "Crawler nie sprawdzał bazy danych – może być tak, że w bazie dane są w pełni poprawne, " + "a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n" + "Oto zestawienie błędów do przeanalizowania:\n\n" + ) + + if self.critical_errors: + err_msg += f"❌ KRYTYCZNE:\n" + prompt_text += "BŁĘDY KRYTYCZNE:\n" + for url, err in self.critical_errors[:15]: + safe_url = html.escape(url) + err_msg += f"• {err}: {safe_url}\n" + prompt_text += f"- {err}: {url}\n" + err_msg += "\n" + prompt_text += "\n" + + if self.schema_errors: + err_msg += f"🛠 SCHEMA.ORG:\n" + prompt_text += "BŁĘDY SCHEMA.ORG:\n" + for url, count in self.schema_errors[:10]: + safe_url = html.escape(url) + err_msg += f"• Brak {count} pól: {safe_url}\n" + prompt_text += f"- Brak {count} pól: {url}\n" + err_msg += "\n" + prompt_text += "\n" + + if self.translation_issues: + err_msg += f"🌐 TŁUMACZENIA:\n" + prompt_text += "BŁĘDY TŁUMACZEŃ:\n" + for issue in self.translation_issues[:10]: + err_msg += f"• {html.escape(issue)}\n" + prompt_text += f"- {issue}\n" + err_msg += "\n" + prompt_text += "\n" + + err_msg += f"🤖 Gotowy prompt dla Agenta AI:\n" + err_msg += f"
<pre>{html.escape(prompt_text.strip())}</pre>
" + + self.send(err_msg, target='errors') + +def crawler(start_url, db_file, max_threads, tg_notifier): + parsed_start = urlparse(start_url) + base_url = f"{parsed_start.scheme}://{parsed_start.netloc}" + base_domain = parsed_start.netloc + conn = sqlite3.connect(db_file, check_same_thread=False) + cursor = conn.cursor() + cursor.execute('''CREATE TABLE IF NOT EXISTS pages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE, source_url TEXT, status INTEGER, + total_time REAL, ttfb REAL, google_access TEXT, index_status TEXT, + schema_critical INTEGER DEFAULT 0, schema_warnings INTEGER DEFAULT 0, + images_no_alt INTEGER DEFAULT 0, images_no_webp INTEGER DEFAULT 0, + title TEXT, meta_desc TEXT, canonical TEXT, + lang TEXT, timestamp DATETIME)''') + cursor.execute('''CREATE TABLE IF NOT EXISTS structured_data (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, schema_type TEXT, full_json TEXT, sku TEXT, FOREIGN KEY(page_id) REFERENCES pages(id))''') + cursor.execute('''CREATE TABLE IF NOT EXISTS translation_audit (id INTEGER PRIMARY KEY AUTOINCREMENT, sku TEXT, field TEXT, lang1 TEXT, lang2 TEXT, content TEXT)''') + cursor.execute('''CREATE TABLE IF NOT EXISTS images_audit (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, img_url TEXT, alt TEXT, is_modern INTEGER, has_modern_source INTEGER, FOREIGN KEY(page_id) REFERENCES pages(id))''') + conn.commit() + + db_queue = queue.Queue() + def db_worker(): + db_conn = sqlite3.connect(db_file) + db_cursor = db_conn.cursor() + while True: + item = db_queue.get() + if item is None: break + try: + p = item['page'] + db_cursor.execute('''INSERT OR REPLACE INTO pages (url, source_url, status, total_time, ttfb, google_access, index_status, schema_critical, schema_warnings, images_no_alt, images_no_webp, title, meta_desc, canonical, lang, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', (p['url'], p['source'], p['status'], p['time'], p['ttfb'], p['access'], p['idx'], p['s_crit'], p['s_warn'], p.get('images_no_alt',0), p.get('images_no_webp',0), p.get('title',''), p.get('meta_desc',''), p.get('canonical',''), p['lang'], p['ts'])) + page_id = db_cursor.lastrowid + for s in item['schemas']: db_cursor.execute('INSERT INTO structured_data (page_id, schema_type, full_json, sku) VALUES (?, ?, ?, ?)', (page_id, s['type'], s['json'], s.get('sku'))) + if 'images' in item: + for img in item['images']: db_cursor.execute('INSERT INTO images_audit (page_id, img_url, alt, is_modern, has_modern_source) VALUES (?, ?, ?, ?, ?)', (page_id, img['img_url'], img['alt'], img['is_modern'], img['has_modern_source'])) + db_conn.commit() + except: pass + finally: db_queue.task_done() + db_conn.close() + + db_thread = threading.Thread(target=db_worker) + db_thread.start() + rp = RobotFileParser() + try: rp.set_url(urljoin(base_url, "robots.txt")); rp.read() + except: pass + + visited, crawled_count, error_count = {start_url}, 0, 0 + total_response_time = 0.0 + visited_lock, stats_lock = threading.Lock(), threading.Lock() + url_queue = queue.Queue() + url_queue.put((start_url, "Start")) + stop_event = threading.Event() + session = requests.Session() + session.headers.update({'User-Agent': GOOGLEBOT_UA}) + + def analyze_schema(soup): + scripts = soup.find_all('script', type='application/ld+json') + results, crit, warn = [], 0, 0 + def get_val(obj, path): + curr = obj + for p in path.split('.'): + if isinstance(curr, dict) and p in curr: curr = curr[p] + else: return None + return curr + for script in scripts: + try: + data = 
json.loads(script.string) + objs = data if isinstance(data, list) else [data] + for obj in objs: + if not isinstance(obj, dict): continue + sku = get_val(obj, 'sku') or get_val(obj, 'mpn') + if 'Product' in str(obj.get('@type', '')): + if not get_val(obj, 'name') or not get_val(obj, 'image') or not get_val(obj, 'offers.price'): crit += 1 + results.append({'type': str(obj.get('@type', 'Unknown')), 'json': json.dumps(obj, ensure_ascii=False), 'sku': str(sku) if sku else None}) + except: pass + return results, crit, warn + + def analyze_images(soup, url): + images_data = [] + no_alt, no_webp = 0, 0 + for img in soup.find_all('img'): + src = img.get('src') or img.get('data-src') or '' + if not src or src.startswith('data:image'): continue + alt = img.get('alt', '').strip() if img.get('alt') is not None else '' + alt_text = alt if alt else '[BRAK]' + is_modern = src.lower().endswith(('.webp', '.avif', '.svg')) + parent = img.find_parent('picture') + has_modern_source = False + if parent: + for source in parent.find_all('source'): + srcs = source.get('srcset', '') + typ = source.get('type', '') + if 'webp' in srcs.lower() or 'avif' in srcs.lower() or 'webp' in typ or 'avif' in typ: + has_modern_source = True + break + if not has_modern_source: + srcset = img.get('srcset', '') + if 'webp' in srcset.lower() or 'avif' in srcset.lower(): + has_modern_source = True + images_data.append({ + 'img_url': urljoin(url, src), 'alt': alt_text, + 'is_modern': int(is_modern), 'has_modern_source': int(has_modern_source) + }) + if alt_text == '[BRAK]': no_alt += 1 + if not is_modern and not has_modern_source: no_webp += 1 + return images_data, no_alt, no_webp + + def process_url(url, source): + nonlocal crawled_count, error_count, total_response_time + if not rp.can_fetch("Googlebot", url): + tg_notifier.add_critical(url, "ROBOTS.TXT BLOCK") + db_queue.put({'page': {'url': url, 'source': source, 'status': 0, 'time': 0, 'ttfb': 0, 'access': 'Blocked', 'idx': '-', 's_crit': 0, 's_warn': 0, 'images_no_alt': 0, 'images_no_webp': 0, 'title': '', 'meta_desc': '', 'canonical': '', 'lang': '?', 'ts': datetime.now().isoformat()}, 'schemas': [], 'images': []}) + return + try: + start_t = time.time() + resp = session.get(url, timeout=10, stream=True) + ttfb = round(time.time() - start_t, 4) + soup = BeautifulSoup(resp.text, 'lxml') + total_t = round(time.time() - start_t, 4) + lang = soup.find('html').get('lang', 'unknown') if soup.find('html') else 'unknown' + schemas, s_crit, s_warn = analyze_schema(soup) + idx = "Indexable" + if 'noindex' in resp.headers.get('X-Robots-Tag', '').lower(): idx = "Noindex" + elif soup.find('meta', attrs={'name': ['robots', 'googlebot'], 'content': lambda x: x and 'noindex' in x.lower()}): idx = "Noindex" + + + title_tag = soup.find('title') + title = title_tag.text.strip() if title_tag else '' + meta_desc_tag = soup.find('meta', attrs={'name': 'description'}) + meta_desc = meta_desc_tag.get('content', '').strip() if meta_desc_tag else '' + canonical_tag = soup.find('link', rel='canonical') + canonical = canonical_tag.get('href', '').strip() if canonical_tag else '' + + images_data, no_alt, no_webp = analyze_images(soup, url) + + if resp.status_code >= 500: tg_notifier.add_critical(url, f"ERR {resp.status_code}") + if resp.status_code == 404: tg_notifier.add_critical(url, "404") + if s_crit > 0: tg_notifier.add_schema(url, s_crit) + + db_queue.put({'page': {'url': url, 'source': source, 'status': resp.status_code, 'time': total_t, 'ttfb': ttfb, 'access': 'Allowed', 'idx': idx, 's_crit': 
s_crit, 's_warn': s_warn, 'images_no_alt': no_alt, 'images_no_webp': no_webp, 'title': title, 'meta_desc': meta_desc, 'canonical': canonical, 'lang': lang, 'ts': datetime.now().isoformat()}, 'schemas': schemas, 'images': images_data}) + with stats_lock: + crawled_count += 1 + total_response_time += total_t + if resp.status_code != 200: error_count += 1 + if resp.status_code == 200 and 'text/html' in resp.headers.get('Content-Type', ''): + for link in soup.find_all('a', href=True): + full = urljoin(url, link['href']) + parsed = urlparse(full) + if parsed.netloc == base_domain: + clean = parsed._replace(query='', fragment='').geturl() + with visited_lock: + if clean not in visited: visited.add(clean); url_queue.put((clean, url)) + except: + with stats_lock: error_count += 1 + + def worker(): + while not stop_event.is_set(): + try: u, s = url_queue.get(timeout=0.5); process_url(u, s); url_queue.task_done() + except queue.Empty: continue + + threads = [threading.Thread(target=worker, daemon=True) for _ in range(max_threads)] + for t in threads: t.start() + try: + while not stop_event.is_set() and url_queue.unfinished_tasks > 0: + with stats_lock: + cc = crawled_count + err = error_count + avg = round(total_response_time / cc, 3) if cc > 0 else 0 + q_size = url_queue.unfinished_tasks + print(f"\r[AUDYT] Skanowanie: {cc} | Błędy: {err} | Kolejka: {q_size} | Średni czas: {avg}s ", end="") + time.sleep(0.5) + except KeyboardInterrupt: + print("\n[!] Przerwano (Ctrl+C). Trwa bezpieczne zamykanie...") + stop_event.set() + while not url_queue.empty(): + try: url_queue.get_nowait(); url_queue.task_done() + except queue.Empty: break + + if not stop_event.is_set(): + url_queue.join() + else: + # Czekamy krótką chwilę, by wątki url_queue zdążyły zobaczyć stop_event + time.sleep(1) + + print("\n[*] Zapisywanie bazy danych, proszę czekać...") + db_queue.put(None) + db_thread.join() + + # AUDYT WIELOJĘZYCZNOŚCI + cursor.execute('SELECT sku, lang, full_json FROM structured_data JOIN pages ON structured_data.page_id = pages.id WHERE sku IS NOT NULL AND sku != "None"') + sku_map = {} + for sku, lang, fjson in cursor.fetchall(): + if sku not in sku_map: sku_map[sku] = {} + data = json.loads(fjson) + sku_map[sku][lang] = {'name': data.get('name', ''), 'description': data.get('description', '')} + for sku, langs in sku_map.items(): + lang_list = list(langs.keys()) + if len(lang_list) > 1: + for i in range(len(lang_list)): + for j in range(i + 1, len(lang_list)): + l1, l2 = lang_list[i], lang_list[j] + if langs[l1]['name'] == langs[l2]['name']: + cursor.execute('INSERT INTO translation_audit (sku, field, lang1, lang2, content) VALUES (?, ?, ?, ?, ?)', (sku, 'name', l1, l2, langs[l1]['name'])) + tg_notifier.add_translation_issue(sku, l1, l2, 'name') + conn.commit(); conn.close() + + # TEST WYSZUKIWARKI + search_count = -1 + try: + print("\n[*] Przeprowadzanie testu wyszukiwarki (szukana fraza: karuzela)...") + search_url = "https://fluo.dog/szukaj?controller=search&s=karuzela" + resp_search = session.get(search_url, timeout=15) + if resp_search.status_code == 200: + soup_search = BeautifulSoup(resp_search.text, 'lxml') + products = soup_search.find_all('article', class_='product-miniature') + search_count = len(products) + if search_count == 0: + tg_notifier.add_critical(search_url, "TEST WYSZUKIWARKI: Brak wyników (0) dla 'karuzela'!") + print("[!] 
Test wyszukiwarki NIEPOWODZENIE: 0 wyników.") + else: + print(f"[*] Test wyszukiwarki OK: znaleziono {search_count} produktów.") + else: + tg_notifier.add_critical(search_url, f"TEST WYSZUKIWARKI: Błąd HTTP {resp_search.status_code}") + print(f"[!] Test wyszukiwarki BŁĄD HTTP: {resp_search.status_code}") + except Exception as e: + tg_notifier.add_critical("https://fluo.dog/szukaj?controller=search&s=karuzela", f"TEST WYSZUKIWARKI: Błąd połączenia") + print(f"[!] Test wyszukiwarki BŁĄD: {e}") + + tg_notifier.send_final_report(start_url, crawled_count, error_count, db_file, search_results=search_count) + +if __name__ == "__main__": + def load_config(): + try: + # Wymuszamy utf-8, żeby uniknąć problemów z kodowaniem Windows + with open("config.json", "r", encoding="utf-8") as f: + raw_config = json.load(f) + # Czyścimy klucze ze spacji na wszelki wypadek + return {k.strip(): v for k, v in raw_config.items()} + except Exception as e: + print(f"[!] Błąd wczytywania config.json: {e}") + return {} + + parser = argparse.ArgumentParser(description="Crawler SEO - Podgląd błędów i audyt.") + parser.add_argument("--url", default="https://fluo.dog", help="Startowy URL") + parser.add_argument("--threads", type=int, default=10, help="Liczba wątków") + args = parser.parse_args() + + # Upewnij się, że katalog scans istnieje + if not os.path.exists("scans"): + os.makedirs("scans") + + config = load_config() + + # Debug: wypiszmy jakie klucze faktycznie widzi Python + available_keys = ", ".join(config.keys()) + print(f"[*] Wczytane klucze z config: {available_keys}") + + token = config.get("telegram_token") + id_info = config.get("telegram_chat_id_info") + id_err = config.get("telegram_chat_id_errors") + + print(f"[*] Konfiguracja: INFO_ID={id_info}, ERRORS_ID={id_err}") + + notifier = TelegramNotifier(token, id_info, id_err) + if notifier.enabled: + print("[*] Telegram powiadomienia: WŁĄCZONE") + notifier.send(f"🚀 Rozpoczynam audyt SEO dla: {html.escape(args.url)}", target='info') + else: + print("[!] Telegram powiadomienia: WYŁĄCZONE (sprawdź czy klucze w config.json są poprawne)") + + db_name = f"scans/crawler_v18_{datetime.now().strftime('%Y%m%d_%H%M%S')}.db" + crawler(args.url, db_name, args.threads, notifier) diff --git a/dashboard_api.py b/dashboard_api.py new file mode 100644 index 0000000..36407d0 --- /dev/null +++ b/dashboard_api.py @@ -0,0 +1,575 @@ +import sqlite3 +import json +import glob +import os +from fastapi import FastAPI, HTTPException, Query +from fastapi.responses import HTMLResponse, StreamingResponse +from typing import List, Optional +import io +import csv + +app = FastAPI(title="Crawler SEO Dashboard API") + +def get_db_conn(db_name: str): + # Sprawdź czy db_name zawiera już katalog, jeśli nie - dodaj scans/ + if not db_name.startswith("scans/"): + db_path = os.path.join("scans", db_name) + else: + db_path = db_name + + if not os.path.exists(db_path): + raise HTTPException(status_code=404, detail=f"Baza danych nie istnieje: {db_path}") + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + return conn + +@app.get("/", response_class=HTMLResponse) +def get_dashboard(): + return """ + + + + + + Crawler SEO Dashboard + + + + + + + + + +
+ + + + """ + +@app.get("/api/list-dbs") +def list_dbs(): + dbs = glob.glob("scans/*.db") + # Zwracamy same nazwy plików dla ładniejszego widoku w select + return sorted([os.path.basename(db) for db in dbs], reverse=True) + +@app.get("/api/stats") +def get_stats(db: str): + conn = get_db_conn(db) + cursor = conn.cursor() + stats = {"total_pages": 0, "errors": 0, "avg_time": 0, "schema_objects": 0, "img_issues": 0, "translation_errors": 0} + try: + stats["total_pages"] = cursor.execute("SELECT COUNT(*) FROM pages").fetchone()[0] + stats["errors"] = cursor.execute("SELECT COUNT(*) FROM pages WHERE status != 200 AND status != 0").fetchone()[0] + stats["avg_time"] = cursor.execute("SELECT AVG(total_time) FROM pages WHERE total_time > 0").fetchone()[0] or 0 + stats["schema_objects"] = cursor.execute("SELECT COUNT(*) FROM structured_data").fetchone()[0] + except: pass + try: + img_stats = cursor.execute("SELECT SUM(images_no_alt), SUM(images_no_webp) FROM pages").fetchone() + stats["img_issues"] = (img_stats[0] or 0) + (img_stats[1] or 0) + except: pass + try: + stats["translation_errors"] = cursor.execute("SELECT COUNT(*) FROM translation_audit").fetchone()[0] + except: pass + conn.close() + return stats + +@app.get("/api/pages") +def get_pages(db: str, status_type: Optional[str] = "all"): + conn = get_db_conn(db) + cursor = conn.cursor() + try: + query = "SELECT * FROM pages" + try: + cursor.execute("SELECT images_no_alt FROM pages LIMIT 1") + has_img_cols = True + except: has_img_cols = False + + if status_type == "error": query += " WHERE status != 200 AND status != 0" + elif status_type == "noindex": query += " WHERE index_status LIKE 'Noindex%'" + elif status_type == "slow": query += " WHERE total_time > 1.5" + elif status_type == "images" and has_img_cols: query += " WHERE images_no_alt > 0 OR images_no_webp > 0" + + query += " ORDER BY id DESC LIMIT 1000" + pages = cursor.execute(query).fetchall() + return [dict(p) for p in pages] + except: return [] + finally: conn.close() + +@app.get("/api/translations") +def get_translations(db: str): + conn = get_db_conn(db) + cursor = conn.cursor() + try: + try: + cursor.execute("SELECT title, meta_desc FROM pages LIMIT 1") + has_meta = True + except: has_meta = False + + if has_meta: + query = """ + SELECT s.sku, p.lang, s.full_json, MIN(p.url) as url, MAX(p.title) as title, MAX(p.meta_desc) as meta_desc + FROM structured_data s + JOIN pages p ON s.page_id = p.id + WHERE s.sku IS NOT NULL AND s.sku != 'None' AND s.sku != '' AND s.schema_type LIKE '%Product%' + GROUP BY s.sku, p.lang + """ + else: + query = """ + SELECT s.sku, p.lang, s.full_json, MIN(p.url) as url, '' as title, '' as meta_desc + FROM structured_data s + JOIN pages p ON s.page_id = p.id + WHERE s.sku IS NOT NULL AND s.sku != 'None' AND s.sku != '' AND s.schema_type LIKE '%Product%' + GROUP BY s.sku, p.lang + """ + rows = cursor.execute(query).fetchall() + + sku_map = {} + langs_set = set() + + for r in rows: + sku = str(r['sku']).strip() + lang = str(r['lang']).lower().strip() + if '-' in lang: lang = lang.split('-')[0] + langs_set.add(lang) + + try: data = json.loads(r['full_json']) + except: continue + + obj = {} + if isinstance(data, list): obj = next((item for item in data if 'Product' in str(item.get('@type', ''))), {}) + else: obj = data if 'Product' in str(data.get('@type', '')) else {} + + name = obj.get('name', '').strip() + desc = obj.get('description', '').strip() + title = (r['title'] or '').strip() + meta_desc = (r['meta_desc'] or '').strip() + + slug = '' + if 
r['url']: + parts = r['url'].rstrip('/').split('/') + if parts: slug = parts[-1].split('?')[0].split('#')[0] + + if sku not in sku_map: sku_map[sku] = {'langs': {}, 'url': r['url']} + sku_map[sku]['langs'][lang] = { + 'nazwa': name, 'opis': desc, + 'nazwa seo': title, 'opis seo': meta_desc, 'slug': slug + } + + all_langs = sorted(list(langs_set)) + if 'pl' in all_langs: all_langs.remove('pl') + + results = [] + fields = ['nazwa', 'opis', 'nazwa seo', 'opis seo', 'slug'] + + for sku, info in sku_map.items(): + if 'pl' not in info['langs']: continue + + pl_data = info['langs']['pl'] + sku_has_errors = False + sku_rows = [] + + for field in fields: + pl_val = pl_data.get(field, '') + if not pl_val: continue + + row = {'sku': sku, 'field': field, 'url': info['url']} + for lang in all_langs: + l_val = info['langs'].get(lang, {}).get(field, '') + if not l_val or l_val == pl_val: + row[lang] = 'X' + sku_has_errors = True + else: + row[lang] = 'V' + sku_rows.append(row) + + if sku_has_errors: + results.extend(sku_rows) + + return {"langs": all_langs, "data": results} + except Exception as e: + print(f"Error in translations: {e}") + return {"langs": [], "data": []} + finally: conn.close() + +@app.get("/api/analysis/{page_id}") +def get_analysis(db: str, page_id: int): + conn = get_db_conn(db) + cursor = conn.cursor() + try: + schemas = cursor.execute("SELECT schema_type, full_json FROM structured_data WHERE page_id = ?", (page_id,)).fetchall() + try: images = cursor.execute("SELECT img_url, alt, is_modern, has_modern_source FROM images_audit WHERE page_id = ?", (page_id,)).fetchall() + except: images = [] + schema_list = [] + for s in schemas: + try: schema_list.append({"type": s["schema_type"], "data": json.loads(s["full_json"])}) + except: schema_list.append({"type": s["schema_type"], "data": s["full_json"]}) + return {"schemas": schema_list, "images": [dict(img) for img in images]} + finally: + conn.close() + +@app.get("/api/export-csv") +def export_csv(db: str, status_type: Optional[str] = "all"): + conn = get_db_conn(db) + cursor = conn.cursor() + + try: + cursor.execute("SELECT images_no_alt FROM pages LIMIT 1") + has_img_cols = True + except: has_img_cols = False + + query = "SELECT * FROM pages" + if status_type == "error": query += " WHERE status != 200 AND status != 0" + elif status_type == "noindex": query += " WHERE index_status LIKE 'Noindex%'" + elif status_type == "slow": query += " WHERE total_time > 1.5" + elif status_type == "images" and has_img_cols: query += " WHERE images_no_alt > 0 OR images_no_webp > 0" + query += " ORDER BY id DESC" + pages = cursor.execute(query).fetchall() + conn.close() + + output = io.StringIO() + writer = csv.writer(output, delimiter=';') + if pages: + keys = list(dict(pages[0]).keys()) + writer.writerow([k.upper() for k in keys]) + for p in pages: + writer.writerow([dict(p).get(k, '') for k in keys]) + else: + writer.writerow(['BRAK DANYCH']) + output.seek(0) + filename = f"raport_seo_{status_type}_{db.replace('.db', '')}.csv" + return StreamingResponse(io.BytesIO(output.getvalue().encode('utf-8-sig')), media_type="text/csv", headers={"Content-Disposition": f"attachment; filename={filename}"}) + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="127.0.0.1", port=8000) diff --git a/run_audit.bat b/run_audit.bat new file mode 100644 index 0000000..4d60bd7 --- /dev/null +++ b/run_audit.bat @@ -0,0 +1,4 @@ +@echo off +cd /d "e:\Lukasz\Projekty\Python\Crawler_XML" +python crawler.py --url https://fluo.dog +exit diff --git a/send_report.py 
b/send_report.py new file mode 100644 index 0000000..96886e2 --- /dev/null +++ b/send_report.py @@ -0,0 +1,144 @@ +import sqlite3 +import json +import glob +import os +import html +import requests +from urllib.parse import urlparse + +def load_config(): + try: + with open("config.json", "r", encoding="utf-8") as f: + raw_config = json.load(f) + return {k.strip(): v for k, v in raw_config.items()} + except Exception as e: + print(f"[!] Błąd wczytywania config.json: {e}") + return {} + +def send_telegram(token, chat_id, message): + url = f"https://api.telegram.org/bot{token}/sendMessage" + try: + r = requests.post(url, json={"chat_id": chat_id, "text": message, "parse_mode": "HTML"}, timeout=15) + if r.status_code != 200: + print(f"[!] Błąd wysyłania (HTTP {r.status_code}): {r.text}") + else: + print("[*] Wiadomość wysłana pomyślnie.") + except Exception as e: + print(f"[!] Błąd połączenia z Telegramem: {e}") + +def main(): + config = load_config() + token = config.get("telegram_token") + chat_id_errors = config.get("telegram_chat_id_errors") or config.get("telegram_chat_id_info") + + if not token or not chat_id_errors: + print("[!] Brak poprawnej konfiguracji Telegram (token lub chat_id_errors w config.json).") + return + + # Znajdź najnowszą bazę danych crawla w podfolderze scans + dbs = glob.glob("scans/crawler_v*.db") + if not dbs: + print("[!] Nie znaleziono żadnej bazy danych crawler_v*.db w podfolderze scans.") + return + + dbs.sort(key=os.path.getmtime, reverse=True) + latest_db = dbs[0] + print(f"[*] Odczytuję dane z bazy: {latest_db}") + + critical_errors = [] + schema_errors = [] + translation_issues = [] + domain = "fluo.dog" + + try: + conn = sqlite3.connect(latest_db) + cursor = conn.cursor() + + # Wyciągamy domenę z pierwszego rekordu + cursor.execute("SELECT url FROM pages LIMIT 1") + row = cursor.fetchone() + if row: + domain = html.escape(urlparse(row[0]).netloc) + + # Błędy krytyczne (404, 500+, zablokowane) + cursor.execute("SELECT url, status, google_access FROM pages WHERE status = 404 OR status >= 500 OR google_access = 'Blocked' LIMIT 15") + for url, status, access in cursor.fetchall(): + if access == 'Blocked': + err_type = "ROBOTS.TXT BLOCK" + elif status == 404: + err_type = "404" + else: + err_type = f"ERR {status}" + critical_errors.append((url, err_type)) + + # Błędy schema + cursor.execute("SELECT url, schema_critical FROM pages WHERE schema_critical > 0 LIMIT 10") + for url, count in cursor.fetchall(): + schema_errors.append((url, count)) + + # Błędy tłumaczeń + try: + cursor.execute("SELECT sku, lang1, lang2, field FROM translation_audit LIMIT 10") + for sku, lang1, lang2, field in cursor.fetchall(): + translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}") + except sqlite3.OperationalError: + pass # Jeśli z jakiegoś powodu nie ma jeszcze tej tabeli + + conn.close() + except Exception as e: + print(f"[!] Błąd odczytu bazy danych: {e}") + return + + if not (critical_errors or schema_errors or translation_issues): + print("[*] Brak błędów do zaraportowania w ostatnim skanie.") + return + + # Budowanie wiadomości z promptem + err_msg = f"🚨 BŁĘDY: {domain} (Wysłane ręcznie)\n\n" + + prompt_text = ( + "Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. " + "Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. " + "WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. 
" + "Crawler nie sprawdzał bazy danych – może być tak, że w bazie dane są w pełni poprawne, " + "a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n" + "Oto zestawienie błędów do przeanalizowania:\n\n" + ) + + if critical_errors: + err_msg += f"❌ KRYTYCZNE:\n" + prompt_text += "BŁĘDY KRYTYCZNE:\n" + for url, err in critical_errors: + safe_url = html.escape(url) + err_msg += f"• {err}: {safe_url}\n" + prompt_text += f"- {err}: {url}\n" + err_msg += "\n" + prompt_text += "\n" + + if schema_errors: + err_msg += f"🛠 SCHEMA.ORG:\n" + prompt_text += "BŁĘDY SCHEMA.ORG:\n" + for url, count in schema_errors: + safe_url = html.escape(url) + err_msg += f"• Brak {count} pól: {safe_url}\n" + prompt_text += f"- Brak {count} pól: {url}\n" + err_msg += "\n" + prompt_text += "\n" + + if translation_issues: + err_msg += f"🌐 TŁUMACZENIA:\n" + prompt_text += "BŁĘDY TŁUMACZEŃ:\n" + for issue in translation_issues: + err_msg += f"• {html.escape(issue)}\n" + prompt_text += f"- {issue}\n" + err_msg += "\n" + prompt_text += "\n" + + err_msg += f"🤖 Gotowy prompt dla Agenta AI:\n" + err_msg += f"
<pre>{html.escape(prompt_text.strip())}</pre>
" + + print("[*] Wysyłanie raportu na kanał Errors...") + send_telegram(token, chat_id_errors, err_msg) + +if __name__ == "__main__": + main()