Initial commit - Crawler SEO (with AI Agent prompt)

2026-05-09 11:10:06 +02:00
commit 8411593c55
5 changed files with 1193 additions and 0 deletions
.gitignore +27
@@ -0,0 +1,27 @@
# Scan results (large binary files)
scans/*.db
scans/*.tmp
*.db
*.db-journal
*.csv
# Configuration (contains tokens)
# If you want to share the config via Git, comment out the line below
config.json
# Python
__pycache__/
*.py[cod]
*$py.class
.venv/
env/
venv/
ENV/
# System files / Syncthing
.sync/
.stfolder/
.stignore
*~sync*
.DS_Store
Thumbs.db
crawler.py +443
@@ -0,0 +1,443 @@
import requests
from bs4 import BeautifulSoup
import time
import sys
import json
import sqlite3
import argparse
import glob
import html
from datetime import datetime
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import threading
import queue
import os
if os.name == 'nt': os.system('color')  # enable ANSI color support in the Windows console
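# Googlebot smartphone user agent, so pages are fetched the way Google's mobile crawler sees them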
GOOGLEBOT_UA = "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
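# Thread-safe collector of crawl findings; sends summaries to the 'info' Telegram
# channel and detailed reports (with a ready-made AI agent prompt) to 'errors'.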
class TelegramNotifier:
def __init__(self, token, chat_id_info, chat_id_errors):
self.token = token
self.chat_id_info = chat_id_info
self.chat_id_errors = chat_id_errors or chat_id_info
self.enabled = bool(token and chat_id_info)
self.critical_errors = []
self.schema_errors = []
self.translation_issues = []
self.lock = threading.Lock()
def send(self, message, target='info'):
if not self.enabled: return
cid = self.chat_id_info if target == 'info' else self.chat_id_errors
url = f"https://api.telegram.org/bot{self.token}/sendMessage"
try:
r = requests.post(url, json={"chat_id": cid, "text": message, "parse_mode": "HTML"}, timeout=15)
if r.status_code != 200:
print(f"\n[!] Telegram Error ({target}): {r.text}")
except Exception as e:
print(f"\n[!] Connection Error (Telegram): {e}")
def add_critical(self, url, msg):
with self.lock:
if len(self.critical_errors) < 30: self.critical_errors.append((url, msg))
def add_schema(self, url, count):
with self.lock:
if len(self.schema_errors) < 15: self.schema_errors.append((url, count))
def add_translation_issue(self, sku, lang1, lang2, field):
with self.lock:
if len(self.translation_issues) < 15:
self.translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}")
def get_prev_404_count(self, current_db):
# Look for scan databases in the scans subdirectory, sorted by modification time (newest first)
dbs = glob.glob("scans/crawler_v*.db")
dbs.sort(key=os.path.getmtime, reverse=True)
prev_db = None
for d in dbs:
if os.path.basename(d) != os.path.basename(current_db):
prev_db = d
break
if not prev_db: return None
try:
conn = sqlite3.connect(prev_db)
count = conn.execute("SELECT COUNT(*) FROM pages WHERE status = 404").fetchone()[0]
conn.close()
return count
except Exception: return None
def send_final_report(self, start_url, total, errors, db_file, search_results=-1):
if not self.enabled:
print("\n[!] Powiadomienia Telegram są wyłączone (brak konfiguracji).")
return
# Count 404s and other error classes in the current scan database
current_404 = 0
schema_errs = 0
transl_errs = 0
try:
conn = sqlite3.connect(db_file)
current_404 = conn.execute("SELECT COUNT(*) FROM pages WHERE status = 404").fetchone()[0]
schema_errs = conn.execute("SELECT COUNT(*) FROM pages WHERE schema_critical > 0").fetchone()[0]
transl_errs = conn.execute("SELECT COUNT(*) FROM translation_audit").fetchone()[0]
conn.close()
except Exception: pass
prev_404 = self.get_prev_404_count(db_file)
regression_str = ""
if prev_404 is not None:
diff = current_404 - prev_404
if diff > 0: regression_str = f" (<b>+{diff} NOWE!</b> ⚠️)"
elif diff < 0: regression_str = f" ({diff} naprawione)"
else: regression_str = " (bez zmian)"
# 1. INFO report
domain = html.escape(urlparse(start_url).netloc)
total_icon = "📊"
http_icon = "✅" if errors == 0 else "❌"
err404_icon = "✅" if current_404 == 0 else "❌"
schema_icon = "✅" if schema_errs == 0 else "❌"
transl_icon = "✅" if transl_errs == 0 else "❌"
search_icon = "✅" if search_results > 0 else "❌"
schema_text = "poprawne" if schema_errs == 0 else f"{schema_errs} błędów"
transl_text = "poprawne" if transl_errs == 0 else f"{transl_errs} błędów"
search_text = f"{search_results}" if search_results >= 0 else "BŁĄD"
info_msg = f"🏁 <b>AUDYT ZAKOŃCZONY: {domain}</b>\n\n"
info_msg += f"{total_icon} Przeskanowano: {total}\n"
info_msg += f"{http_icon} Błędy HTTP: {errors}\n"
info_msg += f"{err404_icon} Błędy 404: {current_404}{regression_str}\n"
info_msg += f"{schema_icon} Dane strukturalne: {schema_text}\n"
info_msg += f"{transl_icon} Tłumaczenia: {transl_text}\n"
info_msg += f"{search_icon} Wyszukiwarka: {search_text}\n\n"
if self.critical_errors or self.schema_errors or self.translation_issues:
info_msg += f"🚨 Wykryto błędy. Szczegóły na kanale ERRORS."
else:
info_msg += f"✅ Brak krytycznych błędów."
self.send(info_msg, target='info')
# 2. ERRORS report
if self.critical_errors or self.schema_errors or self.translation_issues:
err_msg = f"🚨 <b>BŁĘDY: {domain}</b>\n\n"
prompt_text = (
"Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. "
"Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. "
"WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. "
"Crawler nie sprawdzał bazy danych może być tak, że w bazie dane są w pełni poprawne, "
"a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n"
"Oto zestawienie błędów do przeanalizowania:\n\n"
)
if self.critical_errors:
err_msg += f"❌ <b>KRYTYCZNE:</b>\n"
prompt_text += "BŁĘDY KRYTYCZNE:\n"
for url, err in self.critical_errors[:15]:
safe_url = html.escape(url)
err_msg += f"{err}: {safe_url}\n"
prompt_text += f"- {err}: {url}\n"
err_msg += "\n"
prompt_text += "\n"
if self.schema_errors:
err_msg += f"🛠 <b>SCHEMA.ORG:</b>\n"
prompt_text += "BŁĘDY SCHEMA.ORG:\n"
for url, count in self.schema_errors[:10]:
safe_url = html.escape(url)
err_msg += f"• Brak {count} pól: {safe_url}\n"
prompt_text += f"- Brak {count} pól: {url}\n"
err_msg += "\n"
prompt_text += "\n"
if self.translation_issues:
err_msg += f"🌐 <b>TŁUMACZENIA:</b>\n"
prompt_text += "BŁĘDY TŁUMACZEŃ:\n"
for issue in self.translation_issues[:10]:
err_msg += f"{html.escape(issue)}\n"
prompt_text += f"- {issue}\n"
err_msg += "\n"
prompt_text += "\n"
err_msg += f"🤖 <b>Gotowy prompt dla Agenta AI:</b>\n"
err_msg += f"<pre><code class=\"language-text\">{html.escape(prompt_text.strip())}</code></pre>"
self.send(err_msg, target='errors')
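# Multi-threaded crawl of a single domain: pages are persisted to SQLite, then a
# translation audit and an on-site search test run, and a Telegram report is sent.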
def crawler(start_url, db_file, max_threads, tg_notifier):
parsed_start = urlparse(start_url)
base_url = f"{parsed_start.scheme}://{parsed_start.netloc}"
base_domain = parsed_start.netloc
conn = sqlite3.connect(db_file, check_same_thread=False)
cursor = conn.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE, source_url TEXT, status INTEGER,
total_time REAL, ttfb REAL, google_access TEXT, index_status TEXT,
schema_critical INTEGER DEFAULT 0, schema_warnings INTEGER DEFAULT 0,
images_no_alt INTEGER DEFAULT 0, images_no_webp INTEGER DEFAULT 0,
title TEXT, meta_desc TEXT, canonical TEXT,
lang TEXT, timestamp DATETIME)''')
cursor.execute('''CREATE TABLE IF NOT EXISTS structured_data (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, schema_type TEXT, full_json TEXT, sku TEXT, FOREIGN KEY(page_id) REFERENCES pages(id))''')
cursor.execute('''CREATE TABLE IF NOT EXISTS translation_audit (id INTEGER PRIMARY KEY AUTOINCREMENT, sku TEXT, field TEXT, lang1 TEXT, lang2 TEXT, content TEXT)''')
cursor.execute('''CREATE TABLE IF NOT EXISTS images_audit (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, img_url TEXT, alt TEXT, is_modern INTEGER, has_modern_source INTEGER, FOREIGN KEY(page_id) REFERENCES pages(id))''')
conn.commit()
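# All writes go through a single DB-writer thread fed by this queue, since a
# SQLite connection must not be shared across the crawl threads.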
db_queue = queue.Queue()
def db_worker():
db_conn = sqlite3.connect(db_file)
db_cursor = db_conn.cursor()
while True:
item = db_queue.get()
if item is None: break
try:
p = item['page']
db_cursor.execute('''INSERT OR REPLACE INTO pages (url, source_url, status, total_time, ttfb, google_access, index_status, schema_critical, schema_warnings, images_no_alt, images_no_webp, title, meta_desc, canonical, lang, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', (p['url'], p['source'], p['status'], p['time'], p['ttfb'], p['access'], p['idx'], p['s_crit'], p['s_warn'], p.get('images_no_alt',0), p.get('images_no_webp',0), p.get('title',''), p.get('meta_desc',''), p.get('canonical',''), p['lang'], p['ts']))
page_id = db_cursor.lastrowid
for s in item['schemas']: db_cursor.execute('INSERT INTO structured_data (page_id, schema_type, full_json, sku) VALUES (?, ?, ?, ?)', (page_id, s['type'], s['json'], s.get('sku')))
if 'images' in item:
for img in item['images']: db_cursor.execute('INSERT INTO images_audit (page_id, img_url, alt, is_modern, has_modern_source) VALUES (?, ?, ?, ?, ?)', (page_id, img['img_url'], img['alt'], img['is_modern'], img['has_modern_source']))
db_conn.commit()
except Exception: pass
finally: db_queue.task_done()
db_conn.close()
db_thread = threading.Thread(target=db_worker)
db_thread.start()
rp = RobotFileParser()
try: rp.set_url(urljoin(base_url, "robots.txt")); rp.read()
except Exception: pass
visited, crawled_count, error_count = {start_url}, 0, 0
total_response_time = 0.0
visited_lock, stats_lock = threading.Lock(), threading.Lock()
url_queue = queue.Queue()
url_queue.put((start_url, "Start"))
stop_event = threading.Event()
session = requests.Session()
session.headers.update({'User-Agent': GOOGLEBOT_UA})
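# Parse JSON-LD blocks; a Product missing name, image or offers.price counts as a critical schema error.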
def analyze_schema(soup):
scripts = soup.find_all('script', type='application/ld+json')
results, crit, warn = [], 0, 0
def get_val(obj, path):
curr = obj
for p in path.split('.'):
if isinstance(curr, dict) and p in curr: curr = curr[p]
else: return None
return curr
for script in scripts:
try:
data = json.loads(script.string)
objs = data if isinstance(data, list) else [data]
for obj in objs:
if not isinstance(obj, dict): continue
sku = get_val(obj, 'sku') or get_val(obj, 'mpn')
if 'Product' in str(obj.get('@type', '')):
if not get_val(obj, 'name') or not get_val(obj, 'image') or not get_val(obj, 'offers.price'): crit += 1
results.append({'type': str(obj.get('@type', 'Unknown')), 'json': json.dumps(obj, ensure_ascii=False), 'sku': str(sku) if sku else None})
except (json.JSONDecodeError, TypeError): pass
return results, crit, warn
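# Audit <img> tags: flag missing alt text and images served only in legacy formats
# (no WebP/AVIF/SVG directly, via a <picture>/<source> parent, or via srcset).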
def analyze_images(soup, url):
images_data = []
no_alt, no_webp = 0, 0
for img in soup.find_all('img'):
src = img.get('src') or img.get('data-src') or ''
if not src or src.startswith('data:image'): continue
alt = img.get('alt', '').strip() if img.get('alt') is not None else ''
alt_text = alt if alt else '[BRAK]'
is_modern = src.lower().endswith(('.webp', '.avif', '.svg'))
parent = img.find_parent('picture')
has_modern_source = False
if parent:
for source in parent.find_all('source'):
srcs = source.get('srcset', '')
typ = source.get('type', '')
if 'webp' in srcs.lower() or 'avif' in srcs.lower() or 'webp' in typ or 'avif' in typ:
has_modern_source = True
break
if not has_modern_source:
srcset = img.get('srcset', '')
if 'webp' in srcset.lower() or 'avif' in srcset.lower():
has_modern_source = True
images_data.append({
'img_url': urljoin(url, src), 'alt': alt_text,
'is_modern': int(is_modern), 'has_modern_source': int(has_modern_source)
})
if alt_text == '[BRAK]': no_alt += 1
if not is_modern and not has_modern_source: no_webp += 1
return images_data, no_alt, no_webp
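# Fetch a single URL: honor robots.txt, time the response, extract SEO metadata,
# hand the row to the DB writer and enqueue newly discovered same-domain links.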
def process_url(url, source):
nonlocal crawled_count, error_count, total_response_time
if not rp.can_fetch("Googlebot", url):
tg_notifier.add_critical(url, "ROBOTS.TXT BLOCK")
db_queue.put({'page': {'url': url, 'source': source, 'status': 0, 'time': 0, 'ttfb': 0, 'access': 'Blocked', 'idx': '-', 's_crit': 0, 's_warn': 0, 'images_no_alt': 0, 'images_no_webp': 0, 'title': '', 'meta_desc': '', 'canonical': '', 'lang': '?', 'ts': datetime.now().isoformat()}, 'schemas': [], 'images': []})
return
try:
start_t = time.time()
resp = session.get(url, timeout=10, stream=True)
ttfb = round(time.time() - start_t, 4)
soup = BeautifulSoup(resp.text, 'lxml')
total_t = round(time.time() - start_t, 4)
lang = soup.find('html').get('lang', 'unknown') if soup.find('html') else 'unknown'
schemas, s_crit, s_warn = analyze_schema(soup)
idx = "Indexable"
if 'noindex' in resp.headers.get('X-Robots-Tag', '').lower(): idx = "Noindex"
elif soup.find('meta', attrs={'name': ['robots', 'googlebot'], 'content': lambda x: x and 'noindex' in x.lower()}): idx = "Noindex"
title_tag = soup.find('title')
title = title_tag.text.strip() if title_tag else ''
meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
meta_desc = meta_desc_tag.get('content', '').strip() if meta_desc_tag else ''
canonical_tag = soup.find('link', rel='canonical')
canonical = canonical_tag.get('href', '').strip() if canonical_tag else ''
images_data, no_alt, no_webp = analyze_images(soup, url)
if resp.status_code >= 500: tg_notifier.add_critical(url, f"ERR {resp.status_code}")
if resp.status_code == 404: tg_notifier.add_critical(url, "404")
if s_crit > 0: tg_notifier.add_schema(url, s_crit)
db_queue.put({'page': {'url': url, 'source': source, 'status': resp.status_code, 'time': total_t, 'ttfb': ttfb, 'access': 'Allowed', 'idx': idx, 's_crit': s_crit, 's_warn': s_warn, 'images_no_alt': no_alt, 'images_no_webp': no_webp, 'title': title, 'meta_desc': meta_desc, 'canonical': canonical, 'lang': lang, 'ts': datetime.now().isoformat()}, 'schemas': schemas, 'images': images_data})
with stats_lock:
crawled_count += 1
total_response_time += total_t
if resp.status_code != 200: error_count += 1
if resp.status_code == 200 and 'text/html' in resp.headers.get('Content-Type', ''):
for link in soup.find_all('a', href=True):
full = urljoin(url, link['href'])
parsed = urlparse(full)
if parsed.netloc == base_domain:
clean = parsed._replace(query='', fragment='').geturl()
with visited_lock:
if clean not in visited: visited.add(clean); url_queue.put((clean, url))
except Exception:
with stats_lock: error_count += 1
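# Worker threads drain the URL queue until stop_event is set.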
def worker():
while not stop_event.is_set():
try: u, s = url_queue.get(timeout=0.5); process_url(u, s); url_queue.task_done()
except queue.Empty: continue
threads = [threading.Thread(target=worker, daemon=True) for _ in range(max_threads)]
for t in threads: t.start()
try:
while not stop_event.is_set() and url_queue.unfinished_tasks > 0:
with stats_lock:
cc = crawled_count
err = error_count
avg = round(total_response_time / cc, 3) if cc > 0 else 0
q_size = url_queue.unfinished_tasks
print(f"\r[AUDYT] Skanowanie: {cc} | Błędy: {err} | Kolejka: {q_size} | Średni czas: {avg}s ", end="")
time.sleep(0.5)
except KeyboardInterrupt:
print("\n[!] Przerwano (Ctrl+C). Trwa bezpieczne zamykanie...")
stop_event.set()
while not url_queue.empty():
try: url_queue.get_nowait(); url_queue.task_done()
except queue.Empty: break
if not stop_event.is_set():
url_queue.join()
else:
# Wait a moment so the worker threads have time to see stop_event
time.sleep(1)
print("\n[*] Zapisywanie bazy danych, proszę czekać...")
db_queue.put(None)
db_thread.join()
# MULTILINGUAL AUDIT: flag products whose name is identical across language versions
cursor.execute("SELECT sku, lang, full_json FROM structured_data JOIN pages ON structured_data.page_id = pages.id WHERE sku IS NOT NULL AND sku != 'None'")
sku_map = {}
for sku, lang, fjson in cursor.fetchall():
if sku not in sku_map: sku_map[sku] = {}
data = json.loads(fjson)
sku_map[sku][lang] = {'name': data.get('name', ''), 'description': data.get('description', '')}
for sku, langs in sku_map.items():
lang_list = list(langs.keys())
if len(lang_list) > 1:
for i in range(len(lang_list)):
for j in range(i + 1, len(lang_list)):
l1, l2 = lang_list[i], lang_list[j]
if langs[l1]['name'] == langs[l2]['name']:
cursor.execute('INSERT INTO translation_audit (sku, field, lang1, lang2, content) VALUES (?, ?, ?, ?, ?)', (sku, 'name', l1, l2, langs[l1]['name']))
tg_notifier.add_translation_issue(sku, l1, l2, 'name')
conn.commit(); conn.close()
# ON-SITE SEARCH TEST
search_count = -1
try:
print("\n[*] Przeprowadzanie testu wyszukiwarki (szukana fraza: karuzela)...")
search_url = "https://fluo.dog/szukaj?controller=search&s=karuzela"
resp_search = session.get(search_url, timeout=15)
if resp_search.status_code == 200:
soup_search = BeautifulSoup(resp_search.text, 'lxml')
products = soup_search.find_all('article', class_='product-miniature')
search_count = len(products)
if search_count == 0:
tg_notifier.add_critical(search_url, "TEST WYSZUKIWARKI: Brak wyników (0) dla 'karuzela'!")
print("[!] Test wyszukiwarki NIEPOWODZENIE: 0 wyników.")
else:
print(f"[*] Test wyszukiwarki OK: znaleziono {search_count} produktów.")
else:
tg_notifier.add_critical(search_url, f"TEST WYSZUKIWARKI: Błąd HTTP {resp_search.status_code}")
print(f"[!] Test wyszukiwarki BŁĄD HTTP: {resp_search.status_code}")
except Exception as e:
tg_notifier.add_critical("https://fluo.dog/szukaj?controller=search&s=karuzela", f"TEST WYSZUKIWARKI: Błąd połączenia")
print(f"[!] Test wyszukiwarki BŁĄD: {e}")
tg_notifier.send_final_report(start_url, crawled_count, error_count, db_file, search_results=search_count)
if __name__ == "__main__":
def load_config():
try:
# Force utf-8 to avoid Windows encoding problems
with open("config.json", "r", encoding="utf-8") as f:
raw_config = json.load(f)
# Strip whitespace from keys, just in case
return {k.strip(): v for k, v in raw_config.items()}
except Exception as e:
print(f"[!] Błąd wczytywania config.json: {e}")
return {}
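# Expected config.json shape (the values below are hypothetical placeholders):
# {
#     "telegram_token": "123456789:AAE...",
#     "telegram_chat_id_info": "-1001111111111",
#     "telegram_chat_id_errors": "-1002222222222"
# }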
parser = argparse.ArgumentParser(description="Crawler SEO - Podgląd błędów i audyt.")
parser.add_argument("--url", default="https://fluo.dog", help="Startowy URL")
parser.add_argument("--threads", type=int, default=10, help="Liczba wątków")
args = parser.parse_args()
# Make sure the scans directory exists
os.makedirs("scans", exist_ok=True)
config = load_config()
# Debug: print which keys Python actually sees in the config
available_keys = ", ".join(config.keys())
print(f"[*] Wczytane klucze z config: {available_keys}")
token = config.get("telegram_token")
id_info = config.get("telegram_chat_id_info")
id_err = config.get("telegram_chat_id_errors")
print(f"[*] Konfiguracja: INFO_ID={id_info}, ERRORS_ID={id_err}")
notifier = TelegramNotifier(token, id_info, id_err)
if notifier.enabled:
print("[*] Telegram powiadomienia: WŁĄCZONE")
notifier.send(f"🚀 <b>Rozpoczynam audyt SEO</b> dla: {html.escape(args.url)}", target='info')
else:
print("[!] Telegram powiadomienia: WYŁĄCZONE (sprawdź czy klucze w config.json są poprawne)")
db_name = f"scans/crawler_v18_{datetime.now().strftime('%Y%m%d_%H%M%S')}.db"
crawler(args.url, db_name, args.threads, notifier)
+575
@@ -0,0 +1,575 @@
import sqlite3
import json
import glob
import os
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import HTMLResponse, StreamingResponse
from typing import List, Optional
import io
import csv
app = FastAPI(title="Crawler SEO Dashboard API")
def get_db_conn(db_name: str):
# If db_name does not already include the scans/ directory, prepend it
if not db_name.startswith("scans/"):
db_path = os.path.join("scans", db_name)
else:
db_path = db_name
if not os.path.exists(db_path):
raise HTTPException(status_code=404, detail=f"Baza danych nie istnieje: {db_path}")
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
return conn
@app.get("/", response_class=HTMLResponse)
def get_dashboard():
return """
<!DOCTYPE html>
<html lang="pl">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Crawler SEO Dashboard</title>
<script src="https://unpkg.com/react@18/umd/react.production.min.js"></script>
<script src="https://unpkg.com/react-dom@18/umd/react-dom.production.min.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://unpkg.com/lucide@0.321.0/dist/umd/lucide.min.js"></script>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap" rel="stylesheet">
<style>
body { font-family: 'Inter', sans-serif; background: #0f172a; color: #f1f5f9; margin: 0; }
.glass { background: rgba(30, 41, 59, 0.7); backdrop-filter: blur(12px); border: 1px solid rgba(255,255,255,0.1); }
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: #0f172a; }
::-webkit-scrollbar-thumb { background: #334155; border-radius: 4px; }
.animate-fade-in { animation: fadeIn 0.4s ease-out; }
@keyframes fadeIn { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } }
</style>
</head>
<body>
<div id="root"></div>
<script type="text/babel">
const { useState, useEffect } = React;
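// Field requirements per schema type: 'req' = required, 'warn' = recommended, 'opt' = optional.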
const SCHEMA_DEFS = {
'Product': [
{ field: 'name', label: 'Nazwa produktu', status: 'req' },
{ field: 'image', label: 'Zdjęcie', status: 'req' },
{ field: 'offers.price', label: 'Cena', status: 'req' },
{ field: 'offers.priceCurrency', label: 'Waluta', status: 'req' },
{ field: 'description', label: 'Opis', status: 'opt' },
{ field: 'sku', label: 'SKU', status: 'warn' },
{ field: 'brand.name', label: 'Marka', status: 'warn' },
{ field: 'aggregateRating.ratingValue', label: 'Ocena', status: 'opt' },
{ field: 'offers.availability', label: 'Dostępność', status: 'warn' }
],
'BreadcrumbList': [
{ field: 'itemListElement', label: 'Elementy listy', status: 'req' }
]
};
function App() {
const [dbs, setDbs] = useState([]);
const [selectedDb, setSelectedDb] = useState('');
const [stats, setStats] = useState(null);
const [pages, setPages] = useState([]);
const [loading, setLoading] = useState(false);
const [filter, setFilter] = useState('all');
const [selectedPage, setSelectedPage] = useState(null);
const [analysisData, setAnalysisData] = useState({ schemas: [], images: [] });
const [activeTab, setActiveTab] = useState('schema');
const [mainTab, setMainTab] = useState('pages');
const [translations, setTranslations] = useState({ langs: [], data: [] });
const [sortConfig, setSortConfig] = useState({ key: 'id', direction: 'desc' });
useEffect(() => {
fetch('/api/list-dbs').then(res => res.json()).then(data => {
setDbs(data);
if (data.length > 0) setSelectedDb(data[0]);
});
}, []);
useEffect(() => {
if (selectedDb) {
setLoading(true);
const endpoints = mainTab === 'pages'
? [fetch(`/api/stats?db=${selectedDb}`), fetch(`/api/pages?db=${selectedDb}&status_type=${filter}`)]
: [fetch(`/api/stats?db=${selectedDb}`), fetch(`/api/translations?db=${selectedDb}`)];
Promise.all(endpoints).then(async ([resStats, resData]) => {
const s = await resStats.json();
const d = await resData.json();
setStats(s);
if (mainTab === 'pages') setPages(d);
else setTranslations(d);
setLoading(false);
}).catch(() => setLoading(false));
}
}, [selectedDb, filter, mainTab]);
useEffect(() => {
if (window.lucide) window.lucide.createIcons();
}, [pages, stats, selectedPage, sortConfig, loading, activeTab, mainTab, translations]);
const requestSort = (key) => {
let direction = 'asc';
if (sortConfig.key === key && sortConfig.direction === 'asc') direction = 'desc';
setSortConfig({ key, direction });
};
const getSortedPages = () => {
if (!pages) return [];
const sortable = [...pages];
sortable.sort((a, b) => {
let valA = a[sortConfig.key] ?? 0;
let valB = b[sortConfig.key] ?? 0;
if (sortConfig.key === 'schema_status') {
valA = ((a.schema_critical || 0) * 100) + (a.schema_warnings || 0);
valB = ((b.schema_critical || 0) * 100) + (b.schema_warnings || 0);
}
if (valA < valB) return sortConfig.direction === 'asc' ? -1 : 1;
if (valA > valB) return sortConfig.direction === 'asc' ? 1 : -1;
return 0;
});
return sortable;
};
const viewAnalysis = (page) => {
fetch(`/api/analysis/${page.id}?db=${selectedDb}`).then(res => res.json()).then(data => {
setAnalysisData(data);
setSelectedPage(page);
setActiveTab('schema');
});
};
const getVal = (obj, path) => path.split('.').reduce((acc, part) => acc && acc[part], obj);
const sortedPages = getSortedPages();
return (
<div className="min-h-screen p-6 max-w-7xl mx-auto animate-fade-in">
<header className="flex flex-col md:flex-row justify-between items-start md:items-center mb-10 gap-4">
<div>
<h1 className="text-3xl font-extrabold bg-gradient-to-r from-blue-400 to-emerald-400 bg-clip-text text-transparent uppercase tracking-tight">
SEO Audit Dashboard
</h1>
<p className="text-slate-500 text-sm mt-1 font-medium">Monitoring techniczny i audyt wielojęzyczności</p>
</div>
<div className="flex items-center space-x-3 bg-slate-800/50 p-1.5 rounded-2xl border border-slate-700">
<span className="text-[10px] text-slate-500 font-black px-3 uppercase tracking-widest">Baza danych</span>
<select value={selectedDb} onChange={(e) => setSelectedDb(e.target.value)} className="bg-transparent text-slate-200 p-2 pr-8 rounded-xl outline-none text-xs font-bold cursor-pointer">
{dbs.map(db => <option key={db} value={db} className="bg-slate-900">{db}</option>)}
</select>
</div>
</header>
{stats && (
<div className="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-4 gap-4 mb-10">
{[
{ t: 'Strony', v: stats.total_pages, i: 'layers', c: 'blue' },
{ t: 'Błędy HTTP', v: stats.errors, i: 'alert-circle', c: 'red' },
{ t: 'Obiekty Schema', v: stats.schema_objects, i: 'code', c: 'emerald' },
{ t: 'Błędy Tłumaczeń', v: stats.translation_errors || 0, i: 'globe', c: 'amber' }
].map((s, idx) => (
<div key={idx} className="glass p-5 rounded-2xl border border-white/5 shadow-lg flex items-center space-x-4">
<div className={`p-2.5 rounded-xl bg-${s.c}-500/20 text-${s.c}-400`}><i data-lucide={s.i} className="w-5 h-5"></i></div>
<div><p className="text-[10px] text-slate-500 font-bold uppercase tracking-wider">{s.t}</p><p className="text-xl font-bold text-slate-100">{s.v}</p></div>
</div>
))}
</div>
)}
<div className="flex bg-slate-800/30 rounded-2xl p-1 mb-8 w-fit border border-white/5">
<button onClick={() => setMainTab('pages')} className={`px-6 py-2 rounded-xl text-xs font-bold transition-all ${mainTab==='pages' ? 'bg-blue-600 text-white shadow-lg' : 'text-slate-500 hover:text-slate-300'}`}>AUDYT TECHNICZNY</button>
<button onClick={() => setMainTab('translations')} className={`px-6 py-2 rounded-xl text-xs font-bold transition-all ${mainTab==='translations' ? 'bg-blue-600 text-white shadow-lg' : 'text-slate-500 hover:text-slate-300'}`}>AUDYT TŁUMACZEŃ</button>
</div>
{mainTab === 'pages' ? (
<div>
<div className="flex flex-wrap items-center justify-between gap-4 mb-6">
<div className="flex flex-wrap gap-2">
{[{id:'all',l:'Wszystkie'},{id:'error',l:'Błędy'},{id:'slow',l:'Wolne'},{id:'images',l:'Obrazy'}].map(f => (
<button key={f.id} onClick={() => setFilter(f.id)} className={`px-5 py-1.5 rounded-xl text-xs font-bold transition-all ${filter===f.id ? 'bg-blue-600 text-white shadow-lg' : 'glass text-slate-500 hover:text-slate-300'}`}>
{f.l.toUpperCase()}
</button>
))}
</div>
<a href={`/api/export-csv?db=${selectedDb}&status_type=${filter}`} download className="bg-emerald-600/20 hover:bg-emerald-600 text-emerald-400 hover:text-white px-5 py-1.5 rounded-xl text-xs font-bold transition-all flex items-center space-x-2 border border-emerald-600/30">
<i data-lucide="download" className="w-4 h-4"></i><span>EKSPORTUJ CSV</span>
</a>
</div>
<div className="glass rounded-3xl border border-white/5 shadow-2xl">
<div className="overflow-auto max-h-[75vh]">
<table className="w-full text-left border-collapse">
<thead className="bg-slate-900 sticky top-0 z-10 shadow-md">
<tr className="text-[10px] font-black text-slate-400 uppercase tracking-[0.15em]">
{[{k:'status',l:'Status'},{k:'lang',l:'Język'},{k:'url',l:'URL'},{k:'total_time',l:'Czas'},{k:'schema_status',l:'Schema'}].map(col => (
<th key={col.k} onClick={()=>requestSort(col.k)} className="p-4 cursor-pointer hover:bg-white/5 transition">
<div className="flex items-center space-x-1"><span>{col.l}</span>{sortConfig.key === col.k && <i data-lucide={sortConfig.direction==='asc'?'chevron-up':'chevron-down'} className="w-3 h-3 text-blue-400"></i>}</div>
</th>
))}
<th className="p-4 text-right">Akcje</th>
</tr>
</thead>
<tbody className="text-sm">
{loading ? (
<tr><td colSpan="6" className="p-20 text-center text-slate-600 font-bold animate-pulse uppercase tracking-[0.2em]">Pobieranie danych...</td></tr>
) : sortedPages.map(page => (
<tr key={page.id} className="border-t border-white/5 hover:bg-white/[0.02] transition-colors group">
<td className="p-4"><span className={`px-2 py-0.5 rounded text-[10px] font-black ${page.status===200 ? 'bg-emerald-500/10 text-emerald-400' : 'bg-red-500/10 text-red-400'}`}>{page.status || 'ERROR'}</span></td>
<td className="p-4 font-black text-[10px] text-slate-500 uppercase">{page.lang || '??'}</td>
<td className="p-4 max-w-sm"><div className="truncate font-medium"><a href={page.url} target="_blank" rel="noopener noreferrer" className="text-blue-400 hover:text-blue-300 transition-colors underline decoration-blue-400/30 hover:decoration-blue-400 underline-offset-4">{page.url}</a></div><div className="text-[10px] text-slate-500 mt-0.5 truncate italic">Źródło: {page.source_url ? <a href={page.source_url} target="_blank" rel="noopener noreferrer" className="hover:text-slate-300 transition-colors">{page.source_url}</a> : 'Bezpośrednie'}</div></td>
<td className="p-4 text-slate-400 tabular-nums">{page.total_time?.toFixed(3)}s</td>
<td className="p-4">
{page.schema_critical > 0 ? <span className="text-red-500 text-[10px] font-black uppercase">Krytyczny</span> :
page.schema_warnings > 0 ? <span className="text-amber-500 text-[10px] font-black uppercase">Ostrzeżenia</span> : <span className="text-emerald-500 text-[10px] font-black uppercase">OK</span>}
</td>
<td className="p-4 text-right"><button onClick={() => viewAnalysis(page)} className="bg-blue-500/10 hover:bg-blue-600 text-blue-400 hover:text-white px-3 py-1.5 rounded-lg text-[10px] font-black transition-all uppercase">Analiza</button></td>
</tr>
))}
</tbody>
</table>
</div>
</div>
</div>
) : (
<div className="glass rounded-3xl border border-white/5 shadow-2xl animate-fade-in">
<div className="overflow-auto max-h-[75vh]">
<table className="w-full text-left border-collapse">
<thead className="bg-slate-900 text-[10px] font-black text-slate-400 uppercase tracking-widest sticky top-0 z-10 shadow-md">
<tr>
<th className="p-5 w-40">SKU</th>
<th className="p-5 w-40">POLE</th>
{translations?.langs?.map(l => <th key={l} className="p-5 text-center">{l.toUpperCase()}</th>)}
</tr>
</thead>
<tbody className="text-sm">
{loading ? (
<tr><td colSpan={2 + (translations?.langs?.length || 0)} className="p-20 text-center text-slate-600 font-bold animate-pulse uppercase tracking-[0.2em]">Analiza tłumaczeń...</td></tr>
) : translations?.data?.length > 0 ? translations.data.map((t, idx) => {
const isFirstOfSku = idx === 0 || translations.data[idx-1].sku !== t.sku;
return (
<tr key={idx} className={`hover:bg-white/[0.02] transition-colors ${isFirstOfSku ? 'border-t-2 border-t-white/10' : 'border-t border-white/5 border-dashed'}`}>
<td className="p-5 font-bold tabular-nums">
{isFirstOfSku ? <a href={t.url} target="_blank" rel="noopener noreferrer" className="text-blue-400 hover:text-blue-300 transition-colors underline decoration-blue-400/30 hover:decoration-blue-400 underline-offset-4">{t.sku}</a> : null}
</td>
<td className="p-5 uppercase text-[10px] font-black text-slate-400">{t.field}</td>
{translations.langs.map(l => (
<td key={l} className="p-5 text-center">
{t[l] === 'V' ? <i data-lucide="check-circle" className="w-4 h-4 text-emerald-500/70 mx-auto"></i> : <i data-lucide="x-circle" className="w-5 h-5 text-red-500 mx-auto drop-shadow-md"></i>}
</td>
))}
</tr>
);
}) : (
<tr><td colSpan={10} className="p-20 text-center text-slate-600 font-bold uppercase tracking-widest">Brak danych o tłumaczeniach.</td></tr>
)}
</tbody>
</table>
</div>
</div>
)}
{selectedPage && (
<div className="fixed inset-0 bg-slate-950/90 backdrop-blur-md flex items-center justify-center p-4 z-50 animate-fade-in">
<div className="glass w-full max-w-6xl max-h-[90vh] overflow-hidden flex flex-col rounded-[2.5rem] border border-white/10 shadow-2xl">
<div className="p-6 border-b border-white/5 flex justify-between items-center bg-slate-900/50">
<div><p className="text-blue-400 text-[10px] font-black uppercase tracking-widest mb-1">Pełny audyt strony</p><h2 className="text-lg font-bold text-slate-100 truncate max-w-2xl">{selectedPage.url}</h2></div>
<button onClick={() => setSelectedPage(null)} className="p-2 hover:bg-white/10 rounded-xl transition-colors"><i data-lucide="x"></i></button>
</div>
<div className="flex bg-slate-900/50 border-b border-white/5">
<button onClick={()=>setActiveTab('schema')} className={`px-8 py-3 text-[10px] font-black uppercase tracking-widest transition-all ${activeTab==='schema'?'text-blue-400 border-b-2 border-blue-400 bg-blue-400/5':'text-slate-500 hover:text-slate-300'}`}>Schema.org</button>
<button onClick={()=>setActiveTab('metadata')} className={`px-8 py-3 text-[10px] font-black uppercase tracking-widest transition-all ${activeTab==='metadata'?'text-purple-400 border-b-2 border-purple-400 bg-purple-400/5':'text-slate-500 hover:text-slate-300'}`}>Metadane SEO</button>
<button onClick={()=>setActiveTab('images')} className={`px-8 py-3 text-[10px] font-black uppercase tracking-widest transition-all ${activeTab==='images'?'text-emerald-400 border-b-2 border-emerald-400 bg-emerald-400/5':'text-slate-500 hover:text-slate-300'}`}>Audyt Grafiki</button>
</div>
<div className="p-6 overflow-y-auto">
{activeTab === 'schema' ? (
<div>
<div className="mb-10">
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">I. Audyt pól Schema.org</h3>
{analysisData.schemas.length > 0 ? analysisData.schemas.map((s, i) => {
const fields = SCHEMA_DEFS[s.type] || [];
return (
<div key={i} className="mb-6 glass rounded-2xl overflow-hidden border border-white/5">
<div className="px-4 py-2 bg-white/5 flex items-center space-x-2 text-[10px] font-black uppercase text-slate-400"><i data-lucide="file-json" className="w-3 h-3"></i><span>{s.type}</span></div>
<table className="w-full text-[10px] border-collapse">
<tbody>
{fields.map((f, fi) => {
const val = getVal(s.data, f.field);
const ex = val !== undefined && val !== null && val !== '';
return (
<tr key={fi} className="border-t border-white/5">
<td className="p-2.5 text-slate-400 w-1/3">{f.label}</td>
<td className="p-2.5 text-slate-200 truncate max-w-[200px] font-medium">{ex ? (typeof val==='object'?'Obiekt':String(val)):''}</td>
<td className="p-2.5 text-right font-black uppercase">{ex ? <span className="text-emerald-500">OK</span> : f.status==='req' ? <span className="text-red-500">Wymagane</span> : f.status==='warn' ? <span className="text-amber-500">Zalecane</span> : <span className="text-slate-600">Opcjonalne</span>}</td>
</tr>
);
})}
</tbody>
</table>
</div>
);
}) : <p className="text-center py-10 text-slate-600 font-bold uppercase tracking-widest text-[10px]">Brak danych strukturalnych</p>}
</div>
<div>
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">II. Kod JSON-LD</h3>
{analysisData.schemas.map((s, i) => (
<pre key={i} className="bg-slate-950/80 p-5 rounded-xl overflow-x-auto text-[11px] text-blue-200/70 border border-white/5 font-mono mb-4 last:mb-0">{JSON.stringify(s.data, null, 2)}</pre>
))}
</div>
</div>
) : activeTab === 'metadata' ? (
<div>
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">Metadane SEO</h3>
<div className="glass rounded-2xl overflow-hidden border border-white/5 p-5 text-sm text-slate-300">
<div className="mb-4">
<span className="block text-[10px] font-black uppercase text-slate-500 mb-1">Tag Title</span>
<div className="font-bold text-slate-100 bg-slate-900/50 p-3 rounded-xl border border-white/5">{selectedPage.title || 'Brak tagu <title>'}</div>
</div>
<div className="mb-4">
<span className="block text-[10px] font-black uppercase text-slate-500 mb-1">Meta Description</span>
<div className="font-medium text-slate-300 bg-slate-900/50 p-3 rounded-xl border border-white/5">{selectedPage.meta_desc || 'Brak description'}</div>
</div>
<div>
<span className="block text-[10px] font-black uppercase text-slate-500 mb-1">Link Canonical</span>
<div className="font-mono text-xs text-blue-400 bg-slate-900/50 p-3 rounded-xl border border-white/5 truncate">{selectedPage.canonical || 'Brak canonical'}</div>
{selectedPage.canonical && selectedPage.canonical !== selectedPage.url && <div className="mt-2 text-[10px] text-amber-500 font-bold uppercase"><i data-lucide="alert-triangle" className="w-3 h-3 inline mr-1"></i>Canonical wskazuje na inną stronę!</div>}
{selectedPage.canonical && selectedPage.canonical === selectedPage.url && <div className="mt-2 text-[10px] text-emerald-500 font-bold uppercase"><i data-lucide="check-circle" className="w-3 h-3 inline mr-1"></i>Samoodwołujący (Zgodny z URL)</div>}
</div>
</div>
</div>
) : (
<div>
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">Audyt optymalizacji obrazów</h3>
<div className="glass rounded-2xl overflow-hidden border border-white/5">
<table className="w-full text-left border-collapse">
<thead className="bg-white/5 text-[9px] font-black text-slate-500 uppercase tracking-widest">
<tr><th className="p-3">Podgląd</th><th className="p-3">Atrybut ALT</th><th className="p-3">Format Modern</th><th className="p-3 text-right">Status</th></tr>
</thead>
<tbody className="text-[10px]">
{analysisData.images.length > 0 ? analysisData.images.map((img, i) => (
<tr key={i} className="border-t border-white/5 hover:bg-white/[0.02]">
<td className="p-3"><img src={img.img_url} className="w-10 h-10 object-cover rounded bg-slate-800" onError={(e)=>e.target.src='https://via.placeholder.com/40'} /></td>
<td className="p-3"><span className={img.alt==='[BRAK]'?'text-red-400 font-bold':'text-slate-300'}>{img.alt}</span></td>
<td className="p-3">{img.is_modern ? <span className="text-emerald-400 font-bold">TAK (Bezpośrednio)</span> : img.has_modern_source ? <span className="text-blue-400 font-bold">TAK (Picture/Srcset)</span> : <span className="text-amber-500 font-bold">NIE (Stary format)</span>}</td>
<td className="p-3 text-right">{img.alt!=='[BRAK]' && (img.is_modern || img.has_modern_source) ? <span className="text-emerald-500 font-black">ZOPTYMALIZOWANO</span> : <span className="text-amber-500 font-black text-[9px]">DO POPRAWY</span>}</td>
</tr>
)) : <tr><td colSpan="4" className="p-10 text-center text-slate-600 font-bold uppercase">Nie znaleziono obrazów</td></tr>}
</tbody>
</table>
</div>
</div>
)}
</div>
</div>
</div>
)}
</div>
);
}
const root = ReactDOM.createRoot(document.getElementById('root'));
root.render(<App />);
</script>
</body>
</html>
"""
@app.get("/api/list-dbs")
def list_dbs():
dbs = glob.glob("scans/*.db")
# Return bare file names for a cleaner view in the <select> dropdown
return sorted([os.path.basename(db) for db in dbs], reverse=True)
@app.get("/api/stats")
def get_stats(db: str):
conn = get_db_conn(db)
cursor = conn.cursor()
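# Headline stats for the dashboard cards; missing tables or columns in older databases are tolerated.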
stats = {"total_pages": 0, "errors": 0, "avg_time": 0, "schema_objects": 0, "img_issues": 0, "translation_errors": 0}
try:
stats["total_pages"] = cursor.execute("SELECT COUNT(*) FROM pages").fetchone()[0]
stats["errors"] = cursor.execute("SELECT COUNT(*) FROM pages WHERE status != 200 AND status != 0").fetchone()[0]
stats["avg_time"] = cursor.execute("SELECT AVG(total_time) FROM pages WHERE total_time > 0").fetchone()[0] or 0
stats["schema_objects"] = cursor.execute("SELECT COUNT(*) FROM structured_data").fetchone()[0]
except sqlite3.OperationalError: pass
try:
img_stats = cursor.execute("SELECT SUM(images_no_alt), SUM(images_no_webp) FROM pages").fetchone()
stats["img_issues"] = (img_stats[0] or 0) + (img_stats[1] or 0)
except sqlite3.OperationalError: pass
try:
stats["translation_errors"] = cursor.execute("SELECT COUNT(*) FROM translation_audit").fetchone()[0]
except sqlite3.OperationalError: pass
conn.close()
return stats
@app.get("/api/pages")
def get_pages(db: str, status_type: Optional[str] = "all"):
conn = get_db_conn(db)
cursor = conn.cursor()
try:
query = "SELECT * FROM pages"
try:
cursor.execute("SELECT images_no_alt FROM pages LIMIT 1")
has_img_cols = True
except sqlite3.OperationalError: has_img_cols = False
if status_type == "error": query += " WHERE status != 200 AND status != 0"
elif status_type == "noindex": query += " WHERE index_status LIKE 'Noindex%'"
elif status_type == "slow": query += " WHERE total_time > 1.5"
elif status_type == "images" and has_img_cols: query += " WHERE images_no_alt > 0 OR images_no_webp > 0"
query += " ORDER BY id DESC LIMIT 1000"
pages = cursor.execute(query).fetchall()
return [dict(p) for p in pages]
except Exception: return []
finally: conn.close()
@app.get("/api/translations")
def get_translations(db: str):
conn = get_db_conn(db)
cursor = conn.cursor()
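# Pivot Product schema data into an SKU x language matrix. Polish ('pl') is the
# reference; a field gets an 'X' for a language when its value is missing or
# identical to the Polish one, and only SKUs with at least one 'X' are returned.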
try:
try:
cursor.execute("SELECT title, meta_desc FROM pages LIMIT 1")
has_meta = True
except sqlite3.OperationalError: has_meta = False
if has_meta:
query = """
SELECT s.sku, p.lang, s.full_json, MIN(p.url) as url, MAX(p.title) as title, MAX(p.meta_desc) as meta_desc
FROM structured_data s
JOIN pages p ON s.page_id = p.id
WHERE s.sku IS NOT NULL AND s.sku != 'None' AND s.sku != '' AND s.schema_type LIKE '%Product%'
GROUP BY s.sku, p.lang
"""
else:
query = """
SELECT s.sku, p.lang, s.full_json, MIN(p.url) as url, '' as title, '' as meta_desc
FROM structured_data s
JOIN pages p ON s.page_id = p.id
WHERE s.sku IS NOT NULL AND s.sku != 'None' AND s.sku != '' AND s.schema_type LIKE '%Product%'
GROUP BY s.sku, p.lang
"""
rows = cursor.execute(query).fetchall()
sku_map = {}
langs_set = set()
for r in rows:
sku = str(r['sku']).strip()
lang = str(r['lang']).lower().strip()
if '-' in lang: lang = lang.split('-')[0]
langs_set.add(lang)
try: data = json.loads(r['full_json'])
except (json.JSONDecodeError, TypeError): continue
obj = {}
if isinstance(data, list): obj = next((item for item in data if 'Product' in str(item.get('@type', ''))), {})
else: obj = data if 'Product' in str(data.get('@type', '')) else {}
name = obj.get('name', '').strip()
desc = obj.get('description', '').strip()
title = (r['title'] or '').strip()
meta_desc = (r['meta_desc'] or '').strip()
slug = ''
if r['url']:
parts = r['url'].rstrip('/').split('/')
if parts: slug = parts[-1].split('?')[0].split('#')[0]
if sku not in sku_map: sku_map[sku] = {'langs': {}, 'url': r['url']}
sku_map[sku]['langs'][lang] = {
'nazwa': name, 'opis': desc,
'nazwa seo': title, 'opis seo': meta_desc, 'slug': slug
}
all_langs = sorted(list(langs_set))
if 'pl' in all_langs: all_langs.remove('pl')  # Polish is the reference language, so it gets no comparison column
results = []
fields = ['nazwa', 'opis', 'nazwa seo', 'opis seo', 'slug']
for sku, info in sku_map.items():
if 'pl' not in info['langs']: continue
pl_data = info['langs']['pl']
sku_has_errors = False
sku_rows = []
for field in fields:
pl_val = pl_data.get(field, '')
if not pl_val: continue
row = {'sku': sku, 'field': field, 'url': info['url']}
for lang in all_langs:
l_val = info['langs'].get(lang, {}).get(field, '')
if not l_val or l_val == pl_val:
row[lang] = 'X'
sku_has_errors = True
else:
row[lang] = 'V'
sku_rows.append(row)
if sku_has_errors:
results.extend(sku_rows)
return {"langs": all_langs, "data": results}
except Exception as e:
print(f"Error in translations: {e}")
return {"langs": [], "data": []}
finally: conn.close()
@app.get("/api/analysis/{page_id}")
def get_analysis(db: str, page_id: int):
conn = get_db_conn(db)
cursor = conn.cursor()
try:
schemas = cursor.execute("SELECT schema_type, full_json FROM structured_data WHERE page_id = ?", (page_id,)).fetchall()
try: images = cursor.execute("SELECT img_url, alt, is_modern, has_modern_source FROM images_audit WHERE page_id = ?", (page_id,)).fetchall()
except sqlite3.OperationalError: images = []
schema_list = []
for s in schemas:
try: schema_list.append({"type": s["schema_type"], "data": json.loads(s["full_json"])})
except (json.JSONDecodeError, TypeError): schema_list.append({"type": s["schema_type"], "data": s["full_json"]})
return {"schemas": schema_list, "images": [dict(img) for img in images]}
finally:
conn.close()
@app.get("/api/export-csv")
def export_csv(db: str, status_type: Optional[str] = "all"):
conn = get_db_conn(db)
cursor = conn.cursor()
try:
cursor.execute("SELECT images_no_alt FROM pages LIMIT 1")
has_img_cols = True
except sqlite3.OperationalError: has_img_cols = False
query = "SELECT * FROM pages"
if status_type == "error": query += " WHERE status != 200 AND status != 0"
elif status_type == "noindex": query += " WHERE index_status LIKE 'Noindex%'"
elif status_type == "slow": query += " WHERE total_time > 1.5"
elif status_type == "images" and has_img_cols: query += " WHERE images_no_alt > 0 OR images_no_webp > 0"
query += " ORDER BY id DESC"
pages = cursor.execute(query).fetchall()
conn.close()
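# Build the CSV in memory; the ';' delimiter and UTF-8 BOM (utf-8-sig) keep Excel happy with Polish characters.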
output = io.StringIO()
writer = csv.writer(output, delimiter=';')
if pages:
keys = list(dict(pages[0]).keys())
writer.writerow([k.upper() for k in keys])
for p in pages:
writer.writerow([dict(p).get(k, '') for k in keys])
else:
writer.writerow(['BRAK DANYCH'])
output.seek(0)
filename = f"raport_seo_{status_type}_{db.replace('.db', '')}.csv"
return StreamingResponse(io.BytesIO(output.getvalue().encode('utf-8-sig')), media_type="text/csv", headers={"Content-Disposition": f"attachment; filename={filename}"})
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=8000)
+4
@@ -0,0 +1,4 @@
@echo off
cd /d "e:\Lukasz\Projekty\Python\Crawler_XML"
python crawler.py --url https://fluo.dog
exit
+144
@@ -0,0 +1,144 @@
import sqlite3
import json
import glob
import os
import html
import requests
from urllib.parse import urlparse
def load_config():
try:
with open("config.json", "r", encoding="utf-8") as f:
raw_config = json.load(f)
return {k.strip(): v for k, v in raw_config.items()}
except Exception as e:
print(f"[!] Błąd wczytywania config.json: {e}")
return {}
def send_telegram(token, chat_id, message):
url = f"https://api.telegram.org/bot{token}/sendMessage"
try:
r = requests.post(url, json={"chat_id": chat_id, "text": message, "parse_mode": "HTML"}, timeout=15)
if r.status_code != 200:
print(f"[!] Błąd wysyłania (HTTP {r.status_code}): {r.text}")
else:
print("[*] Wiadomość wysłana pomyślnie.")
except Exception as e:
print(f"[!] Błąd połączenia z Telegramem: {e}")
def main():
config = load_config()
token = config.get("telegram_token")
chat_id_errors = config.get("telegram_chat_id_errors") or config.get("telegram_chat_id_info")
if not token or not chat_id_errors:
print("[!] Brak poprawnej konfiguracji Telegram (token lub chat_id_errors w config.json).")
return
# Find the newest crawl database in the scans subfolder
dbs = glob.glob("scans/crawler_v*.db")
if not dbs:
print("[!] Nie znaleziono żadnej bazy danych crawler_v*.db w podfolderze scans.")
return
dbs.sort(key=os.path.getmtime, reverse=True)
latest_db = dbs[0]
print(f"[*] Odczytuję dane z bazy: {latest_db}")
critical_errors = []
schema_errors = []
translation_issues = []
domain = "fluo.dog"
try:
conn = sqlite3.connect(latest_db)
cursor = conn.cursor()
# Extract the domain from the first record
cursor.execute("SELECT url FROM pages LIMIT 1")
row = cursor.fetchone()
if row:
domain = html.escape(urlparse(row[0]).netloc)
# Critical errors (404, 500+, blocked)
cursor.execute("SELECT url, status, google_access FROM pages WHERE status = 404 OR status >= 500 OR google_access = 'Blocked' LIMIT 15")
for url, status, access in cursor.fetchall():
if access == 'Blocked':
err_type = "ROBOTS.TXT BLOCK"
elif status == 404:
err_type = "404"
else:
err_type = f"ERR {status}"
critical_errors.append((url, err_type))
# Schema errors
cursor.execute("SELECT url, schema_critical FROM pages WHERE schema_critical > 0 LIMIT 10")
for url, count in cursor.fetchall():
schema_errors.append((url, count))
# Translation errors
try:
cursor.execute("SELECT sku, lang1, lang2, field FROM translation_audit LIMIT 10")
for sku, lang1, lang2, field in cursor.fetchall():
translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}")
except sqlite3.OperationalError:
pass  # The table may not exist yet for some reason
conn.close()
except Exception as e:
print(f"[!] Błąd odczytu bazy danych: {e}")
return
if not (critical_errors or schema_errors or translation_issues):
print("[*] Brak błędów do zaraportowania w ostatnim skanie.")
return
# Build the message with the AI agent prompt
err_msg = f"🚨 <b>BŁĘDY: {domain} (Wysłane ręcznie)</b>\n\n"
prompt_text = (
"Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. "
"Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. "
"WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. "
"Crawler nie sprawdzał bazy danych może być tak, że w bazie dane są w pełni poprawne, "
"a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n"
"Oto zestawienie błędów do przeanalizowania:\n\n"
)
if critical_errors:
err_msg += f"❌ <b>KRYTYCZNE:</b>\n"
prompt_text += "BŁĘDY KRYTYCZNE:\n"
for url, err in critical_errors:
safe_url = html.escape(url)
err_msg += f"{err}: {safe_url}\n"
prompt_text += f"- {err}: {url}\n"
err_msg += "\n"
prompt_text += "\n"
if schema_errors:
err_msg += f"🛠 <b>SCHEMA.ORG:</b>\n"
prompt_text += "BŁĘDY SCHEMA.ORG:\n"
for url, count in schema_errors:
safe_url = html.escape(url)
err_msg += f"• Brak {count} pól: {safe_url}\n"
prompt_text += f"- Brak {count} pól: {url}\n"
err_msg += "\n"
prompt_text += "\n"
if translation_issues:
err_msg += f"🌐 <b>TŁUMACZENIA:</b>\n"
prompt_text += "BŁĘDY TŁUMACZEŃ:\n"
for issue in translation_issues:
err_msg += f"{html.escape(issue)}\n"
prompt_text += f"- {issue}\n"
err_msg += "\n"
prompt_text += "\n"
err_msg += f"🤖 <b>Gotowy prompt dla Agenta AI:</b>\n"
err_msg += f"<pre><code class=\"language-text\">{html.escape(prompt_text.strip())}</code></pre>"
print("[*] Wysyłanie raportu na kanał Errors...")
send_telegram(token, chat_id_errors, err_msg)
if __name__ == "__main__":
main()