Initial commit - Crawler SEO (with AI Agent prompt)
This commit is contained in:
+27
@@ -0,0 +1,27 @@
|
|||||||
|
# Wyniki skanowania (duże pliki binarne)
|
||||||
|
scans/*.db
|
||||||
|
scans/*.tmp
|
||||||
|
*.db
|
||||||
|
*.db-journal
|
||||||
|
*.csv
|
||||||
|
|
||||||
|
# Konfiguracja (zawiera tokeny)
|
||||||
|
# Jeśli chcesz współdzielić config przez Git, zakomentuj poniższą linię
|
||||||
|
config.json
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
|
||||||
|
# Systemowe / Syncthing
|
||||||
|
.sync/
|
||||||
|
.stfolder/
|
||||||
|
.stignore
|
||||||
|
*~sync*
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
+443
@@ -0,0 +1,443 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import argparse
|
||||||
|
import glob
|
||||||
|
import html
|
||||||
|
from datetime import datetime
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
from urllib.robotparser import RobotFileParser
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Enable ANSI escape-code handling in the legacy Windows console so the
# colored/animated progress output renders correctly.
if os.name == 'nt': os.system('color')


# User-Agent string imitating Google's mobile (smartphone) crawler, so pages
# respond the same way they would to a real Googlebot visit.
GOOGLEBOT_UA = "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||||
|
|
||||||
|
class TelegramNotifier:
    """Aggregates crawl findings and pushes summary reports to Telegram.

    Two channels are used: an "info" channel for the final summary and an
    "errors" channel for detailed failure listings (including a ready-to-use
    prompt for an AI agent). When no dedicated errors channel is configured,
    everything goes to the info channel. All ``add_*`` methods are
    thread-safe, since the crawler calls them from worker threads.
    """

    def __init__(self, token, chat_id_info, chat_id_errors):
        self.token = token
        self.chat_id_info = chat_id_info
        # Fall back to the info channel when no errors channel is configured.
        self.chat_id_errors = chat_id_errors or chat_id_info
        # Notifications require both a bot token and an info chat id.
        self.enabled = bool(token and chat_id_info)
        self.critical_errors = []      # (url, message) pairs, capped at 30
        self.schema_errors = []        # (url, missing-field count), capped at 15
        self.translation_issues = []   # preformatted issue strings, capped at 15
        self.lock = threading.Lock()   # guards the three lists above

    def send(self, message, target='info'):
        """Send one HTML-formatted message to the chosen channel ('info'/'errors')."""
        if not self.enabled: return
        cid = self.chat_id_info if target == 'info' else self.chat_id_errors
        url = f"https://api.telegram.org/bot{self.token}/sendMessage"
        try:
            r = requests.post(url, json={"chat_id": cid, "text": message, "parse_mode": "HTML"}, timeout=15)
            if r.status_code != 200:
                print(f"\n[!] Telegram Error ({target}): {r.text}")
        except Exception as e:
            print(f"\n[!] Connection Error (Telegram): {e}")

    def add_critical(self, url, msg):
        """Record a critical issue (HTTP 5xx/404, robots block, failed search test)."""
        with self.lock:
            if len(self.critical_errors) < 30: self.critical_errors.append((url, msg))

    def add_schema(self, url, count):
        """Record a page whose Product schema is missing `count` required fields."""
        with self.lock:
            if len(self.schema_errors) < 15: self.schema_errors.append((url, count))

    def add_translation_issue(self, sku, lang1, lang2, field):
        """Record a product field that is identical across two language versions."""
        with self.lock:
            if len(self.translation_issues) < 15:
                self.translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}")

    def get_prev_404_count(self, current_db):
        """Return the 404 count from the most recent previous scan, or None.

        Looks for scan databases under scans/ sorted by modification time and
        reads the first one that is not the current database.
        """
        dbs = glob.glob("scans/crawler_v*.db")
        dbs.sort(key=os.path.getmtime, reverse=True)

        prev_db = None
        for d in dbs:
            if os.path.basename(d) != os.path.basename(current_db):
                prev_db = d
                break
        if not prev_db: return None
        try:
            conn = sqlite3.connect(prev_db)
            try:
                # Count pages that returned HTTP 404 in the previous scan.
                return conn.execute("SELECT COUNT(*) FROM pages WHERE status = 404").fetchone()[0]
            finally:
                # Always release the connection, even if the query fails.
                conn.close()
        except Exception:
            # Previous DB may be corrupt or have an older schema — treat as "no data".
            return None

    def send_final_report(self, start_url, total, errors, db_file, search_results=-1):
        """Compose and send the final audit summary (info) and error details (errors).

        `search_results` is the product count from the search-engine smoke test;
        -1 means the test itself failed.
        """
        if not self.enabled:
            print("\n[!] Powiadomienia Telegram są wyłączone (brak konfiguracji).")
            return

        # Pull 404 / schema / translation error counts from the fresh scan DB.
        current_404 = 0
        schema_errs = 0
        transl_errs = 0
        try:
            conn = sqlite3.connect(db_file)
            current_404 = conn.execute("SELECT COUNT(*) FROM pages WHERE status = 404").fetchone()[0]
            schema_errs = conn.execute("SELECT COUNT(*) FROM pages WHERE schema_critical > 0").fetchone()[0]
            transl_errs = conn.execute("SELECT COUNT(*) FROM translation_audit").fetchone()[0]
            conn.close()
        except Exception:
            # Best effort: fall back to zeros if the DB cannot be read.
            pass

        # Regression check: compare the 404 count against the previous scan.
        prev_404 = self.get_prev_404_count(db_file)
        regression_str = ""
        if prev_404 is not None:
            diff = current_404 - prev_404
            if diff > 0: regression_str = f" (<b>+{diff} NOWE!</b> ⚠️)"
            elif diff < 0: regression_str = f" ({diff} naprawione)"
            else: regression_str = " (bez zmian)"

        # 1. INFO REPORT — a compact scoreboard with pass/fail icons.
        domain = html.escape(urlparse(start_url).netloc)

        total_icon = "✅"
        http_icon = "✅" if errors == 0 else "❌"
        err404_icon = "✅" if current_404 == 0 else "❌"
        schema_icon = "✅" if schema_errs == 0 else "❌"
        transl_icon = "✅" if transl_errs == 0 else "❌"
        search_icon = "✅" if search_results > 0 else "❌"

        schema_text = "poprawne" if schema_errs == 0 else f"{schema_errs} błędów"
        transl_text = "poprawne" if transl_errs == 0 else f"{transl_errs} błędów"
        search_text = f"{search_results}" if search_results >= 0 else "BŁĄD"

        info_msg = f"🏁 <b>AUDYT ZAKOŃCZONY: {domain}</b>\n\n"
        info_msg += f"{total_icon} Przeskanowano: {total}\n"
        info_msg += f"{http_icon} Błędy HTTP: {errors}\n"
        info_msg += f"{err404_icon} Błędy 404: {current_404}{regression_str}\n"
        info_msg += f"{schema_icon} Dane strukturalne: {schema_text}\n"
        info_msg += f"{transl_icon} Tłumaczenia: {transl_text}\n"
        info_msg += f"{search_icon} Wyszukiwarka: {search_text}\n\n"

        if self.critical_errors or self.schema_errors or self.translation_issues:
            info_msg += f"🚨 Wykryto błędy. Szczegóły na kanale ERRORS."
        else:
            info_msg += f"✅ Brak krytycznych błędów."
        self.send(info_msg, target='info')

        # 2. ERRORS REPORT — detailed listings plus a ready-made AI-agent prompt.
        if self.critical_errors or self.schema_errors or self.translation_issues:
            err_msg = f"🚨 <b>BŁĘDY: {domain}</b>\n\n"

            prompt_text = (
                "Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. "
                "Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. "
                "WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. "
                "Crawler nie sprawdzał bazy danych – może być tak, że w bazie dane są w pełni poprawne, "
                "a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n"
                "Oto zestawienie błędów do przeanalizowania:\n\n"
            )

            if self.critical_errors:
                err_msg += f"❌ <b>KRYTYCZNE:</b>\n"
                prompt_text += "BŁĘDY KRYTYCZNE:\n"
                for url, err in self.critical_errors[:15]:
                    safe_url = html.escape(url)
                    err_msg += f"• {err}: {safe_url}\n"
                    prompt_text += f"- {err}: {url}\n"
                err_msg += "\n"
                prompt_text += "\n"

            if self.schema_errors:
                err_msg += f"🛠 <b>SCHEMA.ORG:</b>\n"
                prompt_text += "BŁĘDY SCHEMA.ORG:\n"
                for url, count in self.schema_errors[:10]:
                    safe_url = html.escape(url)
                    err_msg += f"• Brak {count} pól: {safe_url}\n"
                    prompt_text += f"- Brak {count} pól: {url}\n"
                err_msg += "\n"
                prompt_text += "\n"

            if self.translation_issues:
                err_msg += f"🌐 <b>TŁUMACZENIA:</b>\n"
                prompt_text += "BŁĘDY TŁUMACZEŃ:\n"
                for issue in self.translation_issues[:10]:
                    err_msg += f"• {html.escape(issue)}\n"
                    prompt_text += f"- {issue}\n"
                err_msg += "\n"
                prompt_text += "\n"

            err_msg += f"🤖 <b>Gotowy prompt dla Agenta AI:</b>\n"
            err_msg += f"<pre><code class=\"language-text\">{html.escape(prompt_text.strip())}</code></pre>"

            self.send(err_msg, target='errors')
||||||
|
def crawler(start_url, db_file, max_threads, tg_notifier):
    """Run a multi-threaded SEO audit crawl of one domain.

    Crawls every same-domain link reachable from `start_url`, records page
    metrics / structured data / image data in a SQLite database (`db_file`),
    runs a translation-consistency audit and a search-engine smoke test, and
    finally sends a Telegram summary via `tg_notifier`.

    Architecture: `max_threads` worker threads fetch pages from `url_queue`;
    a single dedicated writer thread drains `db_queue` so only one thread
    ever writes to SQLite.
    """
    parsed_start = urlparse(start_url)
    base_url = f"{parsed_start.scheme}://{parsed_start.netloc}"
    base_domain = parsed_start.netloc
    # check_same_thread=False: this connection is created here but used again
    # after the worker threads finish (translation audit below).
    conn = sqlite3.connect(db_file, check_same_thread=False)
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT UNIQUE, source_url TEXT, status INTEGER,
        total_time REAL, ttfb REAL, google_access TEXT, index_status TEXT,
        schema_critical INTEGER DEFAULT 0, schema_warnings INTEGER DEFAULT 0,
        images_no_alt INTEGER DEFAULT 0, images_no_webp INTEGER DEFAULT 0,
        title TEXT, meta_desc TEXT, canonical TEXT,
        lang TEXT, timestamp DATETIME)''')
    cursor.execute('''CREATE TABLE IF NOT EXISTS structured_data (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, schema_type TEXT, full_json TEXT, sku TEXT, FOREIGN KEY(page_id) REFERENCES pages(id))''')
    cursor.execute('''CREATE TABLE IF NOT EXISTS translation_audit (id INTEGER PRIMARY KEY AUTOINCREMENT, sku TEXT, field TEXT, lang1 TEXT, lang2 TEXT, content TEXT)''')
    cursor.execute('''CREATE TABLE IF NOT EXISTS images_audit (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, img_url TEXT, alt TEXT, is_modern INTEGER, has_modern_source INTEGER, FOREIGN KEY(page_id) REFERENCES pages(id))''')
    conn.commit()

    # Single-writer queue: crawl workers enqueue results, db_worker persists them.
    db_queue = queue.Queue()
    def db_worker():
        # Writer thread: owns its own SQLite connection for the whole crawl.
        db_conn = sqlite3.connect(db_file)
        db_cursor = db_conn.cursor()
        while True:
            item = db_queue.get()
            # None is the shutdown sentinel pushed after the crawl finishes.
            if item is None: break
            try:
                p = item['page']
                db_cursor.execute('''INSERT OR REPLACE INTO pages (url, source_url, status, total_time, ttfb, google_access, index_status, schema_critical, schema_warnings, images_no_alt, images_no_webp, title, meta_desc, canonical, lang, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', (p['url'], p['source'], p['status'], p['time'], p['ttfb'], p['access'], p['idx'], p['s_crit'], p['s_warn'], p.get('images_no_alt',0), p.get('images_no_webp',0), p.get('title',''), p.get('meta_desc',''), p.get('canonical',''), p['lang'], p['ts']))
                page_id = db_cursor.lastrowid
                for s in item['schemas']: db_cursor.execute('INSERT INTO structured_data (page_id, schema_type, full_json, sku) VALUES (?, ?, ?, ?)', (page_id, s['type'], s['json'], s.get('sku')))
                if 'images' in item:
                    for img in item['images']: db_cursor.execute('INSERT INTO images_audit (page_id, img_url, alt, is_modern, has_modern_source) VALUES (?, ?, ?, ?, ?)', (page_id, img['img_url'], img['alt'], img['is_modern'], img['has_modern_source']))
                db_conn.commit()
            # NOTE(review): bare except silently drops failed rows — consider
            # logging sqlite3.Error here.
            except: pass
            finally: db_queue.task_done()
        db_conn.close()

    db_thread = threading.Thread(target=db_worker)
    db_thread.start()
    # robots.txt is honored per-URL in process_url via rp.can_fetch.
    rp = RobotFileParser()
    try: rp.set_url(urljoin(base_url, "robots.txt")); rp.read()
    except: pass

    # Shared crawl state; `visited` deduplicates URLs, the two locks protect
    # `visited` and the counters respectively.
    visited, crawled_count, error_count = {start_url}, 0, 0
    total_response_time = 0.0
    visited_lock, stats_lock = threading.Lock(), threading.Lock()
    url_queue = queue.Queue()
    url_queue.put((start_url, "Start"))
    stop_event = threading.Event()
    session = requests.Session()
    session.headers.update({'User-Agent': GOOGLEBOT_UA})

    def analyze_schema(soup):
        """Extract JSON-LD blocks; count Products missing name/image/price."""
        scripts = soup.find_all('script', type='application/ld+json')
        results, crit, warn = [], 0, 0
        def get_val(obj, path):
            # Dotted-path lookup into nested dicts, e.g. 'offers.price'.
            curr = obj
            for p in path.split('.'):
                if isinstance(curr, dict) and p in curr: curr = curr[p]
                else: return None
            return curr
        for script in scripts:
            try:
                data = json.loads(script.string)
                # A script tag may hold a single object or a list of objects.
                objs = data if isinstance(data, list) else [data]
                for obj in objs:
                    if not isinstance(obj, dict): continue
                    sku = get_val(obj, 'sku') or get_val(obj, 'mpn')
                    if 'Product' in str(obj.get('@type', '')):
                        # Missing any Rich-Results-required field counts as critical.
                        if not get_val(obj, 'name') or not get_val(obj, 'image') or not get_val(obj, 'offers.price'): crit += 1
                    results.append({'type': str(obj.get('@type', 'Unknown')), 'json': json.dumps(obj, ensure_ascii=False), 'sku': str(sku) if sku else None})
            except: pass
        return results, crit, warn

    def analyze_images(soup, url):
        """Audit <img> tags for missing alt text and missing WebP/AVIF variants."""
        images_data = []
        no_alt, no_webp = 0, 0
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src') or ''
            # Skip inline data URIs and images with no resolvable source.
            if not src or src.startswith('data:image'): continue
            alt = img.get('alt', '').strip() if img.get('alt') is not None else ''
            alt_text = alt if alt else '[BRAK]'
            is_modern = src.lower().endswith(('.webp', '.avif', '.svg'))
            # A <picture> parent may provide a modern-format <source> even if
            # the <img> itself points at a legacy format.
            parent = img.find_parent('picture')
            has_modern_source = False
            if parent:
                for source in parent.find_all('source'):
                    srcs = source.get('srcset', '')
                    typ = source.get('type', '')
                    if 'webp' in srcs.lower() or 'avif' in srcs.lower() or 'webp' in typ or 'avif' in typ:
                        has_modern_source = True
                        break
            if not has_modern_source:
                # Also accept a modern variant directly in the img's srcset.
                srcset = img.get('srcset', '')
                if 'webp' in srcset.lower() or 'avif' in srcset.lower():
                    has_modern_source = True
            images_data.append({
                'img_url': urljoin(url, src), 'alt': alt_text,
                'is_modern': int(is_modern), 'has_modern_source': int(has_modern_source)
            })
            if alt_text == '[BRAK]': no_alt += 1
            if not is_modern and not has_modern_source: no_webp += 1
        return images_data, no_alt, no_webp

    def process_url(url, source):
        """Fetch one URL, analyze it, queue its DB record, discover new links."""
        nonlocal crawled_count, error_count, total_response_time
        if not rp.can_fetch("Googlebot", url):
            # Record robots-blocked pages too, so they show up in the report.
            tg_notifier.add_critical(url, "ROBOTS.TXT BLOCK")
            db_queue.put({'page': {'url': url, 'source': source, 'status': 0, 'time': 0, 'ttfb': 0, 'access': 'Blocked', 'idx': '-', 's_crit': 0, 's_warn': 0, 'images_no_alt': 0, 'images_no_webp': 0, 'title': '', 'meta_desc': '', 'canonical': '', 'lang': '?', 'ts': datetime.now().isoformat()}, 'schemas': [], 'images': []})
            return
        try:
            start_t = time.time()
            # stream=True so the elapsed time before .text approximates TTFB.
            resp = session.get(url, timeout=10, stream=True)
            ttfb = round(time.time() - start_t, 4)
            soup = BeautifulSoup(resp.text, 'lxml')
            total_t = round(time.time() - start_t, 4)
            lang = soup.find('html').get('lang', 'unknown') if soup.find('html') else 'unknown'
            schemas, s_crit, s_warn = analyze_schema(soup)
            # Indexability: header directive wins, then meta robots/googlebot.
            idx = "Indexable"
            if 'noindex' in resp.headers.get('X-Robots-Tag', '').lower(): idx = "Noindex"
            elif soup.find('meta', attrs={'name': ['robots', 'googlebot'], 'content': lambda x: x and 'noindex' in x.lower()}): idx = "Noindex"

            # Basic on-page SEO elements.
            title_tag = soup.find('title')
            title = title_tag.text.strip() if title_tag else ''
            meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
            meta_desc = meta_desc_tag.get('content', '').strip() if meta_desc_tag else ''
            canonical_tag = soup.find('link', rel='canonical')
            canonical = canonical_tag.get('href', '').strip() if canonical_tag else ''

            images_data, no_alt, no_webp = analyze_images(soup, url)

            if resp.status_code >= 500: tg_notifier.add_critical(url, f"ERR {resp.status_code}")
            if resp.status_code == 404: tg_notifier.add_critical(url, "404")
            if s_crit > 0: tg_notifier.add_schema(url, s_crit)

            db_queue.put({'page': {'url': url, 'source': source, 'status': resp.status_code, 'time': total_t, 'ttfb': ttfb, 'access': 'Allowed', 'idx': idx, 's_crit': s_crit, 's_warn': s_warn, 'images_no_alt': no_alt, 'images_no_webp': no_webp, 'title': title, 'meta_desc': meta_desc, 'canonical': canonical, 'lang': lang, 'ts': datetime.now().isoformat()}, 'schemas': schemas, 'images': images_data})
            with stats_lock:
                crawled_count += 1
                total_response_time += total_t
                if resp.status_code != 200: error_count += 1
            # Link discovery: only follow same-domain HTML pages, and strip
            # query/fragment so each canonical path is crawled once.
            if resp.status_code == 200 and 'text/html' in resp.headers.get('Content-Type', ''):
                for link in soup.find_all('a', href=True):
                    full = urljoin(url, link['href'])
                    parsed = urlparse(full)
                    if parsed.netloc == base_domain:
                        clean = parsed._replace(query='', fragment='').geturl()
                        with visited_lock:
                            if clean not in visited: visited.add(clean); url_queue.put((clean, url))
        # NOTE(review): bare except also hides bugs in the analysis code above,
        # not only network errors.
        except:
            with stats_lock: error_count += 1

    def worker():
        # Crawl worker loop: the timeout lets the thread re-check stop_event.
        while not stop_event.is_set():
            try: u, s = url_queue.get(timeout=0.5); process_url(u, s); url_queue.task_done()
            except queue.Empty: continue

    threads = [threading.Thread(target=worker, daemon=True) for _ in range(max_threads)]
    for t in threads: t.start()
    try:
        # Progress loop in the main thread; Ctrl+C triggers a clean shutdown.
        while not stop_event.is_set() and url_queue.unfinished_tasks > 0:
            with stats_lock:
                cc = crawled_count
                err = error_count
                avg = round(total_response_time / cc, 3) if cc > 0 else 0
                q_size = url_queue.unfinished_tasks
            print(f"\r[AUDYT] Skanowanie: {cc} | Błędy: {err} | Kolejka: {q_size} | Średni czas: {avg}s ", end="")
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\n[!] Przerwano (Ctrl+C). Trwa bezpieczne zamykanie...")
        stop_event.set()
        # Drain the queue so url_queue.join()/unfinished_tasks can settle.
        while not url_queue.empty():
            try: url_queue.get_nowait(); url_queue.task_done()
            except queue.Empty: break

    if not stop_event.is_set():
        url_queue.join()
    else:
        # Brief pause so worker threads have a chance to observe stop_event.
        time.sleep(1)

    print("\n[*] Zapisywanie bazy danych, proszę czekać...")
    # Sentinel stops the writer thread; join guarantees all rows are flushed.
    db_queue.put(None)
    db_thread.join()

    # MULTILINGUAL AUDIT: group Product schemas by SKU across languages and
    # flag names that are byte-identical in two different languages.
    cursor.execute('SELECT sku, lang, full_json FROM structured_data JOIN pages ON structured_data.page_id = pages.id WHERE sku IS NOT NULL AND sku != "None"')
    sku_map = {}
    for sku, lang, fjson in cursor.fetchall():
        if sku not in sku_map: sku_map[sku] = {}
        data = json.loads(fjson)
        sku_map[sku][lang] = {'name': data.get('name', ''), 'description': data.get('description', '')}
    for sku, langs in sku_map.items():
        lang_list = list(langs.keys())
        if len(lang_list) > 1:
            # Compare every pair of language versions for this SKU.
            for i in range(len(lang_list)):
                for j in range(i + 1, len(lang_list)):
                    l1, l2 = lang_list[i], lang_list[j]
                    if langs[l1]['name'] == langs[l2]['name']:
                        cursor.execute('INSERT INTO translation_audit (sku, field, lang1, lang2, content) VALUES (?, ?, ?, ?, ?)', (sku, 'name', l1, l2, langs[l1]['name']))
                        tg_notifier.add_translation_issue(sku, l1, l2, 'name')
    conn.commit(); conn.close()

    # SEARCH-ENGINE SMOKE TEST: the shop search must return products for a
    # known-good query. NOTE(review): the URL is hard-coded to fluo.dog and
    # ignores `start_url` — presumably intentional for this shop; verify.
    search_count = -1
    try:
        print("\n[*] Przeprowadzanie testu wyszukiwarki (szukana fraza: karuzela)...")
        search_url = "https://fluo.dog/szukaj?controller=search&s=karuzela"
        resp_search = session.get(search_url, timeout=15)
        if resp_search.status_code == 200:
            soup_search = BeautifulSoup(resp_search.text, 'lxml')
            products = soup_search.find_all('article', class_='product-miniature')
            search_count = len(products)
            if search_count == 0:
                tg_notifier.add_critical(search_url, "TEST WYSZUKIWARKI: Brak wyników (0) dla 'karuzela'!")
                print("[!] Test wyszukiwarki NIEPOWODZENIE: 0 wyników.")
            else:
                print(f"[*] Test wyszukiwarki OK: znaleziono {search_count} produktów.")
        else:
            tg_notifier.add_critical(search_url, f"TEST WYSZUKIWARKI: Błąd HTTP {resp_search.status_code}")
            print(f"[!] Test wyszukiwarki BŁĄD HTTP: {resp_search.status_code}")
    except Exception as e:
        tg_notifier.add_critical("https://fluo.dog/szukaj?controller=search&s=karuzela", f"TEST WYSZUKIWARKI: Błąd połączenia")
        print(f"[!] Test wyszukiwarki BŁĄD: {e}")

    tg_notifier.send_final_report(start_url, crawled_count, error_count, db_file, search_results=search_count)
|
||||||
|
if __name__ == "__main__":
    def load_config():
        """Load config.json as a dict; return {} on any failure."""
        try:
            # Force utf-8 to avoid Windows default-encoding issues.
            with open("config.json", "r", encoding="utf-8") as f:
                raw_config = json.load(f)
            # Strip whitespace from keys, just in case the file was hand-edited.
            return {k.strip(): v for k, v in raw_config.items()}
        except Exception as e:
            print(f"[!] Błąd wczytywania config.json: {e}")
            return {}

    parser = argparse.ArgumentParser(description="Crawler SEO - Podgląd błędów i audyt.")
    parser.add_argument("--url", default="https://fluo.dog", help="Startowy URL")
    parser.add_argument("--threads", type=int, default=10, help="Liczba wątków")
    args = parser.parse_args()

    # Make sure the scans/ output directory exists.
    if not os.path.exists("scans"):
        os.makedirs("scans")

    config = load_config()

    # Debug: print which keys Python actually sees in the config.
    available_keys = ", ".join(config.keys())
    print(f"[*] Wczytane klucze z config: {available_keys}")

    token = config.get("telegram_token")
    id_info = config.get("telegram_chat_id_info")
    id_err = config.get("telegram_chat_id_errors")

    print(f"[*] Konfiguracja: INFO_ID={id_info}, ERRORS_ID={id_err}")

    notifier = TelegramNotifier(token, id_info, id_err)
    if notifier.enabled:
        print("[*] Telegram powiadomienia: WŁĄCZONE")
        notifier.send(f"🚀 <b>Rozpoczynam audyt SEO</b> dla: {html.escape(args.url)}", target='info')
    else:
        print("[!] Telegram powiadomienia: WYŁĄCZONE (sprawdź czy klucze w config.json są poprawne)")

    # Each run writes to a fresh timestamped database under scans/.
    db_name = f"scans/crawler_v18_{datetime.now().strftime('%Y%m%d_%H%M%S')}.db"
    crawler(args.url, db_name, args.threads, notifier)
|
||||||
@@ -0,0 +1,575 @@
|
|||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
from fastapi import FastAPI, HTTPException, Query
|
||||||
|
from fastapi.responses import HTMLResponse, StreamingResponse
|
||||||
|
from typing import List, Optional
|
||||||
|
import io
|
||||||
|
import csv
|
||||||
|
|
||||||
|
# FastAPI application serving the dashboard UI and its JSON/CSV API.
app = FastAPI(title="Crawler SEO Dashboard API")
|
||||||
|
|
||||||
|
def get_db_conn(db_name: str):
    """Open a SQLite connection to a scan database under scans/.

    `db_name` comes straight from a client-supplied query parameter, so it is
    reduced to its base name before being joined into the path — this blocks
    path traversal (e.g. "../config.json"). Names already prefixed with
    "scans/" keep working, since basename() strips the directory part.

    Raises:
        HTTPException: 404 when the database file does not exist.
    """
    # Security fix: never trust directory components from the client.
    db_path = os.path.join("scans", os.path.basename(db_name))

    if not os.path.exists(db_path):
        raise HTTPException(status_code=404, detail=f"Baza danych nie istnieje: {db_path}")
    conn = sqlite3.connect(db_path)
    # Row factory lets endpoints address columns by name.
    conn.row_factory = sqlite3.Row
    return conn
|
||||||
|
|
||||||
|
@app.get("/", response_class=HTMLResponse)
|
||||||
|
def get_dashboard():
|
||||||
|
return """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="pl">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Crawler SEO Dashboard</title>
|
||||||
|
<script src="https://unpkg.com/react@18/umd/react.production.min.js"></script>
|
||||||
|
<script src="https://unpkg.com/react-dom@18/umd/react-dom.production.min.js"></script>
|
||||||
|
<script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
|
||||||
|
<script src="https://cdn.tailwindcss.com"></script>
|
||||||
|
<script src="https://unpkg.com/lucide@0.321.0/dist/umd/lucide.min.js"></script>
|
||||||
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap" rel="stylesheet">
|
||||||
|
<style>
|
||||||
|
body { font-family: 'Inter', sans-serif; background: #0f172a; color: #f1f5f9; margin: 0; }
|
||||||
|
.glass { background: rgba(30, 41, 59, 0.7); backdrop-filter: blur(12px); border: 1px solid rgba(255,255,255,0.1); }
|
||||||
|
::-webkit-scrollbar { width: 8px; }
|
||||||
|
::-webkit-scrollbar-track { background: #0f172a; }
|
||||||
|
::-webkit-scrollbar-thumb { background: #334155; border-radius: 4px; }
|
||||||
|
.animate-fade-in { animation: fadeIn 0.4s ease-out; }
|
||||||
|
@keyframes fadeIn { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="root"></div>
|
||||||
|
<script type="text/babel">
|
||||||
|
const { useState, useEffect } = React;
|
||||||
|
|
||||||
|
const SCHEMA_DEFS = {
|
||||||
|
'Product': [
|
||||||
|
{ field: 'name', label: 'Nazwa produktu', status: 'req' },
|
||||||
|
{ field: 'image', label: 'Zdjęcie', status: 'req' },
|
||||||
|
{ field: 'offers.price', label: 'Cena', status: 'req' },
|
||||||
|
{ field: 'offers.priceCurrency', label: 'Waluta', status: 'req' },
|
||||||
|
{ field: 'description', label: 'Opis', status: 'opt' },
|
||||||
|
{ field: 'sku', label: 'SKU', status: 'warn' },
|
||||||
|
{ field: 'brand.name', label: 'Marka', status: 'warn' },
|
||||||
|
{ field: 'aggregateRating.ratingValue', label: 'Ocena', status: 'opt' },
|
||||||
|
{ field: 'offers.availability', label: 'Dostępność', status: 'warn' }
|
||||||
|
],
|
||||||
|
'BreadcrumbList': [
|
||||||
|
{ field: 'itemListElement', label: 'Elementy listy', status: 'req' }
|
||||||
|
]
|
||||||
|
};
|
||||||
|
|
||||||
|
function App() {
|
||||||
|
const [dbs, setDbs] = useState([]);
|
||||||
|
const [selectedDb, setSelectedDb] = useState('');
|
||||||
|
const [stats, setStats] = useState(null);
|
||||||
|
const [pages, setPages] = useState([]);
|
||||||
|
const [loading, setLoading] = useState(false);
|
||||||
|
const [filter, setFilter] = useState('all');
|
||||||
|
const [selectedPage, setSelectedPage] = useState(null);
|
||||||
|
const [analysisData, setAnalysisData] = useState({ schemas: [], images: [] });
|
||||||
|
const [activeTab, setActiveTab] = useState('schema');
|
||||||
|
const [mainTab, setMainTab] = useState('pages');
|
||||||
|
const [translations, setTranslations] = useState({ langs: [], data: [] });
|
||||||
|
const [sortConfig, setSortConfig] = useState({ key: 'id', direction: 'desc' });
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
fetch('/api/list-dbs').then(res => res.json()).then(data => {
|
||||||
|
setDbs(data);
|
||||||
|
if (data.length > 0) setSelectedDb(data[0]);
|
||||||
|
});
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (selectedDb) {
|
||||||
|
setLoading(true);
|
||||||
|
const endpoints = mainTab === 'pages'
|
||||||
|
? [fetch(`/api/stats?db=${selectedDb}`), fetch(`/api/pages?db=${selectedDb}&status_type=${filter}`)]
|
||||||
|
: [fetch(`/api/stats?db=${selectedDb}`), fetch(`/api/translations?db=${selectedDb}`)];
|
||||||
|
|
||||||
|
Promise.all(endpoints).then(async ([resStats, resData]) => {
|
||||||
|
const s = await resStats.json();
|
||||||
|
const d = await resData.json();
|
||||||
|
setStats(s);
|
||||||
|
if (mainTab === 'pages') setPages(d);
|
||||||
|
else setTranslations(d);
|
||||||
|
setLoading(false);
|
||||||
|
}).catch(() => setLoading(false));
|
||||||
|
}
|
||||||
|
}, [selectedDb, filter, mainTab]);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (window.lucide) window.lucide.createIcons();
|
||||||
|
}, [pages, stats, selectedPage, sortConfig, loading, activeTab, mainTab, translations]);
|
||||||
|
|
||||||
|
const requestSort = (key) => {
|
||||||
|
let direction = 'asc';
|
||||||
|
if (sortConfig.key === key && sortConfig.direction === 'asc') direction = 'desc';
|
||||||
|
setSortConfig({ key, direction });
|
||||||
|
};
|
||||||
|
|
||||||
|
const getSortedPages = () => {
|
||||||
|
if (!pages) return [];
|
||||||
|
const sortable = [...pages];
|
||||||
|
sortable.sort((a, b) => {
|
||||||
|
let valA = a[sortConfig.key] ?? 0;
|
||||||
|
let valB = b[sortConfig.key] ?? 0;
|
||||||
|
if (sortConfig.key === 'schema_status') {
|
||||||
|
valA = ((a.schema_critical || 0) * 100) + (a.schema_warnings || 0);
|
||||||
|
valB = ((b.schema_critical || 0) * 100) + (b.schema_warnings || 0);
|
||||||
|
}
|
||||||
|
if (valA < valB) return sortConfig.direction === 'asc' ? -1 : 1;
|
||||||
|
if (valA > valB) return sortConfig.direction === 'asc' ? 1 : -1;
|
||||||
|
return 0;
|
||||||
|
});
|
||||||
|
return sortable;
|
||||||
|
};
|
||||||
|
|
||||||
|
const viewAnalysis = (page) => {
|
||||||
|
fetch(`/api/analysis/${page.id}?db=${selectedDb}`).then(res => res.json()).then(data => {
|
||||||
|
setAnalysisData(data);
|
||||||
|
setSelectedPage(page);
|
||||||
|
setActiveTab('schema');
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
const getVal = (obj, path) => path.split('.').reduce((acc, part) => acc && acc[part], obj);
|
||||||
|
const sortedPages = getSortedPages();
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="min-h-screen p-6 max-w-7xl mx-auto animate-fade-in">
|
||||||
|
<header className="flex flex-col md:flex-row justify-between items-start md:items-center mb-10 gap-4">
|
||||||
|
<div>
|
||||||
|
<h1 className="text-3xl font-extrabold bg-gradient-to-r from-blue-400 to-emerald-400 bg-clip-text text-transparent uppercase tracking-tight">
|
||||||
|
SEO Audit Dashboard
|
||||||
|
</h1>
|
||||||
|
<p className="text-slate-500 text-sm mt-1 font-medium">Monitoring techniczny i audyt wielojęzyczności</p>
|
||||||
|
</div>
|
||||||
|
<div className="flex items-center space-x-3 bg-slate-800/50 p-1.5 rounded-2xl border border-slate-700">
|
||||||
|
<span className="text-[10px] text-slate-500 font-black px-3 uppercase tracking-widest">Baza danych</span>
|
||||||
|
<select value={selectedDb} onChange={(e) => setSelectedDb(e.target.value)} className="bg-transparent text-slate-200 p-2 pr-8 rounded-xl outline-none text-xs font-bold cursor-pointer">
|
||||||
|
{dbs.map(db => <option key={db} value={db} className="bg-slate-900">{db}</option>)}
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
{stats && (
|
||||||
|
<div className="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-4 gap-4 mb-10">
|
||||||
|
{[
|
||||||
|
{ t: 'Strony', v: stats.total_pages, i: 'layers', c: 'blue' },
|
||||||
|
{ t: 'Błędy HTTP', v: stats.errors, i: 'alert-circle', c: 'red' },
|
||||||
|
{ t: 'Obiekty Schema', v: stats.schema_objects, i: 'code', c: 'emerald' },
|
||||||
|
{ t: 'Błędy Tłumaczeń', v: stats.translation_errors || 0, i: 'globe', c: 'amber' }
|
||||||
|
].map((s, idx) => (
|
||||||
|
<div key={idx} className="glass p-5 rounded-2xl border border-white/5 shadow-lg flex items-center space-x-4">
|
||||||
|
<div className={`p-2.5 rounded-xl bg-${s.c}-500/20 text-${s.c}-400`}><i data-lucide={s.i} className="w-5 h-5"></i></div>
|
||||||
|
<div><p className="text-[10px] text-slate-500 font-bold uppercase tracking-wider">{s.t}</p><p className="text-xl font-bold text-slate-100">{s.v}</p></div>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
<div className="flex bg-slate-800/30 rounded-2xl p-1 mb-8 w-fit border border-white/5">
|
||||||
|
<button onClick={() => setMainTab('pages')} className={`px-6 py-2 rounded-xl text-xs font-bold transition-all ${mainTab==='pages' ? 'bg-blue-600 text-white shadow-lg' : 'text-slate-500 hover:text-slate-300'}`}>AUDYT TECHNICZNY</button>
|
||||||
|
<button onClick={() => setMainTab('translations')} className={`px-6 py-2 rounded-xl text-xs font-bold transition-all ${mainTab==='translations' ? 'bg-blue-600 text-white shadow-lg' : 'text-slate-500 hover:text-slate-300'}`}>AUDYT TŁUMACZEŃ</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{mainTab === 'pages' ? (
|
||||||
|
<div>
|
||||||
|
<div className="flex flex-wrap items-center justify-between gap-4 mb-6">
|
||||||
|
<div className="flex flex-wrap gap-2">
|
||||||
|
{[{id:'all',l:'Wszystkie'},{id:'error',l:'Błędy'},{id:'slow',l:'Wolne'},{id:'images',l:'Obrazy'}].map(f => (
|
||||||
|
<button key={f.id} onClick={() => setFilter(f.id)} className={`px-5 py-1.5 rounded-xl text-xs font-bold transition-all ${filter===f.id ? 'bg-blue-600 text-white shadow-lg' : 'glass text-slate-500 hover:text-slate-300'}`}>
|
||||||
|
{f.l.toUpperCase()}
|
||||||
|
</button>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
<a href={`/api/export-csv?db=${selectedDb}&status_type=${filter}`} download className="bg-emerald-600/20 hover:bg-emerald-600 text-emerald-400 hover:text-white px-5 py-1.5 rounded-xl text-xs font-bold transition-all flex items-center space-x-2 border border-emerald-600/30">
|
||||||
|
<i data-lucide="download" className="w-4 h-4"></i><span>EKSPORTUJ CSV</span>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div className="glass rounded-3xl border border-white/5 shadow-2xl">
|
||||||
|
<div className="overflow-auto max-h-[75vh]">
|
||||||
|
<table className="w-full text-left border-collapse">
|
||||||
|
<thead className="bg-slate-900 sticky top-0 z-10 shadow-md">
|
||||||
|
<tr className="text-[10px] font-black text-slate-400 uppercase tracking-[0.15em]">
|
||||||
|
{[{k:'status',l:'Status'},{k:'lang',l:'Język'},{k:'url',l:'URL'},{k:'total_time',l:'Czas'},{k:'schema_status',l:'Schema'}].map(col => (
|
||||||
|
<th key={col.k} onClick={()=>requestSort(col.k)} className="p-4 cursor-pointer hover:bg-white/5 transition">
|
||||||
|
<div className="flex items-center space-x-1"><span>{col.l}</span>{sortConfig.key === col.k && <i data-lucide={sortConfig.direction==='asc'?'chevron-up':'chevron-down'} className="w-3 h-3 text-blue-400"></i>}</div>
|
||||||
|
</th>
|
||||||
|
))}
|
||||||
|
<th className="p-4 text-right">Akcje</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody className="text-sm">
|
||||||
|
{loading ? (
|
||||||
|
<tr><td colSpan="6" className="p-20 text-center text-slate-600 font-bold animate-pulse uppercase tracking-[0.2em]">Pobieranie danych...</td></tr>
|
||||||
|
) : sortedPages.map(page => (
|
||||||
|
<tr key={page.id} className="border-t border-white/5 hover:bg-white/[0.02] transition-colors group">
|
||||||
|
<td className="p-4"><span className={`px-2 py-0.5 rounded text-[10px] font-black ${page.status===200 ? 'bg-emerald-500/10 text-emerald-400' : 'bg-red-500/10 text-red-400'}`}>{page.status || 'ERROR'}</span></td>
|
||||||
|
<td className="p-4 font-black text-[10px] text-slate-500 uppercase">{page.lang || '??'}</td>
|
||||||
|
<td className="p-4 max-w-sm"><div className="truncate font-medium"><a href={page.url} target="_blank" rel="noopener noreferrer" className="text-blue-400 hover:text-blue-300 transition-colors underline decoration-blue-400/30 hover:decoration-blue-400 underline-offset-4">{page.url}</a></div><div className="text-[10px] text-slate-500 mt-0.5 truncate italic">Źródło: {page.source_url ? <a href={page.source_url} target="_blank" rel="noopener noreferrer" className="hover:text-slate-300 transition-colors">{page.source_url}</a> : 'Bezpośrednie'}</div></td>
|
||||||
|
<td className="p-4 text-slate-400 tabular-nums">{page.total_time?.toFixed(3)}s</td>
|
||||||
|
<td className="p-4">
|
||||||
|
{page.schema_critical > 0 ? <span className="text-red-500 text-[10px] font-black uppercase">Krytyczny</span> :
|
||||||
|
page.schema_warnings > 0 ? <span className="text-amber-500 text-[10px] font-black uppercase">Ostrzeżenia</span> : <span className="text-emerald-500 text-[10px] font-black uppercase">OK</span>}
|
||||||
|
</td>
|
||||||
|
<td className="p-4 text-right"><button onClick={() => viewAnalysis(page)} className="bg-blue-500/10 hover:bg-blue-600 text-blue-400 hover:text-white px-3 py-1.5 rounded-lg text-[10px] font-black transition-all uppercase">Analiza</button></td>
|
||||||
|
</tr>
|
||||||
|
))}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="glass rounded-3xl border border-white/5 shadow-2xl animate-fade-in">
|
||||||
|
<div className="overflow-auto max-h-[75vh]">
|
||||||
|
<table className="w-full text-left border-collapse">
|
||||||
|
<thead className="bg-slate-900 text-[10px] font-black text-slate-400 uppercase tracking-widest sticky top-0 z-10 shadow-md">
|
||||||
|
<tr>
|
||||||
|
<th className="p-5 w-40">SKU</th>
|
||||||
|
<th className="p-5 w-40">POLE</th>
|
||||||
|
{translations?.langs?.map(l => <th key={l} className="p-5 text-center">{l.toUpperCase()}</th>)}
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody className="text-sm">
|
||||||
|
{loading ? (
|
||||||
|
<tr><td colSpan={2 + (translations?.langs?.length || 0)} className="p-20 text-center text-slate-600 font-bold animate-pulse uppercase tracking-[0.2em]">Analiza tłumaczeń...</td></tr>
|
||||||
|
) : translations?.data?.length > 0 ? translations.data.map((t, idx) => {
|
||||||
|
const isFirstOfSku = idx === 0 || translations.data[idx-1].sku !== t.sku;
|
||||||
|
return (
|
||||||
|
<tr key={idx} className={`hover:bg-white/[0.02] transition-colors ${isFirstOfSku ? 'border-t-2 border-t-white/10' : 'border-t border-white/5 border-dashed'}`}>
|
||||||
|
<td className="p-5 font-bold tabular-nums">
|
||||||
|
{isFirstOfSku ? <a href={t.url} target="_blank" rel="noopener noreferrer" className="text-blue-400 hover:text-blue-300 transition-colors underline decoration-blue-400/30 hover:decoration-blue-400 underline-offset-4">{t.sku}</a> : null}
|
||||||
|
</td>
|
||||||
|
<td className="p-5 uppercase text-[10px] font-black text-slate-400">{t.field}</td>
|
||||||
|
{translations.langs.map(l => (
|
||||||
|
<td key={l} className="p-5 text-center">
|
||||||
|
{t[l] === 'V' ? <i data-lucide="check-circle" className="w-4 h-4 text-emerald-500/70 mx-auto"></i> : <i data-lucide="x-circle" className="w-5 h-5 text-red-500 mx-auto drop-shadow-md"></i>}
|
||||||
|
</td>
|
||||||
|
))}
|
||||||
|
</tr>
|
||||||
|
);
|
||||||
|
}) : (
|
||||||
|
<tr><td colSpan={10} className="p-20 text-center text-slate-600 font-bold uppercase tracking-widest">Brak danych o tłumaczeniach.</td></tr>
|
||||||
|
)}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{selectedPage && (
|
||||||
|
<div className="fixed inset-0 bg-slate-950/90 backdrop-blur-md flex items-center justify-center p-4 z-50 animate-fade-in">
|
||||||
|
<div className="glass w-full max-w-6xl max-h-[90vh] overflow-hidden flex flex-col rounded-[2.5rem] border border-white/10 shadow-2xl">
|
||||||
|
<div className="p-6 border-b border-white/5 flex justify-between items-center bg-slate-900/50">
|
||||||
|
<div><p className="text-blue-400 text-[10px] font-black uppercase tracking-widest mb-1">Pełny audyt strony</p><h2 className="text-lg font-bold text-slate-100 truncate max-w-2xl">{selectedPage.url}</h2></div>
|
||||||
|
<button onClick={() => setSelectedPage(null)} className="p-2 hover:bg-white/10 rounded-xl transition-colors"><i data-lucide="x"></i></button>
|
||||||
|
</div>
|
||||||
|
<div className="flex bg-slate-900/50 border-b border-white/5">
|
||||||
|
<button onClick={()=>setActiveTab('schema')} className={`px-8 py-3 text-[10px] font-black uppercase tracking-widest transition-all ${activeTab==='schema'?'text-blue-400 border-b-2 border-blue-400 bg-blue-400/5':'text-slate-500 hover:text-slate-300'}`}>Schema.org</button>
|
||||||
|
<button onClick={()=>setActiveTab('metadata')} className={`px-8 py-3 text-[10px] font-black uppercase tracking-widest transition-all ${activeTab==='metadata'?'text-purple-400 border-b-2 border-purple-400 bg-purple-400/5':'text-slate-500 hover:text-slate-300'}`}>Metadane SEO</button>
|
||||||
|
<button onClick={()=>setActiveTab('images')} className={`px-8 py-3 text-[10px] font-black uppercase tracking-widest transition-all ${activeTab==='images'?'text-emerald-400 border-b-2 border-emerald-400 bg-emerald-400/5':'text-slate-500 hover:text-slate-300'}`}>Audyt Grafiki</button>
|
||||||
|
</div>
|
||||||
|
<div className="p-6 overflow-y-auto">
|
||||||
|
{activeTab === 'schema' ? (
|
||||||
|
<div>
|
||||||
|
<div className="mb-10">
|
||||||
|
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">I. Audyt pól Schema.org</h3>
|
||||||
|
{analysisData.schemas.length > 0 ? analysisData.schemas.map((s, i) => {
|
||||||
|
const fields = SCHEMA_DEFS[s.type] || [];
|
||||||
|
return (
|
||||||
|
<div key={i} className="mb-6 glass rounded-2xl overflow-hidden border border-white/5">
|
||||||
|
<div className="px-4 py-2 bg-white/5 flex items-center space-x-2 text-[10px] font-black uppercase text-slate-400"><i data-lucide="file-json" className="w-3 h-3"></i><span>{s.type}</span></div>
|
||||||
|
<table className="w-full text-[10px] border-collapse">
|
||||||
|
<tbody>
|
||||||
|
{fields.map((f, fi) => {
|
||||||
|
const val = getVal(s.data, f.field);
|
||||||
|
const ex = val !== undefined && val !== null && val !== '';
|
||||||
|
return (
|
||||||
|
<tr key={fi} className="border-t border-white/5">
|
||||||
|
<td className="p-2.5 text-slate-400 w-1/3">{f.label}</td>
|
||||||
|
<td className="p-2.5 text-slate-200 truncate max-w-[200px] font-medium">{ex ? (typeof val==='object'?'Obiekt':String(val)):'—'}</td>
|
||||||
|
<td className="p-2.5 text-right font-black uppercase">{ex ? <span className="text-emerald-500">OK</span> : f.status==='req' ? <span className="text-red-500">Wymagane</span> : f.status==='warn' ? <span className="text-amber-500">Zalecane</span> : <span className="text-slate-600">Opcjonalne</span>}</td>
|
||||||
|
</tr>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}) : <p className="text-center py-10 text-slate-600 font-bold uppercase tracking-widest text-[10px]">Brak danych strukturalnych</p>}
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">II. Kod JSON-LD</h3>
|
||||||
|
{analysisData.schemas.map((s, i) => (
|
||||||
|
<pre key={i} className="bg-slate-950/80 p-5 rounded-xl overflow-x-auto text-[11px] text-blue-200/70 border border-white/5 font-mono mb-4 last:mb-0">{JSON.stringify(s.data, null, 2)}</pre>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : activeTab === 'metadata' ? (
|
||||||
|
<div>
|
||||||
|
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">Metadane SEO</h3>
|
||||||
|
<div className="glass rounded-2xl overflow-hidden border border-white/5 p-5 text-sm text-slate-300">
|
||||||
|
<div className="mb-4">
|
||||||
|
<span className="block text-[10px] font-black uppercase text-slate-500 mb-1">Tag Title</span>
|
||||||
|
<div className="font-bold text-slate-100 bg-slate-900/50 p-3 rounded-xl border border-white/5">{selectedPage.title || 'Brak tagu <title>'}</div>
|
||||||
|
</div>
|
||||||
|
<div className="mb-4">
|
||||||
|
<span className="block text-[10px] font-black uppercase text-slate-500 mb-1">Meta Description</span>
|
||||||
|
<div className="font-medium text-slate-300 bg-slate-900/50 p-3 rounded-xl border border-white/5">{selectedPage.meta_desc || 'Brak description'}</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<span className="block text-[10px] font-black uppercase text-slate-500 mb-1">Link Canonical</span>
|
||||||
|
<div className="font-mono text-xs text-blue-400 bg-slate-900/50 p-3 rounded-xl border border-white/5 truncate">{selectedPage.canonical || 'Brak canonical'}</div>
|
||||||
|
{selectedPage.canonical && selectedPage.canonical !== selectedPage.url && <div className="mt-2 text-[10px] text-amber-500 font-bold uppercase"><i data-lucide="alert-triangle" className="w-3 h-3 inline mr-1"></i>Canonical wskazuje na inną stronę!</div>}
|
||||||
|
{selectedPage.canonical && selectedPage.canonical === selectedPage.url && <div className="mt-2 text-[10px] text-emerald-500 font-bold uppercase"><i data-lucide="check-circle" className="w-3 h-3 inline mr-1"></i>Samoodwołujący (Zgodny z URL)</div>}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div>
|
||||||
|
<h3 className="text-slate-500 text-[10px] font-black mb-4 uppercase tracking-[0.2em] border-b border-white/5 pb-2">Audyt optymalizacji obrazów</h3>
|
||||||
|
<div className="glass rounded-2xl overflow-hidden border border-white/5">
|
||||||
|
<table className="w-full text-left border-collapse">
|
||||||
|
<thead className="bg-white/5 text-[9px] font-black text-slate-500 uppercase tracking-widest">
|
||||||
|
<tr><th className="p-3">Podgląd</th><th className="p-3">Atrybut ALT</th><th className="p-3">Format Modern</th><th className="p-3 text-right">Status</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody className="text-[10px]">
|
||||||
|
{analysisData.images.length > 0 ? analysisData.images.map((img, i) => (
|
||||||
|
<tr key={i} className="border-t border-white/5 hover:bg-white/[0.02]">
|
||||||
|
<td className="p-3"><img src={img.img_url} className="w-10 h-10 object-cover rounded bg-slate-800" onError={(e)=>e.target.src='https://via.placeholder.com/40'} /></td>
|
||||||
|
<td className="p-3"><span className={img.alt==='[BRAK]'?'text-red-400 font-bold':'text-slate-300'}>{img.alt}</span></td>
|
||||||
|
<td className="p-3">{img.is_modern ? <span className="text-emerald-400 font-bold">TAK (Bezpośrednio)</span> : img.has_modern_source ? <span className="text-blue-400 font-bold">TAK (Picture/Srcset)</span> : <span className="text-amber-500 font-bold">NIE (Stary format)</span>}</td>
|
||||||
|
<td className="p-3 text-right">{img.alt!=='[BRAK]' && (img.is_modern || img.has_modern_source) ? <span className="text-emerald-500 font-black">ZOPTYMALIZOWANO</span> : <span className="text-amber-500 font-black text-[9px]">DO POPRAWY</span>}</td>
|
||||||
|
</tr>
|
||||||
|
)) : <tr><td colSpan="4" className="p-10 text-center text-slate-600 font-bold uppercase">Nie znaleziono obrazów</td></tr>}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const root = ReactDOM.createRoot(document.getElementById('root'));
|
||||||
|
root.render(<App />);
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
@app.get("/api/list-dbs")
def list_dbs():
    """Return scan database filenames in reverse lexicographic order.

    Only the bare file names (no ``scans/`` prefix) are returned, which
    reads better in the dashboard's <select>.
    """
    names = [os.path.basename(path) for path in glob.glob("scans/*.db")]
    names.sort(reverse=True)
    return names
|
||||||
|
|
||||||
|
@app.get("/api/stats")
def get_stats(db: str):
    """Aggregate headline metrics for the dashboard stat cards.

    Each metric group is queried best-effort: older scan databases may
    be missing tables or columns, in which case the affected counters
    simply stay at 0 instead of failing the whole endpoint.

    Returns a dict with total_pages, errors, avg_time, schema_objects,
    img_issues and translation_errors (all numeric, defaulting to 0).
    """
    stats = {"total_pages": 0, "errors": 0, "avg_time": 0, "schema_objects": 0, "img_issues": 0, "translation_errors": 0}
    conn = get_db_conn(db)
    try:
        cursor = conn.cursor()
        try:
            stats["total_pages"] = cursor.execute("SELECT COUNT(*) FROM pages").fetchone()[0]
            # Excludes status 0 rows, consistent with the /api/pages "error" filter.
            stats["errors"] = cursor.execute("SELECT COUNT(*) FROM pages WHERE status != 200 AND status != 0").fetchone()[0]
            stats["avg_time"] = cursor.execute("SELECT AVG(total_time) FROM pages WHERE total_time > 0").fetchone()[0] or 0
            stats["schema_objects"] = cursor.execute("SELECT COUNT(*) FROM structured_data").fetchone()[0]
        except Exception:
            pass  # core tables missing -> leave the zeros
        try:
            img_stats = cursor.execute("SELECT SUM(images_no_alt), SUM(images_no_webp) FROM pages").fetchone()
            stats["img_issues"] = (img_stats[0] or 0) + (img_stats[1] or 0)
        except Exception:
            pass  # image-audit columns absent in older scans
        try:
            stats["translation_errors"] = cursor.execute("SELECT COUNT(*) FROM translation_audit").fetchone()[0]
        except Exception:
            pass  # translation audit table is optional
    finally:
        # Previously leaked the connection if a query raised before close.
        conn.close()
    return stats
|
||||||
|
|
||||||
|
@app.get("/api/pages")
def get_pages(db: str, status_type: Optional[str] = "all"):
    """List crawled pages, optionally filtered by issue type.

    status_type is one of 'all' | 'error' | 'noindex' | 'slow' | 'images'.
    Unknown values — and 'images' on scans that predate the image-audit
    columns — fall back to the unfiltered listing.  Returns at most 1000
    rows, newest first; any database error yields an empty list so the
    UI degrades gracefully instead of showing a 500.
    """
    conn = get_db_conn(db)
    try:
        cursor = conn.cursor()

        # Probe for the image-audit columns (absent in older scan DBs).
        try:
            cursor.execute("SELECT images_no_alt FROM pages LIMIT 1")
            has_img_cols = True
        except Exception:
            has_img_cols = False

        where_by_type = {
            "error": " WHERE status != 200 AND status != 0",
            "noindex": " WHERE index_status LIKE 'Noindex%'",
            "slow": " WHERE total_time > 1.5",
        }
        if has_img_cols:
            where_by_type["images"] = " WHERE images_no_alt > 0 OR images_no_webp > 0"

        query = "SELECT * FROM pages" + where_by_type.get(status_type, "")
        query += " ORDER BY id DESC LIMIT 1000"
        return [dict(p) for p in cursor.execute(query).fetchall()]
    except Exception:
        return []  # best-effort: schema mismatch -> empty listing, not a 500
    finally:
        conn.close()
|
||||||
|
|
||||||
|
@app.get("/api/translations")
def get_translations(db: str):
    """Cross-language translation audit of Product schema objects.

    Groups structured-data rows by SKU and language, then compares every
    non-Polish version against the Polish ('pl') baseline for five
    fields: name, description, SEO title, SEO description and URL slug.
    Only SKUs with at least one gap are reported.

    Returns {"langs": [...], "data": [row, ...]} where each row carries
    sku/field/url plus, per language, 'V' (translated) or 'X' (missing
    or byte-identical to the Polish value, i.e. untranslated).
    """
    conn = get_db_conn(db)
    try:
        cursor = conn.cursor()

        # Probe for the SEO meta columns (absent in older scan DBs).
        try:
            cursor.execute("SELECT title, meta_desc FROM pages LIMIT 1")
            has_meta = True
        except Exception:
            has_meta = False

        # Only the selected meta columns differ between the two variants,
        # so build a single query instead of two near-identical ones.
        meta_cols = (
            "MAX(p.title) as title, MAX(p.meta_desc) as meta_desc"
            if has_meta else
            "'' as title, '' as meta_desc"
        )
        query = f"""
            SELECT s.sku, p.lang, s.full_json, MIN(p.url) as url, {meta_cols}
            FROM structured_data s
            JOIN pages p ON s.page_id = p.id
            WHERE s.sku IS NOT NULL AND s.sku != 'None' AND s.sku != '' AND s.schema_type LIKE '%Product%'
            GROUP BY s.sku, p.lang
        """
        rows = cursor.execute(query).fetchall()

        sku_map = {}
        langs_set = set()

        for r in rows:
            sku = str(r['sku']).strip()
            lang = str(r['lang']).lower().strip()
            if '-' in lang:
                lang = lang.split('-')[0]  # normalize e.g. 'en-us' -> 'en'
            langs_set.add(lang)

            try:
                data = json.loads(r['full_json'])
            except (TypeError, ValueError):
                continue  # skip malformed JSON-LD payloads

            # Locate the Product object; the payload may be a bare object
            # or a list of schema objects.
            if isinstance(data, list):
                obj = next((item for item in data if 'Product' in str(item.get('@type', ''))), {})
            else:
                obj = data if 'Product' in str(data.get('@type', '')) else {}

            # str() guards against non-string JSON-LD values (lists, dicts,
            # None) which previously crashed the whole endpoint via .strip()
            # and made it return empty results.
            name = str(obj.get('name') or '').strip()
            desc = str(obj.get('description') or '').strip()
            title = (r['title'] or '').strip()
            meta_desc = (r['meta_desc'] or '').strip()

            # Last path segment of the URL, without query string / fragment.
            slug = ''
            if r['url']:
                parts = r['url'].rstrip('/').split('/')
                if parts:
                    slug = parts[-1].split('?')[0].split('#')[0]

            if sku not in sku_map:
                sku_map[sku] = {'langs': {}, 'url': r['url']}
            sku_map[sku]['langs'][lang] = {
                'nazwa': name, 'opis': desc,
                'nazwa seo': title, 'opis seo': meta_desc, 'slug': slug
            }

        # Polish is the reference language, so it never appears as a column.
        all_langs = sorted(langs_set)
        if 'pl' in all_langs:
            all_langs.remove('pl')

        results = []
        fields = ['nazwa', 'opis', 'nazwa seo', 'opis seo', 'slug']

        for sku, info in sku_map.items():
            if 'pl' not in info['langs']:
                continue  # no Polish baseline to compare against

            pl_data = info['langs']['pl']
            sku_has_errors = False
            sku_rows = []

            for field in fields:
                pl_val = pl_data.get(field, '')
                if not pl_val:
                    continue  # nothing to translate for this field

                row = {'sku': sku, 'field': field, 'url': info['url']}
                for lang in all_langs:
                    l_val = info['langs'].get(lang, {}).get(field, '')
                    # Missing, or identical to the Polish source -> untranslated.
                    if not l_val or l_val == pl_val:
                        row[lang] = 'X'
                        sku_has_errors = True
                    else:
                        row[lang] = 'V'
                sku_rows.append(row)

            # Only report SKUs that actually have translation gaps.
            if sku_has_errors:
                results.extend(sku_rows)

        return {"langs": all_langs, "data": results}
    except Exception as e:
        print(f"Error in translations: {e}")
        return {"langs": [], "data": []}
    finally:
        conn.close()
|
||||||
|
|
||||||
|
@app.get("/api/analysis/{page_id}")
def get_analysis(db: str, page_id: int):
    """Deep-dive payload for one page: parsed schemas and image audit.

    Returns {"schemas": [{"type", "data"}, ...], "images": [...]}.
    Schema payloads that are not valid JSON are passed through as raw
    text so the frontend can still display them; scans without an
    images_audit table yield an empty image list.
    """
    conn = get_db_conn(db)
    try:
        cursor = conn.cursor()
        schemas = cursor.execute("SELECT schema_type, full_json FROM structured_data WHERE page_id = ?", (page_id,)).fetchall()

        try:
            images = cursor.execute("SELECT img_url, alt, is_modern, has_modern_source FROM images_audit WHERE page_id = ?", (page_id,)).fetchall()
        except Exception:
            images = []  # older scans have no images_audit table

        schema_list = []
        for s in schemas:
            try:
                payload = json.loads(s["full_json"])
            except (TypeError, ValueError):
                payload = s["full_json"]  # keep the raw text if unparseable
            schema_list.append({"type": s["schema_type"], "data": payload})

        return {"schemas": schema_list, "images": [dict(img) for img in images]}
    finally:
        conn.close()
|
||||||
|
|
||||||
|
@app.get("/api/export-csv")
def export_csv(db: str, status_type: Optional[str] = "all"):
    """Stream the (optionally filtered) pages table as a ;-separated CSV.

    status_type uses the same filter set as /api/pages.  The download
    filename embeds the filter and the database name; an empty result
    still produces a one-line CSV ('BRAK DANYCH').
    """
    conn = get_db_conn(db)
    try:
        cursor = conn.cursor()

        # Probe for the image-audit columns (absent in older scan DBs).
        try:
            cursor.execute("SELECT images_no_alt FROM pages LIMIT 1")
            has_img_cols = True
        except Exception:
            has_img_cols = False

        where_by_type = {
            "error": " WHERE status != 200 AND status != 0",
            "noindex": " WHERE index_status LIKE 'Noindex%'",
            "slow": " WHERE total_time > 1.5",
        }
        if has_img_cols:
            where_by_type["images"] = " WHERE images_no_alt > 0 OR images_no_webp > 0"

        query = "SELECT * FROM pages" + where_by_type.get(status_type, "") + " ORDER BY id DESC"
        pages = cursor.execute(query).fetchall()
    finally:
        # Previously the connection leaked if the query raised before close.
        conn.close()

    output = io.StringIO()
    writer = csv.writer(output, delimiter=';')
    if pages:
        keys = list(dict(pages[0]).keys())
        writer.writerow([k.upper() for k in keys])
        for p in pages:
            record = dict(p)
            writer.writerow([record.get(k, '') for k in keys])
    else:
        writer.writerow(['BRAK DANYCH'])

    filename = f"raport_seo_{status_type}_{db.replace('.db', '')}.csv"
    # utf-8-sig adds a BOM so Excel detects the encoding correctly.
    # Fix: the computed filename is now actually used in the header
    # (it was previously built and then ignored).
    return StreamingResponse(
        io.BytesIO(output.getvalue().encode('utf-8-sig')),
        media_type="text/csv",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Local development entry point: serve the dashboard API on port 8000.
    import uvicorn

    uvicorn.run(app, host="127.0.0.1", port=8000)
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
@echo off
rem Launch the SEO crawler against the production site.
rem /d also switches the drive letter when the current drive differs.
cd /d "e:\Lukasz\Projekty\Python\Crawler_XML"
python crawler.py --url https://fluo.dog
exit
|
||||||
+144
@@ -0,0 +1,144 @@
|
|||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import html
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
def load_config():
    """Load config.json into a dict, trimming whitespace from keys.

    Best-effort: any read or parse error is reported to stdout and an
    empty dict is returned, so callers can rely on .get() safely.
    """
    try:
        with open("config.json", "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
        return {name.strip(): value for name, value in loaded.items()}
    except Exception as e:
        print(f"[!] Błąd wczytywania config.json: {e}")
        return {}
|
||||||
|
|
||||||
|
def send_telegram(token, chat_id, message):
    """Send one HTML-formatted message through the Telegram Bot API.

    Best-effort: both transport failures and non-200 responses are only
    reported to stdout; nothing is raised to the caller.
    """
    endpoint = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": message, "parse_mode": "HTML"}
    try:
        r = requests.post(endpoint, json=payload, timeout=15)
        if r.status_code == 200:
            print("[*] Wiadomość wysłana pomyślnie.")
        else:
            print(f"[!] Błąd wysyłania (HTTP {r.status_code}): {r.text}")
    except Exception as e:
        print(f"[!] Błąd połączenia z Telegramem: {e}")
|
||||||
|
|
||||||
|
def main():
    """Read the newest crawl database and push an error digest — including
    a ready-made prompt for an AI agent — to the Telegram errors channel.

    Standalone companion to the crawler: it only reads the SQLite results,
    it never writes to them. Exits early (with a printed reason) when the
    Telegram config or the scan database is missing or unreadable.
    """
    config = load_config()
    token = config.get("telegram_token")
    # Errors channel falls back to the info channel when not configured.
    chat_id_errors = config.get("telegram_chat_id_errors") or config.get("telegram_chat_id_info")

    if not token or not chat_id_errors:
        print("[!] Brak poprawnej konfiguracji Telegram (token lub chat_id_errors w config.json).")
        return

    # Find the newest crawl database in the scans/ subfolder
    dbs = glob.glob("scans/crawler_v*.db")
    if not dbs:
        print("[!] Nie znaleziono żadnej bazy danych crawler_v*.db w podfolderze scans.")
        return

    # Newest by modification time, not by filename version number.
    dbs.sort(key=os.path.getmtime, reverse=True)
    latest_db = dbs[0]
    print(f"[*] Odczytuję dane z bazy: {latest_db}")

    critical_errors = []
    schema_errors = []
    translation_issues = []
    domain = "fluo.dog"  # fallback, overwritten from the DB when rows exist

    try:
        conn = sqlite3.connect(latest_db)
        cursor = conn.cursor()

        # Derive the scanned domain from the first crawled record.
        # html.escape because the value is later embedded in an HTML message.
        cursor.execute("SELECT url FROM pages LIMIT 1")
        row = cursor.fetchone()
        if row:
            domain = html.escape(urlparse(row[0]).netloc)

        # Critical errors (404, 500+, blocked by robots.txt); capped at 15.
        cursor.execute("SELECT url, status, google_access FROM pages WHERE status = 404 OR status >= 500 OR google_access = 'Blocked' LIMIT 15")
        for url, status, access in cursor.fetchall():
            if access == 'Blocked':
                err_type = "ROBOTS.TXT BLOCK"
            elif status == 404:
                err_type = "404"
            else:
                err_type = f"ERR {status}"
            critical_errors.append((url, err_type))

        # Pages with missing critical schema.org fields; capped at 10.
        cursor.execute("SELECT url, schema_critical FROM pages WHERE schema_critical > 0 LIMIT 10")
        for url, count in cursor.fetchall():
            schema_errors.append((url, count))

        # Translation audit findings; capped at 10. The table is optional.
        try:
            cursor.execute("SELECT sku, lang1, lang2, field FROM translation_audit LIMIT 10")
            for sku, lang1, lang2, field in cursor.fetchall():
                translation_issues.append(f"SKU {sku}: {field} identyczny w {lang1} i {lang2}")
        except sqlite3.OperationalError:
            pass  # table may not exist yet for some reason

        conn.close()
    except Exception as e:
        print(f"[!] Błąd odczytu bazy danych: {e}")
        return

    if not (critical_errors or schema_errors or translation_issues):
        print("[*] Brak błędów do zaraportowania w ostatnim skanie.")
        return

    # Build the Telegram message together with the AI-agent prompt.
    # err_msg is HTML (values escaped); prompt_text is the plain-text
    # prompt that gets wrapped in a <pre><code> block at the end.
    err_msg = f"🚨 <b>BŁĘDY: {domain} (Wysłane ręcznie)</b>\n\n"

    prompt_text = (
        "Twoim zadaniem jest weryfikacja poniższych błędów na sklepie i przygotowanie planu naprawy. "
        "Znasz strukturę plików, masz dostęp do bazy danych sklepu oraz wiesz jak działają wszystkie mechanizmy cache. "
        "WAŻNE: Podane błędy zostały wykryte przez crawler, który analizował wyrenderowany kod HTML stron sklepu. "
        "Crawler nie sprawdzał bazy danych – może być tak, że w bazie dane są w pełni poprawne, "
        "a problem leży po stronie modułów (np. wstrzykujących dane strukturalne do źródła strony).\n"
        "Oto zestawienie błędów do przeanalizowania:\n\n"
    )

    if critical_errors:
        err_msg += f"❌ <b>KRYTYCZNE:</b>\n"
        prompt_text += "BŁĘDY KRYTYCZNE:\n"
        for url, err in critical_errors:
            safe_url = html.escape(url)
            err_msg += f"• {err}: {safe_url}\n"
            prompt_text += f"- {err}: {url}\n"
        err_msg += "\n"
        prompt_text += "\n"

    if schema_errors:
        err_msg += f"🛠 <b>SCHEMA.ORG:</b>\n"
        prompt_text += "BŁĘDY SCHEMA.ORG:\n"
        for url, count in schema_errors:
            safe_url = html.escape(url)
            err_msg += f"• Brak {count} pól: {safe_url}\n"
            prompt_text += f"- Brak {count} pól: {url}\n"
        err_msg += "\n"
        prompt_text += "\n"

    if translation_issues:
        err_msg += f"🌐 <b>TŁUMACZENIA:</b>\n"
        prompt_text += "BŁĘDY TŁUMACZEŃ:\n"
        for issue in translation_issues:
            err_msg += f"• {html.escape(issue)}\n"
            prompt_text += f"- {issue}\n"
        err_msg += "\n"
        prompt_text += "\n"

    # Escape the prompt so it renders verbatim inside the HTML code block.
    err_msg += f"🤖 <b>Gotowy prompt dla Agenta AI:</b>\n"
    err_msg += f"<pre><code class=\"language-text\">{html.escape(prompt_text.strip())}</code></pre>"

    print("[*] Wysyłanie raportu na kanał Errors...")
    send_telegram(token, chat_id_errors, err_msg)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user