# ---------------------------------------------------------------------------
# SETUP NOTES — not an instruction, just thinking out loud!
#
# Prerequisites:
#   * Python 3.10–3.11 (3.11.6)
#   * RTX driver
#   * CUDA Toolkit 11.8
#     (https://developer.nvidia.com/cuda-11-8-0-download-archive)
#   * Latest supported (2015–2022) Visual C++ Redistributable
#     (https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170)
#     -> download vc_redist.x64.exe
#   * MSYS2 (https://www.msys2.org/)
#   * Reboot the computer.
#
# In cmd:
#    1) pip install --upgrade pip
#       pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
#    2) pip install fastapi uvicorn[standard] pydantic==2.* chromadb sentence-transformers \
#         llama-cpp-python[cuda] python-docx openpyxl xlrd pillow pytesseract easyocr
#    3) pip install fastapi uvicorn chromadb sentence-transformers pillow easyocr \
#         python-docx openpyxl xlrd torch torchvision transformers llama-cpp-python[cuda]
#    4) pip install pandas
#    5) pip install fastapi uvicorn chromadb sentence-transformers transformers easyocr
#         python-docx openpyxl xlrd pillow llama-cpp-python[cuda] streamlit requests
#    6) (disabled in the notes) pip uninstall -y onnxruntime onnxruntime-gpu
#         onnxruntime-directml onnxruntime-training
#       pip install --upgrade --force-reinstall onnxruntime==1.17.3
#    7) pip install fastapi uvicorn[standard] pydantic==2.* chromadb sentence-transformers \
#         llama-cpp-python[cuda] python-docx openpyxl xlrd pillow pytesseract easyocr
#    8) pip install --upgrade pip setuptools wheel
#    9) pip install streamlit chromadb sentence-transformers transformers torch pandas
#         python-docx openpyxl easyocr PyPDF2 onnxruntime
#   10) pip install PyPDF2
#   11) pip install streamlit chromadb sentence-transformers transformers torch pandas
#         python-docx openpyxl easyocr PyPDF2 onnxruntime
#
# Create the folder C:\AI_RAG and put the file app_qwen_rag.py into it.
# Run the file from cmd:
#   cd c:\ai_rag
#   python -m streamlit run app_qwen_rag.py --server.fileWatcherType none --server.port 8502
# ---------------------------------------------------------------------------
# app_qwen_rag.py — v5.3 (long answers, OCR fallback for PDF, GPU/CPU
# embedder/reranker, MMR, keyword booster)
from __future__ import annotations

import gc
import io
import logging
import os
import re
import sys
import uuid
from pathlib import Path
from typing import Any, Dict, List, Tuple

# ====== ENV ======
# Set before numpy/streamlit/torch are imported: the BLAS/OpenMP thread limits
# are read by those libraries when they are first loaded.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

import streamlit as st
import numpy as np
from PIL import Image

# Pillow warns above this pixel count (and raises above twice this value).
Image.MAX_IMAGE_PIXELS = 20_000_000

APP_TITLE = "🧠 Qwen 2.5 RAG — v5.3"
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
DB_DIR = Path("chroma_db")
DB_DIR.mkdir(exist_ok=True)
LOG_DIR = Path("logs")
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / "app.log"
MAX_TEXT_CHARS = 200_000

DEFAULTS = dict(
    MAX_TOKENS=1024,  # large chunks for a full-fledged context
    OVERLAP_TOKENS=140,
    N_RESULTS=70,
    TOP_AFTER=18,
    SCORE_THRESHOLD=0.18,
    MAX_CTX_CHARS=26_000,
    MAX_NEW_TOKENS=1400,
    STYLE_GUIDE=(
        "Сформируй подробный структурированный ответ:\n"
        "• Краткий итог; • Подробности с цитатами; • Пошаговые действия; "
        "• Риски/ограничения; • Ссылки [filename • chunk]."
    ),
)

# ====== LOGGING ======
# The log is read back as UTF-8 by the log viewer at the bottom of the page and
# the messages contain Cyrillic/emoji, so the file must be written as UTF-8
# instead of the locale encoding.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    encoding="utf-8",
)
log = logging.getLogger("qwen_rag_v53")

# ====== UI ======
st.set_page_config(page_title="Qwen 2.5 RAG", page_icon="🧠", layout="wide")
st.title(APP_TITLE)
st.caption("Длинные ответы • OCR-fallback для PDF • Семантические чанки • MMR • CrossEncoder реранк • Chroma Persistent")

# ====== Torch/CUDA ======
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if torch.cuda.is_available():
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.set_float32_matmul_precision("high")
    except Exception:
        # Best effort: these are speed knobs only.
        pass
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

st.write(f"**PY:** {sys.executable} • **CWD:** {os.getcwd()}")
st.write(f"**Torch:** {torch.__version__} • **CUDA:** {torch.cuda.is_available()} • **GPU:** {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only'}")
log.info("🚀 start v5.3 | device=%s dtype=%s torch=%s", DEVICE, DTYPE, torch.__version__)

# ====== Session ======
_init = dict(
    db_ready=False, collection=None, embedder=None, emb_name=None, emb_dim=None,
    reranker=None, model_loaded=False, tokenizer=None, model=None,
    ocr_ready=False, ocr_gpu=False, _ocr=None,
    SUPER_DETAILED=True,
)
for k, v in _init.items():
    st.session_state.setdefault(k, v)
for k, v in DEFAULTS.items():
    st.session_state.setdefault(k, v)


# ====== Helpers ======
def _supported(p: Path) -> bool:
    """Return True when the file extension is one the parser has a branch for."""
    return p.suffix.lower() in (".txt", ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".jpg", ".jpeg", ".png")


def _db_ready() -> bool:
    """Return True when the collection and the embedder are both initialised."""
    ss = st.session_state
    return bool(ss.db_ready) and ss.collection is not None and ss.embedder is not None


def safe_trim(text: str) -> str:
    """Cut *text* to at most MAX_TEXT_CHARS characters (logs when it cuts)."""
    if not text:
        return ""
    if len(text) > MAX_TEXT_CHARS:
        log.warning("Текст обрезан до %d символов", MAX_TEXT_CHARS)
        return text[:MAX_TEXT_CHARS]
    return text


def ocr_preproc(pil: Image.Image) -> Image.Image:
    """Denoise and binarise an image before OCR.

    Returns the processed grayscale image, or the input unchanged when OpenCV
    is unavailable or the processing fails (best effort).
    """
    try:
        import cv2

        # Convert first: RGBA / palette / grayscale inputs would otherwise make
        # the colour conversion fail and silently skip the preprocessing.
        rgb = np.array(pil.convert("RGB"))
        gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
        gray = cv2.fastNlMeansDenoising(gray, None, 15, 7, 21)
        _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return Image.fromarray(bw)
    except Exception as e:
        log.debug("OCR preprocessing skipped: %s", e)
        return pil


def ensure_ocr() -> None:
    """Create the EasyOCR reader once and keep it in the session state."""
    if not st.session_state.ocr_ready:
        import easyocr

        st.session_state._ocr = easyocr.Reader(["ru", "en"], gpu=st.session_state.ocr_gpu, verbose=False)
        st.session_state.ocr_ready = True
        log.info("✅ EasyOCR готов (gpu=%s)", st.session_state.ocr_gpu)


def _ocr_read(pil: Image.Image) -> List[str]:
    """Run the session's EasyOCR reader on a PIL image; return text paragraphs.

    EasyOCR's ``readtext`` takes a file path/URL, raw bytes or a numpy array,
    so the PIL image is converted to an array first.
    """
    if pil.mode not in ("L", "RGB"):
        pil = pil.convert("RGB")
    return st.session_state._ocr.readtext(np.asarray(pil), detail=0, paragraph=True)


def pdf_ocr_fallback(data: bytes) -> str:
    """Render PDF pages and OCR them (used when PyPDF2 yields little or no text).

    Returns the recognised text, or "" on any failure (the error is logged).
    """
    try:
        import pypdfium2 as pdfium

        ensure_ocr()
        pdf = pdfium.PdfDocument(io.BytesIO(data))
        out = []
        for i in range(len(pdf)):
            page_img = pdf[i].render(scale=2).to_pil()
            lines = _ocr_read(ocr_preproc(page_img))
            if lines:
                out.append("\n".join(lines))
        txt = "\n\n".join(out).strip()
        if txt:
            log.info("PDF OCR fallback сработал, длина=%d", len(txt))
        return txt
    except Exception as e:
        log.exception("PDF OCR fallback error: %s", e)
        return ""


def extract_text_from_bytes(data: bytes, filename: str, ocr_enable: bool) -> str:
    """Extract plain text from a document given as bytes.

    The parser is chosen by the extension of *filename*. Images are OCR-ed only
    when *ocr_enable* is set; PDFs fall back to OCR when fewer than 200
    characters were extracted. Returns "" on failure (logged and shown in UI).
    """
    name = filename.lower()
    try:
        if name.endswith(".pdf"):
            from PyPDF2 import PdfReader

            reader = PdfReader(io.BytesIO(data))
            parts = []
            for page in reader.pages:
                try:
                    parts.append(page.extract_text() or "")
                except Exception:
                    parts.append("")
            txt = "\n".join(parts).strip()
            # Fallback when PyPDF2 returned (almost) nothing, e.g. scanned PDFs.
            if len(txt) < 200 and ocr_enable:
                ocr_txt = pdf_ocr_fallback(data)
                if len(ocr_txt) > len(txt):
                    txt = ocr_txt
        elif name.endswith((".docx", ".doc")):
            # NOTE(review): python-docx reads .docx only; a legacy .doc will
            # presumably raise here and be reported by the handler below.
            from docx import Document

            doc = Document(io.BytesIO(data))
            txt = "\n".join(p.text for p in doc.paragraphs)
        elif name.endswith((".xlsx", ".xls")):
            import pandas as pd

            df = pd.read_excel(io.BytesIO(data), dtype=str, engine=None)
            txt = "\n".join(map(str, df.fillna("").values.flatten()))
        elif name.endswith((".jpg", ".jpeg", ".png")):
            if not ocr_enable:
                log.info("OCR выключен, %s пропущен", filename)
                return ""
            ensure_ocr()
            img = ocr_preproc(Image.open(io.BytesIO(data)))
            txt = "\n".join(_ocr_read(img))
        else:
            txt = data.decode("utf-8", errors="ignore")
        txt = safe_trim((txt or "").strip())
        log.info("📄 %s → %d символов", filename, len(txt))
        return txt
    except Exception as e:
        log.exception("Парсинг %s: %s", filename, e)
        st.error(f"Ошибка парсинга {filename}: {e}")
        return ""


def extract_text_from_file(path: Path, ocr_enable: bool) -> str:
    """Read *path* and extract its text; returns "" when the file is unreadable."""
    try:
        return extract_text_from_bytes(path.read_bytes(), path.name, ocr_enable)
    except Exception as e:
        log.exception("Чтение %s: %s", path.name, e)
        st.error(f"Не удалось прочитать {path.name}: {e}")
        return ""


# ====== Semantic chunking ======
RE_HDR = re.compile(r"^#{1,6}\s+.+$", re.M)
RE_CODE = re.compile(r"```[\s\S]*?```", re.M)
RE_DLG = re.compile(r"^(?:User|Assistant|Speaker \d+|Оператор|Клиент)\s*[:\-–]\s", re.M)


def detect_doc_type(t: str) -> str:
    """Classify text as "docs", "code", "dialogue", "table" or "plain"."""
    if RE_HDR.search(t):
        return "docs"
    if RE_CODE.search(t):
        return "code"
    if RE_DLG.search(t):
        return "dialogue"
    if re.search(r"[|;\t].*[|;\t].*\n", t):
        return "table"
    return "plain"


def tokenize_len_fn(tokenizer):
    """Return a function measuring a string's length in *tokenizer* tokens."""
    def _fn(s: str) -> int:
        return len(tokenizer.encode(s, add_special_tokens=False))
    return _fn


def chunk_by_tokens(text: str, max_tok: int, overlap: int, tok_len) -> List[str]:
    """Split *text* on whitespace into chunks of at most about *max_tok* tokens.

    Consecutive chunks share a tail of up to *overlap* tokens. Token counts are
    per-word estimates from ``tok_len(word + " ")``. Text that already fits is
    returned as a single unchanged chunk.
    """
    if tok_len(text) <= max_tok:
        return [text]
    # An overlap close to the chunk size would re-emit nearly the whole chunk
    # for every new word, so keep it to at most half of the chunk.
    overlap = max(0, min(overlap, max_tok // 2))
    out: List[str] = []
    cur_words: List[str] = []
    cur_lens: List[int] = []  # cached token count per word in cur_words
    n = 0
    for w in text.split():
        t = tok_len(w + " ")
        if cur_words and n + t > max_tok:
            out.append(" ".join(cur_words))
            # Carry over the longest tail that fits into the overlap budget.
            keep = 0
            tail_tokens = 0
            for wl in reversed(cur_lens):
                if tail_tokens + wl > overlap:
                    break
                tail_tokens += wl
                keep += 1
            cur_words = cur_words[len(cur_words) - keep:] if keep else []
            cur_lens = cur_lens[len(cur_lens) - keep:] if keep else []
            n = tail_tokens
        cur_words.append(w)
        cur_lens.append(t)
        n += t
    if cur_words:
        out.append(" ".join(cur_words))
    return out


def semantic_chunks(text: str, tokenizer, max_tok: int, overlap: int) -> List[Tuple[str, Dict[str, Any]]]:
    """Split *text* into (chunk, metadata) pairs along structural boundaries.

    Markdown-like documents are cut at headers, documents with fenced code are
    cut around the fences, everything else is one segment. Each segment is then
    split by token budget. The metadata carries ``doc_type``.
    """
    kind = detect_doc_type(text)
    segments = [text]
    if kind == "docs":
        idx = [m.start() for m in RE_HDR.finditer(text)]
        if idx:
            if idx[0] > 0:
                # Keep the text that precedes the first header.
                idx.insert(0, 0)
            idx.append(len(text))
            segments = [text[a:b].strip() for a, b in zip(idx, idx[1:])]
    elif kind == "code":
        segments = []
        last = 0
        for m in RE_CODE.finditer(text):
            if m.start() > last:
                segments.append(text[last:m.start()].strip())
            segments.append(text[m.start():m.end()].strip())
            last = m.end()
        if last < len(text):
            segments.append(text[last:].strip())
    # NOTE(review): the remainder of this function was lost in the source text
    # (an extraction hole follows "if last"). The tail below is a minimal
    # reconstruction from the return annotation, the two helpers above and the
    # caller's `extra.get("doc_type", ...)` — confirm against the original.
    tok_len = tokenize_len_fn(tokenizer)
    out: List[Tuple[str, Dict[str, Any]]] = []
    for seg in segments:
        if not seg:
            continue
        for ch in chunk_by_tokens(seg, max_tok, overlap, tok_len):
            if ch.strip():
                out.append((ch, {"doc_type": kind}))
    return out


# NOTE(review): `mmr_select` is called by the Q&A section but its definition was
# lost in the source text. Reconstructed from the call
# `mmr_select(qvec, vecs, top_k=..., lam=0.65)` as standard maximal marginal
# relevance returning indices into *vecs* — confirm against the original.
def mmr_select(qvec, vecs, top_k: int, lam: float = 0.65) -> List[int]:
    """Pick up to *top_k* indices of *vecs* balancing relevance and diversity.

    Score of a candidate = lam * cos(query, cand) - (1 - lam) * max cos(cand,
    already selected).
    """
    if top_k <= 0 or len(vecs) == 0:
        return []
    mat = np.vstack([np.asarray(v, dtype=float).ravel() for v in vecs])
    mat = mat / np.maximum(np.linalg.norm(mat, axis=1, keepdims=True), 1e-12)
    q = np.asarray(qvec, dtype=float).ravel()
    q = q / max(float(np.linalg.norm(q)), 1e-12)
    relevance = mat @ q
    redundancy = np.zeros(len(vecs))  # max similarity to anything selected so far
    remaining = list(range(len(vecs)))
    chosen: List[int] = []
    while remaining and len(chosen) < top_k:
        best = max(remaining, key=lambda i: lam * relevance[i] - (1.0 - lam) * redundancy[i])
        chosen.append(best)
        remaining.remove(best)
        sims = mat @ mat[best]
        redundancy = sims if len(chosen) == 1 else np.maximum(redundancy, sims)
    return chosen


# ====== Settings (sidebar) ======
# NOTE(review): this whole section was lost in the source text. The variable
# names are taken from their uses further down; widget labels, defaults and the
# default embedding model are guesses — restore the original controls here.
st.sidebar.header("⚙️ Настройки")
emb_model_name = st.sidebar.text_input(
    "Модель эмбеддингов",
    value="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
use_gpu_embedder = st.sidebar.checkbox("Эмбеддер на GPU", value=torch.cuda.is_available())
ocr_enable = st.sidebar.checkbox("OCR для изображений и сканов", value=True)
use_multi_query = st.sidebar.checkbox("Мульти-запрос", value=True)
use_reranker = st.sidebar.checkbox("CrossEncoder реранк", value=True)
use_gpu_reranker = st.sidebar.checkbox("Реранк на GPU", value=torch.cuda.is_available())
# Label taken from the error message shown by the CPU-retry path below.
force_cpu_qwen = st.sidebar.checkbox("Запуск Qwen на CPU", value=False)


# NOTE(review): `load_embedder` was lost in the source text. Reconstructed from
# the call site: takes (model name, "gpu" | "cpu") and returns
# (model, name, embedding dim, device actually used); the model must offer
# `.encode(..., convert_to_numpy=True, normalize_embeddings=True)`.
def load_embedder(name: str, device: str):
    """Load a SentenceTransformer and report its name, dimension and device."""
    from sentence_transformers import SentenceTransformer

    dev = "cuda" if (device == "gpu" and torch.cuda.is_available()) else "cpu"
    model = SentenceTransformer(name, device=dev)
    return model, name, model.get_sentence_embedding_dimension(), dev


# NOTE(review): only the body and the "-> str:" tail of this definition
# survived; parameter names come from the body and the call site.
def coll_name_for(name: str, dim: int) -> str:
    """Build the collection name for an embedding model and its dimension."""
    tail = name.split("/")[-1].replace("-", "_")
    return f"documents__{tail}__d{dim}"


# ====== Database ======
st.sidebar.header("🗄️ База")
cdb1, cdb2 = st.sidebar.columns(2)
with cdb1:
    if st.button("🧱 Инициализировать БД"):
        try:
            from chromadb import PersistentClient

            dev = "gpu" if use_gpu_embedder else "cpu"
            (
                st.session_state.embedder,
                st.session_state.emb_name,
                st.session_state.emb_dim,
                emb_dev,
            ) = load_embedder(emb_model_name, dev)
            client = PersistentClient(path=str(DB_DIR))
            cname = coll_name_for(st.session_state.emb_name, st.session_state.emb_dim)
            # The retrieval code turns a distance into a similarity as
            # `1 - distance`, which is the cosine-distance convention, so ask
            # for the cosine space explicitly. The metric is fixed at creation:
            # collections created earlier must be cleared and re-indexed.
            st.session_state.collection = client.get_or_create_collection(
                cname, metadata={"hnsw:space": "cosine"}
            )
            st.session_state.db_ready = True
            st.success(f"✅ База готова: {cname} (emb on {emb_dev})")
        except Exception as e:
            log.exception("DB init: %s", e)
            st.error(f"Ошибка инициализации БД: {e}")
with cdb2:
    if st.button("🧹 Очистить БД"):
        import shutil

        try:
            if DB_DIR.exists():
                shutil.rmtree(DB_DIR)
            st.session_state.update(dict(db_ready=False, collection=None, embedder=None, emb_name=None, emb_dim=None))
            st.success("БД очищена. Снова нажмите «Инициализировать БД».")
        except Exception as e:
            log.exception("DB clear: %s", e)
            st.error(f"Ошибка очистки: {e}")


# ====== CrossEncoder ======
def get_reranker():
    """Return the cached CrossEncoder, loading it on first use.

    Returns None when reranking is switched off or the model fails to load.
    """
    if not use_reranker:
        return None
    if st.session_state.reranker is None:
        try:
            from sentence_transformers import CrossEncoder

            dev = "cuda" if (use_gpu_reranker and torch.cuda.is_available()) else "cpu"
            st.session_state.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device=dev)
            log.info("CrossEncoder ready (%s)", dev)
        except Exception as e:
            log.exception("CE load: %s", e)
            st.warning(f"Не удалось загрузить CrossEncoder: {e}")
            return None
    return st.session_state.reranker


# ====== Indexing ======
st.sidebar.header("📁 Индексирование")
if st.sidebar.button("🔎 Индексировать папку data/"):
    files = [p for p in DATA_DIR.iterdir() if p.is_file()]
    if not files:
        st.sidebar.info("Папка data/ пуста.")
        st.stop()
    if not _db_ready():
        st.sidebar.error("Сначала инициализируйте БД.")
        st.stop()
    try:
        try:
            from transformers import AutoTokenizer
        except Exception as e:
            print("Fallback import:", e)
            from transformers.models.auto import AutoTokenizer
        tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        if tok.pad_token_id is None and tok.eos_token_id is not None:
            tok.pad_token_id = tok.eos_token_id
    except Exception as e:
        st.error(f"Не удалось загрузить токенайзер: {e}")
        st.stop()

    added, skipped, failed = 0, [], []
    with st.spinner("Индексирую…"):
        prog = st.progress(0.0, text="Подготовка…")
        total = len(files)
        for i, p in enumerate(files, start=1):
            try:
                if not _supported(p):
                    skipped.append(f"{p.name} (тип не поддержан)")
                    prog.progress(i / total, text=f"Пропущен {p.name}")
                    continue
                txt = extract_text_from_file(p, ocr_enable)
                if not txt:
                    skipped.append(f"{p.name} (пустой текст)")
                    prog.progress(i / total, text=f"Пустой {p.name}")
                    continue
                segments = semantic_chunks(
                    txt,
                    tokenizer=tok,
                    max_tok=st.session_state.MAX_TOKENS,
                    overlap=st.session_state.OVERLAP_TOKENS,
                )
                ids, docs, embs, metas = [], [], [], []
                for j, (ch, extra) in enumerate(segments):
                    try:
                        v = st.session_state.embedder.encode(ch, convert_to_numpy=True, normalize_embeddings=True)
                        ids.append(f"{uuid.uuid4()}_{j}")
                        docs.append(ch)
                        embs.append(v.tolist())
                        metas.append(dict(
                            filename=p.name,
                            filetype=p.suffix.lower().lstrip("."),
                            chunk_index=j,
                            doc_type=extra.get("doc_type", "plain"),
                            token_limit=st.session_state.MAX_TOKENS,
                            token_overlap=st.session_state.OVERLAP_TOKENS,
                        ))
                    except Exception as ee:
                        # One bad chunk must not abort the whole file.
                        log.exception("Embed chunk %s_%d: %s", p.name, j, ee)
                if ids:
                    st.session_state.collection.add(ids=ids, documents=docs, embeddings=embs, metadatas=metas)
                    added += 1
                    prog.progress(i / total, text=f"Добавлен {p.name} ({len(ids)} чанков)")
                else:
                    failed.append(f"{p.name} (эмбеддинги не получены)")
                    prog.progress(i / total, text=f"Ошибка {p.name}")
            except Exception as e:
                failed.append(f"{p.name} ({e})")
                log.exception("Index %s: %s", p.name, e)
                prog.progress(i / total, text=f"Ошибка {p.name}")

    if added:
        st.sidebar.success(f"✅ Проиндексировано файлов: {added}")
    if skipped:
        st.sidebar.info("ℹ️ Пропущены:\n- " + "\n- ".join(skipped))
    if failed:
        # Same bullet separator as the skipped list (was "\n", which rendered
        # every entry after the first without a bullet).
        st.sidebar.error("❌ Ошибки:\n- " + "\n- ".join(failed))
    log.info("Индекс: added=%d skipped=%d failed=%d", added, len(skipped), len(failed))


# ====== Qwen loader ======
def load_qwen(force_cpu: bool = False):
    """Load the Qwen tokenizer and model; returns ``(tokenizer, model)``.

    Uses bfloat16 with ``device_map="auto"`` when CUDA is available and
    *force_cpu* is False, float32 on CPU otherwise. Also disables autograd
    globally. Any failure is logged and re-raised.
    """
    try:
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM
        except Exception as e:
            print("Fallback transformers:", e)
            from transformers.models.auto import AutoTokenizer
            from transformers import AutoModelForCausalLM
        on_gpu = torch.cuda.is_available() and not force_cpu
        dtype = torch.bfloat16 if on_gpu else torch.float32
        device_map = "auto" if on_gpu else "cpu"
        tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        if tok.pad_token_id is None and tok.eos_token_id is not None:
            tok.pad_token_id = tok.eos_token_id
        mdl = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            torch_dtype=dtype,
            device_map=device_map,
            low_cpu_mem_usage=True,
            attn_implementation="sdpa",
        )
        mdl.eval()
        mdl.config.pad_token_id = tok.pad_token_id or tok.eos_token_id
        torch.set_grad_enabled(False)
        # Report the device actually used (a CUDA-less machine used to be
        # logged as "GPU BF16" when force_cpu was False).
        log.info("✅ Qwen загружена (%s)", "GPU BF16" if on_gpu else "CPU")
        return tok, mdl
    except Exception as e:
        log.exception("Qwen load: %s", e)
        raise


# ====== Sanitizing ======
RE_ZW = re.compile(r"[\u200b\u200c\u200d\ufeff]+")
RE_SAFE = re.compile(r"[^\s\u0009-\uFFFF]+")
RE_BANG = re.compile(r"(?:!){12,}|(?:!!.){6,}")


def sanitize(s: str) -> str:
    """Strip zero-width characters, characters rejected by RE_SAFE and long "!" runs."""
    s = RE_ZW.sub("", s)
    s = RE_SAFE.sub("", s).strip()
    s = RE_BANG.sub("", s).strip()
    return s


def _decode_answer(tokenizer, sequences, prompt_len: int) -> str:
    """Decode the newly generated tokens of the first sequence into clean text."""
    text = tokenizer.decode(sequences[0].tolist()[prompt_len:], skip_special_tokens=True)
    # Cut at chat end markers in case they survive decoding as plain text.
    for stop_tok in ("<|im_end|>", "<|endoftext|>"):
        cut = text.find(stop_tok)
        if cut != -1:
            text = text[:cut]
    return sanitize(text)


# ====== Q&A ======
st.subheader("💬 Вопрос к базе")
query = st.text_input("Введите вопрос:")

if st.button("Ответить"):
    if not _db_ready():
        st.error("Сначала инициализируйте БД и проиндексируйте документы.")
        st.stop()
    if not query.strip():
        st.warning("Введите вопрос.")
        st.stop()

    # ---- Candidate search ----
    try:
        queries = [query]
        if use_multi_query:
            base = [
                query,
                f"{query}. Краткая формулировка",
                f"{query}. Другими словами",
                f"{query}. Альтернативная формулировка",
            ]
            queries = list(dict.fromkeys([q.strip() for q in base if q.strip()]))[:4]

        # Candidates are (score, document, metadata, embedding) tuples.
        all_cand = []
        for q_ in queries:
            qv = st.session_state.embedder.encode(q_, convert_to_numpy=True, normalize_embeddings=True).tolist()
            res = st.session_state.collection.query(
                query_embeddings=[qv],
                n_results=st.session_state.N_RESULTS,
                include=["documents", "metadatas", "distances", "embeddings"],
            )
            docs = res.get("documents", [[]])[0]
            metas = res.get("metadatas", [[]])[0]
            dists = res.get("distances", [[]])[0]
            embs = res.get("embeddings", [[]])[0]
            for d, m, dist, e in zip(docs, metas, dists, embs):
                if not d:
                    continue
                try:
                    score = max(0.0, 1.0 - float(dist))
                except (TypeError, ValueError):
                    score = 0.0
                if score >= st.session_state.SCORE_THRESHOLD:
                    all_cand.append((score, d, m, e))

        if not all_cand:
            # Fallback without the similarity threshold.
            qv = st.session_state.embedder.encode(query, convert_to_numpy=True, normalize_embeddings=True).tolist()
            res = st.session_state.collection.query(
                query_embeddings=[qv],
                n_results=min(20, st.session_state.N_RESULTS),
                include=["documents", "metadatas", "embeddings"],
            )
            docs = res.get("documents", [[]])[0]
            metas = res.get("metadatas", [[]])[0]
            embs = res.get("embeddings", [[]])[0]
            all_cand.extend([(0.0, d, m, e) for d, m, e in zip(docs, metas, embs) if d])

        # Deduplicate by (file, chunk); the first occurrence wins.
        seen = set()
        dedup = []
        for c in all_cand:
            m = c[2]
            key = (m.get("filename", "?"), m.get("chunk_index", "?"))
            if key in seen:
                continue
            seen.add(key)
            dedup.append(c)
        all_cand = dedup

        # ---- Keyword booster ----
        # Raise the score of chunks containing at least two query keywords.
        # The score is replaced in place: appending boosted copies next to the
        # originals would put the same chunk into the context twice.
        keywords = [w.lower() for w in re.findall(r"\w{4,}", query)]
        if keywords and all_cand:
            any_boost = False
            rescored = []
            for score, d, m, e in all_cand:
                text_low = d.lower()
                hits = sum(1 for k in keywords if k in text_low)
                if hits >= 2:
                    score = min(score + 0.25 + 0.05 * hits, 1.0)
                    any_boost = True
                rescored.append((score, d, m, e))
            if any_boost:
                all_cand = sorted(rescored, key=lambda x: x[0], reverse=True)

        # ---- Rerank (optional) ----
        rr = None
        if use_reranker:
            try:
                rr = get_reranker()
            except Exception:
                rr = None
        if rr and all_cand:
            pairs = [(query, d) for _, d, _, _ in all_cand]
            scores = rr.predict(pairs)
            ranked = sorted(zip(scores, all_cand), key=lambda x: x[0], reverse=True)
            all_cand = [c for _, c in ranked]

        # ---- MMR ----
        if all_cand:
            qvec0 = st.session_state.embedder.encode(query, convert_to_numpy=True, normalize_embeddings=True)
            # Select among candidates that have a vector so that the indices
            # returned by MMR address the same list they were computed from.
            with_vec = [c for c in all_cand if c[3] is not None]
            if with_vec:
                vecs = [np.array(c[3], dtype=float) for c in with_vec]
                sel = mmr_select(np.array(qvec0), vecs, top_k=st.session_state.TOP_AFTER, lam=0.65)
                selected = [with_vec[i] for i in sel if i < len(with_vec)]
            else:
                selected = all_cand[:st.session_state.TOP_AFTER]
        else:
            selected = []

        sources = [
            {
                "filename": m.get("filename", "?"),
                "chunk_index": m.get("chunk_index", "?"),
                "score": round(float(s), 3) if isinstance(s, (int, float)) else s,
            }
            for s, _, m, _ in selected
        ]
    except Exception as e:
        log.exception("Запрос коллекции: %s", e)
        st.error(f"Ошибка запроса коллекции: {e}")
        st.stop()

    if sources:
        st.markdown("#### 🔗 Источники (top)")
        for i, s in enumerate(sources, 1):
            st.write(f"{i}. {s['filename']} — chunk {s['chunk_index']} (score={s['score']})")

    # Check before loading the model: no point in loading Qwen with no context.
    if not selected:
        st.warning("Релевантных фрагментов не найдено. Уточните запрос или снизьте порог похожести.")
        st.stop()

    # ---- Lazy Qwen loading ----
    try:
        if not st.session_state.model_loaded:
            with st.spinner("⏳ Загружаю Qwen…"):
                try:
                    tok, mdl = load_qwen(force_cpu=force_cpu_qwen)
                except Exception:
                    if force_cpu_qwen:
                        raise
                    st.warning("⚠️ Ошибка CUDA/VRAM. Перехожу на CPU…")
                    tok, mdl = load_qwen(force_cpu=True)
            st.session_state.tokenizer = tok
            st.session_state.model = mdl
            st.session_state.model_loaded = True
    except Exception as e:
        st.error(f"Ошибка загрузки модели: {e}")
        st.stop()

    # ---- Context assembly ----
    ctx = []
    for _, d, m, _ in selected:
        head = f"[{m.get('filename','?')} • chunk {m.get('chunk_index','?')}]"
        ctx.append(f"{head}\n{d.strip()}\n")
    context = "\n".join(ctx)
    if len(context) > st.session_state.MAX_CTX_CHARS:
        context = context[:st.session_state.MAX_CTX_CHARS]

    # ---- Prompt (the system prompt is not shown in the UI) ----
    sys_prompt = "Отвечай строго по контексту; если факта нет — так и скажи. Приводи ссылки [filename • chunk]."
    usr_prompt = f"Стиль: {st.session_state.STYLE_GUIDE}\nКОНТЕКСТ:\n{context}\n\nВОПРОС: {query}\n\nОТВЕТ:"

    tok = st.session_state.tokenizer
    mdl = st.session_state.model

    try:
        chat = tok.apply_chat_template(
            [{"role": "system", "content": sys_prompt}, {"role": "user", "content": usr_prompt}],
            add_generation_prompt=True,
            tokenize=False,
        )
    except Exception:
        # Manual ChatML rendering when the tokenizer has no usable template.
        chat = f"<|im_start|>system\n{sys_prompt}<|im_end|>\n<|im_start|>user\n{usr_prompt}<|im_end|>\n<|im_start|>assistant\n"

    enc = tok(chat, return_tensors="pt", add_special_tokens=False)
    model_device = getattr(mdl, "device", "cpu")
    input_ids = enc["input_ids"].to(model_device)
    attention_mask = enc.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(model_device)

    # ---- Generation (long answers) ----
    gen_kwargs = dict(
        min_new_tokens=260,
        max_new_tokens=st.session_state.MAX_NEW_TOKENS,
        pad_token_id=tok.pad_token_id or tok.eos_token_id,
        return_dict_in_generate=True,
        output_scores=False,
        use_cache=True,
    )
    if st.session_state.SUPER_DETAILED:
        gen_kwargs.update(dict(
            do_sample=True,
            num_beams=4,
            length_penalty=0.85,
            temperature=0.72,
            top_p=0.96,
            top_k=100,
            repetition_penalty=1.05,
        ))
    else:
        gen_kwargs.update(dict(
            do_sample=True,
            temperature=0.8,
            top_p=0.96,
            top_k=120,
            repetition_penalty=1.05,
        ))

    try:
        prompt_len = input_ids.shape[1]
        with st.spinner("🧠 Генерация…"), torch.inference_mode():
            out = mdl.generate(input_ids=input_ids, attention_mask=attention_mask, **gen_kwargs)
        ans = _decode_answer(tok, out.sequences, prompt_len)

        if len(ans) < 20:
            # Sampling occasionally yields a near-empty answer: retry once.
            with st.spinner("🧠 Повтор…"), torch.inference_mode():
                out2 = mdl.generate(input_ids=input_ids, attention_mask=attention_mask, **gen_kwargs)
            ans = _decode_answer(tok, out2.sequences, prompt_len)

        st.markdown("### ✅ Ответ")
        st.write(ans if ans else "В контексте не нашлось достаточных сведений.")
        log.info("Ответ готов")
    except RuntimeError as e:
        if "CUDA" in str(e):
            log.error("CUDA ошибка: %s", e)
            st.warning("⚠️ Ошибка CUDA. Перехожу на CPU и повторяю…")
            try:
                tok_cpu, mdl_cpu = load_qwen(force_cpu=True)
                st.session_state.tokenizer, st.session_state.model = tok_cpu, mdl_cpu
                enc = tok_cpu(chat, return_tensors="pt", add_special_tokens=False)
                input_ids = enc["input_ids"]
                gen_kwargs["pad_token_id"] = tok_cpu.pad_token_id or tok_cpu.eos_token_id
                with st.spinner("🧠 Генерация (CPU)…"), torch.inference_mode():
                    out = mdl_cpu.generate(
                        input_ids=input_ids,
                        attention_mask=enc.get("attention_mask"),
                        **gen_kwargs,
                    )
                ans = _decode_answer(tok_cpu, out.sequences, input_ids.shape[1])
                st.markdown("### ✅ Ответ (CPU)")
                st.write(ans if ans else "В контексте не нашлось достаточных сведений.")
            except Exception as ee:
                log.exception("CPU retry fail: %s", ee)
                st.error(f"Повтор на CPU не удался: {ee}\nВключите тумблер «Запуск Qwen на CPU».")
        else:
            log.exception("Генерация: %s", e)
            st.error(f"Ошибка: {e}")
    finally:
        gc.collect()
        if torch.cuda.is_available():
            try:
                torch.cuda.empty_cache()
            except Exception:
                pass

# ====== Logs ======
with st.expander("📜 Показать логи"):
    try:
        st.code(LOG_FILE.read_text(encoding="utf-8")[-10_000:] if LOG_FILE.exists() else "Лог пуст.")
    except Exception as e:
        st.warning(f"Не удалось прочитать лог: {e}")