from pathlib import Path
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from transformers import AutoModel
from sentence_transformers import SentenceTransformer
from rag.db import get_db
from rag.rerank import get_rerank_model
import rag.ingest
import rag.search
import torch

# Scratch: chunking a Markdown file with docling (kept for reference).
# converter = DocumentConverter()
# chunker = HybridChunker()
# file = Path("yek.md")
# doc = converter.convert(file).document
# chunk_iter = chunker.chunk(doc)
# for chunk in chunk_iter:
#     print(chunk)
#     txt = chunker.contextualize(chunk)
#     print(txt)


def t() -> None:
    """Encode one sample string and print the resulting vector's details.

    Loads the model returned by ``rag.ingest.get_embed_model()``, encodes the
    literal ``"pepeee"`` with normalization and NumPy conversion requested,
    then prints the value, its Python type, its dtype and its device.
    """
    model = rag.ingest.get_embed_model()
    v = model.encode(
        "pepeee",
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    print("v")
    print(type(v))
    print(v)
    print(v.dtype)
    # NOTE(review): with convert_to_numpy=True the result is presumably a
    # NumPy array, and ndarray only gained a `.device` attribute in NumPy 2.0.
    # Fall back to a placeholder instead of raising AttributeError on older
    # NumPy versions.
    print(getattr(v, "device", "n/a"))

    # Scratch from when the embeddings were PyTorch tensors (kept for
    # reference).
    # V = torch.cat([v], dim=0)
    # print("V")
    # print(type(V))
    # print(V)
    # print(V.dtype)
    # print(V.device)
    # print("V_np")
    # V_idk = V.cpu().float()  # when they were pytorch tensors
    # V = embed_many(chunks)  # float32/fp16 on CPU? ensure float32 for DB:
    # V_np = V.float().cpu().numpy().astype("float32")
    # DIM = V_np.shape[1]
    # db = sqlite3.connect("./rag.db")


# Sample queries that t2() runs against the index.
queries = [
    "How was Shuihu zhuan received in early modern Japan?",
    "Edo-period readers’ image of Song Jiang / Liangshan outlaws",
    "Channels of transmission for Chinese vernacular fiction into Japan (kanbun kundoku, digests, translations)",
    "Role of woodblock prints/illustrations in mediating Chinese fiction",
    "Key Japanese scholars, writers, or publishers who popularized Chinese fiction",
    "Kyokutei Bakin’s engagement with Chinese vernacular narrative",
    "Santō Kyōden, gesaku, and Chinese models",
    "Kanzen chōaku (encourage good, punish evil) and Water Margin in Japan",
    "Moral ambivalence of outlaw heroes as discussed in the text",
    "Censorship or moral debates around reading Chinese fiction",
    "Translation strategies from vernacular Chinese to Japanese (furigana, kundoku, glossing)",
    "Paratexts: prefaces, commentaries, reader guidance apparatus",
    "Bibliographic details: editions, reprints, circulation networks",
    "How does this book challenge older narratives about Sino-Japanese literary influence?",
    "Methodology: sources, archives, limitations mentioned by the author",
]


def t2() -> None:
    """Run every entry of ``queries`` through hybrid search and print the hits.

    Opens the database via ``get_db()`` and, for each query, calls
    ``rag.search.search_hybrid`` with ``k_vec=50``, ``k_bm25=50`` and
    ``k_final=8``. Each result is unpacked as ``(rid, txt, score)`` and printed
    with the id zero-padded to four digits, the score to three decimals, and
    the first 300 characters of the text.
    """
    db = get_db()

    # Hybrid + CE rerank query:
    for query in queries:
        print("query", query)
        results = rag.search.search_hybrid(
            db, query, k_vec=50, k_bm25=50, k_final=8
        )
        for rid, txt, score in results:
            # The "ce_score" label presumably refers to the cross-encoder
            # rerank score - TODO confirm against rag.search.
            print(f"[{rid:04d}] ce_score={score:.3f}\n{txt[:300]}...\n")


# Runs unconditionally, at import time as well as when executed as a script.
t2()