"""Scratch script that exercises the project's embedding model and vector search.

Running this module calls ``t2()``, which issues every entry of ``queries``
against the database returned by ``rag.db.get_db()`` and prints the hits.
"""

# NOTE: several of these names are only referenced by the commented-out
# experiments in the visible part of this file; they are kept untouched.
from pathlib import Path

import torch
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from sentence_transformers import SentenceTransformer
from transformers import AutoModel

import rag.ingest
import rag.search
from rag.db import get_db
from rag.rerank import get_rerank_model

# converter = DocumentConverter()
# chunker = HybridChunker()
# file = Path("yek.md")
# doc = converter.convert(file).document
# chunk_iter = chunker.chunk(doc)
# for chunk in chunk_iter:
#     print(chunk)
#     txt = chunker.contextualize(chunk)
#     print(txt)


def t() -> None:
    """Embed one probe string and print the vector plus its type metadata.

    Loads the model via ``rag.ingest.get_embed_model()``, encodes the literal
    ``"pepeee"`` with normalisation and NumPy conversion requested, then
    prints the value, its Python type, its dtype and its device.
    """
    model = rag.ingest.get_embed_model()
    v = model.encode(
        "pepeee",
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    print("v")
    print(type(v))
    print(v)
    print(v.dtype)
    # With convert_to_numpy=True the result is presumably a numpy.ndarray
    # (TODO confirm against get_embed_model); ndarray.device only exists on
    # NumPy >= 2.0, so fall back to "cpu" instead of raising AttributeError.
    print(getattr(v, "device", "cpu"))
    # V = torch.cat([v], dim=0)
    # print("V")
    # print(type(V))
    # print(V)
    # print(V.dtype)
    # print(V.device)
    # print("V_np")
    # V_idk = V.cpu().float()
    # when they were pytorch tensors
    # V = embed_many(chunks)  # float32/fp16 on CPU? ensure float32 for DB:
    # V_np = V.float().cpu().numpy().astype("float32")
    # DIM = V_np.shape[1]
    # db = sqlite3.connect("./rag.db")


# Queries issued by t2(); the commented-out entries are earlier experiments.
queries: list[str] = [
    # "How was Shuihu zhuan received in early modern Japan?",
    # "Edo-period readers’ image of Song Jiang / Liangshan outlaws",
    # "Channels of transmission for Chinese vernacular fiction into Japan (kanbun kundoku, digests, translations)",
    # "Role of woodblock prints/illustrations in mediating Chinese fiction",
    # "Key Japanese scholars, writers, or publishers who popularized Chinese fiction",
    # "Kyokutei Bakin’s engagement with Chinese vernacular narrative",
    # "Santō Kyōden, gesaku, and Chinese models",
    # "Kanzen chōaku (encourage good, punish evil) and Water Margin in Japan",
    # "Moral ambivalence of outlaw heroes as discussed in the text",
    # "Censorship or moral debates around reading Chinese fiction",
    # "Translation strategies from vernacular Chinese to Japanese (furigana, kundoku, glossing)",
    # "Paratexts: prefaces, commentaries, reader guidance apparatus",
    # "Bibliographic details: editions, reprints, circulation networks",
    # "How does this book challenge older narratives about Sino-Japanese literary influence?",
    # "Methodology: sources, archives, limitations mentioned by the author",
    "sex",
]


def t2() -> None:
    """Run every entry of ``queries`` through ``rag.search.vec_search`` and print hits.

    Opens the database with ``get_db()`` and loads the embedding model once,
    then for each query prints a header followed by one line per result
    containing the row id, the score and the first 300 characters of text.
    """
    db = get_db()
    # Hybrid + CE rerank query:
    model = rag.ingest.get_embed_model()
    for query in queries:
        print("query", query)
        print("-----------\n\n")
        # results = rag.search.search_hybrid(db, "muh", query, k_vec=50, k_bm25=50, k_final=8)
        # for rid, txt, score in results:
        #     sim = score
        #     print(f"[{rid:04d}] ce_score={sim:.3f}\n{txt[:300]}...\n")
        results = rag.search.vec_search(
            db,
            model,
            "muh",
            query,
            k=50,
            min_sim=0.25,
            max_per_doc=5,
            use_mmr=False,
            mmr_lambda=0.7,
        )
        # NOTE(review): the label still reads "ce_score" although the value
        # comes from vec_search, not the hybrid/rerank path — confirm whether
        # the label should change.
        for rid, txt, sim in results:
            print(f"[{rid:04d}] ce_score={sim:.3f}\n{txt[:300]}...\n")


# Executed unconditionally, so the search also runs when this module is imported.
t2()