diff options
author | polwex <polwex@sortug.com> | 2025-09-23 03:50:53 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-09-23 03:50:53 +0700 |
commit | 57aaafdb137fe49930711f6ed5ccc83b3a119cd2 (patch) | |
tree | 1a7556927bed94377630d33dd29c3bf07d159619 /rag/test.py |
init
Diffstat (limited to 'rag/test.py')
-rw-r--r-- | rag/test.py | 77 |
1 files changed, 77 insertions, 0 deletions
# diff --git a/rag/test.py b/rag/test.py
# new file mode 100644
# index 0000000..b7a6d8e
# --- /dev/null
# +++ b/rag/test.py
# @@ -0,0 +1,77 @@
from pathlib import Path
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from transformers import AutoModel
from sentence_transformers import SentenceTransformer
from rag.db import get_db
from rag.rerank import get_rerank_model
import rag.ingest
import rag.search
import torch

# converter = DocumentConverter()
# chunker = HybridChunker()
# file = Path("yek.md")
# doc = converter.convert(file).document
# chunk_iter = chunker.chunk(doc)
# for chunk in chunk_iter:
# print(chunk)
# txt = chunker.contextualize(chunk)
# print(txt)


def t():
    """Embed one sample string and print diagnostics about the result.

    Obtains a model from ``rag.ingest.get_embed_model()``, encodes the literal
    ``"pepeee"`` with ``normalize_embeddings=True`` and
    ``convert_to_numpy=True``, then prints the result's type, value, dtype
    and device. Returns nothing; output goes to stdout only.
    """
    model = rag.ingest.get_embed_model()
    v = model.encode(
        "pepeee",
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    print("v")
    print(type(v))
    print(v)
    print(v.dtype)
    # NOTE(review): with convert_to_numpy=True the result is presumably a
    # NumPy array; ndarray only exposes ``.device`` from NumPy 2.0 onward, so
    # fall back to a placeholder instead of raising AttributeError.
    print(getattr(v, "device", "n/a"))

# V = torch.cat([v], dim=0)
# print("V")
# print(type(V))
# print(V)
# print(V.dtype)
# print(V.device)
# print("V_np")
# V_idk = V.cpu().float()

# when they were pytorch tensors
# V = embed_many(chunks) # float32/fp16 on CPU?
# ensure float32 for DB:
# V_np = V.float().cpu().numpy().astype("float32")
# DIM = V_np.shape[1]
# db = sqlite3.connect("./rag.db")

# Sample questions fed to the hybrid search by t2().
queries = [
    "How was Shuihu zhuan received in early modern Japan?",
    "Edo-period readers’ image of Song Jiang / Liangshan outlaws",
    "Channels of transmission for Chinese vernacular fiction into Japan (kanbun kundoku, digests, translations)",
    "Role of woodblock prints/illustrations in mediating Chinese fiction",
    "Key Japanese scholars, writers, or publishers who popularized Chinese fiction",
    "Kyokutei Bakin’s engagement with Chinese vernacular narrative",
    "Santō Kyōden, gesaku, and Chinese models",
    "Kanzen chōaku (encourage good, punish evil) and Water Margin in Japan",
    "Moral ambivalence of outlaw heroes as discussed in the text",
    "Censorship or moral debates around reading Chinese fiction",
    "Translation strategies from vernacular Chinese to Japanese (furigana, kundoku, glossing)",
    "Paratexts: prefaces, commentaries, reader guidance apparatus",
    "Bibliographic details: editions, reprints, circulation networks",
    "How does this book challenge older narratives about Sino-Japanese literary influence?",
    "Methodology: sources, archives, limitations mentioned by the author",
]


def t2():
    """Run each entry of ``queries`` through the hybrid search and print the hits.

    Opens the database with ``get_db()``, calls ``rag.search.search_hybrid``
    once per query (``k_vec=50``, ``k_bm25=50``, ``k_final=8``) and prints
    every returned ``(id, text, score)`` triple, truncating the text to its
    first 300 characters.
    """
    conn = get_db()
    # Hybrid + CE rerank query:
    for question in queries:
        print("query", question)
        hits = rag.search.search_hybrid(
            conn, question, k_vec=50, k_bm25=50, k_final=8
        )
        for row_id, text, ce_score in hits:
            print(f"[{row_id:04d}] ce_score={ce_score:.3f}\n{text[:300]}...\n")


# NOTE(review): unguarded call — this runs on import as well as when the file
# is executed as a script.
t2()