init

author: polwex <polwex@sortug.com> 2025-09-23 03:50:53 +0700
committer: polwex <polwex@sortug.com> 2025-09-23 03:50:53 +0700
commit: 57aaafdb137fe49930711f6ed5ccc83b3a119cd2 (patch)
tree: 1a7556927bed94377630d33dd29c3bf07d159619 /rag/test.py
1 files changed, 77 insertions, 0 deletions
diff --git a/rag/test.py b/rag/test.py
new file mode 100644
index 0000000..b7a6d8e
--- /dev/null
+++ b/rag/test.py
@@ -0,0 +1,77 @@
+from pathlib import Path
+from docling.document_converter import DocumentConverter
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+from transformers import AutoModel
+from sentence_transformers import SentenceTransformer
+from rag.db import get_db
+from rag.rerank import get_rerank_model
+import rag.ingest
+import rag.search
+import torch
+
+# converter = DocumentConverter()
+# chunker = HybridChunker()
+# file = Path("yek.md")
+# doc = converter.convert(file).document
+# chunk_iter = chunker.chunk(doc)
+# for  chunk in chunk_iter:
+#   print(chunk)
+#   txt = chunker.contextualize(chunk)
+#   print(txt)
+
+
+  
+def t():
+  """Embed one sample string and print the vector plus its type, dtype and device."""
+  model = rag.ingest.get_embed_model()
+  v = model.encode("pepeee", normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
+  print("v")
+  print(type(v))
+  print(v)
+  print(v.dtype)
+  print(getattr(v, "device", "n/a"))  # NumPy < 2.0 arrays have no .device; avoid AttributeError
+
+# V = torch.cat([v], dim=0)
+# print("V")
+# print(type(V))
+# print(V)
+# print(V.dtype)
+# print(V.device)
+# print("V_np")
+# V_idk = V.cpu().float()
+
+# when they were pytorch tensors
+# V = embed_many(chunks)                         # float32/fp16 on CPU? ensure float32 for DB:
+# V_np = V.float().cpu().numpy().astype("float32")
+# DIM = V_np.shape[1]
+# db = sqlite3.connect("./rag.db")
+
+# Sample questions; t2() runs one hybrid search per entry, in this order.
+queries = [
+  "How was Shuihu zhuan received in early modern Japan?",
+  "Edo-period readers’ image of Song Jiang / Liangshan outlaws",
+  "Channels of transmission for Chinese vernacular fiction into Japan (kanbun kundoku, digests, translations)",
+  "Role of woodblock prints/illustrations in mediating Chinese fiction",
+  "Key Japanese scholars, writers, or publishers who popularized Chinese fiction",
+  "Kyokutei Bakin’s engagement with Chinese vernacular narrative",
+  "Santō Kyōden, gesaku, and Chinese models",
+  "Kanzen chōaku (encourage good, punish evil) and Water Margin in Japan",
+  "Moral ambivalence of outlaw heroes as discussed in the text",
+  "Censorship or moral debates around reading Chinese fiction",
+  "Translation strategies from vernacular Chinese to Japanese (furigana, kundoku, glossing)",
+  "Paratexts: prefaces, commentaries, reader guidance apparatus",
+  "Bibliographic details: editions, reprints, circulation networks",
+  "How does this book challenge older narratives about Sino-Japanese literary influence?",
+  "Methodology: sources, archives, limitations mentioned by the author",
+]
+def t2():
+  """Run each entry of `queries` through hybrid search and print every returned row."""
+  conn = get_db()
+  for question in queries:
+    print("query", question)
+    # Hybrid + CE rerank query; the score is printed as ce_score (presumably the reranker's).
+    hits = rag.search.search_hybrid(conn, question, k_vec=50, k_bm25=50, k_final=8)
+    for rid, txt, sim in hits:
+      print(f"[{rid:04d}] ce_score={sim:.3f}\n{txt[:300]}...\n")
+
+t2()  # NOTE: unguarded module-level call; the search loop runs on import as well as on direct execution
author	polwex <polwex@sortug.com>	2025-09-23 03:50:53 +0700
committer	polwex <polwex@sortug.com>	2025-09-23 03:50:53 +0700
commit	57aaafdb137fe49930711f6ed5ccc83b3a119cd2 (patch)
tree	1a7556927bed94377630d33dd29c3bf07d159619 /rag/test.py