# rag/test.py

from pathlib import Path
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from transformers import AutoModel
from sentence_transformers import SentenceTransformer
from rag.db import get_db
from rag.rerank import get_rerank_model
import rag.ingest
import rag.search
import torch

# converter = DocumentConverter()
# chunker = HybridChunker()
# file = Path("yek.md")
# doc = converter.convert(file).document
# chunk_iter = chunker.chunk(doc)
# for  chunk in chunk_iter:
#   print(chunk)
#   txt = chunker.contextualize(chunk)
#   print(txt)


def t():
  """Embed one sample string and print diagnostics about the result.

  Fetches the embedding model from ``rag.ingest.get_embed_model()``, encodes
  the literal ``"pepeee"`` (normalized embeddings, NumPy output requested,
  progress bar on) and prints the result's type, value, dtype and device.
  Returns nothing; output goes to stdout only.
  """
  model = rag.ingest.get_embed_model()
  v = model.encode(
    "pepeee",
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
  )
  print("v")
  print(type(v))
  print(v)
  print(v.dtype)
  # convert_to_numpy=True presumably yields a NumPy array (TODO confirm that
  # get_embed_model() returns a sentence-transformers model). ndarray only
  # gained a `device` attribute in NumPy 2.0, where it is always "cpu", so
  # fall back to that value instead of raising AttributeError on older NumPy.
  print(getattr(v, "device", "cpu"))

# V = torch.cat([v], dim=0)
# print("V")
# print(type(V))
# print(V)
# print(V.dtype)
# print(V.device)
# print("V_np")
# V_idk = V.cpu().float()

# Legacy snippet from when the embeddings were PyTorch tensors (kept for reference):
# V = embed_many(chunks)                         # float32/fp16 on CPU? ensure float32 for DB:
# V_np = V.float().cpu().numpy().astype("float32")
# DIM = V_np.shape[1]
# db = sqlite3.connect("./rag.db")

# Sample questions that t2() runs through hybrid search, one search per entry.
queries = [
  "How was Shuihu zhuan received in early modern Japan?",
  "Edo-period readers’ image of Song Jiang / Liangshan outlaws",
  "Channels of transmission for Chinese vernacular fiction into Japan (kanbun kundoku, digests, translations)",
  "Role of woodblock prints/illustrations in mediating Chinese fiction",
  "Key Japanese scholars, writers, or publishers who popularized Chinese fiction",
  "Kyokutei Bakin’s engagement with Chinese vernacular narrative",
  "Santō Kyōden, gesaku, and Chinese models",
  "Kanzen chōaku (encourage good, punish evil) and Water Margin in Japan",
  "Moral ambivalence of outlaw heroes as discussed in the text",
  "Censorship or moral debates around reading Chinese fiction",
  "Translation strategies from vernacular Chinese to Japanese (furigana, kundoku, glossing)",
  "Paratexts: prefaces, commentaries, reader guidance apparatus",
  "Bibliographic details: editions, reprints, circulation networks",
  "How does this book challenge older narratives about Sino-Japanese literary influence?",
  "Methodology: sources, archives, limitations mentioned by the author",
]
def t2():
  """Run every entry of ``queries`` through hybrid search and print the hits.

  Obtains a handle from ``get_db()``, then for each query calls
  ``rag.search.search_hybrid`` with ``k_vec=50``, ``k_bm25=50`` and
  ``k_final=8``. Each result is unpacked as an ``(id, text, score)`` triple
  and printed with the text cut to its first 300 characters. Returns
  nothing; output goes to stdout only.
  """
  database = get_db()
  for question in queries:
    print("query", question)
    # Hybrid retrieval followed by a CE rerank, per the original author's note.
    hits = rag.search.search_hybrid(
      database, question, k_vec=50, k_bm25=50, k_final=8
    )
    # The third element of each hit is printed under the "ce_score" label.
    for rid, txt, sim in hits:
      print(f"[{rid:04d}] ce_score={sim:.3f}\n{txt[:300]}...\n")

# Script entry: runs the hybrid-search smoke test defined in t2().
# NOTE(review): this call is unguarded, so it also fires when the module is
# imported; consider wrapping it in `if __name__ == "__main__":`.
t2()