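"""Scratch script for manually exercising the RAG pipeline: t() sanity-checks a
single embedding call, t2() runs the queries below against the search index."""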
from pathlib import Path
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from transformers import AutoModel
from sentence_transformers import SentenceTransformer
from rag.db import get_db
from rag.rerank import get_rerank_model
import rag.ingest
import rag.search
import torch
# One-off docling conversion + chunking experiment, kept for reference:
# converter = DocumentConverter()
# chunker = HybridChunker()
# file = Path("yek.md")
# doc = converter.convert(file).document
# chunk_iter = chunker.chunk(doc)
# for chunk in chunk_iter:
#     print(chunk)
#     txt = chunker.contextualize(chunk)
#     print(txt)
def t():
    # Sanity-check a single embedding call and inspect the returned array.
    batch: list[str] = [
        "This son of a bitch has gone too far",
        "Fuck me baby please",
        "I'm hungry now",
        "Charlie Kirk is dead",
    ]
    model = rag.ingest.get_embed_model()
    v = model.encode("pepeee", normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
    print("v")
    print(type(v))
    print(v)
    print(v.dtype)
    # With convert_to_numpy=True, v is a numpy array, which has no .device
    # attribute on older numpy versions; don't assume it exists.
    print(getattr(v, "device", "n/a (numpy array)"))
    # Leftovers from when the embeddings were still torch tensors:
    # V = torch.cat([v], dim=0)
    # print("V")
    # print(type(V))
    # print(V)
    # print(V.dtype)
    # print(V.device)
    # print("V_np")
    # V_idk = V.cpu().float()
    # V = embed_many(chunks)  # float32/fp16 on CPU? ensure float32 for the DB:
    # V_np = V.float().cpu().numpy().astype("float32")
    # DIM = V_np.shape[1]
    # db = sqlite3.connect("./rag.db")
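    # Sketch: batch-embed the otherwise unused `batch` list. This assumes
    # get_embed_model() returns a SentenceTransformer-style model whose
    # encode() also accepts a list of strings and returns a (len(batch), dim) array.
    M = model.encode(batch, normalize_embeddings=True, convert_to_numpy=True)
    print("batch embeddings:", M.shape, M.dtype)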
queries = [
    # "How was Shuihu zhuan received in early modern Japan?",
    # "Edo-period readers’ image of Song Jiang / Liangshan outlaws",
    # "Channels of transmission for Chinese vernacular fiction into Japan (kanbun kundoku, digests, translations)",
    # "Role of woodblock prints/illustrations in mediating Chinese fiction",
    # "Key Japanese scholars, writers, or publishers who popularized Chinese fiction",
    # "Kyokutei Bakin’s engagement with Chinese vernacular narrative",
    # "Santō Kyōden, gesaku, and Chinese models",
    # "Kanzen chōaku (encourage good, punish evil) and Water Margin in Japan",
    # "Moral ambivalence of outlaw heroes as discussed in the text",
    # "Censorship or moral debates around reading Chinese fiction",
    # "Translation strategies from vernacular Chinese to Japanese (furigana, kundoku, glossing)",
    # "Paratexts: prefaces, commentaries, reader guidance apparatus",
    # "Bibliographic details: editions, reprints, circulation networks",
    # "How does this book challenge older narratives about Sino-Japanese literary influence?",
    # "Methodology: sources, archives, limitations mentioned by the author",
    "sex",
]
def t2():
    # Run every query in `queries` against the index and print the top hits.
    db = get_db()
    model = rag.ingest.get_embed_model()
    for query in queries:
        print("query", query)
        print("-----------\n\n")
        # Hybrid + cross-encoder rerank variant:
        # results = rag.search.search_hybrid(db, "muh", query, k_vec=50, k_bm25=50, k_final=8)
        # for rid, txt, score in results:
        #     print(f"[{rid:04d}] ce_score={score:.3f}\n{txt[:300]}...\n")
        # Plain vector search:
        results = rag.search.vec_search(
            db, model, "muh", query,
            k=50, min_sim=0.25, max_per_doc=5, use_mmr=False, mmr_lambda=0.7,
        )
        for rid, txt, sim in results:
            # vec_search scores are vector similarities, not cross-encoder scores
            print(f"[{rid:04d}] sim={sim:.3f}\n{txt[:300]}...\n")
if __name__ == "__main__":
    t2()