From 623fe6ac7a292db40d215713d5dfe16a94b8d238 Mon Sep 17 00:00:00 2001
From: polwex <polwex@sortug.com>
Date: Wed, 27 Nov 2024 22:29:30 +0700
Subject: [PATCH] init

---
 .envrc        |   3 ++
 .gitignore    |   9 ++++
 devenv.lock   | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++
 devenv.nix    |  64 ++++++++++++++++++++++++++
 devenv.yaml   |  15 +++++++
 pkg/main.py   |  78 ++++++++++++++++++++++++++++++++
 pkg/run.sh    |   1 +
 pkg/test.py   |  44 ++++++++++++++++++
 pkg/tunnel.sh |   1 +
 test.sh       |   6 +++
 10 files changed, 343 insertions(+)
 create mode 100644 .envrc
 create mode 100644 .gitignore
 create mode 100644 devenv.lock
 create mode 100644 devenv.nix
 create mode 100644 devenv.yaml
 create mode 100644 pkg/main.py
 create mode 100644 pkg/run.sh
 create mode 100644 pkg/test.py
 create mode 100644 pkg/tunnel.sh
 create mode 100644 test.sh

diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000..5bf8fc1
--- /dev/null
+++ b/.envrc
@@ -0,0 +1,3 @@
+source_url "https://raw.githubusercontent.com/cachix/devenv/95f329d49a8a5289d31e0982652f7058a189bfca/direnvrc" "sha256-d+8cBpDfDBj41inrADaJt+bDWhOktwslgoP5YiGJ1v0="
+
+use devenv
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4d058db
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+# Devenv
+.devenv*
+devenv.local.nix
+
+# direnv
+.direnv
+
+# pre-commit
+.pre-commit-config.yaml
diff --git a/devenv.lock b/devenv.lock
new file mode 100644
index 0000000..0e20c62
--- /dev/null
+++ b/devenv.lock
@@ -0,0 +1,122 @@
+{
+  "nodes": {
+    "devenv": {
+      "locked": {
+        "dir": "src/modules",
+        "lastModified": 1725964132,
+        "owner": "cachix",
+        "repo": "devenv",
+        "rev": "98c7c131e3fa30eb00e9bfe44c1a180c7f94102f",
+        "treeHash": "145543926bdaaeabf22cd0f42e2991d0e35131c4",
+        "type": "github"
+      },
+      "original": {
+        "dir": "src/modules",
+        "owner": "cachix",
+        "repo": "devenv",
+        "type": "github"
+      }
+    },
+    "flake-compat": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1696426674,
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
+        "treeHash": "2addb7b71a20a25ea74feeaf5c2f6a6b30898ecb",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "gitignore": {
+      "inputs": {
+        "nixpkgs": [
+          "pre-commit-hooks",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1709087332,
+        "owner": "hercules-ci",
+        "repo": "gitignore.nix",
+        "rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
+        "treeHash": "ca14199cabdfe1a06a7b1654c76ed49100a689f9",
+        "type": "github"
+      },
+      "original": {
+        "owner": "hercules-ci",
+        "repo": "gitignore.nix",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1716977621,
+        "owner": "cachix",
+        "repo": "devenv-nixpkgs",
+        "rev": "4267e705586473d3e5c8d50299e71503f16a6fb6",
+        "treeHash": "6d9f1f7ca0faf1bc2eeb397c78a49623260d3412",
+        "type": "github"
+      },
+      "original": {
+        "owner": "cachix",
+        "ref": "rolling",
+        "repo": "devenv-nixpkgs",
+        "type": "github"
+      }
+    },
+    "nixpkgs-stable": {
+      "locked": {
+        "lastModified": 1725826545,
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "f4c846aee8e1e29062aa8514d5e0ab270f4ec2f9",
+        "treeHash": "8fc49deaed3f2728a7147c38163cc468a117570a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-24.05",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "pre-commit-hooks": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "gitignore": "gitignore",
+        "nixpkgs": [
+          "nixpkgs"
+        ],
+        "nixpkgs-stable": "nixpkgs-stable"
+      },
+      "locked": {
+        "lastModified": 1725513492,
+        "owner": "cachix",
+        "repo": "pre-commit-hooks.nix",
+        "rev": "7570de7b9b504cfe92025dd1be797bf546f66528",
+        "treeHash": "4b46d77870afecd8f642541cb4f4927326343b59",
+        "type": "github"
+      },
+      "original": {
+        "owner": "cachix",
+        "repo": "pre-commit-hooks.nix",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "devenv": "devenv",
+        "nixpkgs": "nixpkgs",
+        "pre-commit-hooks": "pre-commit-hooks"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/devenv.nix b/devenv.nix
new file mode 100644
index 0000000..853e017
--- /dev/null
+++ b/devenv.nix
@@ -0,0 +1,64 @@
+{ pkgs, lib, config, inputs, ... }:
+
+{
+  # https://devenv.sh/basics/
+  env.GREET = "devenv";
+  env.PYTHON_KEYRING_BACKEND = "keyring.backends.fail.Keyring";
+  env.CUDA_HOME = pkgs.cudaPackages.cudatoolkit;
+  env.CUDA_PATH = pkgs.cudaPackages.cudatoolkit;
+  env.LD_LIBRARY_PATH = "/run/opengl-driver/lib:$LD_LIBRARY_PATH";
+  env.CUDA_VISIBLE_DEVICES = "0";
+  env.PYTORCH_CUDA_ALLOC_CONF = "max_split_size_mb:256";
+  env.EXTRA_LDFLAGS = "-L/lib -L${pkgs.linuxPackages.nvidia_x11}/lib";
+  env.EXTRA_CCFLAGS = "-I/usr/include";
+  # env.LD_LIBRARY_PATH = ":${pkgs.cudatoolkit}/lib:${pkgs.cudatoolkit}/lib64:${pkgs.cudatoolkit}/host-linux-x64/Mesa";
+
+
+  # https://devenv.sh/packages/
+  packages = with pkgs;[ 
+    git 
+    python311Packages.python-lsp-server
+    cudaPackages.cudatoolkit
+    # gitRepo gnupg autoconf curl
+    # procps gnumake utillinux m4 gperf unzip
+    # linuxPackages.nvidia_x11
+    # libGLU
+    # xorg.libXi xorg.libXmu freeglut
+    # xorg.libXext xorg.libX11 xorg.libXv xorg.libXrandr zlib 
+    # ncurses5 stdenv.cc binutils
+    # cudaPackages.libcublas
+    # cudaPackages.cudnn
+  ];
+
+  # https://devenv.sh/scripts/
+  scripts.hello.exec = "echo hello from $GREET";
+
+  enterShell = ''
+    hello
+    git --version
+  '';
+
+  # https://devenv.sh/tests/
+  enterTest = ''
+    echo "Running tests"
+    git --version | grep --color=auto "${pkgs.git.version}"
+  '';
+
+  # https://devenv.sh/services/
+  # services.postgres.enable = true;
+
+  # https://devenv.sh/languages/
+  # languages.nix.enable = true;
+  languages.python = {
+    enable = true;
+    venv.enable = true;
+  };
+
+  # https://devenv.sh/pre-commit-hooks/
+  # pre-commit.hooks.shellcheck.enable = true;
+
+  # https://devenv.sh/processes/
+  # processes.ping.exec = "ping example.com";
+
+  # See full reference at https://devenv.sh/reference/options/
+}
diff --git a/devenv.yaml b/devenv.yaml
new file mode 100644
index 0000000..01189cf
--- /dev/null
+++ b/devenv.yaml
@@ -0,0 +1,15 @@
+# yaml-language-server: $schema=https://devenv.sh/devenv.schema.json
+inputs:
+  nixpkgs:
+    url: github:cachix/devenv-nixpkgs/rolling
+
+# If you're using non-OSS software, you can set allowUnfree to true.
+allowUnfree: true
+
+# If you're willing to use a package that's vulnerable
+# permittedInsecurePackages:
+#  - "openssl-1.1.1w"
+
+# If you have more than one devenv you can merge them
+#imports:
+# - ./backend
diff --git a/pkg/main.py b/pkg/main.py
new file mode 100644
index 0000000..a01030c
--- /dev/null
+++ b/pkg/main.py
@@ -0,0 +1,78 @@
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+import time
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List
+import uvicorn
+import gc
+import os
+
+app = FastAPI()
+os.environ['CUDA_VISIBLE_DEVICES'] = '0' 
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
+
+# Load model (do this only once at startup)
+print("Loading model...")
+model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True, torch_dtype="auto")
+model.eval()
+# model.half()
+
+device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
+print(device.type)
+model.to(device)
+if device.type == 'cuda':
+    # Release unoccupied cached allocator memory left over from loading the
+    # weights. A single call is enough: `torch.cuda.memory.empty_cache` is the
+    # same function as `torch.cuda.empty_cache`.
+    torch.cuda.empty_cache()
+
+
+print(f"Model loaded and moved to {device}")
+
+# Define request and response models
+class EmbeddingRequest(BaseModel):
+    texts: List[str]
+    is_query: bool
+
+class EmbeddingResponse(BaseModel):
+    embeddings: List[List[float]]
+    time_taken: float
+
+# Each query needs to be accompanied by a corresponding instruction describing the task.
+task_name_to_instruct = {"example": "Given a question, retrieve passages that answer the question",}
+query_prefix = "Instruct: " + task_name_to_instruct["example"] + "\nQuery: "
+
+max_length = 32768
+
+@app.post("/embed", response_model=EmbeddingResponse)
+async def embed_texts(request: EmbeddingRequest):
+    """Embed ``request.texts`` and return L2-normalised vectors with the elapsed time.
+
+    Query texts are encoded with the retrieval instruction, passages without one.
+    Raises HTTPException (500) carrying the error text if encoding fails.
+    """
+    start_time = time.time()
+    # Pass the prefix through `instruction=` -- the same call pkg/test.py makes --
+    # rather than concatenating it into every text before encoding.
+    instruction = query_prefix if request.is_query else ""
+
+    try:
+        with torch.no_grad():
+            if device.type == 'cuda':
+                # Free unoccupied cached GPU memory before encoding a new batch.
+                torch.cuda.empty_cache()
+
+            embeddings = model.encode(request.texts, instruction=instruction, max_length=max_length)
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+            print(f"Embedding size: {embeddings.size()}")
+            embeddings_list = embeddings.cpu().float().tolist()
+
+        time_taken = time.time() - start_time
+        return EmbeddingResponse(embeddings=embeddings_list, time_taken=time_taken)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/pkg/run.sh b/pkg/run.sh
new file mode 100644
index 0000000..dd972d5
--- /dev/null
+++ b/pkg/run.sh
@@ -0,0 +1 @@
+python -m uvicorn main:app --host 0.0.0.0 --reload
diff --git a/pkg/test.py b/pkg/test.py
new file mode 100644
index 0000000..af72411
--- /dev/null
+++ b/pkg/test.py
@@ -0,0 +1,44 @@
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+
+# Each query needs to be accompanied by a corresponding instruction describing the task.
+task_name_to_instruct = {"example": "Given a question, retrieve passages that answer the question",}
+
+query_prefix = "Instruct: "+task_name_to_instruct["example"]+"\nQuery: "
+queries = [
+    'are judo throws allowed in wrestling?', 
+    'how to become a radiology technician in michigan?'
+    ]
+
+# No instruction needed for retrieval passages
+passage_prefix = ""
+passages = [
+    "Since you're reading this, you are probably someone from a judo background or someone who is just wondering how judo techniques can be applied under wrestling rules. So without further ado, let's get to the question. Are Judo throws allowed in wrestling? Yes, judo throws are allowed in freestyle and folkstyle wrestling. You only need to be careful to follow the slam rules when executing judo throws. In wrestling, a slam is lifting and returning an opponent to the mat with unnecessary force.",
+    "Below are the basic steps to becoming a radiologic technologist in Michigan:Earn a high school diploma. As with most careers in health care, a high school education is the first step to finding entry-level employment. Taking classes in math and science, such as anatomy, biology, chemistry, physiology, and physics, can help prepare students for their college studies and future careers.Earn an associate degree. Entry-level radiologic positions typically require at least an Associate of Applied Science. Before enrolling in one of these degree programs, students should make sure it has been properly accredited by the Joint Review Committee on Education in Radiologic Technology (JRCERT).Get licensed or certified in the state of Michigan."
+]
+
+# load model with tokenizer
+model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True)
+device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
+print(device.type)
+
+
+# get the embeddings
+max_length = 32768
+query_embeddings = model.encode(queries, instruction=query_prefix, max_length=max_length)
+passage_embeddings = model.encode(passages, instruction=passage_prefix, max_length=max_length)
+
+# normalize embeddings
+query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
+passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)
+
+# get the embeddings with DataLoader (splitting the datasets into multiple mini-batches)
+# batch_size=2
+# query_embeddings = model._do_encode(queries, batch_size=batch_size, instruction=query_prefix, max_length=max_length, num_workers=32, return_numpy=True)
+# passage_embeddings = model._do_encode(passages, batch_size=batch_size, instruction=passage_prefix, max_length=max_length, num_workers=32, return_numpy=True)
+
+scores = (query_embeddings @ passage_embeddings.T) * 100
+print(scores.tolist())
+# [[87.42693328857422, 0.46283677220344543], [0.965264618396759, 86.03721618652344]]
+
diff --git a/pkg/tunnel.sh b/pkg/tunnel.sh
new file mode 100644
index 0000000..97ceaf6
--- /dev/null
+++ b/pkg/tunnel.sh
@@ -0,0 +1 @@
+autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -R 8000:localhost:8000 y@sortug
diff --git a/test.sh b/test.sh
new file mode 100644
index 0000000..c9836d3
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,6 @@
+curl -X POST "http://localhost:8000/embed" \
+     -H "Content-Type: application/json" \
+     -d '{
+       "texts": ["What is the capital of France?", "Who wrote Romeo and Juliet?"],
+       "is_query": true
+     }'