This commit is contained in:
polwex 2024-11-27 22:29:30 +07:00
commit 623fe6ac7a
10 changed files with 343 additions and 0 deletions

3
.envrc Normal file
View File

@ -0,0 +1,3 @@
# Load devenv's direnv integration from a URL pinned to a specific commit;
# the second argument is the sha256 hash the download is checked against.
source_url "https://raw.githubusercontent.com/cachix/devenv/95f329d49a8a5289d31e0982652f7058a189bfca/direnvrc" "sha256-d+8cBpDfDBj41inrADaJt+bDWhOktwslgoP5YiGJ1v0="
# Activate the devenv environment for this directory.
use devenv

9
.gitignore vendored Normal file
View File

@ -0,0 +1,9 @@
# Devenv
.devenv*
devenv.local.nix
# direnv
.direnv
# pre-commit
.pre-commit-config.yaml

122
devenv.lock Normal file
View File

@ -0,0 +1,122 @@
{
"nodes": {
"devenv": {
"locked": {
"dir": "src/modules",
"lastModified": 1725964132,
"owner": "cachix",
"repo": "devenv",
"rev": "98c7c131e3fa30eb00e9bfe44c1a180c7f94102f",
"treeHash": "145543926bdaaeabf22cd0f42e2991d0e35131c4",
"type": "github"
},
"original": {
"dir": "src/modules",
"owner": "cachix",
"repo": "devenv",
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1696426674,
"owner": "edolstra",
"repo": "flake-compat",
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
"treeHash": "2addb7b71a20a25ea74feeaf5c2f6a6b30898ecb",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"gitignore": {
"inputs": {
"nixpkgs": [
"pre-commit-hooks",
"nixpkgs"
]
},
"locked": {
"lastModified": 1709087332,
"owner": "hercules-ci",
"repo": "gitignore.nix",
"rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
"treeHash": "ca14199cabdfe1a06a7b1654c76ed49100a689f9",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "gitignore.nix",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1716977621,
"owner": "cachix",
"repo": "devenv-nixpkgs",
"rev": "4267e705586473d3e5c8d50299e71503f16a6fb6",
"treeHash": "6d9f1f7ca0faf1bc2eeb397c78a49623260d3412",
"type": "github"
},
"original": {
"owner": "cachix",
"ref": "rolling",
"repo": "devenv-nixpkgs",
"type": "github"
}
},
"nixpkgs-stable": {
"locked": {
"lastModified": 1725826545,
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "f4c846aee8e1e29062aa8514d5e0ab270f4ec2f9",
"treeHash": "8fc49deaed3f2728a7147c38163cc468a117570a",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-24.05",
"repo": "nixpkgs",
"type": "github"
}
},
"pre-commit-hooks": {
"inputs": {
"flake-compat": "flake-compat",
"gitignore": "gitignore",
"nixpkgs": [
"nixpkgs"
],
"nixpkgs-stable": "nixpkgs-stable"
},
"locked": {
"lastModified": 1725513492,
"owner": "cachix",
"repo": "pre-commit-hooks.nix",
"rev": "7570de7b9b504cfe92025dd1be797bf546f66528",
"treeHash": "4b46d77870afecd8f642541cb4f4927326343b59",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "pre-commit-hooks.nix",
"type": "github"
}
},
"root": {
"inputs": {
"devenv": "devenv",
"nixpkgs": "nixpkgs",
"pre-commit-hooks": "pre-commit-hooks"
}
}
},
"root": "root",
"version": 7
}

64
devenv.nix Normal file
View File

@ -0,0 +1,64 @@
# devenv module for the embedding server: exports CUDA-related environment
# variables, installs git, the Python LSP server and the CUDA toolkit, and
# enables a Python virtualenv.
{ pkgs, lib, config, inputs, ... }:

{
  # https://devenv.sh/basics/
  env.GREET = "devenv";
  env.PYTHON_KEYRING_BACKEND = "keyring.backends.fail.Keyring";
  env.CUDA_HOME = pkgs.cudaPackages.cudatoolkit;
  env.CUDA_PATH = pkgs.cudaPackages.cudatoolkit;
  # NOTE(review): confirm that devenv expands $LD_LIBRARY_PATH here rather
  # than exporting the literal string.
  env.LD_LIBRARY_PATH = "/run/opengl-driver/lib:$LD_LIBRARY_PATH";
  env.CUDA_VISIBLE_DEVICES = "0";
  # NOTE(review): pkg/main.py overwrites this at start-up with
  # max_split_size_mb:512 — confirm which value is intended.
  env.PYTORCH_CUDA_ALLOC_CONF = "max_split_size_mb:256";
  env.EXTRA_LDFLAGS = "-L/lib -L${pkgs.linuxPackages.nvidia_x11}/lib";
  env.EXTRA_CCFLAGS = "-I/usr/include";
  # env.LD_LIBRARY_PATH = ":${pkgs.cudatoolkit}/lib:${pkgs.cudatoolkit}/lib64:${pkgs.cudatoolkit}/host-linux-x64/Mesa";

  # https://devenv.sh/packages/
  packages = with pkgs; [
    git
    python311Packages.python-lsp-server
    cudaPackages.cudatoolkit
    # gitRepo gnupg autoconf curl
    # procps gnumake utillinux m4 gperf unzip
    # linuxPackages.nvidia_x11
    # libGLU
    # xorg.libXi xorg.libXmu freeglut
    # xorg.libXext xorg.libX11 xorg.libXv xorg.libXrandr zlib
    # ncurses5 stdenv.cc binutils
    # cudaPackages.libcublas
    # cudaPackages.cudnn
  ];

  # https://devenv.sh/scripts/
  scripts.hello.exec = "echo hello from $GREET";

  enterShell = ''
    hello
    git --version
  '';

  # https://devenv.sh/tests/
  # Compare against the version of the git package actually installed above
  # instead of a hard-coded release, so the check keeps passing when the
  # pinned nixpkgs revision moves to a newer git.
  enterTest = ''
    echo "Running tests"
    git --version | grep --color=auto "${pkgs.git.version}"
  '';

  # https://devenv.sh/services/
  # services.postgres.enable = true;

  # https://devenv.sh/languages/
  # languages.nix.enable = true;
  languages.python = {
    enable = true;
    venv.enable = true;
  };

  # https://devenv.sh/pre-commit-hooks/
  # pre-commit.hooks.shellcheck.enable = true;

  # https://devenv.sh/processes/
  # processes.ping.exec = "ping example.com";

  # See full reference at https://devenv.sh/reference/options/
}

15
devenv.yaml Normal file
View File

@ -0,0 +1,15 @@
# yaml-language-server: $schema=https://devenv.sh/devenv.schema.json
inputs:
nixpkgs:
url: github:cachix/devenv-nixpkgs/rolling
# If you're using non-OSS software, you can set allowUnfree to true.
allowUnfree: true
# If you're willing to use a package that's vulnerable
# permittedInsecurePackages:
# - "openssl-1.1.1w"
# If you have more than one devenv you can merge them
#imports:
# - ./backend

78
pkg/main.py Normal file
View File

@ -0,0 +1,78 @@
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import time
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import uvicorn
import gc
import os
app = FastAPI()

# Restrict the process to GPU 0 and configure the CUDA caching allocator.
# NOTE(review): these assignments overwrite whatever the shell exported
# (devenv.nix exports max_split_size_mb:256) — confirm 512 is the intended value.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

# Load model (do this only once at startup)
print("Loading model...")
model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True, torch_dtype="auto")
model.eval()
# model.half()

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device.type)
model.to(device)

if device.type == 'cuda':
    # gc.collect()
    # A single call is enough: torch.cuda.empty_cache and
    # torch.cuda.memory.empty_cache are the same function.
    torch.cuda.empty_cache()
print(f"Model loaded and moved to {device}")
# Define request and response models
class EmbeddingRequest(BaseModel):
    # Request body accepted by POST /embed.
    # Strings to embed.
    texts: List[str]
    # True when the texts are search queries; the /embed handler then applies
    # the retrieval instruction (query_prefix) to them.
    is_query: bool


class EmbeddingResponse(BaseModel):
    # Response body returned by POST /embed.
    # Normalized embedding vectors as nested lists of floats
    # (presumably one vector per input text — confirm against the model).
    embeddings: List[List[float]]
    # Seconds elapsed inside the handler, measured from its first statement.
    time_taken: float
# Each query must be accompanied by a corresponding instruction that describes the task.
task_name_to_instruct = {
    "example": "Given a question, retrieve passages that answer the question",
}
query_prefix = f"Instruct: {task_name_to_instruct['example']}\nQuery: "
# Passed as max_length to model.encode by the /embed handler.
max_length = 32768
@app.post("/embed", response_model=EmbeddingResponse)
async def embed_texts(request: EmbeddingRequest):
    """Embed the submitted texts and return L2-normalized vectors.

    Args:
        request: The texts to embed and a flag saying whether they are
            queries. Queries are encoded with ``query_prefix`` as the
            instruction; other texts are encoded with an empty instruction.

    Returns:
        An ``EmbeddingResponse`` holding the embeddings as nested lists of
        floats and the elapsed handler time in seconds. An empty ``texts``
        list yields an empty ``embeddings`` list without calling the model.

    Raises:
        HTTPException: With status 500 and the error text as detail when
            encoding fails.
    """
    # NOTE(review): model.encode is a synchronous call inside an async
    # handler, so the event loop is presumably blocked while it runs.
    start_time = time.perf_counter()

    # Nothing to encode: answer directly instead of sending an empty batch
    # to the model.
    if not request.texts:
        return EmbeddingResponse(embeddings=[], time_taken=time.perf_counter() - start_time)

    # Pass the instruction through encode()'s own parameter, the same way
    # pkg/test.py does, rather than concatenating it into every text.
    instruction = query_prefix if request.is_query else ""

    try:
        with torch.no_grad():
            if device.type == 'cuda':
                # gc.collect()
                torch.cuda.empty_cache()
            embeddings = model.encode(request.texts, instruction=instruction, max_length=max_length)
            embeddings = F.normalize(embeddings, p=2, dim=1)
            print(f"Embedding size: {embeddings.size()}")
            embeddings_list = embeddings.cpu().float().tolist()
    except Exception as e:
        # Chain the original exception so the traceback keeps its cause.
        raise HTTPException(status_code=500, detail=str(e)) from e

    time_taken = time.perf_counter() - start_time
    return EmbeddingResponse(embeddings=embeddings_list, time_taken=time_taken)
# When executed as a script, serve the app with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

1
pkg/run.sh Normal file
View File

@ -0,0 +1 @@
# Start the ASGI app with uvicorn, listening on all interfaces with auto-reload.
# NOTE(review): no prosody.server module is visible in this commit; the FastAPI
# app is defined in pkg/main.py — confirm the module path and the use of poetry.
poetry run python -m uvicorn prosody.server:app --host 0.0.0.0 --reload

44
pkg/test.py Normal file
View File

@ -0,0 +1,44 @@
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
# Each query must be accompanied by a corresponding instruction that describes the task.
task_name_to_instruct = {
    "example": "Given a question, retrieve passages that answer the question",
}
query_prefix = f"Instruct: {task_name_to_instruct['example']}\nQuery: "

# Sample queries.
queries = [
    "are judo throws allowed in wrestling?",
    "how to become a radiology technician in michigan?",
]

# Retrieval passages are encoded without an instruction.
passage_prefix = ""
passages = [
    "Since you're reading this, you are probably someone from a judo background or someone who is just wondering how judo techniques can be applied under wrestling rules. So without further ado, let's get to the question. Are Judo throws allowed in wrestling? Yes, judo throws are allowed in freestyle and folkstyle wrestling. You only need to be careful to follow the slam rules when executing judo throws. In wrestling, a slam is lifting and returning an opponent to the mat with unnecessary force.",
    "Below are the basic steps to becoming a radiologic technologist in Michigan:Earn a high school diploma. As with most careers in health care, a high school education is the first step to finding entry-level employment. Taking classes in math and science, such as anatomy, biology, chemistry, physiology, and physics, can help prepare students for their college studies and future careers.Earn an associate degree. Entry-level radiologic positions typically require at least an Associate of Applied Science. Before enrolling in one of these degree programs, students should make sure it has been properly accredited by the Joint Review Committee on Education in Radiologic Technology (JRCERT).Get licensed or certified in the state of Michigan.",
]
# Load the model; encode() below is called on the object returned here.
model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True)

# Report which accelerator is available.
# NOTE(review): the device is only printed; this script never moves the model to it.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device.type)

max_length = 32768


def _encode_normalized(texts, instruction):
    """Encode texts with the given instruction, then L2-normalize along dim 1."""
    raw = model.encode(texts, instruction=instruction, max_length=max_length)
    return F.normalize(raw, p=2, dim=1)


# Embed queries (with the instruction) and passages (without one).
query_embeddings = _encode_normalized(queries, query_prefix)
passage_embeddings = _encode_normalized(passages, passage_prefix)

# Alternative: get the embeddings with a DataLoader (splitting the datasets into multiple mini-batches)
# batch_size=2
# query_embeddings = model._do_encode(queries, batch_size=batch_size, instruction=query_prefix, max_length=max_length, num_workers=32, return_numpy=True)
# passage_embeddings = model._do_encode(passages, batch_size=batch_size, instruction=passage_prefix, max_length=max_length, num_workers=32, return_numpy=True)

# Query-by-passage similarity of the normalized embeddings, scaled by 100.
scores = (query_embeddings @ passage_embeddings.T) * 100
print(scores.tolist())
# [[87.42693328857422, 0.46283677220344543], [0.965264618396759, 86.03721618652344]]

1
pkg/tunnel.sh Normal file
View File

@ -0,0 +1 @@
# Keep a reverse SSH tunnel to y@sortug alive: port 8000 on the remote side is
# forwarded to localhost:8000 on this machine (-R 8000:localhost:8000).
# -M 0 turns off autossh's own monitoring port; the ServerAliveInterval /
# ServerAliveCountMax options make ssh probe the server every 30 seconds and
# give up after 3 unanswered probes.
autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -R 8000:localhost:8000 y@sortug

6
test.sh Normal file
View File

@ -0,0 +1,6 @@
# Smoke test: POST two query texts as JSON to the /embed endpoint on localhost:8000.
curl -X POST "http://localhost:8000/embed" \
-H "Content-Type: application/json" \
-d '{
"texts": ["What is the capital of France?", "Who wrote Romeo and Juliet?"],
"is_query": true
}'