Commit: Update doc

jieguangzhou committed Jan 3, 2024
1 parent 95056c0 commit 535e847
Showing 2 changed files with 135 additions and 17 deletions.
97 changes: 97 additions & 0 deletions docs/hr/content/docs/ai_integrations/llm.md
@@ -0,0 +1,97 @@
# LLMs
## Inference

`superduperdb` allows users to work with LLM services and models.

The following LLM services/models are supported:

- vLLM
- OpenAI-compatible API services



### vLLM

[vLLM](https://docs.vllm.ai/en/latest/) is a fast and easy-to-use library for LLM inference and serving.

Currently, `superduperdb` supports the following ways of using vLLM:

- **VllmModel**: load a model locally or on a Ray cluster with vLLM
- **VllmOpenAI**: query a deployed [OpenAI-compatible vLLM server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server)
- **VllmAPI**: query a deployed [vLLM API server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#api-server)

#### VllmModel

`VllmModel` supports loading a model locally or onto a Ray cluster.

After instantiating with `model = VllmModel(...)`, loading is deferred: the model is only loaded once the `model.predict` method is called.

**Loading locally**

To use this model, first install `vllm`:

```bash
pip install vllm
```

```python
from superduperdb.ext.llm import VllmModel
model = VllmModel(model_name="mistralai/Mistral-7B-Instruct-v0.2")
```
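
As noted above, instantiation is lazy and no weights are loaded until prediction time. A minimal usage sketch, assuming (as this page does) that `predict` accepts a plain prompt string; check the `VllmModel` API for the exact signature:

```python
from superduperdb.ext.llm import VllmModel

model = VllmModel(model_name="mistralai/Mistral-7B-Instruct-v0.2")

# No weights have been loaded yet; the first call to predict triggers
# model loading and then runs inference. The exact predict signature is
# an assumption here, not taken from this page.
output = model.predict("What is the capital of France?")
print(output)
```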

**Loading onto a Ray cluster**

You need to install `ray` first; when loading onto a Ray cluster, the `vllm` dependency is not required locally.

> Installing `vllm` requires a CUDA environment, which makes it impossible to install on clients without a GPU. `superduperdb` therefore adapts to this: when the model is loaded onto a Ray cluster, `vllm` does not need to be installed locally.
```bash
pip install 'ray[default]'
```

```python
from superduperdb.ext.llm import VllmModel
model = VllmModel(model_name="mistralai/Mistral-7B-Instruct-v0.2", ray_address="ray://172.31.29.75:10001")
```

If this is the first run on that Ray cluster, it will take a little longer, because `vllm` is installed and the corresponding model is downloaded on the Ray cluster's servers.
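
The mechanism behind this is Ray's `runtime_env`: when connecting to the cluster, Ray is asked to install `vllm` on the workers rather than on the client. A rough sketch of the underlying Ray call (not `superduperdb` API; the address is the same placeholder as above):

```python
import ray

# Connecting with a runtime_env makes Ray install vllm on the cluster
# workers, so the local client never needs a CUDA build of vllm.
ray.init(
    address="ray://172.31.29.75:10001",
    runtime_env={"pip": ["vllm"]},
)
```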



#### VllmOpenAI

```python
from superduperdb.ext.llm import VllmOpenAI
model = VllmOpenAI(openai_api_base="http://localhost:8000/v1", model_name="mistralai/Mistral-7B-Instruct-v0.2")
```



#### VllmAPI

Initialize the model:

```python
from superduperdb.ext.llm import VllmAPI
model = VllmAPI(identifier='llm', api_url='http://localhost:8000/v1')
```



### OpenAI

`superduperdb` supports OpenAI-compatible API services. If parameters such as `openai_api_base` are not provided, the official OpenAI service is used by default.

Initialize the model:

```python
from superduperdb.ext.llm import OpenAI
model = OpenAI(openai_api_base="http://localhost:8000/v1", model_name="mistralai/Mistral-7B-Instruct-v0.2")
```
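
A sketch of the default case described above, with no `openai_api_base`, so the official OpenAI endpoint is used. The model name below is illustrative, and an `OPENAI_API_KEY` is assumed to be configured in the environment; check the `OpenAI` class for the exact parameters:

```python
from superduperdb.ext.llm import OpenAI

# No openai_api_base: the official OpenAI service is used by default.
# Assumes OPENAI_API_KEY is set in the environment; the model name is
# illustrative only.
model = OpenAI(model_name="gpt-3.5-turbo")
```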



## Training

Coming soon...
55 changes: 38 additions & 17 deletions superduperdb/ext/llm/vllm.py
@@ -133,45 +133,66 @@ def __post_init__(self):
         super().__post_init__()
 
     def init(self):
-        try:
-            from vllm import LLM
-        except ImportError:
-            raise Exception("You must install vllm with command 'pip install vllm'")
+        class _VLLMCore:
+            """
+            Wrapper for vllm model to support ray.
+            Implementing the client in this way will no longer require vllm dependencies
+            """
+
+            def __init__(self, **kwargs) -> None:
+                try:
+                    from vllm import LLM
+                except ImportError:
+                    raise Exception(
+                        "You must install vllm with command 'pip install vllm'"
+                    )
+                self.model = LLM(**kwargs)
+
+            def generate(self, prompts: List[str], **kwargs) -> List[str]:
+                from vllm import SamplingParams
+
+                sampling_params = SamplingParams(**kwargs)
+                results = self.model.generate(prompts, sampling_params, use_tqdm=False)
+                results = [result.outputs[0].text for result in results]
+                return results
 
         if self.on_ray:
             try:
                 import ray
             except ImportError:
                 raise Exception("You must install vllm with command 'pip install ray'")
 
-            runtime_env = {"pip": ["vllm"]}
+            runtime_env = {
+                "pip": [
+                    "vllm",
+                ]
+            }
             if not ray.is_initialized():
                 ray.init(address=self.ray_address, runtime_env=runtime_env)
 
             if "num_gpus" not in self.ray_config:
                 self.ray_config["num_gpus"] = self.tensor_parallel_size
-            LLM = ray.remote(**self.ray_config)(LLM).remote
+            LLM = ray.remote(**self.ray_config)(_VLLMCore).remote
+        else:
+            LLM = _VLLMCore
 
         self.llm = LLM(**self.vllm_kwargs)
 
     def _batch_generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
-        from vllm import SamplingParams
-
-        # support more parameters
-        sampling_params = SamplingParams(
-            **self.get_kwargs(SamplingParams, kwargs, self.inference_kwargs)
-        )
+        total_kwargs = {}
+        for key, value in {**self.inference_kwargs, **kwargs}.items():
+            if key in VLLM_INFERENCE_PARAMETERS_LIST:
+                total_kwargs[key] = value
 
         if self.on_ray:
             import ray
 
-            results = ray.get(
-                self.llm.generate.remote(prompts, sampling_params, use_tqdm=False)
-            )
+            results = ray.get(self.llm.generate.remote(prompts, **total_kwargs))
         else:
-            results = self.llm.generate(prompts, sampling_params, use_tqdm=False)
+            results = self.llm.generate(prompts, **total_kwargs)
+        results = [result.outputs[0].text for result in results]
 
-        return [result.outputs[0].text for result in results]
+        return results
 
     def _generate(self, prompt: str, **kwargs: Any) -> str:
         return self._batch_generate([prompt], **kwargs)[0]