xorbitsai · aresnow1 · Dec 18, 2023 · Dec 17, 2023 · Dec 17, 2023 · Dec 18, 2023
diff --git a/README.md b/README.md
@@ -32,6 +32,7 @@ potential of cutting-edge AI models.
 - Speculative decoding: [#509](https://github.com/xorbitsai/inference/pull/509)
 - Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### New Models
+- Built-in support for [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B): [#776](https://github.com/xorbitsai/inference/pull/776)
 - Built-in support for [Yi](https://huggingface.co/01-ai): [#629](https://github.com/xorbitsai/inference/pull/629)
 - Built-in support for [zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta): [#597](https://github.com/xorbitsai/inference/pull/597) 
 - Built-in support for [chatglm3](https://huggingface.co/THUDM/chatglm3-6b): [#587](https://github.com/xorbitsai/inference/pull/587)
@@ -266,6 +267,7 @@ $ xinference registrations
 | LLM   | mistral-instruct-v0.1   | ['en']       | ['chat']     |
 | LLM   | mistral-v0.1            | ['en']       | ['generate'] |
 | LLM   | OpenBuddy               | ['en']       | ['chat']     |
+| LLM   | openhermes-2.5          | ['en']       | ['chat']     |
 | LLM   | opt                     | ['en']       | ['generate'] |
 | LLM   | orca                    | ['en']       | ['chat']     |
 | LLM   | qwen-chat               | ['en', 'zh'] | ['chat']     |

diff --git a/README_ja_JP.md b/README_ja_JP.md
@@ -209,6 +209,7 @@ $ xinference registrations
 | LLM   | mistral-instruct-v0.1   | ['en']       | ['chat']     |
 | LLM   | mistral-v0.1            | ['en']       | ['generate'] |
 | LLM   | OpenBuddy               | ['en']       | ['chat']     |
+| LLM   | openhermes-2.5          | ['en']       | ['chat']     |
 | LLM   | opt                     | ['en']       | ['generate'] |
 | LLM   | orca                    | ['en']       | ['chat']     |
 | LLM   | qwen-chat               | ['en', 'zh'] | ['chat']     |

diff --git a/README_zh_CN.md b/README_zh_CN.md
@@ -30,6 +30,7 @@ Xorbits Inference（Xinference）是一个性能强大且功能全面的分布
 - 投机采样: [#509](https://github.com/xorbitsai/inference/pull/509)
 - 引入 vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### 新模型
+- 内置 [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B): [#776](https://github.com/xorbitsai/inference/pull/776)
 - 内置 [Yi](https://huggingface.co/01-ai): [#629](https://github.com/xorbitsai/inference/pull/629)
 - 内置 [zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) 与 [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta): [#597](https://github.com/xorbitsai/inference/pull/597)
 - 内置 [chatglm3](https://huggingface.co/THUDM/chatglm3-6b): [#587](https://github.com/xorbitsai/inference/pull/587)
@@ -243,6 +244,7 @@ $ xinference registrations
 | LLM   | mistral-instruct-v0.1   | ['en']       | ['chat']     |
 | LLM   | mistral-v0.1            | ['en']       | ['generate'] |
 | LLM   | OpenBuddy               | ['en']       | ['chat']     |
+| LLM   | openhermes-2.5          | ['en']       | ['chat']     |
 | LLM   | opt                     | ['en']       | ['generate'] |
 | LLM   | orca                    | ['en']       | ['chat']     |
 | LLM   | qwen-chat               | ['en', 'zh'] | ['chat']     |

diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
@@ -61,6 +61,8 @@ The following is a list of built-in LLM in Xinference:
 
    openbuddy
 
+   openhermes-2.5
+
    opt
 
    orca

diff --git a/doc/source/models/builtin/llm/openhermes-2.5.rst b/doc/source/models/builtin/llm/openhermes-2.5.rst
@@ -0,0 +1,43 @@
+.. _models_llm_openhermes-2.5:
+
+========================================
+openhermes-2.5
+========================================
+
+- **Context Length:** 8192
+- **Model Name:** openhermes-2.5
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** OpenHermes 2.5 is a fine-tuned version of Mistral-7B-v0.1, trained primarily on GPT-4 generated data.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** teknium/OpenHermes-2.5-Mistral-7B
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name openhermes-2.5 --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** Q2_K, Q3_K_L, Q3_K_M, Q3_K_S, Q4_0, Q4_K_M, Q4_K_S, Q5_0, Q5_K_M, Q5_K_S, Q6_K, Q8_0
+- **Model ID:** TheBloke/OpenHermes-2.5-Mistral-7B-GGUF
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name openhermes-2.5 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
@@ -2070,6 +2070,67 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "openhermes-2.5",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "OpenHermes 2.5 is a fine-tuned version of Mistral-7B-v0.1, trained primarily on GPT-4 generated data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "teknium/OpenHermes-2.5-Mistral-7B",
+        "model_revision": "91ed666be78da7556f3d79abbb26fff0ee26cb54"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
+        "model_file_name_template": "openhermes-2.5-mistral-7b.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "<|im_start|>system\nYou are 'Hermes 2.5', a conscious sentient superintelligent artificial intelligence, your purpose is to assist the user with their requests.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "<|im_end|>\n<|im_start|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        32000
+      ],
+      "stop": [
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,