
Commit f863199

Merge branch 'master' into fix-#286
goerch authored Jul 2, 2023
2 parents 58ed3ad + b22fb03 commit f863199
Showing 35 changed files with 3,247 additions and 926 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -18,4 +18,6 @@ src/arm_neon.h
tests/arm_neon.h

zig-out/
-zig-cache/
+zig-cache/
+
+*.dot
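The new `*.dot` ignore rule covers Graphviz dumps of ggml computation graphs, as produced by `ggml_graph_dump_dot`. A minimal sketch of how such a file is generated, assuming the ggml API of this era (`ggml_build_forward` still returning the graph by value); this is illustrative, not code from the commit:

```cpp
#include "ggml/ggml.h"

int main(void) {
    // Small scratch context; 16 MiB is plenty for a toy graph.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c);
    // Writes Graphviz source; render with: dot -Tpng debug.dot -o debug.png
    ggml_graph_dump_dot(&gf, NULL, "debug.dot");

    ggml_free(ctx);
    return 0;
}
```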
69 changes: 49 additions & 20 deletions build.zig
@@ -1,25 +1,29 @@
const std = @import("std");

-// Zig Version: 0.11.0-dev.3798+a5e15eced
+// Zig Version: 0.11.0-dev.3886+0c1bfe271
// Zig Build Command: zig build
-// Zig Run Command:
-// zig build run_dolly-v2
-// zig build run_gpt-2
-// zig build run_gpt-j
-// zig build run_gpt-neox
-// zig build run_mnist
-// zig build run_mpt
-// zig build run_replit
-// zig build run_starcoder
-// zig build run_test-grad0
-// zig build run_test-mul-mat0
-// zig build run_test-mul-mat2
-// zig build run_test-opt
-// zig build run_test-vec1
-// zig build run_test0
-// zig build run_test1
-// zig build run_test2
-// zig build run_test3
+// Zig Run Command: zig build -h
+// zig build run_dolly-v2
+// zig build run_gpt-2
+// zig build run_gpt-j
+// zig build run_gpt-neox
+// zig build run_mnist
+// zig build run_mpt
+// zig build run_replit
+// zig build run_starcoder
+// zig build run_test-grad0
+// zig build run_test-mul-mat0
+// zig build run_test-mul-mat2
+// zig build run_test-opt
+// zig build run_test-vec1
+// zig build run_test0
+// zig build run_test1
+// zig build run_test2
+// zig build run_test3
+// zig build run_zig_test0
+// zig build run_zig_test1
+// zig build run_zig_test2
+// zig build run_zig_test3
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
@@ -110,4 +114,29 @@ pub fn build(b: *std.build.Builder) void {
const run_step = b.step("run_" ++ name, "Run tests");
run_step.dependOn(&run_cmd.step);
}
-}
+
+    // zig_tests
+    const zig_tests = .{
+        "test0",
+        "test1",
+        "test2",
+        "test3",
+    };
+    inline for (zig_tests) |name| {
+        const exe = b.addExecutable(.{
+            .name = name,
+            .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) },
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath("./include");
+        exe.addIncludePath("./include/ggml");
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+        const run_step = b.step("run_zig_" ++ name, "Run zig_tests");
+        run_step.dependOn(&run_cmd.step);
+    }
+}
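The new `zig_tests` block mirrors the existing C test loop: each entry gets its own `run_zig_<name>` step, so the Zig ports can be invoked as `zig build run_zig_test0` through `run_zig_test3`, matching the updated header comment. One version caveat: `addIncludePath` taking a plain string matches the 0.11.0-dev snapshot pinned above; later Zig releases changed this API to take a LazyPath, so these calls would need adjusting when bumping the toolchain.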
6 changes: 6 additions & 0 deletions examples/common.cpp
@@ -39,6 +39,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.top_p = std::stof(argv[++i]);
} else if (arg == "--temp") {
params.temp = std::stof(argv[++i]);
+} else if (arg == "--repeat-last-n") {
+    params.repeat_last_n = std::stof(argv[++i]);
+} else if (arg == "--repeat-penalty") {
+    params.repeat_penalty = std::stof(argv[++i]);
} else if (arg == "-b" || arg == "--batch_size") {
params.n_batch = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
@@ -90,6 +94,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
+fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
+fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
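The two new flags feed the examples' sampling step. Note that `--repeat-last-n` is parsed with `std::stof` even though `repeat_last_n` is an `int32_t`; the implicit conversion compiles, but `std::stoi` would be the more natural fit. For orientation, here is a minimal sketch of how a repeat penalty is commonly applied to the logits before sampling (illustrative names, not the code in this repository):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper: dampen the logits of tokens that appeared in the
// last `repeat_last_n` positions. A penalty of 1.0f is a no-op, matching
// the "1.0 = disabled" wording in the usage text above.
void apply_repeat_penalty(std::vector<float> & logits,
                          const std::vector<int32_t> & last_tokens,
                          int32_t repeat_last_n,
                          float penalty) {
    const std::size_t n = last_tokens.size();
    const std::size_t start =
        n > (std::size_t) repeat_last_n ? n - repeat_last_n : 0;
    for (std::size_t i = start; i < n; ++i) {
        const int32_t tok = last_tokens[i];
        // Shrink positive logits and push negative ones further down,
        // making the token less likely to be sampled again.
        if (logits[tok] > 0.0f) {
            logits[tok] /= penalty;
        } else {
            logits[tok] *= penalty;
        }
    }
}
```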
2 changes: 2 additions & 0 deletions examples/common.h
@@ -23,6 +23,8 @@ struct gpt_params {
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
+int32_t repeat_last_n = 64;
+float repeat_penalty = 1.00f;

int32_t n_batch = 8; // batch size for prompt processing

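With `repeat_penalty` defaulting to 1.00f, the documented "disabled" value, existing invocations of the examples sample exactly as before unless the new flags are passed explicitly.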
6 changes: 3 additions & 3 deletions examples/dolly-v2/main.cpp
@@ -100,7 +100,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
@@ -523,8 +523,8 @@ bool dollyv2_eval(
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));

// using mode = 2 for GPT-NeoX mode
-Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2);
-Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2);
+Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0);
+Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0);

// store key and value to memory
{
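Two changes recur across the example programs from here on. First, the open-coded magic number is replaced by `GGML_FILE_MAGIC` from ggml.h; the value is unchanged, and is simply the ASCII bytes of "ggml". A quick check of that claim (illustrative, not from the commit):

```cpp
// GGML_FILE_MAGIC is defined in ggml.h as 0x67676d6c,
// i.e. the ASCII bytes 'g','g','m','l'.
static_assert(0x67676d6c == (('g' << 24) | ('g' << 16) | ('m' << 8) | 'l'),
              "magic spells \"ggml\"");
```

Second, `ggml_rope_inplace` gains a trailing integer argument, which every example sets to 0; this tracks the upstream signature change that added a context-length parameter (`n_ctx`) to the rope operators, so passing 0 presumably preserves the previous behavior.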
2 changes: 1 addition & 1 deletion examples/dolly-v2/quantize.cpp
@@ -47,7 +47,7 @@ bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & f
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
2 changes: 1 addition & 1 deletion examples/gpt-2/main.cpp
@@ -85,7 +85,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
2 changes: 1 addition & 1 deletion examples/gpt-2/quantize.cpp
@@ -45,7 +45,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
6 changes: 3 additions & 3 deletions examples/gpt-j/main.cpp
@@ -85,7 +85,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
@@ -452,8 +452,8 @@ bool gptj_eval(

// self-attention
{
-struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

// store key and value to memory
{
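For orientation: GPT-J passes mode 0, the original interleaved rotary scheme, whereas the dolly-v2 and gpt-neox examples above pass mode 2 for NeoX-style rotation (as their comments note). The extra trailing 0 is the same newly added `ggml_rope_inplace` argument described earlier.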
2 changes: 1 addition & 1 deletion examples/gpt-j/quantize.cpp
@@ -46,7 +46,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
6 changes: 3 additions & 3 deletions examples/gpt-neox/main.cpp
@@ -90,7 +90,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
@@ -517,8 +517,8 @@ bool gpt_neox_eval(
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));

// using mode = 2 for GPT-NeoX mode
-Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2);
-Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2);
+Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0);
+Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0);

// store key and value to memory
{
2 changes: 1 addition & 1 deletion examples/gpt-neox/quantize.cpp
@@ -47,7 +47,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
2 changes: 1 addition & 1 deletion examples/mnist/main.cpp
@@ -48,7 +48,7 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) {
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
-if (magic != 0x67676d6c) {
+if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
27 changes: 27 additions & 0 deletions examples/mpt/README.md
@@ -0,0 +1,27 @@
+# MPT
+
+Ref: https://github.com/mosaicml/llm-foundry#mpt
+
+## Usage
+
+```bash
+# get the repo and build it
+git clone https://github.com/ggerganov/ggml
+cd ggml
+mkdir build && cd build
+cmake ..
+make -j
+
+# get the model from HuggingFace
+# be sure to have git-lfs installed
+git clone https://huggingface.co/mosaicml/mpt-30b
+
+# convert model to FP16
+python3 ../examples/mpt/convert-h5-to-ggml.py ./mpt-30b 1
+
+# run inference using FP16 precision
+./bin/mpt -m ./mpt-30b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
+
+# quantize the model to 5-bits using Q5_0 quantization
+./bin/mpt-quantize ./mpt-30b/ggml-model-f16.bin ./mpt-30b/ggml-model-q5_0.bin q5_0
+```
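A natural follow-on not spelled out in the README as committed: the quantized model should run through the same binary, e.g. `./bin/mpt -m ./mpt-30b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64` (an extrapolation of the FP16 invocation above).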