
Commit

Expand PersHelper to support disk/memory approaches to communicate persistence, add unit tests.
user committed Apr 28, 2024
1 parent 1a1c406 commit c7b8af6
Showing 13 changed files with 593 additions and 336 deletions.
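The PersHelper class itself is not visible in the hunks shown on this page; only the commit message and the SCOREP_KERNEL_PERSISTENCE_MODE toggle in the notebook hint at its shape. A minimal, hypothetical sketch of such a disk/memory persistence helper follows — the class name PersHelperSketch, the DISK default, and the pickle transport are assumptions, not the commit's actual API:

import os
import pickle
import tempfile

class PersHelperSketch:
    """Hypothetical helper that persists objects either in-process
    (MEMORY) or via temp files (DISK). Sketch only; the real
    PersHelper in this commit is not shown in the visible hunks."""

    def __init__(self, mode=None):
        # Mode can be driven by the env var toggled in the notebook below.
        self.mode = mode or os.environ.get("SCOREP_KERNEL_PERSISTENCE_MODE", "DISK")
        self._store = {}                # MEMORY mode: bytes kept in-process
        self._dir = tempfile.mkdtemp()  # DISK mode: spill directory

    def dump(self, key, obj):
        data = pickle.dumps(obj)        # a serializer like cloudpickle could be swapped in
        if self.mode == "MEMORY":
            self._store[key] = data
        else:
            with open(os.path.join(self._dir, key), "wb") as f:
                f.write(data)

    def load(self, key):
        if self.mode == "MEMORY":
            data = self._store[key]
        else:
            with open(os.path.join(self._dir, key), "rb") as f:
                data = f.read()
        return pickle.loads(data)

Usage would be symmetric across modes — dump before an instrumented run, load afterwards — with MEMORY avoiding the filesystem round-trip.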
116 changes: 60 additions & 56 deletions examples/gpt-demo/01-GPT-Training.ipynb
@@ -16,9 +16,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"set user environment sucessfully: {'SCOREP_ENABLE_TRACING': '1', 'SCOREP_ENABLE_PROFILING': '0', 'SCOREP_TOTAL_MEMORY': '3g'}"
]
}
],
"source": [
"%%scorep_env\n",
"SCOREP_ENABLE_TRACING=1\n",
@@ -28,56 +36,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#%env SCOREP_KERNEL_PERSISTENCE_MODE MEMORY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%env"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%%switch_serializer\n",
"cloudpickle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"use the following scorep python binding arguments: --noinstrumenter"
]
}
],
"source": [
"%%scorep_python_binding_arguments\n",
"--noinstrumenter"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"filename = \"fairy_test.txt\""
"filename = \"fairytales.txt\""
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data has 49496 characters, 79 unique.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"15/09/2021 15:35:08 - INFO - model - Number of parameters : 2.531738e+07\n",
"/home/h9/s4122485/virtualenv_jupyterkernel_scorep_python/lib/python3.8/site-packages/torch/utils/data/dataloader.py:478: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
" warnings.warn(_create_warning_msg(\n",
"epoch 1 iter 96: train loss 2.31473. lr 0.00030152924503397155: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it]\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"epoch 2 iter 96: train loss 2.05380. lr 5.9999999999999995e-05: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it]\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"epoch 3 iter 96: train loss 1.88871. lr 0.0003015292450339715: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it] \n"
]
}
],
"source": [
"%%execute_with_scorep\n",
"import scorep\n",
@@ -130,33 +146,21 @@
"\n",
" from model import GPT, GPTconfig\n",
" mconf = GPTconfig(train_dataset.vocab_size, train_dataset.block_size,\n",
" n_layer=1, n_head=8, n_embd=512)\n",
" n_layer=8, n_head=8, n_embd=512)\n",
" model = GPT(mconf)\n",
"\n",
" from trainer import Trainer, TrainerConfig\n",
"\n",
" tconf = TrainerConfig(max_epochs=1, batch_size=512, learning_rate=6e-4,\n",
" tconf = TrainerConfig(max_epochs=3, batch_size=512, learning_rate=6e-4,\n",
" lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,\n",
" num_workers=4)\n",
" trainer = Trainer(model, train_dataset, None, tconf)\n",
"\n",
" torch.cuda.empty_cache()\n",
" trainer.train()\n",
"\n",
" torch.save(model.state_dict(), \"./saved_models/trained_gpt_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -165,14 +169,14 @@
"lastKernelId": null
},
"kernelspec": {
"display_name": "scorep-python",
"language": "python",
"name": "scorep-python"
"display_name": "scorep-python3",
"language": "python3",
"name": "scorep-python3"
},
"language_info": {
"file_extension": ".py",
"mimetype": "text/plain",
"name": "python"
"name": "Any text"
}
},
"nbformat": 4,
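The stderr log above reports 2.531738e+07 parameters, which lines up with the enlarged config in this diff: a GPT block's weight matrices total roughly 12 · n_layer · n_embd². A back-of-the-envelope check (block_size=128 is an assumption — the notebook's value is not visible — and layer norms/biases are ignored):

# Parameter estimate for the new config (n_layer=8, n_head=8, n_embd=512;
# vocab_size=79 from the "data has 49496 characters, 79 unique" line).
n_layer, n_embd, vocab_size, block_size = 8, 512, 79, 128

transformer = 12 * n_layer * n_embd ** 2          # attention + MLP weights
embeddings = (vocab_size + block_size) * n_embd   # token + positional tables
lm_head = n_embd * vocab_size                     # output projection

print(f"{transformer + embeddings + lm_head:.6e}")  # 2.531226e+07

The result agrees with the logged 2.531738e+07 up to the omitted layer norms and biases.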
36 changes: 0 additions & 36 deletions examples/gpt-demo/fairy_test.txt

This file was deleted.

2 changes: 1 addition & 1 deletion examples/gpt-demo/trainer.py
@@ -37,7 +37,7 @@ def __init__(self, model, train_dataset, test_dataset, config):
self.test_dataset = test_dataset
self.config = config

self.device = "cpu"
self.device = "gpu"
if torch.cuda.is_available():
self.device = torch.cuda.current_device()
self.model = torch.nn.DataParallel(self.model).to(self.device)
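For reference, a minimal sketch of the device-selection pattern this hunk implements. Note that PyTorch resolves devices from strings like "cpu"/"cuda" or an integer index — torch.cuda.current_device() returns an int — so the "gpu" literal only acts as a placeholder default that the CUDA branch overrides:

import torch

def select_device():
    # Use the current CUDA device index when available, else fall back to CPU.
    # DataParallel and .to() accept both strings ("cpu", "cuda") and int indices.
    if torch.cuda.is_available():
        return torch.cuda.current_device()  # e.g. 0
    return "cpu"

# model = torch.nn.DataParallel(model).to(select_device())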
