
Commit

Expand PersHelper to support disk/memory approaches to communicate persistence, add unit tests.
user committed Apr 28, 2024
1 parent 1a1c406 commit c7b8af6
Showing 13 changed files with 593 additions and 336 deletions.
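The PersHelper class itself is not visible in the hunks shown on this page; only the commit message and the SCOREP_KERNEL_PERSISTENCE_MODE toggle in the notebook hint at its shape. A minimal, hypothetical sketch of such a disk/memory persistence helper follows — the class name PersHelperSketch, the DISK default, and the pickle transport are assumptions, not the commit's actual API:

import os
import pickle
import tempfile

class PersHelperSketch:
    """Hypothetical helper that persists objects either in-process
    (MEMORY) or via temp files (DISK). Sketch only; the real
    PersHelper in this commit is not shown in the visible hunks."""

    def __init__(self, mode=None):
        # Mode can be driven by the env var toggled in the notebook below.
        self.mode = mode or os.environ.get("SCOREP_KERNEL_PERSISTENCE_MODE", "DISK")
        self._store = {}                # MEMORY mode: bytes kept in-process
        self._dir = tempfile.mkdtemp()  # DISK mode: spill directory

    def dump(self, key, obj):
        data = pickle.dumps(obj)        # a serializer like cloudpickle could be swapped in
        if self.mode == "MEMORY":
            self._store[key] = data
        else:
            with open(os.path.join(self._dir, key), "wb") as f:
                f.write(data)

    def load(self, key):
        if self.mode == "MEMORY":
            data = self._store[key]
        else:
            with open(os.path.join(self._dir, key), "rb") as f:
                data = f.read()
        return pickle.loads(data)

Usage would be symmetric across modes — dump before an instrumented run, load afterwards — with MEMORY avoiding the filesystem round-trip.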
116 changes: 60 additions & 56 deletions examples/gpt-demo/01-GPT-Training.ipynb
@@ -16,9 +16,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"set user environment sucessfully: {'SCOREP_ENABLE_TRACING': '1', 'SCOREP_ENABLE_PROFILING': '0', 'SCOREP_TOTAL_MEMORY': '3g'}"
]
}
],
"source": [
"%%scorep_env\n",
"SCOREP_ENABLE_TRACING=1\n",
@@ -28,56 +36,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#%env SCOREP_KERNEL_PERSISTENCE_MODE MEMORY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%env"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%%switch_serializer\n",
"cloudpickle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"use the following scorep python binding arguments: --noinstrumenter"
]
}
],
"source": [
"%%scorep_python_binding_arguments\n",
"--noinstrumenter"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"filename = \"fairy_test.txt\""
"filename = \"fairytales.txt\""
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data has 49496 characters, 79 unique.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"15/09/2021 15:35:08 - INFO - model - Number of parameters : 2.531738e+07\n",
"/home/h9/s4122485/virtualenv_jupyterkernel_scorep_python/lib/python3.8/site-packages/torch/utils/data/dataloader.py:478: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
" warnings.warn(_create_warning_msg(\n",
"epoch 1 iter 96: train loss 2.31473. lr 0.00030152924503397155: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it]\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"epoch 2 iter 96: train loss 2.05380. lr 5.9999999999999995e-05: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it]\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"epoch 3 iter 96: train loss 1.88871. lr 0.0003015292450339715: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it] \n"
]
}
],
"source": [
"%%execute_with_scorep\n",
"import scorep\n",
@@ -130,33 +146,21 @@
"\n",
" from model import GPT, GPTconfig\n",
" mconf = GPTconfig(train_dataset.vocab_size, train_dataset.block_size,\n",
" n_layer=1, n_head=8, n_embd=512)\n",
" n_layer=8, n_head=8, n_embd=512)\n",
" model = GPT(mconf)\n",
"\n",
" from trainer import Trainer, TrainerConfig\n",
"\n",
" tconf = TrainerConfig(max_epochs=1, batch_size=512, learning_rate=6e-4,\n",
" tconf = TrainerConfig(max_epochs=3, batch_size=512, learning_rate=6e-4,\n",
" lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,\n",
" num_workers=4)\n",
" trainer = Trainer(model, train_dataset, None, tconf)\n",
"\n",
" torch.cuda.empty_cache()\n",
" trainer.train()\n",
"\n",
" torch.save(model.state_dict(), \"./saved_models/trained_gpt_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -165,14 +169,14 @@
"lastKernelId": null
},
"kernelspec": {
"display_name": "scorep-python",
"language": "python",
"name": "scorep-python"
"display_name": "scorep-python3",
"language": "python3",
"name": "scorep-python3"
},
"language_info": {
"file_extension": ".py",
"mimetype": "text/plain",
"name": "python"
"name": "Any text"
}
},
"nbformat": 4,
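The stderr log above reports 2.531738e+07 parameters, which lines up with the enlarged config in this diff: a GPT block's weight matrices total roughly 12 · n_layer · n_embd². A back-of-the-envelope check (block_size=128 is an assumption — the notebook's value is not visible — and layer norms/biases are ignored):

# Parameter estimate for the new config (n_layer=8, n_head=8, n_embd=512;
# vocab_size=79 from the "data has 49496 characters, 79 unique" line).
n_layer, n_embd, vocab_size, block_size = 8, 512, 79, 128

transformer = 12 * n_layer * n_embd ** 2          # attention + MLP weights
embeddings = (vocab_size + block_size) * n_embd   # token + positional tables
lm_head = n_embd * vocab_size                     # output projection

print(f"{transformer + embeddings + lm_head:.6e}")  # 2.531226e+07

The result agrees with the logged 2.531738e+07 up to the omitted layer norms and biases.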
36 changes: 0 additions & 36 deletions examples/gpt-demo/fairy_test.txt

This file was deleted.

2 changes: 1 addition & 1 deletion examples/gpt-demo/trainer.py
@@ -37,7 +37,7 @@ def __init__(self, model, train_dataset, test_dataset, config):
self.test_dataset = test_dataset
self.config = config

self.device = "cpu"
self.device = "gpu"
if torch.cuda.is_available():
self.device = torch.cuda.current_device()
self.model = torch.nn.DataParallel(self.model).to(self.device)
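For reference, a minimal sketch of the device-selection pattern this hunk implements. Note that PyTorch resolves devices from strings like "cpu"/"cuda" or an integer index — torch.cuda.current_device() returns an int — so the "gpu" literal only acts as a placeholder default that the CUDA branch overrides:

import torch

def select_device():
    # Use the current CUDA device index when available, else fall back to CPU.
    # DataParallel and .to() accept both strings ("cpu", "cuda") and int indices.
    if torch.cuda.is_available():
        return torch.cuda.current_device()  # e.g. 0
    return "cpu"

# model = torch.nn.DataParallel(model).to(select_device())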
