From 4e096c4afd4d1d377053cdfc6964f67f6435dceb Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Wed, 6 Sep 2023 17:59:30 +0200 Subject: [PATCH] Deprecate xformers/factory (#850) * Add deprecation warning to factory * Remove tests for deprecated functions * Remove bad copy-paste * Remove benchmarks * Bugfix * More docs removed --- .circleci/continue_config.yml | 18 - docs/source/factory/block.rst | 7 - docs/source/factory/index.rst | 10 - docs/source/factory/model.rst | 7 - docs/source/index.rst | 9 - docs/source/tutorials/hierarchical.rst | 59 --- docs/source/tutorials/index.rst | 2 - docs/source/tutorials/pytorch_encoder.rst | 483 ------------------ tests/test_block_factory.py | 415 --------------- tests/test_hierarchical_transformer.py | 67 --- tests/test_model_factory.py | 227 -------- tests/test_pytorch_transformer_parity.py | 203 -------- xformers/_deprecation_warning.py | 12 + xformers/benchmarks/benchmark_encoder.py | 387 -------------- .../benchmark_pytorch_transformer.py | 237 --------- xformers/factory/block_factory.py | 3 + xformers/factory/model_factory.py | 3 + xformers/helpers/hierarchical_configs.py | 2 + 18 files changed, 20 insertions(+), 2131 deletions(-) delete mode 100644 docs/source/factory/block.rst delete mode 100644 docs/source/factory/index.rst delete mode 100644 docs/source/factory/model.rst delete mode 100644 docs/source/tutorials/hierarchical.rst delete mode 100644 docs/source/tutorials/pytorch_encoder.rst delete mode 100644 tests/test_block_factory.py delete mode 100644 tests/test_hierarchical_transformer.py delete mode 100644 tests/test_model_factory.py delete mode 100644 tests/test_pytorch_transformer_parity.py create mode 100644 xformers/_deprecation_warning.py delete mode 100644 xformers/benchmarks/benchmark_encoder.py delete mode 100644 xformers/benchmarks/benchmark_pytorch_transformer.py diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 1d819c4a5..0a6d1e31b 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -142,22 +142,6 @@ run_unittests: &run_unittests source $BASH_ENV $CONDA_PYTHON -m pytest --junitxml=test-results/junit.xml --verbose --maxfail=20 tests -run_benchmarks: &run_benchmarks - - run: - name: Run Benchmarks - when: always - command: | - source $BASH_ENV - $CONDA_PYTHON xformers/benchmarks/benchmark_encoder.py --activations gelu --plot -emb 128 -bs 16 -heads 4 - -run_pytorch_benchmark: &run_pytorch_benchmark - - run: - name: Run Pytorch benchmark - when: always - command: | - source $BASH_ENV - $CONDA_PYTHON xformers/benchmarks/benchmark_pytorch_transformer.py - run_doc_build: &run_doc_build - run: name: Testing doc build @@ -242,8 +226,6 @@ commands: - <<: *install_repo - <<: *run_coverage - - <<: *run_benchmarks - - <<: *run_pytorch_benchmark - store_test_results: path: test-results diff --git a/docs/source/factory/block.rst b/docs/source/factory/block.rst deleted file mode 100644 index bbe018a90..000000000 --- a/docs/source/factory/block.rst +++ /dev/null @@ -1,7 +0,0 @@ -Block factory -============= - -.. automodule:: xformers.factory.block_factory - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/factory/index.rst b/docs/source/factory/index.rst deleted file mode 100644 index d15c5fd14..000000000 --- a/docs/source/factory/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -Factory -======= - -Factories are completely optional, they were primarily developed for CI and benchmarking purposes. - -.. 
toctree:: - :maxdepth: 2 - - block - model diff --git a/docs/source/factory/model.rst b/docs/source/factory/model.rst deleted file mode 100644 index 781ac371e..000000000 --- a/docs/source/factory/model.rst +++ /dev/null @@ -1,7 +0,0 @@ -Model factory -============= - -.. automodule:: xformers.factory.model_factory - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/index.rst b/docs/source/index.rst index 97d031a7c..cc8fec1e4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,15 +25,6 @@ to create some state of the art models. | | -.. toctree:: - :maxdepth: 2 - :caption: Build models and blocks programatically - - factory/index - -| -| - .. toctree:: :maxdepth: 2 :caption: Tutorials and examples diff --git a/docs/source/tutorials/hierarchical.rst b/docs/source/tutorials/hierarchical.rst deleted file mode 100644 index 6ba0db193..000000000 --- a/docs/source/tutorials/hierarchical.rst +++ /dev/null @@ -1,59 +0,0 @@ -Hierarchical Transformers -========================= - -The original Transformer proposal processes ("transforms") sequences of tokens, across possibly many layers. Crucially, the number of tokens is unchanged cross the depth of the model, and this prove to be really efficient in many domains. - -It seems that some domains could however benefit from an architecture more typical from CNN, where there's a tradeoff across the depth of the model in between the spatial extent (ie: number of tokens) and their expressiveness (ie: the model or embedding dimension). These architectures are handled in xformers, through the "patch_embedding" element, which translates the sequence of tokens from one layer to another. - -A small helper is provided to make it easier to generate matching configurations, as follows. We present in this example a truncated version of a small Metaformer_. - -.. _Metaformer: https://arxiv.org/abs/2111.11418v1 - -.. 
code-block:: python - - from xformers.factory import xFormer, xFormerConfig - from xformers.helpers.hierarchical_configs import ( - BasicLayerConfig, - get_hierarchical_configuration, - ) - - - base_hierarchical_configs = [ - BasicLayerConfig( - embedding=64, # the dimensions just have to match along the layers - attention_mechanism="scaled_dot_product", # anything you like - patch_size=7, - stride=4, - padding=2, - seq_len=image_size * image_size // 16, - feedforward="MLP", - ), - BasicLayerConfig( - embedding=128, - attention_mechanism="scaled_dot_product", - patch_size=3, - stride=2, - padding=1, - seq_len=image_size * image_size // 64, - feedforward="MLP", - ), - BasicLayerConfig( - embedding=320, - attention_mechanism="scaled_dot_product", - patch_size=3, - stride=2, - padding=1, - seq_len=image_size * image_size // 256, - feedforward="MLP", - ), - ] - - # Fill in the gaps in the config - xformer_config = get_hierarchical_configuration( - base_hierarchical_configs, - residual_norm_style="pre", - use_rotary_embeddings=False, - mlp_multiplier=4, - dim_head=32, - ) - config = xFormerConfig(xformer_config) diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index 37af922a9..bdb8a345b 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -8,7 +8,5 @@ Tutorials blocksparse extend_attentions use_attention - pytorch_encoder reversible triton - hierarchical diff --git a/docs/source/tutorials/pytorch_encoder.rst b/docs/source/tutorials/pytorch_encoder.rst deleted file mode 100644 index 98e716e2b..000000000 --- a/docs/source/tutorials/pytorch_encoder.rst +++ /dev/null @@ -1,483 +0,0 @@ -Building an encoder, comparing to PyTorch -========================================= - -Let's now walk up the hierarchy, and consider a whole encoder block. You may be used to the PyTorch encoder layer so we'll consider it as a point of comparison, but other libraries would probably expose similar interfaces. - -PyTorch Encoder Layer ---------------------- - -PyTorch already exposes a TransformerEncoderLayer_. Its constructor is: - -.. _TransformerEncoderLayer: https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html?highlight=encoder#torch.nn.TransformerEncoderLayer - -.. code-block:: python - - TransformerEncoderLayer( - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation='relu', - layer_norm_eps=1e-05, - batch_first=False, - device=None, - dtype=None - ): - ... - -Note that you cannot change the attention mechanism, so this example will use the "Scaled Dot Product", as proposed by Vaswani et al., but in the xFormers case this is a free floating parameter. - -Warning -------- - -It’s worth noting that **xFormer’s blocks expect tensors to be batch first, while PyTorch’s transformers uses a sequence first convention. Don’t forget to permute if you use xFormers’s blocks as drop-in replacements.** - -Similarly, the attention masks conventions are different: in PyTorch, the mask is *True* when an element should *not* be attended to, whereas in xFormer it’s the opposite. Don’t forget to negate your attention masks to use xFormers’ blocks as drop-in replacements. - -Block factory -------------- - -We don't have the exact same interfaces, but we have something fairly close to PyTorch with the model_factory_. 
Please note that, similarly to the attention example above, you can also directly import the `xFormerEncoderBlock` and construct it from there, but we'll assume here that you could be interested in systematic evaluation of different architectures, and that as such something which can be easily automated is preferred, so the "factory" path is the one put forward. - -The equivalent to the PyTorch example above would look like the following. You can think of it as a declaration of the sequence of blocks that you would like instantiated. We're trying to: - -- make it very explicit what is in this block -- keep everything pythonic -- make this sweep and automation friendly in general - -With this said, you can build an encoder directly as follows: - -.. code-block:: python - - from xformers.factory import xFormerEncoderBlock, xFormerEncoderConfig - import torch - - BATCH = 8 - SEQ = 1024 - EMB = 384 - VOCAB = 64 - - encoder_config = { - "dim_model": EMB, - "residual_norm_style": "pre", # Optional, pre/post - "position_encoding_config": { - "name": "vocab", # whatever position encodinhg makes sense - "seq_len": SEQ, - "vocab_size": VOCAB, - }, - "multi_head_config": { - "num_heads": 4, - "residual_dropout": 0, - "attention": { - "name": "linformer", # whatever attention mechanism - "dropout": 0, - "seq_len": SEQ, - }, - }, - "feedforward_config": { - "name": "MLP", - "dropout": 0, - "activation": "relu", - "hidden_layer_multiplier": 4, - }, - } - - # "constructing" the config will lead to a lot of type checking, - # which could catch some errors early on - config = xFormerEncoderConfig(**encoder_config) - - encoder = xFormerEncoderBlock(config) - - # Test out with dummy inputs - x = (torch.rand((BATCH, SEQ)) * VOCAB).abs().to(torch.int) - y = encoder(x, x, x) - print(y) - - -Building full models -==================== - - - Now let's build a full Tranformers/xFormer model. Please note that this is just an example, because building the whole model from explicit parts is always an option, from pure PyTorch building blocks or adding some xFormers primitives. - -PyTorch Transformer -------------------- - -Am implementation of a full Transformer is supported directly by PyTorch, see the PyTorchTransformer_ for more options. - -.. _PyTorchTransformer: https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html?highlight=transformer#torch.nn.Transformer - -.. code-block:: python - - Transformer( - d_model=512, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation='relu', - custom_encoder=None, # the xFormers exemple below defines that - custom_decoder=None, # Same - layer_norm_eps=1e-05, - batch_first=False, - device=None, - dtype=None): - . - -model factory -------------- - -We don't have the exact same interfaces, but we have something to propose with the model_factory_. -Please note that, similarly to the attention example above, you can also directly import the `xFormer` and `xFormerConfig` -and construct it from there, but we'll assume here that you could be interested in systematic evaluation of different architectures, -and that as such something which can be easily automated is preferred, so the "factory" path is the one put forward. - -.. _model_factory: https://github.com/facebookresearch/xformers/blob/main/xformers/factory/model_factory.py - -The equivalent to the PyTorch example above would look like the following. -You can think of it as a declaration of the sequence of blocks that you would like instantiated. 
-This is not really apples to apples, because we define a custom encoder and decoder here. -There's also an added flexibility with xFormers in that attention mechanisms can be chosen at will, on a per-layer basis. - -.. code-block:: python - - from xformers.factory.model_factory import xFormer, xFormerConfig - import torch - - EMB = 384 - SEQ = 1024 - BATCH = 16 - VOCAB = 64 - - my_config = [ - # A list of the encoder or decoder blocks which constitute the Transformer. - # Note that a sequence of different encoder blocks can be used, same for decoders - { - "reversible": False, # Optionally make these layers reversible, to save memory - "block_type": "encoder", - "num_layers": 3, # Optional, this means that this config will repeat N times - "dim_model": EMB, - "residual_norm_style": "pre", # Optional, pre/post - "position_encoding_config": { - "name": "vocab", # whatever position encodinhg makes sense - "seq_len": 1024, - "vocab_size": VOCAB, - }, - "multi_head_config": { - "num_heads": 4, - "residual_dropout": 0, - "attention": { - "name": "linformer", # whatever attention mechanism - "dropout": 0, - "causal": False, - "seq_len": SEQ, - }, - }, - "feedforward_config": { - "name": "MLP", - "dropout": 0, - "activation": "relu", - "hidden_layer_multiplier": 4, - }, - }, - { - "reversible": False, # Optionally make these layers reversible, to save memory - "block_type": "decoder", - "num_layers": 3, # Optional, this means that this config will repeat N times - "dim_model": EMB, - "residual_norm_style": "pre", # Optional, pre/post - "position_encoding_config": { - "name": "vocab", # whatever position encodinhg makes sense - "seq_len": SEQ, - "vocab_size": VOCAB, - }, - "multi_head_config_masked": { - "num_heads": 4, - "residual_dropout": 0, - "attention": { - "name": "nystrom", # whatever attention mechanism - "dropout": 0, - "causal": True, - "seq_len": SEQ, - }, - }, - "multi_head_config_cross": { - "num_heads": 4, - "residual_dropout": 0, - "attention": { - "name": "favor", # whatever attention mechanism - "dropout": 0, - "causal": True, - "seq_len": SEQ, - }, - }, - "feedforward_config": { - "name": "MLP", - "dropout": 0, - "activation": "relu", - "hidden_layer_multiplier": 4, - }, - }, - ] - - # This part of xFormers is entirely type checked and needs a config object, - # could be changed in the future - config = xFormerConfig(my_config) - model = xFormer.from_config(config) - - # Test out with dummy inputs - x = (torch.rand((BATCH, SEQ)) * VOCAB).abs().to(torch.int) - y = model(src=x, tgt=x) - print(y) - - -Note that this exposes quite a few more knobs than the PyTorch Transformer interface, but in turn is probably a little more flexible. There are a couple of repeated settings here (dimensions mostly), this is taken care of in the `LRA benchmarking config`_. - -.. _LRA benchmarking config: https://github.com/facebookresearch/xformers/blob/main/xformers/benchmarks/LRA/code/config.json - -You can compare the speed and memory use of the vanilla PyTorch Transformer Encoder and an equivalent from xFormers, there is an existing benchmark for that (see_). -It can be run with `python3 xformers/benchmarks/benchmark_pytorch_transformer.py`, and returns the loss values for every step along with the training time for a couple of shapes that you can customize. -Current results are as follows, on a nVidia V100 (PyTorch 1.9, Triton 1.1, xFormers 0.0.2): - -.. _see: https://github.com/facebookresearch/xformers/blob/main/xformers/benchmarks/benchmark_pytorch_transformer.py - -.. 
code-block:: bash - - --- Transformer training benchmark - runtime --- - | Units: s | emb 128 - heads 8 | emb 1024 - heads 8 | emb 2048 - heads 8 | - | -------- | ----------------- | ------------------ | ------------------ | - | xformers | 0.3 | 0.4 | 0.7 | - | pytorch | 0.2 | 0.6 | 0.8 | - - --- Transformer training benchmark - memory use --- - | Units: MB | emb 128 - heads 8 | emb 1024 - heads 8 | emb 2048 - heads 8 | - | --------- | ----------------- | ------------------ | ------------------ | - | xformers | 89 | 1182 | 2709 | - | pytorch | 155 | 1950 | 4117 | - - - -Build an `xFormer` model with Hydra ------------------------------------ - -Alternatively, you can use Hydra_ to build an xFormer model. -We've included an example `here `_. -The example replicates the model from the above example and demonstrates one way to use Hydra to minimize config duplication. -The example is built on top of some more advanced Hydra features. If you are new to Hydra, you can start these docs: -`basic tutorials `_, `extending configs `_, -`Hydra packages `_ and -`instantiation API `_. - -.. _Hydra: https://hydra.cc/ - -.. code-block:: yaml - - defaults: - - /stack@xformer.stack_configs: - - encoder_local - - encoder_random - - decoder_nystrom_favor - - _self_ - - xformer: - _target_: xformers.factory.model_factory.xFormer - - -Building a model this way makes it possible for you to leverage many features Hydra has to offer. -For example, you can override the model architecture from the commandline: - -.. code-block:: bash - - python examples/build_model/my_model.py 'stack@xformer.stack_configs=[encoder_local]' - - Built a model with 1 stack: dict_keys(['encoder_local']) - xFormer( - (encoders): ModuleList( - (0): xFormerEncoderBlock( - (pose_encoding): VocabEmbedding( - (dropout): Dropout(p=0, inplace=False) - (position_embeddings): Embedding(1024, 384) - (word_embeddings): Embedding(64, 384) - ) - (mha): MultiHeadDispatch( - (attention): LocalAttention( - (attn_drop): Dropout(p=0.0, inplace=False) - ) - (in_proj_container): InputProjection() - (resid_drop): Dropout(p=0, inplace=False) - (proj): Linear(in_features=384, out_features=384, bias=True) - ) - (feedforward): MLP( - (mlp): Sequential( - (0): Linear(in_features=384, out_features=1536, bias=True) - (1): ReLU() - (2): Dropout(p=0, inplace=False) - (3): Linear(in_features=1536, out_features=384, bias=True) - (4): Dropout(p=0, inplace=False) - ) - ) - (wrap_att): Residual( - (layer): PreNorm( - (norm): FusedLayerNorm() - (sublayer): MultiHeadDispatch( - (attention): LocalAttention( - (attn_drop): Dropout(p=0.0, inplace=False) - ) - (in_proj_container): InputProjection() - (resid_drop): Dropout(p=0, inplace=False) - (proj): Linear(in_features=384, out_features=384, bias=True) - ) - ) - ) - (wrap_ff): PostNorm( - (norm): FusedLayerNorm() - (sublayer): Residual( - (layer): PreNorm( - (norm): FusedLayerNorm() - (sublayer): MLP( - (mlp): Sequential( - (0): Linear(in_features=384, out_features=1536, bias=True) - (1): ReLU() - (2): Dropout(p=0, inplace=False) - (3): Linear(in_features=1536, out_features=384, bias=True) - (4): Dropout(p=0, inplace=False) - ) - ) - ) - ) - ) - ) - ) - (decoders): ModuleList() - ) - - -You can also launch multiple runs of your application with different architectures: - -.. 
code-block:: bash - - $ python my_model.py --multirun 'stack@xformer.stack_configs=[encoder_local], [encoder_random]' - [HYDRA] Launching 2 jobs locally - [HYDRA] #0 : stack@xformer.stack_configs=[encoder_local] - Built a model with 1 stack: dict_keys(['encoder_local']) - xFormer( - (encoders): ModuleList( - (0): xFormerEncoderBlock( - (pose_encoding): VocabEmbedding( - (dropout): Dropout(p=0, inplace=False) - (position_embeddings): Embedding(1024, 384) - (word_embeddings): Embedding(64, 384) - ) - (mha): MultiHeadDispatch( - (attention): LocalAttention( - (attn_drop): Dropout(p=0.0, inplace=False) - ) - (in_proj_container): InputProjection() - (resid_drop): Dropout(p=0, inplace=False) - (proj): Linear(in_features=384, out_features=384, bias=True) - ) - (feedforward): MLP( - (mlp): Sequential( - (0): Linear(in_features=384, out_features=1536, bias=True) - (1): ReLU() - (2): Dropout(p=0, inplace=False) - (3): Linear(in_features=1536, out_features=384, bias=True) - (4): Dropout(p=0, inplace=False) - ) - ) - (wrap_att): Residual( - (layer): PreNorm( - (norm): FusedLayerNorm() - (sublayer): MultiHeadDispatch( - (attention): LocalAttention( - (attn_drop): Dropout(p=0.0, inplace=False) - ) - (in_proj_container): InputProjection() - (resid_drop): Dropout(p=0, inplace=False) - (proj): Linear(in_features=384, out_features=384, bias=True) - ) - ) - ) - (wrap_ff): PostNorm( - (norm): FusedLayerNorm() - (sublayer): Residual( - (layer): PreNorm( - (norm): FusedLayerNorm() - (sublayer): MLP( - (mlp): Sequential( - (0): Linear(in_features=384, out_features=1536, bias=True) - (1): ReLU() - (2): Dropout(p=0, inplace=False) - (3): Linear(in_features=1536, out_features=384, bias=True) - (4): Dropout(p=0, inplace=False) - ) - ) - ) - ) - ) - ) - ) - (decoders): ModuleList() - ) - [HYDRA] #1 : stack@xformer.stack_configs=[encoder_random] - Built a model with 1 stack: dict_keys(['encoder_random']) - xFormer( - (encoders): ModuleList( - (0): xFormerEncoderBlock( - (pose_encoding): VocabEmbedding( - (dropout): Dropout(p=0, inplace=False) - (position_embeddings): Embedding(1024, 384) - (word_embeddings): Embedding(64, 384) - ) - (mha): MultiHeadDispatch( - (attention): RandomAttention( - (attn_drop): Dropout(p=0.0, inplace=False) - ) - (in_proj_container): InputProjection() - (resid_drop): Dropout(p=0, inplace=False) - (proj): Linear(in_features=384, out_features=384, bias=True) - ) - (feedforward): MLP( - (mlp): Sequential( - (0): Linear(in_features=384, out_features=1536, bias=True) - (1): ReLU() - (2): Dropout(p=0, inplace=False) - (3): Linear(in_features=1536, out_features=384, bias=True) - (4): Dropout(p=0, inplace=False) - ) - ) - (wrap_att): Residual( - (layer): PreNorm( - (norm): FusedLayerNorm() - (sublayer): MultiHeadDispatch( - (attention): RandomAttention( - (attn_drop): Dropout(p=0.0, inplace=False) - ) - (in_proj_container): InputProjection() - (resid_drop): Dropout(p=0, inplace=False) - (proj): Linear(in_features=384, out_features=384, bias=True) - ) - ) - ) - (wrap_ff): PostNorm( - (norm): FusedLayerNorm() - (sublayer): Residual( - (layer): PreNorm( - (norm): FusedLayerNorm() - (sublayer): MLP( - (mlp): Sequential( - (0): Linear(in_features=384, out_features=1536, bias=True) - (1): ReLU() - (2): Dropout(p=0, inplace=False) - (3): Linear(in_features=1536, out_features=384, bias=True) - (4): Dropout(p=0, inplace=False) - ) - ) - ) - ) - ) - ) - ) - (decoders): ModuleList() - ) diff --git a/tests/test_block_factory.py b/tests/test_block_factory.py deleted file mode 100644 index 79e85f7a1..000000000 
--- a/tests/test_block_factory.py +++ /dev/null @@ -1,415 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import pytest -import torch - -# Automatically fetch all registered attentions and Feedforwards -from xformers.components import Activation -from xformers.components.attention import ATTENTION_REGISTRY, AttentionMask -from xformers.components.feedforward import FEEDFORWARD_REGISTRY -from xformers.factory import ( - xFormerDecoderBlock, - xFormerDecoderConfig, - xFormerEncoderBlock, - xFormerEncoderConfig, -) -from xformers.helpers.test_utils import init_torch_distributed_local - -BATCH = 2 -SEQ = 64 -MODEL = 64 -DROPOUT = 0.5 -GLOBAL_ATTENTION_RATIO = 0.1 # 10% of the tokens have a global view -DEVICES = [torch.device("cuda")] -VOCAB_SIZE = 64 - - -@pytest.mark.parametrize("attn_dropout", [0.1]) -@pytest.mark.parametrize("residual_dropout", [0.1]) -@pytest.mark.parametrize("heads", [1, 2]) -@pytest.mark.parametrize("activation", [a.value for a in Activation]) -@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys()) -@pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys()) -@pytest.mark.parametrize("residual_norm_style", ["pre", "post", "deepnorm"]) -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.parametrize("reversible", [True, False]) -@pytest.mark.skipif( - not torch.cuda.is_available(), reason="This test requires a CUDA device" -) -def test_xformer_encoder_block( - attention_name: str, - feedforward_name: str, - heads: int, - attn_dropout: float, - residual_dropout: float, - activation: Activation, - residual_norm_style: str, - device: torch.device, - reversible: bool, -): - - block_size = 16 - - attention_config = { - "name": attention_name, - "dropout": attn_dropout, - "causal": False, - "window_size": SEQ // 8 + 1, - "seq_len": SEQ, - "attention_query_mask": torch.rand((SEQ, 1)) < GLOBAL_ATTENTION_RATIO, - "dim_model": MODEL, - "num_heads": heads, - "dim_head": MODEL // heads, - "layout": torch.eye(SEQ // block_size, SEQ // block_size, dtype=torch.long), - "block_size": block_size, - "num_rules": 2, # Compositional Attention - } - - multi_head_config = { - "num_heads": heads, - "dim_model": MODEL, - "residual_dropout": residual_dropout, - "attention": attention_config, - } - - feedforward_config = { - "name": feedforward_name, - "dim_model": MODEL, - "dropout": DROPOUT, - "activation": activation, - "hidden_layer_multiplier": 4, - "number_of_experts": 4, - "gate": "top_2", - } - - if feedforward_name == "MixtureOfExperts": - init_torch_distributed_local() - - position_encoding_config = { - "name": "sine", - "dim_model": MODEL, - "seq_len": SEQ, - "vocab_size": VOCAB_SIZE, - } - - block_config = xFormerEncoderConfig( - dim_model=MODEL, - multi_head_config=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_encoding_config, - residual_norm_style=residual_norm_style, - reversible=reversible, - ) - - # Test that the whole block can be instantiated - block = xFormerEncoderBlock.from_config(block_config).to(device) - - # Check that the dimensions make sense, to a FW pass - inputs = torch.rand(BATCH, SEQ, device=device) - _ = block(inputs) - - # Check that we support attention masking, at least interface wise (do not check correctness yet) - att_mask_tensor = torch.ones(SEQ, SEQ, dtype=torch.bool, device=device) - att_mask = 
AttentionMask.from_bool(att_mask_tensor) - - if block.supports_attention_mask: - _ = block(inputs, att_mask=att_mask) - _ = block(inputs, att_mask=att_mask_tensor) - else: - with pytest.raises(AssertionError): - # Check that passing an attention mask to a mechanism which does not support it raises - # an exception - _ = block(inputs, att_mask=att_mask) - - # Check that we support input masking, at least interface wise (do not check correctness yet) - input_mask = torch.randn(SEQ, dtype=torch.float, device=device) - input_mask[input_mask < 0.0] = -float("inf") - _ = block(inputs, input_mask=input_mask) - - -@pytest.mark.parametrize("attn_dropout", [0.1]) -@pytest.mark.parametrize("residual_dropout", [0.1]) -@pytest.mark.parametrize("causal", [True, False]) -@pytest.mark.parametrize("heads", [1, 2]) -@pytest.mark.parametrize("activation", [a.value for a in Activation]) -@pytest.mark.parametrize("rotary_embeddings", [False, True]) -@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys()) -@pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys()) -@pytest.mark.parametrize("residual_norm_style", ["pre", "post", "deepnorm"]) -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.skipif( - not torch.cuda.is_available(), reason="This test requires a CUDA device" -) -def test_xformer_decoder_block( - attention_name: str, - rotary_embeddings: bool, - feedforward_name: str, - heads: int, - attn_dropout: float, - residual_dropout: float, - causal: bool, - activation: Activation, - residual_norm_style: str, - device: torch.device, -): - - block_size = 16 - - attention_config = { - "name": attention_name, - "dropout": attn_dropout, - "causal": causal, - "window_size": SEQ // 8 + 1, - "seq_len": SEQ, - "dim_head": MODEL // heads, - "attention_query_mask": torch.rand((SEQ, 1)) < GLOBAL_ATTENTION_RATIO, - "layout": torch.eye(SEQ // block_size, SEQ // block_size, dtype=torch.long), - "block_size": block_size, - "num_rules": 2, # Compositional Attention - } - - multi_head_config = { - "num_heads": heads, - "dim_model": MODEL, - "residual_dropout": residual_dropout, - "attention": attention_config, - "use_rotary_embeddings": rotary_embeddings, - } - - feedforward_config = { - "name": feedforward_name, - "dim_model": MODEL, - "dropout": DROPOUT, - "activation": activation, - "hidden_layer_multiplier": 4, - "number_of_experts": 4, - "gate": "top_2", - } - - if feedforward_name == "MixtureOfExperts": - init_torch_distributed_local() - - position_encoding_config = { - "name": "sine", - "dim_model": MODEL, - "seq_len": SEQ, - "vocab_size": VOCAB_SIZE, - } - - encoder_block_config = xFormerEncoderConfig( - dim_model=MODEL, - multi_head_config=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_encoding_config, - residual_norm_style=residual_norm_style, - ) - - decoder_block_config = xFormerDecoderConfig( - dim_model=MODEL, - multi_head_config_masked=multi_head_config, - multi_head_config_cross=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_encoding_config, - residual_norm_style=residual_norm_style, - ) - - # Test that the whole block can be instantiated - encoder_block = xFormerEncoderBlock.from_config(encoder_block_config).to(device) - decoder_block = xFormerDecoderBlock.from_config(decoder_block_config).to(device) - - # Check that the dimensions make sense, to a FW pass - inputs = torch.rand(BATCH, SEQ, device=device) - encoded = encoder_block(inputs) - _ = decoder_block( - inputs, encoded 
- ) # NOTE: does not make a lot of sense, just checking dimensions - - # Check that we support masking, at least interface wise (do not check correctness yet) - att_mask_tensor = torch.ones(SEQ, SEQ, dtype=torch.bool, device=device) - att_mask = AttentionMask.from_bool(att_mask_tensor) - input_mask = torch.randn(SEQ, dtype=torch.float, device=device) - input_mask[input_mask < 0.0] = -float("inf") - - encoded = encoder_block(inputs) - if decoder_block.supports_attention_mask: - _ = decoder_block( - inputs, encoded, encoder_att_mask=att_mask, input_mask=input_mask - ) - _ = decoder_block( - inputs, encoded, encoder_att_mask=att_mask_tensor, input_mask=input_mask - ) - - # Test different sequence lengths when encoding and decoding - if ( - not decoder_block.requires_same_k_q_dimensions - and not decoder_block.requires_squared_context_length - ): - if not causal or not decoder_block.causal_attention: - _ = decoder_block(inputs[:, :-16], encoded) - else: - # Check that we assert properly - with pytest.raises(AssertionError): - _ = decoder_block(inputs[:, :-16], encoded) - else: - # Check that we assert properly - with pytest.raises(AssertionError): - _ = decoder_block(inputs[:, :-16], encoded) - - -@pytest.mark.skipif( - not torch.cuda.is_available(), reason="This test requires a CUDA device" -) -def test_embedding_projection(): - block_size = 16 - - attention_config = { - "name": "scaled_dot_product", - "dropout": 0.1, - "causal": False, - "window_size": SEQ // 8 + 1, - "seq_len": SEQ, - "attention_query_mask": torch.rand((SEQ, 1)) < GLOBAL_ATTENTION_RATIO, - "dim_model": MODEL, - "num_heads": 2, - "dim_head": MODEL // 2, - "layout": torch.eye(SEQ // block_size, SEQ // block_size, dtype=torch.long), - "block_size": block_size, - "num_rules": 2, # Compositional Attention - } - - multi_head_config = { - "num_heads": 2, - "dim_model": MODEL, - "residual_dropout": 0.1, - "attention": attention_config, - } - - feedforward_config = { - "name": "MLP", - "dim_model": MODEL, - "dropout": DROPOUT, - "activation": "relu", - "hidden_layer_multiplier": 4, - "number_of_experts": 4, - "gate": "top_2", - } - - position_encoding_config = { - "name": "sine", - "dim_model": 2 * MODEL, - "seq_len": SEQ, - "vocab_size": VOCAB_SIZE, - } - - block_config = xFormerEncoderConfig( - dim_model=MODEL, - multi_head_config=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_encoding_config, - residual_norm_style="pre", - reversible=False, - ) - - device = torch.device("cuda") - - # Test that the whole block can be instantiated - block = xFormerEncoderBlock.from_config(block_config).to(device) - - # Check that the dimensions make sense, to a FW pass - inputs = torch.rand(BATCH, SEQ, device=device) - _ = block(inputs) - - # Check that we support attention masking, at least interface wise (do not check correctness yet) - if block.supports_attention_mask: - att_mask = torch.ones(SEQ, SEQ, dtype=torch.bool, device=device) - _ = block(inputs, att_mask=att_mask) - - # Check that we support input masking, at least interface wise (do not check correctness yet) - input_mask = torch.randn(SEQ, dtype=torch.float, device=device) - input_mask[input_mask < 0.0] = -float("inf") - _ = block(inputs, input_mask=input_mask) - - -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.skipif( - not torch.cuda.is_available(), reason="This test requires a CUDA device" -) -def test_simplicial_embedding( - device: torch.device, -): - attention_config = { - "name": "scaled_dot_product", - "dropout": 
0.1, - "causal": False, - "window_size": SEQ // 8 + 1, - "seq_len": SEQ, - "dim_model": MODEL, - "num_heads": 4, - } - - multi_head_config = { - "num_heads": 4, - "dim_model": MODEL, - "residual_dropout": 0.1, - "attention": attention_config, - } - - feedforward_config = { - "name": "MLP", - "dim_model": MODEL, - "dropout": DROPOUT, - "activation": "relu", - "hidden_layer_multiplier": 4, - } - - position_encoding_config = { - "name": "sine", - "dim_model": MODEL, - "seq_len": SEQ, - "vocab_size": VOCAB_SIZE, - } - - block_config = xFormerEncoderConfig( - dim_model=MODEL, - multi_head_config=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_encoding_config, - residual_norm_style="pre", - reversible=False, - simplicial_embeddings={"L": 4}, - ) - - # Test that the whole block can be instantiated - block = xFormerEncoderBlock.from_config(block_config).to(device) - - # Check that the dimensions make sense, to a FW pass - inputs = torch.rand(BATCH, SEQ, device=device) - _ = block(inputs) - - # Check that we support attention masking, at least interface wise (do not check correctness yet) - att_mask = torch.ones(SEQ, SEQ, dtype=torch.bool, device=device) - _ = block(inputs, att_mask=att_mask) - - # Check that we support input masking, at least interface wise (do not check correctness yet) - input_mask = torch.randn(SEQ, dtype=torch.float, device=device) - input_mask[input_mask < 0.0] = -float("inf") - _ = block(inputs, input_mask=input_mask) - - # Check that a faulty L is caught - block_config = xFormerEncoderConfig( - dim_model=MODEL, - multi_head_config=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_encoding_config, - residual_norm_style="pre", - reversible=False, - simplicial_embeddings={"L": 3}, - ) - - # Test that the whole block can be instantiated - with pytest.raises(AssertionError): - block = xFormerEncoderBlock.from_config(block_config).to(device) - _ = block(inputs) diff --git a/tests/test_hierarchical_transformer.py b/tests/test_hierarchical_transformer.py deleted file mode 100644 index c6e838ca4..000000000 --- a/tests/test_hierarchical_transformer.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
- - -import torch - -from xformers.factory import xFormer, xFormerConfig -from xformers.helpers.hierarchical_configs import ( - BasicLayerConfig, - get_hierarchical_configuration, -) - -BATCH = 20 -SEQ = 512 -MODEL = 384 - - -def test_hierarchical_transformer(): - image_size = 32 - - base_hierarchical_configs = [ - BasicLayerConfig( - embedding=64, - attention_mechanism="pooling", - patch_size=7, - stride=4, - padding=2, - seq_len=image_size * image_size // 16, - feedforward="MLP", - ), - BasicLayerConfig( - embedding=128, - attention_mechanism="pooling", - patch_size=3, - stride=2, - padding=1, - seq_len=image_size * image_size // 64, - feedforward="MLP", - repeat_layer=2, - ), - BasicLayerConfig( - embedding=320, - attention_mechanism="scaled_dot_product", - patch_size=3, - stride=2, - padding=1, - seq_len=image_size * image_size // 256, - feedforward="MLP", - ), - ] - - # Fill in the gaps in the config - xformer_config = get_hierarchical_configuration( - base_hierarchical_configs, - residual_norm_style="pre", - use_rotary_embeddings=False, - mlp_multiplier=4, - dim_head=32, - ) - config = xFormerConfig(xformer_config) - hierarchical_xformer = xFormer.from_config(config) - - # Forward some dummy data - dummy = torch.rand((2, 3, image_size, image_size)) - _ = hierarchical_xformer(dummy) diff --git a/tests/test_model_factory.py b/tests/test_model_factory.py deleted file mode 100644 index 45ed0b53c..000000000 --- a/tests/test_model_factory.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -from contextlib import nullcontext - -import pytest -import torch - -import xformers.factory.weight_init as xformers_weight_init -from xformers.factory import xFormer, xFormerConfig, xFormerWeightInit - -BATCH = 2 -SEQ = 16 -EMB = 16 -VOC = 16 - -DEVICES = ( - [torch.device("cpu")] - if not torch.cuda.is_available() - else [ - torch.device("cuda") - ] # save a bit on CI for now, we have separate cpu and gpu jobs -) - -encoder_configs = { - "reversible": False, - "block_type": "encoder", - "dim_model": EMB, - "residual_norm_style": "pre", - "position_encoding_config": { - "name": "vocab", - "seq_len": SEQ, - "vocab_size": VOC, - "dim_model": EMB, - }, - "num_layers": 3, - "multi_head_config": { - "num_heads": 4, - "residual_dropout": 0, - "attention": { - "name": "scaled_dot_product", - "dropout": 0, - "causal": True, - "seq_len": SEQ, - }, - "dim_model": EMB, - }, - "feedforward_config": { - "name": "MLP", - "dropout": 0, - "activation": "relu", - "hidden_layer_multiplier": 4, - "dim_model": EMB, - "number_of_experts": 4, - "gate_config": "top_2", - }, -} - -decoder_configs = { - "block_type": "decoder", - "dim_model": EMB, - "residual_norm_style": "pre", - "position_encoding_config": { - "name": "vocab", - "seq_len": SEQ, - "vocab_size": VOC, - "dim_model": EMB, - }, - "num_layers": 2, - "multi_head_config_masked": { - "num_heads": 4, - "residual_dropout": 0, - "dim_model": EMB, - "attention": { - "name": "scaled_dot_product", - "dropout": 0, - "causal": True, - "seq_len": SEQ, - }, - }, - "multi_head_config_cross": { - "num_heads": 4, - "residual_dropout": 0, - "dim_model": EMB, - "attention": { - "name": "scaled_dot_product", - "dropout": 0, - "causal": True, - "seq_len": SEQ, - }, - }, - "feedforward_config": { - "name": "MLP", - "dropout": 0, - "activation": "relu", - "hidden_layer_multiplier": 4, - "dim_model": 
EMB, - }, -} - -test_configs_list = [encoder_configs, decoder_configs] -test_configs_dict = {"encoder": encoder_configs, "decoder": decoder_configs} - -""" Test all the model configurations saved in model_presets. """ - - -@pytest.mark.parametrize("config", [test_configs_list, test_configs_dict]) -@pytest.mark.parametrize("reversible", [True, False]) -@pytest.mark.parametrize("tie_embedding_weights", [True, False]) -@pytest.mark.parametrize("residual_norm_style", ["pre", "post", "deepnorm"]) -@pytest.mark.parametrize("device", DEVICES) -def test_presets( - config, reversible, tie_embedding_weights, residual_norm_style, device -): - torch.cuda.manual_seed(42) - torch.manual_seed(42) - - # Build the model - if isinstance(config, list): - # Only the encoder can be reversible - config[0]["reversible"] = reversible - - config[0]["residual_norm_style"] = residual_norm_style - config[1]["residual_norm_style"] = residual_norm_style - else: - config["encoder"]["reversible"] = reversible - config["encoder"]["residual_norm_style"] = residual_norm_style - config["decoder"]["residual_norm_style"] = residual_norm_style - - modelConfig = xFormerConfig(config, tie_embedding_weights) - if isinstance(modelConfig.stack_configs, dict): - for _, blockConfig in modelConfig.stack_configs.items(): - assert blockConfig.layer_position - else: - for blockConfig in modelConfig.stack_configs: - assert blockConfig.layer_position - - context = ( - pytest.raises(AssertionError) - if reversible and (tie_embedding_weights or residual_norm_style == "deepnorm") - else nullcontext() - ) - - with context: - model = xFormer.from_config(modelConfig).to(device) - - def check_against_default(p): - # check that a different gain than 1 was used - vanilla = p.clone() - torch.nn.init.xavier_normal_(p, gain=1) - change = torch.abs((torch.std(vanilla) - torch.std(p)) / torch.std(p)) - assert change > 0.1 - - # Check deepnorm init, if applicable - if residual_norm_style == "deepnorm": - for n, p in model.encoders.named_parameters(): - # Check the MHA - if "in_proj_weight" in n: - # self attention projection, check that the value projection has been changed - M, _ = p.shape - K = M // 3 - - value_rel_std = torch.abs( - torch.std(p[:K, :]) - torch.std(p[-K:, :]) - ) - qp_rel_std = torch.abs(torch.std(p[:K, :]) - torch.std(p[K:-K, :])) - - # Check that the value proj init has been changed by more than the noise - assert ( - value_rel_std / qp_rel_std > 2 - ), f"{(value_rel_std/qp_rel_std)}" - - if "v_proj_weight" in n: - check_against_default(p) - - if "mha.proj" in n and "weight" in n: - check_against_default(p) - - # Check the feedforward - if "feedforward" in n and "weight" in n: - check_against_default(p) - - # Dummy inputs, test a forward - inputs = (torch.rand((BATCH, SEQ), device=device) * 10).abs().to(torch.int) - - input_mask = torch.randn(SEQ, dtype=torch.float, device=device) - input_mask[input_mask < 0.0] = -float("inf") - outputs = model( - inputs, encoder_input_mask=input_mask, decoder_input_mask=input_mask - ) - - # Test a BW - loss = torch.sum(torch.abs(outputs)) - loss.backward() - - # If we requested tied embedding weights, check that this is the case indeed - if tie_embedding_weights and not reversible: - assert model.encoders[0].pose_encoding == model.decoders[0].pose_encoding - - -@pytest.mark.parametrize("weight_init", [w.value for w in xFormerWeightInit]) -@pytest.mark.parametrize("feedforward", ["MLP", "Conv2DFeedforward"]) -@pytest.mark.parametrize("deepnorm", [False, True]) -@pytest.mark.parametrize("device", 
DEVICES) -def test_weight_init(weight_init, feedforward, deepnorm, device): - torch.cuda.manual_seed(42) - torch.manual_seed(42) - - config = test_configs_dict - - if deepnorm: - config["encoder"]["residual_norm_style"] = "deepnorm" - config["encoder"]["feedforward_config"]["name"] = feedforward - - config["decoder"]["residual_norm_style"] = "deepnorm" - - # Make sure that all the init methods catch all the weights - xformers_weight_init._assert_if_not_initialized = True - - # Build the model - config_instance = xFormerConfig( # noqa - config, tie_embedding_weights=False, weight_init=weight_init - ) - - _ = xFormer.from_config(config_instance).to(device) diff --git a/tests/test_pytorch_transformer_parity.py b/tests/test_pytorch_transformer_parity.py deleted file mode 100644 index 83ac9202a..000000000 --- a/tests/test_pytorch_transformer_parity.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import random - -import pytest -import torch - -from xformers import _is_triton_available - -if _is_triton_available(): - from xformers.benchmarks.benchmark_pytorch_transformer import evaluate, train - from xformers.factory.model_factory import xFormer, xFormerConfig - - BATCH = 20 - SEQ = 32 - EMB = 8 - VOCAB = 8 - HEADS = 4 - DROP = 0.1 - LAYERS = 2 - ACTIVATION = "relu" - - _test_config_encoder = { - "block_type": "encoder", - "dim_model": EMB, - "num_layers": LAYERS, - "residual_norm_style": "post", - "multi_head_config": { - "num_heads": HEADS, - "residual_dropout": DROP, - "bias": True, - "attention": { - "name": "scaled_dot_product", - "dropout": DROP, - "seq_len": SEQ, - }, - "dim_model": EMB, - }, - "feedforward_config": { - "name": "MLP", - "dropout": DROP, - "activation": ACTIVATION, - "hidden_layer_multiplier": 4, - "dim_model": EMB, - }, - } - - _test_config_decoder = { - "block_type": "decoder", - "dim_model": EMB, - "num_layers": LAYERS, - "residual_norm_style": "post", - "multi_head_config_masked": { - "num_heads": HEADS, - "residual_dropout": DROP, - "dim_model": EMB, - "bias": True, - "attention": { - "name": "scaled_dot_product", - "dropout": DROP, - "seq_len": SEQ, - }, - }, - "multi_head_config_cross": { - "num_heads": HEADS, - "residual_dropout": DROP, - "dim_model": EMB, - "bias": True, - "attention": { - "name": "scaled_dot_product", - "dropout": DROP, - "seq_len": SEQ, - }, - }, - "feedforward_config": { - "name": "MLP", - "dropout": DROP, - "activation": ACTIVATION, - "hidden_layer_multiplier": 4, - "dim_model": EMB, - }, - } - - _test_config = [_test_config_encoder, _test_config_decoder] - - def reset_seeds(): - torch.manual_seed(42) - torch.cuda.manual_seed(42) - random.seed(42) - - @pytest.mark.skipif( - not torch.cuda.is_available(), reason="This test requires a gpu" - ) - def test_pytorch_encoder_parity(device=torch.device("cuda")): - # Build both a xFormers and Pytorch model - reset_seeds() - model_xformers = xFormer.from_config(xFormerConfig([_test_config_encoder])).to( - device - ) - print(model_xformers) - - model_pytorch = torch.nn.TransformerEncoder( - torch.nn.TransformerEncoderLayer( - d_model=EMB, - nhead=HEADS, - dim_feedforward=4 * EMB, - dropout=DROP, - activation=ACTIVATION, - batch_first=True, # (batch, seq, feature) - device=device, - ), - num_layers=LAYERS, - ) - print(model_pytorch) - - optim_xformers = torch.optim.SGD( - model_xformers.parameters(), lr=1e-3, 
momentum=0.9 - ) - optim_pytorch = torch.optim.SGD( - model_pytorch.parameters(), lr=1e-3, momentum=0.9 - ) - - # Check that both models can be trained to comparable results - eval_start_xformer = evaluate(model_xformers, BATCH, SEQ, EMB, device) - eval_start_pytorch = evaluate(model_pytorch, BATCH, SEQ, EMB, device) - print("starting point: ", eval_start_pytorch, eval_start_xformer) - train(model_pytorch, optim_pytorch, "pytorch", 500, BATCH, SEQ, EMB, device) - train(model_xformers, optim_xformers, "xformers", 500, BATCH, SEQ, EMB, device) - - # Check that we can classify this dummy example - # Arbitrary threshold - eval_stop_xformer = evaluate(model_xformers, BATCH, SEQ, EMB, device) - eval_stop_pytorch = evaluate(model_pytorch, BATCH, SEQ, EMB, device) - print("end point: ", eval_stop_pytorch, eval_stop_xformer) - - fit_ratio_xformer = eval_start_xformer / eval_stop_xformer - fit_ratio_pytorch = eval_start_pytorch / eval_stop_pytorch - print("fit ratios: ", fit_ratio_pytorch, fit_ratio_xformer) - - # Catch a broken training - assert fit_ratio_xformer > 120 - assert fit_ratio_pytorch > 120 - - # Catch a significant difference in between the two - assert ( - abs(eval_start_xformer - eval_start_pytorch) < 1e-6 - ) # initial eval is about 25, arbitrary limits - assert ( - abs(eval_stop_xformer - eval_stop_pytorch) < 1e-1 - ) # final eval is about 0.2, arbitrary limits - - @pytest.mark.skipif( - not torch.cuda.is_available(), reason="This test requires a gpu" - ) - def test_pytorch_tranformer_parity(device=torch.device("cuda")): - # Build both a xFormers and Pytorch model - reset_seeds() - model_xformers = xFormer.from_config(xFormerConfig(_test_config)).to(device) - print(model_xformers) - - model_pytorch = torch.nn.Transformer( - d_model=EMB, - nhead=HEADS, - num_encoder_layers=LAYERS, - num_decoder_layers=LAYERS, - dim_feedforward=4 * EMB, - dropout=DROP, - activation=ACTIVATION, - batch_first=True, # (batch, seq, feature) - device=device, - ) - print(model_pytorch) - - optim_xformers = torch.optim.SGD( - model_xformers.parameters(), lr=1e-3, momentum=0.9 - ) - optim_pytorch = torch.optim.SGD( - model_pytorch.parameters(), lr=1e-3, momentum=0.9 - ) - - # Check that both models can be trained to comparable results - eval_start_xformer = evaluate(model_xformers, BATCH, SEQ, EMB, device) - eval_start_pytorch = evaluate(model_pytorch, BATCH, SEQ, EMB, device) - print("starting point: ", eval_start_pytorch, eval_start_xformer) - train(model_xformers, optim_xformers, "xformers", 100, BATCH, SEQ, EMB, device) - train(model_pytorch, optim_pytorch, "pytorch", 100, BATCH, SEQ, EMB, device) - - # Check that we can classify this dummy example - # Arbitrary threshold - eval_stop_xformer = evaluate(model_xformers, BATCH, SEQ, EMB, device) - eval_stop_pytorch = evaluate(model_pytorch, BATCH, SEQ, EMB, device) - print("end point: ", eval_stop_pytorch, eval_stop_xformer) - - fit_ratio_xformer = eval_start_xformer / eval_stop_xformer - fit_ratio_pytorch = eval_start_pytorch / eval_stop_pytorch - - print(fit_ratio_pytorch, fit_ratio_xformer) - - assert fit_ratio_xformer > 50 - assert fit_ratio_pytorch > 50 diff --git a/xformers/_deprecation_warning.py b/xformers/_deprecation_warning.py new file mode 100644 index 000000000..505ef15e6 --- /dev/null +++ b/xformers/_deprecation_warning.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+ +import warnings + + +def deprecated_function(self): + name = repr(self) # self.__name__ + msg = f"{name} is deprecated and is not maintained anymore. It might be removed in a future version of xFormers" + warnings.warn(msg, FutureWarning, stacklevel=2) diff --git a/xformers/benchmarks/benchmark_encoder.py b/xformers/benchmarks/benchmark_encoder.py deleted file mode 100644 index a5a073f9e..000000000 --- a/xformers/benchmarks/benchmark_encoder.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import json -import time -from contextlib import suppress -from typing import Any, Dict, List, Optional - -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns -import torch -import torch.nn.functional as F -from sklearn.model_selection import ParameterGrid - -# CREDITS: Sean Naren -from torch.autograd.profiler import record_function -from tqdm import tqdm - -from xformers.components import Activation -from xformers.components.attention import ATTENTION_REGISTRY -from xformers.factory.block_factory import xFormerEncoderBlock, xFormerEncoderConfig - -_use_cuda = torch.cuda.is_available() -_GLOBAL_ATTENTION_RATIO = 0.1 # arbitrary - - -def _get_attention_query_mask(sequence_length: int, ratio: float): - mask = torch.rand((sequence_length, 1)) < ratio - while torch.count_nonzero(mask) / float(mask.numel()) > ratio: - mask = torch.rand((sequence_length, 1)) < ratio - - return mask - - -def _get_trace_handler(name: str): - def trace_handler(prof): - prof.export_chrome_trace(f"profile_{name}.json") - prof.export_stacks(f"stacks_{name}.txt", "self_cuda_time_total") - - return trace_handler - - -def _train_for_several_steps( - block: xFormerEncoderBlock, - num_steps: int, - batch_size: int, - sequence_length: int, - embed_dim: int, - autocast: bool, - device: torch.device, - lr: float = 0.01, - norm_type: Optional[float] = None, - profile: bool = False, - att_name: str = "", -) -> Dict[str, float]: - # use SGD with momentum instead of Adam, since Adam is scale invariant - # and this makes it bad for tests - optim = torch.optim.SGD(block.parameters(), lr=lr, momentum=0.9) - - if _use_cuda: - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - torch.cuda.synchronize() - - start_time = time.time() - - # Optional profiler, requires a context and some setup - profiler = ( - torch.profiler.profile( # type: ignore - activities=[ - torch.profiler.ProfilerActivity.CPU, # type: ignore - torch.profiler.ProfilerActivity.CUDA, # type: ignore - ], - schedule=torch.profiler.schedule(wait=1, warmup=1, active=1), # type: ignore - on_trace_ready=_get_trace_handler( - f"{att_name}_batch_{batch_size}_seq_{sequence_length}_embed_dim_{embed_dim}" - ), - profile_memory=True, - with_stack=True, - ) - if profile - else suppress() - ) - - # Actual vanilla training loop - # - nonsensical data, but remove that from the compute time - inputs = torch.rand(batch_size, sequence_length).to(device) - - with profiler as p: # type: ignore - for _ in range(num_steps): - optim.zero_grad() - - with torch.cuda.amp.autocast(enabled=autocast): - with record_function("attention_forward"): - output = block(inputs) - - with record_function("loss"): - loss = F.mse_loss( - inputs.unsqueeze(-1).repeat(1, 1, output.shape[-1]), - output, - reduction="sum", - ) - - with record_function("backward"): - loss.backward() - 
- if norm_type is not None: - clip_norm = 0.3 - torch.nn.utils.clip_grad_norm_(block.parameters(), clip_norm, norm_type) - optim.step() - - if p: - p.step() - - if _use_cuda: - torch.cuda.synchronize() - max_memory = torch.cuda.max_memory_allocated() / 2**20 - else: - max_memory = -1 - run_time = time.time() - start_time - - return {"run_time": run_time, "max_memory": round(max_memory, 1)} - - -def benchmark_model(num_warmup: int, num_steps: int, **kwargs) -> Dict[str, float]: - # Run warm-up first - warm_up_args = {**kwargs} - warm_up_args["profile"] = False - _train_for_several_steps(num_steps=num_warmup, **warm_up_args) - - return _train_for_several_steps(num_steps=num_steps, **kwargs) - - -def test_xformer_encoder_block( - attention_name: str, - feedforward_name: str, - heads: int, - attn_dropout: float, - residual_dropout: float, - causal: bool, - activation: Activation, - autocast: bool, - batch_size: int, - sequence_length: int, - embed_dim: int, - dropout: float, - num_steps: int, - num_warmup: int, - device: torch.device, - profile: bool, -) -> Dict[str, float]: - - block = instantiate_xformer( - activation=activation, - attention_name=attention_name, - attn_dropout=attn_dropout, - causal=causal, - feedforward_name=feedforward_name, - heads=heads, - residual_dropout=residual_dropout, - sequence_length=sequence_length, - embed_dim=embed_dim, - dropout=dropout, - ).to(device) - - print( - "Testing:", - block, - batch_size, - sequence_length, - embed_dim, - autocast, - device, - attention_name, - ) - - return benchmark_model( - num_steps=num_steps, - num_warmup=num_warmup, - block=block, - batch_size=batch_size, - sequence_length=sequence_length, - embed_dim=embed_dim, - autocast=autocast, - device=device, - profile=profile, - att_name=attention_name, - ) - - -def instantiate_xformer( - activation: Activation, - attention_name: str, - attn_dropout: float, - causal: bool, - feedforward_name: str, - heads: int, - residual_dropout: float, - sequence_length: int, - embed_dim: int, - dropout: float, -) -> xFormerEncoderBlock: - - block_size = 16 - - attention_config = { - "name": attention_name, - "dropout": attn_dropout, - "causal": causal, - "seq_len": sequence_length, - "attention_query_mask": _get_attention_query_mask( - sequence_length, _GLOBAL_ATTENTION_RATIO - ), - "num_heads": heads, - "dim_head": embed_dim / heads, - "layout": torch.eye( - sequence_length // block_size, - sequence_length // block_size, - dtype=torch.long, - ) - .unsqueeze(0) - .expand(heads, -1, -1), - "block_size": block_size, - } - - multi_head_config = { - "num_heads": heads, - "dim_model": embed_dim, - "residual_dropout": residual_dropout, - "attention": attention_config, - } - - feedforward_config = { - "name": feedforward_name, - "dim_model": embed_dim, - "dropout": dropout, - "activation": activation, - "hidden_layer_multiplier": 4, - } - - position_embedding_config = { - "name": "sine", - "dim_model": embed_dim, - "seq_len": sequence_length, - } - - block_config = xFormerEncoderConfig( - dim_model=embed_dim, - multi_head_config=multi_head_config, - feedforward_config=feedforward_config, - position_encoding_config=position_embedding_config, - ) - - block = xFormerEncoderBlock.from_config(block_config) - return block - - -def plot(args, results: List[Dict[str, Any]]): - df = pd.DataFrame(results) - HEADS = args.heads[-1] - AMP = args.pytorch_amp[-1] - EMB = args.embedding_dim[-1] - CAUSAL = args.causal[-1] - BATCH_SIZE = args.batch_size[-1] - ACTIVATION = args.activations[-1] - - df_filtered = df[ - 
(df["activation"] == ACTIVATION) - & (df["heads"] == HEADS) - & (df["autocast"] == AMP) - & (df["embed_dim"] == EMB) - & (df["causal"] == CAUSAL) - & (df["batch_size"] == BATCH_SIZE) - ] - - df_filtered.sort_values( - by=["sequence_length", "max_memory"], ascending=[False, True], inplace=True - ) - sns.barplot( - x="sequence_length", - y="max_memory", - hue="attention_name", - data=df_filtered, - palette="Set2", - ) - plt.xlabel("Sequence length") - plt.ylabel("Max memory being used") - plt.title("Memory use") - plt.savefig("memory_vs_attention.png") - plt.clf() - - df_filtered.sort_values( - by=["sequence_length", "run_time"], ascending=[False, True], inplace=True - ) - sns.barplot( - x="sequence_length", - y="run_time", - hue="attention_name", - data=df_filtered, - palette="Set2", - ) - plt.xlabel("Sequence length") - plt.ylabel("Average epoch time") - plt.title("Runtime") - plt.savefig("runtime_vs_attention.png") - - -if __name__ == "__main__": - # Get the user requests - parser = argparse.ArgumentParser( - "Benchmark different attention mechanisms on various sequence lengths" - ) - parser.add_argument( - "-a", "--attentions", nargs="+", default=list(ATTENTION_REGISTRY.keys()) - ) - parser.add_argument("-mlp", "--mlp", nargs="+", default=["MLP"]) - parser.add_argument( - "-act", "--activations", nargs="+", default=[a.value for a in Activation] - ) - parser.add_argument( - "-emb", "--embedding_dim", nargs="+", default=[64, 128, 256], type=int - ) - parser.add_argument( - "-sl", "--sequence_length", nargs="+", default=[576, 1024], type=int - ) - parser.add_argument("-bs", "--batch_size", nargs="+", default=[8, 16, 32], type=int) - parser.add_argument("-heads", "--heads", nargs="+", default=[8, 16], type=int) - - parser.add_argument("-fp16", "--pytorch_amp", nargs="+", default=[True], type=bool) - parser.add_argument("-causal", "--causal", nargs="+", default=[False], type=bool) - parser.add_argument("-plot", "--plot", action="store_true", default=False) - parser.add_argument( - "-profile", - "--profile", - help="Pofile the runtime and memory", - action="store_true", - default=False, - ) - - args = parser.parse_args() - - # Setup the test configs - constants = { - "device": torch.device("cuda") if _use_cuda else torch.device("cpu"), - "num_warmup": 5, - "num_steps": 10, - "dropout": 0.1, - "attn_dropout": 0.1, - "residual_dropout": 0.1, - "profile": args.profile, - } - - param_grid = { - "autocast": args.pytorch_amp, - "causal": args.causal, - "heads": args.heads, - "activation": args.activations, - "attention_name": args.attentions, - "feedforward_name": args.mlp, - "sequence_length": args.sequence_length, - "embed_dim": args.embedding_dim, - "batch_size": args.batch_size, - } - - print( - "Testing the following parameters: \n", - json.dumps(param_grid, sort_keys=True, indent=4), - ) - - grid = ParameterGrid(param_grid) - - grid_outputs = [] - - for params in tqdm(grid, total=len(grid)): - outputs = test_xformer_encoder_block(**constants, **params) # type: ignore - results = {**outputs, **params} - grid_outputs.append(results) - - print(json.dumps(grid_outputs, sort_keys=True, indent=4)) - - # Optional plots - if args.plot: - plot(args, grid_outputs) diff --git a/xformers/benchmarks/benchmark_pytorch_transformer.py b/xformers/benchmarks/benchmark_pytorch_transformer.py deleted file mode 100644 index 2137c557f..000000000 --- a/xformers/benchmarks/benchmark_pytorch_transformer.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 
-# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -import random -import time -from typing import Any, Dict, List, Tuple - -import torch -import triton -from torch.cuda.amp import autocast - -from xformers.benchmarks.utils import TestCase, pretty_print -from xformers.factory.model_factory import xFormer, xFormerConfig - -VOCAB = 8 - - -def _data(device, batch, seq, emb, vocab=VOCAB): - # The dummy task is basically to classify sequences, either pure zeroes or some noise - input_a = torch.zeros((batch, seq, emb), device=device) - input_b = (torch.rand((batch, seq, emb), device=device) * vocab).abs() - - target_a = torch.zeros((batch, seq), device=device) - target_b = torch.ones((batch, seq), device=device) - - if random.random() > 0.5: - return torch.cat([input_a, input_b], dim=0), torch.cat( - [target_a, target_b], dim=0 - ) - - return torch.cat([input_b, input_a], dim=0), torch.cat([target_b, target_a], dim=0) - - -def reset_seeds(): - torch.manual_seed(0) - random.seed(0) - - -def step( - model: torch.nn.Module, - optim: torch.optim.Optimizer, - batch: int, - seq: int, - emb: int, - device, -): - model.train() - optim.zero_grad() - batch, target = _data(device, batch, seq, emb) - - try: - outputs = model(batch) - except TypeError: - # Pytorch encoder exposes target explicitly - outputs = model(batch, tgt=batch) - - loss = torch.norm(torch.mean(outputs, dim=-1) - target) - loss.backward() - - # Clip grad and error out if we're producing NaNs - torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0, norm_type=2.0, error_if_nonfinite=True) # type: ignore - optim.step() - - return loss.item() - - -def evaluate(model: torch.nn.Module, batch: int, seq: int, emb: int, device): - reset_seeds() - batch, target = _data(device, batch, seq, emb) - model.eval() - try: - outputs = model(batch) - except TypeError: - # Pytorch decoder exposes target explicitly - outputs = model(batch, tgt=batch) - - return torch.norm(torch.mean(outputs, dim=-1) - target).item() - - -def train(model, optimizer, name, steps, batch: int, seq: int, emb: int, device): - # Dummy training, just checking that both options give the same results - # Same seed for everyone - start = time.time() - for _ in range(steps): - _ = step(model, optimizer, batch, seq, emb, device) - - torch.cuda.synchronize() - print("Trained {} in {:.3}s".format(name, time.time() - start)) - - -def bench_pytorch_encoder( - shapes: List[Tuple[int, int, int]], - activation: str, - n_heads: int, - dropout: float = 0.1, - layers: int = 2, - device: torch.device = torch.device("cuda"), - steps: int = 20, - use_amp: bool = True, -): - results_time: Dict[str, Any] = {} - results_memory: Dict[str, Any] = {} - - for shape in shapes: - batch, seq, emb = shape - - # Build both a xFormers and Pytorch model - reset_seeds() - - model_xformers = xFormer.from_config( - xFormerConfig( - [ - { - "block_type": "encoder", - "dim_model": emb, - "num_layers": layers, - "residual_norm_style": "post", - "multi_head_config": { - "num_heads": n_heads, - "residual_dropout": dropout, - "use_separate_proj_weight": True, - "bias": True, - "attention": { - "name": "scaled_dot_product", - "dropout": dropout, - "causal": False, - "seq_len": seq, - }, - "dim_model": emb, - }, - "feedforward_config": { - "name": "FusedMLP", - "dropout": dropout, - "activation": activation, - "hidden_layer_multiplier": 4, - "dim_model": emb, - }, - }, - ] - ) - ).to(device) - print(model_xformers) - - reset_seeds() - 
model_pytorch = torch.nn.TransformerEncoder( - torch.nn.TransformerEncoderLayer( - d_model=emb, - nhead=n_heads, - dim_feedforward=4 * emb, - dropout=dropout, - activation=activation, - layer_norm_eps=1e-05, - batch_first=True, # (batch, seq, feature) - device=device, - ), - num_layers=layers, - ) - print(model_pytorch) - - optim_xformers = torch.optim.Adam(model_xformers.parameters(), lr=1e-3) - optim_pytorch = torch.optim.Adam(model_pytorch.parameters(), lr=1e-3) - - def run_training(model, optimizer, label): - with autocast(enabled=use_amp): - eval_start = evaluate(model, batch, seq, emb, device) - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - torch.cuda.synchronize() - - train(model, optimizer, label, steps, batch, seq, emb, device) - max_memory = torch.cuda.max_memory_allocated() // 2**20 - print(f"Peak memory use: {max_memory}MB") - - eval_stop = evaluate(model, batch, seq, emb, device) - print(f"Trained from {eval_start} to {eval_stop}\n") - return eval_start, eval_stop, max_memory - - # Save the memory being used by both - memory: Dict[str, Any] = {"pytorch": [], "xformers": []} - - def torch_train(): - _, _, max_memory = run_training(model_pytorch, optim_pytorch, "pytorch") - memory["pytorch"].append(max_memory) - - def xformers_train(): - _, _, max_memory = run_training(model_xformers, optim_xformers, "xformers") - memory["xformers"].append(max_memory) - - for testcase in [ - TestCase( - xformers_train, - "xformers", - ), - TestCase( - torch_train, - "pytorch", - ), - ]: - time, _, _ = triton.testing.do_bench(lambda: testcase.function()) - key = "emb {} - heads {}".format(emb, n_heads) - if key not in results_time: - results_time[key] = {} - results_memory[key] = {} - - results_time[key][testcase.name] = f"{time/1000:.1f}" - - median_memory = sorted(memory[testcase.name])[ - len(memory[testcase.name]) // 2 - ] - results_memory[key][testcase.name] = median_memory - - pretty_print( - results_time, - title="\n--- Transformer training benchmark - runtime ---", - units="s", - ) - pretty_print( - results_memory, - title="\n--- Transformer training benchmark - memory use ---", - units="MB", - ) - - -if __name__ == "__main__": - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - bench_pytorch_encoder( - shapes=[(16, 128, 128), (2, 1024, 1024), (1, 1024, 2048)], - activation="gelu", - n_heads=8, - dropout=0.1, - layers=2, - device=device, - steps=20, - use_amp=True, - ) diff --git a/xformers/factory/block_factory.py b/xformers/factory/block_factory.py index 113f440fe..4868f9f4d 100644 --- a/xformers/factory/block_factory.py +++ b/xformers/factory/block_factory.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn +from xformers._deprecation_warning import deprecated_function from xformers.components import ( PatchEmbeddingConfig, PostNorm, @@ -97,6 +98,7 @@ class xFormerEncoderBlock(torch.nn.Module): def __init__(self, config: xFormerEncoderConfig, **kwargs): super().__init__() + deprecated_function(self) self.reversible_f = None self.reversible_g = None @@ -245,6 +247,7 @@ class xFormerDecoderBlock(torch.nn.Module): def __init__(self, config: xFormerDecoderConfig, **kwargs): super().__init__() + deprecated_function(self) # If this layer is the first one, and a pose encoding as been requested if ( diff --git a/xformers/factory/model_factory.py b/xformers/factory/model_factory.py index 4e15f9528..d4be38876 100644 --- a/xformers/factory/model_factory.py +++ b/xformers/factory/model_factory.py @@ -10,6 +10,7 @@ import torch +from 
xformers._deprecation_warning import deprecated_function from xformers.components import reversible as rv from xformers.components.residual import ResidualNormStyle, get_deepnorm_coefficients from xformers.factory.block_configs import ( @@ -101,6 +102,7 @@ def __init__( self.tie_embedding_weights = tie_embedding_weights self.weight_init = weight_init + deprecated_function(self) class xFormer(torch.nn.Module): @@ -117,6 +119,7 @@ def __init__( This is only a helper and can easily be bypassed """ super().__init__() + deprecated_function(self) if isinstance(stack_configs, Dict): stack_configs = list(stack_configs.values()) diff --git a/xformers/helpers/hierarchical_configs.py b/xformers/helpers/hierarchical_configs.py index a05a38154..60b9564c7 100644 --- a/xformers/helpers/hierarchical_configs.py +++ b/xformers/helpers/hierarchical_configs.py @@ -8,6 +8,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional +from xformers._deprecation_warning import deprecated_function from xformers.components.residual import ResidualNormStyle @@ -39,6 +40,7 @@ def get_hierarchical_configuration( Contrary to more "classical" Transformer architectures, which conserve the sequence/context length across layers, hierarchical Transformers trade the sequence length for the embedding dimension """ + deprecated_function(get_hierarchical_configuration) base_config: Dict[str, Any] = { "block_type": "encoder",
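
A minimal, illustrative sketch of how the new deprecation path behaves once the changes above are applied. LegacyFactoryThing below is a hypothetical stand-in; the real deprecated entry points are xFormerEncoderBlock, xFormerDecoderBlock, xFormerConfig, xFormer and get_hierarchical_configuration, which now call deprecated_function when instantiated or called:

import warnings

from xformers._deprecation_warning import deprecated_function


class LegacyFactoryThing:
    """Hypothetical stand-in for one of the deprecated factory classes."""

    def __init__(self):
        # Mirrors what the deprecated constructors now do
        deprecated_function(self)


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LegacyFactoryThing()

# A FutureWarning carrying repr(instance) and the deprecation notice is emitted
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Downstream code that still relies on the factory can silence the notice explicitly:
warnings.filterwarnings("ignore", category=FutureWarning, module=r"xformers\.factory")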