Introduce build_conformer_conv() and build_sdpa() in wav2vec2 builder. #97

Merged · 2 commits · Oct 10, 2023
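This PR refactors the wav2vec 2.0 encoder builder: the inline construction of the Conformer convolution module and of the SDPA layer moves into two overridable factory methods, build_conformer_conv() and build_sdpa(). As part of the cleanup, the Conformer convolution options (causal_depthwise_conv, conv_norm_type) and the Shaw relative-position attention support (the "relative_shaw" encoder type, shaw_rel_pos_sdpa_config, and the ShawRelativePositionSDPAConfig dataclass) are dropped from the base config; models that need such variants can subclass the builder instead. A minimal sketch of that pattern, assuming the enclosing class is fairseq2's Wav2Vec2EncoderBuilder and reusing the ShawRelativePositionSDPA module this file previously imported:

```python
# Hypothetical downstream builder (not part of this PR) restoring Shaw-style
# relative position attention via the new build_sdpa() hook. Assumes the
# enclosing builder class is fairseq2's Wav2Vec2EncoderBuilder and that
# ShawRelativePositionSDPA is exported by fairseq2.nn.transformer.
from fairseq2.models.wav2vec2.builder import Wav2Vec2EncoderBuilder
from fairseq2.nn.transformer import SDPA, ShawRelativePositionSDPA


class ShawEncoderBuilder(Wav2Vec2EncoderBuilder):
    def build_sdpa(self) -> SDPA:
        # "relative_shaw" is no longer recognized by the base builder; this
        # subclass reintroduces it. 64 is an illustrative clipping value.
        if self.config.pos_encoder_type == "relative_shaw":
            return ShawRelativePositionSDPA(
                self.config.model_dim,
                self.config.num_encoder_attn_heads,
                64,  # max_left_rel_pos
                attn_dropout_p=self.config.attn_dropout_p,
                device=self.device,
                dtype=self.dtype,
            )

        # Fall back to the stock "relative"/default behavior.
        return super().build_sdpa()
```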
src/fairseq2/models/w2vbert/builder.py (6 changes: 0 additions & 6 deletions)
@@ -43,9 +43,6 @@ def _encoder_600m() -> Wav2Vec2EncoderConfig:
         layer_drop_p=0.0,
         norm_order=TransformerNormOrder.POST,
         depthwise_conv_kernel_size=31,
-        causal_depthwise_conv=False,
-        conv_norm_type="batch_norm",
-        shaw_rel_pos_sdpa_config=None,
     )


@@ -77,9 +74,6 @@ def _encoder_300m() -> Wav2Vec2EncoderConfig:
         layer_drop_p=0.0,
         norm_order=TransformerNormOrder.POST,
         depthwise_conv_kernel_size=31,
-        causal_depthwise_conv=False,
-        conv_norm_type="batch_norm",
-        shaw_rel_pos_sdpa_config=None,
     )


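The w2vbert presets only restated what appear to be the defaults for these three fields, so they can drop them outright: the new build_conformer_conv() below constructs ConformerConvolution without the causal_depthwise_conv and norm_type arguments, presumably relying on the module's own defaults (non-causal, batch norm).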
src/fairseq2/models/wav2vec2/builder.py (80 changes: 21 additions & 59 deletions)
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 from dataclasses import dataclass
-from typing import List, Literal, Optional, Tuple
+from typing import List, Optional, Tuple

 from torch.nn import GELU, SiLU

@@ -34,7 +34,6 @@
     MultiheadAttention,
     RelativePositionalEncoding,
     RelativePositionSDPA,
-    ShawRelativePositionSDPA,
     StandardFeedForwardNetwork,
     StandardMultiheadAttention,
     StandardTransformerEncoder,
@@ -47,20 +46,6 @@
 from fairseq2.typing import DataType, Device


-@dataclass
-class ShawRelativePositionSDPAConfig:
-    """Holds the configuration of the :class:`ShawRelativePositionSDPA` module."""
-
-    max_left_rel_pos: int
-    """The left clipping value for relative positions."""
-
-    max_right_rel_pos: Optional[int]
-    """The right clipping value for relative positions."""
-
-    use_rel_pos_values: bool = False
-    """If True, also uses relative position values to compute relative attention."""
-
-
 @dataclass
 class Wav2Vec2EncoderConfig:
     """Holds the configuration of a wav2vec 2.0 encoder."""
@@ -112,8 +97,8 @@ class Wav2Vec2EncoderConfig:
     sample_fbank_every_k: int

     # Position Encoder
-    pos_encoder_type: Literal["conv", "relative", "relative_shaw", "rotary"]
-    """The type of position encoder."""
+    pos_encoder_type: str
+    """The type of position encoder ('conv', 'relative', 'rotary')."""

     # Convolutional Position Encoder
     pos_encoder_depth: int
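Relaxing pos_encoder_type from a Literal to a plain str is what makes the subclassing approach workable: a downstream builder can route a custom value such as "relative_shaw" through its own build_sdpa() override (as in the sketch above) without the base config's type rejecting it.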
@@ -154,16 +139,6 @@ class Wav2Vec2EncoderConfig:
     depthwise_conv_kernel_size: int
     """The kernel size of depthwise convolutions in Conformer blocks."""

-    causal_depthwise_conv: bool
-    """If True, uses a causal depthwise convolution similar to that described in
-    Section 2.1 of :cite:t:`https://doi.org/10.48550/arxiv.1609.03499`."""
-
-    conv_norm_type: Literal["batch_norm", "layer_norm"]
-    """The type of normalization to use in the Conformer convolution module."""
-
-    shaw_rel_pos_sdpa_config: Optional[ShawRelativePositionSDPAConfig]
-    """The parameters for ShawRelativePositionSDPA."""
-

 def _encoder_base() -> Wav2Vec2EncoderConfig:
     layer_descs = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
@@ -195,9 +170,6 @@ def _encoder_base() -> Wav2Vec2EncoderConfig:
         layer_drop_p=0.05,
         norm_order=TransformerNormOrder.POST,
         depthwise_conv_kernel_size=0,
-        causal_depthwise_conv=False,
-        conv_norm_type="batch_norm",
-        shaw_rel_pos_sdpa_config=None,
     )


@@ -336,14 +308,7 @@ def build_conformer_block(self) -> TransformerEncoderLayer:

         self_attn = self.build_attention()

-        conv = ConformerConvolution(
-            self.config.model_dim,
-            self.config.depthwise_conv_kernel_size,
-            causal_depthwise_conv=self.config.causal_depthwise_conv,
-            norm_type=self.config.conv_norm_type,
-            device=self.device,
-            dtype=self.dtype,
-        )
+        conv = self.build_conformer_conv()

         ffn2 = self.build_ffn(use_swish=True)
@@ -369,8 +334,18 @@ def build_attention(self) -> MultiheadAttention:
         else:
             pos_encoder = None

-        sdpa: SDPA
+        sdpa = self.build_sdpa()
+
+        return StandardMultiheadAttention(
+            self.config.model_dim,
+            self.config.num_encoder_attn_heads,
+            pos_encoder=pos_encoder,
+            sdpa=sdpa,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+    def build_sdpa(self) -> SDPA:
         if self.config.pos_encoder_type == "relative":
             if self.rel_pos_encoding is None:
                 self.rel_pos_encoding = RelativePositionalEncoding(
@@ -380,34 +355,21 @@ def build_attention(self) -> MultiheadAttention:
                     dtype=self.dtype,
                 )

-            sdpa = RelativePositionSDPA(
+            return RelativePositionSDPA(
                 self.config.model_dim,
                 self.config.num_encoder_attn_heads,
                 self.rel_pos_encoding,
                 attn_dropout_p=self.config.attn_dropout_p,
                 device=self.device,
                 dtype=self.dtype,
             )
-        elif self.config.pos_encoder_type == "relative_shaw":
-            sdpa_config = self.config.shaw_rel_pos_sdpa_config
-            sdpa = ShawRelativePositionSDPA(
-                self.config.model_dim,
-                self.config.num_encoder_attn_heads,
-                sdpa_config.max_left_rel_pos,
-                max_right_rel_pos=sdpa_config.max_right_rel_pos,
-                use_rel_pos_values=sdpa_config.use_rel_pos_values,
-                attn_dropout_p=self.config.attn_dropout_p,
-                device=self.device,
-                dtype=self.dtype,
-            )
-        else:
-            sdpa = create_default_sdpa(self.config.attn_dropout_p)

-        return StandardMultiheadAttention(
-            self.config.model_dim,
-            self.config.num_encoder_attn_heads,
-            pos_encoder=pos_encoder,
-            sdpa=sdpa,
-            device=self.device,
-            dtype=self.dtype,
-        )
+        return create_default_sdpa(self.config.attn_dropout_p)
+
+    def build_conformer_conv(self) -> ConformerConvolution:
+        return ConformerConvolution(
+            self.config.model_dim,
+            self.config.depthwise_conv_kernel_size,
+            device=self.device,
+            dtype=self.dtype,
+        )
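The same hook pattern applies to the convolution module. A minimal sketch, again assuming Wav2Vec2EncoderBuilder is the enclosing class and mirroring the keyword arguments the old inline construction passed:

```python
# Hypothetical subclass (not part of this PR) that swaps in a causal,
# layer-norm Conformer convolution by overriding only build_conformer_conv().
# The import path for ConformerConvolution is assumed from fairseq2's layout.
from fairseq2.models.conformer import ConformerConvolution
from fairseq2.models.wav2vec2.builder import Wav2Vec2EncoderBuilder


class CausalConformerEncoderBuilder(Wav2Vec2EncoderBuilder):
    def build_conformer_conv(self) -> ConformerConvolution:
        # Same keyword arguments the pre-refactor builder passed inline,
        # now with the variant values hard-coded in the subclass.
        return ConformerConvolution(
            self.config.model_dim,
            self.config.depthwise_conv_kernel_size,
            causal_depthwise_conv=True,
            norm_type="layer_norm",
            device=self.device,
            dtype=self.dtype,
        )
```

Moving such variants behind overridable methods lets the base Wav2Vec2EncoderConfig shed fields that none of the bundled presets set to non-default values, which matches the deletions above.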