SpeechToText(
  (frontend): ConvFrontEnd(
    (conv1): Conv2dLayer(
      (conv_layer): Conv2d(1, 64, kernel_size=[3, 3], stride=(2, 2), padding=(0, 1))
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (conv2): Conv2dLayer(
      (conv_layer): Conv2d(64, 128, kernel_size=[3, 3], stride=(2, 2), padding=(0, 1))
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (output_layer): Linear(in_features=1280, out_features=256, bias=True)
  )
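
The frontend above is a convolutional subsampler: two 3x3 convolutions with stride (2, 2) cut the time axis roughly by four before the transformer sees it. Because padding=(0, 1) pads only the frequency axis, the projection's in_features=1280 pins the geometry to 128 channels x 10 frequency bins, which points to an input of roughly 40 filterbank features. A minimal sketch under those assumptions (40-dim input, and a functional ReLU between the convolutions; no activation module appears in the dump, so the nonlinearity's exact type is a guess):

import torch
import torch.nn as nn

class ConvFrontEnd(nn.Module):
    """Two stride-2 convs subsample the spectrogram, then the flattened
    channel x frequency grid is projected down to d_model."""
    def __init__(self, in_feats=40, d_model=256):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=(0, 1))
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=(0, 1))
        freq = in_feats
        for _ in range(2):                      # 40 -> 20 -> 10 frequency bins
            freq = (freq + 2 - 3) // 2 + 1
        self.output_layer = nn.Linear(128 * freq, d_model)  # 1280 -> 256

    def forward(self, x):                       # x: (batch, time, in_feats)
        x = x.unsqueeze(1)                      # (B, 1, T, F) -- one input channel
        x = torch.relu(self.conv1(x))           # ReLU is assumed, see note above
        x = torch.relu(self.conv2(x))
        b, c, t, f = x.size()
        x = x.transpose(1, 2).reshape(b, t, c * f)  # flatten channels x freq
        return self.output_layer(x)             # (B, ~T/4, d_model)

As a shape check, ConvFrontEnd()(torch.randn(2, 100, 40)) comes out as (2, 24, 256): with no time padding, each convolution trims a frame at the edges on top of the stride-2 halving. The encoder portion of the dump follows.
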
  (encoder): TransformerEncoder(
    (pos_emb): PositionalEncoding(
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (blocks): ModuleList(
      (0): TransformerEncoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (2): TransformerEncoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (3): TransformerEncoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
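
Each of the four encoder blocks above is a Transformer layer at d_model=256. The fused qvk_proj (256 -> 768) produces queries, values, and keys in a single matmul; the head count is not recorded in the repr, so 4 heads of 64 dims is an assumption below. One detail is worth flagging: w_1 maps 256 -> 2048 but w_2 expects 1024 in, which only lines up if the activation halves the width, and a GLU does exactly that (2048 -> 1024), so the sketch uses one. The PositionalEncoding shows only its dropout because a sinusoid table would live in a non-parameter buffer, which a module repr does not print. The post-norm wiring of norm1/norm2 with dropout1/dropout2 is likewise one plausible reading:

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadedSelfAttention(nn.Module):
    def __init__(self, d_model=256, n_heads=4, dropout=0.0):
        super().__init__()
        self.n_heads, self.d_head = n_heads, d_model // n_heads
        self.qvk_proj = nn.Linear(d_model, 3 * d_model)   # 256 -> 768, fused
        self.output_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split(self, y):                        # (B, T, d_model) -> (B, H, T, d_head)
        b, t, _ = y.size()
        return y.reshape(b, t, self.n_heads, self.d_head).transpose(1, 2)

    def forward(self, x, mask=None):            # x: (B, T, 256)
        # q/v/k order mirrors the qvk_proj name; the true layout is internal
        q, v, k = self.qvk_proj(x).chunk(3, dim=-1)
        q, k, v = self._split(q), self._split(k), self._split(v)
        scores = q @ k.transpose(-2, -1) / self.d_head ** 0.5
        if mask is not None:                    # boolean mask, True = attend
            scores = scores.masked_fill(~mask, float("-inf"))
        attn = self.dropout(torch.softmax(scores, dim=-1))
        out = (attn @ v).transpose(1, 2).reshape(x.size(0), -1, self.n_heads * self.d_head)
        return self.output_proj(out)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model=256, d_ff=2048, dropout=0.0):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)          # 256 -> 2048
        self.w_2 = nn.Linear(d_ff // 2, d_model)     # 1024 -> 256 after the GLU halving
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.glu(self.w_1(x), dim=-1)))

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model=256, dropout=0.1):
        super().__init__()
        self.slf_attn = MultiHeadedSelfAttention(d_model)
        self.feed_forward = PositionwiseFeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):            # post-norm residual wiring (assumed)
        x = self.norm1(x + self.dropout1(self.slf_attn(x, mask)))
        x = self.norm2(x + self.dropout2(self.feed_forward(x)))
        return x

The decoder portion of the dump follows.
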
  (decoder): TransformerDecoder(
    (embedding): Embedding(4233, 256)
    (pos_emb): PositionalEncoding(
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (blocks): ModuleList(
      (0): TransformerDecoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (src_attn): MultiHeadedCrossAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (q_proj): Linear(in_features=256, out_features=256, bias=True)
          (vk_proj): Linear(in_features=256, out_features=512, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerDecoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (src_attn): MultiHeadedCrossAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (q_proj): Linear(in_features=256, out_features=256, bias=True)
          (vk_proj): Linear(in_features=256, out_features=512, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
      (2): TransformerDecoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (src_attn): MultiHeadedCrossAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (q_proj): Linear(in_features=256, out_features=256, bias=True)
          (vk_proj): Linear(in_features=256, out_features=512, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
      (3): TransformerDecoderLayer(
        (slf_attn): MultiHeadedSelfAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (qvk_proj): Linear(in_features=256, out_features=768, bias=True)
        )
        (src_attn): MultiHeadedCrossAttention(
          (output_proj): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (q_proj): Linear(in_features=256, out_features=256, bias=True)
          (vk_proj): Linear(in_features=256, out_features=512, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=2048, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
    )
    (output_layer): Linear(in_features=256, out_features=4233, bias=True)
  )
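
The decoder blocks repeat the encoder recipe and insert a cross-attention step between the masked self-attention and the feed-forward: q_proj (256 -> 256) takes queries from the decoder stream, while the fused vk_proj (256 -> 512) computes values and keys from the encoder output. Three norms and three dropouts wrap the three sublayers, and the 4233-entry embedding and output_layer bracket the stack (the repr does not say whether their weights are tied). A sketch reusing the classes above, with the same assumptions (4 heads, post-norm, v-before-k layout taken from the vk_proj name):

import torch
import torch.nn as nn

class MultiHeadedCrossAttention(nn.Module):
    def __init__(self, d_model=256, n_heads=4, dropout=0.0):
        super().__init__()
        self.n_heads, self.d_head = n_heads, d_model // n_heads
        self.q_proj = nn.Linear(d_model, d_model)        # queries from the decoder
        self.vk_proj = nn.Linear(d_model, 2 * d_model)   # 256 -> 512, fused, from encoder
        self.output_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split(self, y):                        # (B, T, d_model) -> (B, H, T, d_head)
        return y.reshape(y.size(0), -1, self.n_heads, self.d_head).transpose(1, 2)

    def forward(self, x, memory, mask=None):    # x: (B, T_dec, 256), memory: (B, T_enc, 256)
        q = self._split(self.q_proj(x))
        v, k = self.vk_proj(memory).chunk(2, dim=-1)   # v/k order assumed from the name
        v, k = self._split(v), self._split(k)
        scores = q @ k.transpose(-2, -1) / self.d_head ** 0.5
        if mask is not None:                    # boolean mask over encoder positions
            scores = scores.masked_fill(~mask, float("-inf"))
        attn = self.dropout(torch.softmax(scores, dim=-1))
        out = (attn @ v).transpose(1, 2).reshape(x.size(0), x.size(1), -1)
        return self.output_proj(out)

class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model=256, dropout=0.1):
        super().__init__()
        self.slf_attn = MultiHeadedSelfAttention(d_model)     # from the encoder sketch
        self.src_attn = MultiHeadedCrossAttention(d_model)
        self.feed_forward = PositionwiseFeedForward(d_model)  # GLU variant, as above
        self.norm1, self.norm2, self.norm3 = [nn.LayerNorm(d_model) for _ in range(3)]
        self.dropout1, self.dropout2, self.dropout3 = [nn.Dropout(dropout) for _ in range(3)]

    def forward(self, x, memory, tgt_mask=None, src_mask=None):
        x = self.norm1(x + self.dropout1(self.slf_attn(x, tgt_mask)))          # masked self-attn
        x = self.norm2(x + self.dropout2(self.src_attn(x, memory, src_mask)))  # cross-attn
        x = self.norm3(x + self.dropout3(self.feed_forward(x)))                # feed-forward
        return x

The loss modules close out the dump.
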
  (crit): LabelSmoothingLoss()
  (assistor): CTCAssistor(
    (output_layer): Linear(in_features=256, out_features=4233, bias=True)
    (ctc_crit): CTCLoss()
  )
)
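
Training is the usual hybrid CTC/attention arrangement: crit applies label-smoothed cross-entropy to the decoder's 4233-way predictions, while the CTCAssistor attaches an auxiliary CTC head directly to the encoder states through its own 256 -> 4233 projection (a separate module from the decoder's output_layer). The two losses are typically interpolated with a fixed weight; the 0.3 below is a common choice and the blank index of 0 an assumption, since neither is recorded in the dump:

import torch.nn as nn

class CTCAssistor(nn.Module):
    """Auxiliary CTC head on the encoder output. blank=0 and zero_infinity
    are assumptions; the dump only shows the projection and the loss."""
    def __init__(self, d_model=256, vocab_size=4233, blank=0):
        super().__init__()
        self.output_layer = nn.Linear(d_model, vocab_size)  # 256 -> 4233
        self.ctc_crit = nn.CTCLoss(blank=blank, zero_infinity=True)

    def forward(self, enc_out, enc_lens, targets, target_lens):
        # nn.CTCLoss expects (T, B, V) log-probabilities
        log_probs = self.output_layer(enc_out).log_softmax(dim=-1).transpose(0, 1)
        return self.ctc_crit(log_probs, targets, enc_lens, target_lens)

def joint_loss(att_loss, ctc_loss, ctc_weight=0.3):
    # Hypothetical interpolation weight; 0.3 is a common hybrid setting,
    # not something recorded in the dump above.
    return ctc_weight * ctc_loss + (1.0 - ctc_weight) * att_loss
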