Some examples:
Residual Dropout: We apply dropout [27] to the output of each sub-layer, before it is added to the
sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the
positional encodings in both the encoder and decoder stacks.
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 500, 1)] 0
__________________________________________________________________________________________________
layer_normalization (LayerNorma (None, 500, 1) 2 input_1[0][0]
__________________________________________________________________________________________________
multi_head_attention (MultiHead (None, 500, 1) 7169 layer_normalization[0][0]
layer_normalization[0][0]
__________________________________________________________________________________________________
dropout (Dropout) (None, 500, 1) 0 multi_head_attention[0][0]
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 500, 1) 0 dropout[0][0]
input_1[0][0]
__________________________________________________________________________________________________
layer_normalization_1 (LayerNor (None, 500, 1) 2 tf.__operators__.add[0][0]
__________________________________________________________________________________________________
conv1d (Conv1D) (None, 500, 4) 8 layer_normalization_1[0][0]
__________________________________________________________________________________________________
dropout_1 (Dropout) (None, 500, 4) 0 conv1d[0][0]
__________________________________________________________________________________________________
conv1d_1 (Conv1D) (None, 500, 1) 5 dropout_1[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_1 (TFOpLam (None, 500, 1) 0 conv1d_1[0][0]
tf.__operators__.add[0][0]
__________________________________________________________________________________________________
layer_normalization_2 (LayerNor (None, 500, 1) 2 tf.__operators__.add_1[0][0]
__________________________________________________________________________________________________
multi_head_attention_1 (MultiHe (None, 500, 1) 7169 layer_normalization_2[0][0]
layer_normalization_2[0][0]
__________________________________________________________________________________________________
dropout_2 (Dropout) (None, 500, 1) 0 multi_head_attention_1[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_2 (TFOpLam (None, 500, 1) 0 dropout_2[0][0]
tf.__operators__.add_1[0][0]
__________________________________________________________________________________________________
layer_normalization_3 (LayerNor (None, 500, 1) 2 tf.__operators__.add_2[0][0]
__________________________________________________________________________________________________
conv1d_2 (Conv1D) (None, 500, 4) 8 layer_normalization_3[0][0]
__________________________________________________________________________________________________
dropout_3 (Dropout) (None, 500, 4) 0 conv1d_2[0][0]
__________________________________________________________________________________________________
conv1d_3 (Conv1D) (None, 500, 1) 5 dropout_3[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_3 (TFOpLam (None, 500, 1) 0 conv1d_3[0][0]
tf.__operators__.add_2[0][0]
__________________________________________________________________________________________________
layer_normalization_4 (LayerNor (None, 500, 1) 2 tf.__operators__.add_3[0][0]
__________________________________________________________________________________________________
multi_head_attention_2 (MultiHe (None, 500, 1) 7169 layer_normalization_4[0][0]
layer_normalization_4[0][0]
__________________________________________________________________________________________________
dropout_4 (Dropout) (None, 500, 1) 0 multi_head_attention_2[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_4 (TFOpLam (None, 500, 1) 0 dropout_4[0][0]
tf.__operators__.add_3[0][0]
__________________________________________________________________________________________________
layer_normalization_5 (LayerNor (None, 500, 1) 2 tf.__operators__.add_4[0][0]
__________________________________________________________________________________________________
conv1d_4 (Conv1D) (None, 500, 4) 8 layer_normalization_5[0][0]
__________________________________________________________________________________________________
dropout_5 (Dropout) (None, 500, 4) 0 conv1d_4[0][0]
__________________________________________________________________________________________________
conv1d_5 (Conv1D) (None, 500, 1) 5 dropout_5[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_5 (TFOpLam (None, 500, 1) 0 conv1d_5[0][0]
tf.__operators__.add_4[0][0]
__________________________________________________________________________________________________
layer_normalization_6 (LayerNor (None, 500, 1) 2 tf.__operators__.add_5[0][0]
__________________________________________________________________________________________________
multi_head_attention_3 (MultiHe (None, 500, 1) 7169 layer_normalization_6[0][0]
layer_normalization_6[0][0]
__________________________________________________________________________________________________
dropout_6 (Dropout) (None, 500, 1) 0 multi_head_attention_3[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_6 (TFOpLam (None, 500, 1) 0 dropout_6[0][0]
tf.__operators__.add_5[0][0]
__________________________________________________________________________________________________
layer_normalization_7 (LayerNor (None, 500, 1) 2 tf.__operators__.add_6[0][0]
__________________________________________________________________________________________________
conv1d_6 (Conv1D) (None, 500, 4) 8 layer_normalization_7[0][0]
__________________________________________________________________________________________________
dropout_7 (Dropout) (None, 500, 4) 0 conv1d_6[0][0]
__________________________________________________________________________________________________
conv1d_7 (Conv1D) (None, 500, 1) 5 dropout_7[0][0]
__________________________________________________________________________________________________
tf.__operators__.add_7 (TFOpLam (None, 500, 1) 0 conv1d_7[0][0]
tf.__operators__.add_6[0][0]
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 500) 0 tf.__operators__.add_7[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 128) 64128 global_average_pooling1d[0][0]
__________________________________________________________________________________________________
dropout_8 (Dropout) (None, 128) 0 dense[0][0]
__________________________________________________________________________________________________
dense_1 (Dense) (None, 2) 258 dropout_8[0][0]
==================================================================================================