Hi Community,
I have the following code for a two-layer BiLSTM; it is a fairly large implementation.
from utils import *
import tensorflow as tf
from functools import reduce
from operator import mul
import numpy as np
def rnn_encoder(rnn_type, x, mask, d_dim, n_layer, initializer, pooling=None):
    seq_len = tf.reduce_sum(mask, 1)
    with tf.variable_scope('rnn_{}'.format(0)):
        h_outs, h_final = rnn_encoder_single_layer(rnn_type, x, seq_len, d_dim, initializer)
    if n_layer > 1:
        for i in range(1, n_layer):
            with tf.variable_scope('rnn_{}'.format(i)):
                h_outs, h_final = rnn_encoder_single_layer(rnn_type, h_outs, seq_len, d_dim, initializer)
    if pooling is None:
        h = h_outs
    else:
        if pooling == 'max':
            h = dynamic_max_pooling(h_outs, mask)
        elif pooling == 'last':
            h = h_final
    h_size = d_dim * (int(rnn_type.find('bi') > -1) + 1)
    return h, h_size, h_final

def dynamic_max_pooling(x, mask):
    mask = tf.expand_dims(tf.cast(mask, tf.float32) - 1., 2)
    return tf.reduce_max(x + mask * 999999, axis=1)

def rnn_encoder_single_layer(rnn_type, input, seq_len, d_dim, initializer):
    bi_directional = rnn_type.find('bi') == 0
    with tf.variable_scope('rnn_encoder'):
        batch_size = tf.shape(seq_len)[0]
        cell = {}
        initial_state = {}
        for d in ['forward', 'backward'] if bi_directional else ['forward']:
            with tf.variable_scope(d):
                cell[d] = tf.contrib.rnn.CoupledInputForgetGateLSTMCell(
                    d_dim, forget_bias=1.0, initializer=initializer, state_is_tuple=True)
                i_cell = tf.get_variable(d + 'i_cell', shape=[1, d_dim], dtype=tf.float32, initializer=initializer)
                i_output = tf.get_variable(d + 'i_output', shape=[1, d_dim], dtype=tf.float32, initializer=initializer)
                c_states = tf.tile(i_cell, tf.stack([batch_size, 1]))
                h_states = tf.tile(i_output, tf.stack([batch_size, 1]))
                initial_state[d] = tf.contrib.rnn.LSTMStateTuple(c_states, h_states)
        if bi_directional:
            raw_outputs, (fw, bw) = tf.nn.bidirectional_dynamic_rnn(
                cell['forward'], cell['backward'], input, dtype=tf.float32, sequence_length=seq_len,
                initial_state_fw=initial_state['forward'], initial_state_bw=initial_state['backward'])
            raw_outputs = tf.concat(raw_outputs, axis=2)
            final_states = tf.concat([fw.h, bw.h], axis=1)
        else:
            raw_outputs, final_states = tf.nn.dynamic_rnn(
                cell['forward'], input, dtype=tf.float32, sequence_length=seq_len,
                initial_state=initial_state['forward'])
            final_states = final_states.h
    return raw_outputs, final_states

def shape_list(x):
    ps = x.get_shape().as_list()
    ts = tf.shape(x)
    return [ts[i] if ps[i] is None else ps[i] for i in range(len(ps))]

def ffnn(inputs, h_dim, out_dim, n_layer, initializer,
         act_mid=tf.nn.tanh, act_last=tf.nn.tanh, use_bias=True,
         scope_name='ffnn'):
    h = inputs
    for i in range(n_layer - 1):
        with tf.variable_scope('{}_{}'.format(scope_name, i)):
            w = tf.get_variable('W', shape=[h_dim, h_dim], initializer=initializer)
            if len(inputs.get_shape().as_list()) == 3:
                w = tf.tile(tf.expand_dims(w, axis=0), [tf.shape(inputs)[0], 1, 1])
            h = tf.matmul(h, w)
            if use_bias:
                b = tf.get_variable('b', shape=[h_dim], initializer=tf.zeros_initializer())
                h = h + b
            if act_mid is not None:
                h = act_mid(h)
    with tf.variable_scope('{}_{}'.format(scope_name, n_layer - 1)):
        W = tf.get_variable('W', shape=[h_dim, out_dim], initializer=initializer)
        if len(inputs.get_shape().as_list()) == 3:
            W = tf.tile(tf.expand_dims(W, axis=0), [tf.shape(inputs)[0], 1, 1])
        y_raw = tf.matmul(h, W)
        if use_bias:
            b = tf.get_variable('b', shape=[out_dim], initializer=tf.zeros_initializer())
            y_raw = y_raw + b
        if act_last is not None:
            y_raw = act_last(y_raw)
    return y_raw

def create_mixed_trainable_emb(dim, n_ws, n_special_ws, initializer, is_trainable, scope_name):
    """Reserve index 0 for non-trainable padding, followed by
    n_ws pretrained embeddings and n_special_ws trainable embeddings.
    """
    with tf.variable_scope(scope_name):
        pad_e = tf.get_variable(
            "pad_e",
            dtype=tf.float32,
            shape=[1, dim],
            initializer=tf.zeros_initializer(),
            trainable=False)
        e = tf.get_variable(
            "e",
            dtype=tf.float32,
            shape=[n_ws, dim],
            initializer=initializer,
            trainable=is_trainable)
        special_e = tf.get_variable(
            "special_e",
            dtype=tf.float32,
            shape=[n_special_ws, dim],
            initializer=initializer,
            trainable=True)
        mixed_e = tf.concat([pad_e, e, special_e], axis=0)
    return mixed_e, e

def dropout(x, keep_prob, is_train):
    if is_train and keep_prob > 0:
        x = tf.nn.dropout(x, keep_prob)
    return x

class BaseModel:
    def create_holder(self):
        self.x = tf.placeholder(tf.int32, [None, None, None])
        self.x_mask = tf.placeholder(tf.int32, [None, None, None])
        self.xw = tf.placeholder(tf.int32, [None, None])
        self.xw_mask = tf.placeholder(tf.int32, [None, None])
        self.y = tf.placeholder(tf.int32, [None, None])
        self.drp_keep = tf.placeholder(tf.float32)
        self.lr = tf.placeholder(tf.float32)

    def __init__(self, params, session):
        self.initializer = tf.contrib.layers.xavier_initializer()
        self.params = params
        self.session = session
        self.ignored_vars = []

    def build_graph(self):
        pass

    def get_hybrid_emb(self, x, x_mask, xw, ce, we, initializer):
        with tf.variable_scope('hybrid_emb'):
            dims = shape_list(x)
            n_w, cpw = dims[-2], dims[-1]
            dims_1 = reduce(mul, dims[:-1], 1)
            dims_2 = reduce(mul, dims[:-2], 1)
            x_flat = tf.reshape(x, (dims_1, cpw))
            x_mask_flat = tf.reshape(x_mask, (dims_1, cpw))
            x_rep = tf.nn.embedding_lookup(ce, x_flat)
            with tf.variable_scope('c_encoder'):
                xc_rep, xc_size, _ = rnn_encoder(
                    self.params['c_rnn_type'], x_rep, x_mask_flat, self.params['c_h_dim'],
                    self.params['c_rnn_layers'], initializer, self.params['c_pooling'])
            xc_rep = tf.reshape(xc_rep, [dims_2, n_w, xc_size])
            xw_fat = tf.reshape(xw, (dims_2, n_w))
            xw_rep = tf.nn.embedding_lookup(we, xw_fat)
            w_rep = tf.concat([xc_rep, xw_rep], axis=-1)
            hw_size = self.params['we_dim'] + xc_size
        return w_rep, hw_size

    def encode(self, x, x_mask, xw, xw_mask,
               drp_keep, initializer, reuse):
        with tf.variable_scope('encoder', reuse=reuse):
            n_ce = max(self.params['c2id'].values()) + 1
            ce, _ = create_mixed_trainable_emb(
                self.params['ce_dim'], n_ce - len(RESERVE_TKS), len(RESERVE_TKS) - 1,
                initializer, True, 'ce')
            n_we = max(self.params['w2id'].values()) + 1
            assert n_we == len(self.params['w2id'])
            we, we_core = create_mixed_trainable_emb(
                self.params['we_dim'],
                n_we - len(RESERVE_TKS),
                len(RESERVE_TKS) - 1,
                initializer,
                self.params['we_trainable'],
                'we'
            )
            self.we = we
            self.we_core = we_core
            self.ignored_vars.append(we_core)
            with tf.variable_scope('main_text'):
                w_rep, _ = self.get_hybrid_emb(x, x_mask, xw, ce, we, initializer)
                w_rep = tf.nn.dropout(w_rep, drp_keep)
                with tf.variable_scope('w_encoder'):
                    if self.params['w_encoder'] == 'rnn':
                        hw, hw_size, h_last = rnn_encoder(
                            self.params['w_rnn_type'], w_rep, xw_mask, self.params['w_h_dim'],
                            self.params['w_rnn_layers'], initializer, None)
        return hw, hw_size, h_last

    def hw_pooling(self, hw, mask, hw_size, out_size, use_l2, initializer,
                   scope_name, reuse):
        with tf.variable_scope(scope_name, reuse=reuse):
            h = dynamic_max_pooling(hw, mask)
            with tf.variable_scope('ff0', reuse=reuse):
                h = ffnn(h, hw_size, out_size, 1, initializer)
            if use_l2:
                h_ = tf.nn.l2_normalize(h, -1)
            else:
                h_ = h
        return h_, h

    def fill_token_idx(self, words_batch, b_size, mlen, cpw):
        x = np.ones((b_size, mlen, cpw)) * self.params['c2id'][PAD]
        x_mask = np.zeros((b_size, mlen, cpw))
        xw = np.ones((b_size, mlen)) * self.params['w2id'][PAD]
        xw_mask = np.zeros((b_size, mlen))
        for i, words in enumerate(words_batch):
            c_ids = np.ones((mlen, cpw)) * self.params['c2id'][PAD]
            c_mask = np.zeros((mlen, cpw))
            for j, w in enumerate([[START_S]] + words[:mlen - 2] + [[END_S]]):
                tmp = [self.params['c2id'][c] if c in self.params['c2id'] else self.params['c2id'][UNK]
                       for c in [START_W] + list(w)[:self.params['max_c_per_w']] + [END_W]]
                c_ids[j, :len(tmp)] = tmp
                c_mask[j, :len(tmp)] = 1
                if w[0] in RESERVE_TKS:
                    w_ = w[0]
                elif self.params['we_is_lw']:
                    w_ = w.lower()
                else:
                    w_ = w
                if w_ in self.params['w2id']:
                    xw[i, j] = self.params['w2id'][w_]
                else:
                    if self.params['try_lw_emb'] and w_.lower() in self.params['w2id']:
                        xw[i, j] = self.params['w2id'][w_.lower()]
                    else:
                        xw[i, j] = self.params['w2id'][UNK]
            x[i] = c_ids
            x_mask[i] = c_mask
            xw_mask[i, :len(words) + 2] = 1
        return x, x_mask, xw, xw_mask

class NameEncoder(BaseModel):
    def __init__(self, params, session):
        super().__init__(params, session)
        self.cf = params['tasks']['diff_name']
        self.params = params
        self.build_graph()

    def build_graph(self):
        self.create_holder()
        hw, hw_size, h_last = self.encode(self.x,
                                          self.x_mask,
                                          self.xw,
                                          self.xw_mask,
                                          self.drp_keep,
                                          self.initializer,
                                          reuse=tf.AUTO_REUSE)
        h, h_raw = self.hw_pooling(hw,
                                   self.xw_mask,
                                   hw_size,
                                   self.params['w_h_dim'],
                                   self.cf['use_l2_norm'],
                                   self.initializer,
                                   scope_name='diff_name_classifier',
                                   reuse=tf.AUTO_REUSE)
        self.h = h

    def get_fd_data(self, data_batch):
        max_len = max([len(row) for row in data_batch]) + 2
        b_size = len(data_batch)
        chars_per_word = self.params['max_c_per_w'] + 2
        x, x_mask, xw, xw_mask = self.fill_token_idx(data_batch, b_size, max_len, chars_per_word)
        data_dict = {
            self.x: x,
            self.x_mask: x_mask,
            self.xw: xw,
            self.xw_mask: xw_mask,
            self.drp_keep: 1.,
        }
        return data_dict
I want to convert this implementation to PyTorch source code. Is there an automated way to do this, or a mapping from TensorFlow methods to their PyTorch equivalents?
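For context, here is a rough, untested sketch of how I imagine the sequence encoder could be expressed in PyTorch (`tf.nn.bidirectional_dynamic_rnn` → `nn.LSTM(bidirectional=True)` with `pack_padded_sequence`, `tf.nn.embedding_lookup` → `nn.Embedding`, `tf.nn.dropout(x, keep_prob)` → `nn.Dropout(p=1 - keep_prob)`). The class and parameter names are my own placeholders, and PyTorch has no built-in equivalent of `tf.contrib.rnn.CoupledInputForgetGateLSTMCell`, so a standard `nn.LSTM` is assumed as a stand-in and would not be numerically equivalent:

```python
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class RNNEncoder(nn.Module):
    """Rough PyTorch analogue of rnn_encoder: a stacked (Bi)LSTM over padded
    sequences, with optional max pooling over time.
    NOTE: nn.LSTM is a standard LSTM, not TF's CoupledInputForgetGateLSTMCell."""

    def __init__(self, in_dim, d_dim, n_layer, bidirectional=True):
        super().__init__()
        self.rnn = nn.LSTM(in_dim, d_dim, num_layers=n_layer,
                           batch_first=True, bidirectional=bidirectional)
        self.out_dim = d_dim * (2 if bidirectional else 1)

    def forward(self, x, mask, pooling=None):
        # x: (batch, time, in_dim); mask: (batch, time), 1 = real token, 0 = padding
        seq_len = mask.long().sum(dim=1)
        packed = pack_padded_sequence(x, seq_len.cpu(), batch_first=True,
                                      enforce_sorted=False)
        packed_out, (h_n, _) = self.rnn(packed)
        h_outs, _ = pad_packed_sequence(packed_out, batch_first=True,
                                        total_length=x.size(1))
        if pooling == 'max':
            # same idea as dynamic_max_pooling: push padded positions to -inf
            neg = (1.0 - mask.float()).unsqueeze(2) * -1e9
            return (h_outs + neg).max(dim=1).values
        if pooling == 'last':
            # concat the top layer's final forward/backward hidden states
            # (assumes bidirectional=True)
            return torch.cat([h_n[-2], h_n[-1]], dim=1)
        return h_outs
```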
Additionally, there are pretrained model checkpoints that are loaded into TensorFlow; here is the link: BNE/models/BNE_SGsc at master · minhcp/BNE · GitHub.
How do I convert these TensorFlow model weights to PyTorch weights? Is there any software to do that?
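Unless there is a dedicated tool, my current plan would be to read the checkpoint variables with `tf.train.list_variables` / `tf.train.load_checkpoint` and copy the arrays into a PyTorch `state_dict` by name, roughly like the sketch below (the checkpoint path and the variable/parameter names are placeholders, not the actual BNE layout). I'm also unsure how well this can work for the LSTM itself, since the coupled input-forget-gate cell's weights don't map one-to-one onto `nn.LSTM` parameters:

```python
import tensorflow as tf
import torch

ckpt = 'BNE_SGsc/model'  # hypothetical path/prefix of the downloaded checkpoint

# 1. Inspect what the checkpoint contains (variable names and shapes).
for name, shape in tf.train.list_variables(ckpt):
    print(name, shape)

# 2. Read every variable into a numpy array.
reader = tf.train.load_checkpoint(ckpt)
tf_weights = {name: reader.get_tensor(name)
              for name, _ in tf.train.list_variables(ckpt)}

# 3. Copy the arrays into the matching PyTorch parameters by hand, e.g.
#    (names are illustrative, not the actual BNE variable names; TF dense
#    kernels are stored [in, out], so nn.Linear weights need a transpose):
# state_dict = {
#     'word_emb.weight': torch.from_numpy(tf_weights['encoder/we/e']),
#     'proj.weight':     torch.from_numpy(tf_weights['encoder/ffnn_0/W'].T),
# }
# pt_model.load_state_dict(state_dict, strict=False)
```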
Thank you in advance for your help,
Megh