Accuracy issue when computing gradients

Hi, I encountered an accuracy issue when computing the backprop of some layers through TensorFlow ops. The gradients were computed in two different ways:

  1. tf.gradients
  2. computing the gradients directly through TF APIs; taking softmax as an example, the gradients can be computed as follows (a NumPy sketch of the same formula is given right after this list):
  sum_channels = math_ops.reduce_sum(grad_softmax * softmax, -1, keepdims=True)
  grad = (grad_softmax - sum_channels) * softmax
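
For reference, here is a minimal NumPy sketch of the same formula (my own illustration, not code from TF), evaluated in float64 so it can serve as a higher-precision baseline when comparing the two float32 results:

import numpy as np

def softmax_grad_ref(grad_softmax, softmax):
    # Same formula as above: grad = (dy - sum(dy * y, axis=-1, keepdims=True)) * y,
    # computed in float64 as a higher-precision reference.
    dy = grad_softmax.astype(np.float64)
    y = softmax.astype(np.float64)
    sum_channels = np.sum(dy * y, axis=-1, keepdims=True)
    return (dy - sum_channels) * y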

But I found that the results from the two implementations are not exactly the same. Does anyone know what the problem is?
Another question: when training in TensorFlow, are the gradients used by the optimizer the same as those computed through tf.gradients? Thanks.
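
For the second question, this is a minimal sketch of the check I have in mind (the toy variable and loss are just illustrative): compare the gradients returned by Optimizer.compute_gradients with those from tf.gradients on the same loss.

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 4])
w = tf.get_variable("w", shape=[4, 1])
loss = tf.reduce_mean(tf.matmul(x, w) ** 2)

opt = tf.train.GradientDescentOptimizer(0.1)
opt_grads = opt.compute_gradients(loss, var_list=[w])  # list of (gradient, variable) pairs
manual_grads = tf.gradients(loss, [w])                  # list of gradient tensors

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    np_x = np.random.rand(8, 4).astype(np.float32)
    g_opt, g_manual = sess.run([opt_grads[0][0], manual_grads[0]],
                               feed_dict={x: np_x})
    print("optimizer vs tf.gradients match:", np.allclose(g_opt, g_manual))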

The complete test code is as follows (tested with TF 1.15):

import numpy as np
import tensorflow as tf

batch_size = 20
num_heads = 8
from_seq_len = 50
to_seq_len = 50

class testSoftmaxBackprop:
    def __init__(self,
                 batch_size,
                 num_heads,
                 from_seq_len,
                 to_seq_len):
        self.batch_size = batch_size
        self.num_heads = num_heads
        self.from_seq_len = from_seq_len
        self.to_seq_len = to_seq_len

        self.input_data = tf.placeholder(tf.float32, shape=[
            self.num_heads * self.batch_size,
            self.from_seq_len,
            self.to_seq_len
        ])

        self.out = tf.nn.softmax(self.input_data)

        # self.out = tf.identity(softmax)


    def forward(self, np_data):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            out = sess.run([self.out],
                           feed_dict={
                               self.input_data:np_data
                           })
        return out[0]


    def back_auto(self, np_grads, np_data, np_softmax):
        grads = tf.placeholder(tf.float32,
                               shape=[
                                   self.num_heads * self.batch_size,
                                   self.from_seq_len,
                                   self.to_seq_len
                               ])

        # Autodiff gradient of the softmax output w.r.t. the input, seeded with grad_ys.
        g = tf.gradients(self.out, [self.input_data], grad_ys=grads)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            g_out = sess.run(g,
                             feed_dict={
                                 self.input_data:np_data,
                                 grads:np_grads,
                             })

        return g_out

    def back_api(self, np_grads, np_data, np_softmax):
        tf_grads = tf.constant(np_grads, dtype=tf.float32)
        # tf_data = tf.constant(np_data, dtype=tf.float32)
        tf_softmax = tf.constant(np_softmax, dtype=tf.float32)

        # Manual softmax backprop: grad = (dy - sum(dy * y, axis=-1, keepdims=True)) * y
        sum_channels = tf.reduce_sum(tf_grads * tf_softmax, axis=-1, keepdims=True)
        d_out = (tf_grads - sum_channels) * tf_softmax  # [h*N, T_q, T_k]

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            grad = sess.run([d_out])
        return grad


def main():
    np.random.seed(0)
    np_data = np.random.rand(num_heads * batch_size, from_seq_len, to_seq_len)
    np_data = np_data.astype(np.float32)
    np_grad = np.random.rand(num_heads * batch_size, from_seq_len, to_seq_len)
    np_grad = np_grad.astype(np.float32)

    test_back = testSoftmaxBackprop(batch_size, num_heads, from_seq_len, to_seq_len)

    np_softmax = test_back.forward(np_data)

    grad_auto = test_back.back_auto(np_grad, np_data, np_softmax)
    grad_api = test_back.back_api(np_grad, np_data, np_softmax)

    api_data = grad_api[0]
    auto_data = grad_auto[0]

    api_save = api_data.reshape(-1)
    auto_save = auto_data.reshape(-1)
    np.savetxt("api_data.txt", api_save)
    np.savetxt("auto_data.txt", auto_save)

    print("Results:")
    print("Comparison :" + str(np.allclose(api_data, auto_data, atol = 5e-6)))
    print("max diff " + str(np.fabs(api_data - auto_data).max()))

if __name__ == "__main__":
    main()

Does anyone have an idea about this issue?