Discrepancy in Output Shapes When Running ssd_mobilenet_v3_float TFLite Model in TFLM

I am trying to run the ssd_mobilenet_v3_float TFLite model using TFLM (TensorFlow Lite for Microcontrollers). However, I am encountering discrepancies in the output shapes when comparing the results obtained from Python and C++ code.

Below are the Python and C++ codes I used to verify the input and output shapes, along with their respective outputs. Is there an issue with my model conversion, or am I using the TFLM library incorrectly?

Any insights or suggestions would be greatly appreciated.

Link to ssd_mobilenet_v3_float.tflite (Google Drive):

Python

import tensorflow as tf

# Load the TFLite model and allocate tensors so tensor metadata is populated.
interpreter = tf.lite.Interpreter(model_path='ssd_mobilenet_v3_float.tflite')
interpreter.allocate_tensors()

# Fetch tensor descriptions for the model's inputs and outputs.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()


def _dump_details(details):
    # Print name, shape, and dtype for each tensor description dict.
    for detail in details:
        print(f"Name: {detail['name']}")
        print(f"Shape: {detail['shape']}")
        print(f"Type: {detail['dtype']}")


print("Input Details:")
_dump_details(input_details)

print("\nOutput Details:")
_dump_details(output_details)

output
Input Details:
Name: normalized_input_image_tensor
Shape: [ 1 320 320 3]
Type: <class 'numpy.float32'>

Output Details:
Name: TFLite_Detection_PostProcess
Shape: [ 1 10 4]
Type: <class 'numpy.float32'>
Name: TFLite_Detection_PostProcess:1
Shape: [ 1 10]
Type: <class 'numpy.float32'>
Name: TFLite_Detection_PostProcess:2
Shape: [ 1 10]
Type: <class 'numpy.float32'>
Name: TFLite_Detection_PostProcess:3
Shape: [1]
Type: <class 'numpy.float32'>

TFLM C++

#include <cstdio>
#include <ctime>

#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
#include "tensorflow/lite/schema/schema_generated.h"

#include "ssd_mobilenet_v3_float.h"

// Model input geometry: 320x320 RGB, float32 (matches the Python shape dump).
constexpr int kNumCols = 320;
constexpr int kNumRows = 320;
constexpr int kNumChannels = 3;
constexpr int kMaxImageSize = kNumCols * kNumRows * kNumChannels;
// Class indices — not referenced in the code shown here; presumably carried
// over from the TFLM person_detection example. TODO confirm they are used.
constexpr int kPersonIndex = 1;
constexpr int kNotAPersonIndex = 0;
// Arena backing all tensor allocations (~16 MB). NOTE(review): far larger
// than typical MCU RAM, so this presumably runs on a host; the size looks
// hand-tuned — verify the real minimum via interpreter.arena_used_bytes().
constexpr int tensor_arena_size = 16420 * 1024;
uint8_t tensor_arena[tensor_arena_size];

// Fills the model input buffer with zeros (stand-in for a real image capture).
// image_width/image_height/channels describe the buffer layout; image_data
// must hold at least image_width * image_height * channels floats.
TfLiteStatus GetImage(int image_width, int image_height, int channels, float* image_data) {
  const int element_count = image_width * image_height * channels;
  float* cursor = image_data;
  float* const end = image_data + element_count;
  while (cursor != end) {
    *cursor++ = 0.0f;
  }
  return kTfLiteOk;
}

int main() {
  // Time the whole setup + inference path.
  clock_t start, end;
  double cpu_time_used;
  start = clock();

  // Map the flatbuffer compiled into ssd_mobilenet_v3_float.h (no copy).
  const tflite::Model* model = ::tflite::GetModel(ssd_mobilenet_v3_float_tflite);
  if (model->version() != TFLITE_SCHEMA_VERSION) {
    // version() is a fixed-width unsigned integer; cast explicitly so %lu is
    // correct everywhere instead of assuming unsigned long == uint32_t.
    printf("Model provided is schema version %lu not equal to supported version %d.\n",
           static_cast<unsigned long>(model->version()), TFLITE_SCHEMA_VERSION);
    return -1;
  }

  // Register exactly the ops this graph uses; capacity <20> leaves headroom.
  tflite::MicroMutableOpResolver<20> micro_op_resolver;
  micro_op_resolver.AddConv2D();
  micro_op_resolver.AddAdd();
  micro_op_resolver.AddDepthwiseConv2D();
  micro_op_resolver.AddReshape();
  micro_op_resolver.AddConcatenation();
  micro_op_resolver.AddLogistic();
  micro_op_resolver.AddDetectionPostprocess();
  micro_op_resolver.AddPack();
  micro_op_resolver.AddMul();
  micro_op_resolver.AddMean();
  micro_op_resolver.AddHardSwish();

  tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena, tensor_arena_size);

  // Plan memory for every tensor inside tensor_arena.
  TfLiteStatus allocate_status = interpreter.AllocateTensors();
  if (allocate_status != kTfLiteOk) {
    printf("Tensor allocation failed\n");
    return -1;
  }

  // Fill the input tensor with a dummy (all-zero) image.
  TfLiteTensor* input = interpreter.input(0);
  GetImage(kNumCols, kNumRows, kNumChannels, input->data.f);

  TfLiteStatus invoke_status = interpreter.Invoke();
  if (invoke_status != kTfLiteOk) {
    printf("Model invocation failed\n");
    return -1;
  }

  // outputs_size() returns size_t; iterate with size_t to avoid a
  // signed/unsigned comparison (and print it with %zu).
  for (size_t i = 0; i < interpreter.outputs_size(); ++i) {
    TfLiteTensor* output = interpreter.output(i);
    printf("Output %zu:\n", i);
    printf("  dims->size: %d\n", output->dims->size);
    // Only dims->data[0 .. dims->size - 1] is valid. The original code read
    // data[0..3] unconditionally, so with dims->size == 0 it printed whatever
    // bytes followed the array — the garbage values in the pasted output.
    // NOTE(review): dims->size == 0 for DETECTION_POSTPROCESS outputs suggests
    // TFLM reports these shapes differently from the reference TFLite runtime
    // — confirm against the TFLM kernel before relying on output->dims.
    for (int d = 0; d < output->dims->size; ++d) {
      printf("  dims->data[%d]: %d\n", d, output->dims->data[d]);
    }
    printf("  type: %d\n", output->type);
  }

  end = clock();
  cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
  printf("Function took %f seconds to execute\n", cpu_time_used);

  return 0;
}

output

Output 0:
dims->size: 0
dims->data[0]: 28
dims->data[1]: 1766606420
dims->data[2]: 1147102580
dims->data[3]: 1667593317
type: 1
Output 1:
dims->size: 0
dims->data[0]: 30
dims->data[1]: 1766606420
dims->data[2]: 1147102580
dims->data[3]: 1667593317
type: 1
Output 2:
dims->size: 0
dims->data[0]: 30
dims->data[1]: 1766606420
dims->data[2]: 1147102580
dims->data[3]: 1667593317
type: 1
Output 3:
dims->size: 0
dims->data[0]: 30
dims->data[1]: 1766606420
dims->data[2]: 1147102580
dims->data[3]: 1667593317
type: 1
Function took 0.204229 seconds to execute