diff --git a/cv/detection/ssd/tensorflow/dataset/dataset_common.py b/cv/detection/ssd/tensorflow/dataset/dataset_common.py
index 9c17c0eea470df2d18c119b195e5313782f78aed..791c809746052b2e9e7dd219a2eaabcbc871401f 100644
--- a/cv/detection/ssd/tensorflow/dataset/dataset_common.py
+++ b/cv/detection/ssd/tensorflow/dataset/dataset_common.py
@@ -1,24 +1,8 @@
-# Copyright 2018 Changan Wang
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import tensorflow.compat.v1 as tf
-import tf_slim as slim
-
 
 VOC_LABELS = {
     'none': (0, 'Background'),
@@ -44,195 +28,100 @@ VOC_LABELS = {
     'tvmonitor': (20, 'Indoor'),
 }
 
-COCO_LABELS = {
-    "bench": (14, 'outdoor') ,
-    "skateboard": (37, 'sports') ,
-    "toothbrush": (80, 'indoor') ,
-    "person": (1, 'person') ,
-    "donut": (55, 'food') ,
-    "none": (0, 'background') ,
-    "refrigerator": (73, 'appliance') ,
-    "horse": (18, 'animal') ,
-    "elephant": (21, 'animal') ,
-    "book": (74, 'indoor') ,
-    "car": (3, 'vehicle') ,
-    "keyboard": (67, 'electronic') ,
-    "cow": (20, 'animal') ,
-    "microwave": (69, 'appliance') ,
-    "traffic light": (10, 'outdoor') ,
-    "tie": (28, 'accessory') ,
-    "dining table": (61, 'furniture') ,
-    "toaster": (71, 'appliance') ,
-    "baseball glove": (36, 'sports') ,
-    "giraffe": (24, 'animal') ,
-    "cake": (56, 'food') ,
-    "handbag": (27, 'accessory') ,
-    "scissors": (77, 'indoor') ,
-    "bowl": (46, 'kitchen') ,
-    "couch": (58, 'furniture') ,
-    "chair": (57, 'furniture') ,
-    "boat": (9, 'vehicle') ,
-    "hair drier": (79, 'indoor') ,
-    "airplane": (5, 'vehicle') ,
-    "pizza": (54, 'food') ,
-    "backpack": (25, 'accessory') ,
-    "kite": (34, 'sports') ,
-    "sheep": (19, 'animal') ,
-    "umbrella": (26, 'accessory') ,
-    "stop sign": (12, 'outdoor') ,
-    "truck": (8, 'vehicle') ,
-    "skis": (31, 'sports') ,
-    "sandwich": (49, 'food') ,
-    "broccoli": (51, 'food') ,
-    "wine glass": (41, 'kitchen') ,
-    "surfboard": (38, 'sports') ,
-    "sports ball": (33, 'sports') ,
-    "cell phone": (68, 'electronic') ,
-    "dog": (17, 'animal') ,
-    "bed": (60, 'furniture') ,
-    "toilet": (62, 'furniture') ,
-    "fire hydrant": (11, 'outdoor') ,
-    "oven": (70, 'appliance') ,
-    "zebra": (23, 'animal') ,
-    "tv": (63, 'electronic') ,
-    "potted plant": (59, 'furniture') ,
-    "parking meter": (13, 'outdoor') ,
-    "spoon": (45, 'kitchen') ,
-    "bus": (6, 'vehicle') ,
-    "laptop": (64, 'electronic') ,
-    "cup": (42, 'kitchen') ,
-    "bird": (15, 'animal') ,
-    "sink": (72, 'appliance') ,
-    "remote": (66, 'electronic') ,
-    "bicycle": (2, 'vehicle') ,
-    "tennis racket": (39, 'sports') ,
-    "baseball bat": (35, 'sports') ,
-    "cat": (16, 'animal') ,
-    "fork": (43, 'kitchen') ,
-    "suitcase": (29, 'accessory') ,
-    "snowboard": (32, 'sports') ,
-    "clock": (75, 'indoor') ,
-    "apple": (48, 'food') ,
-    "mouse": (65, 'electronic') ,
-    "bottle": (40, 'kitchen') ,
-    "frisbee": (30, 'sports') ,
-    "carrot": (52, 'food') ,
-    "bear": (22, 'animal') ,
, - "hot dog": (53, 'food') , - "teddy bear": (78, 'indoor') , - "knife": (44, 'kitchen') , - "train": (7, 'vehicle') , - "vase": (76, 'indoor') , - "banana": (47, 'food') , - "motorcycle": (4, 'vehicle') , - "orange": (50, 'food') - } - -# use dataset_inspect.py to get these summary data_splits_num = { 'train': 22136, 'val': 4952, } -def slim_get_batch(num_classes, batch_size, split_name, file_pattern, num_readers, num_preprocessing_threads, image_preprocessing_fn, anchor_encoder, num_epochs=None, is_training=True): - """Gets a dataset tuple with instructions for reading Pascal VOC dataset. - - Args: - num_classes: total class numbers in dataset. - batch_size: the size of each batch. - split_name: 'train' of 'val'. - file_pattern: The file pattern to use when matching the dataset sources (full path). - num_readers: the max number of reader used for reading tfrecords. - num_preprocessing_threads: the max number of threads used to run preprocessing function. - image_preprocessing_fn: the function used to dataset augumentation. - anchor_encoder: the function used to encoder all anchors. - num_epochs: total epoches for iterate this dataset. - is_training: whether we are in traing phase. - - Returns: - A batch of [image, shape, loc_targets, cls_targets, match_scores]. - """ +def tf1_get_batch(num_classes, batch_size, split_name, file_pattern, num_readers, num_preprocessing_threads, image_preprocessing_fn, anchor_encoder, num_epochs=None, is_training=True): if split_name not in data_splits_num: raise ValueError('split name %s was not recognized.' % split_name) - # Features in Pascal VOC TFRecords. - keys_to_features = { - 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), - 'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'), - 'image/filename': tf.FixedLenFeature((), tf.string, default_value=''), - 'image/height': tf.FixedLenFeature([1], tf.int64), - 'image/width': tf.FixedLenFeature([1], tf.int64), - 'image/channels': tf.FixedLenFeature([1], tf.int64), - 'image/shape': tf.FixedLenFeature([3], tf.int64), - 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64), - 'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64), - 'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64), - } - items_to_handlers = { - 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), - 'filename': slim.tfexample_decoder.Tensor('image/filename'), - 'shape': slim.tfexample_decoder.Tensor('image/shape'), - 'object/bbox': slim.tfexample_decoder.BoundingBox( - ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'), - 'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'), - 'object/difficult': slim.tfexample_decoder.Tensor('image/object/bbox/difficult'), - 'object/truncated': slim.tfexample_decoder.Tensor('image/object/bbox/truncated'), - } - decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers) - - labels_to_names = {} - for name, pair in VOC_LABELS.items(): - labels_to_names[pair[0]] = name - - dataset = slim.dataset.Dataset( - data_sources=file_pattern, - reader=tf.TFRecordReader, - decoder=decoder, - num_samples=data_splits_num[split_name], - items_to_descriptions=None, - num_classes=num_classes, - labels_to_names=labels_to_names) - - with 
-    with tf.name_scope('dataset_data_provider'):
-        provider = slim.dataset_data_provider.DatasetDataProvider(
-                    dataset,
-                    num_readers=num_readers,
-                    common_queue_capacity=32 * batch_size,
-                    common_queue_min=8 * batch_size,
-                    shuffle=is_training,
-                    num_epochs=num_epochs)
-
-    [org_image, filename, shape, glabels_raw, gbboxes_raw, isdifficult] = provider.get(['image', 'filename', 'shape',
-                                                                                       'object/label',
-                                                                                       'object/bbox',
-                                                                                       'object/difficult'])
-
-    if is_training:
-        # if all is difficult, then keep the first one
-        isdifficult_mask =tf.cond(tf.count_nonzero(isdifficult, dtype=tf.int32) < tf.shape(isdifficult)[0],
-                                lambda : isdifficult < tf.ones_like(isdifficult),
-                                lambda : tf.one_hot(0, tf.shape(isdifficult)[0], on_value=True, off_value=False, dtype=tf.bool))
-
-        glabels_raw = tf.boolean_mask(glabels_raw, isdifficult_mask)
-        gbboxes_raw = tf.boolean_mask(gbboxes_raw, isdifficult_mask)
-
-    # Pre-processing image, labels and bboxes.
-
-    if is_training:
-        image, glabels, gbboxes = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw)
+    def parse_tfrecord(example_proto):
+        keys_to_features = {
+            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
+            'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
+            'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
+            'image/shape': tf.FixedLenFeature([3], tf.int64),
+            'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+            'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+            'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+            'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
+            'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
+            'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
+        }
+        parsed_features = tf.parse_single_example(example_proto, keys_to_features)
+
+        image = tf.image.decode_jpeg(parsed_features['image/encoded'], channels=3)
+        shape = parsed_features['image/shape']
+        filename = parsed_features['image/filename']
+
+        xmin = tf.sparse.to_dense(parsed_features['image/object/bbox/xmin'])
+        ymin = tf.sparse.to_dense(parsed_features['image/object/bbox/ymin'])
+        xmax = tf.sparse.to_dense(parsed_features['image/object/bbox/xmax'])
+        ymax = tf.sparse.to_dense(parsed_features['image/object/bbox/ymax'])
+        gbboxes_raw = tf.stack([ymin, xmin, ymax, xmax], axis=1)
+        glabels_raw = tf.sparse.to_dense(parsed_features['image/object/bbox/label'])
+        isdifficult = tf.sparse.to_dense(parsed_features['image/object/bbox/difficult'])
+
+        return image, filename, shape, glabels_raw, gbboxes_raw, isdifficult
+
+    # Build the tf.data input pipeline.
+    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=is_training)  # supports glob patterns
+    dataset = dataset.interleave(
+        lambda x: tf.data.TFRecordDataset(x, num_parallel_reads=num_readers),
+        cycle_length=num_readers,
+        num_parallel_calls=tf.data.experimental.AUTOTUNE
+    )
+    dataset = dataset.map(parse_tfrecord, num_parallel_calls=num_preprocessing_threads)
+
+    total_samples = data_splits_num[split_name]
+    dataset = dataset.take(total_samples)
+    print(f"split_name:{split_name}, batch_size:{batch_size}, num_epochs:{num_epochs}, samples_per_epoch: {total_samples}, batch_nums_per_epoch:{total_samples // batch_size}")
+
+    if num_epochs is not None:
+        dataset = dataset.repeat(num_epochs)
     else:
-        image = image_preprocessing_fn(org_image, glabels_raw, gbboxes_raw)
-        glabels, gbboxes = glabels_raw, gbboxes_raw
-
-    gt_targets, gt_labels, gt_scores = anchor_encoder(glabels, gbboxes)
+        dataset = dataset.repeat()  # repeat indefinitely
-    return tf.train.batch([image, filename, shape, gt_targets, gt_labels, gt_scores],
-                    dynamic_pad=False,
-                    batch_size=batch_size,
-                    allow_smaller_final_batch=(not is_training),
-                    num_threads=num_preprocessing_threads,
-                    capacity=64 * batch_size)
+    if is_training:
+        # If every box in an image is difficult, keep only the first one;
+        # otherwise drop all difficult boxes.
+        dataset = dataset.map(lambda img, fn, sh, lbl, box, diff: (
+            img, fn, sh, tf.cond(
+                tf.count_nonzero(diff, dtype=tf.int32) < tf.shape(diff)[0],
+                lambda: tf.boolean_mask(lbl, diff < tf.ones_like(diff)),
+                lambda: tf.boolean_mask(lbl, tf.one_hot(0, tf.shape(diff)[0], on_value=True, off_value=False, dtype=tf.bool))
+            ),
+            tf.cond(
+                tf.count_nonzero(diff, dtype=tf.int32) < tf.shape(diff)[0],
+                lambda: tf.boolean_mask(box, diff < tf.ones_like(diff)),
+                lambda: tf.boolean_mask(box, tf.one_hot(0, tf.shape(diff)[0], on_value=True, off_value=False, dtype=tf.bool))
+            )
+        ), num_parallel_calls=num_preprocessing_threads)
+    else:
+        # Drop the 'difficult' flag here as well, so both branches feed
+        # five-element tuples into the preprocessing map below.
+        dataset = dataset.map(lambda img, fn, sh, lbl, box, diff: (img, fn, sh, lbl, box),
+                              num_parallel_calls=num_preprocessing_threads)
+
+    # Preprocess image, labels and bboxes.
+    def preprocess(image, filename, shape, glabels_raw, gbboxes_raw):
+        if is_training:
+            image, glabels, gbboxes = image_preprocessing_fn(image, glabels_raw, gbboxes_raw)
+        else:
+            image = image_preprocessing_fn(image, glabels_raw, gbboxes_raw)
+            glabels, gbboxes = glabels_raw, gbboxes_raw
+        return image, filename, shape, glabels, gbboxes
+
+    dataset = dataset.map(preprocess, num_parallel_calls=num_preprocessing_threads)
+
+    # Anchor encoding.
+    def encode(glabels, gbboxes):
+        gt_targets, gt_labels, gt_scores = anchor_encoder(glabels, gbboxes)
+        return gt_targets, gt_labels, gt_scores
+
+    dataset = dataset.map(lambda img, fn, sh, lbl, box: (
+        img, fn, sh, *encode(lbl, box)
+    ), num_parallel_calls=num_preprocessing_threads)
+
+    # Batching.
+    dataset = dataset.batch(batch_size, drop_remainder=True)
+    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+    # Create the iterator.
+    iterator = dataset.make_one_shot_iterator()
+    return iterator.get_next()  # returns (image, filename, shape, gt_targets, gt_labels, gt_scores)
\ No newline at end of file
diff --git a/cv/detection/ssd/tensorflow/net/ssd_net.py b/cv/detection/ssd/tensorflow/net/ssd_net.py
index c584a7f283597b0490c9f6b9fd519e5b7afbcbe3..52d7304a033bd9d4e48c996947f95e3778ace042 100644
--- a/cv/detection/ssd/tensorflow/net/ssd_net.py
+++ b/cv/detection/ssd/tensorflow/net/ssd_net.py
@@ -56,7 +56,7 @@ _USE_FUSED_BN = True
 # vgg_16/conv3/conv3_3/biases
 # vgg_16/conv1/conv1_2/weights
 
-class ReLuLayer(tf.layers.Layer):
+class ReLuLayer(tf.keras.layers.Layer):
     def __init__(self, name, **kwargs):
-        super(ReLuLayer, self).__init__(name=name, trainable=trainable, **kwargs)
+        super(ReLuLayer, self).__init__(name=name, **kwargs)
         self._name = name
@@ -72,9 +72,9 @@ class ReLuLayer(tf.layers.Layer):
 
 
 def forward_module(m, inputs, training=False):
-    if isinstance(m, tf.layers.BatchNormalization) or isinstance(m, tf.layers.Dropout):
-        return m.apply(inputs, training=training)
-    return m.apply(inputs)
+    if isinstance(m, tf.keras.layers.BatchNormalization) or isinstance(m, tf.keras.layers.Dropout):
+        return m(inputs, training=training)
+    return m(inputs)
 
 
 def get_backbone(backbone, training, **kwargs):
@@ -124,18 +124,18 @@ def ssd_conv_block(
     with tf.variable_scope(name):
         conv_blocks = []
         conv_blocks.append(
-                tf.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding=padding,
+                tf.keras.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding=padding,
                         data_format=data_format, activation=tf.nn.relu, use_bias=True,
                         kernel_initializer=kernel_initializer,
                         bias_initializer=tf.zeros_initializer(),
-                        name='{}_1'.format(name), _scope='{}_1'.format(name), _reuse=None)
+                        name='{}_1'.format(name))
             )
         conv_blocks.append(
-                tf.layers.Conv2D(filters=filters * 2, kernel_size=3, strides=strides, padding=padding,
+                tf.keras.layers.Conv2D(filters=filters * 2, kernel_size=3, strides=strides, padding=padding,
                         data_format=data_format, activation=tf.nn.relu, use_bias=True,
                         kernel_initializer=kernel_initializer,
                         bias_initializer=tf.zeros_initializer(),
-                        name='{}_2'.format(name), _scope='{}_2'.format(name), _reuse=None)
+                        name='{}_2'.format(name))
             )
 
     return conv_blocks
 
@@ -203,28 +203,28 @@ class VGG16Backbone(object):
         # VGG layers
         self._conv1_block = self.conv_block(2, 64, 3, (1, 1), 'conv1')
         # down_1
-        self._pool1 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool1')
+        self._pool1 = tf.keras.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool1')
         self._conv2_block = self.conv_block(2, 128, 3, (1, 1), 'conv2')
         # down_2
-        self._pool2 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool2')
+        self._pool2 = tf.keras.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool2')
         self._conv3_block = self.conv_block(3, 256, 3, (1, 1), 'conv3')
         # down_3
-        self._pool3 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool3')
+        self._pool3 = tf.keras.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool3')
         self._conv4_block = self.conv_block(3, 512, 3, (1, 1), 'conv4')
         # down_4
-        self._pool4 = tf.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool4')
+        self._pool4 = tf.keras.layers.MaxPooling2D(2, 2, padding='same', data_format=self._data_format, name='pool4')
         self._conv5_block = self.conv_block(3, 512, 3, (1, 1), 'conv5')
-        self._pool5 = tf.layers.MaxPooling2D(3, 1, padding='same', data_format=self._data_format, name='pool5')
-        self._conv6 = tf.layers.Conv2D(filters=1024, kernel_size=3, strides=1, padding='same', dilation_rate=6,
+        self._pool5 = tf.keras.layers.MaxPooling2D(3, 1, padding='same', data_format=self._data_format, name='pool5')
+        self._conv6 = tf.keras.layers.Conv2D(filters=1024, kernel_size=3, strides=1, padding='same', dilation_rate=6,
                             data_format=self._data_format, activation=tf.nn.relu, use_bias=True,
                             kernel_initializer=self._conv_initializer(),
                             bias_initializer=tf.zeros_initializer(),
-                            name='fc6', _scope='fc6', _reuse=None)
-        self._conv7 = tf.layers.Conv2D(filters=1024, kernel_size=1, strides=1, padding='same',
+                            name='fc6')
+        self._conv7 = tf.keras.layers.Conv2D(filters=1024, kernel_size=1, strides=1, padding='same',
                             data_format=self._data_format, activation=tf.nn.relu, use_bias=True,
                             kernel_initializer=self._conv_initializer(),
                             bias_initializer=tf.zeros_initializer(),
-                            name='fc7', _scope='fc7', _reuse=None)
+                            name='fc7')
 
     def l2_normalize(self, x, name):
         with tf.name_scope(name, "l2_normalize", [x]) as name:
@@ -239,13 +239,13 @@ class VGG16Backbone(object):
         # forward vgg layers
         for conv in self._conv1_block:
             inputs = forward_module(conv, inputs, training=training)
-        inputs = self._pool1.apply(inputs)
+        inputs = self._pool1(inputs)
         for conv in self._conv2_block:
             inputs = forward_module(conv, inputs, training=training)
-        inputs = self._pool2.apply(inputs)
+        inputs = self._pool2(inputs)
         for conv in self._conv3_block:
             inputs = forward_module(conv, inputs, training=training)
-        inputs = self._pool3.apply(inputs)
+        inputs = self._pool3(inputs)
         for conv in self._conv4_block:
             inputs = forward_module(conv, inputs, training=training)
         # conv4_3
@@ -258,13 +258,13 @@
                 feature_layers.append(tf.multiply(weight_scale, self.l2_normalize(inputs, name='norm'), name='rescale')
                             )
-        inputs = self._pool4.apply(inputs)
+        inputs = self._pool4(inputs)
         for conv in self._conv5_block:
             inputs = forward_module(conv, inputs, training=training)
-        inputs = self._pool5.apply(inputs)
+        inputs = self._pool5(inputs)
         # forward fc layers
-        inputs = self._conv6.apply(inputs)
-        inputs = self._conv7.apply(inputs)
+        inputs = self._conv6(inputs)
+        inputs = self._conv7(inputs)
         # fc7
         feature_layers.append(inputs)
 
@@ -275,11 +275,11 @@ class VGG16Backbone(object):
         conv_blocks = []
         for ind in range(1, num_blocks + 1):
             conv_blocks.append(
-                tf.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding='same',
+                tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding='same',
                             data_format=self._data_format, activation=tf.nn.relu, use_bias=True,
                             kernel_initializer=self._conv_initializer(),
                             bias_initializer=tf.zeros_initializer(),
-                            name='{}_{}'.format(name, ind), _scope='{}_{}'.format(name, ind), _reuse=None)
+                            name='{}_{}'.format(name, ind))
             )
         return conv_blocks
 
@@ -287,32 +287,32 @@ class VGG16Backbone(object):
     with tf.variable_scope(name):
         conv_bn_blocks = []
         conv_bn_blocks.append(
-            tf.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding='same',
+            tf.keras.layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding='same',
                         data_format=self._data_format, activation=None, use_bias=False,
                         kernel_initializer=self._conv_bn_initializer(),
                         bias_initializer=None,
-                        name='{}_1'.format(name), _scope='{}_1'.format(name), _reuse=None)
+                        name='{}_1'.format(name))
             )
         conv_bn_blocks.append(
-            tf.layers.BatchNormalization(axis=self._bn_axis, momentum=BN_MOMENTUM, epsilon=BN_EPSILON, fused=USE_FUSED_BN,
-                        name='{}_bn1'.format(name), _scope='{}_bn1'.format(name), _reuse=None)
+            tf.keras.layers.BatchNormalization(axis=self._bn_axis, momentum=BN_MOMENTUM, epsilon=BN_EPSILON, fused=USE_FUSED_BN,
+                        name='{}_bn1'.format(name))
            )
         conv_bn_blocks.append(
-            ReLuLayer('{}_relu1'.format(name), _scope='{}_relu1'.format(name), _reuse=None)
+            ReLuLayer('{}_relu1'.format(name))
            )
         conv_bn_blocks.append(
-            tf.layers.Conv2D(filters=filters * 2, kernel_size=3, strides=strides, padding='same',
+            tf.keras.layers.Conv2D(filters=filters * 2, kernel_size=3, strides=strides, padding='same',
                         data_format=self._data_format, activation=None, use_bias=False,
                         kernel_initializer=self._conv_bn_initializer(),
                         bias_initializer=None,
-                        name='{}_2'.format(name), _scope='{}_2'.format(name), _reuse=None)
+                        name='{}_2'.format(name))
            )
         conv_bn_blocks.append(
-            tf.layers.BatchNormalization(axis=self._bn_axis, momentum=BN_MOMENTUM, epsilon=BN_EPSILON, fused=USE_FUSED_BN,
-                        name='{}_bn2'.format(name), _scope='{}_bn2'.format(name), _reuse=None)
+            tf.keras.layers.BatchNormalization(axis=self._bn_axis, momentum=BN_MOMENTUM, epsilon=BN_EPSILON, fused=USE_FUSED_BN,
+                        name='{}_bn2'.format(name))
            )
         conv_bn_blocks.append(
-            ReLuLayer('{}_relu2'.format(name), _scope='{}_relu2'.format(name), _reuse=None)
+            ReLuLayer('{}_relu2'.format(name))
            )
         return conv_bn_blocks
 
@@ -322,15 +322,15 @@ def multibox_head(feature_layers, num_classes, num_anchors_depth_per_layer, data
     cls_preds = []
     loc_preds = []
     for ind, feat in enumerate(feature_layers):
-        loc_preds.append(tf.layers.conv2d(feat, num_anchors_depth_per_layer[ind] * 4, (3, 3), use_bias=True,
+        loc_preds.append(tf.keras.layers.Conv2D(num_anchors_depth_per_layer[ind] * 4, (3, 3), use_bias=True,
                         name='loc_{}'.format(ind), strides=(1, 1),
                         padding='same', data_format=data_format, activation=None,
                         kernel_initializer=tf.glorot_uniform_initializer(),
-                        bias_initializer=tf.zeros_initializer()))
-        cls_preds.append(tf.layers.conv2d(feat, num_anchors_depth_per_layer[ind] * num_classes, (3, 3), use_bias=True,
+                        bias_initializer=tf.zeros_initializer())(feat))
+        cls_preds.append(tf.keras.layers.Conv2D(num_anchors_depth_per_layer[ind] * num_classes, (3, 3), use_bias=True,
                         name='cls_{}'.format(ind), strides=(1, 1),
                         padding='same', data_format=data_format, activation=None,
                         kernel_initializer=tf.glorot_uniform_initializer(),
-                        bias_initializer=tf.zeros_initializer()))
+                        bias_initializer=tf.zeros_initializer())(feat))
 
     return loc_preds, cls_preds
 
diff --git a/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py b/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py
index 739305df2d18d7eca660a01b4e7cddcab0679fc2..fb6da46a0d314e2f12f95dc3c20f1b92f8da84da 100644
--- a/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py
+++ b/cv/detection/ssd/tensorflow/preprocessing/ssd_preprocessing.py
@@ -34,7 +34,7 @@ from __future__ import print_function
 import tensorflow.compat.v1 as tf
 
 from tensorflow.python.ops import control_flow_ops
-import tf_slim as slim
+# import tf_slim as slim
 
 _R_MEAN = 123.68
 
diff --git a/cv/detection/ssd/tensorflow/train_ssd.py b/cv/detection/ssd/tensorflow/train_ssd.py
index 736c084e1585d0ec2168067f349a0b5891b91de9..0a7e3fce0a395914c375af84d5ab9fcd0407574c 100644
--- a/cv/detection/ssd/tensorflow/train_ssd.py
+++ b/cv/detection/ssd/tensorflow/train_ssd.py
@@ -1,17 +1,3 @@
-# Copyright 2018 Changan Wang
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -130,33 +116,6 @@ tf.app.flags.DEFINE_string(
     'The backbone for feature extraction: vgg16/resnet18/resnet34/resnet50/resnet101.')
 FLAGS = tf.app.flags.FLAGS
 
-#CUDA_VISIBLE_DEVICES
-def validate_batch_size_for_multi_gpu(batch_size):
-    """For multi-gpu, batch-size must be a multiple of the number of
-    available GPUs.
-
-    Note that this should eventually be handled by replicate_model_fn
-    directly. Multi-GPU support is currently experimental, however,
-    so doing the work here until that feature is in place.
-    """
-    if FLAGS.multi_gpu:
-        from tensorflow.python.client import device_lib
-
-        local_device_protos = device_lib.list_local_devices()
-        num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
-        if not num_gpus:
-            raise ValueError('Multi-GPU mode was specified, but no GPUs '
-                            'were found. To use CPU, run --multi_gpu=False.')
-
-        remainder = batch_size % num_gpus
-        if remainder:
-            err = ('When running with multiple GPUs, batch size '
-                    'must be a multiple of the number of available GPUs. '
-                    'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
-                    ).format(num_gpus, batch_size, batch_size - remainder)
-            raise ValueError(err)
-        return num_gpus
-    return 0
 
 def get_init_fn():
     return scaffolds.get_init_fn_for_scaffold(FLAGS.model_dir, FLAGS.checkpoint_path,
@@ -164,59 +123,50 @@ def get_init_fn():
                                             FLAGS.checkpoint_exclude_scopes, FLAGS.ignore_missing_vars,
                                             name_remap={'/kernel': '/weights', '/bias': '/biases'})
 
-# couldn't find better way to pass params from input_fn to model_fn
-# some tensors used by model_fn must be created in input_fn to ensure they are in the same graph
-# but when we put these tensors to labels's dict, the replicate_model_fn will split them into each GPU
-# the problem is that they shouldn't be splited
 global_anchor_info = dict()
 
 def input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=FLAGS.batch_size):
-    def input_fn():
-        out_shape = [FLAGS.train_image_size] * 2
-        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
-                                                    layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
-                                                    anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
-                                                    extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
-                                                    anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
-                                                    layer_steps = [8, 16, 32, 64, 100, 300])
-        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()
-
-        num_anchors_per_layer = []
-        for ind in range(len(all_anchors)):
-            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])
-
-        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6,
-                                                            positive_threshold = FLAGS.match_threshold,
-                                                            ignore_threshold = FLAGS.neg_threshold,
-                                                            prior_scaling=[0.1, 0.1, 0.2, 0.2])
-
-        image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
-        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)
-
-        image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
-                                                                                batch_size,
-                                                                                ('train' if is_training else 'val'),
-                                                                                os.path.join(FLAGS.data_dir, dataset_pattern),
-                                                                                FLAGS.num_readers,
-                                                                                FLAGS.num_preprocessing_threads,
-                                                                                image_preprocessing_fn,
-                                                                                anchor_encoder_fn,
-                                                                                num_epochs=FLAGS.train_epochs,
-                                                                                is_training=is_training)
-        global global_anchor_info
-        global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
-                            'num_anchors_per_layer': num_anchors_per_layer,
-                            'all_num_anchors_depth': all_num_anchors_depth }
-
-        return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
-    return input_fn
+    out_shape = [FLAGS.train_image_size] * 2
+    anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
+                                                layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
+                                                anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
+                                                extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
+                                                anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
+                                                layer_steps = [8, 16, 32, 64, 100, 300])
+    all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()
+
+    num_anchors_per_layer = []
+    for ind in range(len(all_anchors)):
+        num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])
+
+    prior_scaling = [0.1, 0.1, 0.2, 0.2]
+    anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6,
+                                                        positive_threshold = FLAGS.match_threshold,
+                                                        ignore_threshold = FLAGS.neg_threshold,
+                                                        prior_scaling=prior_scaling)
+
+    image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
+    anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)
+
+    image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.tf1_get_batch(FLAGS.num_classes,
+                                                                            batch_size,
+                                                                            ('train' if is_training else 'val'),
+                                                                            os.path.join(FLAGS.data_dir, dataset_pattern),
+                                                                            FLAGS.num_readers,
+                                                                            FLAGS.num_preprocessing_threads,
+                                                                            image_preprocessing_fn,
+                                                                            anchor_encoder_fn,
+                                                                            num_epochs=FLAGS.train_epochs,
+                                                                            is_training=is_training)
+    global global_anchor_info
+    global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.ext_decode_all_anchors(pred, all_anchors, all_num_anchors_depth, all_num_anchors_spatial),
+                        'num_anchors_per_layer': num_anchors_per_layer,
+                        'all_num_anchors_depth': all_num_anchors_depth }
+
+    return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
+
 
 def modified_smooth_l1(bbox_pred, bbox_targets, bbox_inside_weights=1., bbox_outside_weights=1., sigma=1.):
-    """
-        ResultLoss = outside_weights * SmoothL1(inside_weights * (bbox_pred - bbox_targets))
-        SmoothL1(x) = 0.5 * (sigma * x)^2, if |x| < 1 / sigma^2
-                      |x| - 0.5 / sigma^2, otherwise
-    """
     with tf.name_scope('smooth_l1', [bbox_pred, bbox_targets]):
         sigma2 = sigma * sigma
@@ -231,28 +181,25 @@ def modified_smooth_l1(bbox_pred, bbox_targets, bbox_inside_weights=1., bbox_out
         outside_mul = tf.multiply(bbox_outside_weights, smooth_l1_result)
 
         return outside_mul
-
-
-def ssd_model_fn(features, labels, mode, params):
-    """model_fn for SSD to be used with our Estimator."""
-    shape = labels['shape']
+
+def build_model_graph(features, labels, params, is_training):
+    """
+    Build the complete computation graph; returns loss, train_op, predictions, metrics, etc.
+    """
     loc_targets = labels['loc_targets']
     cls_targets = labels['cls_targets']
-    match_scores = labels['match_scores']
 
     global global_anchor_info
     decode_fn = global_anchor_info['decode_fn']
     num_anchors_per_layer = global_anchor_info['num_anchors_per_layer']
     all_num_anchors_depth = global_anchor_info['all_num_anchors_depth']
-    #print(all_num_anchors_depth)
 
     with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
         ssd_backbone = ssd_net.SSDBackbone(
             FLAGS.backbone,
-            training=(mode == tf.estimator.ModeKeys.TRAIN),
+            training=is_training,
             data_format=params['data_format'])
         feature_layers = ssd_backbone.forward(features)
-        #print(feature_layers)
         location_pred, cls_pred = ssd_net.multibox_head(feature_layers, params['num_classes'], all_num_anchors_depth, data_format=params['data_format'])
 
     if params['data_format'] == 'channels_first':
@@ -271,125 +218,130 @@ def ssd_model_fn(features, labels, mode, params):
     with tf.device('/cpu:0'):
         with tf.control_dependencies([cls_pred, location_pred]):
             with tf.name_scope('post_forward'):
-                #bboxes_pred = decode_fn(location_pred)
-                bboxes_pred = tf.map_fn(lambda _preds : decode_fn(_preds),
-                                        tf.reshape(location_pred, [tf.shape(features)[0], -1, 4]),
-                                        dtype=[tf.float32] * len(num_anchors_per_layer), back_prop=False)
-                #cls_targets = tf.Print(cls_targets, [tf.shape(bboxes_pred[0]),tf.shape(bboxes_pred[1]),tf.shape(bboxes_pred[2]),tf.shape(bboxes_pred[3])])
+                bboxes_pred = tf.map_fn(
+                    lambda _preds: decode_fn(_preds),
+                    tf.reshape(location_pred, [tf.shape(features)[0], -1, 4]),
+                    dtype=[tf.float32] * len(num_anchors_per_layer),
+                    back_prop=False
+                )
+
                 bboxes_pred = [tf.reshape(preds, [-1, 4]) for preds in bboxes_pred]
                 bboxes_pred = tf.concat(bboxes_pred, axis=0)
 
                 flaten_cls_targets = tf.reshape(cls_targets, [-1])
-                flaten_match_scores = tf.reshape(match_scores, [-1])
                 flaten_loc_targets = tf.reshape(loc_targets, [-1, 4])
 
-                # each positive examples has one label
                 positive_mask = flaten_cls_targets > 0
-                n_positives = tf.count_nonzero(positive_mask)
-                batch_n_positives = tf.count_nonzero(cls_targets, -1)
-
-                batch_negtive_mask = tf.equal(cls_targets, 0)#tf.logical_and(tf.equal(cls_targets, 0), match_scores > 0.)
+                # batch_n_positives is still needed below to size the hard-negative selection
+                batch_n_positives = tf.count_nonzero(cls_targets, -1)
+                batch_negtive_mask = tf.equal(cls_targets, 0)
                 batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)
 
                 batch_n_neg_select = tf.cast(params['negative_ratio'] * tf.cast(batch_n_positives, tf.float32), tf.int32)
                 batch_n_neg_select = tf.minimum(batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32))
 
-                # hard negative mining for classification
                 predictions_for_bg = tf.nn.softmax(tf.reshape(cls_pred, [tf.shape(features)[0], -1, params['num_classes']]))[:, :, 0]
-                prob_for_negtives = tf.where(batch_negtive_mask,
-                                       0. - predictions_for_bg,
-                                       # ignore all the positives
-                                       0. - tf.ones_like(predictions_for_bg))
+                prob_for_negtives = tf.where(
+                    batch_negtive_mask,
+                    0. - predictions_for_bg,
+                    0. - tf.ones_like(predictions_for_bg)
+                )
 
                 topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives, k=tf.shape(prob_for_negtives)[1])
                 score_at_k = tf.gather_nd(topk_prob_for_bg, tf.stack([tf.range(tf.shape(features)[0]), batch_n_neg_select - 1], axis=-1))
-
                 selected_neg_mask = prob_for_negtives >= tf.expand_dims(score_at_k, axis=-1)
 
-                # include both selected negtive and all positive examples
-                final_mask = tf.stop_gradient(tf.logical_or(tf.reshape(tf.logical_and(batch_negtive_mask, selected_neg_mask), [-1]), positive_mask))
-                total_examples = tf.count_nonzero(final_mask)
+                # include both the selected negatives and all positive examples
+                final_mask = tf.stop_gradient(
+                    tf.logical_or(
+                        tf.reshape(tf.logical_and(batch_negtive_mask, selected_neg_mask), [-1]),
+                        positive_mask
+                    )
+                )
 
-                cls_pred = tf.boolean_mask(cls_pred, final_mask)
-                location_pred = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask))
-                flaten_cls_targets = tf.boolean_mask(tf.clip_by_value(flaten_cls_targets, 0, params['num_classes']), final_mask)
-                flaten_loc_targets = tf.stop_gradient(tf.boolean_mask(flaten_loc_targets, positive_mask))
+                cls_pred_masked = tf.boolean_mask(cls_pred, final_mask)
+                location_pred_masked = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask))
+                flaten_cls_targets_masked = tf.boolean_mask(
+                    tf.clip_by_value(flaten_cls_targets, 0, params['num_classes']), final_mask)
+                flaten_loc_targets_masked = tf.stop_gradient(
+                    tf.boolean_mask(flaten_loc_targets, positive_mask))
 
     predictions = {
-        'classes': tf.argmax(cls_pred, axis=-1),
-        'probabilities': tf.reduce_max(tf.nn.softmax(cls_pred, name='softmax_tensor'), axis=-1),
-        'loc_predict': bboxes_pred }
-
-    cls_accuracy = tf.metrics.accuracy(flaten_cls_targets, predictions['classes'])
-    metrics = {'cls_accuracy': cls_accuracy}
-
-    # Create a tensor named train_accuracy for logging purposes.
-    tf.identity(cls_accuracy[1], name='cls_accuracy')
-    tf.summary.scalar('cls_accuracy', cls_accuracy[1])
+        'classes': tf.argmax(cls_pred_masked, axis=-1),
+        'probabilities': tf.reduce_max(tf.nn.softmax(cls_pred_masked, name='softmax_tensor'), axis=-1),
+        # 'loc_predict': bboxes_pred
     }
 
-    if mode == tf.estimator.ModeKeys.PREDICT:
-        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+    cls_accuracy = tf.metrics.accuracy(flaten_cls_targets_masked, predictions['classes'])
 
-    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=flaten_cls_targets, logits=cls_pred) * (params['negative_ratio'] + 1.)
-    # Create a tensor named cross_entropy for logging purposes.
+    # Loss computation
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+        labels=flaten_cls_targets_masked, logits=cls_pred_masked) * (params['negative_ratio'] + 1.)
     tf.identity(cross_entropy, name='cross_entropy_loss')
-    tf.summary.scalar('cross_entropy_loss', cross_entropy)
 
-    #loc_loss = tf.cond(n_positives > 0, lambda: modified_smooth_l1(location_pred, tf.stop_gradient(flaten_loc_targets), sigma=1.), lambda: tf.zeros_like(location_pred))
-    loc_loss = modified_smooth_l1(location_pred, flaten_loc_targets, sigma=1.)
-    #loc_loss = modified_smooth_l1(location_pred, tf.stop_gradient(gtargets))
+    loc_loss = modified_smooth_l1(location_pred_masked, flaten_loc_targets_masked, sigma=1.)
     loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1), name='location_loss')
-    tf.summary.scalar('location_loss', loc_loss)
     tf.losses.add_loss(loc_loss)
 
+    # L2 regularization
     l2_loss_vars = []
-    for trainable_var in tf.trainable_variables():
-        if '_bn' not in trainable_var.name:
-            if 'conv4_3_scale' not in trainable_var.name:
-                l2_loss_vars.append(tf.nn.l2_loss(trainable_var))
+    for var in tf.trainable_variables():
+        if '_bn' not in var.name:
+            if 'conv4_3_scale' not in var.name:
+                l2_loss_vars.append(tf.nn.l2_loss(var))
             else:
-                l2_loss_vars.append(tf.nn.l2_loss(trainable_var) * 0.1)
-    # Add weight decay to the loss. We exclude the batch norm variables because
-    # doing so leads to a small improvement in accuracy.
-    total_loss = tf.add(cross_entropy + loc_loss, tf.multiply(params['weight_decay'], tf.add_n(l2_loss_vars), name='l2_loss'), name='total_loss')
-
-    if mode == tf.estimator.ModeKeys.TRAIN:
-        global_step = tf.train.get_or_create_global_step()
-
-        lr_values = [params['learning_rate'] * decay for decay in params['lr_decay_factors']]
-        learning_rate = tf.train.piecewise_constant(tf.cast(global_step, tf.int32),
-                                                    [int(_) for _ in params['decay_boundaries']],
-                                                    lr_values)
-        truncated_learning_rate = tf.maximum(learning_rate, tf.constant(params['end_learning_rate'], dtype=learning_rate.dtype), name='learning_rate')
-        # Create a tensor named learning_rate for logging purposes.
-        tf.summary.scalar('learning_rate', truncated_learning_rate)
-
-        optimizer = tf.train.MomentumOptimizer(learning_rate=truncated_learning_rate,
-                                                momentum=params['momentum'])
-        if params['use_amp']:
-            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
-        # optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
-
-        # Batch norm requires update_ops to be added as a train_op dependency.
-        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-        with tf.control_dependencies(update_ops):
-            train_op = optimizer.minimize(total_loss, global_step)
-    else:
-        train_op = None
-
-    return tf.estimator.EstimatorSpec(
-                            mode=mode,
-                            predictions=predictions,
-                            loss=total_loss,
-                            train_op=train_op,
-                            eval_metric_ops=metrics,
-                            scaffold=tf.train.Scaffold(init_fn=get_init_fn()))
+                l2_loss_vars.append(tf.nn.l2_loss(var) * 0.1)
+    l2_loss = tf.multiply(params['weight_decay'], tf.add_n(l2_loss_vars), name='l2_loss')
+    total_loss = tf.add(cross_entropy + loc_loss, l2_loss, name='total_loss')
+
+    # Optimizer and learning-rate schedule
+    global_step = tf.train.get_or_create_global_step()
+    lr_values = [params['learning_rate'] * decay for decay in params['lr_decay_factors']]
+    learning_rate = tf.train.piecewise_constant(
+        tf.cast(global_step, tf.int32),
+        [int(_) for _ in params['decay_boundaries']],
+        lr_values
+    )
+    truncated_learning_rate = tf.maximum(learning_rate, params['end_learning_rate'], name='learning_rate')
+
+    optimizer = tf.train.MomentumOptimizer(learning_rate=truncated_learning_rate, momentum=params['momentum'])
+    if params['use_amp']:
+        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
+
+    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+    with tf.control_dependencies(update_ops):
+        train_op = optimizer.minimize(total_loss, global_step=global_step)
+
+    return total_loss, train_op, predictions, cls_accuracy, truncated_learning_rate
+
+
+def validate_batch_size_for_multi_gpu(batch_size):
+    """For multi-gpu, batch-size must be a multiple of the number of
+    available GPUs.
+
+    Note that this should eventually be handled by replicate_model_fn
+    directly. Multi-GPU support is currently experimental, however,
+    so doing the work here until that feature is in place.
+    """
+    if FLAGS.multi_gpu:
+        from tensorflow.python.client import device_lib
+
+        local_device_protos = device_lib.list_local_devices()
+        num_gpus = sum([1 for d in local_device_protos if d.device_type == 'GPU'])
+        if not num_gpus:
+            raise ValueError('Multi-GPU mode was specified, but no GPUs '
+                             'were found. To use CPU, run --multi_gpu=False.')
+
+        remainder = batch_size % num_gpus
+        if remainder:
+            err = ('When running with multiple GPUs, batch size '
+                   'must be a multiple of the number of available GPUs. '
+                   'Found {} GPUs with a batch size of {}; try --batch_size={} instead.'
+                   ).format(num_gpus, batch_size, batch_size - remainder)
+            raise ValueError(err)
+        return num_gpus
+    return 0
 
 def parse_comma_list(args):
     return [float(s.strip()) for s in args.split(',')]
 
 
 def main(_):
-    # Using the Winograd non-fused algorithms provides a small performance boost.
     os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
 
     try:
@@ -398,58 +350,74 @@
     except:
         pass
 
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
-    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, intra_op_parallelism_threads=FLAGS.num_cpu_threads, inter_op_parallelism_threads=FLAGS.num_cpu_threads, gpu_options=gpu_options)
-
+    # Parameters
     num_gpus = validate_batch_size_for_multi_gpu(FLAGS.batch_size)
-
-    # Set up a RunConfig to only save checkpoints once per training cycle.
-    run_config = tf.estimator.RunConfig().replace(
-                                        save_checkpoints_secs=FLAGS.save_checkpoints_secs).replace(
-                                            save_checkpoints_steps=None).replace(
-                                                save_summary_steps=FLAGS.save_summary_steps).replace(
-                                                    keep_checkpoint_max=5).replace(
-                                                        tf_random_seed=FLAGS.tf_random_seed).replace(
-                                                            log_step_count_steps=FLAGS.log_every_n_steps).replace(
-                                                                session_config=config)
-
-    # replicate_ssd_model_fn = tf.contrib.estimator.replicate_model_fn(ssd_model_fn, loss_reduction=tf.losses.Reduction.MEAN)
-    replicate_ssd_model_fn =ssd_model_fn
-    ssd_detector = tf.estimator.Estimator(
-        model_fn=replicate_ssd_model_fn, model_dir=FLAGS.model_dir, config=run_config,
-        params={
-            'num_gpus': num_gpus,
-            'data_format': FLAGS.data_format,
-            'batch_size': FLAGS.batch_size,
-            'model_scope': FLAGS.model_scope,
-            'num_classes': FLAGS.num_classes,
-            'negative_ratio': FLAGS.negative_ratio,
-            'match_threshold': FLAGS.match_threshold,
-            'neg_threshold': FLAGS.neg_threshold,
-            'weight_decay': FLAGS.weight_decay,
-            'momentum': FLAGS.momentum,
-            'learning_rate': FLAGS.learning_rate,
-            'end_learning_rate': FLAGS.end_learning_rate,
-            'decay_boundaries': parse_comma_list(FLAGS.decay_boundaries),
-            'lr_decay_factors': parse_comma_list(FLAGS.lr_decay_factors),
-            'use_amp':FLAGS.use_amp,
-        })
-    tensors_to_log = {
-        'lr': 'learning_rate',
-        'ce': 'cross_entropy_loss',
-        'loc': 'location_loss',
-        'loss': 'total_loss',
-        'l2': 'l2_loss',
-        'acc': 'post_forward/cls_accuracy',
+    params = {
+        'num_gpus': num_gpus,
+        'data_format': FLAGS.data_format,
+        'batch_size': FLAGS.batch_size,
+        'model_scope': FLAGS.model_scope,
+        'num_classes': FLAGS.num_classes,
+        'negative_ratio': FLAGS.negative_ratio,
+        'match_threshold': FLAGS.match_threshold,
+        'neg_threshold': FLAGS.neg_threshold,
+        'weight_decay': FLAGS.weight_decay,
+        'momentum': FLAGS.momentum,
+        'learning_rate': FLAGS.learning_rate,
+        'end_learning_rate': FLAGS.end_learning_rate,
+        'decay_boundaries': parse_comma_list(FLAGS.decay_boundaries),
+        'lr_decay_factors': parse_comma_list(FLAGS.lr_decay_factors),
+        'use_amp': FLAGS.use_amp,
     }
-    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=FLAGS.log_every_n_steps,
-                                            formatter=lambda dicts: (', '.join(['%s=%.6f' % (k, v) for k, v in dicts.items()])))
-
-    #hook = tf.train.ProfilerHook(save_steps=50, output_dir='.', show_memory=True)
-    print('Starting a training cycle.')
-    ssd_detector.train(input_fn=input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=FLAGS.batch_size),
-                    hooks=[logging_hook], max_steps=FLAGS.max_number_of_steps)
+
+    # Build the graph: input data
+    features, labels = input_pipeline(
+        dataset_pattern='train-*',
+        is_training=True,
+        batch_size=FLAGS.batch_size
+    )
+
+    # Build the graph: model and post-processing
+    total_loss, train_op, predictions, cls_metrics, lr = build_model_graph(features, labels, params, is_training=True)
+
+    # Session configuration
+    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
+    config = tf.ConfigProto(
+        allow_soft_placement=True,
+        log_device_placement=False,
+        intra_op_parallelism_threads=FLAGS.num_cpu_threads,
+        inter_op_parallelism_threads=FLAGS.num_cpu_threads,
+        gpu_options=gpu_options
+    )
+
+    scaffold = tf.train.Scaffold(init_fn=get_init_fn())
+
+    # Training loop
+    with tf.train.MonitoredTrainingSession(
+        master='',
+        is_chief=True,
+        checkpoint_dir=FLAGS.model_dir,
+        save_checkpoint_secs=None,
+        scaffold=scaffold,
+        config=config
+    ) as sess:
+
+        step_ = 0
+        print('Starting a training cycle.')
+
+        while step_ < FLAGS.max_number_of_steps:
+            try:
+                _, total_loss_, lr_, step_, acc_ = sess.run([train_op, total_loss, lr, tf.train.get_global_step(), cls_metrics[1]])
+
+                if step_ % FLAGS.log_every_n_steps == 0:
+                    tf.logging.info('global_step %d: loss = %.4f, lr = %.6f, acc = %.4f', step_, total_loss_, lr_, acc_)
+
+            except tf.errors.OutOfRangeError:
+                tf.logging.info('Epoch finished after %d steps.', step_)
+                break
+
+        print('Training completed.')
 
 if __name__ == '__main__':
-  tf.logging.set_verbosity(tf.logging.INFO)
-  tf.app.run()
+    tf.logging.set_verbosity(tf.logging.INFO)
+    tf.app.run()
\ No newline at end of file