import numpy as np
import os
from xml.etree import ElementTree


class XML_preprocessor(object):
    """Load a directory of PASCAL VOC XML annotations into numpy arrays.

    After construction, ``self.data`` maps the text of each annotation's
    ``<filename>`` element to a 2-D array with one row per bounding box.
    Each row is ``[xmin, ymin, xmax, ymax]`` (divided by the image width /
    height read from ``<size>``) followed by a one-hot class vector of
    length ``self.num_classes``.

    Args:
        data_path: Directory that contains the XML annotation files. A
            trailing path separator is accepted but not required.
    """

    # The position of a name in this tuple is the index that is set to 1
    # in the one-hot vector returned by ``_to_one_hot``.
    _CLASS_NAMES = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )
    _CLASS_INDEX = dict(zip(_CLASS_NAMES, range(len(_CLASS_NAMES))))

    def __init__(self, data_path):
        self.path_prefix = data_path
        self.num_classes = 20
        self.data = dict()
        self._preprocess_XML()

    def _preprocess_XML(self):
        """Parse every file in ``self.path_prefix`` and fill ``self.data``."""
        for filename in os.listdir(self.path_prefix):
            # os.path.join works whether or not the prefix ends with a
            # separator; plain string concatenation did not.
            tree = ElementTree.parse(os.path.join(self.path_prefix, filename))
            root = tree.getroot()

            size_tree = root.find('size')
            width = float(size_tree.find('width').text)
            height = float(size_tree.find('height').text)

            bounding_boxes = []
            one_hot_classes = []
            for object_tree in root.findall('object'):
                one_hot_class = self._to_one_hot(object_tree.find('name').text)
                # Only direct children: iter('bndbox') would also return
                # boxes nested in <part> elements, which have no class row
                # of their own and would desynchronise the two lists.
                for bounding_box in object_tree.findall('bndbox'):
                    xmin = float(bounding_box.find('xmin').text) / width
                    ymin = float(bounding_box.find('ymin').text) / height
                    xmax = float(bounding_box.find('xmax').text) / width
                    ymax = float(bounding_box.find('ymax').text) / height
                    bounding_boxes.append([xmin, ymin, xmax, ymax])
                    one_hot_classes.append(one_hot_class)

            image_name = root.find('filename').text
            # The explicit reshape keeps the result 2-D even when the image
            # has no objects, giving shape (0, 4 + num_classes).
            boxes = np.asarray(bounding_boxes, dtype=float).reshape(-1, 4)
            classes = np.asarray(one_hot_classes, dtype=float)
            classes = classes.reshape(-1, self.num_classes)
            self.data[image_name] = np.hstack((boxes, classes))

    def _to_one_hot(self, name):
        """Return the one-hot list for a class name.

        Args:
            name: Class label as it appears in the ``<name>`` element.

        Returns:
            A list of ``self.num_classes`` integers with a single 1 at the
            class index. For an unrecognised name a message is printed and
            the all-zero list is returned.
        """
        one_hot_vector = [0] * self.num_classes
        index = self._CLASS_INDEX.get(name)
        if index is None:
            print('unknown label: %s' % name)
        else:
            one_hot_vector[index] = 1
        return one_hot_vector


## example on how to use it
# import pickle
# data = XML_preprocessor('VOC2007/Annotations/').data
# pickle.dump(data,open('VOC2007.p','wb'))
+ +This code was tested with `Keras` v1.2.2, `Tensorflow` v1.0.0, `OpenCV` v3.1.0-dev diff --git a/SSD_training.ipynb b/SSD_training.ipynb index 2c7d743..1a22495 100644 --- a/SSD_training.ipynb +++ b/SSD_training.ipynb @@ -158,7 +158,7 @@ "\n", " def lighting(self, img):\n", " cov = np.cov(img.reshape(-1, 3) / 255.0, rowvar=False)\n", - " eigval, eigvec = np.linalg.eig(cov)\n", + " eigval, eigvec = np.linalg.eigh(cov)\n", " noise = np.random.randn(3) * self.lighting_std\n", " noise = eigvec.dot(eigval * noise) * 255\n", " img += noise\n", @@ -516,21 +516,21 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python [conda root]", + "display_name": "Python 3", "language": "python", - "name": "conda-root-py" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" + "pygments_lexer": "ipython3", + "version": "3.5.2" } }, "nbformat": 4, diff --git a/ssd_training.py b/ssd_training.py index 1cb3f8c..de97dcd 100644 --- a/ssd_training.py +++ b/ssd_training.py @@ -47,7 +47,7 @@ def _l1_smooth_loss(self, y_true, y_pred): """ abs_loss = tf.abs(y_true - y_pred) sq_loss = 0.5 * (y_true - y_pred)**2 - l1_loss = tf.select(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5) + l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5) return tf.reduce_sum(l1_loss, -1) def _softmax_loss(self, y_true, y_pred): @@ -64,7 +64,7 @@ def _softmax_loss(self, y_true, y_pred): """ y_pred = tf.maximum(tf.minimum(y_pred, 1 - 1e-15), 1e-15) softmax_loss = -tf.reduce_sum(y_true * tf.log(y_pred), - reduction_indices=-1) + axis=-1) return softmax_loss def compute_loss(self, y_true, y_pred): @@ -93,18 +93,18 @@ def compute_loss(self, y_true, y_pred): y_pred[:, :, :4]) # get positives loss - num_pos = tf.reduce_sum(y_true[:, :, -8], reduction_indices=-1) + 
num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1) pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8], - reduction_indices=1) + axis=1) pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8], - reduction_indices=1) + axis=1) # get negatives loss, we penalize only confidence here num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos) pos_num_neg_mask = tf.greater(num_neg, 0) has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask)) - num_neg = tf.concat(0, [num_neg, + num_neg = tf.concat(axis=0, values=[num_neg, [(1 - has_min) * self.negatives_for_hard]]) num_neg_batch = tf.reduce_min(tf.boolean_mask(num_neg, tf.greater(num_neg, 0))) @@ -112,7 +112,7 @@ def compute_loss(self, y_true, y_pred): confs_start = 4 + self.background_label_id + 1 confs_end = confs_start + self.num_classes - 1 max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end], - reduction_indices=2) + axis=2) _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]), k=num_neg_batch) batch_idx = tf.expand_dims(tf.range(0, batch_size), 1) @@ -126,12 +126,12 @@ def compute_loss(self, y_true, y_pred): full_indices) neg_conf_loss = tf.reshape(neg_conf_loss, [batch_size, num_neg_batch]) - neg_conf_loss = tf.reduce_sum(neg_conf_loss, reduction_indices=1) + neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1) # loss is sum of positives and negatives total_loss = pos_conf_loss + neg_conf_loss total_loss /= (num_pos + tf.to_float(num_neg_batch)) - num_pos = tf.select(tf.not_equal(num_pos, 0), num_pos, + num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos, tf.ones_like(num_pos)) total_loss += (self.alpha * pos_loc_loss) / num_pos return total_loss