YOLOv1: PASCAL VOC label data

code example

import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so np.random.shuffle(gt_labels) is reproducible.
np.random.seed(1234)

# PASCAL VOC category names; print_data() looks a name up by class index,
# so the order of this list matters.
classes = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]


class pascal_voc(object):
    """Loader that turns PASCAL VOC2007 annotations into YOLOv1 grid labels.

    Every annotated image becomes one record
    ``{'imname': path, 'label': array, 'flipped': bool}`` where ``label`` has
    shape ``(cell_size, cell_size, 5 + len(classes))``:

    * channel 0     -- 1 if an object's box centre falls in the cell, else 0
    * channels 1..4 -- ``(cx, cy, w, h)`` of that box, in pixels of the
      ``image_size`` x ``image_size`` resized image
    * channels 5..  -- one-hot class vector
    """

    def __init__(self, phase, rebuild=False):
        """Read settings from ``cfg`` and load (or build) the label list.

        Args:
            phase: ``'train'`` selects ``trainval.txt``; any other value
                selects ``test.txt``.
            rebuild: when true, ignore an existing label cache and regenerate
                it from the XML annotations.
        """
        # NOTE: ``devkil_path`` is a misspelling of "devkit"; the attribute
        # name is kept so code that reads it keeps working.
        self.devkil_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkil_path, 'VOC2007')
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = False  # cfg.FLIPPED (flip augmentation is disabled here)
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0  # index of the next record handed out by get()
        self.epoch = 1
        self.gt_labels = None
        self.prepare()

    def get(self):
        """Return the next batch as ``(images, labels)``.

        ``images`` has shape ``(batch_size, image_size, image_size, 3)`` and
        ``labels`` has shape
        ``(batch_size, cell_size, cell_size, 5 + len(classes))``.  When the
        cursor reaches the end of the record list, the list is reshuffled,
        the cursor is reset and ``epoch`` is incremented; a batch may
        therefore straddle two epochs.
        """
        images = np.zeros(
            (self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros(
            (self.batch_size, self.cell_size, self.cell_size,
             5 + len(self.classes)))
        for slot in range(self.batch_size):
            record = self.gt_labels[self.cursor]
            images[slot] = self.image_read(record['imname'], record['flipped'])
            labels[slot] = record['label']
            self.cursor += 1
            if self.cursor >= len(self.gt_labels):
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        """Load an image as a float32 RGB array scaled to ``[-1, 1]``.

        The image is resized to ``image_size`` x ``image_size`` and mirrored
        left-right when ``flipped`` is true.

        Raises:
            IOError: if OpenCV cannot read ``imname``.
        """
        image = cv2.imread(imname)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Unable to read image: ' + imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0
        if flipped:
            image = image[:, ::-1, :]
        return image

    def prepare(self):
        """Load the labels, optionally append mirrored copies, and shuffle.

        Stores the shuffled list on ``self.gt_labels`` and returns it.
        """
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            # keep y; flip x
            gt_labels_cp = copy.deepcopy(gt_labels)
            for record in gt_labels_cp:
                record['flipped'] = True
                # Reverse the grid's x axis: [0,1,...,6] ===> [6,5,...,0]
                mirrored = record['label'][:, ::-1, :]
                # Only cells that hold an object carry a box centre to mirror.
                has_object = mirrored[..., 0] == 1
                # cx = image_size - 1 - cx
                mirrored[has_object, 1] = \
                    self.image_size - 1 - mirrored[has_object, 1]
                record['label'] = mirrored
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)  # shuffle labels
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        """Return the list of label records for ``self.phase``.

        A pickle cache under ``cache_path`` is used when present, unless
        ``rebuild`` is set.  Otherwise every image id in the image-set file
        is processed, images whose annotation lists no objects are skipped,
        and the result is written to the cache.
        """
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')

        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from: ' + cache_file)
            # SECURITY: pickle can execute arbitrary code while loading; only
            # point cache_path at cache files this program wrote itself.
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('Processing gt_labels from: ' + self.data_path)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.cache_path, exist_ok=True)

        set_name = 'trainval.txt' if self.phase == 'train' else 'test.txt'
        txtname = os.path.join(self.data_path, 'ImageSets', 'Main', set_name)
        with open(txtname, 'r') as f:
            # Skip blank lines so a trailing newline does not yield an empty id.
            self.image_index = [line.strip() for line in f if line.strip()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def _to_resized_pixel(self, text, ratio):
        """Map a 1-based VOC coordinate string to a clamped 0-based pixel."""
        value = (float(text) - 1) * ratio
        return max(min(value, self.image_size - 1), 0)

    def load_pascal_annotation(self, index):
        """Build the grid label for one image from its PASCAL VOC XML file.

        Args:
            index: image id without extension, e.g. ``'002939'``.

        Returns:
            ``(label, num_objects)`` where ``label`` has shape
            ``(cell_size, cell_size, 5 + len(classes))`` and ``num_objects``
            is the number of ``<object>`` elements in the annotation.

        Raises:
            IOError: if the JPEG image cannot be read.
        """
        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        # The image is read only to obtain its height and width for scaling.
        im = cv2.imread(imname)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Unable to read image: ' + imname)
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]

        label = np.zeros(
            (self.cell_size, self.cell_size, 5 + len(self.classes)))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')

        for obj in objs:
            bbox = obj.find('bndbox')  # xmin,ymin,xmax,ymax  1-based ===> 0-based
            x1 = self._to_resized_pixel(bbox.find('xmin').text, w_ratio)
            y1 = self._to_resized_pixel(bbox.find('ymin').text, h_ratio)
            x2 = self._to_resized_pixel(bbox.find('xmax').text, w_ratio)
            y2 = self._to_resized_pixel(bbox.find('ymax').text, h_ratio)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            # cx, cy, w, h in resized-image pixels
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
            # Grid cell that contains the box centre.
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            if label[y_ind, x_ind, 0] == 1:
                # Several objects in one cell: only the first one is kept.
                continue
            label[y_ind, x_ind, 0] = 1  # has object
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1  # one-hot class

        return label, len(objs)


"""
3 , 4 =  [0. 0. 0. 0. 0.]
3 , 5 =  [  1.    325.248 229.6   111.104 228.48 ]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
3 , 6 =  [0. 0. 0. 0. 0.]
4 , 0 =  [0. 0. 0. 0. 0.]
4 , 1 =  [0. 0. 0. 0. 0.]
4 , 2 =  [  1.    132.16  288.4   172.928 316.96 ]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
4 , 3 =  [0. 0. 0. 0. 0.]


data['label'].shape # 7,7,25  (confidence+ (x,y,w,h) + 20-classes)

confidence: 1 if gt_box center falls in this grid, otherwise 0
box(x,y,w,h): gt_box center x,y,w,h; otherwise [0,0,0,0]
class: 20-one-hot-vector if gt_box; otherwise [0]*20

how flip works: flip x dim

(1) flip grid x-dim:  data['label'] = data['label'][:, ::-1, :]

y-grid = y-grid  [0,1,2,3,4,5,6]
x-grid flip      [0,1,2,3,4,5,6] ===>[6,5,4,3,2,1,0]

(2) flip data['label']
confidence = confidence
cx:  flip cx = 447-cx:   data['label'][i, j, 1] = 448 - 1 - data['label'][i, j, 1]
cy = cy
w = w
h = h
class = class
"""


def print_data(data):
    """Print every grid cell of one ground-truth record.

    For each cell (row ``y``, column ``x``) the first five label values
    (confidence, cx, cy, w, h) are printed.  Cells whose confidence is
    positive additionally print the one-hot class vector, its argmax and the
    matching name from the module-level ``classes`` list.

    Args:
        data: record dict whose ``'label'`` entry is an array of shape
            ``(rows, cols, 5 + num_classes)``.  The grid size is taken from
            the array itself rather than assumed to be 7 x 7.
    """
    label = data['label']
    rows, cols = label.shape[:2]
    # grid y,x
    for y in range(rows):
        for x in range(cols):
            cell = label[y, x]
            print(y, ",", x, "= ", cell[:5])
            if cell[0] > 0:  # confidence > 0
                class_one_hot = cell[5:]
                class_index = np.argmax(class_one_hot)
                print("    class_one_hot = ", class_one_hot)
                print("    class_index = ", class_index)
                print("    class_name = ", classes[class_index])


def flip_data(data, image_size=448):
    """Mirror one ground-truth record horizontally, updating ``data`` in place.

    The label grid's x axis is reversed, and in every cell that holds an
    object (confidence == 1) the box centre becomes
    ``cx = image_size - 1 - cx``.  cy, w, h and the class vector are
    unchanged.  ``data['flipped']`` is set to True.

    Args:
        data: record dict with a ``'label'`` array of shape
            ``(rows, cols, depth)``; the grid size is read from the array.
        image_size: width in pixels of the resized image that the box
            coordinates refer to.
    """
    data['flipped'] = True
    # A reversed slice is only a view of the same memory.  Copy it so that
    # rewriting cx below cannot alter the array the caller passed in, which
    # may still be referenced by the unflipped record.
    label = data['label'][:, ::-1, :].copy()
    has_object = label[..., 0] == 1
    label[has_object, 1] = image_size - 1 - label[has_object, 1]
    data['label'] = label


def show_image(filename):
    """Display the image stored at ``filename`` in a matplotlib window, axes hidden."""
    bgr_pixels = cv2.imread(filename)
    # OpenCV loads channels as BGR; matplotlib expects RGB.
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    plt.axis("off")
    plt.imshow(rgb_pixels)
    plt.show()


# Demo driver: load the training labels, inspect the first record, then show
# the same record after a horizontal flip.
print("========================PASCAL=================================")
pascal = pascal_voc('train')  # loads the cached labels, or rebuilds them from the XML annotations
print(pascal.class_to_ind)  # dict: class name -> class index (20 entries)
print(len(pascal.gt_labels))  # list: default 5011; flipped 10022
data = pascal.gt_labels[0]  # first record after the seeded shuffle
print(data.keys())  # dict_keys(['flipped', 'imname', 'label'])
print(data['imname'])  # path of the JPEG this record describes
print(data['label'].shape)  # 7,7,25  (confidence + (x,y,w,h) + 20 classes)

print(classes)
show_image(data['imname'])  # opens a matplotlib window

print("========================DATA=================================")
print_data(data)

print("=========================FLIPPED================================")
# Flip a deep copy so the record stored in pascal.gt_labels is left untouched.
flipped = copy.deepcopy(data)
flip_data(flipped)
print_data(flipped)

Loading gt_labels from: data\pascal_voc\cache\pascal_train_gt_labels.pkl
{'dog': 11, 'train': 18, 'bus': 5, 'motorbike': 13, 'aeroplane': 0, 'bicycle': 1, 'person': 14, 'horse': 12, 'bird': 2, 'tvmonitor': 19, 'sheep': 16, 'boat': 3, 'car': 6, 'diningtable': 10, 'pottedplant': 15, 'sofa': 17, 'bottle': 4, 'chair': 8, 'cat': 7, 'cow': 9}
5011
dict_keys(['flipped', 'label', 'imname'])
data\pascal_voc\VOCdevkit\VOC2007\JPEGImages\002939.jpg
(7, 7, 25)
['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']

========================DATA=================================
0 , 0 =  [0. 0. 0. 0. 0.]
0 , 1 =  [0. 0. 0. 0. 0.]
0 , 2 =  [0. 0. 0. 0. 0.]
0 , 3 =  [0. 0. 0. 0. 0.]
0 , 4 =  [0. 0. 0. 0. 0.]
0 , 5 =  [0. 0. 0. 0. 0.]
0 , 6 =  [0. 0. 0. 0. 0.]
1 , 0 =  [0. 0. 0. 0. 0.]
1 , 1 =  [0. 0. 0. 0. 0.]
1 , 2 =  [0. 0. 0. 0. 0.]
1 , 3 =  [0. 0. 0. 0. 0.]
1 , 4 =  [0. 0. 0. 0. 0.]
1 , 5 =  [0. 0. 0. 0. 0.]
1 , 6 =  [0. 0. 0. 0. 0.]
2 , 0 =  [0. 0. 0. 0. 0.]
2 , 1 =  [0. 0. 0. 0. 0.]
2 , 2 =  [0. 0. 0. 0. 0.]
2 , 3 =  [0. 0. 0. 0. 0.]
2 , 4 =  [0. 0. 0. 0. 0.]
2 , 5 =  [0. 0. 0. 0. 0.]
2 , 6 =  [0. 0. 0. 0. 0.]
3 , 0 =  [0. 0. 0. 0. 0.]
3 , 1 =  [  1.          70.336      202.496       74.368      149.33333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
    class_index =  19
    class_name =  tvmonitor
3 , 2 =  [0. 0. 0. 0. 0.]
3 , 3 =  [0. 0. 0. 0. 0.]
3 , 4 =  [  1.         267.456      229.97333333  29.568       77.65333333]
    class_one_hot =  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  4
    class_name =  bottle
3 , 5 =  [0. 0. 0. 0. 0.]
3 , 6 =  [0. 0. 0. 0. 0.]
4 , 0 =  [0. 0. 0. 0. 0.]
4 , 1 =  [0. 0. 0. 0. 0.]
4 , 2 =  [0. 0. 0. 0. 0.]
4 , 3 =  [  1.         220.864      283.136      158.592      327.33866667]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
    class_index =  14
    class_name =  person
4 , 4 =  [0. 0. 0. 0. 0.]
4 , 5 =  [0. 0. 0. 0. 0.]
4 , 6 =  [0. 0. 0. 0. 0.]
5 , 0 =  [0. 0. 0. 0. 0.]
5 , 1 =  [0. 0. 0. 0. 0.]
5 , 2 =  [0. 0. 0. 0. 0.]
5 , 3 =  [0. 0. 0. 0. 0.]
5 , 4 =  [  1.         283.584      337.49333333  92.288      185.17333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
5 , 5 =  [0. 0. 0. 0. 0.]
5 , 6 =  [0. 0. 0. 0. 0.]
6 , 0 =  [0. 0. 0. 0. 0.]
6 , 1 =  [0. 0. 0. 0. 0.]
6 , 2 =  [0. 0. 0. 0. 0.]
6 , 3 =  [0. 0. 0. 0. 0.]
6 , 4 =  [0. 0. 0. 0. 0.]
6 , 5 =  [0. 0. 0. 0. 0.]
6 , 6 =  [0. 0. 0. 0. 0.]
=========================FLIPPED================================
0 , 0 =  [0. 0. 0. 0. 0.]
0 , 1 =  [0. 0. 0. 0. 0.]
0 , 2 =  [0. 0. 0. 0. 0.]
0 , 3 =  [0. 0. 0. 0. 0.]
0 , 4 =  [0. 0. 0. 0. 0.]
0 , 5 =  [0. 0. 0. 0. 0.]
0 , 6 =  [0. 0. 0. 0. 0.]
1 , 0 =  [0. 0. 0. 0. 0.]
1 , 1 =  [0. 0. 0. 0. 0.]
1 , 2 =  [0. 0. 0. 0. 0.]
1 , 3 =  [0. 0. 0. 0. 0.]
1 , 4 =  [0. 0. 0. 0. 0.]
1 , 5 =  [0. 0. 0. 0. 0.]
1 , 6 =  [0. 0. 0. 0. 0.]
2 , 0 =  [0. 0. 0. 0. 0.]
2 , 1 =  [0. 0. 0. 0. 0.]
2 , 2 =  [0. 0. 0. 0. 0.]
2 , 3 =  [0. 0. 0. 0. 0.]
2 , 4 =  [0. 0. 0. 0. 0.]
2 , 5 =  [0. 0. 0. 0. 0.]
2 , 6 =  [0. 0. 0. 0. 0.]
3 , 0 =  [0. 0. 0. 0. 0.]
3 , 1 =  [0. 0. 0. 0. 0.]
3 , 2 =  [  1.         179.544      229.97333333  29.568       77.65333333]
    class_one_hot =  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  4
    class_name =  bottle
3 , 3 =  [0. 0. 0. 0. 0.]
3 , 4 =  [0. 0. 0. 0. 0.]
3 , 5 =  [  1.         376.664      202.496       74.368      149.33333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
    class_index =  19
    class_name =  tvmonitor
3 , 6 =  [0. 0. 0. 0. 0.]
4 , 0 =  [0. 0. 0. 0. 0.]
4 , 1 =  [0. 0. 0. 0. 0.]
4 , 2 =  [0. 0. 0. 0. 0.]
4 , 3 =  [  1.         226.136      283.136      158.592      327.33866667]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
    class_index =  14
    class_name =  person
4 , 4 =  [0. 0. 0. 0. 0.]
4 , 5 =  [0. 0. 0. 0. 0.]
4 , 6 =  [0. 0. 0. 0. 0.]
5 , 0 =  [0. 0. 0. 0. 0.]
5 , 1 =  [0. 0. 0. 0. 0.]
5 , 2 =  [  1.         163.416      337.49333333  92.288      185.17333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
5 , 3 =  [0. 0. 0. 0. 0.]
5 , 4 =  [0. 0. 0. 0. 0.]
5 , 5 =  [0. 0. 0. 0. 0.]
5 , 6 =  [0. 0. 0. 0. 0.]
6 , 0 =  [0. 0. 0. 0. 0.]
6 , 1 =  [0. 0. 0. 0. 0.]
6 , 2 =  [0. 0. 0. 0. 0.]
6 , 3 =  [0. 0. 0. 0. 0.]
6 , 4 =  [0. 0. 0. 0. 0.]
6 , 5 =  [0. 0. 0. 0. 0.]
6 , 6 =  [0. 0. 0. 0. 0.]

Reference

History

20181126: created.