YOLOv1 PASCAL VOC label data

Publish Date: 2018-11-26

Word Count: 2,220

Reading Time: 14 min

Read Count:

code example

import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so the np.random.shuffle(gt_labels) calls below
# produce the same ordering on every run.
np.random.seed(1234)

# Names of the 20 object categories; print_data() indexes this list with the
# arg-max of a cell's one-hot class vector to recover the class name.
classes = (
    'aeroplane bicycle bird boat bottle bus car cat chair cow '
    'diningtable dog horse motorbike person pottedplant sheep sofa '
    'train tvmonitor'
).split()


class pascal_voc(object):
    """Ground-truth loader that turns VOC2007 annotations into grid labels.

    Every image is described by a ``(cell_size, cell_size, 5 + num_classes)``
    array.  For the grid cell that contains the centre of a ground-truth box
    the vector is ``[1, cx, cy, w, h, one-hot class]`` with the box expressed
    in pixels of the resized ``image_size x image_size`` image; all other
    cells stay zero.

    The built labels are cached with pickle under ``cfg.CACHE_PATH`` and are
    served in batches by :meth:`get`.

    Note: the attribute name ``devkil_path`` (sic) is kept as-is so existing
    users of the attribute keep working.
    """

    def __init__(self, phase, rebuild=False):
        """Read the configuration and load (or build) the labels.

        Args:
            phase: ``'train'`` selects ``trainval.txt``; any other value
                selects ``test.txt``.  It is also part of the cache file name.
            rebuild: when true, ignore an existing cache file and rebuild the
                labels from the XML annotations.
        """
        self.devkil_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkil_path, 'VOC2007')
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = False  # cfg.FLIPPED
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0
        self.epoch = 1
        self.gt_labels = None
        self.prepare()

    def get(self):
        """Return the next batch as ``(images, labels)``.

        Returns:
            images: ``(batch_size, image_size, image_size, 3)`` array filled
                from :meth:`image_read`.
            labels: ``(batch_size, cell_size, cell_size, 5 + num_classes)``
                array of the matching ground-truth labels.

        When the cursor runs past the last sample the label list is
        reshuffled in place, the cursor restarts at 0 and ``epoch`` is
        incremented, so one batch may span two epochs.
        """
        # Derive the label depth from the class list instead of assuming 20
        # classes (the original hard-coded 25).
        depth = 5 + len(self.classes)
        images = np.zeros(
            (self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros(
            (self.batch_size, self.cell_size, self.cell_size, depth))
        for slot in range(self.batch_size):
            sample = self.gt_labels[self.cursor]
            images[slot, :, :, :] = self.image_read(
                sample['imname'], sample['flipped'])
            labels[slot, :, :, :] = sample['label']
            self.cursor += 1
            if self.cursor >= len(self.gt_labels):
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        """Load one image as a float32 RGB array scaled to ``[-1, 1]``.

        Args:
            imname: path of the image file.
            flipped: when true the image is mirrored horizontally.

        Returns:
            ``(image_size, image_size, 3)`` float32 array.

        Raises:
            IOError: if ``cv2.imread`` cannot read ``imname``.
        """
        image = cv2.imread(imname)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path instead of an opaque error inside cv2.resize.
            raise IOError('cv2.imread could not read image: ' + imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0  # [0, 255] -> [-1, 1]
        if flipped:
            image = image[:, ::-1, :]  # reverse the width axis
        return image

    def prepare(self):
        """Load the labels, optionally add mirrored copies, and shuffle.

        When ``self.flipped`` is true every sample is duplicated with
        ``'flipped': True``, its grid reversed along x and the centre x of
        each occupied cell replaced by ``image_size - 1 - cx``; y, w, h and
        the class vector are unchanged.

        Returns:
            The shuffled list of sample dicts, also stored in
            ``self.gt_labels``.
        """
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for sample in gt_labels_cp:
                sample['flipped'] = True
                # Reverse the grid's x axis: [0,1,...,6] ===> [6,5,...,0].
                mirrored = sample['label'][:, ::-1, :]
                # Mirror cx for every occupied cell in one masked assignment;
                # the right-hand side is evaluated before anything is written.
                occupied = mirrored[..., 0] == 1
                mirrored[occupied, 1] = \
                    self.image_size - 1 - mirrored[occupied, 1]
                sample['label'] = mirrored
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)  # shuffle labels
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        """Return the list of ``{'imname', 'label', 'flipped'}`` samples.

        The list is read from ``pascal_<phase>_gt_labels.pkl`` in
        ``self.cache_path`` when that file exists and ``self.rebuild`` is
        false.  Otherwise it is built from the image-set file and the XML
        annotations, written to that cache file and returned.  Images whose
        annotation contains no object are skipped.

        Side effect: when labels are rebuilt, ``self.image_index`` is set to
        the list of image ids read from the image-set file.
        """
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')

        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from: ' + cache_file)
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point cache_path at a location you trust.
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('Processing gt_labels from: ' + self.data_path)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.cache_path, exist_ok=True)

        if self.phase == 'train':
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'trainval.txt')
        else:
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'test.txt')
        with open(txtname, 'r') as f:
            # Skip blank lines (e.g. a trailing newline) so they are not
            # treated as an image with an empty id.
            self.image_index = [line.strip() for line in f if line.strip()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, index):
        """Build the grid label for one image from its PASCAL VOC XML file.

        Args:
            index: image id, e.g. ``'002939'``; used for both
                ``JPEGImages/<index>.jpg`` and ``Annotations/<index>.xml``.

        Returns:
            ``(label, num_objects)`` where ``label`` has shape
            ``(cell_size, cell_size, 5 + num_classes)`` and ``num_objects``
            is the number of ``<object>`` elements in the XML, including
            objects dropped because their cell was already taken.

        Raises:
            IOError: if the image cannot be read by ``cv2.imread``.
            KeyError: if an object's class name is not in ``class_to_ind``.
        """
        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        # The image is read only for its original height and width, which
        # give the scale factors to the resized image.
        im = cv2.imread(imname)
        if im is None:
            raise IOError('cv2.imread could not read image: ' + imname)
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]

        # Depth follows the class list rather than a hard-coded 25.
        label = np.zeros(
            (self.cell_size, self.cell_size, 5 + len(self.classes)))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')
        upper = self.image_size - 1  # largest valid pixel coordinate

        for obj in objs:
            bbox = obj.find('bndbox')  # xmin, ymin, xmax, ymax
            # Make pixel indexes 0-based, scale to the resized image and
            # clamp to [0, image_size - 1].
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, upper), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, upper), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, upper), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, upper), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]  # cx, cy, w, h
            # Grid cell that contains the box centre.
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            if label[y_ind, x_ind, 0] == 1:
                # Several objects fall in the same cell: keep the first one.
                continue
            label[y_ind, x_ind, 0] = 1  # has-object flag
            label[y_ind, x_ind, 1:5] = boxes  # (cx, cy, w, h) in pixels
            label[y_ind, x_ind, 5 + cls_ind] = 1  # one-hot class

        return label, len(objs)


"""
3 , 4 =  [0. 0. 0. 0. 0.]
3 , 5 =  [  1.    325.248 229.6   111.104 228.48 ]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
3 , 6 =  [0. 0. 0. 0. 0.]
4 , 0 =  [0. 0. 0. 0. 0.]
4 , 1 =  [0. 0. 0. 0. 0.]
4 , 2 =  [  1.    132.16  288.4   172.928 316.96 ]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
4 , 3 =  [0. 0. 0. 0. 0.]


data['label'].shape # 7,7,25  (confidence+ (x,y,w,h) + 20-classes)

confidence: 1 if gt_box center falls in this grid, otherwise 0
box(x,y,w,h): gt_box center x,y and w,h; otherwise [0,0,0,0]
class: 20-one-hot-vector if gt_box; otherwise [0]*20

how flip works: flip x dim

(1) flip grid x-dim:  data['label'] = data['label'][:, ::-1, :]

y-grid = y-grid  [0,1,2,3,4,5,6]
x-grid flip      [0,1,2,3,4,5,6] ===>[6,5,4,3,2,1,0]

(2) flip data['label']
confidence = confidence
cx:  flip cx = 447-cx:   data['label'][i, j, 1] = 448 - 1 - data['label'][i, j, 1]
cy = cy
w = w
h = h
class = class
"""


def print_data(data):
    """Print every grid cell of a sample's label, row by row.

    For each cell the first five values (confidence, cx, cy, w, h) are
    printed.  For cells with confidence > 0 the one-hot class vector, its
    arg-max index and the matching name from the module-level ``classes``
    list are printed as well.

    The grid size is taken from ``data['label'].shape`` instead of being
    fixed at 7x7, so labels built with another cell size print correctly.

    Args:
        data: sample dict whose ``'label'`` entry is a
            ``(rows, cols, 5 + num_classes)`` array.
    """
    label = data['label']
    rows, cols = label.shape[:2]
    # grid y,x
    for y in range(rows):
        for x in range(cols):
            cell = label[y, x]
            print(y, ",", x, "= ", cell[:5])
            if cell[0] > 0:  # confidence > 0: the cell holds an object
                class_one_hot = cell[5:]
                class_index = np.argmax(class_one_hot)
                print("    class_one_hot = ", class_one_hot)
                print("    class_index = ", class_index)
                print("    class_name = ", classes[class_index])


def flip_data(data, image_size=448):
    """Mirror a sample's label horizontally, updating ``data`` in place.

    The grid is reversed along its x axis and, for every cell whose
    confidence equals 1, the centre x becomes ``image_size - 1 - cx``.
    cy, w, h and the class vector are left unchanged.

    The flipped label is a fresh copy: the array originally stored in
    ``data['label']`` is not modified, so callers that share that array with
    another dict do not see it change.  The grid size is taken from the
    label's shape rather than assumed to be 7x7.

    Args:
        data: sample dict with a ``'label'`` array of shape
            ``(rows, cols, depth)``; its ``'flipped'`` and ``'label'``
            entries are overwritten.
        image_size: side length in pixels of the resized image the box
            coordinates refer to (default 448).

    Returns:
        None.
    """
    data['flipped'] = True
    # Copy so that the writes below never reach the caller's original array
    # through a reversed view.
    mirrored = data['label'][:, ::-1, :].copy()
    occupied = mirrored[..., 0] == 1
    mirrored[occupied, 1] = image_size - 1 - mirrored[occupied, 1]  # flipped cx
    data['label'] = mirrored

def show_image(filename):
    """Display an image file in a matplotlib window without axes.

    Args:
        filename: path of the image to show.

    Raises:
        IOError: if ``cv2.imread`` cannot read ``filename``.
    """
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread returns None on failure; report the path instead of
        # letting cvtColor fail on an empty input.
        raise IOError('cv2.imread could not read image: ' + filename)
    # OpenCV loads pixels as BGR, matplotlib expects RGB.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.axis("off")
    plt.imshow(rgb_image)
    plt.show()


# Demo: load the training labels, inspect the first sample, then show how a
# horizontal flip changes its label.
print("========================PASCAL=================================")
pascal = pascal_voc('train')  # loads labels from the cache or rebuilds them
print(pascal.class_to_ind)  # dict: class name -> index, 20 entries
print(len(pascal.gt_labels))  # list: default 5011; flipped 10022
data = pascal.gt_labels[0]  # first sample after the shuffle in prepare()
print(data.keys())  # dict_keys(['flipped', 'imname', 'label'])
print(data['imname'])
print(data['label'].shape)  # 7,7,25  (confidence + (x,y,w,h) + 20 classes)

print(classes)
show_image(data['imname'])

print("========================DATA=================================")
print_data(data)

print("=========================FLIPPED================================")
# Flip a deep copy so the sample stored in pascal.gt_labels stays untouched.
flipped = copy.deepcopy(data)
flip_data(flipped)
print_data(flipped)

Loading gt_labels from: data\pascal_voc\cache\pascal_train_gt_labels.pkl
{'dog': 11, 'train': 18, 'bus': 5, 'motorbike': 13, 'aeroplane': 0, 'bicycle': 1, 'person': 14, 'horse': 12, 'bird': 2, 'tvmonitor': 19, 'sheep': 16, 'boat': 3, 'car': 6, 'diningtable': 10, 'pottedplant': 15, 'sofa': 17, 'bottle': 4, 'chair': 8, 'cat': 7, 'cow': 9}
5011
dict_keys(['flipped', 'label', 'imname'])
data\pascal_voc\VOCdevkit\VOC2007\JPEGImages\002939.jpg
(7, 7, 25)
['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']

png

========================DATA=================================
0 , 0 =  [0. 0. 0. 0. 0.]
0 , 1 =  [0. 0. 0. 0. 0.]
0 , 2 =  [0. 0. 0. 0. 0.]
0 , 3 =  [0. 0. 0. 0. 0.]
0 , 4 =  [0. 0. 0. 0. 0.]
0 , 5 =  [0. 0. 0. 0. 0.]
0 , 6 =  [0. 0. 0. 0. 0.]
1 , 0 =  [0. 0. 0. 0. 0.]
1 , 1 =  [0. 0. 0. 0. 0.]
1 , 2 =  [0. 0. 0. 0. 0.]
1 , 3 =  [0. 0. 0. 0. 0.]
1 , 4 =  [0. 0. 0. 0. 0.]
1 , 5 =  [0. 0. 0. 0. 0.]
1 , 6 =  [0. 0. 0. 0. 0.]
2 , 0 =  [0. 0. 0. 0. 0.]
2 , 1 =  [0. 0. 0. 0. 0.]
2 , 2 =  [0. 0. 0. 0. 0.]
2 , 3 =  [0. 0. 0. 0. 0.]
2 , 4 =  [0. 0. 0. 0. 0.]
2 , 5 =  [0. 0. 0. 0. 0.]
2 , 6 =  [0. 0. 0. 0. 0.]
3 , 0 =  [0. 0. 0. 0. 0.]
3 , 1 =  [  1.          70.336      202.496       74.368      149.33333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
    class_index =  19
    class_name =  tvmonitor
3 , 2 =  [0. 0. 0. 0. 0.]
3 , 3 =  [0. 0. 0. 0. 0.]
3 , 4 =  [  1.         267.456      229.97333333  29.568       77.65333333]
    class_one_hot =  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  4
    class_name =  bottle
3 , 5 =  [0. 0. 0. 0. 0.]
3 , 6 =  [0. 0. 0. 0. 0.]
4 , 0 =  [0. 0. 0. 0. 0.]
4 , 1 =  [0. 0. 0. 0. 0.]
4 , 2 =  [0. 0. 0. 0. 0.]
4 , 3 =  [  1.         220.864      283.136      158.592      327.33866667]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
    class_index =  14
    class_name =  person
4 , 4 =  [0. 0. 0. 0. 0.]
4 , 5 =  [0. 0. 0. 0. 0.]
4 , 6 =  [0. 0. 0. 0. 0.]
5 , 0 =  [0. 0. 0. 0. 0.]
5 , 1 =  [0. 0. 0. 0. 0.]
5 , 2 =  [0. 0. 0. 0. 0.]
5 , 3 =  [0. 0. 0. 0. 0.]
5 , 4 =  [  1.         283.584      337.49333333  92.288      185.17333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
5 , 5 =  [0. 0. 0. 0. 0.]
5 , 6 =  [0. 0. 0. 0. 0.]
6 , 0 =  [0. 0. 0. 0. 0.]
6 , 1 =  [0. 0. 0. 0. 0.]
6 , 2 =  [0. 0. 0. 0. 0.]
6 , 3 =  [0. 0. 0. 0. 0.]
6 , 4 =  [0. 0. 0. 0. 0.]
6 , 5 =  [0. 0. 0. 0. 0.]
6 , 6 =  [0. 0. 0. 0. 0.]
=========================FLIPPED================================
0 , 0 =  [0. 0. 0. 0. 0.]
0 , 1 =  [0. 0. 0. 0. 0.]
0 , 2 =  [0. 0. 0. 0. 0.]
0 , 3 =  [0. 0. 0. 0. 0.]
0 , 4 =  [0. 0. 0. 0. 0.]
0 , 5 =  [0. 0. 0. 0. 0.]
0 , 6 =  [0. 0. 0. 0. 0.]
1 , 0 =  [0. 0. 0. 0. 0.]
1 , 1 =  [0. 0. 0. 0. 0.]
1 , 2 =  [0. 0. 0. 0. 0.]
1 , 3 =  [0. 0. 0. 0. 0.]
1 , 4 =  [0. 0. 0. 0. 0.]
1 , 5 =  [0. 0. 0. 0. 0.]
1 , 6 =  [0. 0. 0. 0. 0.]
2 , 0 =  [0. 0. 0. 0. 0.]
2 , 1 =  [0. 0. 0. 0. 0.]
2 , 2 =  [0. 0. 0. 0. 0.]
2 , 3 =  [0. 0. 0. 0. 0.]
2 , 4 =  [0. 0. 0. 0. 0.]
2 , 5 =  [0. 0. 0. 0. 0.]
2 , 6 =  [0. 0. 0. 0. 0.]
3 , 0 =  [0. 0. 0. 0. 0.]
3 , 1 =  [0. 0. 0. 0. 0.]
3 , 2 =  [  1.         179.544      229.97333333  29.568       77.65333333]
    class_one_hot =  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  4
    class_name =  bottle
3 , 3 =  [0. 0. 0. 0. 0.]
3 , 4 =  [0. 0. 0. 0. 0.]
3 , 5 =  [  1.         376.664      202.496       74.368      149.33333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
    class_index =  19
    class_name =  tvmonitor
3 , 6 =  [0. 0. 0. 0. 0.]
4 , 0 =  [0. 0. 0. 0. 0.]
4 , 1 =  [0. 0. 0. 0. 0.]
4 , 2 =  [0. 0. 0. 0. 0.]
4 , 3 =  [  1.         226.136      283.136      158.592      327.33866667]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
    class_index =  14
    class_name =  person
4 , 4 =  [0. 0. 0. 0. 0.]
4 , 5 =  [0. 0. 0. 0. 0.]
4 , 6 =  [0. 0. 0. 0. 0.]
5 , 0 =  [0. 0. 0. 0. 0.]
5 , 1 =  [0. 0. 0. 0. 0.]
5 , 2 =  [  1.         163.416      337.49333333  92.288      185.17333333]
    class_one_hot =  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
    class_index =  8
    class_name =  chair
5 , 3 =  [0. 0. 0. 0. 0.]
5 , 4 =  [0. 0. 0. 0. 0.]
5 , 5 =  [0. 0. 0. 0. 0.]
5 , 6 =  [0. 0. 0. 0. 0.]
6 , 0 =  [0. 0. 0. 0. 0.]
6 , 1 =  [0. 0. 0. 0. 0.]
6 , 2 =  [0. 0. 0. 0. 0.]
6 , 3 =  [0. 0. 0. 0. 0.]
6 , 4 =  [0. 0. 0. 0. 0.]
6 , 5 =  [0. 0. 0. 0. 0.]
6 , 6 =  [0. 0. 0. 0. 0.]