How to filter COCO dataset classes & annotations for a custom dataset?
I was able to filter the images using the code below with the COCO API. I ran this code once for each class I needed; this example is for the category person, and I did the same for car, etc.
What I want to do now is filter the annotations of the dataset (instances_train2017.json) and save the result as a JSON file in the same format as instances_train2017.json.
# Look up every category known to the annotation file and list their names
cats = coco.loadCats(coco.getCatIds())
nms = [entry['name'] for entry in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))

# Resolve the ids and records of all images that contain the chosen category
catIds = coco.getCatIds(catNms=['person'])
imgIds = coco.getImgIds(catIds=catIds)
images = coco.loadImgs(imgIds)
print("imgIds: ", len(imgIds))
# print("images: ", images)

# Fetch every matching image from its COCO URL and store it locally
for im in images:
    print("im: ", im)
    img_data = requests.get(im['coco_url']).content
    with open('customCoco/images/train2017/' + im['file_name'], 'wb') as handler:
        handler.write(img_data)
I tried to use the COCO API, but that doesn't give me the COCO format I want (like instances_train2017.json).
# Collect the annotations of the chosen category, one image at a time
for im in images:
    annIds = coco.getAnnIds(
        imgIds=im['id'],
        catIds=catIds,
        iscrowd=None,
    )
    anns = coco.loadAnns(annIds)
    print("anns: ", anns)
I found this post: https://github.com/cocodataset/cocoapi/issues/271, but it saves the result in CSV format, which I don't want; I want the same file, just filtered.
Solution 1:[1]
I recommend Jalagarto's coco_utils / COCO API wrapper: https://github.com/Jalagarto/coco_utils, which generates both images and annotations. In the code below, I've extended it to work for multiple classes.
Other resources:
Visualization of Image/Annotation
xD Hope this helps!!
"""
1. saves images/annotations from categories
2. creates new json by filtering the main json file
coco_categories = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
Expected Directory:
script.py
COCO[
annotations
val2017
train2017
]
"""
from pycocotools.coco import COCO
import requests
import os
from os.path import join
from tqdm import tqdm
import json
class coco_category_filter:
    """
    Downloads the images of the selected categories & filters the json
    to only keep the annotations of those categories.

    Attributes:
        coco: pycocotools ``COCO`` index built from ``json_path``.
        categ: the category name(s) requested by the caller.
        catIds: category ids resolved from the requested names.
        images: image records (dicts) that contain at least one of the categories.
    """

    def __init__(self, json_path, _categ):
        """
        json_path: path of the COCO annotation json to index.
        _categ: category name, or list of category names, to keep.
        """
        self.coco = COCO(json_path)  # instantiate coco class
        # Remember what was requested (this used to be hard-coded to '')
        self.categ = _categ
        self.images = self.get_imgs_from_json(_categ)

    def get_imgs_from_json(self, _categ):
        """
        Returns the image records that contain at least one of the categories.

        Also stores the resolved category ids in ``self.catIds``.
        Raises AssertionError when none of the names can be resolved.
        """
        # Get category ids
        self.catIds = self.coco.getCatIds(catNms=_categ)
        assert len(self.catIds) > 0, "[ERROR] cannot find category index for {}".format(_categ)
        print("catIds: ", self.catIds)

        # Query one category at a time and merge the results (logical OR);
        # the set removes images that contain several of the categories.
        img_ids = set()
        for cat_id in self.catIds:
            img_ids.update(self.coco.getImgIds(catIds=cat_id))

        # Sorted so the download order is reproducible between runs
        images = self.coco.loadImgs(sorted(img_ids))
        print(f"{len(images)} images of '{_categ}' instances")
        return images

    def save_imgs(self, imgs_dir):
        """
        Downloads every selected image into ``imgs_dir`` (created if missing).

        Images whose download returns an HTTP error status are skipped with a
        warning, so that an error page is never saved under an image file name.
        """
        print("Saving the images with required categories ...")
        os.makedirs(imgs_dir, exist_ok=True)
        # Save the images into a local folder
        for im in tqdm(self.images):
            response = requests.get(im['coco_url'])
            if not response.ok:
                print(f"[WARNING] skipping {im['file_name']}: HTTP {response.status_code}")
                continue
            with open(os.path.join(imgs_dir, im['file_name']), 'wb') as handler:
                handler.write(response.content)

    def filter_json_by_category(self, json_dir):
        """
        Writes ``coco_annotation.json`` into ``json_dir`` with only the selected
        images, the annotations of the selected categories and those categories.

        Image and annotation ids are kept exactly as in the source file.
        """
        print("Filtering the annotations ... ")
        if json_dir:  # an empty string means "current directory"
            os.makedirs(json_dir, exist_ok=True)
        dataset = self.coco.dataset

        # Sets give O(1) membership tests; the source lists hold up to
        # hundreds of thousands of entries.
        wanted_img_ids = {x['id'] for x in self.images}
        wanted_cat_ids = set(self.catIds)

        ### Filter images
        new_imgs = [x for x in dataset['images'] if x['id'] in wanted_img_ids]

        ### Filter annotations
        new_annots = [x for x in dataset['annotations'] if x['category_id'] in wanted_cat_ids]

        ### Filter categories
        new_categories = [x for x in dataset['categories'] if x['id'] in wanted_cat_ids]
        print("new_categories: ", new_categories)

        data = {
            # "info" and "licenses" are absent from some COCO-format files
            "info": dataset.get('info', {}),
            "licenses": dataset.get('licenses', []),
            "images": new_imgs,
            "annotations": new_annots,
            "categories": new_categories,
        }

        print("saving json: ")
        with open(os.path.join(json_dir, "coco_annotation.json"), 'w') as f:
            json.dump(data, f)

    def modify_ids(self, images, annotations):
        """
        Renumbers image and annotation ids to 1..N, in place, and returns both lists.

        images: list of image dictionaries; images[n]['id'] becomes n + 1
        annotations: list of annotation dictionaries; annotations[n]['id'] becomes
            n + 1 and 'image_id' is remapped to the new id of its image

        Raises KeyError when an annotation refers to an image missing from ``images``.
        NOTE: the dictionaries are modified in place; pass copies if the
        originals must keep their ids.
        """
        print("Reinitialicing images and annotation IDs ...")

        ### Images
        map_old_to_new_id = {}  # necessary for the annotations!
        for new_id, im in enumerate(images, start=1):
            map_old_to_new_id[im['id']] = new_id
            im['id'] = new_id

        ### Annotations
        for new_id, ann in enumerate(annotations, start=1):
            ann['id'] = new_id
            ann['image_id'] = map_old_to_new_id[ann['image_id']]

        return images, annotations
def main(subset, year, root_dir, categories, experiment):
    """
    Filter one COCO split down to ``categories``.

    Reads ``annotations/instances_<subset><year>.json`` under ``root_dir`` and
    writes the images and the filtered json under ``<root_dir>/<experiment>``.
    """
    # Input annotations (local path)
    annotation_file = 'annotations/instances_' + subset + year + '.json'
    json_file = join(root_dir, annotation_file)

    # Output folders, created up front
    img_dir = join(root_dir, experiment, 'images')
    json_dir = join(root_dir, experiment, 'annotations')
    for folder in (img_dir, json_dir):
        os.makedirs(folder, exist_ok=True)

    # Build the filter, then download the images and export the json
    coco_filter = coco_category_filter(json_file, categories)
    coco_filter.save_imgs(img_dir)
    coco_filter.filter_json_by_category(json_dir)
if __name__ == '__main__':
    # Split to process: 'val' or 'train'
    subset = 'val'
    year = '2017'
    root_dir = './datasets/COCO'
    experiment = "my_custom_dataset"
    # One or several category names
    categories = ['person', 'bicycle', 'car']
    main(subset, year, root_dir, categories, experiment)
Solution 2:[2]
@corticalhazard's answer works well, but the code throws an error when requests reports "Max retries exceeded with url". So I refactored the code based on that awesome solution to solve this issue.
Here is the code:
from pycocotools.coco import COCO
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import os
from os.path import join
from tqdm import tqdm
import json
class coco_category_filter:
    """
    Downloads images of one category & filters jsons
    to only keep annotations of this category.

    Attributes:
        coco: pycocotools ``COCO`` index built from ``json_path``.
        json_path: path of the source annotation json.
        imgs_dir: folder the images are downloaded into.
        categ: name of the category to keep.
        catIds: category ids resolved from ``categ``.
        images: image records (dicts) that contain the category.
    """

    def __init__(self, json_path, imgs_dir, categ='person'):
        self.coco = COCO(json_path)  # instantiate coco class
        self.json_path = json_path
        self.imgs_dir = imgs_dir
        self.categ = categ
        self.images = self.get_imgs_from_json()

    def get_imgs_from_json(self):
        """
        Returns the image records that contain ``self.categ``.

        Also stores the resolved category ids in ``self.catIds``.
        Raises ValueError when the category name is unknown.
        """
        catIds = self.coco.getCatIds(catNms=[self.categ])
        print("catIds: ", catIds)
        if not catIds:
            # pycocotools treats an empty catIds filter as "no filter", so an
            # unknown name would silently select every image in the dataset.
            raise ValueError(f"cannot find category index for '{self.categ}'")

        # Get the corresponding image ids and images using loadImgs
        imgIds = self.coco.getImgIds(catIds=catIds)
        images = self.coco.loadImgs(imgIds)
        print(f"{len(images)} images in '{self.json_path}' with '{self.categ}' instances")
        self.catIds = catIds  # list
        return images

    def save_imgs(self):
        """
        Downloads every selected image into ``self.imgs_dir`` (created if missing).

        Connection failures are retried up to 3 times with backoff. Images whose
        download returns an HTTP error status are skipped with a warning, so
        that an error page is never saved under an image file name.
        """
        print("Saving the images with required categories ...")
        os.makedirs(self.imgs_dir, exist_ok=True)

        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        # The context manager closes the pooled connections when done
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # Save the images into a local folder
            for im in tqdm(self.images):
                response = session.get(im['coco_url'])
                if not response.ok:
                    print(f"[WARNING] skipping {im['file_name']}: HTTP {response.status_code}")
                    continue
                with open(os.path.join(self.imgs_dir, im['file_name']), 'wb') as handler:
                    handler.write(response.content)

    def filter_json_by_category(self, new_json_path):
        """
        Writes a new json at ``new_json_path`` holding only the selected images,
        the annotations of the category and the category itself.

        Image and annotation ids are renumbered from 1 in the written file; the
        loaded dataset and ``self.images`` keep their original ids, so this
        method can be called more than once.
        """
        print("Filtering the annotations ... ")
        json_parent = os.path.dirname(new_json_path)
        if json_parent:  # a bare file name has no folder to create
            os.makedirs(json_parent, exist_ok=True)
        dataset = self.coco.dataset

        # Sets give O(1) membership tests on the large source lists
        wanted_img_ids = {x['id'] for x in self.images}
        wanted_cat_ids = set(self.catIds)

        ### Filter images and annotations
        # Shallow copies: modify_ids rewrites ids in place and must not touch
        # the dictionaries owned by self.coco / self.images.
        new_imgs = [dict(x) for x in dataset['images'] if x['id'] in wanted_img_ids]
        new_annots = [
            dict(x) for x in dataset['annotations']
            if x['category_id'] in wanted_cat_ids and x['image_id'] in wanted_img_ids
        ]

        ### Reorganize the ids
        new_imgs, new_annots = self.modify_ids(new_imgs, new_annots)

        ### Filter categories
        new_categories = [x for x in dataset['categories'] if x['id'] in wanted_cat_ids]
        print("new_categories: ", new_categories)

        data = {
            # "info" and "licenses" are absent from some COCO-format files
            "info": dataset.get('info', {}),
            "licenses": dataset.get('licenses', []),
            "images": new_imgs,
            "annotations": new_annots,
            "categories": new_categories,
        }

        print("saving json: ")
        with open(new_json_path, 'w') as f:
            json.dump(data, f)

    def modify_ids(self, images, annotations):
        """
        Renumbers image and annotation ids to 1..N, in place, and returns both lists.

        images: list of image dictionaries; images[n]['id'] becomes n + 1
        annotations: list of annotation dictionaries; annotations[n]['id'] becomes
            n + 1 and 'image_id' is remapped to the new id of its image

        Raises KeyError when an annotation refers to an image missing from ``images``.
        """
        print("Reinitialicing images and annotation IDs ...")

        ### Images
        old_new_imgs_ids = {}  # necessary for the annotations!
        for new_id, im in enumerate(images, start=1):
            old_new_imgs_ids[im['id']] = new_id
            im['id'] = new_id

        ### Annotations
        for new_id, ann in enumerate(annotations, start=1):
            ann['id'] = new_id
            ann['image_id'] = old_new_imgs_ids[ann['image_id']]

        return images, annotations
def main(subset, year, root_dir, category='person'):
    """
    Download the images of ``category`` for one COCO split and export the
    filtered annotations as ``<root_dir>/annotations/<subset>.json``.
    """
    # NOTE(review): the source json is looked up in the PARENT of root_dir,
    # not in root_dir itself - confirm this matches the local folder layout.
    source_dir = os.path.dirname(root_dir)
    json_file = join(source_dir, 'instances_' + subset + year + '.json')  # local path

    # Output locations
    imgs_dir = join(root_dir, category + '_' + subset)
    new_json_file = join(root_dir, 'annotations', subset + ".json")

    # Build the filter, then download the images and export the json
    coco_filter = coco_category_filter(json_file, imgs_dir, categ=category)
    coco_filter.save_imgs()
    coco_filter.filter_json_by_category(new_json_file)
if __name__ == '__main__':
    # Split to process and dataset release year
    subset = 'train'
    year = '2017'
    root_dir = './datasets/COCO/annotations'
    main(subset, year, root_dir, category='person')
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | |
| Solution 2 | Masoud Masoumi Moghadam |
