# Source code for pylabel.importer

"""This module includes the commands to import an existing dataset. 
PyLabel current supports importing labels from COCO, YOLO, and VOC formats. 
You can also import set of images that do not have labels yet and label them manually using the PyLabel
labelling tool. """

import json
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import os
from os.path import exists
from pathlib import Path, PurePath
import copy
import cv2
import yaml
from tqdm import tqdm

from pylabel.shared import schema
from pylabel.dataset import Dataset
from pylabel.exporter import Export


def _GetValueOrBlank(element, user_input=None):
    """
    Return the ``.text`` of an XML element, or "" when the element is missing.

    Looking up a tag that is absent from the XML file yields an object without
    a ``.text`` attribute (typically ``None``); reading ``.text`` on it raises
    ``AttributeError``, which is mapped to an empty string here.

    Args:
        element: The object whose ``.text`` attribute is read. May be ``None``.
        user_input: Optional override. When it is anything other than ``None``
            it is returned unchanged and ``element`` is ignored.

    Returns:
        ``user_input`` when supplied, otherwise ``element.text``, otherwise "".
    """
    # Identity test: only a literal None means "no override supplied". An
    # equality test could be fooled by objects with a permissive __eq__.
    if user_input is not None:
        return user_input

    try:
        return element.text
    except AttributeError:
        return ""


# The valid columns of the pylabel annotations table are listed in `schema` (imported from pylabel.shared).
def ImportCoco(path, path_to_images=None, name=None, encoding="utf-8"):
    """
    Import a JSON file in COCO format and return it as a PyLabel dataset.

    The "images", "annotations" and "categories" sections of the JSON file are
    flattened into tables, joined into one row per annotation, and reduced to
    the columns listed in ``schema``. Images without annotations are kept
    because the image table is the left side of a left join.

    Returns:
        PyLabel dataset object.

    Args:
        path (str): The path to the JSON file with the COCO annotations.
        path_to_images (str): The path to the images relative to the json file.
            If the images are in the same directory as the JSON file then omit
            this parameter. If the images are in a different directory on the
            same level as the annotations then you would set
            `path_to_images='../images/'`
        name (str): This will set the dataset.name property for this dataset.
            If not specified, the filename (without extension) of the COCO
            annotation file will be used as the dataset name.
        encoding (str): Default is 'utf-8'. Encoding of the annotations file.

    Example:
        >>> from pylabel import importer
        >>> dataset = importer.ImportCoco("coco_annotations.json")
    """
    with open(path, encoding=encoding) as cocojson:
        annotations_json = json.load(cocojson)

    # Flatten each of the three sections of the JSON into its own table and
    # prefix the column names so they cannot collide after the joins below.
    images = pd.json_normalize(annotations_json["images"])
    images.columns = "img_" + images.columns

    if "img_folder" not in images.columns:
        images["img_folder"] = ""

    # A folder given by the caller takes precedence over the one in the file.
    if path_to_images is not None:
        images["img_folder"] = path_to_images

    # Cast only the size columns that are actually present in the file.
    size_dtypes = {
        column: "int64"
        for column in ("img_width", "img_height", "img_depth")
        if column in images.columns
    }
    images = images.astype(size_dtypes)

    annotations = pd.json_normalize(annotations_json["annotations"])
    annotations.columns = "ann_" + annotations.columns

    categories = pd.json_normalize(annotations_json["categories"])
    categories.columns = "cat_" + categories.columns
    # Converting this to string resolves issue #23
    categories["cat_id"] = categories["cat_id"].astype(str)

    df = annotations
    # Converting this to string resolves issue #23
    df["ann_category_id"] = df["ann_category_id"].astype(str)

    # Each ann_bbox entry is unpacked positionally into xmin, ymin, width, height.
    df[
        ["ann_bbox_xmin", "ann_bbox_ymin", "ann_bbox_width", "ann_bbox_height"]
    ] = pd.DataFrame(df.ann_bbox.tolist(), index=df.index)

    # Plain assignment instead of insert() at a fixed position: the fixed
    # positions fail when the annotation table has few columns, and the column
    # order is replaced by the schema order further down anyway.
    df["ann_bbox_xmax"] = df["ann_bbox_xmin"] + df["ann_bbox_width"]
    df["ann_bbox_ymax"] = df["ann_bbox_ymin"] + df["ann_bbox_height"]

    # Join the annotations with the information about the image to add the
    # image columns to the dataframe, then attach the category columns.
    df = pd.merge(images, df, left_on="img_id", right_on="ann_image_id", how="left")
    df = pd.merge(
        df, categories, left_on="ann_category_id", right_on="cat_id", how="left"
    )

    # Rename columns if needed from the coco column name to the pylabel column name
    df = df.rename(columns={"img_file_name": "img_filename"})

    # Drop columns that are not in the schema. The copy makes the result an
    # independent frame so that new columns can be added to it safely.
    df = df[df.columns.intersection(schema)].copy()

    # Add missing columns that are in the schema but not part of the table
    for column in schema:
        if column not in df.columns:
            df[column] = ""

    # Reorder columns
    df = df[schema].copy()
    df.index.name = "id"
    # NOTE(review): assumes `annotated` is one of the schema columns, in which
    # case this attribute assignment fills that column with 1 - confirm.
    df.annotated = 1

    # Fill na values with empty strings which resolved some errors when
    # working with images that don't have any annotations
    df = df.fillna("")

    # These should be strings
    df["cat_id"] = df["cat_id"].astype(str)

    # These should be integers
    df["img_width"] = df["img_width"].astype(int)
    df["img_height"] = df["img_height"].astype(int)

    dataset = Dataset(df)

    # Assign the filename (without extension) as the name of the dataset
    dataset.name = Path(path).stem if name is None else name
    dataset.path_to_annotations = PurePath(path).parent

    return dataset
def ImportVOC(path, path_to_images=None, name="dataset", encoding="utf-8"):
    """
    Provide the path to a directory with annotations in VOC Pascal XML format
    and it returns a PyLabel dataset object that contains the annotations.

    Every file in ``path`` whose name ends with ".xml" is parsed and one row
    is produced per ``<object>`` element. A file that contains no ``<object>``
    elements contributes no rows, but still consumes an image id.

    Returns:
        PyLabel dataset object.

    Args:
        path (str): The path to the directory with the annotations in VOC
            Pascal XML format.
        path_to_images (str): The path to the images relative to the
            annotations. If the images are in the same directory as the
            annotation files then omit this parameter. If the images are in a
            different directory on the same level as the annotations then you
            would set `path_to_images='../images/'`
        name (str): Default is 'dataset'. This will set the dataset.name
            property for this dataset.
        encoding (str): Default is 'utf-8'. Encoding of the annotations file(s).

    Example:
        >>> from pylabel import importer
        >>> dataset = importer.ImportVOC(path="annotations/", path_to_images="../images/")
    """
    xml_ext = ".xml"

    # Maps a row id to the dict of column values for that row.
    rows = {}
    row_id = 0
    img_id = 0

    # Maps each category name to a numeric id, assigned in order of first
    # appearance (0, 1, 2, ...).
    cat_ids = {}

    # The "folder" node in the xml is ignored; the image folder comes only
    # from the path_to_images argument.
    folder = path_to_images if path_to_images else "."

    with tqdm(desc="Importing VOC files...", total=len(os.listdir(path))) as pbar:
        for entry in os.scandir(path):
            if entry.is_file() and entry.name.endswith(xml_ext):
                with open(entry.path, "r", encoding=encoding) as xml_file:
                    root = ET.XML(xml_file.read())

                # The "filename" node in the xml is sometimes invalid, so only
                # its suffix is used; the image name itself is derived from
                # the name of the annotation file.
                suffix = Path(root.find("filename").text).suffix

                size = root.find("size")

                # Values shared by every annotation of this image.
                image_fields = {
                    "img_folder": folder,
                    # Swap only the trailing ".xml" for the image suffix.
                    "img_filename": entry.name[: -len(xml_ext)] + suffix,
                    "img_id": img_id,
                    "img_width": size.find("width").text,
                    "img_height": size.find("height").text,
                    "img_depth": _GetValueOrBlank(size.find("depth")),
                    "ann_segmented": _GetValueOrBlank(root.find("segmented")),
                }

                for obj in root.findall("object"):
                    # Start from a fresh copy so rows never share state.
                    row = dict(image_fields)

                    cat_name = obj.find("name").text
                    row["cat_name"] = cat_name
                    row["cat_id"] = cat_ids.setdefault(cat_name, len(cat_ids))

                    row["ann_pose"] = _GetValueOrBlank(obj.find("pose"))
                    row["ann_truncated"] = _GetValueOrBlank(obj.find("truncated"))
                    row["ann_difficult"] = _GetValueOrBlank(obj.find("difficult"))

                    bndbox = obj.find("bndbox")
                    row["ann_bbox_xmin"] = float(bndbox.find("xmin").text)
                    row["ann_bbox_ymin"] = float(bndbox.find("ymin").text)
                    row["ann_bbox_xmax"] = float(bndbox.find("xmax").text)
                    row["ann_bbox_ymax"] = float(bndbox.find("ymax").text)
                    row["ann_bbox_width"] = row["ann_bbox_xmax"] - row["ann_bbox_xmin"]
                    row["ann_bbox_height"] = row["ann_bbox_ymax"] - row["ann_bbox_ymin"]
                    row["ann_area"] = row["ann_bbox_width"] * row["ann_bbox_height"]
                    row["split"] = ""

                    rows[row_id] = row
                    row_id += 1

                # Increment the image id because the next file is another image.
                img_id += 1

            # Advance once per directory entry so the bar can reach the total,
            # which counts every entry returned by os.listdir.
            pbar.update()

    # Convert the dict with all of the annotation data to a dataframe
    df = pd.DataFrame.from_dict(rows, "index", columns=schema)
    df.index.name = "id"
    # NOTE(review): assumes `annotated` is one of the schema columns, in which
    # case this attribute assignment fills that column with 1 - confirm.
    df.annotated = 1

    # These should be strings
    df["cat_id"] = df["cat_id"].astype(str)

    # These should be integers
    df["img_width"] = df["img_width"].astype(int)
    df["img_height"] = df["img_height"].astype(int)

    # Reorder columns
    df = df[schema]

    dataset = Dataset(df)
    dataset.name = name
    dataset.path_to_annotations = path

    return dataset
def ImportYoloV5(
    path,
    img_ext="jpg,jpeg,png,webp",
    cat_names=[],
    path_to_images="",
    name="dataset",
    encoding="utf-8",
):
    """
    Provide the path to a directory with annotations in YOLO format and it
    returns a PyLabel dataset object that contains the annotations.

    The YOLO format does not store much information about the images, such as
    the height and width. When you import a YOLO dataset PyLabel will extract
    this information from the images.

    Every file in ``path`` whose name ends with ".txt" is treated as the label
    file of one image. Each non-blank line produces one annotation row; blank
    lines are skipped. A label file without any annotation lines produces a
    single row that carries only the image information.

    Returns:
        PyLabel dataset object.

    Args:
        path (str): The path to the directory with the annotations in YOLO
            format.
        img_ext (str, comma separated): Specify the file extension(s) of the
            images used in your dataset: jpeg, png, etc. This is required
            because the YOLO format does not store the filename of the images.
            PyLabel will iterate through the file extensions specified until
            it finds a match. Surrounding spaces and a leading dot on an
            extension are ignored.
        cat_names (list): YOLO annotations only store a class number, not the
            name. You can provide a list of class names whose positions
            correspond to the int used to represent that class in the
            annotations. For example `['Squirrel', 'Nut']`. If you have the
            class names already stored in a YOLO YAML file then use the
            ImportYoloV5WithYaml method to automatically read the class names
            from that file.
        path_to_images (str): The path to the images relative to the
            annotations. If the images are in the same directory as the
            annotation files then omit this parameter. If the images are in a
            different directory on the same level as the annotations then you
            would set `path_to_images='../images/'`
        name (str): Default is 'dataset'. This will set the dataset.name
            property for this dataset.
        encoding (str): Default is 'utf-8'. Encoding of the annotations file(s).

    Raises:
        AssertionError: If no image file matching a label file is found.

    Example:
        >>> from pylabel import importer
        >>> dataset = importer.ImportYoloV5(path="labels/", path_to_images="../images/")
    """
    # NOTE: the mutable default of `cat_names` is kept for interface
    # compatibility; the list is only read in this function, never modified.

    def GetCatNameFromId(cat_id, cat_names):
        """Return the name stored at position cat_id, or None if out of range."""
        cat_id = int(cat_id)
        if len(cat_names) > cat_id:
            return cat_names[cat_id]
        return None

    label_ext = ".txt"
    extensions = [ext.strip().lstrip(".") for ext in img_ext.split(",")]

    # Maps a row id to the dict of column values for that row.
    rows = {}
    row_id = 0
    img_id = 0

    with tqdm(desc="Importing YOLO files...", total=len(os.listdir(path))) as pbar:
        for entry in os.scandir(path):
            if entry.is_file() and entry.name.endswith(label_ext):
                # Only the trailing ".txt" is swapped for an image extension,
                # so a "txt" inside the file stem is left untouched.
                stem = entry.name[: -len(label_ext)]

                # Figure out the extension of the corresponding image file by
                # trying each extension of the img_ext parameter in order.
                found_image = False
                for ext in extensions:
                    image_filename = f"{stem}.{ext}"
                    image_path = PurePath(path, path_to_images, image_filename)
                    if exists(image_path):
                        found_image = True
                        break

                # Raised explicitly (rather than with an assert statement) so
                # that the check also runs when Python is started with -O.
                if not found_image:
                    raise AssertionError(
                        f"No image file found: {image_path}. Check path_to_images and img_ext arguments."
                    )

                # Decode the image to extract the height, width, and depth.
                with open(str(image_path), "rb") as imgstream:
                    imgbytes = bytearray(imgstream.read())
                numpyarray = np.asarray(imgbytes, dtype=np.uint8)
                im = cv2.imdecode(numpyarray, cv2.IMREAD_UNCHANGED)

                img_height = im.shape[0]
                img_width = im.shape[1]
                # A two-dimensional array has no channel axis; depth is 1 then.
                if len(im.shape) == 2:
                    img_depth = 1
                else:
                    img_depth = im.shape[2]

                # Values shared by every annotation of this image.
                image_fields = {
                    "img_folder": path_to_images,
                    "img_filename": image_filename,
                    "img_id": img_id,
                    "img_width": img_width,
                    "img_height": img_height,
                    "img_depth": img_depth,
                }

                with open(entry.path, "r", encoding=encoding) as label_file:
                    stripped_lines = [line.strip() for line in label_file]
                annotation_lines = [line for line in stripped_lines if line]

                if not annotation_lines:
                    # Create a row without annotations
                    rows[row_id] = dict(image_fields)
                    row_id += 1

                for line in annotation_lines:
                    (
                        cat_id,
                        x_center_norm,
                        y_center_norm,
                        width_norm,
                        height_norm,
                    ) = line.split()

                    # Every annotation starts from a fresh copy of the image
                    # values so that rows never share or leak state.
                    row = dict(image_fields)

                    # The label values are fractions of the image size; scale
                    # them to pixels and convert centre/size to box corners.
                    row["ann_bbox_width"] = float(width_norm) * img_width
                    row["ann_bbox_height"] = float(height_norm) * img_height
                    row["ann_bbox_xmin"] = float(x_center_norm) * img_width - (
                        row["ann_bbox_width"] / 2
                    )
                    row["ann_bbox_ymax"] = float(y_center_norm) * img_height + (
                        row["ann_bbox_height"] / 2
                    )
                    row["ann_bbox_xmax"] = row["ann_bbox_xmin"] + row["ann_bbox_width"]
                    row["ann_bbox_ymin"] = row["ann_bbox_ymax"] - row["ann_bbox_height"]
                    row["ann_area"] = row["ann_bbox_width"] * row["ann_bbox_height"]

                    row["cat_id"] = cat_id
                    row["cat_name"] = GetCatNameFromId(cat_id, cat_names)

                    rows[row_id] = row
                    row_id += 1

                # increment the image id
                img_id += 1

            # Advance once per directory entry so the bar can reach the total,
            # which counts every entry returned by os.listdir.
            pbar.update()

    df = pd.DataFrame.from_dict(rows, "index", columns=schema)
    df.index.name = "id"
    # NOTE(review): assumes `annotated` is one of the schema columns, in which
    # case this attribute assignment fills that column with 1 - confirm.
    df.annotated = 1
    df = df.fillna("")

    # These should be strings
    df["cat_id"] = df["cat_id"].astype(str)

    # These should be integers
    df["img_width"] = df["img_width"].astype(int)
    df["img_height"] = df["img_height"].astype(int)

    dataset = Dataset(df)
    dataset.name = name
    dataset.path_to_annotations = path

    return dataset
def ImportImagesOnly(path, name="dataset"):
    """Import a directory of images as a dataset with no annotations.
    Then use PyLabel to annotate the images. Will import images with these
    extensions: ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')

    Args:
        path (str): The path to the directory with the images.
        name (str): Default is 'dataset'. Descriptive name, which is used when
            outputting files.

    Returns:
        A dataset object with one row for each image and no annotations.

    Raises:
        ValueError: If a file with one of the listed extensions cannot be read
            as an image.

    Example:
        >>> from pylabel import importer
        >>> dataset = importer.ImportImagesOnly(path="images/")
    """
    # NOTE(review): files are read with cv2.imread; confirm that the installed
    # OpenCV build can decode every extension listed here (e.g. .gif).
    image_extensions = (".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif")

    # Maps an image id to the dict of column values for that image.
    rows = {}
    img_id = 0

    with tqdm(desc="Importing image files...", total=len(os.listdir(path))) as pbar:
        for entry in os.scandir(path):
            if entry.is_file() and entry.name.lower().endswith(image_extensions):
                image_path = PurePath(path, entry.name)
                im = cv2.imread(str(image_path))

                # The result is checked for None explicitly so that a file
                # which could not be read is reported with a clear error.
                if im is None:
                    raise ValueError(
                        f"Error reading file '{image_path}'. The file could not be "
                        "decoded as an image; remove it from the directory or "
                        "convert it to a supported format."
                    )

                img_height, img_width, img_depth = im.shape

                rows[img_id] = {
                    "img_folder": "",
                    "img_filename": entry.name,
                    "img_id": img_id,
                    "img_width": img_width,
                    "img_height": img_height,
                    "img_depth": img_depth,
                    "cat_name": "",
                }
                img_id += 1

            # Advance once per directory entry so the bar can reach the total,
            # which counts every entry returned by os.listdir.
            pbar.update()

    df = pd.DataFrame.from_dict(rows, "index", columns=schema)
    df.index.name = "id"

    dataset = Dataset(df)
    dataset.name = name
    dataset.path_to_annotations = path

    return dataset
def _yaml_reader(yaml_file, encoding):
    """Load the YAML file that describes a YOLOv5 dataset.

    Args:
        yaml_file: Path of the YAML file to open.
        encoding: Text encoding used to open the file.

    Returns:
        The object produced by ``yaml.safe_load`` for the file contents.
    """
    with open(yaml_file, encoding=encoding) as stream:
        parsed = yaml.safe_load(stream)
    return parsed
def ImportYoloV5WithYaml(
    yaml_file,
    image_ext="jpg",
    name_of_annotations_folder="labels",
    path_to_annotations=None,
    encoding="utf-8",
):
    """Import a YOLO dataset by reading the YAML file to extract the class
    names, image and label locations, and preserve if an image should be in
    the train, test, or val split.

    Every key of the YAML file other than "nc" and "names" is treated as the
    name of a split whose value is the path to the images of that split. Each
    split is imported with ImportYoloV5 and the results are combined into one
    dataset whose "split" column records the split name.

    Returns:
        PyLabel dataset object.

    Args:
        yaml_file (str): Path to the yaml file that describes the dataset to
            be imported.
        image_ext (str): The image file extension.
        name_of_annotations_folder (str): Default is "labels". Change this to
            "annotations" if your folder is called "annotations".
        path_to_annotations (str): The path to the annotations of the first
            split. If it is None, the annotations path is derived from the
            images path in the yaml file by replacing "images" with the name
            of the annotations folder. For later splits the name of the first
            split inside this path is replaced by the name of the split being
            imported.
        encoding (str): Default is 'utf-8'. Encoding of the YAML file and of
            the annotations file(s).

    Raises:
        ValueError: If the YAML file contains no split entries.

    Example:
        >>> from pylabel import importer
        >>> dataset = importer.ImportYoloV5WithYaml(yaml_file='data/dataset.yaml')
    """
    # Note to other developers: the "path_to_images" variable in this code
    # refers to the path relative to the path to annotations. It is different
    # from the path to images specified in the YAML file. PyLabel uses the
    # former to establish its pathing and the latter to derive where the
    # annotations are located.

    # Keep the caller's value; path_to_annotations is reassigned per split.
    user_annotations_path = path_to_annotations

    data = _yaml_reader(yaml_file, encoding)
    yoloclasses = data["names"]

    # NOTE(review): any other non-split key in the YAML file (for example a
    # dataset root entry) is also treated as a split here - confirm whether
    # such keys need to be excluded as well.
    dataset = None
    first_split = None
    previous_split = None

    for splitted in list(data.keys()):
        if splitted in ["nc", "names"]:
            continue

        yaml_images_path = data[splitted]

        if previous_split is None:
            # First split: use the caller's path if there is one, otherwise
            # derive the annotations folder from the images path.
            first_split = splitted
            if user_annotations_path is None:
                path_to_annotations = str(
                    PurePath(
                        yaml_images_path.replace("images", name_of_annotations_folder)
                    )
                )
        elif user_annotations_path is not None:
            # The caller's path names the first split, so that is the name
            # that has to be swapped for the current split.
            path_to_annotations = str(
                PurePath(user_annotations_path.replace(first_split, splitted))
            )
        elif name_of_annotations_folder != "labels":
            # path_to_annotations still holds the previous split's path here.
            path_to_annotations = str(
                PurePath(path_to_annotations.replace(previous_split, splitted))
            )
        else:
            path_to_annotations = str(
                PurePath(yaml_images_path.replace("images", "labels"))
            )

        path_to_images = str(PurePath("../../images/", splitted))

        split_dataset = ImportYoloV5(
            path=path_to_annotations,
            path_to_images=path_to_images,
            cat_names=yoloclasses,
            img_ext=image_ext,
            encoding=encoding,
        )
        split_dataset.df["split"] = splitted

        if dataset is None:
            dataset = split_dataset
        else:
            # Shift the image ids of the new split past the largest id seen so
            # far so that they stay unique after the tables are combined.
            max_img_id = max(dataset.df["img_id"])
            split_dataset.df["img_id"] += max_img_id + 1

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            dataset.df = pd.concat([dataset.df, split_dataset.df])
            # NOTE(review): this moves the old index into a regular column
            # rather than discarding it; kept as-is to preserve the existing
            # output - confirm whether drop=True was intended.
            dataset.df.reset_index(0, inplace=True)

        previous_split = splitted

    if dataset is None:
        raise ValueError(
            f"No split entries found in the YAML file: {yaml_file}"
        )

    return dataset