Source code for pylabel.analyze

"""The analyze module analyzes the contents of the dataset and provides summary statistics 
such as the count of images and classes. """

import pandas as pd
import numpy as np


class Analyze:
    """Summary statistics over a dataset's annotation dataframe.

    The wrapped ``dataset`` object is expected to expose a pandas DataFrame as
    ``dataset.df``. The members below read the ``cat_id``, ``cat_name``,
    ``img_filename`` and ``split`` columns of that dataframe.
    """

    # Matches cat_id values that are empty or contain only whitespace.
    _BLANK_PATTERN = r"^\s*$"

    def __init__(self, dataset=None):
        self.dataset = dataset

    def _cat_ids_with_blanks_as_nan(self):
        """Return the ``cat_id`` column with blank strings replaced by NaN.

        The dataset's dataframe is not modified; a new Series is returned.
        """
        return self.dataset.df["cat_id"].replace(
            self._BLANK_PATTERN, np.nan, regex=True
        )

    @property
    def classes(self):
        """Returns list of all cat names in the dataset sorted by cat_id value.

        Rows whose cat_id is null or blank are ignored, and blank class names
        are left out of the result.

        Note:
            As a side effect, blank ``cat_id`` values in the dataset's
            dataframe are replaced with NaN in place. This is kept for
            backward compatibility.

        Returns:
            List

        Example:
            >>> dataset.analyze.classes
            ["Squirrel", "Nut"]
        """
        df = self.dataset.df
        df["cat_id"] = self._cat_ids_with_blanks_as_nan()
        labeled = df[df["cat_id"].notnull()]
        # astype("int") raises ValueError on ids that are not integer-like.
        name_to_id = dict(zip(labeled.cat_name, labeled.cat_id.astype("int")))
        ordered = sorted(name_to_id.items(), key=lambda item: item[1])
        return [name for name, _ in ordered if str(name).strip() != ""]

    @property
    def class_ids(self):
        """Returns a sorted list of all cat ids in the dataset.

        Null and blank cat_id values are skipped. The dataset's dataframe is
        not modified.

        Returns:
            List

        Example:
            >>> dataset.analyze.class_ids
            [0,1]
        """
        # Clean blanks here as well, so the result does not depend on
        # `classes` having been accessed first.
        cat_ids = self._cat_ids_with_blanks_as_nan().dropna()
        return sorted(cat_ids.astype("int").unique())

    @property
    def class_counts(self):
        """Counts the numbers of instances of each class in the dataset.

        Uses the Pandas value_counts method with ``dropna=False``, so rows
        with a missing class name are counted too.

        Returns:
            Pandas Series

        Example:
            >>> dataset.analyze.class_counts
            squirrel  50
            nut       100
        """
        return self.dataset.df["cat_name"].value_counts(dropna=False)

    @property
    def num_classes(self):
        """Counts the unique number of classes in the dataset.

        Missing (null) and blank class names are not counted.

        Returns:
            Int

        Example:
            >>> dataset.analyze.num_classes
            2
        """
        # Drop nulls first: str(nan) is "nan", which would otherwise be
        # counted as a class.
        names = self.dataset.df["cat_name"].dropna().unique()
        return sum(1 for name in names if str(name).strip() != "")

    @property
    def num_images(self):
        """Counts the number of images in the dataset.

        Returns:
            Int

        Example:
            >>> dataset.analyze.num_images
            100
        """
        return self.dataset.df["img_filename"].nunique()

    @property
    def class_name_id_map(self):
        """Returns a dict where the class name is the key and class id is the value.

        The mapping is built from the raw ``cat_name`` and ``cat_id`` columns.
        When a name occurs on several rows, the id from the last row wins.

        Returns:
            Dict

        Example:
            >>> dataset.analyze.class_name_id_map
            {'Squirrel': 0, 'Nut': 1}
        """
        df = self.dataset.df
        return dict(zip(df.cat_name, df.cat_id))

    def ShowClassSplits(self, normalize=True):
        """Show the distribution of classes across train, val, and test splits of the dataset.

        Args:
            normalize (bool): Defaults to True.
                If True, then will return the relative frequencies of the classes between 0 and 1.
                If False, then will return the absolute counts of each class.

        Returns:
            Pandas Dataframe indexed by ``cat_name``. It has an ``all`` column
            and, when the dataset has more than one split, one column per
            split. If a ``train`` column exists it is placed right after
            ``all``.

        Examples:
            >>> dataset.analyze.ShowClassSplits(normalize=True)
            cat_name  all  train  test  dev
            squirrel  .66  .64    .65   .63
            nut       .34  .34    .35   .37

            >>> dataset.analyze.ShowClassSplits(normalize=False)
            cat_name  all  train  test  dev
            squirrel  66   64     65    63
            nut       34   34     35    37
        """
        df = self.dataset.df

        def class_distribution(frame, column_label):
            """Build a one-column frame of class frequencies, indexed by cat_name."""
            counts = frame["cat_name"].value_counts(normalize=normalize)
            # Name the index and column explicitly. pandas >= 2.0 names the
            # value_counts result "count"/"proportion", so selecting it with
            # columns=["cat_name"] would produce an all-NaN column.
            return counts.rename_axis("cat_name").to_frame(name=column_label)

        result = class_distribution(df, "all")

        by_split = df.groupby("split")
        if by_split.ngroups == 1:
            # A single split would just repeat the "all" column.
            return result

        for split_name, split_rows in by_split:
            result = pd.merge(
                result,
                class_distribution(split_rows, split_name),
                how="left",
                on=["cat_name"],
            )

        # Move 'train' to the left of the table since that is the usual convention.
        if "train" in result.columns:
            train_column = result.pop("train")
            result.insert(1, "train", train_column)

        return result