Source code for pylabel.splitter

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit as sklearnGroupShuffleSplit
from pylabel.shared import schema
from tqdm import tqdm


class Split:
    """Assign the rows of a dataset's annotation dataframe to train/test/val.

    The wrapped ``dataset`` object must expose a pandas DataFrame as
    ``dataset.df``. Every method writes its result to the ``split`` column of
    that dataframe (and may replace ``dataset.df`` with a new dataframe).
    """

    def __init__(self, dataset=None):
        self.dataset = dataset

    @staticmethod
    def _require(condition, message):
        """Raise ``AssertionError(message)`` when ``condition`` is false.

        Used instead of a bare ``assert`` so that input validation still runs
        under ``python -O`` while callers keep seeing the same exception type.
        """
        if not condition:
            raise AssertionError(message)

    def UnSplit(self):
        """Unsplit the dataset by setting all values of the split column to null."""
        self.dataset.df["split"] = np.nan

    def GroupShuffleSplit(
        self,
        train_pct=0.5,
        test_pct=0.25,
        val_pct=0.25,
        group_col="img_filename",
        random_state=None,
    ):
        """Randomly split the dataset while keeping each group in one split.

        This uses sklearn's GroupShuffleSplit. It can split into 3 groups
        (train, test, and val) by applying the splitter twice: first train vs.
        the rest, then the rest into test vs. val. To split into only 2 groups
        (train and test), set ``val_pct`` to 0.

        Args:
            train_pct: Fraction of groups for the train split (0 < x < 1).
            test_pct: Fraction of groups for the test split (0 < x < 1).
            val_pct: Fraction of groups for the val split; 0 disables it.
            group_col: Column whose equal values must stay in the same split.
            random_state: Seed forwarded to sklearn for reproducible splits.

        Raises:
            AssertionError: If a percentage is out of range or the three
                percentages do not sum to 1 (after rounding to one decimal).

        Side effects:
            The ``split`` column of ``dataset.df`` is set for every row. When
            ``val_pct`` is non-zero the dataframe is additionally rebuilt with
            the train rows first, a fresh 0..n-1 index, and only the ``schema``
            columns. NOTE(review): the collapsed source does not show whether
            that rebuild was meant for the two-way split as well; it is kept
            for the three-way split only.
        """
        self._require(0 < float(train_pct) < 1, "train_pct must be between 0 and 1")
        self._require(0 < float(test_pct) < 1, "test_pct must be between 0 and 1")
        self._require(
            round(train_pct + test_pct + val_pct, 1) == 1,
            "Sum of train_pct, test_pct, and val_pct must equal 1.",
        )

        df_main = self.dataset.df
        # Group on the requested column. Grouping on the row index would make
        # every row its own group and defeat the purpose of a group split.
        group_values = df_main[group_col].to_numpy()

        first_split = sklearnGroupShuffleSplit(
            n_splits=1, train_size=train_pct, random_state=random_state
        )
        train_pos, holdout_pos = next(
            first_split.split(X=df_main, groups=group_values)
        )

        # sklearn returns *positional* indices, so build the labels in a
        # position-aligned array instead of going through label-based ``.loc``.
        labels = np.empty(len(df_main), dtype=object)
        labels[train_pos] = "train"
        labels[holdout_pos] = "test"

        if val_pct:
            second_split = sklearnGroupShuffleSplit(
                n_splits=1,
                train_size=float(test_pct / (test_pct + val_pct)),
                random_state=random_state,
            )
            _, val_sub_pos = next(
                second_split.split(X=holdout_pos, groups=group_values[holdout_pos])
            )
            labels[holdout_pos[val_sub_pos]] = "val"

        df_main["split"] = labels
        self.dataset.df = df_main

        if val_pct:
            is_train = (df_main["split"] == "train").to_numpy()
            combined = pd.concat([df_main[is_train], df_main[~is_train]])
            self.dataset.df = combined.reset_index(drop=True)[schema]

    # Written with the help of https://stackoverflow.com/questions/56872664/complex-dataset-split-stratifiedgroupshufflesplit
    def StratifiedGroupShuffleSplit(
        self,
        train_pct=0.7,
        test_pct=0.3,
        val_pct=0.0,
        weight=0.01,
        group_col="img_filename",
        cat_col="cat_name",
        batch_size=1,
        random_state=None,
    ):
        """Split the dataset by group while balancing the category mix.

        This function "splits" the dataframe by setting the split column equal
        to train, test, or val. When a split dataset is exported the
        annotations are divided into separate groups so they can be used in
        model training, testing, and validation.

        Groups are visited in random order. The first three seed train, val
        and test; every following batch of groups goes to whichever split
        gains the most from a weighted sum of (a) the improvement in how
        closely its category distribution matches the whole dataset and
        (b) how far the split is below its target share of the rows.

        Args:
            train_pct: Target share of rows for train (0 <= x <= 1).
            test_pct: Target share of rows for test (0 <= x <= 1).
            val_pct: Target share of rows for val (0 <= x <= 1).
            weight: Importance of the category balance relative to the size
                targets (0 = sizes only, 1 = category balance only).
            group_col: Column whose equal values must stay in the same split.
            cat_col: Column holding the category to stratify on.
            batch_size: Number of groups assigned together per decision. Must
                be at least 1 and at most one tenth of the number of groups.
            random_state: Optional seed for the shuffle. ``None`` uses numpy's
                global random state.

        Raises:
            AssertionError: If an argument is out of range, the percentages do
                not sum to 1 (after rounding to one decimal), or the output
                shape differs from the input shape.

        Side effects:
            ``dataset.df`` is replaced by a dataframe ordered train, test, val
            with a fresh 0..n-1 index and only the ``schema`` columns.
        """
        for pct_name, pct_value in (
            ("train_pct", train_pct),
            ("test_pct", test_pct),
            ("val_pct", val_pct),
        ):
            self._require(
                0 <= float(pct_value) <= 1,
                f"{pct_name} must be greater than or equal to 0 and less than or equal to 1",
            )
        self._require(
            round(train_pct + test_pct + val_pct, 1) == 1,
            "Sum of train_pct, test_pct, and val_pct must equal 1.",
        )
        self._require(0 <= weight <= 1, "Weight must be between 0 and 1")

        # Shuffle the rows so the groups are visited in random order.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        df_main = self.dataset.df
        df_main = df_main.reindex(rng.permutation(df_main.index))

        grouped = df_main.groupby([group_col], sort=False, as_index=False)
        n_groups = grouped.ngroups
        self._require(
            batch_size >= 1 and batch_size <= n_groups / 10,
            "Batch must be greater than 1 and less than 1/10 count of groups",
        )

        # Percentage of rows per category in the full dataset (the target mix).
        main_distribution = (
            df_main.groupby(cat_col)[group_col].count() / len(df_main) * 100
        )

        def calc_mse_loss(df):
            """Mean squared gap between ``df``'s category mix and the target."""
            distribution = df.groupby(cat_col)[group_col].count() / len(df) * 100
            # Categories missing from ``df`` count as 0 percent.
            distribution = distribution.reindex(main_distribution.index, fill_value=0)
            return np.mean((main_distribution - distribution) ** 2)

        split_names = ("train", "val", "test")  # also the tie-break priority
        targets = {"train": train_pct, "val": val_pct, "test": test_pct}
        parts = {name: pd.DataFrame() for name in split_names}
        pending = []  # groups collected for the current batch

        with tqdm(total=n_groups, desc="Splitting dataset") as pbar:
            for i, (_, group) in enumerate(grouped):
                if i < 3:
                    # Seed each split with one group so every loss is defined.
                    parts[split_names[i]] = group.reset_index(drop=True)
                    pbar.update(1)
                    continue

                pending.append(group)
                # Keep filling the batch, except for the last three groups,
                # which are always assigned immediately so nothing is left over.
                if len(pending) < batch_size and i < n_groups - 3:
                    continue

                batch_df = pd.concat(pending)
                total_records = sum(len(part) for part in parts.values())

                losses = {}
                for name in split_names:
                    current = parts[name]
                    candidate = pd.concat([current, batch_df], ignore_index=True)
                    # Positive when adding the batch moves this split's
                    # category mix closer to the target mix.
                    mse_gain = calc_mse_loss(current) - calc_mse_loss(candidate)
                    # Signed square: positive while the split is under target.
                    size_gap = targets[name] - (len(current) / total_records)
                    losses[name] = (weight * mse_gain) + (
                        (1 - weight) * size_gap * abs(size_gap)
                    )

                # max() returns the first maximum, so ties go train, val, test.
                winner = max(split_names, key=losses.get)
                parts[winner] = pd.concat(
                    [parts[winner], batch_df], ignore_index=True
                )

                pbar.update(len(pending))
                pending = []

        # The algorithm always seeds val with one group, even if the val share
        # was set to zero. In that case move those rows to test.
        if round(val_pct, 1) == 0:
            parts["test"] = pd.concat([parts["test"], parts["val"]])
            parts["val"] = parts["val"].iloc[0:0]

        # Apply the train, test, val labels to the split column and recombine.
        df = pd.concat(
            [parts[name].assign(split=name) for name in ("train", "test", "val")]
        )

        self._require(
            df.shape == df_main.shape,
            "Output shape does not match input shape. Data loss has occured.",
        )

        self.dataset.df = df.reset_index(drop=True)[schema]