Add util functions for data preparation

2 years ago · 1c30b71959
parent abc1f56cef
commit 1c30b71959
1 changed files with 68 additions and 0 deletions
--- a/tools/prepare_dataset/common.py
+++ b/tools/prepare_dataset/common.py
@ -1,4 +1,6 @@
 import argparse
+import random
+import copy
 import os
 import os.path as osp
 from glob import glob
@ -198,6 +200,20 @@ def create_file_list(file_list, path_tuples, sep=' '):
            f.write(line + '\n')


+def create_label_list(label_list, labels):
+    """
+    Create label list.
+    
+    Args:
+        label_list (str): Path of label list to create.
+        labels (list[str]|tuple[str]]): Label names.
+    """
+
+    with open(label_list, 'w') as f:
+        for label in labels:
+            f.write(label + '\n')
+
+
 def link_dataset(src, dst):
    """
    Make a symbolic link to a dataset.
@ -211,5 +227,57 @@ def link_dataset(src, dst):
        raise ValueError(f"{dst} exists and is not a directory.")
    elif not osp.exists(dst):
        os.makedirs(dst)
+    src = osp.realpath(src)
    name = osp.basename(osp.normpath(src))
    os.symlink(src, osp.join(dst, name), target_is_directory=True)
+
+
+def random_split(samples,
+                 ratios=(0.7, 0.2, 0.1),
+                 inplace=True,
+                 drop_remainder=False):
+    """
+    Randomly split the dataset into two or three subsets.
+    
+    Args:
+        samples (list): All samples of the dataset.
+        ratios (tuple[float], optional): If the length of `ratios` is 2,
+            the two elements indicate the ratios of samples used for training
+            and evaluation. If the length of `ratios` is 3, the three elements
+            indicate the ratios of samples used for training, validation, and 
+            testing. Defaults to (0.7, 0.2, 0.1).
+        inplace (bool, optional): Whether to shuffle `samples` in place. 
+            Defaults to True.
+        drop_remainder (bool, optional): Whether to discard the remaining samples.
+            If False, the remaining samples will be included in the last subset.
+            For example, if `ratios` is (0.7, 0.1) and `drop_remainder` is False, 
+            the two subsets after splitting will contain 70% and 30% of the samples, 
+            respectively. Defaults to False.
+    """
+
+    if not inplace:
+        samples = copy.deepcopy(samples)
+
+    if len(samples) == 0:
+        raise ValueError("There are no samples!")
+
+    if len(ratios) not in (2, 3):
+        raise ValueError("`len(ratios)` must be 2 or 3!")
+
+    random.shuffle(samples)
+
+    n_samples = len(samples)
+    acc_r = 0
+    st_idx, ed_idx = 0, 0
+    splits = []
+    for r in ratios:
+        acc_r += r
+        ed_idx = round(acc_r * n_samples)
+        splits.append(samples[st_idx:ed_idx])
+        st_idx = ed_idx
+
+    if ed_idx < len(ratios) and not drop_remainder:
+        # Append remainder to the last split
+        splits[-1].append(splits[ed_idx:])
+
+    return splits