Merge pull request #56 from Bobholamovic/add_datasets
[Feat] Add Dataset Preparation Scripts of iSAID and UCMerced
commit d4134cb7d3
5 changed files with 272 additions and 9 deletions
@@ -0,0 +1,62 @@
#!/usr/bin/env python

import random
import os.path as osp
from glob import iglob
from functools import reduce, partial

from common import (get_default_parser, create_file_list, link_dataset,
                    random_split, create_label_list)

CLASSES = ('agricultural', 'airplane', 'baseballdiamond', 'beach', 'buildings',
           'chaparral', 'denseresidential', 'forest', 'freeway', 'golfcourse',
           'harbor', 'intersection', 'mediumresidential', 'mobilehomepark',
           'overpass', 'parkinglot', 'river', 'runway', 'sparseresidential',
           'storagetanks', 'tenniscourt')
SUBSETS = ('train', 'val', 'test')
SUBDIRS = tuple(osp.join('Images', cls) for cls in CLASSES)
FILE_LIST_PATTERN = "{subset}.txt"
LABEL_LIST_NAME = "labels.txt"
URL = ""

if __name__ == '__main__':
    parser = get_default_parser()
    parser.add_argument('--seed', type=int, default=None, help="Random seed.")
    parser.add_argument(
        '--ratios',
        type=float,
        nargs='+',
        default=(0.7, 0.2, 0.1),
        help="Ratios of each subset (train/val or train/val/test).")
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)

    if len(args.ratios) not in (2, 3):
        raise ValueError("Wrong number of ratios!")

    out_dir = osp.join(args.out_dataset_dir,
                       osp.basename(osp.normpath(args.in_dataset_dir)))

    link_dataset(args.in_dataset_dir, args.out_dataset_dir)

    splits_list = []
    for idx, (cls, subdir) in enumerate(zip(CLASSES, SUBDIRS)):
        pairs = []
        for p in iglob(osp.join(out_dir, subdir, '*.tif')):
            pair = (osp.relpath(p, args.out_dataset_dir), str(idx))
            pairs.append(pair)
        splits = random_split(pairs, ratios=args.ratios)
        splits_list.append(splits)
    splits = map(partial(reduce, list.__add__), zip(*splits_list))

    for subset, split in zip(SUBSETS, splits):
        file_list = osp.join(
            args.out_dataset_dir, FILE_LIST_PATTERN.format(subset=subset))
        create_file_list(file_list, split)
        print(f"Write file list to {file_list}.")

    label_list = osp.join(args.out_dataset_dir, LABEL_LIST_NAME)
    create_label_list(label_list, CLASSES)
    print(f"Write label list to {label_list}.")