Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ This should get you started to do some serious deep learning on your data. [Read
- A [seed](https://docs.python.org/3/library/random.html#random.seed) makes splits reproducible.
- Allows randomized [oversampling](https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis) for imbalanced datasets.
- Optionally group files by prefix.
- Optionally split files by file format(s).
- (Should) work on all operating systems.

## Install
Expand Down Expand Up @@ -83,27 +84,29 @@ import splitfolders
# Split with a ratio.
# To only split into training and validation set, set a tuple to `ratio`, i.e, `(.8, .2)`.
splitfolders.ratio("input_folder", output="output",
seed=1337, ratio=(.8, .1, .1), group_prefix=None, move=False) # default values
seed=1337, ratio=(.8, .1, .1), group_prefix=None, formats=None, move=False) # default values

# Split val/test with a fixed number of items, e.g. `(100, 100)`, for each set.
# To only split into training and validation set, use a single number to `fixed`, i.e., `10`.
# Set 3 values, e.g. `(300, 100, 100)`, to limit the number of training values.
splitfolders.fixed("input_folder", output="output",
seed=1337, fixed=(100, 100), oversample=False, group_prefix=None, move=False) # default values
seed=1337, fixed=(100, 100), oversample=False, group_prefix=None, formats=None, move=False) # default values
```

Occasionally, you may have things that comprise more than a single file (e.g. picture (.png) + annotation (.txt)).
`splitfolders` lets you split files into equally-sized groups based on their prefix.
Set `group_prefix` to the length of the group (e.g. `2`).
But now _all_ files should be part of groups.

Your folders may also contain files of several different formats. To split only files of certain formats, pass a list of one or more extensions to `formats` (e.g. `formats=['.jpeg', '.png']`). Each extension must start with a `.`.

Set `move=True` if you want to move the files instead of copying.

### CLI

```
Usage:
splitfolders [--output] [--ratio] [--fixed] [--seed] [--oversample] [--group_prefix] [--move] folder_with_images
splitfolders [--output] [--ratio] [--fixed] [--seed] [--oversample] [--group_prefix] [--formats] [--move] folder_with_images
Options:
--output path to the output folder. defaults to `output`. Get created if non-existent.
--ratio the ratio to split. e.g. for train/val/test `.8 .1 .1 --` or for train/val `.8 .2 --`.
Expand All @@ -113,6 +116,7 @@ Options:
--seed set seed value for shuffling the items. defaults to 1337.
--oversample enable oversampling of imbalanced datasets, works only with --fixed.
--group_prefix split files into equally-sized groups based on their prefix
--formats split the files based on specified extension(s)
--move move the files instead of copying
Example:
splitfolders --ratio .8 .1 .1 -- folder_with_images
Expand Down
10 changes: 9 additions & 1 deletion splitfolders/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,19 @@ def run():
"input",
help="directory with the input data. The directory needs to have the labels as sub-directories. In those sub-directories are then the actual files that gets split.",
)
parser.add_argument(
"--formats",
nargs="+",
type=str,
default = None,
help="specify the file format(s) which should be considered for spliting the data e.g. `.png .jpeg .jpg`",
)

args = parser.parse_args()

if args.ratio:
ratio(
args.input, args.output, args.seed, args.ratio, args.group_prefix, args.move
args.input, args.output, args.seed, args.ratio, args.group_prefix, args.move, args.formats
)
else:
if args.fixed:
Expand All @@ -67,6 +74,7 @@ def run():
args.oversample,
args.group_prefix,
args.move,
args.formats
)
else:
print(
Expand Down
30 changes: 23 additions & 7 deletions splitfolders/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,20 +65,32 @@ def check_input_format(input):
)


def valid_extensions(formats):
    """
    Check that every extension in `formats` starts with `.`

    `formats` may be `None` (no filtering requested, nothing to check),
    a single extension string, or an iterable of extension strings.

    Raises a `ValueError` listing every extension that does not start with `.`
    """
    # `None` means "no format filter"; iterating it would raise a TypeError.
    if formats is None:
        return

    # A bare string would otherwise be checked character by character.
    if isinstance(formats, str):
        formats = [formats]

    invalid_ext = [s for s in formats if not s.startswith(".")]

    if invalid_ext:
        raise ValueError(
            f"Extension must start with '.' {invalid_ext} is/are the invalid extension(s)."
        )


def ratio(
input,
output="output",
seed=1337,
ratio=(0.8, 0.1, 0.1),
group_prefix=None,
move=False,
formats = None
):
if not round(sum(ratio), 5) == 1: # round for floating imprecision
raise ValueError("The sums of `ratio` is over 1.")
if not len(ratio) in (2, 3):
raise ValueError("`ratio` should")

check_input_format(input)
valid_extensions(formats)

if use_tqdm:
prog_bar = tqdm(desc=f"Copying files", unit=" files")
Expand All @@ -92,6 +104,7 @@ def ratio(
prog_bar if use_tqdm else None,
group_prefix,
move,
formats
)

if use_tqdm:
Expand All @@ -106,6 +119,7 @@ def fixed(
oversample=False,
group_prefix=None,
move=False,
formats = None
):
if isinstance(fixed, int):
fixed = [fixed]
Expand All @@ -119,6 +133,7 @@ def fixed(
)

check_input_format(input)
valid_extensions(formats)

if use_tqdm:
prog_bar = tqdm(desc=f"Copying files", unit=" files")
Expand All @@ -135,6 +150,7 @@ def fixed(
prog_bar if use_tqdm else None,
group_prefix,
move,
formats
)
)

Expand All @@ -155,7 +171,7 @@ def fixed(
for num_items, class_dir in iteration:
class_name = path.split(class_dir)[1]
full_path = path.join(output, "train", class_name)
train_files = list_files(full_path)
train_files = list_files(full_path, formats)

if group_prefix is not None:
train_files = group_by_prefix(train_files, group_prefix)
Expand Down Expand Up @@ -210,13 +226,13 @@ def group_by_prefix(files, len_pairs):
return results


def setup_files(class_dir, seed, group_prefix=None):
def setup_files(class_dir, seed, group_prefix=None, formats=None):
"""
Returns shuffeld list of filenames
"""
random.seed(seed) # make sure its reproducible

files = list_files(class_dir)
files = list_files(class_dir, formats)

if group_prefix is not None:
files = group_by_prefix(files, group_prefix)
Expand All @@ -226,11 +242,11 @@ def setup_files(class_dir, seed, group_prefix=None):
return files


def split_class_dir_ratio(class_dir, output, ratio, seed, prog_bar, group_prefix, move):
def split_class_dir_ratio(class_dir, output, ratio, seed, prog_bar, group_prefix, move, formats):
"""
Splits a class folder
"""
files = setup_files(class_dir, seed, group_prefix)
files = setup_files(class_dir, seed, group_prefix, formats)

# the data was shuffled already
split_train_idx = int(ratio[0] * len(files))
Expand All @@ -240,11 +256,11 @@ def split_class_dir_ratio(class_dir, output, ratio, seed, prog_bar, group_prefix
copy_files(li, class_dir, output, prog_bar, move)


def split_class_dir_fixed(class_dir, output, fixed, seed, prog_bar, group_prefix, move):
def split_class_dir_fixed(class_dir, output, fixed, seed, prog_bar, group_prefix, move, formats):
"""
Splits a class folder and returns the total number of files
"""
files = setup_files(class_dir, seed, group_prefix)
files = setup_files(class_dir, seed, group_prefix, formats)

if not len(files) >= sum(fixed):
raise ValueError(
Expand Down
21 changes: 14 additions & 7 deletions splitfolders/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,19 @@ def list_dirs(directory):
return [f for f in Path(directory).iterdir() if f.is_dir()]


def list_files(directory, formats=None):
    """
    Returns all non-hidden files in a given directory.

    If `formats` is given (a single extension string or an iterable of
    extension strings, e.g. `['.png', '.jpeg']`), only files whose name ends
    with one of these extensions are returned. With `formats` left as `None`
    (or empty), every non-hidden file is returned.
    """
    # A bare string must be wrapped: `tuple(".png")` would yield single
    # characters and match any file name ending in one of them.
    if isinstance(formats, str):
        formats = (formats,)

    # `str.endswith` accepts a tuple of suffixes; `None` disables the filter.
    suffixes = tuple(formats) if formats else None

    return [
        f
        for f in Path(directory).iterdir()
        if f.is_file()
        and not f.name.startswith(".")
        and (suffixes is None or f.name.endswith(suffixes))
    ]