Problem using tfds to load my super resolution dataset

I have a super resolution dataset with LR and HR folders containing PNG images.
I followed the instructions on the site and based my loader for this dataset on the div2k script.
My training cannot load the dataset with tfds, and it tried to download some other dataset, so I commented out the download action.

Error: Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/grass/1.0.0...
TypeError: _generate_examples() got an unexpected keyword argument 'lr_path'

class Grass(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for face_grass dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    # TODO(face_grass): Specifies the tfds.core.DatasetInfo object
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "lr": tfds.features.Image(),
            "hr": tfds.features.Image(),
        }),
        supervised_keys=("lr", "hr"),
        # If there's a common (input, target) tuple from the
        # features, specify them here. They'll be used if
        # `as_supervised=True` in `builder.as_dataset`.
        # supervised_keys=('image', 'label'),  # Set to `None` to disable
        homepage='https://dataset-homepage/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # TODO(face_grass): Downloads the data and defines the splits
    #path = dl_manager.download_and_extract('https://todo-data-url')

    # TODO(face_grass): Returns the Dict[split names, Iterator[Key, Example]]
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                "lr_path":
                    "../data1/project/grass_0808/train/LR",
                "hr_path":
                    "../data1/project/grass_0808/train/HR",
            }),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={
                "lr_path":
                    "../data1/project/grass_0808/valid/LR",
                "hr_path":
                    "../data1/project/grass_0808/valid/HR",
            }),
    ]

  def _generate_examples(self, path):
    """Yields examples."""
    # TODO(face_grass): Yields (key, example) tuples from the dataset
    for root, _, files in tf.io.gfile.walk(lr_path):
      for file_path in files:
        # Select only png files.
        if file_path.endswith(".png"):
          yield file_path, {
              "lr": os.path.join(root, file_path),
              "hr": os.path.join(hr_path, file_path)
          }

Thanks.

I don’t see who is calling _generate_examples.

Generally, isn't _split_generators going to call _generate_examples explicitly with the path args?
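
For reference, TFDS itself makes that call: the framework expands each SplitGenerator's gen_kwargs as keyword arguments to _generate_examples, so the dict keys must match the parameter names exactly. A minimal standalone sketch of the mismatch, reusing the names from the code above:

def _generate_examples(path):
  yield path, {}

gen_kwargs = {"lr_path": "train/LR", "hr_path": "train/HR"}

# Roughly what the TFDS split builder does internally:
_generate_examples(**gen_kwargs)
# TypeError: _generate_examples() got an unexpected keyword argument 'lr_path'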

Thanks.
Am I missing a function call for _generate_examples?

In your case I don't understand where you take lr_path from in _generate_examples.

Have you tried to formulate it as in the official DIV2K script?

Thanks.
I based my modifications on the official div2k source script:

"""DIV2K dataset: DIVerse 2K resolution high quality images.
As used for the challenges @ NTIRE (CVPR 2017 and CVPR 2018)
and @ PIRM (ECCV 2018)
"""

import os.path

import tensorflow as tf
import tensorflow_datasets.public_api as tfds

_CITATION = """
@InProceedings{Agustsson_2017_CVPR_Workshops,
    author = {Agustsson, Eirikur and Timofte, Radu},
    title = {NTIRE 2017 Challenge on Single Image Super-Resolution: Dataset and Study},
    booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    url = "http://www.vision.ee.ethz.ch/~timofter/publications/Agustsson-CVPRW-2017.pdf",
    month = {July},
    year = {2017}
}
"""

_DESCRIPTION = """
DIV2K dataset: DIVerse 2K resolution high quality images as used for the challenges @ NTIRE (CVPR 2017 and CVPR 2018) and @ PIRM (ECCV 2018)
"""

_DL_URL = "https://data.vision.ee.ethz.ch/cvl/DIV2K/"

_DL_URLS = {
    "train_hr": _DL_URL + "DIV2K_train_HR.zip",
    "valid_hr": _DL_URL + "DIV2K_valid_HR.zip",
    "train_bicubic_x2": _DL_URL + "DIV2K_train_LR_bicubic_X2.zip",
    "train_unknown_x2": _DL_URL + "DIV2K_train_LR_unknown_X2.zip",
    "valid_bicubic_x2": _DL_URL + "DIV2K_valid_LR_bicubic_X2.zip",
    "valid_unknown_x2": _DL_URL + "DIV2K_valid_LR_unknown_X2.zip",
    "train_bicubic_x3": _DL_URL + "DIV2K_train_LR_bicubic_X3.zip",
    "train_unknown_x3": _DL_URL + "DIV2K_train_LR_unknown_X3.zip",
    "valid_bicubic_x3": _DL_URL + "DIV2K_valid_LR_bicubic_X3.zip",
    "valid_unknown_x3": _DL_URL + "DIV2K_valid_LR_unknown_X3.zip",
    "train_bicubic_x4": _DL_URL + "DIV2K_train_LR_bicubic_X4.zip",
    "train_unknown_x4": _DL_URL + "DIV2K_train_LR_unknown_X4.zip",
    "valid_bicubic_x4": _DL_URL + "DIV2K_valid_LR_bicubic_X4.zip",
    "valid_unknown_x4": _DL_URL + "DIV2K_valid_LR_unknown_X4.zip",
    "train_bicubic_x8": _DL_URL + "DIV2K_train_LR_x8.zip",
    "valid_bicubic_x8": _DL_URL + "DIV2K_valid_LR_x8.zip",
    "train_realistic_mild_x4": _DL_URL + "DIV2K_train_LR_mild.zip",
    "valid_realistic_mild_x4": _DL_URL + "DIV2K_valid_LR_mild.zip",
    "train_realistic_difficult_x4": _DL_URL + "DIV2K_train_LR_difficult.zip",
    "valid_realistic_difficult_x4": _DL_URL + "DIV2K_valid_LR_difficult.zip",
    "train_realistic_wild_x4": _DL_URL + "DIV2K_train_LR_wild.zip",
    "valid_realistic_wild_x4": _DL_URL + "DIV2K_valid_LR_wild.zip",
}

_DATA_OPTIONS = [
    "bicubic_x2", "bicubic_x3", "bicubic_x4", "bicubic_x8", "unknown_x2",
    "unknown_x3", "unknown_x4", "realistic_mild_x4", "realistic_difficult_x4",
    "realistic_wild_x4"
]


class Div2kConfig(tfds.core.BuilderConfig):
  """BuilderConfig for Div2k."""

  def __init__(self, name, **kwargs):
    """Constructs a Div2kConfig."""
    if name not in _DATA_OPTIONS:
      raise ValueError("data must be one of %s" % _DATA_OPTIONS)

    description = kwargs.get("description", "Uses %s data." % name)
    kwargs["description"] = description

    super(Div2kConfig, self).__init__(name=name, **kwargs)
    self.data = name
    self.download_urls = {
        "train_lr_url": _DL_URLS["train_" + self.data],
        "valid_lr_url": _DL_URLS["valid_" + self.data],
        "train_hr_url": _DL_URLS["train_hr"],
        "valid_hr_url": _DL_URLS["valid_hr"],
    }


def _make_builder_configs():
  configs = []
  for data in _DATA_OPTIONS:
    configs.append(Div2kConfig(version=tfds.core.Version("2.0.0"), name=data))
  return configs


class Div2k(tfds.core.GeneratorBasedBuilder):
  """DIV2K dataset: DIVerse 2K resolution high quality images."""

  BUILDER_CONFIGS = _make_builder_configs()

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "lr": tfds.features.Image(),
            "hr": tfds.features.Image(),
        }),
        supervised_keys=("lr", "hr"),
        homepage=_DL_URL,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators."""
    print("EXTRACTING", self.builder_config.download_urls)
    extracted_paths = dl_manager.download_and_extract(
        self.builder_config.download_urls)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                "lr_path":
                    extracted_paths["train_lr_url"],
                "hr_path":
                    os.path.join(extracted_paths["train_hr_url"],
                                 "DIV2K_train_HR"),
            }),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={
                "lr_path":
                    extracted_paths["valid_lr_url"],
                "hr_path":
                    os.path.join(extracted_paths["valid_hr_url"],
                                 "DIV2K_valid_HR"),
            }),
    ]

  def _generate_examples(self, lr_path, hr_path):
    """Yields examples."""
    for root, _, files in tf.io.gfile.walk(lr_path):
      for file_path in files:
        # Select only png files.
        if file_path.endswith(".png"):
          yield file_path, {
              "lr": os.path.join(root, file_path),
              # Extract the image id from the filename: "0001x2.png"
              "hr": os.path.join(hr_path, file_path[:4] + ".png")
          }

However, it does not work in my case.

Yes, but in your case I see tf.io.gfile.walk(lr_path) while your signature has a path arg instead.

Thanks. I see the bug now:
def _generate_examples(self, path):
should be
def _generate_examples(self, lr_path, hr_path):

Now the TypeError: _generate_examples() got an unexpected keyword argument 'lr_path' is fixed.
However, the program raises another error:
AssertionError: No examples were yielded.

It seems that those images cannot be loaded into tfds.
Is it that tfds does not accept images in a folder, e.g.
"lr_path": "../data1/project/grass_0808/train/LR",
"hr_path": "../data1/project/grass_0808/train/HR",
?

Check tf.io.gfile.walk directly, in isolation, with these relative paths.
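
For example, a quick standalone check along these lines (the path is copied from the post above) shows whether the walk finds anything. Run it from the same working directory as the tfds build; a relative path that does not resolve from there makes walk yield nothing, which is exactly what "No examples were yielded" means:

import tensorflow as tf

lr_path = "../data1/project/grass_0808/train/LR"
for root, dirs, files in tf.io.gfile.walk(lr_path):
  print(root, len(files), "files")

# No output at all means the directory was not found from here,
# so _generate_examples would yield zero examples.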

Thanks.
May I know where I use tf.io.gfile.walk?
Also, does tfds only accept datasets in zip or tfrecord format?
I ask because I get this error:
File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/core/tfrecords_writer.py", line 150, in _get_shard_boundaries
    raise AssertionError("No examples were yielded.")
AssertionError: No examples were yielded.

It is in your original code.

You mean in my train.py?

In your first post, in def _generate_examples.

Like this?
def _generate_examples(self, lr_path, hr_path):
  tf.io.gfile.walk(lr_path)

Hi.
I got the following error when using that:

ModuleNotFoundError: No module named 'apache_beam'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/data1/anaconda3/envs/bin/tfds", line 8, in <module>
    sys.exit(launch_cli())
  File "/data1/anaconda3/envs//lib/python3.7/site-packages/tensorflow_datasets/scripts/cli/main.py", line 102, in launch_cli
    app.run(main, flags_parser=_parse_flags)
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/absl/app.py", line 303, in run
    _run_main(main, args)
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/absl/app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/scripts/cli/main.py", line 97, in main
    args.subparser_fn(args)
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/scripts/cli/build.py", line 192, in _build_datasets
    _download_and_prepare(args, builder)
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/scripts/cli/build.py", line 345, in _download_and_prepare
    download_config=dl_config,
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/core/dataset_builder.py", line 464, in download_and_prepare
    download_config=download_config,
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/core/dataset_builder.py", line 1195, in _download_and_prepare
    disable_shuffling=self.info.disable_shuffling,
  File "/data1/anaconda3/envs/lib/python3.7/site-packages/tensorflow_datasets/core/split_builder.py", line 301, in submit_split_generation
    raise unknown_generator_type
TypeError: Invalid split generator value for split train. Expected generator or apache_beam object. Got: <class 'NoneType'>
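
A likely cause, given the snippet a few posts up: without a yield, _generate_examples is an ordinary function that returns None instead of a generator, which is exactly the NoneType this error complains about. A minimal illustration:

import tensorflow as tf

def no_yield(lr_path):
  tf.io.gfile.walk(lr_path)  # result discarded, and no yield anywhere

def with_yield(lr_path):
  for root, _, files in tf.io.gfile.walk(lr_path):
    yield root, files

print(type(no_yield("LR")))    # <class 'NoneType'>
print(type(with_yield("LR")))  # <class 'generator'>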

I can no longer follow what code you are using now.

It would help if you could share a very minimal, standalone code gist or a Colab.
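
For completeness, a minimal sketch that combines the fixes discussed in this thread: gen_kwargs keys matching the _generate_examples signature, a body that actually yields, and paths that resolve from wherever tfds build runs. The absolute root used here is an assumption, not from the thread:

import os

import tensorflow as tf
import tensorflow_datasets as tfds


class Grass(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the grass super resolution dataset."""

  VERSION = tfds.core.Version('1.0.0')

  def _info(self) -> tfds.core.DatasetInfo:
    return tfds.core.DatasetInfo(
        builder=self,
        features=tfds.features.FeaturesDict({
            "lr": tfds.features.Image(),
            "hr": tfds.features.Image(),
        }),
        supervised_keys=("lr", "hr"),
    )

  def _split_generators(self, dl_manager):
    # Hypothetical absolute root; a relative path must resolve from
    # the directory where `tfds build` is executed.
    root = "/data1/project/grass_0808"
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={"lr_path": os.path.join(root, "train", "LR"),
                        "hr_path": os.path.join(root, "train", "HR")}),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={"lr_path": os.path.join(root, "valid", "LR"),
                        "hr_path": os.path.join(root, "valid", "HR")}),
    ]

  def _generate_examples(self, lr_path, hr_path):
    # Parameter names match the gen_kwargs keys above, and the method
    # yields, so calling it produces a generator.
    for root, _, files in tf.io.gfile.walk(lr_path):
      for file_path in files:
        if file_path.endswith(".png"):
          # Assumes the HR folder holds a file with the same name.
          yield file_path, {
              "lr": os.path.join(root, file_path),
              "hr": os.path.join(hr_path, file_path),
          }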