AlirezaSalehi99 committed on
Commit 95cc73b · verified · 1 Parent(s): 1954c27

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. .gitattributes +3 -0
  2. Tipsomaly/.gitignore +2 -0
  3. Tipsomaly/imgs/Models_Architecture_page-0001.jpg +3 -0
  4. Tipsomaly/imgs/Qualitative_results_page-0001.jpg +3 -0
  5. Tipsomaly/imgs/results-table.png +3 -0
  6. Tipsomaly/model/big_vision/__pycache__/__init__.cpython-39.pyc +0 -0
  7. Tipsomaly/model/big_vision/__pycache__/load_siglip.cpython-39.pyc +0 -0
  8. Tipsomaly/model/big_vision/__pycache__/utils.cpython-39.pyc +0 -0
  9. Tipsomaly/model/big_vision/configs/__init__.py +0 -0
  10. Tipsomaly/model/big_vision/configs/bit_i1k.py +102 -0
  11. Tipsomaly/model/big_vision/configs/bit_i21k.py +85 -0
  12. Tipsomaly/model/big_vision/configs/common.py +188 -0
  13. Tipsomaly/model/big_vision/configs/common_fewshot.py +60 -0
  14. Tipsomaly/model/big_vision/configs/load_and_eval.py +143 -0
  15. Tipsomaly/model/big_vision/configs/mlp_mixer_i1k.py +120 -0
  16. Tipsomaly/model/big_vision/configs/proj/paligemma/transfers/vertexai_l4.py +115 -0
  17. Tipsomaly/model/big_vision/configs/proj/paligemma/transfers/vqav2.py +160 -0
  18. Tipsomaly/model/big_vision/configs/transfer.py +186 -0
  19. Tipsomaly/model/big_vision/configs/vit_i1k.py +177 -0
  20. Tipsomaly/model/big_vision/configs/vit_i21k.py +145 -0
  21. Tipsomaly/model/big_vision/configs/vit_s16_i1k.py +105 -0
  22. Tipsomaly/model/big_vision/datasets/core.py +77 -0
  23. Tipsomaly/model/big_vision/datasets/jsonl.py +177 -0
  24. Tipsomaly/model/big_vision/datasets/sequence_packing.py +77 -0
  25. Tipsomaly/model/big_vision/datasets/tfds.py +94 -0
  26. Tipsomaly/model/big_vision/evaluators/__init__.py +0 -0
  27. Tipsomaly/model/big_vision/evaluators/classification.py +76 -0
  28. Tipsomaly/model/big_vision/evaluators/common.py +228 -0
  29. Tipsomaly/model/big_vision/evaluators/fewshot_lsr.py +245 -0
  30. Tipsomaly/model/big_vision/evaluators/mean.py +80 -0
  31. Tipsomaly/model/big_vision/evaluators/save.py +121 -0
  32. Tipsomaly/model/big_vision/models/__init__.py +0 -0
  33. Tipsomaly/model/big_vision/models/bit.py +162 -0
  34. Tipsomaly/model/big_vision/models/bit_paper.py +260 -0
  35. Tipsomaly/model/big_vision/models/common.py +133 -0
  36. Tipsomaly/model/big_vision/models/mlp_mixer.py +177 -0
  37. Tipsomaly/model/big_vision/models/vit.py +505 -0
  38. Tipsomaly/model/big_vision/pp/__init__.py +0 -0
  39. Tipsomaly/model/big_vision/pp/autoaugment.py +700 -0
  40. Tipsomaly/model/big_vision/pp/builder.py +85 -0
  41. Tipsomaly/model/big_vision/pp/builder_test.py +72 -0
  42. Tipsomaly/model/big_vision/pp/ops_general.py +468 -0
  43. Tipsomaly/model/big_vision/pp/ops_general_test.py +236 -0
  44. Tipsomaly/model/big_vision/pp/ops_image.py +361 -0
  45. Tipsomaly/model/big_vision/pp/ops_image_test.py +82 -0
  46. Tipsomaly/model/big_vision/pp/ops_text.py +411 -0
  47. Tipsomaly/model/big_vision/pp/ops_text_test.py +200 -0
  48. Tipsomaly/model/big_vision/pp/registry.py +163 -0
  49. Tipsomaly/model/big_vision/pp/registry_test.py +128 -0
  50. Tipsomaly/model/big_vision/pp/tokenizer.py +103 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Tipsomaly/imgs/Models_Architecture_page-0001.jpg filter=lfs diff=lfs merge=lfs -text
+Tipsomaly/imgs/results-table.png filter=lfs diff=lfs merge=lfs -text
+Tipsomaly/imgs/Qualitative_results_page-0001.jpg filter=lfs diff=lfs merge=lfs -text
Tipsomaly/.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__/
+*.pyc
Tipsomaly/imgs/Models_Architecture_page-0001.jpg ADDED

Git LFS Details

  • SHA256: 6e793f62366a11789d2f93727d36730378d7cff7e89d2f53a179d3799eb1ddfe
  • Pointer size: 131 Bytes
  • Size of remote file: 565 kB
Tipsomaly/imgs/Qualitative_results_page-0001.jpg ADDED

Git LFS Details

  • SHA256: 23581ba2e6b0fd8fee121395adb9eb4249c5088f23d255dd99c850cb881679ed
  • Pointer size: 132 Bytes
  • Size of remote file: 1.07 MB
Tipsomaly/imgs/results-table.png ADDED

Git LFS Details

  • SHA256: 58efcca11d4ea3e7f418d4450b895ac0cae26cd719aefad91ef2f83d9f91eeef
  • Pointer size: 131 Bytes
  • Size of remote file: 317 kB
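Note: for each of these images, Git LFS stores only a small text pointer in the repository; the binary lives in LFS storage. A sketch of what such a pointer file looks like for results-table.png, using the SHA256 shown above (the `size` field holds the exact byte count, which this page rounds to 317 kB, so the value below is approximate):

version https://git-lfs.github.com/spec/v1
oid sha256:58efcca11d4ea3e7f418d4450b895ac0cae26cd719aefad91ef2f83d9f91eeef
size 317000

The "Pointer size: 131 Bytes" reported above is simply the length of this three-line text file.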
Tipsomaly/model/big_vision/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (163 Bytes).
 
Tipsomaly/model/big_vision/__pycache__/load_siglip.cpython-39.pyc ADDED
Binary file (5.4 kB).
 
Tipsomaly/model/big_vision/__pycache__/utils.cpython-39.pyc ADDED
Binary file (52.4 kB).
 
Tipsomaly/model/big_vision/configs/__init__.py ADDED
File without changes
Tipsomaly/model/big_vision/configs/bit_i1k.py ADDED
@@ -0,0 +1,102 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=line-too-long
+r"""Pre-training BiT on ILSVRC-2012 as in https://arxiv.org/abs/1912.11370
+
+Run training of a BiT-ResNet-50x1 variant, which takes ~32min on v3-128:
+
+big_vision.train \
+    --config big_vision/configs/bit_i1k.py \
+    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'` \
+    --config.model.depth 50 --config.model.width 1
+"""
+
+# from big_vision.configs.common_fewshot import get_fewshot_lsr
+import ml_collections as mlc
+
+
+def get_config(runlocal=False):
+  """Config for training on ImageNet-1k."""
+  config = mlc.ConfigDict()
+
+  config.seed = 0
+  config.total_epochs = 90
+  config.num_classes = 1000
+  config.loss = 'softmax_xent'
+
+  config.input = dict()
+  config.input.data = dict(
+      name='imagenet2012',
+      split='train[:99%]',
+  )
+  config.input.batch_size = 4096
+  config.input.cache_raw = True  # Needs up to 120GB of RAM!
+  config.input.shuffle_buffer_size = 250_000  # Per host.
+
+  pp_common = '|onehot(1000, key="{lbl}", key_result="labels")'
+  pp_common += '|value_range(-1, 1)|keep("image", "labels")'
+  config.input.pp = 'decode_jpeg_and_inception_crop(224)|flip_lr' + pp_common.format(lbl='label')
+  pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common
+
+  config.log_training_steps = 50
+  config.ckpt_steps = 1000
+
+  # Model section
+  config.model_name = 'bit'
+  config.model = dict(
+      depth=50,  # You can also pass e.g. [3, 5, 10, 2]
+      width=1.0,
+  )
+
+  # Optimizer section
+  config.optax_name = 'big_vision.momentum_hp'
+  config.grad_clip_norm = 1.0
+
+  # linear scaling rule. Don't forget to sweep if sweeping batch_size.
+  config.wd = (1e-4 / 256) * config.input.batch_size
+  config.lr = (0.1 / 256) * config.input.batch_size
+  config.schedule = dict(decay_type='cosine', warmup_steps=1000)
+
+  # Eval section
+  def get_eval(split, dataset='imagenet2012'):
+    return dict(
+        type='classification',
+        data=dict(name=dataset, split=split),
+        pp_fn=pp_eval.format(lbl='label'),
+        loss_name=config.loss,
+        log_steps=1000,  # Very fast O(seconds) so it's fine to run it often.
+        cache='final_data',
+    )
+  config.evals = {}
+  config.evals.train = get_eval('train[:2%]')
+  config.evals.minival = get_eval('train[99%:]')
+  config.evals.val = get_eval('validation')
+  config.evals.v2 = get_eval('test', dataset='imagenet_v2')
+  config.evals.real = get_eval('validation', dataset='imagenet2012_real')
+  config.evals.real.pp_fn = pp_eval.format(lbl='real_label')
+
+  # config.evals.fewshot = get_fewshot_lsr(runlocal=runlocal)
+  # config.evals.fewshot.log_steps = 1000
+
+  if runlocal:
+    config.input.batch_size = 32
+    config.input.cache_raw = False
+    config.input.shuffle_buffer_size = 100
+
+    local_eval = config.evals.val
+    config.evals = {'val': local_eval}
+    config.evals.val.cache = 'none'
+
+  return config
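As a quick check of the linear scaling rule in this config (a standalone sketch, not part of the committed file): with the default batch_size of 4096, the per-256-examples base values scale up by a factor of 16.

batch_size = 4096                  # config.input.batch_size
print((0.1 / 256) * batch_size)    # lr -> 1.6
print((1e-4 / 256) * batch_size)   # wd -> 0.0016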
Tipsomaly/model/big_vision/configs/bit_i21k.py ADDED
@@ -0,0 +1,85 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=line-too-long
+r"""A config for pre-training BiT on ImageNet-21k.
+
+This config relies on the Imagenet-21k tfds dataset, which is not yet
+available publicly in TFDS. We intend to add the dataset to public TFDS soon,
+and this config will then be runnable.
+"""
+
+from big_vision.configs.common_fewshot import get_fewshot_lsr
+import ml_collections as mlc
+
+
+def get_config():
+  """Config for training on imagenet-21k."""
+  config = mlc.ConfigDict()
+
+  config.seed = 0
+  config.total_epochs = 90
+  config.num_classes = 21843
+  config.init_head_bias = -10.0
+  config.loss = 'sigmoid_xent'
+
+  config.input = dict()
+  config.input.data = dict(
+      name='imagenet21k',
+      split='full[51200:]',
+  )
+  config.input.batch_size = 4096
+  config.input.shuffle_buffer_size = 250_000  # Per host, so small-ish is ok.
+
+  pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'
+  pp_common_i21k = pp_common.format(onehot_args=f'{config.num_classes}')
+  pp_common_i1k = pp_common.format(onehot_args='1000, key="label", key_result="labels"')
+  config.input.pp = 'decode_jpeg_and_inception_crop(224)|flip_lr' + pp_common_i21k
+  pp_eval = 'decode|resize_small(256)|central_crop(224)'
+
+  config.log_training_steps = 50
+  config.ckpt_steps = 1000
+
+  # Model section
+  config.model_name = 'bit_paper'
+  config.model = dict(depth=50, width=1.0)
+
+  # Optimizer section
+  config.optax_name = 'big_vision.momentum_hp'
+  config.grad_clip_norm = 1.0
+
+  # linear scaling rule. Don't forget to sweep if sweeping batch_size.
+  config.lr = (0.03 / 256) * config.input.batch_size
+  config.wd = (3e-5 / 256) * config.input.batch_size
+  config.schedule = dict(decay_type='cosine', warmup_steps=5000)
+
+  # Evaluations on i21k itself.
+  def eval_i21k(split):
+    return dict(
+        type='classification',
+        data={**config.input.data, 'split': split},
+        pp_fn=pp_eval + pp_common_i21k,
+        loss_name=config.loss,
+        log_steps=1000,  # Very fast O(seconds) so it's fine to run it often.
+    )
+  config.evals = {}
+  config.evals.test = eval_i21k('full[:25_600]')
+  config.evals.val = eval_i21k('full[25_600:51_200]')
+  config.evals.train = eval_i21k('full[51_200:76_800]')
+
+  # Few-shot evaluators
+  config.evals.fewshot = get_fewshot_lsr()
+  config.evals.fewshot.log_steps = 25_000
+
+  return config
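The init_head_bias of -10.0 pairs with the sigmoid_xent loss: it starts every one of the 21843 class logits near the uniform prior, which this standalone sketch (not part of the commit) illustrates:

import math
# sigmoid(-10) ~ 4.54e-5, close to the uniform prior 1/21843 ~ 4.58e-5,
# so the freshly initialized head predicts roughly "no class" everywhere.
print(1 / (1 + math.exp(10.0)))  # ~4.54e-05
print(1 / 21843)                 # ~4.58e-05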
Tipsomaly/model/big_vision/configs/common.py ADDED
@@ -0,0 +1,188 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A few things commonly used across A LOT of config files."""
+
+import string
+
+import ml_collections as mlc
+
+
+def input_for_quicktest(config_input, quicktest):
+  if quicktest:
+    config_input.batch_size = 8
+    config_input.shuffle_buffer_size = 10
+    config_input.cache_raw = False
+
+
+def parse_arg(arg, lazy=False, **spec):
+  """Makes ConfigDict's get_config single-string argument more usable.
+
+  Example use in the config file:
+
+    import big_vision.configs.common as bvcc
+    def get_config(arg):
+      arg = bvcc.parse_arg(arg,
+          res=(224, int),
+          runlocal=False,
+          schedule='short',
+      )
+
+      # ...
+
+      config.shuffle_buffer = 250_000 if not arg.runlocal else 50
+
+  Ways that values can be passed when launching:
+
+    --config amazing.py:runlocal,schedule=long,res=128
+    --config amazing.py:res=128
+    --config amazing.py:runlocal  # A boolean needs no value for "true".
+    --config amazing.py:runlocal=False  # Explicit false boolean.
+    --config amazing.py:128  # The first spec entry may be passed unnamed alone.
+
+  Uses strict bool conversion (converting 'True', 'true' to True, and 'False',
+  'false', '' to False).
+
+  Args:
+    arg: the string argument that's passed to get_config.
+    lazy: allow lazy parsing of arguments, which are not in spec. For these,
+      the type is auto-extracted as the most specific type that parses.
+    **spec: the name and default values of the expected options.
+      If the value is a tuple, the value's first element is the default value,
+      and the second element is a function called to convert the string.
+      Otherwise the type is automatically extracted from the default value.
+
+  Returns:
+    ConfigDict object with extracted type-converted values.
+  """
+  # Normalize arg and spec layout.
+  arg = arg or ''  # Normalize None to empty string
+  spec = {k: get_type_with_default(v) for k, v in spec.items()}
+
+  result = mlc.ConfigDict(type_safe=False)  # For convenient dot-access only.
+
+  # Expand convenience-cases for a single parameter without = sign.
+  if arg and ',' not in arg and '=' not in arg:
+    # (think :runlocal) If it's the name of sth in the spec (or there is no
+    # spec), it's that in bool.
+    if arg in spec or not spec:
+      arg = f'{arg}=True'
+    # Otherwise, it is the value for the first entry in the spec.
+    else:
+      arg = f'{list(spec.keys())[0]}={arg}'
+      # Yes, we rely on Py3.7 insertion order!
+
+  # Now, expand the `arg` string into a dict of keys and values:
+  raw_kv = {raw_arg.split('=')[0]:
+                raw_arg.split('=', 1)[-1] if '=' in raw_arg else 'True'
+            for raw_arg in arg.split(',') if raw_arg}
+
+  # And go through the spec, using provided or default value for each:
+  for name, (default, type_fn) in spec.items():
+    val = raw_kv.pop(name, None)
+    result[name] = type_fn(val) if val is not None else default
+
+  if raw_kv:
+    if lazy:  # Process args which are not in spec.
+      for k, v in raw_kv.items():
+        result[k] = autotype(v)
+    else:
+      raise ValueError(f'Unhandled config args remain: {raw_kv}')
+
+  return result
+
+
+def get_type_with_default(v):
+  """Returns (v, string_to_v_type) with lenient bool parsing."""
+  # For bool, do safe string conversion.
+  if isinstance(v, bool):
+    def strict_bool(x):
+      assert x.lower() in {'true', 'false', ''}
+      return x.lower() == 'true'
+    return (v, strict_bool)
+  # If already a (default, type) tuple, use that.
+  if isinstance(v, (tuple, list)):
+    assert len(v) == 2 and isinstance(v[1], type), (
+        'List or tuple types are currently not supported because we use `,` as'
+        ' dumb delimiter. Contributions (probably using ast) welcome. You can'
+        ' unblock by using a string with eval(s.replace(";", ",")) or similar')
+    return (v[0], v[1])
+  # Otherwise, derive the type from the default value.
+  return (v, type(v))
+
+
+def autotype(x):
+  """Auto-converts string to bool/int/float if possible."""
+  assert isinstance(x, str)
+  if x.lower() in {'true', 'false'}:
+    return x.lower() == 'true'  # Returns as bool.
+  try:
+    return int(x)  # Returns as int.
+  except ValueError:
+    try:
+      return float(x)  # Returns as float.
+    except ValueError:
+      return x  # Returns as str.
+
+
+def pack_arg(**kw):
+  """Packs key-word args as a string to be parsed by `parse_arg()`."""
+  for v in kw.values():
+    assert ',' not in f'{v}', f"Can't use `,` in config_arg value: {v}"
+  return ','.join([f'{k}={v}' for k, v in kw.items()])
+
+
+def arg(**kw):
+  """Use like `add(**bvcc.arg(res=256, foo=bar), lr=0.1)` to pass config_arg."""
+  return {'config_arg': pack_arg(**kw), **kw}
+
+
+def _get_field_ref(config_dict, field_name):
+  path = field_name.split('.')
+  for field in path[:-1]:
+    config_dict = getattr(config_dict, field)
+  return config_dict.get_ref(path[-1])
+
+
+def format_str(format_string, config):
+  """Format string with reference fields from config.
+
+  This makes it easy to build preprocess strings that contain references to
+  fields that are edited after. E.g.:
+
+  ```
+  config = mlc.ConfigDict()
+  config.res = (256, 256)
+  config.pp = bvcc.format_str('resize({res})', config)
+  ...
+  # if config.res is modified (e.g. via sweeps) it will propagate to pp field:
+  config.res = (512, 512)
+  assert config.pp == 'resize((512, 512))'
+  ```
+
+  Args:
+    format_string: string to format with references.
+    config: ConfigDict to get references to format the string.
+
+  Returns:
+    A reference field which renders a string using references to config fields.
+  """
+  output = ''
+  parts = string.Formatter().parse(format_string)
+  for (literal_text, field_name, format_spec, conversion) in parts:
+    assert not format_spec and not conversion
+    output += literal_text
+    if field_name:
+      output += _get_field_ref(config, field_name).to_str()
+  return output
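A minimal usage sketch for parse_arg above (assumes big_vision is on PYTHONPATH; the argument strings mirror the docstring's launch examples):

import big_vision.configs.common as bvcc

# Spec: res defaults to 224 (converted via int), runlocal defaults to False.
arg = bvcc.parse_arg('runlocal,res=128', res=(224, int), runlocal=False)
print(arg.res, arg.runlocal)  # -> 128 True

# A lone value binds to the first spec entry; a lone name becomes a boolean.
print(bvcc.parse_arg('128', res=(224, int)).res)            # -> 128
print(bvcc.parse_arg('runlocal', runlocal=False).runlocal)  # -> True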
Tipsomaly/model/big_vision/configs/common_fewshot.py ADDED
@@ -0,0 +1,60 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Most common few-shot eval configuration."""
+
+import ml_collections as mlc
+
+
+def get_fewshot_lsr(target_resolution=224, resize_resolution=256,
+                    runlocal=False, pp=None, **kw):
+  """Returns a standard-ish fewshot eval configuration."""
+  kw.setdefault('representation_layer', 'pre_logits')
+  kw.setdefault('shots', (1, 5, 10, 25))
+  kw.setdefault('l2_reg', 2.0 ** 10)
+  kw.setdefault('num_seeds', 3)
+  kw.setdefault('prefix', '')  # No prefix as we already use a/ z/ and zz/
+
+  # Backward-compatible default:
+  if not any(f'log_{x}' in kw for x in ['steps', 'percent', 'examples', 'epochs']):  # pylint: disable=line-too-long
+    kw['log_steps'] = 25_000
+
+  config = mlc.ConfigDict(kw)
+  config.type = 'fewshot_lsr'
+  config.datasets = {
+      'caltech': ('caltech101', 'train', 'test'),  # copybara:srtip
+      'cars': ('cars196:2.1.0', 'train', 'test'),
+      'cifar100': ('cifar100', 'train', 'test'),
+      'dtd': ('dtd', 'train', 'test'),
+      # The first 65000 ImageNet samples have at least 30 shots per any class.
+      # Commented out by default because needs manual download.
+      # 'imagenet': ('imagenet2012', 'train[:65000]', 'validation'),
+      'pets': ('oxford_iiit_pet', 'train', 'test'),
+      'uc_merced': ('uc_merced', 'train[:1000]', 'train[1000:]'),
+  } if not runlocal else {
+      'pets': ('oxford_iiit_pet', 'train', 'test'),
+  }
+
+  pp = pp or '|'.join([
+      'decode',
+      f'resize({resize_resolution})',
+      f'central_crop({target_resolution})',
+      'value_range(-1,1)'
+  ])
+  pp += '|keep("image", "label")'
+  config.pp_train = pp
+  config.pp_eval = pp
+  config.display_first = [('imagenet', 10)] if not runlocal else [('pets', 10)]
+
+  return config
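For orientation, with the default resolutions the helper above assembles this pp string for both pp_train and pp_eval (a standalone sketch of the same join, not an import from the commit):

pp = '|'.join([
    'decode', 'resize(256)', 'central_crop(224)', 'value_range(-1,1)',
]) + '|keep("image", "label")'
print(pp)
# -> decode|resize(256)|central_crop(224)|value_range(-1,1)|keep("image", "label")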
Tipsomaly/model/big_vision/configs/load_and_eval.py ADDED
@@ -0,0 +1,143 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pytype: disable=not-writable,attribute-error
+# pylint: disable=line-too-long,missing-function-docstring
+r"""A config to load and eval key models using the core train.py.
+
+The runtime varies widely depending on the model, but each one should reproduce
+the corresponding paper's numbers.
+This configuration makes use of the "arg" to get_config to select which model
+to run, so a few examples are given below:
+
+Run and evaluate a BiT-M ResNet-50x1 model that was transferred to i1k:
+
+big_vision.train \
+    --config big_vision/configs/load_and_eval.py:name=bit_paper,batch_size=8 \
+    --config.model_init M-imagenet2012 --config.model.width 1 --config.model.depth 50
+
+Run and evaluate the recommended ViT-B/32 from "how to train your vit" paper:
+
+big_vision.train \
+    --config big_vision/configs/load_and_eval.py:name=vit_i21k,batch_size=8 \
+    --config.model.variant B/32 --config.model_init howto-i21k-B/32
+"""
+
+import big_vision.configs.common as bvcc
+from big_vision.configs.common_fewshot import get_fewshot_lsr
+
+
+def eval_only(config, batch_size, spec_for_init):
+  """Set a few configs that turn trainer into (almost) eval-only."""
+  config.total_steps = 0
+  config.input = {}
+  config.input.batch_size = batch_size
+  config.input.data = dict(name='bv:dummy', spec=spec_for_init)
+  config.optax_name = 'identity'
+  config.lr = 0.0
+
+  config.mesh = [('data', -1)]
+  config.sharding_strategy = [('params/.*', 'fsdp(axis="data")')]
+  config.sharding_rules = [('act_batch', ('data',))]
+
+  return config
+
+
+def get_config(arg=''):
+  config = bvcc.parse_arg(arg, name='bit_paper', batch_size=4)
+
+  # Make the config eval-only by setting some dummies.
+  eval_only(config, config.batch_size, spec_for_init=dict(
+      image=dict(shape=(224, 224, 3), dtype='float32'),
+  ))
+
+  config.evals = dict(fewshot=get_fewshot_lsr())
+
+  # Just calls the function with the name given as `config`.
+  # Could also be a giant if-block if you're into that kind of thing.
+  globals()[config.name](config)
+  return config
+
+
+def bit_paper(config):
+  config.num_classes = 1000
+
+  config.model_name = 'bit_paper'
+  config.model_init = 'M-imagenet2012'  # M = i21k, -imagenet2012 = fine-tuned
+  config.model = dict(width=1, depth=50)
+
+  def get_eval(split, lbl, dataset='imagenet2012_real'):
+    return dict(
+        type='classification',
+        data=dict(name=dataset, split=split),
+        loss_name='softmax_xent',
+        cache='none',  # Only run once, on low-mem machine.
+        pp_fn=(
+            'decode|resize(384)|value_range(-1, 1)'
+            f'|onehot(1000, key="{lbl}", key_result="labels")'
+            '|keep("image", "labels")'
+        ),
+    )
+  config.evals.test = get_eval('validation', 'original_label')
+  config.evals.real = get_eval('validation', 'real_label')
+  config.evals.v2 = get_eval('test', 'label', 'imagenet_v2')
+
+
+def vit_i1k(config):
+  config.num_classes = 1000
+
+  config.model_name = 'vit'
+  config.model_init = ''  # Will be set in sweep.
+  config.model = dict(variant='S/16', pool_type='gap', posemb='sincos2d',
+                      rep_size=True)
+
+  config.evals.val = dict(
+      type='classification',
+      data=dict(name='imagenet2012', split='validation'),
+      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")',
+      loss_name='softmax_xent',
+      cache='none',  # Only run once, on low-mem machine.
+  )
+
+
+def mlp_mixer_i1k(config):
+  config.num_classes = 1000
+
+  config.model_name = 'mlp_mixer'
+  config.model_init = ''  # Will be set in sweep.
+  config.model = dict(variant='L/16')
+
+  config.evals.val = dict(
+      type='classification',
+      data=dict(name='imagenet2012', split='validation'),
+      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")',
+      loss_name='softmax_xent',
+      cache='none',  # Only run once, on low-mem machine.
+  )
+
+
+def vit_i21k(config):
+  config.num_classes = 21843
+
+  config.model_name = 'vit'
+  config.model_init = ''  # Will be set in sweep.
+  config.model = dict(variant='B/32', pool_type='tok')
+
+  config.evals.val = dict(
+      type='classification',
+      data=dict(name='imagenet21k', split='full[:51200]'),
+      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(21843)|keep("image", "labels")',
+      loss_name='sigmoid_xent',
+      cache='none',  # Only run once, on low-mem machine.
+  )
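The `globals()[config.name](config)` line dispatches on the name argument to one of the module-level setup functions. A hedged usage sketch (assumes the module is importable as in its docstring):

# Dispatch sketch: `name` selects bit_paper / vit_i1k / mlp_mixer_i1k / vit_i21k.
cfg = get_config('name=vit_i21k,batch_size=8')
print(cfg.model_name, cfg.num_classes)  # -> vit 21843
print(cfg.total_steps)                  # -> 0 (the trainer runs eval-only)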
Tipsomaly/model/big_vision/configs/mlp_mixer_i1k.py ADDED
@@ -0,0 +1,120 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=line-too-long
+r"""A config for training MLP-Mixer-B/16 model on ILSVRC-2012 ("ImageNet-1k").
+
+Achieves 76.3% top-1 accuracy on the test split in 2h11m on TPU v3-128
+with 300 epochs. A shorter 60 epochs run is expected to get to 70.5% in 27m.
+
+big_vision.train \
+    --config big_vision/configs/mlp_mixer_i1k.py \
+    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'`
+"""
+
+from big_vision.configs.common_fewshot import get_fewshot_lsr
+import ml_collections as mlc
+
+
+def get_config(mode=None):
+  """Config for training Mixer on i1k."""
+  config = mlc.ConfigDict()
+
+  config.seed = 0
+  config.total_epochs = 300
+  config.num_classes = 1000
+  config.loss = 'sigmoid_xent'
+  config.init_head_bias = -6.9
+
+  config.input = dict()
+  config.input.data = dict(
+      name='imagenet2012',
+      split='train[:99%]',
+  )
+  config.input.batch_size = 4096
+  config.input.cache_raw = True  # Needs up to 120GB of RAM!
+  config.input.shuffle_buffer_size = 250_000
+
+  config.input.pp = (
+      'decode_jpeg_and_inception_crop(224)'
+      '|flip_lr'
+      '|randaug(2,15)'
+      '|value_range(-1, 1)'
+      '|onehot(1000, key="label", key_result="labels")'
+      '|keep("image", "labels")'
+  )
+  pp_eval = (
+      'decode'
+      '|resize_small(256)|central_crop(224)'
+      '|value_range(-1, 1)'
+      '|onehot(1000, key="{lbl}", key_result="labels")'
+      '|keep("image", "labels")'
+  )
+
+  # To continue using the near-defunct randaug op.
+  config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']
+
+  config.log_training_steps = 50
+  config.ckpt_steps = 1000
+
+  config.prefetch_to_device = 2
+
+  # Model section
+  config.model_name = 'mlp_mixer'
+  config.model = dict()
+  config.model.variant = 'B/16'
+  config.model.stoch_depth = 0.1
+
+  config.mixup = dict(fold_in=None, p=0.5)
+
+  # Optimizer section
+  config.optax_name = 'scale_by_adam'
+  config.grad_clip_norm = 1.
+
+  config.lr = 0.001
+  config.wd = 1e-4
+  config.schedule = dict(
+      decay_type='linear',
+      warmup_steps=10_000,
+      linear_end=1e-5,
+  )
+
+  # Eval section
+  def get_eval(split, dataset='imagenet2012'):
+    return dict(
+        type='classification',
+        data=dict(name=dataset, split=split),
+        pp_fn=pp_eval.format(lbl='label'),
+        loss_name=config.loss,
+        log_steps=2500,  # Very fast O(seconds) so it's fine to run it often.
+        cache_final=mode != 'gpu8',
+    )
+  config.evals = {}
+  config.evals.train = get_eval('train[:2%]')
+  config.evals.minival = get_eval('train[99%:]')
+  config.evals.val = get_eval('validation')
+  config.evals.v2 = get_eval('test', dataset='imagenet_v2')
+  config.evals.real = get_eval('validation', dataset='imagenet2012_real')
+  config.evals.real.pp_fn = pp_eval.format(lbl='real_label')
+
+  config.fewshot = get_fewshot_lsr()
+
+  if mode == 'gpu8':
+    config.total_epochs = 60
+    config.input.batch_size = 512
+    config.input.cache_raw = False
+  if mode == 'regression_test':
+    config.total_epochs = 60
+
+  return config
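Same trick as in the other sigmoid_xent configs: init_head_bias = -6.9 is roughly log(1/1000), so the 1000-way sigmoid head starts out at the uniform prior (standalone sketch, not part of the commit):

import math
print(math.log(1 / 1000))       # -> -6.907...
print(1 / (1 + math.exp(6.9)))  # -> ~1.006e-3, i.e. ~1/1000 per class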
Tipsomaly/model/big_vision/configs/proj/paligemma/transfers/vertexai_l4.py ADDED
@@ -0,0 +1,115 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=line-too-long
+r"""PaliGemma transfer to a task stored in JSON-L, designed to fit on an L4 GPU.
+"""
+
+import big_vision.configs.common as bvcc
+
+
+def training_data(res, text_len):
+  """Creates training data config."""
+  c = bvcc.parse_arg('')  # Just make a configdict without extra import.
+  c.data = dict(
+      name='bv:jsonl',
+      fname='gs://longcap100/data_train90.jsonl',
+      fopen_keys={'image': 'gs://longcap100/'},
+      # See docstring in datasets/jsonl.py for further details.
+      # download_keys=['image'],  # If jsonl contains external paths.
+  )
+  c.pp = '|'.join([
+      # Read and prepare the image by just resizing it:
+      f'decode|resize({res}, antialias=True)|value_range(-1, 1)',
+      # The texts are already prepared in `prefix` and `suffix` keys.
+      'strfmt("caption en", outkey="prefix")',
+      combine_and_keep(text_len),
+  ])
+  # Keep the whole dataset in RAM after first pass. Useful optimization for
+  # small/mid-size datasets, but risks a host OOM for large datasets.
+  c.cache_raw = True
+  return c
+
+
+def get_config(arg=None):
+  """Config for training."""
+  # You probably do NOT want to add settings here. The `arg` way of settings is
+  # really only for things you'd want to sweep and which affect MULTIPLE config
+  # settings at once or go into the pp string.
+  c = bvcc.parse_arg(arg, res=224, text_len=128, batch_size=4,
+                     freeze_vit=False, freeze_llm=False)
+
+  c.input = training_data(c.res, c.text_len)
+
+  # These settings are suited for fitting in a single L4.
+  c.total_epochs = 1
+  c.input.batch_size = c.batch_size
+  c.optax_name = 'big_vision.sgd'  # Without momentum, so really low-memory.
+  c.lr = 0.1
+  c.wd = 0.0
+  c.grad_clip_norm = 1.0
+  c.label_smoothing = 0.0
+
+  # Learning-rate schedule. Probably is fine like this.
+  sched = dict(decay_type='cosine', warmup_percent=0.05)
+  c.schedule = [
+      ('img/.*', None if c.freeze_vit else sched),
+      ('llm/.*', None if c.freeze_llm else sched),
+  ]
+
+  c.evals = {}
+
+  # Model section.
+  c.model_name = 'proj.paligemma.paligemma'
+  c.model = {}
+  # TODO: b/lbeyer - no scan and no remat might be better on 1-GPU machines?
+  c.model.img = dict(variant='So400m/14', pool_type='none', scan=True)
+  c.model.llm = dict(vocab_size=256_000 + 1024 + 128, dropout=0.0)
+  c.model_init = f'pt_{c.res}'
+
+  # FSDP strategy.
+  c.mesh = [('data', -1)]
+  c.sharding_strategy = [('.*', 'fsdp(axis="data")')]
+  c.sharding_rules = [('act_batch', ('data',))]
+
+  c.input.shuffle_buffer_size = 1000
+  c.log_training_steps = 1
+  c.ckpt_steps = 1_000
+  c.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'proj.paligemma.ops']
+
+  c.seed = 0
+  return c
+
+
+def tok(**kw):
+  """Creates the tokenization preprocessing string."""
+  # Single entry point so that it's consistent everywhere and easier to switch.
+  kw.setdefault('model', 'gemma(tokensets=("loc", "seg"))')
+  kw = ', '.join(f'{k}={repr(v)}' for k, v in kw.items())
+  return f'tok({kw})'
+
+
+def combine_and_keep(text_len):
+  return '|'.join([
+      tok(key='prefix', bos='yes'),
+      tok(key='suffix', eos='yes'),
+      tok(key='septok', text='\n'),
+      # If masks confuse you, see (internal link)
+      'masked_concat(["prefix", "septok", "suffix"], mask_ar=[0, 0, 1], mask_loss=[0, 0, 1])',
+      # For training, we +1 because the trainer removes EOS.
+      f'tolen({text_len+1}, pad_value=0, key="text")',  # For text, value doesn't matter.
+      f'tolen({text_len+1}, pad_value=1, key="mask_ar")',
+      f'tolen({text_len+1}, pad_value=0, key="mask_loss")',
+      'keep("image", "text", "mask_ar", "mask_loss")',
+  ])
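To make the string-building above concrete, here is what tok() emits with its defaults (a standalone reproduction of the helper, not an import from the commit):

def tok(**kw):
  kw.setdefault('model', 'gemma(tokensets=("loc", "seg"))')
  kw = ', '.join(f'{k}={repr(v)}' for k, v in kw.items())
  return f'tok({kw})'

print(tok(key='prefix', bos='yes'))
# -> tok(key='prefix', bos='yes', model='gemma(tokensets=("loc", "seg"))')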
Tipsomaly/model/big_vision/configs/proj/paligemma/transfers/vqav2.py ADDED
@@ -0,0 +1,160 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=line-too-long
+r"""PaliGemma transfer to VQAv2.
+"""
+
+import big_vision.configs.common as bvcc
+from big_vision.configs.proj.paligemma.transfers.common import combine_and_keep_train, combine_and_keep_eval, TOKENIZER
+
+
+def training_data(res, final_split, text_len=32):
+  """Creates training data config.
+
+  See (internal link)
+  You can add more arguments beside `res`, but give them good defaults.
+
+  Args:
+    res: The requested image resolution (eg 224).
+    final_split: Whether to use all of the validation data.
+    text_len: sequence length
+
+  Returns:
+    The ConfigDict for the input section.
+  """
+  c = bvcc.parse_arg('')  # Just make a configdict without extra import.
+  c.data = dict(
+      name='vqa',
+      split='train + validation' if final_split else 'train + validation[:-10240]',
+  )
+  c.pp = '|'.join([
+      f'decode|resize({res}, antialias=True)|value_range(-1, 1)',
+      'strfmt("answer en {question_text}", outkey="prefix")',
+      'choice_no_replacement(inkey="answers", outkey="suffix")',
+      combine_and_keep_train(text_len),
+  ])
+  return c
+
+
+def add_eval(c, res, text_len=32, **kw):
+  """VQAv2 evaluators."""
+  pp = '|'.join([
+      f'decode|resize({res}, antialias=True)|value_range(-1, 1)',
+      'strfmt("answer en {question_text}", outkey="prefix")',
+      combine_and_keep_eval(text_len, keep=('answers', 'answer_type', 'question_type', 'question_id')),
+  ])
+
+  for freq, name, split in [
+      (1/4, 'minitrain', 'train[:5120]'),  # To gauge memorization.
+      (1/4, 'minival', 'validation[-10240:]'),  # To tune hparams.
+      # To generate final predictions. Test sets combined since 2021 challenge.
+      (1.0, 'test', 'test + test-dev'),
+  ]:
+    c.evals[f'vqav2/{name}'] = dict(
+        type='proj.paligemma.transfers.vqav2',
+        pred='decode', pred_kw={'max_decode_len': text_len},
+        outfile=f'{{workdir}}/vqav2_{name}.json',
+        data={**training_data(res, True, text_len).data, 'split': split},
+        log_percent=freq, skip_first=freq == 1, tokenizer=TOKENIZER, pp_fn=pp)
+    c.evals[f'vqav2/{name}'].update(kw)
+
+
+def add_eval_pplx(c, res, text_len=32):
+  """Perplexity evaluator to test runs before implementing the real deal."""
+  c_train = training_data(res, True, text_len)  # Use mostly same settings as training.
+
+  for name, split in [
+      ('minitrain', 'train[:20_864]'),  # To gauge memorization.
+      ('minival', 'validation[-10240:]'),  # To tune hparams.
+  ]:
+    c.evals[f'vqav2/{name}/pplx'] = dict(
+        type='proj.paligemma.perplexity', pred='logits',
+        key='text', shift_labels=True,
+        log_percent=1/4,  # Not too cheap, do 4x per run.
+        data={**c_train.data, 'split': split},
+        pp_fn=c_train.pp,
+    )
+
+
+def sweep_best(add, arg=None):
+  """Train with best hyper-params."""
+  c = bvcc.parse_arg(arg, final_split=False)
+  # NOTE: lr was highest in sweep.
+  add(total_epochs=10, lr=1e-5, wd=1e-6, **bvcc.arg(res=224, **c))
+  add(total_epochs=10, lr=1e-5, wd=0.00, **bvcc.arg(res=448, **c))
+
+
+sweep = sweep_best  # Choose which sweep to run.
+
+
+def get_config(arg=None):
+  """Config for training."""
+  c = bvcc.parse_arg(arg, mode='xm', res=224, final_split=False)
+
+  c.input = training_data(c.res, c.final_split)
+
+  # Instead of epochs, you can also use `total_examples` or `total_steps`.
+  c.total_epochs = 10
+  c.input.batch_size = 256
+  c.optax_name = 'scale_by_adam'
+  c.lr = 3e-6
+  c.wd = 3e-7
+  c.grad_clip_norm = 1.0
+  c.label_smoothing = 0.0
+  c.schedule = dict(decay_type='cosine', warmup_percent=0.05)
+
+  # Add evaluators.
+  c.evals = {}
+  add_eval(c, c.res, batch_size=1024)
+  add_eval_pplx(c, c.res)
+
+  # Model section.
+  c.model_name = 'proj.paligemma.paligemma'
+  c.model = {}
+  c.model.img = dict(variant='So400m/14', pool_type='none', scan=True)
+  c.model.llm = dict(vocab_size=256_000 + 1024 + 128, dropout=0.0)
+  c.model_init = f'pt_{c.res}'
+
+  # FSDP strategy.
+  c.mesh = [('data', -1)]
+  c.sharding_strategy = [('.*', 'fsdp(axis="data")')]
+  c.sharding_rules = [('act_batch', ('data',))]
+
+  # These probably do not need any change/tuning
+  c.input.shuffle_buffer_size = 50_000
+  c.log_training_steps = 50
+  c.ckpt_steps = 1_000
+  c.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'proj.paligemma.ops']
+
+  # Update configs for quicker local runs and avoid swapping.
+  if c.mode in ('runlocal', 'mock'):
+    c.input.shuffle_buffer_size = None
+    for ev in c.evals.values():
+      ev.data.split = ev.data.split.split('[')[0] + '[:16]'
+
+  if c.mode == 'runlocal':
+    c.log_training_steps = 1
+    c.input.batch_size = 2
+
+  c.seed = 0
+  return c
+
+
+def metrics(arg=None):  # pylint: disable=unused-argument
+  m = ['training_loss']
+  for split in ('minival', 'minitrain'):
+    m.append(f'vqav2/{split}/acc')
+    m.append(f'vqav2/{split}/pplx/avg')
+  return m
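Taken together, add_eval and add_eval_pplx register five evaluators; a sketch of the resulting keys (assumes the big_vision repo with the paligemma transfers module is importable):

c = get_config('mode=runlocal')
print(sorted(c.evals))
# -> ['vqav2/minitrain', 'vqav2/minitrain/pplx',
#     'vqav2/minival', 'vqav2/minival/pplx', 'vqav2/test']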
Tipsomaly/model/big_vision/configs/transfer.py ADDED
@@ -0,0 +1,186 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=line-too-long,missing-function-docstring
+r"""A config for transferring vit-augreg.
+
+Best HP selected on (mini)val, expected test results (repeated 5 times):
+
+ViT-Augreg-B/32:
+  Dataset, crop, learning rate, mean (%), range (%)
+  - ImageNet, inception_crop, 0.03, 83.27, [83.22...83.33]
+  - Cifar10, resmall_crop, 0.003, 98.55, [98.46...98.6]
+  - Cifar100, resmall_crop, 0.01, 91.35, [91.09...91.62]
+  - Pets, inception_crop, 0.003, 93.78, [93.62...94.00]
+  - Flowers, inception_crop, 0.003, 99.43, [99.42...99.45]
+
+
+Command to run:
+big_vision.train \
+    --config big_vision/configs/transfer.py:model=vit-i21k-augreg-b/32,dataset=cifar10,crop=resmall_crop \
+    --workdir gs://$GS_BUCKET_NAME/big_vision/workdir/`date '+%m-%d_%H%M'` --config.lr=0.03
+"""
+
+import big_vision.configs.common as bvcc
+import ml_collections as mlc
+
+
+def _set_model(config, model):
+  """Load pre-trained models: vit or bit."""
+  # Reset the head to init (of zeros) when transferring.
+  config.model_load = dict(dont_load=['head/kernel', 'head/bias'])
+
+  if model == 'vit-i21k-augreg-b/32':
+    # Load "recommended" upstream B/32 from https://arxiv.org/abs/2106.10270
+    config.model_name = 'vit'
+    config.model_init = 'howto-i21k-B/32'
+    config.model = dict(variant='B/32', pool_type='tok')
+  elif model == 'vit-i21k-augreg-l/16':
+    config.model_name = 'vit'
+    config.model_init = 'howto-i21k-L/16'
+    config.model = dict(variant='L/16', pool_type='tok')
+  elif model == 'vit-s16':
+    config.model_name = 'vit'
+    config.model_init = 'i1k-s16-300ep'
+    config.model = dict(variant='S/16', pool_type='gap', posemb='sincos2d',
+                        rep_size=True)
+  elif model == 'bit-m-r50x1':
+    config.model_name = 'bit_paper'
+    config.model_init = 'M'
+    config.model = dict(depth=50, width=1)
+  else:
+    raise ValueError(f'Unknown model: {model}, please define customized model.')
+
+
+def _set_dataset(config, dataset, crop='inception_crop', h_res=448, l_res=384):
+  if dataset == 'cifar10':
+    _set_task(config, 'cifar10', 'train[:98%]', 'train[98%:]', 'test', 10, steps=10_000, warmup=500, crop=crop, h_res=h_res, l_res=l_res)
+  elif dataset == 'cifar100':
+    _set_task(config, 'cifar100', 'train[:98%]', 'train[98%:]', 'test', 100, steps=10_000, warmup=500, crop=crop, h_res=h_res, l_res=l_res)
+  elif dataset == 'imagenet2012':
+    _set_task(config, 'imagenet2012', 'train[:99%]', 'train[99%:]', 'validation', 1000, steps=20_000, warmup=500, crop=crop, h_res=h_res, l_res=l_res)
+    _set_imagenet_variants(config)
+  elif dataset == 'oxford_iiit_pet':
+    _set_task(config, 'oxford_iiit_pet', 'train[:90%]', 'train[90%:]', 'test', 37, steps=500, warmup=100, crop=crop, h_res=h_res, l_res=l_res)
+  elif dataset == 'oxford_flowers102':
+    _set_task(config, 'oxford_flowers102', 'train[:90%]', 'train[90%:]', 'test', 102, steps=500, warmup=100, crop=crop, h_res=h_res, l_res=l_res)
+  else:
+    raise ValueError(
+        f'Unknown dataset: {dataset}, please define customized dataset.')
+
+
+def _set_task(config, dataset, train, val, test, n_cls,
+              steps=20_000, warmup=500, lbl='label', crop='resmall_crop',
+              flip=True, h_res=448, l_res=384):
+  """Vision task with val and test splits."""
+  config.total_steps = steps
+  config.schedule = dict(
+      warmup_steps=warmup,
+      decay_type='cosine',
+  )
+
+  config.input.data = dict(name=dataset, split=train)
+  pp_common = (
+      '|value_range(-1, 1)|'
+      f'onehot({n_cls}, key="{lbl}", key_result="labels")|'
+      'keep("image", "labels")'
+  )
+
+  if crop == 'inception_crop':
+    pp_train = f'decode|inception_crop({l_res})'
+  elif crop == 'resmall_crop':
+    pp_train = f'decode|resize_small({h_res})|random_crop({l_res})'
+  elif crop == 'resize_crop':
+    pp_train = f'decode|resize({h_res})|random_crop({l_res})'
+  else:
+    raise ValueError(f'Unknown crop: {crop}. Must be one of: '
+                     'inception_crop, resmall_crop, resize_crop')
+  if flip:
+    pp_train += '|flip_lr'
+  config.input.pp = pp_train + pp_common
+
+  pp = f'decode|resize_small({h_res})|central_crop({l_res})' + pp_common
+  config.num_classes = n_cls
+
+  def get_eval(split):
+    return dict(
+        type='classification',
+        data=dict(name=dataset, split=split),
+        loss_name='softmax_xent',
+        log_steps=100,
+        pp_fn=pp,
+    )
+  config.evals = dict(val=get_eval(val), test=get_eval(test))
+
+
+def _set_imagenet_variants(config, h_res=448, l_res=384):
+  """Evaluation tasks on ImageNet variants: v2 and real."""
+  pp = (f'decode|resize_small({h_res})|central_crop({l_res})'
+        '|value_range(-1, 1)|onehot(1000, key="{lbl}", key_result="labels")|'
+        'keep("image", "labels")'
+        )
+
+  # Special-case rename for i1k (val+test -> minival+val)
+  config.evals.minival = config.evals.val
+  config.evals.val = config.evals.test
+  # NOTE: keep test == val for convenience in subsequent analysis.
+
+  config.evals.real = dict(type='classification')
+  config.evals.real.data = dict(name='imagenet2012_real', split='validation')
+  config.evals.real.pp_fn = pp.format(lbl='real_label')
+  config.evals.real.loss_name = config.loss
+  config.evals.real.log_steps = 100
+
+  config.evals.v2 = dict(type='classification')
+  config.evals.v2.data = dict(name='imagenet_v2', split='test')
+  config.evals.v2.pp_fn = pp.format(lbl='label')
+  config.evals.v2.loss_name = config.loss
+  config.evals.v2.log_steps = 100
+
+
+def get_config(arg=None):
+  """Config for adaptation."""
+  arg = bvcc.parse_arg(arg, model='vit', dataset='cifar10', crop='resmall_crop',
+                       h_res=448, l_res=384, batch_size=512, fsdp=False,
+                       runlocal=False)
+  config = mlc.ConfigDict()
+
+  config.input = {}
+  config.input.batch_size = arg.batch_size if not arg.runlocal else 8
+  config.input.shuffle_buffer_size = 50_000 if not arg.runlocal else 100
+
+  config.log_training_steps = 10
+  config.ckpt_steps = 1000
+  config.ckpt_timeout = 600
+
+  # Optimizer section
+  config.optax_name = 'big_vision.momentum_hp'
+  config.grad_clip_norm = 1.0
+  config.wd = None  # That's our default, but just being explicit here!
+  config.loss = 'softmax_xent'
+  config.lr = 0.01
+  config.mixup = dict(p=0.0)
+
+  config.seed = 0
+
+  _set_dataset(config, arg.dataset, arg.crop, arg.h_res, arg.l_res)
+
+  _set_model(config, arg.model)
+  if arg.fsdp:
+    config.mesh = [('data', -1)]
+    config.sharding_strategy = [('.*', 'fsdp(axis="data")')]
+    config.sharding_rules = [('act_batch', ('data',))]
+    config.model.scan = True
+
+  return config
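The three crop choices in _set_task expand to these train-time pp prefixes with the default h_res=448, l_res=384 (a standalone sketch of the resulting strings; |flip_lr is appended afterwards when flip=True):

crops = {
    'inception_crop': 'decode|inception_crop(384)',
    'resmall_crop': 'decode|resize_small(448)|random_crop(384)',
    'resize_crop': 'decode|resize(448)|random_crop(384)',
}
for name, pp in crops.items():
  print(f'{name}: {pp}')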
Tipsomaly/model/big_vision/configs/vit_i1k.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # pylint: disable=line-too-long
16
+ r"""Pre-training ViT on ILSVRC-2012 as in https://arxiv.org/abs/2106.10270
17
+
18
+ This config does NOT include regularization (dropout, stochastic depth), which
19
+ was shown to help with B/32, B/16, L/16 models in the paper (Figure 4).
20
+
21
+ This configuration makes use of the "arg" to get_config to select which model
22
+ to run, so a few examples are given below:
23
+
24
+ Run training of a B/16 model:
25
+
26
+ big_vision.train \
27
+ --config big_vision/configs/vit_i1k.py:variant=B/16 \
28
+ --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'`
29
+
30
+ Run training of a B/32 model with custom aug-strenght and 300ep:
31
+
32
+ big_vision.train \
33
+ --config big_vision/configs/vit_i1k.py:variant=B/32,aug=light1 \
34
+ --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'` \
35
+ --config.total_epochs 300
36
+ """
37
+
38
+ import big_vision.configs.common as bvcc
39
+ from big_vision.configs.common_fewshot import get_fewshot_lsr
40
+ import ml_collections as mlc
41
+
42
+ MIXUP_DEF = {
43
+ 'none': dict(p=0.0, fold_in=None),
44
+ 'light1': dict(p=0.0, fold_in=None),
45
+ 'light2': dict(p=0.2, fold_in=None),
46
+ 'medium1': dict(p=0.2, fold_in=None),
47
+ 'medium2': dict(p=0.5, fold_in=None),
48
+ 'strong1': dict(p=0.5, fold_in=None),
49
+ 'strong2': dict(p=0.8, fold_in=None),
50
+ }
51
+
52
+ RANDAUG_DEF = {
53
+ 'none': '',
54
+ 'light1': 'randaug(2,0)', # Actually not nothing!
55
+ 'light2': 'randaug(2,10)',
56
+ 'medium1': 'randaug(2,15)',
57
+ 'medium2': 'randaug(2,15)',
58
+ 'strong1': 'randaug(2,20)',
59
+ 'strong2': 'randaug(2,20)',
60
+ }
61
+
62
+
63
+ def get_config(arg=None):
64
+ """Config for training."""
65
+ arg = bvcc.parse_arg(arg, variant='B/16', runlocal=False, aug='')
66
+ config = mlc.ConfigDict()
67
+
68
+ config.seed = 0
69
+ config.total_epochs = 300
70
+ config.num_classes = 1000
71
+ config.loss = 'sigmoid_xent'
72
+ config.init_head_bias = -6.9
73
+
74
+ # If this gives a KeyError, lookup Fig4 of the paper and add an entry.
75
+ # Note, this here is a good average between 30ep and 300ep, sometimes you coud
76
+ # find a slightly better setting for either of them.
77
+ aug_setting = arg.aug or {
78
+ 'Ti/16': 'light1',
79
+ 'S/32': 'medium1',
80
+ 'S/16': 'medium2',
81
+ 'B/32': 'medium2',
82
+ 'B/16': 'medium2',
83
+ 'L/16': 'medium2',
84
+ }[arg.variant]
85
+
86
+ config.input = dict()
87
+ config.input.data = dict(
88
+ name='imagenet2012',
89
+ split='train[:99%]',
90
+ )
91
+ config.input.batch_size = 4096
92
+ config.input.cache = 'raw_data' if arg.runlocal else 'none' # Needs up to 120GB of RAM!
93
+ config.input.shuffle_buffer_size = 250_000
94
+
95
+ pp_common = (
96
+ '|value_range(-1, 1)'
97
+ '|onehot(1000, key="{lbl}", key_result="labels")'
98
+ '|keep("image", "labels")'
99
+ )
100
+ config.input.pp = (
101
+ 'decode_jpeg_and_inception_crop(224)|flip_lr|' +
102
+ RANDAUG_DEF[aug_setting] +
103
+ pp_common.format(lbl='label')
104
+ )
105
+ pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common
106
+
107
+ # To continue using the near-defunct randaug op.
108
+ config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']
109
+
110
+ # Aggressive pre-fetching because our models here are small, so we not only
111
+ # can afford it, but we also need it for the smallest models to not be
112
+ # bottle-necked by the input pipeline. Play around with it for -L models tho.
113
+ config.input.prefetch = 8
114
+ config.prefetch_to_device = 4
115
+
116
+ config.log_training_steps = 50
117
+ config.ckpt_steps = 1000
118
+
119
+ # Model section
120
+ config.model_name = 'vit'
121
+ config.model = dict(
122
+ variant=arg.variant,
123
+ rep_size=True,
124
+ pool_type='tok',
125
+ )
126
+
127
+ # Optimizer section
128
+ config.grad_clip_norm = 1.0
129
+ config.optax_name = 'scale_by_adam'
130
+ config.optax = dict(mu_dtype='bfloat16')
131
+ # The modified AdaFactor we introduced in https://arxiv.org/abs/2106.04560
132
+ # almost always behaves exactly like adam, but at a fraction of the memory
133
+ # cost (specifically, adam_bf16 = +1.5M, adafactor = +0.5M), hence it is a
134
+ # good idea to try it when you are memory-bound!
135
+ # config.optax_name = 'big_vision.scale_by_adafactor'
136
+ # A good flag to play with when hitting instabilities, is the following:
137
+ # config.optax = dict(beta2_cap=0.95)
138
+
139
+ config.lr = 0.001
140
+ config.wd = 0.0001
141
+ config.schedule = dict(warmup_steps=10_000, decay_type='cosine')
142
+
143
+ config.mixup = MIXUP_DEF[aug_setting]
144
+
145
+ # Eval section
146
+ def get_eval(split, dataset='imagenet2012'):
147
+ return dict(
148
+ type='classification',
149
+ data=dict(name=dataset, split=split),
150
+ pp_fn=pp_eval.format(lbl='label'),
151
+ loss_name=config.loss,
152
+ log_steps=2500, # Very fast O(seconds) so it's fine to run it often.
153
+ cache='final_data' if arg.runlocal else 'none',
154
+ )
155
+ config.evals = {}
156
+ config.evals.train = get_eval('train[:2%]')
157
+ config.evals.minival = get_eval('train[99%:]')
158
+ config.evals.val = get_eval('validation')
159
+ config.evals.v2 = get_eval('test', dataset='imagenet_v2')
160
+ config.evals.real = get_eval('validation', dataset='imagenet2012_real')
161
+ config.evals.real.pp_fn = pp_eval.format(lbl='real_label')
162
+
163
+ config.fewshot = get_fewshot_lsr(runlocal=arg.runlocal)
164
+ config.fewshot.log_steps = 10_000
165
+
166
+ # Make a few things much smaller for quick local debugging testruns.
167
+ if arg.runlocal:
168
+ config.input.shuffle_buffer_size = 10
169
+ config.input.batch_size = 8
170
+ config.input.cache = 'none'
171
+ config.evals.train.data.split = 'train[:16]'
172
+ config.evals.minival.data.split = 'train[:16]'
173
+ config.evals.val.data.split = 'validation[:16]'
174
+ config.evals.v2.data.split = 'test[:16]'
175
+ config.evals.real.data.split = 'validation[:16]'
176
+
177
+ return config
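A standalone worked example (plain Python, not part of the diff): why `init_head_bias = -6.9` above is the natural companion of `sigmoid_xent` with 1000 classes; it starts every per-class sigmoid at the marginal label frequency of 1/1000.

```python
# Sketch: the logit (inverse sigmoid) of a 1/1000 class prior is about -6.9,
# so initializing the head bias there makes the untrained model output
# calibrated near-zero probabilities for every class under sigmoid_xent.
import math

p = 1 / 1000                      # prior probability of any single class
b = math.log(p / (1 - p))         # logit of p
print(round(b, 2))                # -6.91, i.e. the -6.9 used above

# Sanity check: sigmoid(-6.9) is ~1/1000.
print(1 / (1 + math.exp(6.9)))    # ~0.00101
```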
Tipsomaly/model/big_vision/configs/vit_i21k.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # pylint: disable=line-too-long
16
+ r"""Pre-training ViT on ImageNet-21k as in https://arxiv.org/abs/2106.10270
17
+
18
+ This config relies on the ImageNet-21k tfds dataset, which is not yet
19
+ available publicly in TFDS. We intend to add the dataset to public TFDS soon,
20
+ and this config will then be runnable.
21
+
22
+ Note that regularization (dropout, stochastic depth) is not currently
23
+ implemented. This was not beneficial for ImageNet-21k pre-training.
24
+ """
25
+
26
+ import big_vision.configs.common as bvcc
27
+ from big_vision.configs.common_fewshot import get_fewshot_lsr
28
+ import ml_collections as mlc
29
+
30
+ MIXUP_DEF = {
31
+ 'none': dict(p=0.0, fold_in=None),
32
+ 'light1': dict(p=0.0, fold_in=None),
33
+ 'light2': dict(p=0.2, fold_in=None),
34
+ 'medium1': dict(p=0.2, fold_in=None),
35
+ 'medium2': dict(p=0.5, fold_in=None),
36
+ 'strong1': dict(p=0.5, fold_in=None),
37
+ 'strong2': dict(p=0.8, fold_in=None),
38
+ }
39
+
40
+ RANDAUG_DEF = {
41
+ 'none': '',
42
+ 'light1': 'randaug(2,0)', # Actually not nothing!
43
+ 'light2': 'randaug(2,10)',
44
+ 'medium1': 'randaug(2,15)',
45
+ 'medium2': 'randaug(2,15)',
46
+ 'strong1': 'randaug(2,20)',
47
+ 'strong2': 'randaug(2,20)',
48
+ }
49
+
50
+
51
+ def get_config(arg=None):
52
+ """Config for training."""
53
+ arg = bvcc.parse_arg(arg, variant='B/16', runlocal=False, aug=None)
54
+ config = mlc.ConfigDict()
55
+
56
+ config.seed = 0
57
+ config.total_epochs = 300
58
+ config.num_classes = 21843
59
+ config.init_head_bias = -10.0
60
+ config.loss = 'sigmoid_xent'
61
+
62
+ # If this gives a KeyError, lookup Fig4 of the paper and add an entry.
63
+ # Note: this is a good average between 30ep and 300ep; sometimes you could
64
+ # find a slightly better setting for either of them.
65
+ aug_setting = arg.aug or {
66
+ 'Ti/16': 'none',
67
+ 'S/32': 'none',
68
+ 'S/16': 'light1',
69
+ 'B/32': 'light2',
70
+ 'B/16': 'light2',
71
+ 'L/16': 'medium2',
72
+ }[arg.variant]
73
+
74
+ config.input = dict()
75
+ config.input.data = dict(
76
+ name='imagenet21k',
77
+ split='full[51200:]',
78
+ )
79
+ config.input.batch_size = 4096
80
+ config.input.shuffle_buffer_size = 250_000 # Per host, so small-ish is ok.
81
+
82
+ pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'
83
+ pp_common_i21k = pp_common.format(onehot_args=f'{config.num_classes}')
84
+ pp_common_i1k = pp_common.format(onehot_args='1000, key="label", key_result="labels"')
85
+ config.input.pp = f'decode_jpeg_and_inception_crop(224)|flip_lr|{RANDAUG_DEF[aug_setting]}' + pp_common_i21k
86
+ pp_eval = 'decode|resize_small(256)|central_crop(224)'
87
+
88
+ # To continue using the near-defunct randaug op.
89
+ config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']
90
+
91
+ # Aggressive pre-fetching because our models here are small, so we not only
92
+ # can afford it, but we also need it for the smallest models to not be
93
+ # bottle-necked by the input pipeline. Play around with it for -L models tho.
94
+ config.input.prefetch = 8
95
+ config.prefetch_to_device = 4
96
+
97
+ config.log_training_steps = 50
98
+ config.ckpt_steps = 1000
99
+
100
+ # Model section
101
+ config.model_name = 'vit'
102
+ config.model = dict(variant=arg.variant, pool_type='gap', posemb='learn')
103
+
104
+ # Optimizer section
105
+ config.optax_name = 'scale_by_adam'
106
+ config.optax = dict(mu_dtype='bfloat16')
107
+ config.grad_clip_norm = 1.0
108
+
109
+ config.lr = 0.001
110
+ config.wd = 0.0001
111
+ config.schedule = dict(warmup_steps=10_000, decay_type='cosine')
112
+
113
+ config.mixup = MIXUP_DEF[aug_setting]
114
+
115
+ # Evaluations on i21k itself.
116
+ def eval_i21k(split):
117
+ return dict(
118
+ type='classification',
119
+ data={**config.input.data, 'split': split},
120
+ pp_fn=pp_eval + pp_common_i21k,
121
+ loss_name=config.loss,
122
+ log_steps=1000, # Very fast O(seconds) so it's fine to run it often.
123
+ )
124
+ config.evals = {}
125
+ config.evals.test = eval_i21k('full[:25_600]')
126
+ config.evals.val = eval_i21k('full[25_600:51_200]')
127
+ config.evals.train = eval_i21k('full[51_200:76_800]')
128
+
129
+ # Few-shot evaluators
130
+ config.evals.fewshot = get_fewshot_lsr(runlocal=arg.runlocal)
131
+ config.evals.fewshot.log_steps = 25_000
132
+
133
+ # Make a few things much smaller for quick local debugging testruns.
134
+ if arg.runlocal:
135
+ config.input.shuffle_buffer_size = 10
136
+ config.input.batch_size = 8
137
+ config.evals.test.data.split = 'full[:16]'
138
+ config.evals.train.data.split = 'full[:16]'
139
+ config.evals.val.data.split = 'full[:16]'
144
+
145
+ return config
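A small standalone sketch (plain Python) of how the shared `pp_common` template above is specialized for the i21k and i1k label spaces via `str.format`; the printed strings are what the pp builder would receive.

```python
pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'
num_classes = 21843

pp_common_i21k = pp_common.format(onehot_args=f'{num_classes}')
pp_common_i1k = pp_common.format(
    onehot_args='1000, key="label", key_result="labels"')

print(pp_common_i21k)
# |value_range(-1, 1)|onehot(21843)|keep("image", "labels")
print(pp_common_i1k)
# |value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")
```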
Tipsomaly/model/big_vision/configs/vit_s16_i1k.py ADDED
@@ -0,0 +1,105 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # pylint: disable=line-too-long
16
+ r"""Pre-training ViT-S/16 on ILSVRC-2012 following https://arxiv.org/abs/2205.01580.
17
+
18
+ This should take 6-7h to finish 90ep on a TPU-v3-8 and reach 76.5%,
19
+ see the tech report for more details.
20
+
21
+ Command to run:
22
+
23
+ big_vision.train \
24
+ --config big_vision/configs/vit_s16_i1k.py \
25
+ --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'`
26
+
27
+ To run for 300ep, add `--config.total_epochs 300` to the command.
28
+ """
29
+
30
+ import ml_collections as mlc
31
+
32
+
33
+ def get_config():
34
+ """Config for training."""
35
+ config = mlc.ConfigDict()
36
+
37
+ config.seed = 0
38
+ config.total_epochs = 90
39
+ config.num_classes = 1000
40
+ config.loss = 'softmax_xent'
41
+
42
+ config.input = {}
43
+ config.input.data = dict(
44
+ name='imagenet2012',
45
+ split='train[:99%]',
46
+ )
47
+ config.input.batch_size = 1024
48
+ config.input.cache_raw = True # Needs up to 120GB of RAM!
49
+ config.input.shuffle_buffer_size = 250_000
50
+
51
+ pp_common = (
52
+ '|value_range(-1, 1)'
53
+ '|onehot(1000, key="{lbl}", key_result="labels")'
54
+ '|keep("image", "labels")'
55
+ )
56
+ config.input.pp = (
57
+ 'decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)' +
58
+ pp_common.format(lbl='label')
59
+ )
60
+ pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common
61
+
62
+ # To continue using the near-defunct randaug op.
63
+ config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']
64
+
65
+ config.log_training_steps = 50
66
+ config.ckpt_steps = 1000
67
+
68
+ # Model section
69
+ config.model_name = 'vit'
70
+ config.model = dict(
71
+ variant='S/16',
72
+ rep_size=True,
73
+ pool_type='gap',
74
+ posemb='sincos2d',
75
+ )
76
+
77
+ # Optimizer section
78
+ config.grad_clip_norm = 1.0
79
+ config.optax_name = 'scale_by_adam'
80
+ config.optax = dict(mu_dtype='bfloat16')
81
+
82
+ config.lr = 0.001
83
+ config.wd = 0.0001
84
+ config.schedule = dict(warmup_steps=10_000, decay_type='cosine')
85
+
86
+ config.mixup = dict(p=0.2, fold_in=None)
87
+
88
+ # Eval section
89
+ def get_eval(split, dataset='imagenet2012'):
90
+ return dict(
91
+ type='classification',
92
+ data=dict(name=dataset, split=split),
93
+ pp_fn=pp_eval.format(lbl='label'),
94
+ loss_name=config.loss,
95
+ log_steps=2500, # Very fast O(seconds) so it's fine to run it often.
96
+ )
97
+ config.evals = {}
98
+ config.evals.train = get_eval('train[:2%]')
99
+ config.evals.minival = get_eval('train[99%:]')
100
+ config.evals.val = get_eval('validation')
101
+ config.evals.v2 = get_eval('test', dataset='imagenet_v2')
102
+ config.evals.real = get_eval('validation', dataset='imagenet2012_real')
103
+ config.evals.real.pp_fn = pp_eval.format(lbl='real_label')
104
+
105
+ return config
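A minimal sketch of the learning-rate schedule this config requests (`lr=0.001`, `warmup_steps=10_000`, cosine decay). It assumes linear warmup followed by cosine decay to zero, which is the usual reading of these settings; the exact big_vision implementation may differ in details such as a final lr floor.

```python
import math

def lr_at(step, total_steps, base_lr=0.001, warmup_steps=10_000):
    if step < warmup_steps:
        return base_lr * step / warmup_steps          # linear warmup
    frac = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return base_lr * 0.5 * (1 + math.cos(math.pi * frac))  # cosine decay

total = 90 * 1_281_167 // 1024   # ~112k steps: 90 ImageNet epochs at bs=1024
for s in (0, 5_000, 10_000, total // 2, total):
    print(s, f"{lr_at(s, total):.6f}")
```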
Tipsomaly/model/big_vision/datasets/core.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Core data functions, dispatch calls to the requested dataset."""
16
+ import importlib
17
+
18
+
19
+ # Note: intentionally not using ABC to avoid forcing implementation of every
20
+ # method, since one can imagine train-only datasets for example.
21
+ class DataSource:
22
+ """The API that any data source should implement."""
23
+
24
+ def get_tfdata(self, ordered, *, process_split=True, allow_cache=True):
25
+ """Creates this data object as a tf.data.Dataset.
26
+
27
+ This will be called separately in each process, and it is up to the dataset
28
+ implementation to shard it accordingly if desired!
29
+
30
+ Args:
31
+ ordered: if True, the dataset should use deterministic ordering, if False
32
+ it may have undefined ordering. Think of True == val, False == train.
33
+ process_split: if False then every process receives the entire dataset
34
+ (e.g. for evaluators running in a single process).
35
+ allow_cache: whether to allow caching the opened data or not.
36
+
37
+ Returns:
38
+ A tf.data.Dataset object.
39
+
40
+ Raises:
41
+ RuntimeError: if not implemented by the dataset, but called.
42
+ """
43
+ raise RuntimeError(f"not implemented for {self.__class__.__name__}")
44
+
45
+ @property
46
+ def total_examples(self):
47
+ """Returns number of examples in the dataset, regardless of sharding."""
48
+ raise RuntimeError(f"not implemented for {self.__class__.__name__}")
49
+
50
+ def num_examples_per_process(self):
51
+ """Returns a list of the numer of examples for each process.
52
+
53
+ This is only needed for datasets that should go through make_for_inference.
54
+
55
+ Returns:
56
+ Returns a list of the number of examples for each process.
57
+
58
+ Ideally, this would always be `[total_examples / nprocess] * nprocess`, but in
59
+ reality we can almost never perfectly shard a dataset across an arbitrary
60
+ number of processes.
61
+
62
+ One alternative option that can work in some cases is to not even shard
63
+ the dataset and thus return `[total_examples] * nprocess`.
64
+
65
+ Raises:
66
+ RuntimeError: if not implemented by the dataset, but called.
67
+ """
68
+ raise RuntimeError(f"not implemented for {self.__class__.__name__}")
69
+
70
+
71
+ def get(name, **kw):
72
+ if name.startswith("bv:"):
73
+ mod = importlib.import_module(f"big_vision.datasets.{name[3:]}")
74
+ return mod.DataSource(**kw)
75
+ else:
76
+ mod = importlib.import_module("big_vision.datasets.tfds")
77
+ return mod.DataSource(name, **kw)
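Usage sketch for the dispatch above (assumes big_vision and its data dependencies are importable; the dataset names and paths are illustrative, not shipped defaults):

```python
from big_vision.datasets import core as ds_core

# A plain name dispatches to the TFDS-backed source in big_vision.datasets.tfds:
train = ds_core.get(name="imagenet2012", split="train[:99%]")

# A "bv:" prefix dispatches to big_vision.datasets.<module>, e.g. the jsonl one:
custom = ds_core.get(name="bv:jsonl", fname="/tmp/my_data.jsonl",
                     fopen_keys={"image": "/tmp/images"})
```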
Tipsomaly/model/big_vision/datasets/jsonl.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Simple data input from .jsonl files."""
16
+
17
+ import hashlib
18
+ import json
19
+ from multiprocessing.pool import ThreadPool
20
+ import os
21
+ import tempfile
22
+ import urllib.request
23
+
24
+ from absl import logging
25
+ import big_vision.datasets.core as ds_core
26
+ import jax
27
+ import numpy as np
28
+ import overrides
29
+ import tensorflow as tf
30
+
31
+
32
+ def cached_download(url, dest=None, verbose=True):
33
+ """Download `url` to local file and return path to that, but with caching."""
34
+ # NOTE: there is a small chance of saving corrupted data if the process is
35
+ # interrupted in the middle of writing the file. Then, reading in the input
36
+ # pipeline will fail, and the fix is to nuke the temp folder.
37
+
38
+ # Compute a temp name based on the URL, so we can check if we already
39
+ # downloaded it before.
40
+ dest = dest or os.path.join(tempfile.gettempdir(), "bv")
41
+ os.makedirs(dest, exist_ok=True)
42
+ dest = os.path.join(dest, hashlib.md5(url.encode()).hexdigest())
43
+
44
+ # NOTE: we should use the last-modified header to know whether to re-download.
45
+ if os.path.isfile(dest):
46
+ return dest
47
+
48
+ if verbose:
49
+ print(f"\rRetrieving {url} into {dest}", end="", flush=True)
50
+
51
+ with urllib.request.urlopen(url) as f:
52
+ data = f.read()
53
+ with open(dest, "wb+") as f:
54
+ f.write(data)
55
+ return dest
56
+
57
+
58
+ class DataSource(ds_core.DataSource):
59
+ """.jsonl DataSource."""
60
+
61
+ def __init__(self, fname, *, fopen_keys=(), download_keys=(),
62
+ start=0, stop=float("inf")):
63
+ """Create data-source that's jsonl + data files (eg images).
64
+
65
+ This correctly supports multi-host in that each host only reads a subset of
66
+ the dataset automatically. However, currently, all hosts download all items
67
+ if `download_keys` is specified. TODO: b/lbeyer - This can be improved.
68
+
69
+ Args:
70
+ fname: str, the path to the jsonl file that holds the dataset.
71
+ fopen_keys: collection of str or dict, the keys in the dataset whose
72
+ string value actually is a file-path that should be opened and read,
73
+ and its content is what goes into the batch (eg image filenames
74
+ commonly ["image"]).
75
+ If a dict, the values are folders prefixed to the filenames.
76
+ Supports gs:// for reading from buckets.
77
+ download_keys: collection of str, the keys in the dataset whose string
78
+ value actually is a URL from which the file should be downloaded first.
79
+ files are downloaded to a persistent tmp folder using the URL hash as
80
+ filename. If the file already exists, the download is skipped.
81
+ Must be a subset of `fopen_keys`.
82
+ start: int, index of the first row to use; use for slicing the data.
83
+ stop: int or inf, index of the row after the last one to use.
84
+
85
+ Note:
86
+ This simple data input does not allow for nested/hierarchical values,
87
+ or in any way more complicated values like vectors. Use TFDS for that.
88
+
89
+ The way start/stop arguments are used is as in list slicing[start:stop].
90
+ """
91
+ self.examples = []
92
+
93
+ with tf.io.gfile.GFile(fname) as f:
94
+ for i, line in enumerate(f):
95
+ if (start or 0) <= i < (stop or float("inf")):
96
+ try:
97
+ self.examples.append(json.loads(line))
98
+ except json.decoder.JSONDecodeError as e:
99
+ raise ValueError(f"Invalid JSON in line {i}:\n{line}") from e
100
+
101
+ if download_keys:
102
+ for k in download_keys:
103
+ assert k in fopen_keys, (
104
+ f"{k} in download_keys but missing from fopen_keys {fopen_keys}")
105
+
106
+ # TODO: b/lbeyer - use info from trainer instead, move that to utils.
107
+ logging.info( # pylint: disable=logging-fstring-interpolation
108
+ f"\u001b[33mNOTE\u001b[0m: Downloading {download_keys} "
109
+ f"for dataset {fname} ({len(self.examples)} examples) ...")
110
+
111
+ def _dl_one(ex):
112
+ for k in download_keys:
113
+ ex[k] = cached_download(ex[k])
114
+
115
+ ThreadPool(100).map(_dl_one, self.examples)
116
+ print("Done")
117
+ logging.info("\u001b[33mNOTE\u001b[0m: Done downloading.")
118
+
119
+ # Normalize.
120
+ if isinstance(fopen_keys, (list, tuple)):
121
+ self.fopen_keys = {k: "" for k in fopen_keys}
122
+ else:
123
+ self.fopen_keys = fopen_keys or {}
124
+
125
+ # We need to apply the fopen path prefix here already, because by the time
126
+ # the files are actually read in TF, paths are only symbolic tensors :(
127
+ for ex in self.examples:
128
+ for k, dirname in self.fopen_keys.items():
129
+ ex[k] = os.path.join(dirname, ex[k])
130
+
131
+ def _indices(self, *, process_split=True, process_index=None):
132
+ indices = np.arange(len(self.examples))
133
+
134
+ if not process_split:
135
+ return list(indices)
136
+
137
+ pid = jax.process_index() if process_index is None else process_index
138
+ return list(np.array_split(indices, jax.process_count())[pid])
139
+
140
+ @overrides.overrides
141
+ def get_tfdata(self, ordered=False, *, process_split=True, allow_cache=True):
142
+ del allow_cache # We don't cache anything anyways.
143
+ assert not process_split or len(self.examples) >= jax.process_count(), (
144
+ "Process splitting the data with fewer examples than processes!?")
145
+
146
+ my_idxs = self._indices(process_split=process_split)
147
+ if not ordered:
148
+ np.random.shuffle(my_idxs)
149
+
150
+ dataset = tf.data.Dataset.from_generator(
151
+ generator=lambda: ({"id": str(i), **self.examples[i]} for i in my_idxs),
152
+ output_signature={
153
+ "id": _guess_signature("0"),
154
+ **{k: _guess_signature(v) for k, v in self.examples[0].items()},
155
+ })
156
+
157
+ def _read_files(example):
158
+ for k in self.fopen_keys:
159
+ example[k] = tf.io.read_file(example[k])
160
+ return example
161
+ dataset = dataset.map(_read_files)
162
+
163
+ return dataset
164
+
165
+ @property
166
+ @overrides.overrides
167
+ def total_examples(self):
168
+ return len(self.examples)
169
+
170
+ @overrides.overrides
171
+ def num_examples_per_process(self):
172
+ return [len(self._indices(process_index=pid))
173
+ for pid in range(jax.process_count())]
174
+
175
+
176
+ def _guess_signature(value):
177
+ return tf.TensorSpec.from_tensor(tf.constant(value))
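A minimal end-to-end sketch of the on-disk format this source expects (assumes tensorflow and jax are installed; the file names and contents are made up). `DataSource` refers to the class defined above.

```python
import json, os, tempfile

root = tempfile.mkdtemp()
with open(os.path.join(root, "data.jsonl"), "w") as f:
    f.write(json.dumps({"image": "a.jpg", "caption": "a cat"}) + "\n")
    f.write(json.dumps({"image": "b.jpg", "caption": "a dog"}) + "\n")
for name in ("a.jpg", "b.jpg"):
    with open(os.path.join(root, name), "wb") as f:
        f.write(b"placeholder bytes; a real dataset would store jpegs")

ds = DataSource(os.path.join(root, "data.jsonl"),
                fopen_keys={"image": root})  # "image" values become file bytes
print(ds.total_examples)                     # 2
for ex in ds.get_tfdata(ordered=True).take(1):
    print(sorted(ex.keys()))                 # ['caption', 'id', 'image']
```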
Tipsomaly/model/big_vision/datasets/sequence_packing.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Packed Sequence Op."""
16
+
17
+ # Forked from
18
+ # https://github.com/google/maxtext/blob/main/MaxText/sequence_packing.py.
19
+
20
+
21
+ from typing import Dict, Optional, List, Union
22
+
23
+ from flax import traverse_util
24
+ import tensorflow as tf
25
+
26
+ AUTOTUNE = tf.data.experimental.AUTOTUNE
27
+ FLATTEN_SEPARATOR = "<|sep|>"
28
+
29
+
30
+ def pack_dataset(
31
+ dataset: tf.data.Dataset,
32
+ batch_size: int | None,
33
+ key2length: Union[int, Dict[str, int]],
34
+ keys: Optional[List[str | tuple[str, ...]]] = None) -> tf.data.Dataset:
35
+ """Creates a 'packed' version of a dataset on-the-fly.
36
+
37
+ Wraps `tensorflow.grain` ops.
38
+
39
+ This is meant to replace the irritation of having to create a separate
40
+ "packed" version of a dataset to train efficiently on TPU.
41
+ Each example in the output dataset represents several examples in the
42
+ input dataset.
43
+
44
+ For each key in the input dataset, two additional keys are created:
45
+ <key>_segment_ids: an int32 tensor identifying the parts
46
+ representing the original example.
47
+ <key>_positions: an int32 tensor identifying the position within the original
48
+ example.
49
+
50
+ Example:
51
+ Two input examples get combined to form an output example.
52
+ The input examples are:
53
+ {"inputs": [8, 7, 1, 0], "targets":[4, 1, 0]}
54
+ {"inputs": [2, 3, 4, 1], "targets":[5, 6, 1]}
55
+ The output example is:
56
+ {
57
+ "inputs": [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
58
+ "inputs_seg": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
59
+ "inputs_pos": [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
60
+ "targets": [4, 1, 5, 6, 1, 0, 0, 0, 0, 0]
61
+ "targets_seg": [1, 1, 2, 2, 2, 0, 0, 0, 0, 0]
62
+ "targets_pos": [0, 1, 0, 1, 2, 0, 0, 0, 0, 0]
63
+ }
64
+ 0 represents padding in both the inputs and the outputs.
65
+ Sequences in the incoming examples are truncated to the length given by
66
+ key2length, and sequences in the output examples are all padded to that fixed length.
67
+
68
+ Args:
69
+ dataset: A `tf.data.Dataset`.
70
+ batch_size: Batch size of the packed dataset.
71
+ key2length: An integer, or a dict from feature-key to integer.
72
+ keys: A list of strings (e.g. ["inputs", "targets"]).
73
+
74
+ Returns:
75
+ A `tf.data.Dataset`.
76
+ """
77
+ raise ValueError("Not implemented in OSS yet.")
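Since the OSS function above only raises, here is a hedged pure-Python sketch of the packing semantics the docstring describes (greedy concatenation into one padded row), reproducing the documented example for the "inputs" key:

```python
def pack(sequences, length):
    tokens, seg, pos = [], [], []
    for seg_id, s in enumerate(sequences, start=1):
        s = [t for t in s if t != 0]      # drop padding from the input example
        tokens += s
        seg += [seg_id] * len(s)          # which original example a token is from
        pos += list(range(len(s)))        # position within that original example
    pad = length - len(tokens)
    return tokens + [0] * pad, seg + [0] * pad, pos + [0] * pad

inputs, segment_ids, positions = pack([[8, 7, 1, 0], [2, 3, 4, 1]], 10)
print(inputs)       # [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
print(segment_ids)  # [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
print(positions)    # [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
```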
Tipsomaly/model/big_vision/datasets/tfds.py ADDED
@@ -0,0 +1,94 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """TensorFlow Datasets as data source for big_vision."""
16
+ import functools
17
+
18
+ import big_vision.datasets.core as ds_core
19
+ import jax
20
+ import numpy as np
21
+ import overrides
22
+ import tensorflow as tf
23
+ import tensorflow_datasets as tfds
24
+
25
+
26
+ class DataSource(ds_core.DataSource):
27
+ """Use TFDS as a data source."""
28
+
29
+ def __init__(self, name, split, data_dir=None, skip_decode=("image",)):
30
+ self.builder = _get_builder(name, data_dir)
31
+ self.split = split
32
+ # Each host is responsible for a fixed subset of data
33
+ process_splits = tfds.even_splits(split, jax.process_count())
34
+ self.process_split = process_splits[jax.process_index()]
35
+ self.skip_decode = skip_decode
36
+
37
+ @overrides.overrides
38
+ def get_tfdata(
39
+ self, ordered=False, *, process_split=True, allow_cache=True, **kw):
40
+ # The tf.data may use a lot of RAM, so we need to expose the option of not
41
+ # keeping this in memory when we use lots of input pipelines, such as when
42
+ # having many ephemeral evaluators.
43
+ return (_cached_get_dataset if allow_cache else _get_dataset)(
44
+ self.builder, self.skip_decode,
45
+ split=self.process_split if process_split else self.split,
46
+ shuffle_files=not ordered,
47
+ **kw)
48
+
49
+ @property
50
+ @overrides.overrides
51
+ def total_examples(self):
52
+ return self.builder.info.splits[self.split].num_examples
53
+
54
+ @overrides.overrides
55
+ def num_examples_per_process(self):
56
+ splits = tfds.even_splits(self.split, jax.process_count())
57
+ return [self.builder.info.splits[s].num_examples for s in splits]
58
+
59
+
60
+ @functools.cache
61
+ def _get_builder(dataset, data_dir):
62
+ if dataset == "from_data_dir":
63
+ return tfds.builder_from_directory(data_dir)
64
+ else:
65
+ return tfds.builder(dataset, data_dir=data_dir, try_gcs=True)
66
+
67
+
68
+ # Cache as it may well take 1-2min on large datasets, and we may use the same
69
+ # multiple times (eg various evaluators).
70
+ def _get_dataset(builder, skip_decode, shuffle_files, split=None, **rckw):
71
+ """Returns a tf.data to be used."""
72
+ ds = builder.as_dataset(
73
+ split=split, shuffle_files=shuffle_files,
74
+ read_config=tfds.ReadConfig(
75
+ skip_prefetch=True, # We prefetch after pipeline.
76
+ try_autocache=False, # We control this, esp. for few-shot.
77
+ add_tfds_id=True,
78
+ **rckw,
79
+ ),
80
+ decoders={
81
+ f: tfds.decode.SkipDecoding()
82
+ for f in skip_decode if f in builder.info.features
83
+ })
84
+
85
+ def _hash_tfds_id(example):
86
+ id_ = tf.strings.to_hash_bucket_strong(
87
+ example["tfds_id"],
88
+ np.iinfo(np.uint32).max, # Max value
89
+ [3714561454027272724, 8800639020734831960]) # Magic.
90
+ example["_id"] = tf.bitcast(id_, tf.int32)[0] # good device dtype.
91
+ return example
92
+
93
+ return ds.map(_hash_tfds_id)
94
+ _cached_get_dataset = functools.cache(_get_dataset)
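A standalone demo of the `tfds_id` hashing trick above (assumes tensorflow and numpy are installed; the id string is made up). A keyed strong hash maps the string id into the uint32 range, and the bitcast to int32 yields a device-friendly dtype:

```python
import numpy as np
import tensorflow as tf

tfds_id = tf.constant("imagenet2012-train.tfrecord-00000-of-01024__42")
id_ = tf.strings.to_hash_bucket_strong(
    tfds_id, np.iinfo(np.uint32).max,
    [3714561454027272724, 8800639020734831960])  # Same magic key as above.
print(int(tf.bitcast(id_, tf.int32)[0]))  # Deterministic across runs.
```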
Tipsomaly/model/big_vision/evaluators/__init__.py ADDED
File without changes
Tipsomaly/model/big_vision/evaluators/classification.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Evaluator for the classfication task."""
16
+ # pylint: disable=consider-using-from-import
17
+
18
+ import functools
19
+
20
+ from big_vision.evaluators import common
21
+ import big_vision.utils as u
22
+ import jax
23
+ import jax.numpy as jnp
24
+
25
+
26
+ # Temporary global flag to facilitate backwards compatibility. Will be removed
27
+ # by the end of year 2023.
28
+ API = 'jit'
29
+
30
+
31
+ # To avoid re-compiling the function for every new instance of the same
32
+ # evaluator on a different dataset!
33
+ @functools.cache
34
+ def get_eval_fn(predict_fn, loss_name):
35
+ """Produces eval function, also applies pmap."""
36
+ @jax.jit
37
+ def _eval_fn(train_state, batch, labels, mask):
38
+ logits, *_ = predict_fn(train_state, batch)
39
+
40
+ # Ignore the entries with all zero labels for evaluation.
41
+ mask *= labels.max(axis=1)
42
+
43
+ loss = getattr(u, loss_name)(
44
+ logits=logits, labels=labels, reduction=False)
45
+ loss = jnp.sum(loss * mask)
46
+
47
+ top1_idx = jnp.argmax(logits, axis=1)
48
+ # Extracts the label at the highest logit index for each image.
49
+ top1_correct = jnp.take_along_axis(
50
+ labels, top1_idx[:, None], axis=1)[:, 0]
51
+ ncorrect = jnp.sum(top1_correct * mask)
52
+ nseen = jnp.sum(mask)
53
+ return ncorrect, loss, nseen
54
+ return _eval_fn
55
+
56
+
57
+ class Evaluator:
58
+ """Classification evaluator."""
59
+
60
+ def __init__(self, predict_fn, loss_name, label_key='labels', **kw):
61
+ self.get_data_iter, self.steps = common.eval_input_pipeline(**kw)
62
+ self.eval_fn = get_eval_fn(predict_fn, loss_name)
63
+ self.label_key = label_key
64
+
65
+ def run(self, train_state):
66
+ """Computes all metrics."""
67
+ ncorrect, loss, nseen = 0, 0, 0
68
+ for _, batch in zip(range(self.steps), self.get_data_iter()):
69
+ labels, mask = batch.pop(self.label_key), batch.pop('_mask')
70
+ batch_ncorrect, batch_losses, batch_nseen = jax.device_get(
71
+ self.eval_fn(train_state, batch, labels, mask))
72
+ ncorrect += batch_ncorrect
73
+ loss += batch_losses
74
+ nseen += batch_nseen
75
+ yield ('prec@1', ncorrect / nseen)
76
+ yield ('loss', loss / nseen)
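A toy worked example of the masked top-1 logic above (assumes jax is installed; the numbers are made up). Folding `labels.max(axis=1)` into the mask drops examples whose one-hot labels are all zero:

```python
import jax.numpy as jnp

logits = jnp.array([[2.0, 0.1, 0.3],    # argmax 0, label 0 -> correct
                    [0.2, 1.5, 0.1],    # argmax 1, label 2 -> wrong
                    [9.0, 0.0, 0.0]])   # all-zero label row -> ignored
labels = jnp.array([[1., 0., 0.],
                    [0., 0., 1.],
                    [0., 0., 0.]])      # e.g. an unannotated padding example
mask = jnp.array([1., 1., 1.]) * labels.max(axis=1)

top1 = jnp.argmax(logits, axis=1)
correct = jnp.take_along_axis(labels, top1[:, None], axis=1)[:, 0]
print(float((correct * mask).sum() / mask.sum()))  # 0.5
```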
Tipsomaly/model/big_vision/evaluators/common.py ADDED
@@ -0,0 +1,228 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utils for evaluators in general."""
16
+
17
+ import dataclasses
18
+ import functools
19
+ import importlib
20
+ import json
21
+ import os
22
+ from typing import Any, Callable
23
+
24
+ from absl import flags
25
+ from big_vision import input_pipeline
26
+ from big_vision.datasets import core as ds_core
27
+ from big_vision.pp import builder as pp_builder
28
+ import big_vision.utils as u
29
+ import flax
30
+ import jax
31
+ import numpy as np
32
+
33
+ from tensorflow.io import gfile
34
+
35
+
36
+ def from_config(config, predict_fns,
37
+ write_note=lambda s: s,
38
+ get_steps=lambda key, cfg: cfg[f"{key}_steps"],
39
+ devices=None):
40
+ """Creates a list of evaluators based on `config`."""
41
+ evaluators = []
42
+ specs = config.get("evals", {})
43
+
44
+ for name, cfg in specs.items():
45
+ write_note(name)
46
+
47
+ # Pop all generic settings off so we're left with eval's kwargs in the end.
48
+ cfg = cfg.to_dict()
49
+ module = cfg.pop("type", name)
50
+ pred_key = cfg.pop("pred", "predict")
51
+ pred_kw = cfg.pop("pred_kw", None)
52
+ prefix = cfg.pop("prefix", f"{name}/")
53
+ cfg.pop("skip_first", None)
54
+ logsteps = get_steps("log", cfg)
55
+ for typ in ("steps", "epochs", "examples", "percent"):
56
+ cfg.pop(f"log_{typ}", None)
57
+
58
+ # Use same batch_size as eval by default, to reduce fragmentation.
59
+ # TODO: eventually remove all the deprecated names...
60
+ cfg["batch_size"] = cfg.get("batch_size") or config.get("batch_size_eval") or config.get("input.batch_size") or config.get("batch_size") # pylint: disable=line-too-long
61
+
62
+ module = importlib.import_module(f"big_vision.evaluators.{module}")
63
+
64
+ if devices is not None:
65
+ cfg["devices"] = devices
66
+
67
+ api_type = getattr(module, "API", "pmap")
68
+ if api_type == "pmap" and "devices" in cfg:
69
+ raise RuntimeError(
70
+ "You are seemingly using the old pmap-based evaluator, but with "
71
+ "jit-based train loop, see (internal link) for more details.")
72
+ if api_type == "jit" and "devices" not in cfg:
73
+ raise RuntimeError(
74
+ "You are seemingly using new jit-based evaluator, but with "
75
+ "old pmap-based train loop, see (internal link) for more details.")
76
+
77
+ try:
78
+ predict_fn = predict_fns[pred_key]
79
+ except KeyError as e:
80
+ raise ValueError(
81
+ f"Unknown predict_fn '{pred_key}'. Available predict_fns are:\n"
82
+ + "\n".join(predict_fns)) from e
83
+ if pred_kw is not None:
84
+ predict_fn = _CacheablePartial(predict_fn, flax.core.freeze(pred_kw))
85
+ evaluator = module.Evaluator(predict_fn, **cfg)
86
+ evaluators.append((name, evaluator, logsteps, prefix))
87
+
88
+ return evaluators
89
+
90
+
91
+ @dataclasses.dataclass(frozen=True, eq=True)
92
+ class _CacheablePartial:
93
+ """partial(fn, **kwargs) that defines hash and eq - to help with jit caches.
94
+
95
+ This is particularly common in evaluators when one has many evaluator
96
+ instances that run on difference slices of data.
97
+
98
+ Example:
99
+
100
+ ```
101
+ f1 = _CacheablePartial(fn, a=1)
102
+ jax.jit(f1)(...)
103
+ jax.jit(_CacheablePartial(fn, a=1))(...) # fn won't be retraced.
104
+ del f1
105
+ jax.jit(_CacheablePartial(fn, a=1))(...) # fn will be retraced.
106
+ ```
107
+ """
108
+ fn: Callable[..., Any]
109
+ kwargs: flax.core.FrozenDict
110
+
111
+ def __call__(self, *args, **kwargs):
112
+ return functools.partial(self.fn, **self.kwargs)(*args, **kwargs)
113
+
114
+
115
+ def eval_input_pipeline(
116
+ data, pp_fn, batch_size, devices, keep_on_cpu=(),
117
+ cache="pipeline", prefetch=1, warmup=False,
118
+ ):
119
+ """Create an input pipeline in the way used by most evaluators.
120
+
121
+ Args:
122
+ data: The configuration to create the data source (like for training).
123
+ pp_fn: A string representing the preprocessing to be performed.
124
+ batch_size: The batch size to use.
125
+ devices: The devices that the batches are sharded and pre-fetched onto.
126
+ keep_on_cpu: See input_pipeline.start_global. Entries in the batch that
127
+ should be kept on the CPU, hence could be ragged or of string type.
128
+ cache: One of "none", "pipeline", "raw_data", "final_data". Determines what
129
+ part of the input stream should be cached across evaluator runs. They use
130
+ more and more RAM, but make evals faster, in that order.
131
+ - "none": Entirely re-create and destroy the input pipeline each run.
132
+ - "pipeline": Keep the (tf.data) pipeline object alive across runs.
133
+ - "raw_data": Cache the full raw data before pre-processing.
134
+ - "final_data": Cache the full raw data after pre-processing.
135
+ prefetch: How many batches to fetch ahead.
136
+ warmup: Start fetching the first batch at creation time (right now),
137
+ instead of once the iteration starts.
138
+
139
+ Returns:
140
+ A tuple (get_iter, steps), the first element is a function that returns
141
+ the iterator to be used for an evaluation, the second one is how many steps
142
+ should be iterated for doing one evaluation.
143
+ """
144
+ assert (
145
+ cache is None
146
+ or cache.lower() in ("none", "pipeline", "raw_data", "final_data")
147
+ ), f"Unknown value for cache: {cache}"
148
+ data_source = ds_core.get(**data)
149
+ tfdata, steps = input_pipeline.make_for_inference(
150
+ data_source.get_tfdata(ordered=True, allow_cache=cache.lower() != "none"),
151
+ batch_size=batch_size,
152
+ num_ex_per_process=data_source.num_examples_per_process(),
153
+ preprocess_fn=pp_builder.get_preprocess_fn(pp_fn, str(data)),
154
+ cache_final=cache == "raw_data",
155
+ cache_raw=cache == "final_data")
156
+ get_data_iter = lambda: input_pipeline.start_global(
157
+ tfdata, devices, prefetch, keep_on_cpu, warmup)
158
+
159
+ # Possibly create one persistent iterator:
160
+ if cache in ("pipeline", "raw_data", "final_data"):
161
+ data_iter = get_data_iter()
162
+ get_data_iter = lambda: data_iter
163
+
164
+ return get_data_iter, steps
165
+
166
+
167
+ def process_sum(tree):
168
+ """Sums the pytree across all processes."""
169
+ if jax.process_count() == 1: # Avoids corner-cases on donuts.
170
+ return tree
171
+
172
+ with jax.transfer_guard_device_to_host("allow"):
173
+ gathered = jax.experimental.multihost_utils.process_allgather(tree)
174
+ return jax.tree.map(functools.partial(np.sum, axis=0), gathered)
175
+
176
+
177
+ def resolve_outfile(outfile, split="", **kw):
178
+ if not outfile:
179
+ return None
180
+
181
+ # A caveat: when workdir doesn't exist but is in the `outfile`, we should
182
+ # skip. This is common in small runs or runlocal debugging.
183
+ if "{workdir}" in outfile and not flags.FLAGS.workdir:
184
+ return None
185
+
186
+ return outfile.format(
187
+ workdir=flags.FLAGS.workdir,
188
+ split="".join(c if c not in "[]%:" else "_" for c in (split or "")),
189
+ step=getattr(u.chrono, "prev_step", None),
190
+ **kw,
191
+ )
192
+
193
+
194
+ def multiprocess_write_json(outfile, jobj): # jobj = "json object"
195
+ """Write a single json file combining all processes' `jobj`s."""
196
+ if not outfile:
197
+ return
198
+
199
+ outfile = resolve_outfile(outfile)
200
+ gfile.makedirs(os.path.dirname(outfile))
201
+
202
+ if isinstance(jobj, list):
203
+ combine_fn = list.extend
204
+ elif isinstance(jobj, dict):
205
+ combine_fn = dict.update
206
+ else:
207
+ raise TypeError(f"Can only write list or dict jsons, but got {type(jobj)}")
208
+
209
+ # First, each process writes its own file.
210
+ with gfile.GFile(outfile + f".p{jax.process_index()}", "w+") as f:
211
+ f.write(json.dumps(jobj))
212
+
213
+ u.sync() # Wait for all files to be written; `with` above does close/flush.
214
+
215
+ # Have process 0 collect, concat, and write final output.
216
+ all_json = type(jobj)()
217
+ if jax.process_index() == 0:
218
+ for pid in range(jax.process_count()):
219
+ with gfile.GFile(outfile + f".p{pid}", "r") as f:
220
+ combine_fn(all_json, json.loads(f.read()))
221
+ with gfile.GFile(outfile, "w+") as f:
222
+ f.write(json.dumps(all_json))
223
+
224
+ # Cleanup time
225
+ u.sync()
226
+ gfile.remove(outfile + f".p{jax.process_index()}")
227
+
228
+ return all_json
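A hedged toy re-implementation of the caching idea behind `_CacheablePartial` (assumes jax is installed; the names are made up). Frozen-dataclass equality lets two distinct-but-equal callables share one jit cache entry, which is the behaviour the docstring above documents:

```python
import dataclasses
from typing import Any, Callable

import jax
import jax.numpy as jnp

TRACES = 0

def scale(x, a):
    global TRACES
    TRACES += 1  # Runs while tracing only, not on cached executions.
    return a * x

@dataclasses.dataclass(frozen=True, eq=True)
class Partial:
    fn: Callable[..., Any]
    a: float
    def __call__(self, x):
        return self.fn(x, self.a)

p = Partial(scale, 2.0)  # Keep a live reference (see the docstring's `del` note).
jax.jit(p)(jnp.ones(3))
jax.jit(Partial(scale, 2.0))(jnp.ones(3))  # Equal instance: expect a cache hit.
print(TRACES)  # 1 if the jit cache dedupes on equality, as documented above.
```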
Tipsomaly/model/big_vision/evaluators/fewshot_lsr.py ADDED
@@ -0,0 +1,245 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utils for few-shot evaluation."""
16
+ # pylint: disable=consider-using-from-import,g-importing-member
17
+
18
+ import functools
19
+
20
+ import big_vision.datasets.core as ds_core
21
+ import big_vision.input_pipeline as input_pipeline
22
+ import big_vision.pp.builder as pp_builder
23
+ import big_vision.utils as u
24
+ import jax
25
+ import jax.numpy as jnp
26
+ from jax.sharding import NamedSharding as Sharding
27
+ from jax.sharding import PartitionSpec as P
28
+ import numpy as np
29
+
30
+ BIAS_CONSTANT = 100.0
31
+
32
+ # Temporary global flag to facilitate backwards compatability. Will be removed
33
+ # by the end of year 2023.
34
+ API = "jit"
35
+
36
+
37
+ # Setup function for few-shot regression on CPU to avoid "polluting" the TPU.
38
+ @u.jit_cpu(static_argnums=(2,))
39
+ def _precompute_cache(x, y, num_classes):
40
+ """Cache quantities to speed-up the computation of L2-regularized least-sq."""
41
+ # Whiten
42
+ mean = jnp.mean(x, axis=0, keepdims=True)
43
+ std = jnp.std(x, axis=0, keepdims=True) + 1e-5
44
+ x = (x - mean) / std
45
+
46
+ # Add a constant feature for the bias, large so it's almost unregularized:
47
+ x = jnp.pad(x, ((0, 0), (0, 1)), constant_values=BIAS_CONSTANT)
48
+
49
+ # To one-hot representation rescaled into {-1, 1}
50
+ y = 2.0 * jax.nn.one_hot(y, num_classes) - 1.0
51
+
52
+ num_points, dim = x.shape
53
+ # Let N be the number of points, D the dimension and C the number of classes.
54
+ # We have x of shape (N, D) and y of shape (N, C).
55
+ # For least-squares, we can compute
56
+ #
57
+ # (A) when N >= D, (x^T x + l2 Id)^{-1} x^T y
58
+ # (B) when D > N, x^T (x x^T + l2 Id)^{-1} y
59
+ #
60
+ # We pre-compute the eigen-decomposition of either x^T x or x x^T which
61
+ # becomes q diag(eigs) q^T with q unitary matrix either (D, D) or (N, N)
62
+ # and eigs a vector (D,) or (N,).
63
+ #
64
+ # For any l2 > 0, we can compute (x^T x + l2 Id)^{-1} or (x x^T + l2 Id)^{-1}
65
+ # by simply computing q (diag(eigs) + l2 Id)^{-1} q^T.
66
+ # (SVD would be more natural here, but it proved slower, so we use eigh)
67
+ #
68
+ # Both cases (A) and (B) can be viewed as lhs (diag(eigs) + l2 Id)^{-1} rhs,
69
+ # where lhs/rhs are pre-computed left/right-hand sides to specify.
70
+ #
71
+ # Detailed evaluation in terms of time and fewshot metrics can be found in
72
+ # (internal link)
73
+ #
74
+ # Implemented by Rodolphe Jenatton.
75
+ if num_points >= dim:
76
+ eigs, q = jnp.linalg.eigh(x.T @ x)
77
+ rhs = q.T @ (x.T @ y)
78
+ lhs = q
79
+ else:
80
+ eigs, q = jnp.linalg.eigh(x @ x.T)
81
+ rhs = q.T @ y
82
+ lhs = x.T @ q
83
+
84
+ cache = {
85
+ "eigs": eigs,
86
+ "rhs": rhs,
87
+ "lhs": lhs,
88
+ "mean": mean,
89
+ "std": std
90
+ }
91
+ return cache
92
+
93
+
94
+ @u.jit_cpu()
95
+ def _eig_fewshot_acc_fn(cache, x_test, y_test, l2_reg):
96
+ """Computes (x,y) linear regression accuracy on (x_test, y_test)."""
97
+
98
+ x_test = (x_test - cache["mean"]) / cache["std"]
99
+ x_test = jnp.pad(x_test, ((0, 0), (0, 1)), constant_values=BIAS_CONSTANT)
100
+
101
+ rhs = cache["rhs"]
102
+ lhs = cache["lhs"]
103
+ eigs = cache["eigs"]
104
+
105
+ # See comments in _precompute_cache for context about the formula.
106
+ scaling = 1.0 / (eigs + l2_reg * jnp.ones_like(eigs))
107
+ scaling = scaling.reshape((1, -1))
108
+ w = (lhs * scaling) @ rhs
109
+ # Predict test-set values and measure their accuracy
110
+ preds = jnp.argmax(x_test @ w, axis=1)
111
+ return jnp.mean(preds == y_test)
112
+
113
+
114
+ class Evaluator:
115
+ """Class for few-shot evaluation."""
116
+
117
+ def __init__(self, predict_fn, batch_size,
118
+ datasets, shots, l2_reg,
119
+ pp_train, pp_eval, display_first,
120
+ representation_layer=None, num_seeds=3,
121
+ label_key="label", mask_key="_mask", data_dir=None, *,
122
+ devices):
123
+ self.datasets = datasets
124
+ self.shots = shots
125
+ self.l2_reg = l2_reg
126
+ self.batch_size = batch_size
127
+ self.pp_tr = pp_train
128
+ self.pp_te = pp_eval
129
+ self.display_first = display_first
130
+ self._datasets = {} # Cache for tfds data. Persists while object is alive.
131
+ self._repr = {} # Cache for precomputed repr. Persists within the run call.
132
+ self.num_seeds = num_seeds
133
+ self.label_key = label_key
134
+ self.mask_key = mask_key
135
+ self.data_dir = data_dir
136
+ self.devices = devices
137
+ self.mesh = jax.sharding.Mesh(devices, ("devices",))
138
+ self.repr_fn = self.get_representation_fn(
139
+ predict_fn, representation_layer)
140
+
141
+ def get_representation_fn(self, predict_fn, representation_layer):
142
+ # `out_shardings=Sharding(self.mesh, P())` will "all_gather" the outputs.
143
+ @functools.partial(jax.jit, out_shardings=Sharding(self.mesh, P()))
144
+ def _repr_fn(train_state, batch, labels, mask):
145
+ zimg, *_, out = predict_fn(train_state, batch)
146
+ if representation_layer is not None:
147
+ rep = u.tree_get(out, representation_layer)
148
+ else:
149
+ rep = zimg
150
+ return rep, labels, mask
151
+ return _repr_fn
152
+
153
+ # Setup input pipeline.
154
+ def _get_dataset(self, dataset, train_split, test_split):
155
+ """Lazy-loads given dataset."""
156
+ key = (dataset, train_split, test_split)
157
+ try:
158
+ return self._datasets[key]
159
+ except KeyError:
160
+ # NOTE: only supporting TFDS data for now for bwd compat/lazyness.
161
+ train_data = ds_core.get(
162
+ name=dataset, split=train_split, data_dir=self.data_dir
163
+ )
164
+ test_data = ds_core.get(
165
+ name=dataset, split=test_split, data_dir=self.data_dir
166
+ )
167
+ train_ds, batches_tr = input_pipeline.make_for_inference(
168
+ train_data.get_tfdata(ordered=True),
169
+ num_ex_per_process=train_data.num_examples_per_process(),
170
+ batch_size=self.batch_size,
171
+ preprocess_fn=pp_builder.get_preprocess_fn(self.pp_tr))
172
+ test_ds, batches_te = input_pipeline.make_for_inference(
173
+ test_data.get_tfdata(ordered=True),
174
+ num_ex_per_process=test_data.num_examples_per_process(),
175
+ batch_size=self.batch_size,
176
+ preprocess_fn=pp_builder.get_preprocess_fn(self.pp_te))
177
+
178
+ num_classes = train_data.builder.info.features[self.label_key].num_classes
179
+ return self._datasets.setdefault(
180
+ key, (train_ds, batches_tr, test_ds, batches_te, num_classes))
181
+
182
+ def _get_repr(self, params, data, steps):
183
+ """Compute representation for the whole dataset."""
184
+ pre_logits_list = []
185
+ labels_list = []
186
+ for batch, _ in zip(
187
+ input_pipeline.start_global(data, self.devices, 0), range(steps)):
188
+ labels, mask = batch.pop(self.label_key), batch.pop(self.mask_key)
189
+ pre_logits, labels, mask = jax.device_get(self.repr_fn(
190
+ params, batch, labels, mask))
191
+ mask = mask.astype(bool)
192
+ pre_logits_list.append(pre_logits[mask])
193
+ labels_list.append(labels[mask])
194
+ pre_logits = np.concatenate(pre_logits_list, axis=0)
195
+ labels = np.concatenate(labels_list, axis=0)
196
+
197
+ return pre_logits, labels
198
+
199
+ def compute_fewshot_metrics(self, train_state, seed,
200
+ dataset, train_split, test_split):
201
+ """Compute few-shot metrics on one dataset."""
202
+ if dataset in self._repr:
203
+ repr_train, labels_train, repr_test, labels_test, num_classes = (
204
+ self._repr[dataset])
205
+ else:
206
+ train_ds, steps_tr, test_ds, steps_te, num_classes = self._get_dataset(
207
+ dataset, train_split, test_split)
208
+ repr_train, labels_train = self._get_repr(train_state, train_ds, steps_tr)
209
+ repr_test, labels_test = self._get_repr(train_state, test_ds, steps_te)
210
+ self._repr[dataset] = (repr_train, labels_train,
211
+ repr_test, labels_test,
212
+ num_classes)
213
+
214
+ # Collect where we have samples of which classes.
215
+ rng = np.random.default_rng(seed)
216
+ class_indices = [rng.permutation(np.where(labels_train == cls_i)[0])
217
+ for cls_i in range(num_classes)]
218
+
219
+ results = {}
220
+ for shots in self.shots:
221
+ all_idx = [indices[:shots] for indices in class_indices]
222
+ all_idx = np.concatenate(all_idx, axis=0)
223
+ x = u.put_cpu(repr_train[all_idx])
224
+ y = u.put_cpu(labels_train[all_idx])
225
+ repr_test, labels_test = u.put_cpu((repr_test, labels_test))
226
+
227
+ # Note the code is optimized to solve multiple LSR tasks for changing l2
228
+ # strength, even though we currently used the fixed l2_reg constant.
229
+ cache = _precompute_cache(x, y, num_classes)
230
+ acc = _eig_fewshot_acc_fn(
231
+ cache, repr_test, labels_test, u.put_cpu(self.l2_reg))
232
+ results[shots] = jax.device_get(acc)
233
+
234
+ return results
235
+
236
+ def run(self, train_state):
237
+ """New API executed in terms of old API."""
238
+ self._repr = {}
239
+ for seed in range(self.num_seeds):
240
+ for name, dataset_args in self.datasets.items():
241
+ result = self.compute_fewshot_metrics(train_state, seed, *dataset_args)
242
+ for shots, v in result.items():
243
+ prefix = "a/" if (name, shots) in self.display_first else "z/"
244
+ suffix = f"-seed-{seed}"
245
+ yield f"{prefix}{name}_{shots}shot{suffix}", v
Tipsomaly/model/big_vision/evaluators/mean.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Evaluator for computing mean of per-example metrics.
16
+
17
+ This evaluator can be used in two ways:
18
+ 1. Create a new evaluator with reduced boilerplate by inheriting from it.
19
+ 2. For quick prototyping, use this with predict_fns which return the metrics.
20
+ """
21
+ from functools import partial
22
+ from typing import Mapping
23
+
24
+ from big_vision.evaluators import common
25
+
26
+ import jax
27
+ import jax.numpy as jnp
28
+ import numpy as np
29
+
30
+
31
+ # Temporary global flag to facilitate backwards compatibility. Will be removed
32
+ # by the end of year 2023.
33
+ API = 'jit'
34
+
35
+
36
+ # Note: global to avoid jax re-compiling across different evaluator instances.
37
+ @partial(jax.jit, static_argnums=0)
38
+ def _run_predict_fn(predict_fn, train_state, batch):
39
+ """Sum per-example metrics weighted by `_mask`."""
40
+ metrics = predict_fn(train_state, batch)
41
+ mask = batch['_mask']
42
+ # Sanity check output format of predict_fn.
43
+ assert isinstance(metrics, Mapping), 'predict_fn must return a dict'
44
+ for y in jax.tree.leaves(metrics):
45
+ if y.shape != mask.shape:
46
+ raise ValueError(
47
+ f'Expected per-example metrics of shape {mask.shape} found '
48
+ f'{jax.tree.map(lambda x: x.shape, metrics)}.')
49
+ metrics = {**metrics, '_mask': mask}
50
+ return jax.tree.map(lambda x: jnp.sum(jnp.where(mask, x, 0)), metrics)
51
+
52
+
53
+ class Evaluator:
54
+ """Report the mean of per-example metrics computed by predict_fn.
55
+
56
+ `predict_fn(params, batch)` must return a dict from metric name to
57
+ per-example metrics of shape [batch_size].
58
+ """
59
+
60
+ def __init__(self, predict_fn, **kw):
61
+ self.get_data_iter, self.steps = common.eval_input_pipeline(**kw)
62
+ self.predict_fn = partial(_run_predict_fn, predict_fn)
63
+
64
+ def run(self, train_state):
65
+ """Computes all metrics."""
66
+ metrics = []
67
+
68
+ # Compute batch metrics without blocking.
69
+ for _, batch in zip(range(self.steps), self.get_data_iter()):
70
+ batch_metrics = self.predict_fn(train_state, batch)
71
+ metrics.append(batch_metrics)
72
+
73
+ # Transfer metrics (blocking).
74
+ metrics = jax.device_get(metrics)
75
+
76
+ # Accumulate metrics across batches.
77
+ metrics_sum = jax.tree.map(lambda *x: np.sum(x), *metrics)
78
+ mask_sum = metrics_sum.pop('_mask')
79
+ for key, value_sum in metrics_sum.items():
80
+ yield (key, value_sum / mask_sum)
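A minimal sketch of the `predict_fn` contract this evaluator expects (assumes jax is installed; the metric, keys and shapes are made up). It must map a batch to a dict of per-example values with the same shape as `_mask`, which then get mask-averaged:

```python
import jax.numpy as jnp

def predict_fn(train_state, batch):
    del train_state                    # A real model would use its params here.
    return {"mae": jnp.abs(batch["x"] - batch["y"])}  # Shape [batch_size].

batch = {"x": jnp.array([1.0, 2.0, 3.0]),
         "y": jnp.array([1.5, 2.0, 9.0]),
         "_mask": jnp.array([1, 1, 0])}  # The last example is padding.
m = predict_fn(None, batch)
mask = batch["_mask"]
print(float((m["mae"] * mask).sum() / mask.sum()))  # 0.25
```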
Tipsomaly/model/big_vision/evaluators/save.py ADDED
@@ -0,0 +1,121 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Evaluator that save inputs and outputs of prediction functions."""
16
+ import functools
17
+
18
+ from absl import flags
19
+ from absl import logging
20
+
21
+ from big_vision import input_pipeline
22
+ from big_vision import optax as bv_optax
23
+ from big_vision import utils
24
+ from big_vision.datasets import core as ds_core
25
+ from big_vision.pp import builder as pp_builder
26
+
27
+ import jax
28
+ import numpy as np
29
+
30
+ # Temporary global flag to facilitate backwards compatibility. Will be removed
31
+ # by the end of year 2023.
32
+ API = 'jit'
33
+
34
+
35
+ # Note: global to avoid jax re-compiling across different evaluator instances.
36
+ def _run_predict_fn(predict_fn, train_state, batch):
37
+ """Run predict_fn and gather all outputs on all devices."""
38
+ y = predict_fn(train_state, batch)
39
+ return {'inputs': batch, 'outputs': y}
40
+
41
+
42
+ class Evaluator:
43
+ """Evaluator that saves the inputs and outputs of a prediction function.
44
+
45
+ Example configuration:
46
+
47
+ ```
48
+ config.evals.save_pred = {
49
+ 'type': 'save',
50
+ 'pred': 'inference',
51
+ 'outfile': '{workdir}/inference-{step:09d}.npz',
52
+ 'data': ..., 'pp_fn': ..., 'log_steps': ...,
53
+ }
54
+ ```
55
+
56
+ Results can then be easily inspected in a notebook such as:
57
+
58
+ ```
59
+ results = utils.load_checkpoint("<full_path_to_outfile>")
60
+ inputs, outputs = (results["inputs"], results["outputs"])
61
+ ```
62
+ """
63
+
64
+ def __init__(self, predict_fn, data, pp_fn, batch_size, outfile,
65
+ cache_final=True, cache_raw=False, prefetch=1, *, devices):
66
+ replicate = jax.sharding.NamedSharding(
67
+ jax.sharding.Mesh(devices, ('devices',)),
68
+ jax.sharding.PartitionSpec()
69
+ )
70
+ self.predict_fn = functools.partial(
71
+ jax.jit(_run_predict_fn, static_argnums=0, out_shardings=replicate),
72
+ predict_fn,
73
+ )
74
+
75
+ data = ds_core.get(**data)
76
+ self.dataset, self.steps = input_pipeline.make_for_inference(
77
+ data.get_tfdata(ordered=True),
78
+ batch_size=batch_size,
79
+ num_ex_per_process=data.num_examples_per_process(),
80
+ preprocess_fn=pp_builder.get_preprocess_fn(pp_fn),
81
+ cache_final=cache_final,
82
+ cache_raw=cache_raw,
83
+ )
84
+ self.data_iter = input_pipeline.start_global(
85
+ self.dataset, devices, prefetch
86
+ )
87
+
88
+ self.outfile = outfile
89
+
90
+ def run(self, train_state):
91
+ """Compute all predictions, gather in main host and save in outfile."""
92
+ step = jax.device_get(bv_optax.get_count(train_state['opt'], jittable=True))
93
+ outfile = self.outfile.format(workdir=flags.FLAGS.workdir, step=step)
94
+
95
+ count = 0
96
+ outputs = []
97
+ for _, batch in zip(range(self.steps), self.data_iter):
98
+ out = self.predict_fn(train_state, batch)
99
+ if jax.process_index():
100
+ continue
101
+
102
+ out = jax.device_get(out)
103
+ mask = out['inputs']['_mask']
104
+ out = jax.tree.map(lambda x: x[mask == 1], out) # pylint: disable=cell-var-from-loop
105
+ count += mask.shape[0]
106
+ out['inputs'].pop('_mask')
107
+ outputs.append(out)
108
+
109
+ logging.log_every_n_seconds(
110
+ logging.INFO, 'Processed %i examples so far.', 60,
111
+ count)
112
+
113
+ if jax.process_index():
114
+ return
115
+
116
+ logging.info('Saving %d examples in %s', count, outfile)
117
+ outputs = jax.tree.map(lambda *x: np.concatenate(x, axis=0), *outputs)
118
+ utils.save_checkpoint(outputs, outfile, compressed=True)
119
+ return
120
+
121
+ yield None # pylint: disable=unreachable
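This evaluator only assumes that `predict_fn(train_state, batch)` returns a pytree of arrays. A hedged sketch of such a function follows; the `model.apply` call and the `'image'`/`'params'` keys are assumptions about the surrounding training code, not something this evaluator fixes:

```
def make_inference_fn(model):
  """Builds a hypothetical predict_fn for the save evaluator above."""
  def predict_fn(train_state, batch):
    # Any pytree of arrays works; it is stored under results['outputs'].
    logits, _ = model.apply({'params': train_state['params']}, batch['image'])
    return {'logits': logits}
  return predict_fn
```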
Tipsomaly/model/big_vision/models/__init__.py ADDED
File without changes
Tipsomaly/model/big_vision/models/bit.py ADDED
@@ -0,0 +1,162 @@
+ # Copyright 2024 Big Vision Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """ResNet V1 with GroupNorm."""
+
+ from typing import Optional, Sequence, Union
+
+ from big_vision import utils
+ from big_vision.models import common
+ import flax
+ import flax.linen as nn
+ import flax.training.checkpoints
+ import jax.numpy as jnp
+ import numpy as np
+
+
+ def weight_standardize(w, axis, eps):
+   w = w - jnp.mean(w, axis=axis)
+   w = w / (jnp.std(w, axis=axis) + eps)
+   return w
+
+
+ class StdConv(nn.Conv):
+
+   def param(self, name, *a, **kw):
+     param = super().param(name, *a, **kw)
+     if name == "kernel":
+       param = weight_standardize(param, axis=[0, 1, 2], eps=1e-5)
+     return param
+
+
+ class ResidualUnit(nn.Module):
+   """Bottleneck ResNet block."""
+   nmid: Optional[int] = None
+   strides: Sequence[int] = (1, 1)
+
+   @nn.compact
+   def __call__(self, x):
+     nmid = self.nmid or x.shape[-1] // 4
+     nout = nmid * 4
+
+     residual = x
+     if x.shape[-1] != nout or self.strides != (1, 1):
+       residual = StdConv(nout, (1, 1), self.strides, use_bias=False,
+                          name="conv_proj")(residual)
+       residual = nn.GroupNorm(name="gn_proj")(residual)
+
+     y = StdConv(nmid, (1, 1), use_bias=False, name="conv1")(x)
+     y = nn.GroupNorm(name="gn1")(y)
+     y = nn.relu(y)
+     y = StdConv(nmid, (3, 3), self.strides, use_bias=False, name="conv2")(y)
+     y = nn.GroupNorm(name="gn2")(y)
+     y = nn.relu(y)
+     y = StdConv(nout, (1, 1), use_bias=False, name="conv3")(y)
+
+     y = nn.GroupNorm(name="gn3", scale_init=nn.initializers.zeros)(y)
+     y = nn.relu(residual + y)
+     return y
+
+
+ class ResNetStage(nn.Module):
+   """One stage of ResNet."""
+   block_size: int
+   first_stride: Sequence[int] = (1, 1)
+   nmid: Optional[int] = None
+
+   @nn.compact
+   def __call__(self, x):
+     x = ResidualUnit(self.nmid, strides=self.first_stride, name="unit1")(x)
+     for i in range(1, self.block_size):
+       x = ResidualUnit(self.nmid, name=f"unit{i + 1}")(x)
+     return x
+
+
+ class Model(nn.Module):
+   """ResNetV1."""
+   num_classes: Optional[int] = None
+   width: float = 1
+   depth: Union[int, Sequence[int]] = 50
+
+   @nn.compact
+   def __call__(self, image, *, train=False):
+     del train  # Unused
+     blocks = get_block_desc(self.depth)
+     width = int(64 * self.width)
+
+     out = {}
+
+     # Root block
+     x = StdConv(width, (7, 7), (2, 2), use_bias=False, name="conv_root")(image)
+     x = nn.GroupNorm(name="gn_root")(x)
+     x = nn.relu(x)
+     x = nn.max_pool(x, (3, 3), strides=(2, 2), padding="SAME")
+     out["stem"] = x
+
+     # Stages
+     x = ResNetStage(blocks[0], nmid=width, name="block1")(x)
+     out["stage1"] = x
+     for i, block_size in enumerate(blocks[1:], 1):
+       x = ResNetStage(block_size, nmid=width * 2 ** i,
+                       first_stride=(2, 2), name=f"block{i + 1}")(x)
+       out[f"stage{i + 1}"] = x
+     out["pre_logits_2d"] = x
+
+     # Head
+     x = out["pre_logits"] = jnp.mean(x, axis=(1, 2))
+
+     if self.num_classes:
+       head = nn.Dense(self.num_classes, name="head",
+                       kernel_init=nn.initializers.zeros)
+       out["logits_2d"] = head(out["pre_logits_2d"])
+       x = out["logits"] = head(out["pre_logits"])
+
+     return x, out
+
+
+ # A dictionary mapping the number of layers in a resnet to the number of
+ # blocks in each stage of the model.
+ # NOTE: Does not include 18/34 as they also need non-bottleneck block!
+ def get_block_desc(depth):
+   if isinstance(depth, list):  # Be robust to silly mistakes.
+     depth = tuple(depth)
+   return {
+       26: [2, 2, 2, 2],  # From timm, gets ~75% on ImageNet.
+       50: [3, 4, 6, 3],
+       101: [3, 4, 23, 3],
+       152: [3, 8, 36, 3],
+       200: [3, 24, 36, 3]
+   }.get(depth, depth)
+
+
+ def fix_old_checkpoints(params):
+   """Modifies params from old checkpoints to run with current implementation."""
+   params = flax.core.unfreeze(
+       flax.training.checkpoints.convert_pre_linen(params))
+   # Old linen used to store non-squeezed GN params.
+   params = flax.traverse_util.unflatten_dict({
+       k: np.squeeze(v) if (set(k)
+                            & {"gn_root", "gn_proj", "gn1", "gn2", "gn3"}) else v
+       for k, v in flax.traverse_util.flatten_dict(params).items()
+   })
+   return params
+
+
+ def load(init_params, init_file, model_cfg, dont_load=()):
+   """Load init from checkpoint."""
+   del model_cfg  # Unused
+   params = utils.load_params(init_file)
+   params = common.merge_params(params, init_params, dont_load)
+   params = fix_old_checkpoints(params)
+   return params
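A quick smoke-test sketch of the `Model` class above (illustrative only; the `pre_logits` width follows from the bottleneck expansion `nout = nmid * 4` in the last stage):

```
import jax
import jax.numpy as jnp

model = Model(num_classes=10, depth=50, width=1)
dummy = jnp.zeros([1, 224, 224, 3])
params = model.init(jax.random.PRNGKey(0), dummy)
logits, out = model.apply(params, dummy)
print(logits.shape)             # (1, 10)
print(out['pre_logits'].shape)  # (1, 2048) -- stage-4 nmid=512, times 4
```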
Tipsomaly/model/big_vision/models/bit_paper.py ADDED
@@ -0,0 +1,260 @@
+ # Copyright 2024 Big Vision Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """BiT models as in the paper (ResNet V2) w/ loading of public weights.
+
+ See reproduction proof: http://(internal link)/qY70qs6j944
+ """
+
+ import functools
+ import re
+ from typing import Optional, Sequence, Union
+
+ from big_vision import utils as u
+ from big_vision.models import bit
+ from big_vision.models import common
+ import flax.linen as nn
+ import jax.numpy as jnp
+
+
+ def standardize(x, axis, eps):
+   x = x - jnp.mean(x, axis=axis, keepdims=True)
+   x = x / jnp.sqrt(jnp.mean(jnp.square(x), axis=axis, keepdims=True) + eps)
+   return x
+
+
+ # Defined our own, because we compute normalizing variance slightly
+ # differently, which does affect performance when loading pre-trained weights!
+ class GroupNorm(nn.Module):
+   """Group normalization (arxiv.org/abs/1803.08494)."""
+   ngroups: int = 32
+
+   @nn.compact
+   def __call__(self, x):
+
+     input_shape = x.shape
+     group_shape = x.shape[:-1] + (self.ngroups, x.shape[-1] // self.ngroups)
+
+     x = x.reshape(group_shape)
+
+     # Standardize along spatial and group dimensions
+     x = standardize(x, axis=[1, 2, 4], eps=1e-5)
+     x = x.reshape(input_shape)
+
+     bias_scale_shape = tuple([1, 1, 1] + [input_shape[-1]])
+     x = x * self.param('scale', nn.initializers.ones, bias_scale_shape)
+     x = x + self.param('bias', nn.initializers.zeros, bias_scale_shape)
+     return x
+
+
+ class StdConv(nn.Conv):
+
+   def param(self, name, *a, **kw):
+     param = super().param(name, *a, **kw)
+     if name == 'kernel':
+       param = standardize(param, axis=[0, 1, 2], eps=1e-10)
+     return param
+
+
+ class RootBlock(nn.Module):
+   """Root block of ResNet."""
+   width: int
+
+   @nn.compact
+   def __call__(self, x):
+     x = StdConv(self.width, (7, 7), (2, 2), padding=[(3, 3), (3, 3)],
+                 use_bias=False, name='conv_root')(x)
+     x = nn.max_pool(x, (3, 3), strides=(2, 2), padding=[(1, 1), (1, 1)])
+     return x
+
+
+ class ResidualUnit(nn.Module):
+   """Bottleneck ResNet block."""
+   nmid: Optional[int] = None
+   strides: Sequence[int] = (1, 1)
+
+   @nn.compact
+   def __call__(self, x):
+     nmid = self.nmid or x.shape[-1] // 4
+     nout = nmid * 4
+     conv = functools.partial(StdConv, use_bias=False)
+
+     residual = x
+     x = GroupNorm(name='gn1')(x)
+     x = nn.relu(x)
+
+     if x.shape[-1] != nout or self.strides != (1, 1):
+       residual = conv(nout, (1, 1), self.strides, name='conv_proj')(x)
+
+     x = conv(nmid, (1, 1), name='conv1')(x)
+     x = GroupNorm(name='gn2')(x)
+     x = nn.relu(x)
+     x = conv(nmid, (3, 3), self.strides, padding=[(1, 1), (1, 1)],
+              name='conv2')(x)
+     x = GroupNorm(name='gn3')(x)
+     x = nn.relu(x)
+     x = conv(nout, (1, 1), name='conv3')(x)
+
+     return x + residual
+
+
+ class ResNetStage(nn.Module):
+   """A stage (sequence of same-resolution blocks)."""
+   block_size: int
+   nmid: Optional[int] = None
+   first_stride: Sequence[int] = (1, 1)
+
+   @nn.compact
+   def __call__(self, x):
+     out = {}
+     x = out['unit01'] = ResidualUnit(
+         self.nmid, strides=self.first_stride, name='unit01')(x)
+     for i in range(1, self.block_size):
+       x = out[f'unit{i+1:02d}'] = ResidualUnit(
+           self.nmid, name=f'unit{i+1:02d}')(x)
+     return x, out
+
+
+ class Model(nn.Module):
+   """ResNetV2."""
+   num_classes: Optional[int] = None
+   width: int = 1
+   depth: Union[int, Sequence[int]] = 50  # 50/101/152, or list of block depths.
+   head_zeroinit: bool = True
+
+   @nn.compact
+   def __call__(self, image, *, train=False):
+     blocks = bit.get_block_desc(self.depth)
+     width = int(64 * self.width)
+     out = {}
+
+     x = out['stem'] = RootBlock(width=width, name='root_block')(image)
+
+     # Blocks
+     x, out['stage1'] = ResNetStage(blocks[0], nmid=width, name='block1')(x)
+     for i, block_size in enumerate(blocks[1:], 1):
+       x, out[f'stage{i + 1}'] = ResNetStage(
+           block_size, width * 2 ** i,
+           first_stride=(2, 2), name=f'block{i + 1}')(x)
+
+     # Pre-head
+     x = out['norm_pre_head'] = GroupNorm(name='norm-pre-head')(x)
+     x = out['pre_logits_2d'] = nn.relu(x)
+     x = out['pre_logits'] = jnp.mean(x, axis=(1, 2))
+
+     # Head
+     if self.num_classes:
+       kw = {'kernel_init': nn.initializers.zeros} if self.head_zeroinit else {}
+       head = nn.Dense(self.num_classes, name='head', **kw)
+       out['logits_2d'] = head(out['pre_logits_2d'])
+       x = out['logits'] = head(out['pre_logits'])
+
+     return x, out
+
+
+ def load(init_params, init_file, model_cfg, dont_load=()):
+   """Loads the TF-dumped NumPy or big_vision checkpoint.
+
+   Args:
+     init_params: random init params from which the new head is taken.
+     init_file: comes from `config.model_init`, can either be an absolute
+       path (i.e. starts with /) to the checkpoint, or a string like
+       "L-imagenet2012" describing one of the variants from the paper.
+     model_cfg: the model configuration.
+     dont_load: list of param names to be reset to init.
+
+   Returns:
+     The loaded parameters.
+   """
+
+   # Support for vanity model names from the paper.
+   vanity = {
+       'FunMatch-224px-i1k82.8': 'gs://bit_models/distill/R50x1_224.npz',
+       'FunMatch-160px-i1k80.5': 'gs://bit_models/distill/R50x1_160.npz',
+   }
+   if init_file[0] in ('L', 'M', 'S'):  # The models from the original paper.
+     # Supported names are of the following type:
+     # - 'M' or 'S': the original "upstream" model without fine-tuning.
+     # - 'M-ILSVRC2012': i21k model fine-tuned on i1k.
+     # - 'M-run0-caltech101': i21k model fine-tuned on VTAB's caltech101.
+     #   each VTAB fine-tuning was run 3x, so there's run0, run1, run2.
+     if '-' in init_file:
+       up, down = init_file[0], init_file[1:]
+     else:
+       up, down = init_file, ''
+     down = {'-imagenet2012': '-ILSVRC2012'}.get(down, down)  # normalize
+     fname = f'BiT-{up}-R{model_cfg.depth}x{model_cfg.width}{down}.npz'
+     fname = f'gs://bit_models/{fname}'
+   else:
+     fname = vanity.get(init_file, init_file)
+
+   params = u.load_params(fname)
+   params = maybe_convert_big_transfer_format(params)
+   return common.merge_params(params, init_params, dont_load)
+
+
+ def maybe_convert_big_transfer_format(params_tf):
+   """If the checkpoint comes from legacy codebase, convert it."""
+
+   # Only do anything at all if we recognize the format.
+   if 'resnet' not in params_tf:
+     return params_tf
+
+   # For ease of processing and backwards compatibility, flatten again:
+   params_tf = dict(u.tree_flatten_with_names(params_tf)[0])
+
+   # Works around some files containing weird naming of variables:
+   for k in list(params_tf):
+     k2 = re.sub('/standardized_conv2d_\\d+/', '/standardized_conv2d/', k)
+     if k2 != k:
+       params_tf[k2] = params_tf[k]
+       del params_tf[k]
+
+   params = {
+       'root_block': {'conv_root': {'kernel': params_tf[
+           'resnet/root_block/standardized_conv2d/kernel']}},
+       'norm-pre-head': {
+           'bias': params_tf['resnet/group_norm/beta'][None, None, None],
+           'scale': params_tf['resnet/group_norm/gamma'][None, None, None],
+       },
+       'head': {
+           'kernel': params_tf['resnet/head/conv2d/kernel'][0, 0],
+           'bias': params_tf['resnet/head/conv2d/bias'],
+       }
+   }
+
+   for block in ('block1', 'block2', 'block3', 'block4'):
+     params[block] = {}
+     units = set([re.findall(r'unit\d+', p)[0] for p in params_tf.keys()
+                  if p.find(block) >= 0])
+     for unit in units:
+       params[block][unit] = {}
+       for i, group in enumerate('abc', 1):
+         params[block][unit][f'conv{i}'] = {
+             'kernel': params_tf[f'resnet/{block}/{unit}/{group}/standardized_conv2d/kernel']  # pylint: disable=line-too-long
+         }
+         params[block][unit][f'gn{i}'] = {
+             'bias': params_tf[f'resnet/{block}/{unit}/{group}/group_norm/beta'][None, None, None],  # pylint: disable=line-too-long
+             'scale': params_tf[f'resnet/{block}/{unit}/{group}/group_norm/gamma'][None, None, None],  # pylint: disable=line-too-long
+         }
+
+       projs = [p for p in params_tf.keys()
+                if p.find(f'{block}/{unit}/a/proj') >= 0]
+       assert len(projs) <= 1
+       if projs:
+         params[block][unit]['conv_proj'] = {
+             'kernel': params_tf[projs[0]]
+         }
+
+   return params
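For illustration, here is a standalone re-statement of the paper-name resolution performed by `load` above (a sketch only: it skips the FunMatch vanity names and assumes `depth=50`, `width=1`):

```
def resolve_bit_name(init_file, depth=50, width=1):
  """Mirrors the gs://bit_models naming logic from `load` above."""
  if init_file[0] in ('L', 'M', 'S'):
    if '-' in init_file:
      up, down = init_file[0], init_file[1:]
    else:
      up, down = init_file, ''
    down = {'-imagenet2012': '-ILSVRC2012'}.get(down, down)  # normalize
    return f'gs://bit_models/BiT-{up}-R{depth}x{width}{down}.npz'
  return init_file  # Treated as a plain path.

print(resolve_bit_name('M-imagenet2012'))
# gs://bit_models/BiT-M-R50x1-ILSVRC2012.npz
```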
Tipsomaly/model/big_vision/models/common.py ADDED
@@ -0,0 +1,133 @@
+ # Copyright 2024 Big Vision Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Utilities shared across models."""
+
+ from absl import logging
+ import big_vision.utils as u
+ import flax.linen as nn
+ import jax
+ import jax.numpy as jnp
+
+
+ def merge_params(loaded, inited, dont_load=(), match_dtype=False):
+   """Makes `loaded` pytree match `inited`, warning or failing on mismatch.
+
+   Args:
+     loaded: pytree of parameters, typically loaded from a checkpoint.
+     inited: pytree of parameters, typically coming from model init.
+     dont_load: List of regexes for parameters which shall not be taken
+       from `loaded`, either because they should remain at their init value,
+       or because they are missing on either side.
+     match_dtype: if True, leaves of the returned pytree are converted to the
+       dtype of the corresponding leaf in `inited`.
+
+   Returns:
+     If successful, a new pytree which matches the structure of `inited`
+     but contains values from `loaded`, except for `dont_load`.
+
+     If structures don't match and mismatches are not covered by regexes in
+     the `dont_load` argument, then raises an exception with more information.
+   """
+   if inited is None:  # A useful shortcut for example for colabs.
+     return loaded
+
+   dont_load = u.check_and_compile_patterns(dont_load)
+
+   def should_merge(name):
+     return not any(pattern.fullmatch(name) for pattern in dont_load)
+
+   loaded_flat, _ = u.tree_flatten_with_names(loaded)
+   inited_flat, _ = u.tree_flatten_with_names(inited)
+   loaded_flat = {k: v for k, v in loaded_flat}
+   inited_flat = {k: v for k, v in inited_flat}
+
+   # Let's first build the pytree from all common keys.
+   merged = {}
+   for name, init_val in inited_flat.items():
+     # param is present in both. Load or ignore it!
+     if name in loaded_flat and should_merge(name):
+       merged[name] = loaded_flat[name]
+       if match_dtype:
+         merged[name] = loaded_flat[name].astype(init_val.dtype)
+     else:
+       logging.info("Ignoring checkpoint and using init value for %s", name)
+       merged[name] = init_val
+
+   def pp(title, names, indent="  "):  # Just pretty-printing
+     if names:
+       return f"{title}:\n" + "\n".join(f"{indent}{k}" for k in sorted(names))
+     else:
+       return ""
+
+   # Now, if there are keys that only exist in inited or loaded, be helpful:
+   not_in_loaded = inited_flat.keys() - loaded_flat.keys()
+   not_in_inited = loaded_flat.keys() - inited_flat.keys()
+   logging.info(pp("Parameters in model but not in checkpoint", not_in_loaded))
+   logging.info(pp("Parameters in checkpoint but not in model", not_in_inited))
+
+   # And now see if any of them are not explicitly ignored => an error
+   not_in_loaded = {k for k in not_in_loaded if should_merge(k)}
+   not_in_inited = {k for k in not_in_inited if should_merge(k)}
+
+   if not_in_loaded or not_in_inited:
+     raise ValueError(
+         pp("Params in checkpoint", loaded_flat.keys()) + "\n" +
+         pp("Params in model (code)", inited_flat.keys()) + "\n" +
+         pp("Params in model (code) but not in checkpoint and not `dont_load`ed",
+            not_in_loaded, indent=" - ") + "\n" +  # Special indent for tests.
+         pp("Params in checkpoint but not in model (code) and not `dont_load`ed",
+            not_in_inited, indent=" + "))  # Special indent for tests.
+
+   return u.recover_tree(merged.keys(), merged.values())
+
+
+ class AddPositionEmbs(nn.Module):
+   """Adds positional embeddings to the inputs, supports caching for decode.
+
+   Attributes:
+     decode: whether to run in single-position autoregressive mode.
+   """
+   decode: bool = False
+
+   @nn.compact
+   def __call__(self, inputs, posemb):
+     """Applies AddPositionEmbs module.
+
+     Adds posemb to the inputs, supports single-position autoregressive mode.
+
+     Args:
+       inputs: input data [batch_size, seq_len, emb_dim].
+       posemb: positional embeddings.
+
+     Returns:
+       output: inputs modulated by pos-embeddings [batch_size, seq_len, emb_dim].
+     """
+     assert inputs.ndim == 3, f"Unexpected inputs shape: {inputs.shape}"
+     _, seq_len, emb_dim = inputs.shape
+     pe = posemb[:, :seq_len, :]
+
+     if self.decode:
+       is_initialized = self.has_variable("cache", "cache_index")
+       # We use a cache position index for tracking decoding position.
+       cache_index = self.variable("cache", "cache_index",
+                                   lambda: jnp.array(0, dtype=jnp.uint32))
+       if is_initialized:
+         i = cache_index.value
+         cache_index.value = i + 1
+         # Returns posemb[0, i, :], the positional embedding for the
+         # current decoding position.
+         pe = jax.lax.dynamic_slice(posemb,
+                                    start_indices=jnp.array((0, i, 0)),
+                                    slice_sizes=(1, 1, emb_dim))
+     return inputs + pe
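A toy sketch of `merge_params` semantics (hypothetical two-leaf pytrees; the regex in `dont_load` keeps the freshly-initialized head while everything else comes from the checkpoint):

```
loaded = {'embedding': {'kernel': 1}, 'head': {'kernel': 2}}
inited = {'embedding': {'kernel': 0}, 'head': {'kernel': 0}}

# 'head/.*' fullmatches the flattened name 'head/kernel', so that leaf is
# ignored from the checkpoint and stays at its init value.
merged = merge_params(loaded, inited, dont_load=('head/.*',))
assert merged == {'embedding': {'kernel': 1}, 'head': {'kernel': 0}}
```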
Tipsomaly/model/big_vision/models/mlp_mixer.py ADDED
@@ -0,0 +1,177 @@
+ # Copyright 2024 Big Vision Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """MLP-Mixer model."""
+
+ from typing import Optional, Tuple
+ from absl import logging
+
+ from big_vision import utils
+ from big_vision.models import common
+
+ import einops
+ import flax.linen as nn
+ import flax.training.checkpoints
+ import jax
+ import jax.numpy as jnp
+
+
+ class MlpBlock(nn.Module):
+   mlp_dim: int
+
+   @nn.compact
+   def __call__(self, x):
+     y = nn.Dense(self.mlp_dim)(x)
+     y = nn.gelu(y)
+     return nn.Dense(x.shape[-1])(y)
+
+
+ class MixerBlock(nn.Module):
+   """Mixer block layer."""
+   tokens_mlp_dim: int
+   channels_mlp_dim: int
+   drop_p: float
+
+   @nn.compact
+   def __call__(self, x, *, train=False):
+     y = nn.LayerNorm()(x)
+     y = jnp.swapaxes(y, 1, 2)
+     y = MlpBlock(self.tokens_mlp_dim, name="token_mixing")(y)
+     y = jnp.swapaxes(y, 1, 2)
+     x = x + y * _stoch_depth_mask(x, self.drop_p, not train, self.make_rng)
+     y = nn.LayerNorm()(x)
+     y = MlpBlock(self.channels_mlp_dim, name="channel_mixing")(y)
+     return x + y * _stoch_depth_mask(x, self.drop_p, not train, self.make_rng)
+
+
+ class MlpMixer(nn.Module):
+   """Mixer architecture."""
+   patch_size: Tuple[int, int]
+   num_classes: Optional[int]
+   num_blocks: int
+   hidden_dim: int
+   tokens_mlp_dim: int
+   channels_mlp_dim: int
+   model_name: Optional[str] = None
+   stoch_depth: float = 0.0
+
+   @nn.compact
+   def __call__(self, image, *, train=False):
+     out = {}
+     x = out["stem"] = nn.Conv(self.hidden_dim, self.patch_size,
+                               strides=self.patch_size, name="stem")(image)
+     x = out["input_tokens"] = einops.rearrange(x, "n h w c -> n (h w) c")
+     for i in range(self.num_blocks):
+       drop_p = (i / max(self.num_blocks - 1, 1)) * self.stoch_depth
+       x = out[f"block_{i}"] = MixerBlock(
+           self.tokens_mlp_dim, self.channels_mlp_dim, drop_p)(x, train=train)
+     x = nn.LayerNorm(name="pre_head_layer_norm")(x)
+     x = out["pre_logits"] = jnp.mean(x, axis=1)
+     if self.num_classes:
+       x = out["logits"] = nn.Dense(
+           self.num_classes, kernel_init=nn.initializers.zeros, name="head")(x)
+     return x, out
+
+
+ def Model(num_classes=None, *, variant=None, **kw):  # pylint: disable=invalid-name
+   """Factory function to easily create a Model variant like "L/16"."""
+
+   if variant is not None:
+     model_size, patch = variant.split("/")
+     kw.setdefault("patch_size", (int(patch), int(patch)))
+     config = {
+         "S": {
+             "hidden_dim": 512,
+             "num_blocks": 8,
+             "channels_mlp_dim": 2048,
+             "tokens_mlp_dim": 256
+         },
+         "B": {
+             "hidden_dim": 768,
+             "num_blocks": 12,
+             "channels_mlp_dim": 3072,
+             "tokens_mlp_dim": 384
+         },
+         "L": {
+             "hidden_dim": 1024,
+             "num_blocks": 24,
+             "channels_mlp_dim": 4096,
+             "tokens_mlp_dim": 512
+         },
+         "H": {
+             "hidden_dim": 1280,
+             "num_blocks": 32,
+             "channels_mlp_dim": 5120,
+             "tokens_mlp_dim": 640
+         },
+     }[model_size]
+
+     for k, v in config.items():
+       kw.setdefault(k, v)
+
+   logging.info("Mixer config: %s", kw)
+   return MlpMixer(num_classes=num_classes, **kw)
+
+
+ def load(init_params, init_file, model_cfg, dont_load=()):
+   """Load checkpoint."""
+
+   del model_cfg
+   # Shortcut names for some canonical paper checkpoints:
+   init_file = {
+       # pylint: disable=line-too-long
+       # Pretrained models from the MLP-Mixer paper: https://arxiv.org/abs/2105.01601.
+       "B-i1k/16": "gs://mixer_models/imagenet1k/Mixer-B_16.npz",
+       "L-i1k/16": "gs://mixer_models/imagenet1k/Mixer-L_16.npz",
+       "B-i21k/16": "gs://mixer_models/imagenet21k/Mixer-B_16.npz",
+       "L-i21k/16": "gs://mixer_models/imagenet21k/Mixer-L_16.npz",
+       # pylint: enable=line-too-long
+   }.get(init_file, init_file)
+   restored_params = utils.load_params(init_file)
+   restored_params = flax.training.checkpoints.convert_pre_linen(restored_params)
+
+   if "Mixer" in restored_params:
+     restored_params["pre_head_layer_norm"] = restored_params["Mixer"].pop(
+         "encoder_norm"
+     )
+     restored_params["stem"] = restored_params.pop("embedding")
+     def unflatten_dense(d):
+       return {
+           "Dense_0": {
+               "bias": d["bias1"].squeeze(),
+               "kernel": d["kernel1"].squeeze(),
+           },
+           "Dense_1": {
+               "bias": d["bias2"].squeeze(),
+               "kernel": d["kernel2"].squeeze(),
+           },
+       }
+     for k, v in restored_params["Mixer"].items():
+       assert k.startswith("encoderblock_"), k
+       v["token_mixing"] = unflatten_dense(v.pop("token_mixing_phase_0"))
+       v["channel_mixing"] = unflatten_dense(v.pop("channel_mixing_phase_0"))
+       restored_params["MixerBlock_" + k[len("encoderblock_"):]] = v
+     del restored_params["Mixer"]
+
+   # possibly use the random init for some of the params (such as the head).
+   restored_params = common.merge_params(restored_params, init_params, dont_load)
+
+   return restored_params
+
+
+ def _stoch_depth_mask(x, drop_p, deterministic, make_rng):
+   if not deterministic and drop_p:
+     shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+     return 1.0 - jax.random.bernoulli(make_rng("dropout"), drop_p, shape)
+   return 1.0
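A short usage sketch of the variant factory above (illustrative; it assumes a 224px input, so the 16x16 stem produces a 14x14 grid, i.e. 196 tokens):

```
import jax
import jax.numpy as jnp

mixer = Model(num_classes=1000, variant="B/16")  # hidden_dim=768, 12 blocks
dummy = jnp.zeros([1, 224, 224, 3])
params = mixer.init(jax.random.PRNGKey(0), dummy)
logits, out = mixer.apply(params, dummy)
print(logits.shape)               # (1, 1000)
print(out["input_tokens"].shape)  # (1, 196, 768)
```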
Tipsomaly/model/big_vision/models/vit.py ADDED
@@ -0,0 +1,505 @@
+ # Copyright 2024 Big Vision Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """A refactored and simplified ViT.
+
+ However, the names of modules are made to match the old ones for easy loading.
+ """
+
+ from typing import Optional, Sequence, Union
+
+ from absl import logging
+ from big_vision import utils
+ from big_vision.models import common
+ import flax
+ import flax.linen as nn
+ import flax.training.checkpoints
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ import scipy.ndimage
+
+
+ def posemb_sincos_2d(h, w, width, temperature=10_000., dtype=jnp.float32):
+   """Follows the MoCo v3 logic."""
+   y, x = jnp.mgrid[:h, :w]
+
+   assert width % 4 == 0, "Width must be mult of 4 for sincos posemb"
+   omega = jnp.arange(width // 4) / (width // 4 - 1)
+   omega = 1. / (temperature**omega)
+   y = jnp.einsum("m,d->md", y.flatten(), omega)
+   x = jnp.einsum("m,d->md", x.flatten(), omega)
+   pe = jnp.concatenate([jnp.sin(x), jnp.cos(x), jnp.sin(y), jnp.cos(y)], axis=1)
+   return jnp.asarray(pe, dtype)[None, :, :]
+
+
+ def get_posemb(self, typ, seqshape, width, name, dtype=jnp.float32):
+   if typ == "learn":
+     return self.param(name, nn.initializers.normal(stddev=1/np.sqrt(width)),
+                       (1, np.prod(seqshape), width), dtype)
+   elif typ == "sincos2d":
+     return posemb_sincos_2d(*seqshape, width, dtype=dtype)
+   else:
+     raise ValueError(f"Unknown posemb type: {typ}")
+
+
+ class MlpBlock(nn.Module):
+   """Transformer MLP / feed-forward block."""
+   mlp_dim: Optional[int] = None  # Defaults to 4x input dim
+   dropout: float = 0.0
+   dtype_mm: str = "float32"
+
+   @nn.compact
+   def __call__(self, x, deterministic=True):
+     """Applies Transformer MlpBlock module."""
+     inits = dict(
+         kernel_init=nn.initializers.xavier_uniform(),
+         bias_init=nn.initializers.normal(stddev=1e-6),
+     )
+
+     d = x.shape[-1]
+     x = nn.Dense(self.mlp_dim or 4 * d, dtype=self.dtype_mm, **inits)(x)
+     # In some extreme batch-size cases, this is needed as of Sept 2024:
+     x = nn.with_logical_constraint(x, ("act_batch", "act_len", "act_emb"))
+     x = nn.gelu(x)
+     x = nn.Dropout(rate=self.dropout)(x, deterministic)
+     x = nn.Dense(d, dtype=self.dtype_mm, **inits)(x)
+     return x
+
+
+ class Encoder1DBlock(nn.Module):
+   """Single transformer encoder block (MHSA + MLP)."""
+   mlp_dim: Optional[int] = None  # Defaults to 4x input dim
+   num_heads: int = 12
+   dropout: float = 0.0
+   dtype_mm: str = "float32"
+
+   @nn.compact
+   def __call__(self, x, deterministic=True):
+     out = {}
+     x = nn.with_logical_constraint(x, ("act_batch", "act_len", "act_emb"))
+     y = nn.LayerNorm()(x)
+     y = out["sa"] = nn.MultiHeadDotProductAttention(
+         num_heads=self.num_heads,
+         kernel_init=nn.initializers.xavier_uniform(),
+         deterministic=deterministic,
+         dtype=self.dtype_mm,
+     )(y, y)
+     y = nn.with_logical_constraint(y, ("act_batch", "act_len", "act_emb"))
+     y = nn.Dropout(rate=self.dropout)(y, deterministic)
+     x = out["+sa"] = x + y
+
+     y = nn.LayerNorm()(x)
+     y = out["mlp"] = MlpBlock(
+         mlp_dim=self.mlp_dim, dropout=self.dropout,
+         dtype_mm=self.dtype_mm,
+     )(y, deterministic)
+     y = nn.with_logical_constraint(y, ("act_batch", "act_len", "act_emb"))
+     y = nn.Dropout(rate=self.dropout)(y, deterministic)
+     x = out["+mlp"] = x + y
+     x = nn.with_logical_constraint(x, ("act_batch", "act_len", "act_emb"))
+     return x, out
+
+
+ class Encoder(nn.Module):
+   """Transformer Model Encoder for sequence to sequence translation."""
+   depth: int
+   mlp_dim: Optional[int] = None  # Defaults to 4x input dim
+   num_heads: int = 12
+   dropout: float = 0.0
+   scan: bool = False
+   remat_policy: str = "nothing_saveable"
+   dtype_mm: str = "float32"
+
+   @nn.compact
+   def __call__(self, x, deterministic=True):
+     out = {}
+
+     if self.scan:
+       block = nn.remat(
+           Encoder1DBlock,
+           prevent_cse=False,
+           static_argnums=(2,),  # 0=self, 2=deterministic
+           policy=getattr(jax.checkpoint_policies, self.remat_policy, None),
+       )
+       x, scan_out = nn.scan(
+           block,
+           variable_axes={"params": 0},
+           split_rngs={"params": True, "dropout": True},
+           in_axes=nn.broadcast,
+           length=self.depth)(
+               name="encoderblock",
+               dtype_mm=self.dtype_mm,
+               mlp_dim=self.mlp_dim,
+               num_heads=self.num_heads,
+               dropout=self.dropout)(x, deterministic)
+       for lyr in range(self.depth):
+         out[f"block{lyr:02d}"] = jax.tree.map(lambda o, l=lyr: o[l], scan_out)
+     else:
+       # Input Encoder
+       for lyr in range(self.depth):
+         block_cur = Encoder1DBlock(
+             name=f"encoderblock_{lyr}",
+             dtype_mm=self.dtype_mm,
+             mlp_dim=self.mlp_dim, num_heads=self.num_heads,
+             dropout=self.dropout)
+         x, out[f"block{lyr:02d}"] = block_cur(x, deterministic)
+       out["pre_ln"] = x  # Alias for last block, but without the number in it.
+
+     return nn.LayerNorm(name="encoder_norm")(x), out
+
+
+ class MAPHead(nn.Module):
+   """Multihead Attention Pooling."""
+   mlp_dim: Optional[int] = None  # Defaults to 4x input dim
+   num_heads: int = 12
+
+   @nn.compact
+   def __call__(self, x):
+     # TODO
+     n, l, d = x.shape  # pylint: disable=unused-variable
+     probe = self.param("probe", nn.initializers.xavier_uniform(),
+                        (1, 1, d), x.dtype)
+     probe = jnp.tile(probe, [n, 1, 1])
+
+     x = nn.MultiHeadDotProductAttention(
+         num_heads=self.num_heads,
+         kernel_init=nn.initializers.xavier_uniform())(probe, x)
+
+     # TODO: dropout on head?
+     y = nn.LayerNorm()(x)
+     x = x + MlpBlock(mlp_dim=self.mlp_dim)(y)
+     return x[:, 0]
+
+
+ class _Model(nn.Module):
+   """ViT model."""
+
+   num_classes: Optional[int] = None
+   patch_size: Sequence[int] = (16, 16)
+   width: int = 768
+   depth: int = 12
+   mlp_dim: Optional[int] = None  # Defaults to 4x input dim
+   num_heads: int = 12
+   posemb: str = "learn"  # Can also be "sincos2d"
+   rep_size: Union[int, bool] = False
+   dropout: float = 0.0
+   pool_type: str = "gap"  # Can also be "map" or "tok"
+   head_zeroinit: bool = True
+   scan: bool = False
+   # or "dots_with_no_batch_dims_saveable" for more speed (memory costly)
+   remat_policy: str = "nothing_saveable"
+   dtype_mm: str = "float32"
+
+   @nn.compact
+   def __call__(self, image, *, train=False):
+     out = {}
+
+     image = jnp.asarray(image, self.dtype_mm)
+
+     # Patch extraction
+     x = out["stem"] = nn.Conv(
+         self.width, self.patch_size, strides=self.patch_size,
+         padding="VALID", name="embedding", dtype=self.dtype_mm)(image)
+
+     n, h, w, c = x.shape
+     x = jnp.reshape(x, [n, h * w, c])
+
+     # Add posemb before adding extra token.
+     x = out["with_posemb"] = x + get_posemb(
+         self, self.posemb, (h, w), c, "pos_embedding", x.dtype)
+
+     if self.pool_type == "tok":
+       cls = self.param("cls", nn.initializers.zeros, (1, 1, c), x.dtype)
+       x = jnp.concatenate([jnp.tile(cls, [n, 1, 1]), x], axis=1)
+
+     n, l, c = x.shape  # pylint: disable=unused-variable
+     x = nn.Dropout(rate=self.dropout)(x, not train)
+
+     x, out["encoder"] = Encoder(
+         depth=self.depth,
+         mlp_dim=self.mlp_dim,
+         num_heads=self.num_heads,
+         dropout=self.dropout,
+         scan=self.scan,
+         remat_policy=self.remat_policy,
+         dtype_mm=self.dtype_mm,
+         name="Transformer")(
+             x, deterministic=not train)
+     encoded = out["encoded"] = x
+
+     if self.pool_type == "map":
+       x = out["head_input"] = MAPHead(
+           num_heads=self.num_heads, mlp_dim=self.mlp_dim)(x)
+     elif self.pool_type == "gap":
+       x = out["head_input"] = jnp.mean(x, axis=1)
+     elif self.pool_type == "0":
+       x = out["head_input"] = x[:, 0]
+     elif self.pool_type == "tok":
+       x = out["head_input"] = x[:, 0]
+       encoded = encoded[:, 1:]
+     elif self.pool_type == "none":
+       pass
+     else:
+       raise ValueError(f"Unknown pool type: '{self.pool_type}'")
+
+     x_2d = jnp.reshape(encoded, [n, h, w, -1])
+
+     if self.rep_size:
+       rep_size = self.width if self.rep_size is True else self.rep_size
+       hid = nn.Dense(rep_size, name="pre_logits")
+       # NOTE: In the past we did not include tanh in pre_logits.
+       # For few-shot, it should not matter much, as it whitens anyways.
+       x_2d = nn.tanh(hid(x_2d))
+       x = nn.tanh(hid(x))
+
+     out["pre_logits_2d"] = x_2d
+     out["pre_logits"] = x
+
+     if self.num_classes:
+       kw = {"kernel_init": nn.initializers.zeros} if self.head_zeroinit else {}
+       head = nn.Dense(self.num_classes, name="head", **kw)
+       x_2d = out["logits_2d"] = head(x_2d)
+       x = out["logits"] = head(x)
+
+     return x, out
+
+
+ def Model(num_classes=None, *, variant=None, **kw):  # pylint: disable=invalid-name
+   """Factory function, because linen really doesn't like what I'm doing!"""
+   return _Model(num_classes, **{**decode_variant(variant), **kw})
+
+
+ def decode_variant(variant):
+   """Converts a string like "B" or "B/32" into a params dict."""
+   if variant is None:
+     return {}
+
+   v, patch = variant, {}
+   if "/" in variant:
+     v, patch = variant.split("/")
+     patch = {"patch_size": (int(patch), int(patch))}
+
+   return {
+       # pylint:disable=line-too-long
+       # Reference: Table 2 of https://arxiv.org/abs/2106.04560.
+       "width": {"mu": 32, "Ti": 192, "S": 384, "M": 512, "B": 768, "L": 1024, "So400m": 1152, "H": 1280, "g": 1408, "g-opt": 1536, "G": 1664, "G-opt": 1536, "e": 1792}[v],
+       "depth": {"mu": 1, "Ti": 12, "S": 12, "M": 12, "B": 12, "L": 24, "So400m": 27, "H": 32, "g": 40, "g-opt": 40, "G": 48, "G-opt": 48, "e": 56}[v],
+       "mlp_dim": {"mu": 128, "Ti": 768, "S": 1536, "M": 2048, "B": 3072, "L": 4096, "So400m": 4304, "H": 5120, "g": 6144, "g-opt": 6144, "G": 8192, "G-opt": 8192, "e": 15360}[v],
+       "num_heads": {"mu": 2, "Ti": 3, "S": 6, "M": 8, "B": 12, "L": 16, "So400m": 16, "H": 16, "g": 16, "g-opt": 16, "G": 16, "G-opt": 16, "e": 16}[v],
+       # pylint:enable=line-too-long
+       **patch
+   }
+
+
+ def resample_posemb(old, new):
+   """This function implements "high-res finetuning" for transformer models."""
+   # Rescale the grid of position embeddings. Param shape is (1,N,1024)
+   if old.shape == new.shape:
+     return old
+
+   logging.info("ViT: resize %s to %s", old.shape, new.shape)
+   gs_old = int(np.sqrt(old.shape[1]))
+   gs_new = int(np.sqrt(new.shape[1]))
+   logging.info("ViT: grid-size from %s to %s", gs_old, gs_new)
+   grid = old.reshape(gs_old, gs_old, -1)
+
+   zoom = (gs_new/gs_old, gs_new/gs_old, 1)
+   grid = scipy.ndimage.zoom(grid, zoom, order=1)
+   grid = grid.reshape(1, gs_new*gs_new, -1)
+   return grid
+
+
+ def fix_old_checkpoints(params):
+   """Fix small bwd incompat that can't be resolved with names in model def."""
+
+   params = flax.core.unfreeze(
+       flax.training.checkpoints.convert_pre_linen(params))
+
+   # Original ViT paper variant had posemb in a module:
+   if "posembed_input" in params["Transformer"]:
+     logging.info("ViT: Loading and fixing VERY old posemb")
+     posemb = params["Transformer"].pop("posembed_input")
+     params["pos_embedding"] = posemb["pos_embedding"]
+
+   # Widely used version before 2022 had posemb in Encoder:
+   if "pos_embedding" in params["Transformer"]:
+     logging.info("ViT: Loading and fixing old posemb")
+     params["pos_embedding"] = params["Transformer"].pop("pos_embedding")
+
+   # Old vit.py used to first concat [cls] token, then add posemb.
+   # This means a B/32@224px would have 7x7+1 posembs. This is useless and clumsy
+   # so we changed to add posemb then concat [cls]. We can recover the old
+   # checkpoint by manually summing [cls] token and its posemb entry.
+   if "pos_embedding" in params:
+     pe = params["pos_embedding"]
+     if int(np.sqrt(pe.shape[1])) ** 2 + 1 == int(pe.shape[1]):
+       logging.info("ViT: Loading and fixing combined cls+posemb")
+       pe_cls, params["pos_embedding"] = pe[:, :1], pe[:, 1:]
+       if "cls" in params:
+         params["cls"] += pe_cls
+
+   # MAP-head variants during ViT-G development had it inlined:
+   if "probe" in params:
+     params["MAPHead_0"] = {
+         k: params.pop(k) for k in
+         ["probe", "MlpBlock_0", "MultiHeadDotProductAttention_0", "LayerNorm_0"]
+     }
+
+   return params
+
+
+ def pyloop_to_scan(params_pyloop):
+   """Converts a python for-loop ViT checkpoint to a lax.scan based one."""
+   # On a high level, they are the same except that the for loop has separate
+   # array pytrees for each encoderblock, while the scan one has just one
+   # encoderblock pytree, with all block's params concatenated.
+
+   params_scan = jax.tree.map(lambda x: x, params_pyloop)  # Structural copy
+   t = params_scan["Transformer"]
+
+   # Find highest index of encoderblocks in the checkpoint (they start at 0):
+   encoderblocks = {k for k in t if k.startswith("encoderblock_")}
+   depth = 1 + max({int(k.split("_")[-1]) for k in encoderblocks})
+
+   def stack(*values):
+     return np.stack(values)
+
+   # Stack all encoderblocks into a single one:
+   t["encoderblock"] = jax.tree.map(
+       stack, *[t[f"encoderblock_{lyr}"] for lyr in range(depth)])
+
+   for lyr in range(depth):
+     del t[f"encoderblock_{lyr}"]
+
+   return params_scan
+
+
+ def scan_to_pyloop(params_scan):
+   """Converts a lax.scan ViT checkpoint to a python for-loop based one."""
+   # See comment in pyloop_to_scan.
+
+   params_scan = jax.tree.map(lambda x: x, params_scan)  # Structural copy
+   t = params_scan["Transformer"]
+
+   # Find out how many encoderblocks there are
+   depth = len(t["encoderblock"]["LayerNorm_0"]["bias"])
+
+   # Create that many encoderblocks, each with their slice of their sub-pytree.
+   for lyr in range(depth):
+     block = jax.tree.map(lambda x, lyr=lyr: x[lyr], t["encoderblock"])
+     t[f"encoderblock_{lyr}"] = block
+
+   del t["encoderblock"]
+   return params_scan
+
+
+ def load(init_params, init_file, model_cfg, dont_load=()):  # pylint: disable=invalid-name because we had to CamelCase above.
+   """Load init from checkpoint, both old model and this one. +Hi-res posemb."""
+   init_file = VANITY_NAMES.get(init_file, init_file)
+   restored_params = utils.load_params(init_file)
+
+   restored_params = fix_old_checkpoints(restored_params)
+
+   # Detect attempts to load non-scan checkpoint into scan model.
+   if (model_cfg.get("scan") and
+       "encoderblock" not in restored_params["Transformer"]):
+     restored_params = pyloop_to_scan(restored_params)
+   if (not model_cfg.get("scan")
+       and "encoderblock" in restored_params["Transformer"]):
+     restored_params = scan_to_pyloop(restored_params)
+
+   # possibly use the random init for some of the params (such as the head).
+   restored_params = common.merge_params(restored_params, init_params, dont_load)
+
+   # resample posemb if needed.
+   # TODO: Take this from model_cfg to avoid need for init_params.
+   if init_params and "pos_embedding" in init_params:
+     restored_params["pos_embedding"] = resample_posemb(
+         old=restored_params["pos_embedding"],
+         new=init_params["pos_embedding"])
+
+   return restored_params
+
+
+ # Shortcut names for some canonical paper checkpoints:
+ VANITY_NAMES = {
+     # pylint: disable=line-too-long
+     # Recommended models from https://arxiv.org/abs/2106.10270
+     # Many more models at https://github.com/google-research/vision_transformer
+     "howto-i21k-Ti/16": "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz",
+     "howto-i21k-S/32": "gs://vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_none-wd_0.1-do_0.0-sd_0.0.npz",
+     "howto-i21k-S/16": "gs://vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz",
+     "howto-i21k-B/32": "gs://vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0.npz",
+     "howto-i21k-B/16": "gs://vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz",
+     "howto-i21k-B/8": "gs://vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npz",
+     "howto-i21k-L/16": "gs://vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0.npz",
+
+     # Better plain vit-s16 baselines from https://arxiv.org/abs/2205.01580
+     "i1k-s16-90ep": "gs://big_vision/vit_s16_i1k_90ep.npz",
+     "i1k-s16-150ep": "gs://big_vision/vit_s16_i1k_150ep.npz",
+     "i1k-s16-300ep": "gs://big_vision/vit_s16_i1k_300ep.npz",
+
+     # DeiT-3 checkpoints from https://github.com/facebookresearch/deit/blob/main/README_revenge.md
+     # First layer converted to take inputs in [-1,1]
+     "deit3_S_224_1k": "gs://big_vision/zoo/deit3/bv_deit_3_small_224_1k.npz",
+     "deit3_S_224_21k": "gs://big_vision/zoo/deit3/bv_deit_3_small_224_21k.npz",
+     "deit3_S_384_1k": "gs://big_vision/zoo/deit3/bv_deit_3_small_384_1k.npz",
+     "deit3_S_384_21k": "gs://big_vision/zoo/deit3/bv_deit_3_small_384_21k.npz",
+     "deit3_B_224_1k": "gs://big_vision/zoo/deit3/bv_deit_3_base_224_1k.npz",
+     "deit3_B_224_21k": "gs://big_vision/zoo/deit3/bv_deit_3_base_224_21k.npz",
+     "deit3_B_384_1k": "gs://big_vision/zoo/deit3/bv_deit_3_base_384_1k.npz",
+     "deit3_B_384_21k": "gs://big_vision/zoo/deit3/bv_deit_3_base_384_21k.npz",
+     "deit3_L_224_1k": "gs://big_vision/zoo/deit3/bv_deit_3_large_224_1k.npz",
+     "deit3_L_224_21k": "gs://big_vision/zoo/deit3/bv_deit_3_large_224_21k.npz",
+     "deit3_L_384_1k": "gs://big_vision/zoo/deit3/bv_deit_3_large_384_1k.npz",
+     "deit3_L_384_21k": "gs://big_vision/zoo/deit3/bv_deit_3_large_384_21k.npz",
+
+     # SigLIP image encoder checkpoints from https://arxiv.org/abs/2303.15343
+     "SigLIP B/16 224": "gs://big_vision/siglip/webli_en_b16_224_63724782.npz:img",
+     "SigLIP B/16 256": "gs://big_vision/siglip/webli_en_b16_256_60500360.npz:img",
+     "SigLIP B/16 384": "gs://big_vision/siglip/webli_en_b16_384_68578854.npz:img",
+     "SigLIP B/16 512": "gs://big_vision/siglip/webli_en_b16_512_68580893.npz:img",
+     "SigLIP L/16 256": "gs://big_vision/siglip/webli_en_l16_256_60552751.npz:img",
+     "SigLIP L/16 384": "gs://big_vision/siglip/webli_en_l16_384_63634585.npz:img",
+     "SigLIP So400m/14 224": "gs://big_vision/siglip/webli_en_so400m_224_57633886.npz:img",
+     "SigLIP So400m/14 384": "gs://big_vision/siglip/webli_en_so400m_384_58765454.npz:img",
+     "SigLIP B/16-i18n 256": "gs://big_vision/siglip/webli_i18n_b16_256_66117334.npz:img",
+
+     # SigLIP 2 image encoder checkpoints from https://arxiv.org/abs/2502.14786
+     "SigLIP2 B/16 224": "gs://big_vision/siglip2/siglip2_b16_224.npz:img",
+     "SigLIP2 B/16 256": "gs://big_vision/siglip2/siglip2_b16_256.npz:img",
+     "SigLIP2 B/16 384": "gs://big_vision/siglip2/siglip2_b16_384.npz:img",
+     "SigLIP2 B/16 512": "gs://big_vision/siglip2/siglip2_b16_512.npz:img",
+     "SigLIP2 B/32 256": "gs://big_vision/siglip2/siglip2_b32_256.npz:img",
+     "SigLIP2 L/16 256": "gs://big_vision/siglip2/siglip2_l16_256.npz:img",
+     "SigLIP2 L/16 384": "gs://big_vision/siglip2/siglip2_l16_384.npz:img",
+     "SigLIP2 L/16 512": "gs://big_vision/siglip2/siglip2_l16_512.npz:img",
+     "SigLIP2 So400m/14 224": "gs://big_vision/siglip2/siglip2_so400m14_224.npz:img",
+     "SigLIP2 So400m/14 384": "gs://big_vision/siglip2/siglip2_so400m14_384.npz:img",
+     "SigLIP2 So400m/16 256": "gs://big_vision/siglip2/siglip2_so400m16_256.npz:img",
+     "SigLIP2 So400m/16 384": "gs://big_vision/siglip2/siglip2_so400m16_384.npz:img",
+     "SigLIP2 So400m/16 512": "gs://big_vision/siglip2/siglip2_so400m16_512.npz:img",
+     "SigLIP2 g-opt/16 256": "gs://big_vision/siglip2/siglip2_g-opt16_256.npz:img",
+     "SigLIP2 g-opt/16 384": "gs://big_vision/siglip2/siglip2_g-opt16_384.npz:img",
+     # SigLIP 2 NaFlex image encoder checkpoints.
+     # These need `proj.image_text.naflex_vit.py` as the image encoder model
+     # and a non-standard preprocessing, see configs/proj/image_text/README_siglip2.md.
+     "SigLIP2 B/16 NaFlex": "gs://big_vision/siglip2/siglip2_b16_naflex.npz:img",
+     "SigLIP2 So400m/16 NaFlex": "gs://big_vision/siglip2/siglip2_so400m16_naflex.npz:img",
+     # pylint: enable=line-too-long
+ }
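A brief sketch of how `decode_variant` and the `Model` factory above fit together (illustrative only; `sincos2d` requires the width to be a multiple of 4, which holds for every table entry):

```
import jax
import jax.numpy as jnp

print(decode_variant("B/16"))
# {'width': 768, 'depth': 12, 'mlp_dim': 3072, 'num_heads': 12,
#  'patch_size': (16, 16)}

vit = Model(num_classes=10, variant="S/16", posemb="sincos2d")
dummy = jnp.zeros([1, 224, 224, 3])
params = vit.init(jax.random.PRNGKey(0), dummy)
logits, _ = vit.apply(params, dummy)
print(logits.shape)  # (1, 10)
```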
Tipsomaly/model/big_vision/pp/__init__.py ADDED
File without changes
Tipsomaly/model/big_vision/pp/autoaugment.py ADDED
@@ -0,0 +1,700 @@
1
+ # Copyright 2023 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """AutoAugment and RandAugment policies for enhanced image preprocessing.
16
+
17
+ AutoAugment Reference: https://arxiv.org/abs/1805.09501
18
+ RandAugment Reference: https://arxiv.org/abs/1909.13719
19
+
20
+ This code is forked from
21
+ https://github.com/tensorflow/tpu/blob/11d0db15cf1c3667f6e36fecffa111399e008acd/models/official/efficientnet/autoaugment.py
22
+ """
23
+
24
+ from __future__ import absolute_import
25
+ from __future__ import division
26
+ from __future__ import print_function
27
+
28
+ import dataclasses
29
+ import inspect
30
+ import math
31
+ import tensorflow.compat.v1 as tf
32
+ from tensorflow_addons import image as contrib_image
33
+
34
+ # This signifies the max integer that the controller RNN could predict for the
35
+ # augmentation scheme.
36
+ _MAX_LEVEL = 10.
37
+
38
+
39
+ @dataclasses.dataclass
40
+ class HParams:
41
+ """Parameters for AutoAugment and RandAugment."""
42
+ cutout_const: int
43
+ translate_const: int
44
+
45
+
46
+ def policy_v0():
47
+ """Autoaugment policy that was used in AutoAugment Paper."""
48
+ # Each tuple is an augmentation operation of the form
49
+ # (operation, probability, magnitude). Each element in policy is a
50
+ # sub-policy that will be applied sequentially on the image.
51
+ policy = [
52
+ [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
53
+ [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
54
+ [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
55
+ [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
56
+ [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
57
+ [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
58
+ [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
59
+ [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
60
+ [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
61
+ [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
62
+ [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
63
+ [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
64
+ [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
65
+ [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
66
+ [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
67
+ [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
68
+ [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
69
+ [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
70
+ [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
71
+ [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
72
+ [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
73
+ [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
74
+ [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
75
+ [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
76
+ [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
77
+ ]
78
+ return policy
79
+
80
+
81
+ def policy_vtest():
82
+ """Autoaugment test policy for debugging."""
83
+ # Each tuple is an augmentation operation of the form
84
+ # (operation, probability, magnitude). Each element in policy is a
85
+ # sub-policy that will be applied sequentially on the image.
86
+ policy = [
87
+ [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
88
+ ]
89
+ return policy
90
+
91
+
92
+ def blend(image1, image2, factor):
93
+ """Blend image1 and image2 using 'factor'.
94
+ Factor can be above 0.0. A value of 0.0 means only image1 is used.
95
+ A value of 1.0 means only image2 is used. A value between 0.0 and
96
+ 1.0 means we linearly interpolate the pixel values between the two
97
+ images. A value greater than 1.0 "extrapolates" the difference
98
+ between the two pixel values, and we clip the results to values
99
+ between 0 and 255.
100
+ Args:
101
+ image1: An image Tensor of type uint8.
102
+ image2: An image Tensor of type uint8.
103
+ factor: A floating point value above 0.0.
104
+ Returns:
105
+ A blended image Tensor of type uint8.
106
+ """
107
+ if factor == 0.0:
108
+ return tf.convert_to_tensor(image1)
109
+ if factor == 1.0:
110
+ return tf.convert_to_tensor(image2)
111
+
112
+ image1 = tf.to_float(image1)
113
+ image2 = tf.to_float(image2)
114
+
115
+ difference = image2 - image1
116
+ scaled = factor * difference
117
+
118
+ # Do addition in float.
119
+ temp = tf.to_float(image1) + scaled
120
+
121
+ # Interpolate
122
+ if factor > 0.0 and factor < 1.0:
123
+ # Interpolation means we always stay within 0 and 255.
124
+ return tf.cast(temp, tf.uint8)
125
+
126
+ # Extrapolate:
127
+ #
128
+ # We need to clip and then cast.
129
+ return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
130
+
131
+
132
+ def cutout(image, pad_size, replace=0):
133
+ """Apply cutout (https://arxiv.org/abs/1708.04552) to image.
134
+ This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
135
+ a random location within `image`. The pixel values filled in will be of the
136
+ value `replace`. The location where the mask will be applied is randomly
137
+ chosen uniformly over the whole image.
138
+ Args:
139
+ image: An image Tensor of type uint8.
140
+ pad_size: Specifies how big the zero mask that will be generated is that
141
+ is applied to the image. The mask will be of size
142
+ (2*pad_size x 2*pad_size).
143
+ replace: What pixel value to fill in the image in the area that has
144
+ the cutout mask applied to it.
145
+ Returns:
146
+ An image Tensor that is of type uint8.
147
+ """
148
+ image_height = tf.shape(image)[0]
149
+ image_width = tf.shape(image)[1]
150
+
151
+ # Sample the center location in the image where the zero mask will be applied.
152
+ cutout_center_height = tf.random_uniform(
153
+ shape=[], minval=0, maxval=image_height,
154
+ dtype=tf.int32)
155
+
156
+ cutout_center_width = tf.random_uniform(
157
+ shape=[], minval=0, maxval=image_width,
158
+ dtype=tf.int32)
159
+
160
+ lower_pad = tf.maximum(0, cutout_center_height - pad_size)
161
+ upper_pad = tf.maximum(0, image_height - cutout_center_height - pad_size)
162
+ left_pad = tf.maximum(0, cutout_center_width - pad_size)
163
+ right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size)
164
+
165
+ cutout_shape = [image_height - (lower_pad + upper_pad),
166
+ image_width - (left_pad + right_pad)]
167
+ padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
168
+ mask = tf.pad(
169
+ tf.zeros(cutout_shape, dtype=image.dtype),
170
+ padding_dims, constant_values=1)
171
+ mask = tf.expand_dims(mask, -1)
172
+ mask = tf.tile(mask, [1, 1, 3])
173
+ image = tf.where(
174
+ tf.equal(mask, 0),
175
+ tf.ones_like(image, dtype=image.dtype) * replace,
176
+ image)
177
+ return image
178
+
179
+
180
+ def solarize(image, threshold=128):
181
+ # For each pixel in the image, select the pixel
182
+ # if the value is less than the threshold.
183
+ # Otherwise, subtract 255 from the pixel.
184
+ return tf.where(image < threshold, image, 255 - image)
185
+
186
+
187
+ def solarize_add(image, addition=0, threshold=128):
188
+ # For each pixel in the image less than threshold
189
+ # we add 'addition' amount to it and then clip the
190
+ # pixel value to be between 0 and 255. The value
191
+ # of 'addition' is between -128 and 128.
192
+ added_image = tf.cast(image, tf.int64) + addition
193
+ added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
194
+ return tf.where(image < threshold, added_image, image)
195
+
196
+
197
+ def color(image, factor):
198
+ """Equivalent of PIL Color."""
199
+ degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
200
+ return blend(degenerate, image, factor)
201
+
202
+
203
+ def contrast(image, factor):
204
+ """Equivalent of PIL Contrast."""
205
+ degenerate = tf.image.rgb_to_grayscale(image)
206
+ # Cast before calling tf.histogram.
207
+ degenerate = tf.cast(degenerate, tf.int32)
208
+
209
+ # Compute the grayscale histogram, then compute the mean pixel value,
210
+ # and create a constant image of that mean value. Use that as the
211
+ # blending degenerate target of the original image.
212
+ hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
213
+ mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0  # == num_pixels / 256, as in the original implementation.
214
+ degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
215
+ degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
216
+ degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
217
+ return blend(degenerate, image, factor)
218
+
219
+
220
+ def brightness(image, factor):
221
+ """Equivalent of PIL Brightness."""
222
+ degenerate = tf.zeros_like(image)
223
+ return blend(degenerate, image, factor)
224
+
225
+
226
+ def posterize(image, bits):
227
+ """Equivalent of PIL Posterize."""
228
+ shift = 8 - bits
229
+ return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
230
+
231
+
232
+ def rotate(image, degrees, replace):
233
+ """Rotates the image by degrees either clockwise or counterclockwise.
234
+ Args:
235
+ image: An image Tensor of type uint8.
236
+ degrees: Float, a scalar angle in degrees to rotate all images by. If
237
+ degrees is positive the image will be rotated clockwise otherwise it will
238
+ be rotated counterclockwise.
239
+ replace: A one or three value 1D tensor to fill empty pixels caused by
240
+ the rotate operation.
241
+ Returns:
242
+ The rotated version of image.
243
+ """
244
+ # Convert from degrees to radians.
245
+ degrees_to_radians = math.pi / 180.0
246
+ radians = degrees * degrees_to_radians
247
+
248
+ # In practice, we should randomize the rotation degrees by flipping
249
+ # it negatively half the time, but that's done on 'degrees' outside
250
+ # of the function.
251
+ image = contrib_image.rotate(wrap(image), radians)
252
+ return unwrap(image, replace)
253
+
254
+
255
+ def translate_x(image, pixels, replace):
256
+ """Equivalent of PIL Translate in X dimension."""
257
+ image = contrib_image.translate(wrap(image), [-pixels, 0])
258
+ return unwrap(image, replace)
259
+
260
+
261
+ def translate_y(image, pixels, replace):
262
+ """Equivalent of PIL Translate in Y dimension."""
263
+ image = contrib_image.translate(wrap(image), [0, -pixels])
264
+ return unwrap(image, replace)
265
+
266
+
267
+ def shear_x(image, level, replace):
268
+ """Equivalent of PIL Shearing in X dimension."""
269
+ # Shear parallel to x axis is a projective transform
270
+ # with a matrix form of:
271
+ # [1 level
272
+ # 0 1].
273
+ image = contrib_image.transform(
274
+ wrap(image), [1., level, 0., 0., 1., 0., 0., 0.])
275
+ return unwrap(image, replace)
276
+
277
+
278
+ def shear_y(image, level, replace):
279
+ """Equivalent of PIL Shearing in Y dimension."""
280
+ # Shear parallel to y axis is a projective transform
281
+ # with a matrix form of:
282
+ # [1 0
283
+ # level 1].
284
+ image = contrib_image.transform(
285
+ wrap(image), [1., 0., 0., level, 1., 0., 0., 0.])
286
+ return unwrap(image, replace)
287
+
288
+
289
+ def autocontrast(image):
290
+ """Implements Autocontrast function from PIL using TF ops.
291
+ Args:
292
+ image: A 3D uint8 tensor.
293
+ Returns:
294
+ The image after it has had autocontrast applied to it and will be of type
295
+ uint8.
296
+ """
297
+
298
+ def scale_channel(image):
299
+ """Scale the 2D image using the autocontrast rule."""
300
+ # A possibly cheaper version can be done using cumsum/unique_with_counts
301
+ # over the histogram values, rather than iterating over the entire image
302
+ # to compute mins and maxes.
303
+ lo = tf.to_float(tf.reduce_min(image))
304
+ hi = tf.to_float(tf.reduce_max(image))
305
+
306
+ # Scale the image, making the lowest value 0 and the highest value 255.
307
+ def scale_values(im):
308
+ scale = 255.0 / (hi - lo)
309
+ offset = -lo * scale
310
+ im = tf.to_float(im) * scale + offset
311
+ im = tf.clip_by_value(im, 0.0, 255.0)
312
+ return tf.cast(im, tf.uint8)
313
+
314
+ result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
315
+ return result
316
+
317
+ # Assumes RGB for now. Scales each channel independently
318
+ # and then stacks the result.
319
+ s1 = scale_channel(image[:, :, 0])
320
+ s2 = scale_channel(image[:, :, 1])
321
+ s3 = scale_channel(image[:, :, 2])
322
+ image = tf.stack([s1, s2, s3], 2)
323
+ return image
324
+
325
+
326
+ def sharpness(image, factor):
327
+ """Implements Sharpness function from PIL using TF ops."""
328
+ orig_image = image
329
+ image = tf.cast(image, tf.float32)
330
+ # Make image 4D for conv operation.
331
+ image = tf.expand_dims(image, 0)
332
+ # SMOOTH PIL Kernel.
333
+ kernel = tf.constant(
334
+ [[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=tf.float32,
335
+ shape=[3, 3, 1, 1]) / 13.
336
+ # Tile across channel dimension.
337
+ kernel = tf.tile(kernel, [1, 1, 3, 1])
338
+ strides = [1, 1, 1, 1]
339
+ with tf.device('/cpu:0'):
340
+ # Some augmentation that uses depth-wise conv will cause crashing when
341
+ # training on GPU. See ((internal link)) for details.
342
+ degenerate = tf.nn.depthwise_conv2d(
343
+ image, kernel, strides, padding='VALID', rate=[1, 1])
344
+ degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
345
+ degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])
346
+
347
+ # For the borders of the resulting image, fill in the values of the
348
+ # original image.
349
+ mask = tf.ones_like(degenerate)
350
+ padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]])
351
+ padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]])
352
+ result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)
353
+
354
+ # Blend the final result.
355
+ return blend(result, orig_image, factor)
356
+
357
+
358
+ def equalize(image):
359
+ """Implements Equalize function from PIL using TF ops."""
360
+ def scale_channel(im, c):
361
+ """Scale the data in the channel to implement equalize."""
362
+ im = tf.cast(im[:, :, c], tf.int32)
363
+ # Compute the histogram of the image channel.
364
+ histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)
365
+
366
+ # For the purposes of computing the step, filter out the nonzeros.
367
+ nonzero = tf.where(tf.not_equal(histo, 0))
368
+ nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
369
+ step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255
370
+
371
+ def build_lut(histo, step):
372
+ # Compute the cumulative sum, shifting by step // 2
373
+ # and then normalization by step.
374
+ lut = (tf.cumsum(histo) + (step // 2)) // step
375
+ # Shift lut, prepending with 0.
376
+ lut = tf.concat([[0], lut[:-1]], 0)
377
+ # Clip the counts to be in range. This is done
378
+ # in the C code for image.point.
379
+ return tf.clip_by_value(lut, 0, 255)
380
+
381
+ # If step is zero, return the original image. Otherwise, build
382
+ # lut from the full histogram and step and then index from it.
383
+ result = tf.cond(tf.equal(step, 0),
384
+ lambda: im,
385
+ lambda: tf.gather(build_lut(histo, step), im))
386
+
387
+ return tf.cast(result, tf.uint8)
388
+
389
+ # Assumes RGB for now. Scales each channel independently
390
+ # and then stacks the result.
391
+ s1 = scale_channel(image, 0)
392
+ s2 = scale_channel(image, 1)
393
+ s3 = scale_channel(image, 2)
394
+ image = tf.stack([s1, s2, s3], 2)
395
+ return image
396
+
397
+
398
+ def invert(image):
399
+ """Inverts the image pixels."""
400
+ image = tf.convert_to_tensor(image)
401
+ return 255 - image
402
+
403
+
404
+ def wrap(image):
405
+ """Returns 'image' with an extra channel set to all 1s."""
406
+ shape = tf.shape(image)
407
+ extended_channel = tf.ones([shape[0], shape[1], 1], image.dtype)
408
+ extended = tf.concat([image, extended_channel], 2)
409
+ return extended
410
+
411
+
412
+ def unwrap(image, replace):
413
+ """Unwraps an image produced by wrap.
414
+ Where there is a 0 in the last channel for every spatial position,
415
+ the rest of the three channels in that spatial position are grayed
416
+ (set to 128). Operations like translate and shear on a wrapped
417
+ Tensor will leave 0s in empty locations. Some transformations look
418
+ at the intensity of values to do preprocessing, and we want these
419
+ empty pixels to assume the 'average' value, rather than pure black.
420
+ Args:
421
+ image: A 3D Image Tensor with 4 channels.
422
+ replace: A one or three value 1D tensor to fill empty pixels.
423
+ Returns:
424
+ image: A 3D image Tensor with 3 channels.
425
+ """
426
+ image_shape = tf.shape(image)
427
+ # Flatten the spatial dimensions.
428
+ flattened_image = tf.reshape(image, [-1, image_shape[2]])
429
+
430
+ # Find all pixels where the last channel is zero.
431
+ alpha_channel = flattened_image[:, 3]
432
+
433
+ replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)
434
+
435
+ # Where they are zero, fill them in with 'replace'.
436
+ flattened_image = tf.where(
437
+ tf.equal(alpha_channel, 0),
438
+ tf.ones_like(flattened_image, dtype=image.dtype) * replace,
439
+ flattened_image)
440
+
441
+ image = tf.reshape(flattened_image, image_shape)
442
+ image = tf.slice(image, [0, 0, 0], [image_shape[0], image_shape[1], 3])
443
+ return image
444
+
445
+
446
+ NAME_TO_FUNC = {
447
+ 'AutoContrast': autocontrast,
448
+ 'Equalize': equalize,
449
+ 'Invert': invert,
450
+ 'Rotate': rotate,
451
+ 'Posterize': posterize,
452
+ 'Solarize': solarize,
453
+ 'SolarizeAdd': solarize_add,
454
+ 'Color': color,
455
+ 'Contrast': contrast,
456
+ 'Brightness': brightness,
457
+ 'Sharpness': sharpness,
458
+ 'ShearX': shear_x,
459
+ 'ShearY': shear_y,
460
+ 'TranslateX': translate_x,
461
+ 'TranslateY': translate_y,
462
+ 'Cutout': cutout,
463
+ }
464
+
465
+
466
+ def _randomly_negate_tensor(tensor):
467
+ """With 50% prob turn the tensor negative."""
468
+ should_flip = tf.cast(tf.floor(tf.random_uniform([]) + 0.5), tf.bool)
469
+ final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
470
+ return final_tensor
471
+
472
+
473
+ def _rotate_level_to_arg(level):
474
+ level = (level/_MAX_LEVEL) * 30.
475
+ level = _randomly_negate_tensor(level)
476
+ return (level,)
477
+
478
+
479
+ def _shrink_level_to_arg(level):
480
+ """Converts level to ratio by which we shrink the image content."""
481
+ if level == 0:
482
+ return (1.0,) # if level is zero, do not shrink the image
483
+ # Maximum shrinking ratio is 2.9.
484
+ level = 2. / (_MAX_LEVEL / level) + 0.9
485
+ return (level,)
486
+
487
+
488
+ def _enhance_level_to_arg(level):
489
+ return ((level/_MAX_LEVEL) * 1.8 + 0.1,)
490
+
491
+
492
+ def _shear_level_to_arg(level):
493
+ level = (level/_MAX_LEVEL) * 0.3
494
+ # Flip level to negative with 50% chance.
495
+ level = _randomly_negate_tensor(level)
496
+ return (level,)
497
+
498
+
499
+ def _translate_level_to_arg(level, translate_const):
500
+ level = (level/_MAX_LEVEL) * float(translate_const)
501
+ # Flip level to negative with 50% chance.
502
+ level = _randomly_negate_tensor(level)
503
+ return (level,)
504
+
505
+
506
+ def level_to_arg(hparams):
507
+ return {
508
+ 'AutoContrast': lambda level: (),
509
+ 'Equalize': lambda level: (),
510
+ 'Invert': lambda level: (),
511
+ 'Rotate': _rotate_level_to_arg,
512
+ 'Posterize': lambda level: (int((level/_MAX_LEVEL) * 4),),
513
+ 'Solarize': lambda level: (int((level/_MAX_LEVEL) * 256),),
514
+ 'SolarizeAdd': lambda level: (int((level/_MAX_LEVEL) * 110),),
515
+ 'Color': _enhance_level_to_arg,
516
+ 'Contrast': _enhance_level_to_arg,
517
+ 'Brightness': _enhance_level_to_arg,
518
+ 'Sharpness': _enhance_level_to_arg,
519
+ 'ShearX': _shear_level_to_arg,
520
+ 'ShearY': _shear_level_to_arg,
521
+ 'Cutout': lambda level: (int((level/_MAX_LEVEL) * hparams.cutout_const),),
522
+ 'TranslateX': lambda level: _translate_level_to_arg(
523
+ level, hparams.translate_const),
524
+ 'TranslateY': lambda level: _translate_level_to_arg(
525
+ level, hparams.translate_const),
526
+ # pylint:enable=g-long-lambda
527
+ }
528
+
529
+
530
+ def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams):
531
+ """Return the function that corresponds to `name` and update `level` param."""
532
+ func = NAME_TO_FUNC[name]
533
+ args = level_to_arg(augmentation_hparams)[name](level)
534
+
535
+ # Check to see if prob is passed into function. This is used for operations
536
+ # where we alter bboxes independently.
537
+ # pytype:disable=wrong-arg-types
538
+ if 'prob' in inspect.getfullargspec(func).args:
539
+ args = tuple([prob] + list(args))
540
+ # pytype:enable=wrong-arg-types
541
+
542
+ # Add in replace arg if it is required for the function that is being called.
543
+ # pytype:disable=wrong-arg-types
544
+ if 'replace' in inspect.getfullargspec(func).args:
545
+ # Make sure replace is the final argument
546
+ assert 'replace' == inspect.getfullargspec(func).args[-1]
547
+ args = tuple(list(args) + [replace_value])
548
+ # pytype:enable=wrong-arg-types
549
+
550
+ return (func, prob, args)
551
+
552
+
553
+ def _apply_func_with_prob(func, image, args, prob):
554
+ """Apply `func` to image w/ `args` as input with probability `prob`."""
555
+ assert isinstance(args, tuple)
556
+
557
+ # If prob is a function argument, then this randomness is being handled
558
+ # inside the function, so make sure it is always called.
559
+ # pytype:disable=wrong-arg-types
560
+ if 'prob' in inspect.getfullargspec(func).args:
561
+ prob = 1.0
562
+ # pytype:enable=wrong-arg-types
563
+
564
+ # Apply the function with probability `prob`.
565
+ should_apply_op = tf.cast(
566
+ tf.floor(tf.random_uniform([], dtype=tf.float32) + prob), tf.bool)
567
+ augmented_image = tf.cond(
568
+ should_apply_op,
569
+ lambda: func(image, *args),
570
+ lambda: image)
571
+ return augmented_image
572
+
573
+
574
+ def select_and_apply_random_policy(policies, image):
575
+ """Select a random policy from `policies` and apply it to `image`."""
576
+ policy_to_select = tf.random_uniform([], maxval=len(policies), dtype=tf.int32)
577
+ # Note that using tf.case instead of tf.conds would result in significantly
578
+ # larger graphs and would even break export for some larger policies.
579
+ for (i, policy) in enumerate(policies):
580
+ image = tf.cond(
581
+ tf.equal(i, policy_to_select),
582
+ lambda selected_policy=policy: selected_policy(image),
583
+ lambda: image)
584
+ return image
585
+
586
+
587
+ def build_and_apply_nas_policy(policies, image,
588
+ augmentation_hparams):
589
+ """Build a policy from the given policies passed in and apply to image.
590
+ Args:
591
+ policies: list of lists of tuples in the form `(func, prob, level)`, `func`
592
+ is a string name of the augmentation function, `prob` is the probability
593
+ of applying the `func` operation, `level` is the input argument for
594
+ `func`.
595
+ image: tf.Tensor that the resulting policy will be applied to.
596
+ augmentation_hparams: Hparams associated with the NAS learned policy.
597
+ Returns:
598
+ A version of image that now has data augmentation applied to it based on
599
+ the `policies` pass into the function.
600
+ """
601
+ replace_value = [128, 128, 128]
602
+
603
+ # func is the string name of the augmentation function, prob is the
604
+ # probability of applying the operation and level is the parameter associated
605
+ # with the tf op.
606
+
607
+ # tf_policies are functions that take in an image and return an augmented
608
+ # image.
609
+ tf_policies = []
610
+ for policy in policies:
611
+ tf_policy = []
612
+ # Link string name to the correct python function and make sure the correct
613
+ # argument is passed into that function.
614
+ for policy_info in policy:
615
+ policy_info = list(policy_info) + [replace_value, augmentation_hparams]
616
+
617
+ tf_policy.append(_parse_policy_info(*policy_info))
618
+ # Now build the tf policy that will apply the augmentation procedure
619
+ # on image.
620
+ def make_final_policy(tf_policy_):
621
+ def final_policy(image_):
622
+ for func, prob, args in tf_policy_:
623
+ image_ = _apply_func_with_prob(
624
+ func, image_, args, prob)
625
+ return image_
626
+ return final_policy
627
+ tf_policies.append(make_final_policy(tf_policy))
628
+
629
+ augmented_image = select_and_apply_random_policy(
630
+ tf_policies, image)
631
+ return augmented_image
632
+
633
+
634
+ def distort_image_with_autoaugment(image, augmentation_name):
635
+ """Applies the AutoAugment policy to `image`.
636
+ AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
637
+ Args:
638
+ image: `Tensor` of shape [height, width, 3] representing an image.
639
+ augmentation_name: The name of the AutoAugment policy to use. The available
640
+ options are `v0` and `test`. `v0` is the policy used for
641
+ all of the results in the paper and was found to achieve the best results
642
+ on the COCO dataset. (The original codebase also provides `v1`, `v2` and
643
+ `v3` policies with slight variations in which operations were used during
644
+ the search procedure and in how many operations are applied in parallel
645
+ to a single image; those policies are not included in this fork.)
646
+ Returns:
647
+ The augmented version of `image`.
648
+ """
649
+ available_policies = {'v0': policy_v0,
650
+ 'test': policy_vtest}
651
+ if augmentation_name not in available_policies:
652
+ raise ValueError('Invalid augmentation_name: {}'.format(augmentation_name))
653
+
654
+ policy = available_policies[augmentation_name]()
655
+ # Hparams that will be used for AutoAugment.
656
+ augmentation_hparams = HParams(
657
+ cutout_const=100, translate_const=250)
658
+
659
+ return build_and_apply_nas_policy(policy, image, augmentation_hparams)
660
+
661
+
662
+ def distort_image_with_randaugment(image, num_layers, magnitude):
663
+ """Applies the RandAugment policy to `image`.
664
+ RandAugment is from the paper https://arxiv.org/abs/1909.13719.
665
+ Args:
666
+ image: `Tensor` of shape [height, width, 3] representing an image.
667
+ num_layers: Integer, the number of augmentation transformations to apply
668
+ sequentially to an image. Represented as (N) in the paper. Usually best
669
+ values will be in the range [1, 3].
670
+ magnitude: Integer, shared magnitude across all augmentation operations.
671
+ Represented as (M) in the paper. Usually best values are in the range
672
+ [5, 30].
673
+ Returns:
674
+ The augmented version of `image`.
675
+ """
676
+ replace_value = [128] * 3
677
+ tf.logging.info('Using RandAug.')
678
+ augmentation_hparams = HParams(
679
+ cutout_const=40, translate_const=100)
680
+ available_ops = [
681
+ 'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize',
682
+ 'Solarize', 'Color', 'Contrast', 'Brightness', 'Sharpness',
683
+ 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd']
684
+
685
+ for layer_num in range(num_layers):
686
+ op_to_select = tf.random_uniform(
687
+ [], maxval=len(available_ops), dtype=tf.int32)
688
+ random_magnitude = float(magnitude)
689
+ with tf.name_scope('randaug_layer_{}'.format(layer_num)):
690
+ for (i, op_name) in enumerate(available_ops):
691
+ prob = tf.random_uniform([], minval=0.2, maxval=0.8, dtype=tf.float32)
692
+ func, _, args = _parse_policy_info(op_name, prob, random_magnitude,
693
+ replace_value, augmentation_hparams)
694
+ image = tf.cond(
695
+ tf.equal(i, op_to_select),
696
+ lambda selected_func=func, selected_args=args: selected_func(
697
+ image, *selected_args),
698
+ # pylint:enable=g-long-lambda
699
+ lambda: image)
700
+ return image
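
A minimal usage sketch for the two entry points above, assuming the module is importable as `big_vision.pp.autoaugment` (per the repository layout) and given a decoded uint8 [H, W, 3] image tensor:

```python
import tensorflow.compat.v1 as tf
from big_vision.pp.autoaugment import (
    distort_image_with_autoaugment, distort_image_with_randaugment)

# Stand-in for a real decoded image; any uint8 [H, W, 3] tensor works.
image = tf.zeros([224, 224, 3], dtype=tf.uint8)

# AutoAugment: applies one randomly selected sub-policy of `v0` per call.
aug_auto = distort_image_with_autoaugment(image, 'v0')

# RandAugment: N=2 random ops applied sequentially at magnitude M=10.
aug_rand = distort_image_with_randaugment(image, num_layers=2, magnitude=10)
```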
Tipsomaly/model/big_vision/pp/builder.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Preprocessing builder."""
16
+
17
+ from absl import logging
18
+ from big_vision.pp import registry
19
+ import tensorflow as tf
20
+
21
+
22
+ def get_preprocess_fn(pp_pipeline, log_data=True, log_steps=False):
23
+ """Transform an input string into the preprocessing function.
24
+
25
+ The minilanguage is as follows:
26
+
27
+ fn1|fn2(arg, arg2,...)|...
28
+
29
+ And describes the successive application of the various `fn`s to the input,
30
+ where each function can optionally have one or more arguments, which are
31
+ either positional or key/value, as dictated by the `fn`.
32
+
33
+ The output preprocessing function expects a dictionary as input. This
34
+ dictionary should have a key "image" that corresponds to a 3D tensor
35
+ (height x width x channel).
36
+
37
+ Args:
38
+ pp_pipeline: A string describing the pre-processing pipeline. If empty or
39
+ None, no preprocessing will be executed.
40
+ log_data: Whether to log the data before and after preprocessing. Can also
41
+ be a string to show in the log for debugging, for example dataset name.
42
+ log_steps: Whether to log the steps of the preprocessing pipeline.
43
+
44
+ Returns:
45
+ preprocessing function.
46
+
47
+ Raises:
48
+ ValueError: if preprocessing function name is unknown
49
+ """
50
+
51
+ names, ops, spec_strings = [], [], []
52
+ if pp_pipeline:
53
+ for op_spec in pp_pipeline.split("|"):
54
+ if not op_spec: continue # Skip empty section instead of error.
55
+ try:
56
+ ops.append(registry.Registry.lookup(f"preprocess_ops.{op_spec}")())
57
+ names.append(registry.parse_name(op_spec)[0])
58
+ spec_strings.append(op_spec)
59
+ except SyntaxError as err:
60
+ raise ValueError(f"Syntax error on: {op_spec}") from err
61
+
62
+ def _preprocess_fn(data):
63
+ """The preprocessing function that is returned."""
64
+ nonlocal log_data, log_steps
65
+
66
+ # Apply all the individual steps in sequence.
67
+ if log_data:
68
+ logging.info("Data before pre-processing (%s):\n%s", log_data, data)
69
+ for name, op, spec in zip(names, ops, spec_strings):
70
+ if log_steps:
71
+ logging.info("Pre-processing step (%s): %s\n%s", name, spec, data)
72
+ with tf.name_scope(name):
73
+ data = op(data)
74
+
75
+ # Validate the result: the ops must keep `data` a dictionary.
76
+ if not isinstance(data, dict):
77
+ raise ValueError("Argument `data` must be a dictionary, "
78
+ "not %s" % str(type(data)))
79
+
80
+ if log_data:
81
+ logging.info("Data after pre-processing (%s):\n%s", log_data, data)
82
+ log_data = False # For eager&pygrain: only log first one of each pipeline.
83
+ return data
84
+
85
+ return _preprocess_fn
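
A minimal sketch of the minilanguage in action; it assumes `ops_image.py` (which registers `resize`) and `ops_general.py` (which registers `value_range`) have been imported, exactly as the tests below do:

```python
import numpy as np
from big_vision.pp import builder
from big_vision.pp import ops_general  # registers value_range, keep, ...
from big_vision.pp import ops_image    # registers resize, inception_crop, ...

pp_fn = builder.get_preprocess_fn("resize(256)|value_range(-1, 1)")
out = pp_fn({"image": np.random.randint(0, 256, (480, 640, 3))})
# out["image"] is a 256x256x3 float tensor with values in [-1, 1].
```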
Tipsomaly/model/big_vision/pp/builder_test.py ADDED
@@ -0,0 +1,72 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for builder."""
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ from big_vision.pp import builder
22
+ from big_vision.pp import ops_general # pylint: disable=unused-import
23
+ from big_vision.pp import ops_image # pylint: disable=unused-import
24
+ import numpy as np
25
+ import tensorflow.compat.v1 as tf
26
+
27
+
28
+ class BuilderTest(tf.test.TestCase):
29
+
30
+ def testSingle(self):
31
+ pp_fn = builder.get_preprocess_fn("resize(256)")
32
+ x = np.random.randint(0, 256, [640, 480, 3])
33
+ image = pp_fn({"image": x})["image"]
34
+ self.assertEqual(image.numpy().shape, (256, 256, 3))
35
+
36
+ def testEmpty(self):
37
+ pp_fn = builder.get_preprocess_fn("||inception_crop|||resize(256)||")
38
+
39
+ # Typical image input
40
+ x = np.random.randint(0, 256, [640, 480, 3])
41
+ image = pp_fn({"image": x})["image"]
42
+ self.assertEqual(image.numpy().shape, (256, 256, 3))
43
+
44
+ def testPreprocessingPipeline(self):
45
+ pp_str = ("inception_crop|resize(256)|resize((256, 256))|"
46
+ "central_crop((80, 120))|flip_lr|value_range(0,1)|"
47
+ "value_range(-1,1)")
48
+ pp_fn = builder.get_preprocess_fn(pp_str)
49
+
50
+ # Typical image input
51
+ x = np.random.randint(0, 256, [640, 480, 3])
52
+ image = pp_fn({"image": x})["image"]
53
+ self.assertEqual(image.numpy().shape, (80, 120, 3))
54
+ self.assertLessEqual(np.max(image.numpy()), 1)
55
+ self.assertGreaterEqual(np.min(image.numpy()), -1)
56
+
57
+ def testNumArgsException(self):
58
+
59
+ x = np.random.randint(0, 256, [640, 480, 3])
60
+ for pp_str in [
61
+ "inception_crop(1)",
62
+ "resize()",
63
+ "resize(1, 1, 1)"
64
+ "flip_lr(1)",
65
+ "central_crop()",
66
+ ]:
67
+ with self.assertRaises(BaseException):
68
+ builder.get_preprocess_fn(pp_str)(x)
69
+
70
+
71
+ if __name__ == "__main__":
72
+ tf.test.main()
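
These tests exercise ops that other modules register; for context, this is roughly how a new op is hooked into the registry so the builder can find it (a sketch following the `Registry.register` pattern used in `ops_general.py`; the `times` op is hypothetical):

```python
import tensorflow as tf
from big_vision.pp import builder
from big_vision.pp.registry import Registry


@Registry.register("preprocess_ops.times")  # hypothetical op, for illustration
def get_times(factor, key="image"):
  """Returns an op that multiplies data[key] by `factor`."""
  def _times(data):
    data[key] = data[key] * factor
    return data
  return _times


pp_fn = builder.get_preprocess_fn("times(2)")
out = pp_fn({"image": tf.ones([2, 2, 3])})  # out["image"] is all 2s.
```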
Tipsomaly/model/big_vision/pp/ops_general.py ADDED
@@ -0,0 +1,468 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Generic tensor preprocessing ops.
16
+
17
+ All preprocessing ops should return a data processing functor. Data
18
+ is represented as a dictionary of (TF) tensors. The functors output a modified
19
+ dictionary.
20
+ """
21
+
22
+ import collections
23
+
24
+ from big_vision.pp import utils
25
+ from big_vision.pp.registry import Registry
26
+ import big_vision.utils as bv_utils
27
+ import jax
28
+ import numpy as np
29
+ import tensorflow as tf
30
+
31
+
32
+ @Registry.register("preprocess_ops.value_range")
33
+ @utils.InKeyOutKey()
34
+ def get_value_range(vmin=-1, vmax=1, in_min=0, in_max=255.0, clip_values=False):
35
+ """Transforms a [in_min,in_max] image to [vmin,vmax] range.
36
+
37
+ Input ranges in_min/in_max can be equal-size lists to rescale the individual
38
+ channels independently.
39
+
40
+ Args:
41
+ vmin: A scalar. Output min value.
42
+ vmax: A scalar. Output max value.
43
+ in_min: A scalar or a list of input min values to scale. If a list, the
44
+ length should match to the number of channels in the image.
45
+ in_max: A scalar or a list of input max values to scale. If a list, the
46
+ length should match to the number of channels in the image.
47
+ clip_values: Whether to clip the output values to the provided ranges.
48
+
49
+ Returns:
50
+ A function to rescale the values.
51
+ """
52
+
53
+ def _value_range(image):
54
+ """Scales values in given range."""
55
+ in_min_t = tf.constant(in_min, tf.float32)
56
+ in_max_t = tf.constant(in_max, tf.float32)
57
+ image = tf.cast(image, tf.float32)
58
+ image = (image - in_min_t) / (in_max_t - in_min_t)
59
+ image = vmin + image * (vmax - vmin)
60
+ if clip_values:
61
+ image = tf.clip_by_value(image, vmin, vmax)
62
+ return image
63
+
64
+ return _value_range
65
+
66
+
67
+ @Registry.register("preprocess_ops.lookup")
68
+ @utils.InKeyOutKey()
69
+ def get_lookup(mapping, npzkey="fnames", sep=None):
70
+ """Map string to number."""
71
+
72
+ # For NumPy files, we use the `npzkey` array in that file as the list of
73
+ # strings which are mapped to their index in that array.
74
+ # This is especially useful when other data (eg precomputed predictions)
75
+ # goes along with this mapping, to have everything in one place (the npz).
76
+ if mapping.endswith(".npz"):
77
+ with tf.io.gfile.GFile(mapping, "rb") as f:
78
+ keys = np.array(np.load(f, allow_pickle=False)[npzkey])
79
+ vals = np.arange(len(keys))
80
+
81
+ # Otherwise, we simply use the file as a text file, with either of:
82
+ # - a string per line, mapped to its line-number
83
+ # - a pair, separated by `sep` per line, first value being the string, second
84
+ # value being the integer that the string is mapped to.
85
+ else:
86
+ with tf.io.gfile.GFile(mapping, "r") as f:
87
+ buf = f.read()
88
+ if sep is None: # values are the line numbers
89
+ keys = buf.splitlines()
90
+ vals = np.arange(len(keys))
91
+ else: # each line is key<sep>val, also make val int
92
+ keys, vals = zip(*[l.split(sep) for l in buf.splitlines()])
93
+ vals = [int(v) for v in vals]
94
+
95
+ def _do_the_mapping(needle):
96
+ """Map string to number."""
97
+ with tf.init_scope(): # (Originally added for performance reasons.)
98
+ table = tf.lookup.StaticHashTable(
99
+ tf.lookup.KeyValueTensorInitializer(keys, vals), -1)
100
+ return table.lookup(needle)
101
+
102
+ return _do_the_mapping
103
+
104
+
105
+ @Registry.register("preprocess_ops.onehot")
106
+ def get_onehot(depth,
107
+ key="labels",
108
+ key_result=None,
109
+ multi=True,
110
+ on=1.0,
111
+ off=0.0):
112
+ """One-hot encodes the input.
113
+
114
+ Args:
115
+ depth: Length of the one-hot vector (how many classes).
116
+ key: Key of the data to be one-hot encoded.
117
+ key_result: Key under which to store the result (same as `key` if None).
118
+ multi: If there are multiple labels, whether to merge them into the same
119
+ "multi-hot" vector (True) or keep them as an extra dimension (False).
120
+ on: Value to fill in for the positive label (default: 1).
121
+ off: Value to fill in for negative labels (default: 0).
122
+
123
+ Returns:
124
+ Data dictionary.
125
+ """
126
+
127
+ def _onehot(data):
128
+ # When there's more than one label, this is significantly more efficient
129
+ # than using tf.one_hot followed by tf.reduce_max; we tested.
130
+ labels = data[key]
131
+ labels = tf.cast(labels, tf.int64) # both scatter and one_hot expect this
132
+ if labels.shape.rank > 0 and multi:
133
+ x = tf.scatter_nd(labels[:, None], tf.ones(tf.shape(labels)[0]), (depth,))
134
+ x = tf.clip_by_value(x, 0, 1) * (on - off) + off
135
+ else:
136
+ x = tf.one_hot(labels, depth, on_value=on, off_value=off)
137
+ data[key_result or key] = x
138
+ return data
139
+
140
+ return _onehot
141
+
142
+
143
+ @Registry.register("preprocess_ops.keep")
144
+ def get_keep(*keys):
145
+ """Keeps only the given keys."""
146
+
147
+ def _keep(data):
148
+ return {k: v for k, v in data.items() if k in keys}
149
+
150
+ return _keep
151
+
152
+
153
+ @Registry.register("preprocess_ops.drop")
154
+ def get_drop(*keys):
155
+ """Drops the given keys."""
156
+
157
+ def _drop(data):
158
+ return {k: v for k, v in data.items() if k not in keys}
159
+
160
+ return _drop
161
+
162
+
163
+ @Registry.register("preprocess_ops.copy")
164
+ def get_copy(inkey, outkey):
165
+ """Copies value of `inkey` into `outkey`."""
166
+
167
+ def _copy(data):
168
+ # A "semi-deep" copy. deepcopy doesn't work when tf tensors are part of the
169
+ # game. What we want is to only copy the python structure (dicts, lists)
170
+ # and keep tensors as they are, since we never modify them in-place anyways.
171
+ # The following achieves exactly that.
172
+ data[outkey] = jax.tree.map(lambda x: x, data[inkey])
173
+ return data
174
+
175
+ return _copy
176
+
177
+
178
+ @Registry.register("preprocess_ops.squeeze_last_dim")
179
+ @utils.InKeyOutKey()
180
+ def get_squeeze_last_dim():
181
+ def _squeeze_last_dim(x):
182
+ return tf.squeeze(x, axis=-1)
183
+ return _squeeze_last_dim
184
+
185
+
186
+ @Registry.register("preprocess_ops.concat")
187
+ def get_concat(inkeys, outkey=None, axis=-1):
188
+ """Concatenates elements along some axis."""
189
+
190
+ def _concat(data):
191
+ data[outkey or inkeys[0]] = tf.concat([data[k] for k in inkeys], axis)
192
+ return data
193
+
194
+ return _concat
195
+
196
+
197
+ @Registry.register("preprocess_ops.rag_tensor")
198
+ @utils.InKeyOutKey()
199
+ def get_rag_tensor():
200
+ """Converts the specified feature to ragged tensor."""
201
+
202
+ def rag_tensor(raw_tensor):
203
+ # Note: Add one more dimension as `from_tensor` requires at least rank 2.
204
+ return tf.RaggedTensor.from_tensor(raw_tensor[None])
205
+
206
+ return rag_tensor
207
+
208
+
209
+ @Registry.register("preprocess_ops.pad_to_shape")
210
+ @utils.InKeyOutKey()
211
+ def get_pad_to_shape(shape, pad_value=0, where="after"):
212
+ """Pads tensor to specified `shape`."""
213
+
214
+ def _pads(cur, tgt):
215
+ if tgt is None:
216
+ return [0, 0]
217
+ diff = tgt - cur
218
+ return {
219
+ "before": [diff, 0],
220
+ "after": [0, diff],
221
+ "both": [diff // 2, diff - diff // 2],
222
+ }[where]
223
+
224
+ def _pad_to_shape(x):
225
+ assert len(x.shape.as_list()) == len(shape)
226
+ paddings = [_pads(tgt=shape[i], cur=tf.shape(x)[i])
227
+ for i in range(len(shape))]
228
+ constant_value = tf.constant(pad_value, x.dtype)
229
+ ret = tf.pad(x, paddings, constant_values=constant_value)
230
+ ret.set_shape(shape)
231
+ return ret
232
+
233
+ return _pad_to_shape
234
+
235
+
236
+ @Registry.register("preprocess_ops.flatten")
237
+ def get_flatten(keys=None):
238
+ """Flattens the keys of data with separator '/'."""
239
+
240
+ def _flatten(data):
241
+ flatten_keys = keys or list(data.keys())
242
+ not_flattened = {k: v for k, v in data.items() if k not in flatten_keys}
243
+ flattened = {k: v for k, v in data.items() if k in flatten_keys}
244
+ flattened, _ = bv_utils.tree_flatten_with_names(flattened)
245
+ return {**dict(flattened), **not_flattened}
246
+
247
+ return _flatten
248
+
249
+
250
+ @Registry.register("preprocess_ops.reshape")
251
+ @utils.InKeyOutKey()
252
+ def get_reshape(new_shape):
253
+ """Reshapes tensor to a given new shape.
254
+
255
+ Args:
256
+ new_shape: new shape for the tensor.
257
+
258
+ Returns:
259
+ A function for reshaping a tensor.
260
+
261
+ """
262
+
263
+ def _reshape(tensor):
264
+ """Reshapes a tensor to a given shape."""
265
+ dtype = tensor.dtype
266
+ tensor = tf.reshape(tensor, new_shape)
267
+ return tf.cast(tensor, dtype)
268
+
269
+ return _reshape
270
+
271
+
272
+ @Registry.register("preprocess_ops.setdefault")
273
+ def get_setdefault(key, value):
274
+ """If `key` is an empty tensor or missing, set it to `value`."""
275
+ def _setdefault(data):
276
+ x = data.get(key, tf.constant(value))
277
+ v = tf.constant(value, dtype=x.dtype)
278
+ v = tf.broadcast_to(v, [s or 1 for s in x.shape])
279
+ data[key] = tf.cond(tf.size(x) > 0, lambda: x, lambda: v)
280
+ return data
281
+ return _setdefault
282
+
283
+
284
+ @Registry.register("preprocess_ops.choice")
285
+ def get_choice(n="single", key=None, fewer_ok=False, inkey=None, outkey=None):
286
+ """Chooses the same `n` random entries of all `keys`.
287
+
288
+ Args:
289
+ n: how many entries to randomly sample (without repeat). Possible values:
290
+ - int: that many entries (or fewer if there's fewer, see `fewer_ok`.)
291
+ - "single": The string "single" only chooses one and drop the leading dim.
292
+ - [min, max]: A pair means randomly take between min/max examples (incl.).
293
+ key: str or list of str: See Note.
294
+ fewer_ok: whether to fail when there's fewer than `n` elements to choose
295
+ from (and hence set static shape to `n`), or whether to allow it
296
+ (and hence have unknown static shape).
297
+ inkey: str or list of str: See Note.
298
+ outkey: str or list of str: See Note.
299
+
300
+ Note:
301
+ If key/inkey/outkey is a list, then the same random entries are chosen for
302
+ all of the keys. Other than that, they function the same as InKeyOutKey.
303
+
304
+ The outkey can also contain the placeholder `{key}` that'll be replaced by the inkey name.
305
+
306
+ Examples:
307
+ choice(key="alt_text/text")
308
+ choice(n=128, key=["patches", "positions"])
309
+ choice(inkey=["questions_i18n", "answers_i18n"], outkey=["q", "a"])
310
+
311
+ Returns:
312
+ The pp op.
313
+ """
314
+
315
+ # Normalize keys:
316
+ inkeys = utils.maybe_repeat(inkey or key, 1)
317
+ outkeys = utils.maybe_repeat(outkey or key, 1)
318
+ outkeys = [ok.format(key=ik) for ok, ik in zip(outkeys, inkeys)]
319
+
320
+ # Let's DRY on this condition and give it a name.
321
+ is_varlen = isinstance(n, (list, tuple))
322
+ min_n = n[0] if is_varlen else 1 if n == "single" else n
323
+
324
+ def _choice(data):
325
+ # Catch a hard to identify/understand user error:
326
+ assert data[inkeys[0]].ndim > 0, (
327
+ f"You're calling `choice_no_replacement` on {inkeys}, a scalar."
328
+ " That's probably a mistake ; double-check and then just don't."
329
+ )
330
+
331
+ nitems = tf.shape(data[inkeys[0]])[0]
332
+
333
+ # Sanity check that all keys have same leading dimension, and that is at
334
+ # least as large as the minimum requested output.
335
+ lengths = [tf.shape(data[k])[0] for k in inkeys]
336
+ checks = [tf.debugging.assert_equal(l, nitems) for l in lengths]
337
+ if not fewer_ok: # Since we check for all-same, a single suffices here.
338
+ checks.append(tf.debugging.assert_greater_equal(nitems, min_n))
339
+ with tf.control_dependencies(checks):
340
+ nitems = tf.identity(nitems)
341
+
342
+ if n == "single":
343
+ index = tf.random.uniform([], 0, nitems, dtype=tf.int32)
344
+ else:
345
+ # Subsample by shuffling and taking first n, but...
346
+ indices = tf.random.shuffle(tf.range(nitems))
347
+ end = n
348
+ if is_varlen:
349
+ end = tf.random.uniform([], n[0], n[1] + 1, dtype=tf.int32)
350
+ # ...keep the order while subsampling (it might have a meaning, eg boxes)
351
+ indices = tf.sort(indices[:end])
352
+
353
+ for ik, ok in zip(inkeys, outkeys):
354
+ if n == "single":
355
+ result = data[ik][index]
356
+ else:
357
+ result = tf.gather(data[ik], indices, axis=0)
358
+ if not is_varlen: # Give static shape when we can.
359
+ result = tf.ensure_shape(result, [n] + [None] * (result.ndim - 1))
360
+ data[ok] = result
361
+
362
+ return data
363
+ return _choice
364
+
365
+
366
+ def _shuffled_index(count, nitems, seed):
367
+ """Returns index from a shuffled sequence (items only repeat after epoch)."""
368
+ nitems = tf.cast(nitems, count.dtype)
369
+ item_epoch, item_offset = (count // nitems, count % nitems)
370
+ shuffled_indices = tf.random.experimental.stateless_shuffle(
371
+ tf.range(nitems), seed=tf.random.fold_in(seed, item_epoch))
372
+ return shuffled_indices[item_offset]
373
+
374
+
375
+ @Registry.register("preprocess_ops.choice_no_replacement")
376
+ def get_choice_no_replacement(key=None, inkey=None, outkey=None):
377
+ """Chooses the same random (no replacement) entry of all `keys`.
378
+
379
+ Note: Consider using this for iterating over small datasets with a small
380
+ number of epochs. It differs from `choice(n='single')` in that if an example,
381
+ as identified by its `_id` field, is seen N times then it will cycle through
382
+ all the inkey values before repeating them. Additionally, each repetition uses
383
+ a different order.
384
+
385
+ Caveats: requires the dataset to provide an `_id` field and uses host RAM to
386
+ keep a counter of how often each id is seen. It is also not robust to preemptions.
387
+
388
+ Args:
389
+ key: str or list of str: See Note.
390
+ inkey: str or list of str: See Note.
391
+ outkey: str or list of str: See Note.
392
+
393
+ Note:
394
+ If key/inkey/outkey is a list, then the same random entries are chosen for
395
+ all of the keys. Other than that, they function the same as InKeyOutKey.
396
+
397
+ The outkey can also contain the placeholder `{key}` that'll be replaced
398
+ by the inkey name.
399
+
400
+ Examples:
401
+ choice(key="alt_text/text")
402
+ choice(key=["patches", "positions"])
403
+ choice(inkey=["questions_i18n", "answers_i18n"], outkey=["q", "a"])
404
+
405
+ Returns:
406
+ The pp op.
407
+ """
408
+ # Normalize keys:
409
+ inkeys = utils.maybe_repeat(inkey or key, 1)
410
+ outkeys = utils.maybe_repeat(outkey or key, 1)
411
+ outkeys = [ok.format(key=ik) for ok, ik in zip(outkeys, inkeys)]
412
+
413
+ # TODO: Ideally the data pipeline should provide us with an epoch
414
+ # counter. For now, count how often we see a given example id and don't worry
415
+ # about memory consumption. Counter returns 0 the first time an example is seen.
416
+ counter = collections.defaultdict(lambda: -1)
417
+ def _seen_count(example_id):
418
+ example_id = example_id.item()
419
+ counter[example_id] += 1
420
+ return counter[example_id]
421
+
422
+ # We need a seed to deterministically decide on a shuffled sequence and use
423
+ # the number of times an example was seen to iterate through it. The seed
424
+ # should be different for every instance of a create preprocessing function
425
+ # but it has to be fixed for each instance.
426
+ seed = tf.random.uniform(
427
+ [2], minval=tf.int32.min, maxval=tf.int32.max, dtype=tf.int32)
428
+
429
+ def _choice(data):
430
+ # Catch a hard to identify/understand user error:
431
+ assert data[inkeys[0]].ndim > 0, (
432
+ f"You're calling `choice` on {inkeys}, a scalar."
433
+ " That's probably a mistake ; double-check and then just don't."
434
+ )
435
+
436
+ nitems = tf.shape(data[inkeys[0]])[0]
437
+
438
+ # Sanity check that all keys have same leading dimension.
439
+ checks = [
440
+ tf.debugging.assert_equal(tf.shape(data[k])[0], nitems)
441
+ for k in inkeys
442
+ ]
443
+ with tf.control_dependencies(checks):
444
+ nitems = tf.identity(nitems)
445
+
446
+ # Using the seed, example id and the number of times an example was seen
447
+ # pick an `index` such that items are only repeated after all items are seen
448
+ # an equal number of times. E.g. it could return indexes from this sequence:
449
+ # [0, 1, 2, 1, 2, 0, 2, 0, 1, 0, 2, 1, ...].
450
+ count = tf.numpy_function(
451
+ _seen_count, (data["_id"],), Tout=tf.int64, stateful=True)
452
+ count = tf.cast(count, tf.int32)
453
+ nitems = tf.cast(nitems, tf.int32)
454
+ shuffle_epoch = count // nitems
455
+ shuffle_offset = count % nitems
456
+
457
+ example_seed = tf.random.fold_in(seed, data["_id"])
458
+ shuffle_seed = tf.random.fold_in(example_seed, shuffle_epoch)
459
+ shuffle = tf.random.experimental.stateless_shuffle(
460
+ tf.range(nitems), seed=shuffle_seed)
461
+ index = shuffle[shuffle_offset]
462
+
463
+ # Select item[index] for all keys.
464
+ for ik, ok in zip(inkeys, outkeys):
465
+ data[ok] = data[ik][index]
466
+ return data
467
+
468
+ return _choice
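
To make the `multi` flag of `get_onehot` above concrete, a small worked example (run in eager mode with `get_onehot` in scope from this module; the outputs match the unit tests below):

```python
import tensorflow as tf

data = {"labels": tf.constant([2, 3, 0], dtype=tf.int64)}

# multi=True merges all labels into a single depth-4 multi-hot vector:
get_onehot(4, key="labels", multi=True)(dict(data))["labels"]
# -> [1., 0., 1., 1.]

# multi=False keeps one row per label, yielding a [3, 4] one-hot matrix:
get_onehot(4, key="labels", multi=False)(dict(data))["labels"]
# -> [[0., 0., 1., 0.], [0., 0., 0., 1.], [1., 0., 0., 0.]]
```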
Tipsomaly/model/big_vision/pp/ops_general_test.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for ops_general."""
16
+
17
+ import copy
18
+
19
+ import big_vision.pp.ops_general as pp
20
+ import numpy as np
21
+ import tensorflow as tf
22
+
23
+
24
+ class PreprocessOpsTest(tf.test.TestCase):
25
+
26
+ def tfrun(self, ppfn, data):
27
+ # Run once as standalone, as could happen eg in colab.
28
+ yield {k: np.array(v) for k, v in ppfn(copy.deepcopy(data)).items()}
29
+
30
+ # And then once again as part of tfdata pipeline.
31
+ # You'd be surprised how much these two differ!
32
+ tfdata = tf.data.Dataset.from_tensors(copy.deepcopy(data))
33
+ for npdata in tfdata.map(ppfn).as_numpy_iterator():
34
+ yield npdata
35
+
36
+ def test_value_range(self):
37
+ img = tf.random.uniform((640, 480, 3), 0, 255, tf.int32)
38
+ data = {"image": tf.cast(img, tf.uint8)}
39
+ for out in self.tfrun(pp.get_value_range(-0.5, 0.5), data):
40
+ self.assertLessEqual(np.max(out["image"]), 0.5)
41
+ self.assertGreaterEqual(np.min(out["image"]), -0.5)
42
+
43
+ def test_value_range_custom_input_range(self):
44
+ img = tf.random.uniform((640, 480, 3), 0, 255, tf.int32)
45
+ data = {"image": tf.cast(img, tf.uint8)}
46
+ for out in self.tfrun(pp.get_value_range(-0.5, 0.5, -256, 255, True), data):
47
+ self.assertLessEqual(np.max(out["image"]), 0.5)
48
+ self.assertGreaterEqual(np.min(out["image"]), 0.0)
49
+
50
+ def test_get_keep_drop(self):
51
+ data = {"image": 1, "labels": 2, "something": 3}
52
+
53
+ for data_keep in self.tfrun(pp.get_keep("image", "labels"), data):
54
+ self.assertAllEqual(set(data_keep.keys()), {"image", "labels"})
55
+
56
+ for data_drop in self.tfrun(pp.get_drop("image", "labels"), data):
57
+ self.assertAllEqual(set(data_drop.keys()), {"something"})
58
+
59
+ def test_onehot(self):
60
+ data = {"labels": tf.constant(2, dtype=tf.int64)}
61
+ for out in self.tfrun(pp.get_onehot(4, "labels", multi=True), data):
62
+ self.assertAllClose(out["labels"], [0., 0., 1., 0.])
63
+
64
+ def test_onehot_multi(self):
65
+ data = {"labels": tf.constant([2, 3, 0], dtype=tf.int64)}
66
+ for out in self.tfrun(pp.get_onehot(4, "labels", multi=False), data):
67
+ self.assertAllClose(out["labels"], [
68
+ [0., 0., 1., 0.],
69
+ [0., 0., 0., 1.],
70
+ [1., 0., 0., 0.]])
71
+
72
+ for out in self.tfrun(pp.get_onehot(4, "labels", multi=True), data):
73
+ self.assertAllClose(out["labels"], [1., 0., 1., 1.])
74
+
75
+ def test_onehot_2d(self):
76
+ data = {"labels": tf.constant([[2, 3], [0, 1]], dtype=tf.int64)}
77
+ for out in self.tfrun(pp.get_onehot(4, "labels", multi=False), data):
78
+      self.assertAllClose(out["labels"], [
+          [[0., 0., 1., 0.], [0., 0., 0., 1.]],
+          [[1., 0., 0., 0.], [0., 1., 0., 0.]]])
+
+  def test_onehot_smoothing(self):
+    data = {"labels": tf.constant([2, 3, 0], dtype=tf.int64)}
+    for out in self.tfrun(
+        pp.get_onehot(4, "labels", multi=False, on=0.8, off=0.1), data):
+      self.assertAllClose(out["labels"], [
+          [0.1, 0.1, 0.8, 0.1],
+          [0.1, 0.1, 0.1, 0.8],
+          [0.8, 0.1, 0.1, 0.1]])
+
+    for out in self.tfrun(
+        pp.get_onehot(4, "labels", multi=True, on=0.8, off=0.1), data):
+      self.assertAllClose(out["labels"], [0.8, 0.1, 0.8, 0.8])
+
+  def test_squeeze_last_dim(self):
+    data = {"image": tf.constant(np.zeros((32, 32, 3, 1)))}
+    for out in self.tfrun(pp.get_squeeze_last_dim(), data):
+      self.assertAllEqual(out["image"].shape, [32, 32, 3])
+
+  def test_pad_to_shape(self):
+    desired_shape = (8, 10)
+    for input_shape in [(8, 4), (8, 3), (8, 10), (8, 1)]:
+      data = {"x": tf.ones(input_shape, dtype=tf.float32)}
+      for out in self.tfrun(
+          pp.get_pad_to_shape(desired_shape, pad_value=-1, key="x"), data):
+        self.assertEqual(
+            tf.reduce_sum(out["x"]),
+            2 * np.prod(input_shape) - np.prod(desired_shape))
+
+  def test_pad_to_shape_none(self):
+    data = {"x": tf.ones((8, 4), dtype=tf.float32)}
+    for out in self.tfrun(
+        pp.get_pad_to_shape((None, 6), pad_value=-1, key="x"), data):
+      self.assertEqual(out["x"].shape, (8, 6))
+      self.assertEqual(tf.reduce_sum(out["x"]), 8*4 - 8*2)
+
+  def test_pad_to_shape_which_side(self):
+    data = {"x": tf.ones((8, 4), dtype=tf.float32)}
+    for where, idxs in [("before", [0]), ("both", [0, -1]), ("after", [-1])]:
+      for out in self.tfrun(
+          pp.get_pad_to_shape((8, 6), key="x", where=where), data):
+        self.assertEqual(out["x"].shape, (8, 6))
+        self.assertEqual(tf.reduce_sum(out["x"]), 8*4)
+        for i in idxs:
+          self.assertEqual(out["x"][0, i], 0)
+
+  def test_flatten(self):
+    d = {"a": {"b": tf.constant([1, 2, 3])}, "c": "str"}
+    self.assertEqual(pp.get_flatten()(d), {
+        "a/b": tf.constant([1, 2, 3]),
+        "c": "str"
+    })
+
+  def test_reshape(self):
+    data = {"image": tf.constant(np.zeros((8, 32 * 32 * 3)))}
+    for out in self.tfrun(pp.get_reshape(new_shape=(8, 32, 32, 3)), data):
+      self.assertAllEqual(out["image"].shape, [8, 32, 32, 3])
+
+  def test_setdefault(self):
+    data = {
+        "empty_image": tf.zeros([0, 0, 0]),
+        "image": tf.constant(np.arange(9).reshape(3, 3)),
+        "empty_text": tf.zeros([0], tf.string),
+        "text": tf.constant(["Hello", "World"], tf.string),
+    }
+    for out in self.tfrun(pp.get_setdefault("empty_image", 1), data):
+      self.assertAllEqual(out["empty_image"], np.array([[[1]]]))
+    for out in self.tfrun(pp.get_setdefault("image", 1), data):
+      self.assertAllEqual(out["image"], data["image"])
+    for out in self.tfrun(pp.get_setdefault("empty_text", "Lucas"), data):
+      self.assertAllEqual(out["empty_text"], np.array(["Lucas"]))
+    for out in self.tfrun(pp.get_setdefault("text", "Lucas"), data):
+      self.assertAllEqual(out["text"], data["text"])
+
+  def _data_for_choice(self):
+    return {
+        "one_f32": tf.constant([0.42], tf.float32),
+        "two_f32": tf.constant([3.14, 0.42], tf.float32),
+        "one_str": tf.constant(["Hi"], tf.string),
+        "two_str": tf.constant(["Hi", "Lucas"], tf.string),
+        "one_vec": tf.reshape(tf.range(2, dtype=tf.float32), (1, 2)),
+        "two_vec": tf.reshape(tf.range(4, dtype=tf.float32), (2, 2)),
+    }
+
+  def test_choice(self):
+    # Test for the default call (n="single")
+    data = self._data_for_choice()
+    self.assertEqual(
+        pp.get_choice(inkey="one_f32", outkey="choice")(data)["choice"], 0.42)
+    self.assertEqual(
+        pp.get_choice(inkey="one_str", outkey="choice")(data)["choice"], "Hi")
+    self.assertIn(
+        pp.get_choice(inkey="two_f32", outkey="choice")(data)["choice"],
+        [3.14, 0.42])
+    self.assertIn(
+        pp.get_choice(inkey="two_str", outkey="choice")(data)["choice"],
+        ["Hi", "Lucas"])
+
+  def test_choice_nmax(self):
+    # n == nelems should be identity (and keep ordering!)
+    data = self._data_for_choice()
+    for k in ("one_f32", "one_str", "one_vec"):
+      for out in self.tfrun(pp.get_choice(n=1, key=[k]), data):
+        self.assertAllEqual(out[k], data[k])
+      for out in self.tfrun(pp.get_choice(n=[1, 1], key=[k]), data):
+        self.assertAllEqual(out[k], data[k])
+    for k in ("two_f32", "two_str", "two_vec"):
+      for out in self.tfrun(pp.get_choice(n=2, key=[k]), data):
+        self.assertAllEqual(out[k], data[k])
+      for out in self.tfrun(pp.get_choice(n=[2, 2], key=[k]), data):
+        self.assertAllEqual(out[k], data[k])
+
+  def test_choice_n(self):
+    # n < nelems should be one of them:
+    data = self._data_for_choice()
+    for k in ("two_f32", "two_str"):
+      for out in self.tfrun(pp.get_choice(n=1, key=[k]), data):
+        self.assertIn(out[k], data[k])
+
+    # Special testing for vectors.
+    for out in self.tfrun(pp.get_choice(n=1, key=["two_vec"]), data):
+      self.assertTrue(tf.logical_or(
+          tf.reduce_all(out["two_vec"][0] == data["two_vec"][0]),
+          tf.reduce_all(out["two_vec"][0] == data["two_vec"][1]),
+      ))
+
+  def test_choice_multi(self):
+    # Select consistently across multiple keys.
+    data = self._data_for_choice()
+    op = pp.get_choice(n=1, key=["two_f32", "two_str"])
+    for out in self.tfrun(op, data):
+      self.assertTrue(tf.logical_or(
+          tf.logical_and(
+              tf.reduce_all(out["two_f32"][0] == data["two_f32"][0]),
+              tf.reduce_all(out["two_str"][0] == data["two_str"][0]),
+          ),
+          tf.logical_and(
+              tf.reduce_all(out["two_f32"][0] == data["two_f32"][1]),
+              tf.reduce_all(out["two_str"][0] == data["two_str"][1]),
+          ),
+      ))
+
+  def test_choice_n_range(self):
+    # n sampled from a [min, max] range should yield a contiguous subsequence:
+    data = self._data_for_choice()
+    for k in ("two_f32", "two_str", "two_vec"):
+      for out in self.tfrun(pp.get_choice(n=[1, 2], key=[k]), data):
+        self.assertTrue(tf.reduce_any([
+            tf.reduce_all(out[k] == data[k][0:1]),
+            tf.reduce_all(out[k] == data[k][1:2]),
+            tf.reduce_all(out[k] == data[k][0:2]),
+        ]))
+
+
+if __name__ == "__main__":
+  tf.test.main()
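
The tests above double as documentation for the `choice` op. As a quick orientation for readers of this diff, here is a minimal sketch (not part of the commit; the key name `txt` is made up) of calling the op directly:

```python
import tensorflow as tf
import big_vision.pp.ops_general as pp

data = {"txt": tf.constant(["a", "b", "c"])}
# Default n="single": picks one element and drops the leading axis.
picked = pp.get_choice(inkey="txt", outkey="pick")(dict(data))["pick"]
# n equal to the number of elements is the identity (ordering kept),
# mirroring test_choice_nmax above.
same = pp.get_choice(n=3, key=["txt"])(dict(data))["txt"]
```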
Tipsomaly/model/big_vision/pp/ops_image.py ADDED
@@ -0,0 +1,361 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Image-centric preprocessing ops.
+
+All preprocessing ops should return a data-processing functor. A data example
+is represented as a dictionary of (TF) tensors. The functors output a modified
+dictionary.
+
+The key named "image" is commonly used for the image, and is a 3D tensor of
+shape (height x width x channels).
+"""
+
+from big_vision.pp import utils
+from big_vision.pp.registry import Registry
+
+import tensorflow as tf
+
+
+@Registry.register("preprocess_ops.decode")
+@utils.InKeyOutKey()
+def get_decode(channels=3, precise=False):
+  """Decode an encoded image string, see tf.io.decode_image.
+
+  Args:
+    channels: see tf.io.decode_image.
+    precise: if False, use default TF image decoding algorithm.
+      If True, change DCT method for JPEG decoding to match PIL/cv2/PyTorch.
+      See also (internal link) for a concrete example.
+
+  Returns:
+    A function that decodes an image.
+  """
+
+  def _decode(image):
+    if precise:
+      return tf.image.decode_jpeg(  # Also supports png btw.
+          image, channels=channels, dct_method="INTEGER_ACCURATE")
+    else:
+      return tf.io.decode_image(
+          image, channels=channels, expand_animations=False)
+
+  return _decode
+
+
+@Registry.register("preprocess_ops.resize")
+@utils.InKeyOutKey()
+def get_resize(size, method="bilinear", antialias=False):
+  """Resizes image to a given size.
+
+  Args:
+    size: either an integer H, where H is both the new height and width
+      of the resized image, or a list or tuple [H, W] of integers, where H
+      and W are the new image's height and width respectively.
+    method: resize method, see tf.image.resize docs for options.
+    antialias: see tf.image.resize. Ideally set to True for all new configs.
+
+  Returns:
+    A function for resizing an image.
+  """
+  size = utils.maybe_repeat(size, 2)
+
+  def _resize(image):
+    """Resizes image to a given size."""
+    # Note: use TF-2 version of tf.image.resize as the version in TF-1 is
+    # buggy: https://github.com/tensorflow/tensorflow/issues/6720.
+    # In particular it was not equivariant with rotation and led the network
+    # to learn a shortcut in the self-supervised rotation task, if rotation
+    # was applied after resize.
+    dtype = image.dtype
+    tf_dtype = tf.type_spec_from_value(image).dtype
+    image = tf.image.resize(image, size, method=method, antialias=antialias)
+    return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)
+
+  return _resize
+
+
+# This functionality is used by resize_small and resize_long. But we're not
+# registering it as a pp op yet, as there is no need for it. However, it can
+# probably be slightly generalized into "scale augmentation" eventually.
+def _resize_factor(image, factor, method="area", antialias=True):
+  """Resizes the image by a (float) `factor`, keeping the aspect ratio fixed."""
+  h, w = tf.shape(image)[0], tf.shape(image)[1]
+
+  h = tf.cast(tf.round(tf.cast(h, tf.float32) * factor), tf.int32)
+  w = tf.cast(tf.round(tf.cast(w, tf.float32) * factor), tf.int32)
+
+  dtype = image.dtype
+  tf_dtype = tf.type_spec_from_value(image).dtype
+  image = tf.image.resize(image, (h, w), method=method, antialias=antialias)
+  return tf.cast(tf.clip_by_value(image, tf_dtype.min, tf_dtype.max), dtype)
+
+
+@Registry.register("preprocess_ops.resize_small")
+@utils.InKeyOutKey()
+def get_resize_small(smaller_size, method="area", antialias=False):
+  """Resizes the smaller side to `smaller_size` keeping aspect ratio.
+
+  Args:
+    smaller_size: an integer representing the new size of the smaller side
+      of an input image.
+    method: the resize method. `area` is a meaningful, bwd-compat default.
+    antialias: see tf.image.resize. Ideally set to True for all new configs.
+
+  Returns:
+    A function that resizes an image and preserves its aspect ratio.
+
+  Note:
+    backwards-compat for "area"+antialias tested here:
+    (internal link)
+  """
+
+  def _resize_small(image):  # pylint: disable=missing-docstring
+    h, w = tf.shape(image)[0], tf.shape(image)[1]
+    factor = (
+        tf.cast(smaller_size, tf.float32) /
+        tf.cast(tf.minimum(h, w), tf.float32))
+    return _resize_factor(image, factor, method=method, antialias=antialias)
+  return _resize_small
+
+
+@Registry.register("preprocess_ops.resize_long")
+@utils.InKeyOutKey()
+def get_resize_long(longer_size, method="area", antialias=True):
+  """Resizes the longer side to `longer_size` keeping aspect ratio.
+
+  Args:
+    longer_size: an integer representing the new size of the longer side of
+      an input image.
+    method: the resize method. `area` is a meaningful, bwd-compat default.
+    antialias: see tf.image.resize. Ideally set to True for all new configs.
+
+  Returns:
+    A function that resizes an image and preserves its aspect ratio.
+  """
+
+  def _resize_long(image):  # pylint: disable=missing-docstring
+    h, w = tf.shape(image)[0], tf.shape(image)[1]
+    factor = (
+        tf.cast(longer_size, tf.float32) /
+        tf.cast(tf.maximum(h, w), tf.float32))
+    return _resize_factor(image, factor, method=method, antialias=antialias)
+  return _resize_long
+
+
+@Registry.register("preprocess_ops.inception_crop")
+@utils.InKeyOutKey()
+def get_inception_crop(size=None, area_min=5, area_max=100,
+                       method="bilinear", antialias=False):
+  """Makes inception-style image crop.
+
+  Inception-style crop is a random image crop (its size and aspect ratio are
+  random) that was used for training Inception models, see
+  https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.
+
+  Args:
+    size: Resize image to [size, size] after crop.
+    area_min: minimal crop area.
+    area_max: maximal crop area.
+    method: resize method, see tf.image.resize docs for options.
+    antialias: see tf.image.resize. Ideally set to True for all new configs.
+
+  Returns:
+    A function that applies an inception-style crop.
+  """
+
+  def _inception_crop(image):  # pylint: disable=missing-docstring
+    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
+        tf.shape(image),
+        tf.zeros([0, 0, 4], tf.float32),
+        area_range=(area_min / 100, area_max / 100),
+        min_object_covered=0,  # Don't enforce a minimum area.
+        use_image_if_no_bounding_boxes=True)
+    crop = tf.slice(image, begin, crop_size)
+    # Unfortunately, the above operation loses the depth-dimension. So we need
+    # to restore it the manual way.
+    crop.set_shape([None, None, image.shape[-1]])
+    if size:
+      crop = get_resize(size, method, antialias)({"image": crop})["image"]
+    return crop
+
+  return _inception_crop
+
+
+@Registry.register("preprocess_ops.decode_jpeg_and_inception_crop")
+@utils.InKeyOutKey()
+def get_decode_jpeg_and_inception_crop(size=None, area_min=5, area_max=100,
+                                       ratio_min=0.75, ratio_max=1.33,
+                                       method="bilinear", antialias=False):
+  """Decode jpeg string and make inception-style image crop.
+
+  Inception-style crop is a random image crop (its size and aspect ratio are
+  random) that was used for training Inception models, see
+  https://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf.
+
+  Args:
+    size: Resize image to [size, size] after crop.
+    area_min: minimal crop area.
+    area_max: maximal crop area.
+    ratio_min: minimal aspect ratio.
+    ratio_max: maximal aspect ratio.
+    method: resize method, see tf.image.resize docs for options.
+    antialias: see tf.image.resize. Ideally set to True for all new configs.
+
+  Returns:
+    A function that applies an inception-style crop.
+  """
+
+  def _inception_crop(image_data):  # pylint: disable=missing-docstring
+    shape = tf.image.extract_jpeg_shape(image_data)
+    begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
+        shape,
+        tf.zeros([0, 0, 4], tf.float32),
+        area_range=(area_min / 100, area_max / 100),
+        aspect_ratio_range=(ratio_min, ratio_max),
+        min_object_covered=0,  # Don't enforce a minimum area.
+        use_image_if_no_bounding_boxes=True)
+
+    # Crop the image to the specified bounding box.
+    offset_y, offset_x, _ = tf.unstack(begin)
+    target_height, target_width, _ = tf.unstack(crop_size)
+    crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+    image = tf.image.decode_and_crop_jpeg(image_data, crop_window, channels=3)
+
+    if size:
+      image = get_resize(size, method, antialias)({"image": image})["image"]
+
+    return image
+
+  return _inception_crop
+
+
+@Registry.register("preprocess_ops.random_crop")
+@utils.InKeyOutKey()
+def get_random_crop(crop_size):
+  """Makes a random crop of a given size.
+
+  Args:
+    crop_size: either an integer H, where H is both the height and width of
+      the random crop, or a list or tuple [H, W] of integers, where H and W
+      are height and width of the random crop respectively.
+
+  Returns:
+    A function that applies a random crop.
+  """
+  crop_size = utils.maybe_repeat(crop_size, 2)
+
+  def _crop(image):
+    return tf.image.random_crop(image, (*crop_size, image.shape[-1]))
+
+  return _crop
+
+
+@Registry.register("preprocess_ops.central_crop")
+@utils.InKeyOutKey()
+def get_central_crop(crop_size=None):
+  """Makes central crop of a given size.
+
+  Args:
+    crop_size: either an integer H, where H is both the height and width of
+      the central crop, or a list or tuple [H, W] of integers, where H and W
+      are height and width of the central crop respectively. If `crop_size`
+      is not specified, then the largest possible center crop will be taken.
+
+  Returns:
+    A function that applies a central crop.
+  """
+  if crop_size:
+    crop_size = utils.maybe_repeat(crop_size, 2)
+
+  def _crop(image):
+    if crop_size:
+      h, w = crop_size[0], crop_size[1]
+    else:
+      h = w = tf.minimum(tf.shape(image)[0], tf.shape(image)[1])
+    dy = (tf.shape(image)[0] - h) // 2
+    dx = (tf.shape(image)[1] - w) // 2
+    return tf.image.crop_to_bounding_box(image, dy, dx, h, w)
+
+  return _crop
+
+
+@Registry.register("preprocess_ops.flip_lr")
+@utils.InKeyOutKey()
+def get_random_flip_lr():
+  """Flips an image horizontally with probability 50%."""
+
+  def _random_flip_lr_pp(image):
+    return tf.image.random_flip_left_right(image)
+
+  return _random_flip_lr_pp
+
+
+@Registry.register("preprocess_ops.vgg_value_range")
+@utils.InKeyOutKey()
+def get_vgg_value_range(
+    mean=(0.485 * 255, 0.456 * 255, 0.406 * 255),
+    std=(0.229 * 255, 0.224 * 255, 0.225 * 255),
+):
+  """VGG-style preprocessing, subtracts mean and divides by stddev.
+
+  This preprocessing is very common for ImageNet pre-trained models since VGG,
+  and to this day the standard for models coming from most PyTorch codebases.
+
+  Args:
+    mean: Tuple of values to be subtracted. Defaults to widespread VGG values.
+    std: Tuple of values to be divided by. Defaults to widespread VGG values.
+
+  Returns:
+    A function to rescale the values.
+  """
+  mean = tf.constant(mean, tf.float32)
+  std = tf.constant(std, tf.float32)
+
+  def _vgg_value_range(image):
+    return (tf.cast(image, tf.float32) - mean) / std
+  return _vgg_value_range
+
+
+@Registry.register("preprocess_ops.clip_value_range")
+@utils.InKeyOutKey()
+def get_clip_value_range():
+  """Like vgg_value_range, but with CLIP's mean/std values."""
+  mean = (0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255)
+  std = (0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255)
+
+  def _clip_value_range(image):
+    return (tf.cast(image, tf.float32) - mean) / std
+  return _clip_value_range
+
+
+@Registry.register("preprocess_ops.convert_to_video")
+@utils.InKeyOutKey()
+def get_convert_to_video(num_frames):
+  """Converts an image to a video with zero padded frames.
+
+  Args:
+    num_frames: total number of frames that the video should have.
+
+  Returns:
+    A function for converting an image to a video.
+  """
+
+  def _convert_to_video(image):
+    return tf.pad(
+        tf.expand_dims(image, axis=0),
+        [[0, num_frames - 1], [0, 0], [0, 0], [0, 0]],
+    )
+
+  return _convert_to_video
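
A hedged usage sketch (not part of the commit) of how the ops above compose as plain dict-to-dict functions; it mirrors the calls exercised in the test file that follows:

```python
import tensorflow as tf
import big_vision.pp.ops_image as pp

data = {"image": tf.zeros((640, 480, 3), tf.uint8)}
data = pp.get_resize_small(256)(data)  # Shorter side -> 256, aspect kept.
data = pp.get_central_crop(224)(data)  # 224x224 center crop.
data = pp.get_vgg_value_range()(data)  # float32, VGG mean/std.
print(data["image"].shape)             # (224, 224, 3)
```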
Tipsomaly/model/big_vision/pp/ops_image_test.py ADDED
@@ -0,0 +1,82 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ops_image."""
+
+import copy
+import io
+
+import big_vision.pp.ops_image as pp
+import matplotlib.pyplot as plt
+import numpy as np
+import tensorflow as tf
+
+
+def get_image_data():
+  img = tf.random.uniform((640, 480, 3), 0, 255, tf.int32)  # Can't ask uint8!?
+  return {"image": tf.cast(img, tf.uint8)}
+
+
+class PreprocessOpsTest(tf.test.TestCase):
+
+  def tfrun(self, ppfn, data):
+    # Run once as standalone, as could happen eg in colab.
+    yield {k: np.array(v) for k, v in ppfn(copy.deepcopy(data)).items()}
+
+    # And then once again as part of tfdata pipeline.
+    # You'd be surprised how much these two differ!
+    tfdata = tf.data.Dataset.from_tensors(copy.deepcopy(data))
+    for npdata in tfdata.map(ppfn).as_numpy_iterator():
+      yield npdata
+
+  def test_resize(self):
+    for data in self.tfrun(pp.get_resize([120, 80]), get_image_data()):
+      self.assertEqual(data["image"].shape, (120, 80, 3))
+
+  def test_resize_small(self):
+    for data in self.tfrun(pp.get_resize_small(240), get_image_data()):
+      self.assertEqual(data["image"].shape, (320, 240, 3))
+
+  def test_resize_long(self):
+    for data in self.tfrun(pp.get_resize_long(320), get_image_data()):
+      self.assertEqual(data["image"].shape, (320, 240, 3))
+
+  def test_inception_crop(self):
+    for data in self.tfrun(pp.get_inception_crop(), get_image_data()):
+      self.assertEqual(data["image"].shape[-1], 3)
+
+  def test_decode_jpeg_and_inception_crop(self):
+    f = io.BytesIO()
+    plt.imsave(f, get_image_data()["image"].numpy(), format="jpg")
+    data = {"image": tf.cast(f.getvalue(), tf.string)}
+    for data in self.tfrun(pp.get_decode_jpeg_and_inception_crop(), data):
+      self.assertEqual(data["image"].shape[-1], 3)
+
+  def test_random_crop(self):
+    for data in self.tfrun(pp.get_random_crop([120, 80]), get_image_data()):
+      self.assertEqual(data["image"].shape, (120, 80, 3))
+
+  def test_central_crop(self):
+    for data in self.tfrun(pp.get_central_crop([20, 80]), get_image_data()):
+      self.assertEqual(data["image"].shape, (20, 80, 3))
+
+  def test_random_flip_lr(self):
+    data_orig = get_image_data()
+    for data in self.tfrun(pp.get_random_flip_lr(), data_orig):
+      self.assertTrue(
+          np.all(data_orig["image"].numpy() == data["image"]) or
+          np.all(data_orig["image"].numpy() == data["image"][:, ::-1]))
+
+if __name__ == "__main__":
+  tf.test.main()
Tipsomaly/model/big_vision/pp/ops_text.py ADDED
@@ -0,0 +1,411 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Text-centric preprocessing ops.
+
+All preprocessing ops should return a data-processing functor. A data example
+is represented as a dictionary of (TF) tensors. The functors output a modified
+dictionary.
+
+A commonly used key for the tokenized output is "labels".
+"""
+import functools
+import importlib
+import string
+
+from absl import logging
+from big_vision.datasets.imagenet import class_names as imagenet_class_names
+from big_vision.pp import ops_general
+from big_vision.pp import tokenizer as bv_tok
+from big_vision.pp import utils
+from big_vision.pp.registry import Registry
+import tensorflow as tf
+
+from tensorflow.io import gfile
+
+import sentencepiece
+SPProcessor = sentencepiece.SentencePieceProcessor
+
+import os
+os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+import sentencepiece.sentencepiece_model_pb2
+del os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION']
+SPModelProto = sentencepiece.sentencepiece_model_pb2.ModelProto
+
+
+# TODO: b/lbeyer - softly introduce and move to new tokenizer API.
+
+KNOWN_TOKENIZERS = {
+    "mc4":  # used in multilingual models (mT5, PaLI), vocab_size=250_000
+        "gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model",
+    "cc_all":  # vocab_size=32_000
+        "gs://t5-data/vocabs/cc_all.32000/sentencepiece.model",
+    "c4_en":  # vocab_size=32_000
+        "gs://t5-data/vocabs/cc_en.32000/sentencepiece.model",
+    "t5":  # same as cc_all, but with 100 extra dummy tokens used by T5 models
+        "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model",
+    "mt5":  # same as mc4, but with 100 extra dummy tokens used by T5 models
+        "gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model",
+}
+
+
+def create_tokenizer(model="c4_en", add_eos=True, add_bos=False):
+  """Creates a tokenizer which can be used in tfds."""
+  logging.info("Creating tokenizer: %s", model)
+  with gfile.GFile(KNOWN_TOKENIZERS.get(model, model), "rb") as f:
+    model = f.read()
+
+  # Lazy import of tensorflow_text so it is an optional dependency for
+  # the users of this file.
+  import tensorflow_text
+  return tensorflow_text.SentencepieceTokenizer(
+      model=model, add_eos=add_eos, add_bos=add_bos
+  )
+
+
+def tokenize(input_text, tokenizer, max_len, *, pad_value, force_eos,
+             multi_text=False):
+  """Tokenizes string, truncating to `max_len` and padding with `pad_value`."""
+
+  def pad(tokens):
+    # Truncate/pad to max_len.
+    if force_eos:
+      tokens = tf.cond(
+          tf.shape(tokens)[0] >= max_len,
+          lambda: tf.concat(
+              # For too long, cut them off, but do keep the final EOS token.
+              [tokens[:max_len - 1], tokens[-1:]], axis=0),
+          lambda: tf.pad(
+              tokens, [(0, max_len - tf.shape(tokens)[0])],
+              constant_values=pad_value),
+      )
+    else:
+      tokens = tokens[:max_len]
+      tokens = tf.pad(
+          tokens, [(0, max_len - tf.shape(tokens)[0])],
+          constant_values=pad_value)
+    tokens.set_shape([max_len])
+    return tokens
+
+  tokens = tokenizer.tokenize(input_text)
+
+  if multi_text:
+    tokens = tokens.to_tensor(pad_value)  # tf.RaggedTensor to tf.Tensor
+    tokens = tf.reshape(tokens, [-1, tf.shape(tokens)[-1]])
+    tokens = tf.map_fn(pad, tokens)  # `map_fn` only maps on axis 0
+
+    final_shape = tf.concat([tf.shape(input_text), [max_len]], axis=0)
+    return tf.reshape(tokens, final_shape)
+  else:
+    return pad(tokens)
+
+
+@Registry.register("preprocess_ops.tokenize")
+@utils.InKeyOutKey(indefault=None, outdefault="labels")
+def get_pp_tokenize(
+    max_len,
+    eos,
+    model="c4_en",
+    lower=True,
+    sample_if_multi=True,
+    pad_value="<pad>",
+    add_bos=False
+):
+  """Tokenizes a text.
+
+  Let's assume max_len=3 and id("</s>")=1, id("a")=2, then we have
+
+  1. `eos="none", pad_value=0`:
+     - "a" -> [2, 0, 0]
+     - "aa" -> [2, 2, 0]
+     - "aaa" -> [2, 2, 2]
+
+  2. `eos="yes", pad_value=0`:
+     - "a" -> [2, 1, 0]
+     - "aa" -> [2, 2, 1]
+     - "aaa" -> [2, 2, 2]
+
+  This is usually used with generative models that need to learn when to
+  properly predict a "</s>" (when the sentence is finished) and when to
+  abstain (when the sentence is truncated).
+
+  3. `eos="sticky", pad_value=0`:
+     - "a" -> [2, 1, 0]
+     - "aa" -> [2, 2, 1]
+     - "aaa" -> [2, 2, 1]
+
+  4. `eos="sticky", pad_value=1`:
+     - "a" -> [2, 1, 1]
+     - "aa" -> [2, 2, 1]
+     - "aaa" -> [2, 2, 1]
+
+  This is traditionally used with contrastive models that use the last token
+  for embeddings, similarly to "cls" tokens in BERT-style models.
+
+  Args:
+    max_len: maximum length of the tokenized text.
+    eos: Whether to add an "</s>" (end of sentence) token and whether to keep
+      it when the sequence is longer than `max_len - 1`. See examples above
+      for details. Valid values: "none", "yes", "sticky".
+    model: a path to the pretrained sentencepiece model.
+    lower: lowercase the text before tokenizing.
+    sample_if_multi: if the input contains multiple texts, randomly pick one
+      when True; otherwise tokenize all texts and keep the input's batch
+      shape in the result.
+    pad_value: which token to pad the sequence with. If a string (for example
+      `"<pad>"`), tokenize it and use its first token. Note that there is no
+      guarantee to have any padding at the end of the sentence, if the
+      sentence is longer than `max_len`.
+    add_bos: adds beginning of sentence symbol.
+
+  Returns:
+    an op that outputs tokenized text.
+  """
+
+  if eos not in ("yes", "none", "sticky"):
+    raise ValueError(f"Invalid value for eos: '{eos}'.")
+
+  tokenizer = create_tokenizer(model, add_eos=eos != "none", add_bos=add_bos)
+
+  if isinstance(pad_value, str):
+    pad_value = tokenizer.string_to_id(pad_value)
+
+  def _pp_tokenize(txt):
+    if sample_if_multi and tf.convert_to_tensor(txt).ndim:
+      # TODO: I wish this code-path could die.
+      logging.warning("sample_if_multi is deprecated and will be removed. "
+                      "Call `choice` (and maybe `setdefault`) instead.")
+      txt = ops_general.get_choice(key="t")(
+          ops_general.get_setdefault("t", "")({"t": txt}))["t"]
+
+    if lower:
+      txt = tf.strings.lower(txt) if sample_if_multi else tf.map_fn(
+          tf.strings.lower, txt)
+
+    return tokenize(
+        txt,
+        tokenizer,
+        max_len,
+        pad_value=pad_value,
+        force_eos=eos == "sticky",
+        multi_text=not sample_if_multi)
+
+  return _pp_tokenize
+
+
+@Registry.register("preprocess_ops.coco_captions")
+def get_coco_captions(outkey="captions"):
+  """Extracts coco's captions from nested dict."""
+
+  def _pp_coco_captions(data):
+    data[outkey] = data["captions"]["text"]
+    return data
+
+  return _pp_coco_captions
+
+
+@Registry.register("preprocess_ops.clip_i1k_label_names")
+@utils.InKeyOutKey(indefault="label", outdefault="labels")
+def get_pp_clip_i1k_label_names():
+  """Converts i1k label numbers to strings, using CLIP's class names."""
+
+  def _pp_imagenet_labels(label):
+    return tf.gather(imagenet_class_names.CLIP_IMAGENET_CLASS_NAMES, label)
+
+  return _pp_imagenet_labels
+
+
+@Registry.register("preprocess_ops.i21k_label_names")
+@utils.InKeyOutKey(indefault="label", outdefault="labels")
+def get_pp_i21k_label_names():
+  """Converts i21k label ids to strings."""
+
+  def _pp_imagenet_labels(label):
+    return tf.gather(imagenet_class_names.IMAGENET21k_CLASS_NAMES, label)
+
+  return _pp_imagenet_labels
+
+
+@Registry.register("preprocess_ops.lower")
+@utils.InKeyOutKey(indefault="text", outdefault="text")
+def get_lower():
+  """Lowercases text feature."""
+
+  def _pp_lower(text):
+    return tf.strings.lower(text)
+
+  return _pp_lower
+
+
+@Registry.register("preprocess_ops.strfmt")
+def get_strfmt(template, outkey="text"):
+  """Formats a string template with content from the data dict."""
+
+  def _template(data):
+    outputs = []
+    parts = string.Formatter().parse(template)
+    for (literal_text, field_name, format_spec, conversion) in parts:
+      # For now, we keep it simple and don't support fancy format specs.
+      # But we can add support to that via py_func as soon as we need it.
+      assert not format_spec and not conversion
+      outputs.append(tf.constant(literal_text))
+      if field_name:
+        value = data[field_name]
+        # Convert any non-strings (numbers, vectors) to a string.
+        if tf.convert_to_tensor(value).dtype != tf.string:
+          value = tf.strings.format("{}", value, summarize=-1)
+        outputs.append(value)
+    data[outkey] = tf.strings.join(outputs)
+    return data
+
+  return _template
+
+
+def _add_pieces(model_bytes, extra_pieces):
+  """Adds extra pieces to the sentencepiece model given by `model_bytes`."""
+
+  model = SPProcessor()
+  model.LoadFromSerializedProto(model_bytes)
+  unk_idx = model.PieceToId("<unk>")
+  assert model.IdToPiece(unk_idx) == "<unk>", model.IdToPiece(unk_idx)
+
+  model_proto = SPModelProto.FromString(model_bytes)
+  idx_to_updated_piece = {}
+  for piece in extra_pieces:
+    # The SentencePieceModel proto stores whitespaces as the special
+    # character '▁'. We perform the conversion here.
+    piece = piece.replace(" ", "▁")
+    spiece = model_proto.SentencePiece(
+        piece=piece,
+        # We set the highest score to force priority on user defined tokens.
+        score=0.0,
+        type=model_proto.SentencePiece().Type.USER_DEFINED,
+    )
+    existing_idx = model.PieceToId(piece)
+    if (existing_idx != unk_idx) ^ (piece == "<unk>"):
+      idx_to_updated_piece[existing_idx] = spiece
+      logging.info("Updating token at idx %d: %s", existing_idx, spiece.piece)
+    else:
+      model_proto.pieces.append(spiece)
+
+  # Replace duplicated pieces with updated ones.
+  updated_pieces = [
+      idx_to_updated_piece.get(i, piece)
+      for i, piece in enumerate(model_proto.pieces)
+  ]
+  del model_proto.pieces[:]
+  model_proto.pieces.extend(updated_pieces)
+
+  return model_proto.SerializeToString()
+
+
+def _iterable(x):
+  if isinstance(x, tf.RaggedTensor):
+    return True
+  if getattr(x, "ndim", 0) > 1:  # np, jnp
+    return True
+  if isinstance(x, (list, tuple)) and not isinstance(x[0], (int, float)):
+    return True
+  return False
+
+
+@Registry.register("tokenizers.sp")
+class SentencepieceTokenizer(bv_tok.Tokenizer):
+  """Wraps a `tftext.SentencepieceTokenizer`.
+
+  If you plan to use this tokenizer, please familiarize yourself with the
+  test cases first. This is likely to save you a lot of trouble down the
+  road, trust me!
+  """
+
+  def __init__(self, model, tokensets=()):
+    with gfile.GFile(KNOWN_TOKENIZERS.get(model, model), "rb") as f:
+      model_bytes = f.read()
+    extras = bv_tok.get_extra_tokens(tokensets)
+    model_bytes = _add_pieces(model_bytes, extras)
+    self._tok_sp = SPProcessor()
+    self._tok_sp.LoadFromSerializedProto(model_bytes)
+    self.extras = {self._tok_sp.PieceToId(x): x for x in extras}
+
+  def to_int(self, text, *, bos=False, eos=False):
+    def _single(s):
+      return (
+          ([self.bos_token] if bos else []) +
+          self._tok_sp.EncodeAsIds(s) +
+          ([self.eos_token] if eos else [])
+      )
+    if isinstance(text, str):
+      return _single(text)
+    return type(text)([_single(s) for s in text])
+
+  def to_str(self, tokens, *, stop_at_eos=True):
+    def _single(toks):
+      toks = [int(t) for t in toks]  # We really need this for DecodeIds.
+      if stop_at_eos:
+        try:  # SentencePiece strips eos, but does not stop at it, so we do.
+          toks = toks[:toks.index(self.eos_token)]
+        except ValueError:  # No eos token found, nothing to do.
+          pass
+      return self._tok_sp.DecodeIds(toks)
+    if _iterable(tokens):
+      return [_single(toks) for toks in tokens]
+    return _single(tokens)
+
+  def _check_known(self, piece):
+    if (id_ := self._tok_sp.PieceToId(piece)) == self._tok_sp.unk_id():
+      logging.error("Piece '%s' is not known (unk=%s)!", piece, id_)
+    return id_
+
+  def to_piece(self, idx):
+    return self._tok_sp.IdToPiece(int(idx))
+
+  @property
+  def pad_token(self):
+    return self._tok_sp.pad_id()
+
+  @property
+  def eos_token(self):
+    return self._tok_sp.eos_id()
+
+  @property
+  def bos_token(self):
+    return self._tok_sp.bos_id()
+
+  @property
+  def vocab_size(self):
+    return self._tok_sp.GetPieceSize()
+
+  # For the _tf_op variants, we need a lot of wrapping boilerplate.
+
+  def to_int_tf_op(self, text, *, bos=False, eos=False):
+    text = tf.convert_to_tensor(text)
+    if text.ndim == 0:
+      def fn(txt):
+        s = txt.numpy().decode()
+        return tf.constant(self.to_int(s, bos=bos, eos=eos), tf.int32)
+      return tf.py_function(fn, [text], tf.int32)
+    else:
+      def fn(txt):
+        strings = [s.decode() for s in txt.numpy().tolist()]
+        toks = self.to_int(strings, bos=bos, eos=eos)
+        return tf.ragged.constant(toks)
+      out_type = tf.RaggedTensorSpec([tf.shape(text)[0], None], tf.int32)
+      return tf.py_function(fn, [text], Tout=out_type)
+
+  def to_str_tf_op(self, tokens, *, stop_at_eos=True):
+    def single(t):
+      fn = functools.partial(self.to_str, stop_at_eos=stop_at_eos)
+      return tf.numpy_function(fn, [t], tf.string, stateful=False)
+    if _iterable(tokens):
+      return tf.map_fn(single, tokens, tf.string)
+    return single(tokens)
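
To make the `eos` modes concrete, here is a hedged sketch (not part of the commit; `some_spm.model` is a placeholder sentencepiece model path) of case 3 from the `get_pp_tokenize` docstring:

```python
import tensorflow as tf
import big_vision.pp.ops_text as ops_text

# The inkey/outkey kwargs come from the InKeyOutKey decorator.
op = ops_text.get_pp_tokenize(
    max_len=3, eos="sticky", model="some_spm.model", pad_value=0,
    inkey="text", outkey="labels")
out = op({"text": tf.constant("a")})
# Assuming id("</s>")=1 and id("a")=2, out["labels"] is [2, 1, 0].
```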
Tipsomaly/model/big_vision/pp/ops_text_test.py ADDED
@@ -0,0 +1,200 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ops_text."""
+
+import copy
+
+from absl.testing import parameterized
+import big_vision.pp.ops_text as pp
+from big_vision.pp.registry import Registry
+import numpy as np
+import tensorflow as tf
+
+
+class PyToTfWrapper:
+  """Allows to use `to_{int,str}_tf_op()` via `to_{int,str}()`."""
+
+  def __init__(self, tok):
+    self.tok = tok
+    self.bos_token = tok.bos_token
+    self.eos_token = tok.eos_token
+    self.vocab_size = tok.vocab_size
+
+  def to_int(self, text, *, bos=False, eos=False):
+    ret = self.tok.to_int_tf_op(text, bos=bos, eos=eos)
+    if isinstance(ret, tf.RaggedTensor):
+      return [t.numpy().tolist() for t in ret]
+    return ret.numpy().tolist()
+
+  def to_str(self, tokens, stop_at_eos=True):
+    ret = self.tok.to_str_tf_op(
+        tf.ragged.constant(tokens),
+        stop_at_eos=stop_at_eos,
+    )
+    if ret.ndim == 0:
+      return ret.numpy().decode()
+    return [t.numpy().decode() for t in ret]
+
+
+class PpOpsTest(tf.test.TestCase, parameterized.TestCase):
+
+  def tfrun(self, ppfn, data):
+    # Run once as standalone, as could happen eg in colab.
+    yield {k: np.array(v) for k, v in ppfn(copy.deepcopy(data)).items()}
+
+    # And then once again as part of tfdata pipeline.
+    # You'd be surprised how much these two differ!
+    tfdata = tf.data.Dataset.from_tensors(copy.deepcopy(data))
+    for npdata in tfdata.map(ppfn).as_numpy_iterator():
+      yield npdata
+
+  def testtok(self):
+    # https://github.com/google/sentencepiece/blob/master/python/test/test_model.model
+    return "test_model.model"  # Should we just commit it? It's 200kB
+
+  def test_get_pp_clip_i1k_label_names(self):
+    op = pp.get_pp_clip_i1k_label_names()
+    labels = op({"label": tf.constant([0, 1])})["labels"].numpy().tolist()
+    self.assertAllEqual(labels, ["tench", "goldfish"])
+
+  def test_get_pp_i21k_label_names(self):
+    op = pp.get_pp_i21k_label_names()
+    labels = op({"label": tf.constant([0, 1])})["labels"].numpy().tolist()
+    self.assertAllEqual(labels, ["organism", "benthos"])
+
+  @parameterized.parameters((b"Hello world ScAlAr!", b"hello world scalar!"),
+                            (["Decoded Array!"], ["decoded array!"]),
+                            ([b"aA", "bB"], [b"aa", "bb"]))
+  def test_get_lower(self, inputs, expected_output):
+    op = pp.get_lower()
+    out = op({"text": tf.constant(inputs)})
+    self.assertAllEqual(out["text"].numpy(), np.array(expected_output))
+
+  @parameterized.named_parameters(
+      ("py", False),
+      ("tf", True),
+  )
+  def test_sentencepiece_tokenizer(self, wrap_tok):
+    tok = pp.SentencepieceTokenizer(self.testtok())
+    if wrap_tok:
+      tok = PyToTfWrapper(tok)
+    self.assertEqual(tok.vocab_size, 1000)
+    bos, eos = tok.bos_token, tok.eos_token
+    self.assertEqual(bos, 1)
+    self.assertEqual(eos, 2)
+    # Note: test model does NOT have a <pad> token (similar to e.g. "mistral").
+    # `.to_int()` wraps `.to_int_tf_op()` which is thus also tested
+    self.assertEqual(tok.to_int("blah"), [80, 180, 60])
+    self.assertEqual(tok.to_int("blah", bos=True), [bos, 80, 180, 60])
+    self.assertEqual(tok.to_int("blah", eos=True), [80, 180, 60, eos])
+    self.assertEqual(
+        tok.to_int("blah", bos=True, eos=True), [bos, 80, 180, 60, eos]
+    )
+    self.assertEqual(
+        tok.to_int(["blah", "blah blah"]),
+        [[80, 180, 60], [80, 180, 60, 80, 180, 60]],
+    )
+    # inverse of above
+    # `.to_str()` wraps `.to_str_tf_op()` which is thus also tested
+    self.assertEqual(tok.to_str([80, 180, 60]), "blah")
+    self.assertEqual(tok.to_str([1, 80, 180, 60]), "blah")
+    self.assertEqual(tok.to_str([80, 180, 60, 2]), "blah")
+    self.assertEqual(
+        tok.to_str([[80, 180, 60], [80, 180, 60, 80, 180, 60]]),
+        ["blah", "blah blah"],
+    )
+
+  def test_sentencepiece_tokenizer_tf_op_ndarray_input(self):
+    tok = pp.SentencepieceTokenizer(self.testtok())
+    bos, eos = tok.bos_token, tok.eos_token
+    arr = np.array([[bos, 80, 180, 60, eos]] * 2, dtype=np.int32)
+    self.assertEqual(tok.to_str_tf_op(arr).numpy().tolist(), [b"blah"] * 2)
+
+  def test_sentencepiece_tokenizer_tokensets(self):
+    tok = pp.SentencepieceTokenizer(self.testtok(), tokensets=["loc"])
+    self.assertEqual(tok.vocab_size, 2024)
+    self.assertEqual(
+        tok.to_int("blah<loc0000><loc1023>"), [80, 180, 60, 1000, 2023]
+    )
+
+  def test_sentencepiece_stop_at_eos(self):
+    tok = pp.SentencepieceTokenizer(self.testtok())
+    self.assertEqual(tok.to_str([80, 180, 60], stop_at_eos=False), "blah")
+    eos = tok.eos_token
+    self.assertEqual(tok.to_str([80, eos, 180, 60], stop_at_eos=False), "blah")
+    self.assertEqual(tok.to_str([80, eos, 180, 60], stop_at_eos=True), "b")
+    self.assertEqual(
+        tok.to_str([[80, eos, 180, 60], [80, 180, eos, 60]], stop_at_eos=True),
+        ["b", "bla"]
+    )
+
+  def test_sentencepiece_extra_tokens(self):
+    tok = pp.SentencepieceTokenizer(self.testtok())
+    self.assertEqual(tok.to_str([1, 80, 180, 60, 2], stop_at_eos=False), "blah")
+    tok = pp.SentencepieceTokenizer(
+        self.testtok(), tokensets=["sp_extra_tokens"]
+    )
+    self.assertEqual(tok.vocab_size, 1001)  # Also added the <pad> token.
+    self.assertEqual(
+        tok.to_str([1, 80, 180, 60, 2], stop_at_eos=False), "<s> blah</s>"
+    )
+
+  def test_strfmt(self):
+    data = {
+        "int": tf.constant(42, tf.uint8),
+        "float": tf.constant(3.14, tf.float32),
+        "vec": tf.range(3),
+        "empty_str": tf.constant(""),
+        "regex_problem1": tf.constant(r"no \replace pattern"),
+        "regex_problem2": tf.constant(r"yes \1 pattern"),
+    }
+    for out in self.tfrun(pp.get_strfmt("Nothing"), data):
+      self.assertEqual(out["text"], b"Nothing")
+    for out in self.tfrun(pp.get_strfmt("{int}"), data):
+      self.assertEqual(out["text"], b"42")
+    for out in self.tfrun(pp.get_strfmt("A{int}"), data):
+      self.assertEqual(out["text"], b"A42")
+    for out in self.tfrun(pp.get_strfmt("{int}A"), data):
+      self.assertEqual(out["text"], b"42A")
+    for out in self.tfrun(pp.get_strfmt("{int}{int}"), data):
+      self.assertEqual(out["text"], b"4242")
+    for out in self.tfrun(pp.get_strfmt("A{int}A{int}A"), data):
+      self.assertEqual(out["text"], b"A42A42A")
+    for out in self.tfrun(pp.get_strfmt("A{float}A"), data):
+      self.assertEqual(out["text"], b"A3.14A")
+    for out in self.tfrun(pp.get_strfmt("A{float}A{int}"), data):
+      self.assertEqual(out["text"], b"A3.14A42")
+    for out in self.tfrun(pp.get_strfmt("A{vec}A"), data):
+      self.assertEqual(out["text"], b"A[0 1 2]A")
+    for out in self.tfrun(pp.get_strfmt("A{empty_str}A"), data):
+      self.assertEqual(out["text"], b"AA")
+    for out in self.tfrun(pp.get_strfmt("{empty_str}"), data):
+      self.assertEqual(out["text"], b"")
+    for out in self.tfrun(pp.get_strfmt("A{regex_problem1}A"), data):
+      self.assertEqual(out["text"], br"Ano \replace patternA")
+    for out in self.tfrun(pp.get_strfmt("A{regex_problem2}A"), data):
+      self.assertEqual(out["text"], br"Ayes \1 patternA")
+
+
+@Registry.register("tokensets.sp_extra_tokens")
+def _get_sp_extra_tokens():
+  # For sentencepiece, adding these tokens will make them visible when
+  # decoding. If a token is not found (e.g. "<pad>" is not found in
+  # "mistral"), then it is added to the vocabulary, increasing the
+  # vocab_size accordingly.
+  return ["<s>", "</s>", "<pad>"]
+
+
+if __name__ == "__main__":
+  tf.test.main()
Tipsomaly/model/big_vision/pp/registry.py ADDED
@@ -0,0 +1,163 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Global Registry for big_vision pp ops.
+
+Author: Joan Puigcerver (jpuigcerver@)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+import contextlib
+import functools
+
+
+def parse_name(string_to_parse):
+  """Parses input to the registry's lookup function.
+
+  Args:
+    string_to_parse: can be either an arbitrary name or function call
+      (optionally with positional and keyword arguments),
+      e.g. "multiclass", "resnet50_v2(filters_factor=8)".
+
+  Returns:
+    A tuple of input name, argument tuple and a keyword argument dictionary.
+    Examples:
+      "multiclass" -> ("multiclass", (), {})
+      "resnet50_v2(9, filters_factor=4)" ->
+          ("resnet50_v2", (9,), {"filters_factor": 4})
+
+  Author: Joan Puigcerver (jpuigcerver@)
+  """
+  expr = ast.parse(string_to_parse, mode="eval").body  # pytype: disable=attribute-error
+  if not isinstance(expr, (ast.Attribute, ast.Call, ast.Name)):
+    raise ValueError(
+        "The given string should be a name or a call, but a {} was parsed "
+        "from the string {!r}".format(type(expr), string_to_parse))
+
+  # Notes:
+  # name="some_name" -> type(expr) = ast.Name
+  # name="module.some_name" -> type(expr) = ast.Attribute
+  # name="some_name()" -> type(expr) = ast.Call
+  # name="module.some_name()" -> type(expr) = ast.Call
+
+  if isinstance(expr, ast.Name):
+    return string_to_parse, (), {}
+  elif isinstance(expr, ast.Attribute):
+    return string_to_parse, (), {}
+
+  def _get_func_name(expr):
+    if isinstance(expr, ast.Attribute):
+      return _get_func_name(expr.value) + "." + expr.attr
+    elif isinstance(expr, ast.Name):
+      return expr.id
+    else:
+      raise ValueError(
+          "Type {!r} is not supported in a function name, the string to "
+          "parse was {!r}".format(type(expr), string_to_parse))
+
+  def _get_func_args_and_kwargs(call):
+    args = tuple([ast.literal_eval(arg) for arg in call.args])
+    kwargs = {
+        kwarg.arg: ast.literal_eval(kwarg.value) for kwarg in call.keywords
+    }
+    return args, kwargs
+
+  func_name = _get_func_name(expr.func)
+  func_args, func_kwargs = _get_func_args_and_kwargs(expr)
+
+  return func_name, func_args, func_kwargs
+
+
+class Registry(object):
+  """Implements global Registry.
+
+  Authors: Joan Puigcerver (jpuigcerver@), Alexander Kolesnikov (akolesnikov@)
+  """
+
+  _GLOBAL_REGISTRY = {}
+
+  @staticmethod
+  def global_registry():
+    return Registry._GLOBAL_REGISTRY
+
+  @staticmethod
+  def register(name, replace=False):
+    """Creates a function that registers its input."""
+
+    def _register(item):
+      if name in Registry.global_registry() and not replace:
+        raise KeyError("The name {!r} was already registered.".format(name))
+
+      Registry.global_registry()[name] = item
+      return item
+
+    return _register
+
+  @staticmethod
+  def lookup(lookup_string, kwargs_extra=None):
+    """Looks up a name in the registry."""
+
+    try:
+      name, args, kwargs = parse_name(lookup_string)
+    except ValueError as e:
+      raise ValueError(f"Error parsing:\n{lookup_string}") from e
+    if kwargs_extra:
+      kwargs.update(kwargs_extra)
+    item = Registry.global_registry()[name]
+    return functools.partial(item, *args, **kwargs)
+
+  @staticmethod
+  def knows(lookup_string):
+    try:
+      name, _, _ = parse_name(lookup_string)
+    except ValueError as e:
+      raise ValueError(f"Error parsing:\n{lookup_string}") from e
+    return name in Registry.global_registry()
+
+
+@contextlib.contextmanager
+def temporary_ops(**kw):
+  """Registers specified pp ops for use in a `with` block.
+
+  Example use:
+
+    with pp_registry.temporary_ops(
+        pow=lambda alpha: lambda d: {k: v**alpha for k, v in d.items()}):
+      pp = pp_builder.get_preprocess_fn("pow(alpha=2.0)|pow(alpha=0.5)")
+      features = pp(features)
+
+  Args:
+    **kw: Names are preprocess string function names to be used to specify
+      the preprocess function. Values are functions that can be called with
+      params (e.g. the `alpha` param in above example) and return functions
+      to be used to transform features.
+
+  Yields:
+    A context manager to be used in a `with` statement.
+  """
+  reg = Registry.global_registry()
+  kw = {f"preprocess_ops.{k}": v for k, v in kw.items()}
+  for k in kw:
+    assert k not in reg
+  for k, v in kw.items():
+    reg[k] = v
+  try:
+    yield
+  finally:
+    for k in kw:
+      del reg[k]
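
A minimal sketch (not part of the commit; the op name `preprocess_ops.scale` is made up) of the register/lookup round trip the ops files above rely on:

```python
from big_vision.pp.registry import Registry

@Registry.register("preprocess_ops.scale")
def get_scale(factor=2):
  def _scale(data):
    data["image"] = data["image"] * factor
    return data
  return _scale

# lookup() parses the call string via parse_name() and returns a
# functools.partial over the registered getter; calling it yields the
# actual data->data function.
op = Registry.lookup("preprocess_ops.scale(factor=3)")()
```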
Tipsomaly/model/big_vision/pp/registry_test.py ADDED
@@ -0,0 +1,128 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for registry."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from unittest import mock
+
+from absl.testing import absltest
+from big_vision.pp import registry
+
+
+class RegistryTest(absltest.TestCase):
+
+  def setUp(self):
+    super(RegistryTest, self).setUp()
+    # Mock global registry in each test to keep them isolated and allow for
+    # concurrent tests.
+    self.addCleanup(mock.patch.stopall)
+    self.global_registry = dict()
+    self.mocked_method = mock.patch.object(
+        registry.Registry, "global_registry",
+        return_value=self.global_registry).start()
+
+  def test_parse_name(self):
+    name, args, kwargs = registry.parse_name("f")
+    self.assertEqual(name, "f")
+    self.assertEqual(args, ())
+    self.assertEqual(kwargs, {})
+
+    name, args, kwargs = registry.parse_name("f()")
+    self.assertEqual(name, "f")
+    self.assertEqual(args, ())
+    self.assertEqual(kwargs, {})
+
+    name, args, kwargs = registry.parse_name("func(a=0,b=1,c='s')")
+    self.assertEqual(name, "func")
+    self.assertEqual(args, ())
+    self.assertEqual(kwargs, {"a": 0, "b": 1, "c": "s"})
+
+    name, args, kwargs = registry.parse_name("func(1,'foo',3)")
+    self.assertEqual(name, "func")
+    self.assertEqual(args, (1, "foo", 3))
+    self.assertEqual(kwargs, {})
+
+    name, args, kwargs = registry.parse_name("func(1,'2',a=3,foo='bar')")
+    self.assertEqual(name, "func")
+    self.assertEqual(args, (1, "2"))
+    self.assertEqual(kwargs, {"a": 3, "foo": "bar"})
+
+    name, args, kwargs = registry.parse_name("foo.bar.func(a=0,b=(1),c='s')")
+    self.assertEqual(name, "foo.bar.func")
+    self.assertEqual(kwargs, dict(a=0, b=1, c="s"))
+
+    with self.assertRaises(SyntaxError):
+      registry.parse_name("func(0")
+    with self.assertRaises(SyntaxError):
+      registry.parse_name("func(a=0,,b=0)")
+    with self.assertRaises(SyntaxError):
+      registry.parse_name("func(a=0,b==1,c='s')")
+    with self.assertRaises(ValueError):
+      registry.parse_name("func(a=0,b=undefined_name,c='s')")
+
+  def test_register(self):
+    # pylint: disable=unused-variable
+    @registry.Registry.register("func1")
+    def func1():
+      pass
+
+    self.assertLen(registry.Registry.global_registry(), 1)
+
+  def test_lookup_function(self):
+
+    @registry.Registry.register("func1")
+    def func1(arg1, arg2, arg3):  # pylint: disable=unused-variable
+      return arg1, arg2, arg3
+
+    self.assertTrue(callable(registry.Registry.lookup("func1")))
+    self.assertEqual(registry.Registry.lookup("func1")(1, 2, 3), (1, 2, 3))
+    self.assertEqual(
+        registry.Registry.lookup("func1(arg3=9)")(1, 2), (1, 2, 9))
+    self.assertEqual(
+        registry.Registry.lookup("func1(arg2=9,arg1=99)")(arg3=3), (99, 9, 3))
+    self.assertEqual(
+        registry.Registry.lookup("func1(arg2=9,arg1=99)")(arg1=1, arg3=3),
+        (1, 9, 3))
+
+    self.assertEqual(
+        registry.Registry.lookup("func1(1)")(1, 2), (1, 1, 2))
+    self.assertEqual(
+        registry.Registry.lookup("func1(1)")(arg3=3, arg2=2), (1, 2, 3))
+    self.assertEqual(
+        registry.Registry.lookup("func1(1, 2)")(3), (1, 2, 3))
+    self.assertEqual(
+        registry.Registry.lookup("func1(1, 2)")(arg3=3), (1, 2, 3))
+    self.assertEqual(
+        registry.Registry.lookup("func1(1, arg2=2)")(arg3=3), (1, 2, 3))
+    self.assertEqual(
+        registry.Registry.lookup("func1(1, arg3=2)")(arg2=3), (1, 3, 2))
+    self.assertEqual(
+        registry.Registry.lookup("func1(1, arg3=2)")(3), (1, 3, 2))
+
+    with self.assertRaises(TypeError):
+      registry.Registry.lookup("func1(1, arg2=2)")(3)
+    with self.assertRaises(TypeError):
+      registry.Registry.lookup("func1(1, arg3=3)")(arg3=3)
+    with self.assertRaises(TypeError):
+      registry.Registry.lookup("func1(1, arg3=3)")(arg1=3)
+    with self.assertRaises(SyntaxError):
+      registry.Registry.lookup("func1(arg1=1, 3)")(arg2=3)
+
+
+if __name__ == "__main__":
+  absltest.main()
Tipsomaly/model/big_vision/pp/tokenizer.py ADDED
@@ -0,0 +1,103 @@
+# Copyright 2024 Big Vision Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""The tokenizer API for big_vision, and central registration place."""
+import functools
+import importlib
+from typing import Protocol
+
+from absl import logging
+from big_vision.pp import registry
+import big_vision.utils as u
+import numpy as np
+
+
+class Tokenizer(Protocol):
+  """Just to unify on the API, as we now have many different tokenizers."""
+
+  def to_int(self, text, *, bos=False, eos=False):
+    """Tokenizes `text` into a list of integer tokens.
+
+    Args:
+      text: can be a single string, or a list of strings.
+      bos: Whether a beginning-of-sentence token should be prepended.
+      eos: Whether an end-of-sentence token should be appended.
+
+    Returns:
+      List or list-of-list of tokens.
+    """
+
+  def to_int_tf_op(self, text, *, bos=False, eos=False):
+    """Same as `to_int()`, but as TF ops to be used in pp."""
+
+  def to_str(self, tokens, *, stop_at_eos=True):
+    """Inverse of `to_int()`.
+
+    Args:
+      tokens: list of tokens, or list of lists of tokens.
+      stop_at_eos: remove everything that may come after the first EOS.
+
+    Returns:
+      A string (if `tokens` is a list of tokens), or a list of strings.
+      Note that most tokenizers strip a select few control tokens like
+      eos/bos/pad/unk from the output string.
+    """
+
+  def to_str_tf_op(self, tokens, *, stop_at_eos=True):
+    """Same as `to_str()`, but as TF ops to be used in pp."""
+
+  @property
+  def pad_token(self):
+    """Token id of the padding token."""
+
+  @property
+  def eos_token(self):
+    """Token id of the end-of-sentence token."""
+
+  @property
+  def bos_token(self):
+    """Token id of the beginning-of-sentence token."""
+
+  @property
+  def vocab_size(self):
+    """Returns the size of the vocabulary."""
+
+
+@functools.cache
+def get_tokenizer(name):
+  with u.chrono.log_timing(f"z/secs/tokenizer/{name}"):
+    if not registry.Registry.knows(f"tokenizers.{name}"):
+      raw_name, *_ = registry.parse_name(name)
+      logging.info("Tokenizer %s not registered, "
+                   "trying import big_vision.pp.%s", name, raw_name)
+      importlib.import_module(f"big_vision.pp.{raw_name}")
+
+    return registry.Registry.lookup(f"tokenizers.{name}")()
+
+
+def get_extra_tokens(tokensets):
+  extra_tokens = []
+  for tokenset in tokensets:
+    extra_tokens.extend(registry.Registry.lookup(f"tokensets.{tokenset}")())
+  return list(np.unique(extra_tokens))  # Sorted & deduped; dups make no sense.
+
+
+@registry.Registry.register("tokensets.loc")
+def _get_loc1024(n=1024):
+  return [f"<loc{i:04d}>" for i in range(n)]
+
+
+@registry.Registry.register("tokensets.seg")
+def _get_seg(n=128):
+  return [f"<seg{i:03d}>" for i in range(n)]
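
A short sketch (not part of the commit) of how the tokensets registered above combine; `get_extra_tokens` dedups the concatenated lists (and, via `np.unique`, returns them sorted):

```python
from big_vision.pp import tokenizer as bv_tok

extras = bv_tok.get_extra_tokens(["loc", "seg"])
assert len(extras) == 1024 + 128  # 1024 <locNNNN> plus 128 <segNNN> tokens.
```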