boringKey committed on
Commit b389d26 · verified · 1 Parent(s): 8b8d9f0

Upload 46 files

Files changed (46)
  1. LICENSE +203 -0
  2. README.md +113 -0
  3. __init__.py +0 -0
  4. class_orders/cifar100.yaml +1 -0
  5. class_orders/tinyimagenet.yaml +17 -0
  6. clip/README.md +1 -0
  7. clip/__init__.py +1 -0
  8. clip/adapter.py +75 -0
  9. clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  10. clip/clip.py +310 -0
  11. clip/model.py +486 -0
  12. clip/tokenizer.py +140 -0
  13. configs/class/cifar100_10-10.yaml +33 -0
  14. configs/class/cifar100_2-2.yaml +33 -0
  15. configs/class/cifar100_5-5.yaml +37 -0
  16. configs/class/tinyimagenet_100-10.yaml +36 -0
  17. configs/class/tinyimagenet_100-20.yaml +36 -0
  18. configs/class/tinyimagenet_100-5.yaml +36 -0
  19. continual_clip/__init__.py +0 -0
  20. continual_clip/cc.py +53 -0
  21. continual_clip/clip_original/README.md +1 -0
  22. continual_clip/clip_original/__init__.py +1 -0
  23. continual_clip/clip_original/adapter.py +75 -0
  24. continual_clip/clip_original/bpe_simple_vocab_16e6.txt.gz +3 -0
  25. continual_clip/clip_original/clip.py +208 -0
  26. continual_clip/clip_original/model.py +568 -0
  27. continual_clip/clip_original/tokenizer.py +140 -0
  28. continual_clip/datasets.py +124 -0
  29. continual_clip/dynamic_dataset.py +108 -0
  30. continual_clip/models.py +228 -0
  31. continual_clip/utils.py +210 -0
  32. dataset_reqs/imagenet1000_classes.txt +1000 -0
  33. dataset_reqs/imagenet100_classes.txt +100 -0
  34. dataset_reqs/imagenet100_splits/train_100.txt +0 -0
  35. dataset_reqs/imagenet100_splits/val_100.txt +0 -0
  36. dataset_reqs/tinyimagenet_classes.txt +200 -0
  37. main.py +104 -0
  38. requirements.txt +19 -0
  39. run_cifar100-10-10.sh +9 -0
  40. templates/__init__.py +0 -0
  41. templates/fmow_template.py +20 -0
  42. templates/iwildcam_template.py +4 -0
  43. templates/openai_imagenet_template.py +82 -0
  44. templates/simple_template.py +3 -0
  45. templates/template_utils.py +28 -0
  46. templates/testing_template.py +83 -0
LICENSE ADDED
@@ -0,0 +1,203 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020 - present, Facebook, Inc
+ Copyright 2022 - present, Arthur Douillard
+ Copyright 2023 - present, Zangwei Zheng, Mingyuan Ma
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,113 @@
+ # DMNSP: Dynamic Multi-Layer Null Space Projection for Vision-Language Continual Learning
+
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/)
+ [![PyTorch](https://img.shields.io/badge/PyTorch-1.8+-red.svg)](https://pytorch.org/)
+ [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE)
+
+ Official PyTorch implementation of the paper "Dynamic Multi-Layer Null Space Projection for Vision-Language Continual Learning" (ICCV 2025).
+
+ ## 🎯 Abstract
+
+ Vision-Language Models (VLMs) have emerged as a highly promising approach to Continual Learning (CL) thanks to their powerful generalized features. While adapter-based VLMs can exploit both task-specific and task-agnostic features, current CL methods have largely overlooked the distinct and evolving parameter distributions of the visual and language modalities, which turn out to be crucial for effectively mitigating catastrophic forgetting. In this study, we find that the **visual modality exhibits a broader parameter distribution and greater variance** during class increments than the textual modality, making it more vulnerable to forgetting. Consequently, we handle the branches of the two modalities asymmetrically.
+
+ ### Key Contributions
+
+ - 🔍 **Asymmetric Modality Handling**: We propose handling the visual and language modalities differently, based on their distinct parameter distribution characteristics
+ - 🚀 **Multi-layer Null Space Projection**: A novel strategy, applied only to the visual branch, that restricts parameter updates to specific subspaces (a minimal sketch of the idea follows below)
+ - ⚖️ **Dynamic Projection Coefficient**: Precise control of the gradient projection magnitude for an optimal stability-plasticity balance
+
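A minimal, self-contained sketch of the projection idea, assuming the null space is estimated from the covariance of stored past-task features. The function names, the eigenvalue threshold `eps`, and the blending rule here are illustrative assumptions, not the exact code in `continual_clip/models.py`:

```python
import torch

def null_space_basis(features: torch.Tensor, eps: float = 1e-2) -> torch.Tensor:
    """Basis of the approximate null space of past-task inputs: eigenvectors of
    the uncentered feature covariance whose eigenvalues are close to zero."""
    cov = features.T @ features / features.shape[0]   # (d, d), uncentered covariance
    eigvals, eigvecs = torch.linalg.eigh(cov)         # eigenvalues in ascending order
    return eigvecs[:, eigvals < eps]                  # (d, k) basis matrix U

def project_gradient(grad: torch.Tensor, U: torch.Tensor, coeff: float) -> torch.Tensor:
    """Blend the raw gradient with its null-space component; coeff controls the
    stability (coeff -> 1) vs. plasticity (coeff -> 0) trade-off."""
    g_null = grad @ U @ U.T                           # component lying in span(U)
    return coeff * g_null + (1.0 - coeff) * grad

# toy usage: a 512-dim layer and 1000 stored past-task input features
past_feats = torch.randn(1000, 100) @ torch.randn(100, 512)   # rank-deficient on purpose
U = null_space_basis(past_feats)
grad = torch.randn(64, 512)                           # dL/dW for a weight of shape (64, 512)
grad = project_gradient(grad, U, coeff=0.9)           # update now barely disturbs old tasks
```

Updating along `U U^T` leaves the layer's responses to past-task inputs (approximately) unchanged, which is the stability half of the trade-off; the coefficient re-admits a controlled amount of the raw gradient for plasticity.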
+ ## 🛠️ Installation
+
+ ### Setup Environment
+
+ ```bash
+ # Install dependencies
+ pip install -r requirements.txt
+ ```
+
+ ## 📊 Datasets
+
+ The framework supports the following datasets for class-incremental learning:
+
+ - **CIFAR100**: 100 classes, various incremental settings (2-2, 5-5, 10-10)
+ - **TinyImageNet**: 200 classes, incremental settings (100-5, 100-10, 100-20)
+
+ ### Data Preparation
+
+ 1. The datasets are downloaded automatically when running experiments
+ 2. Update the `dataset_root` path in your configuration files or on the command line
+ 3. Ensure sufficient disk space for dataset storage
+
+ ## 🚀 Quick Start
+
+ ### Basic Usage
+
+ ```bash
+ # Run CIFAR100 with 10 initial classes and 10 incremental classes
+ sh run_cifar100-10-10.sh
+
+ # Or run with custom parameters
+ python main.py \
+     --config-path ./configs/class \
+     --config-name cifar100_10-10.yaml \
+     dataset_root="/path/to/your/data" \
+     class_order="./class_orders/cifar100.yaml"
+ ```
+
+ ### Configuration Options
+
+ The project uses Hydra for configuration management. Key parameters include:
+
+ ```yaml
+ # Model settings
+ model_name: "ViT-B/16"                   # CLIP model variant
+ prompt_template: "a bad photo of a {}."  # Text prompt template
+
+ # Training settings
+ batch_size: 128                          # Training batch size
+ lr: 1e-3                                 # Learning rate
+ weight_decay: 0.0                        # Weight decay
+ ls: 0.0                                  # Label smoothing
+
+ # Incremental learning settings
+ initial_increment: 10                    # Initial number of classes
+ increment: 10                            # Classes per incremental step
+ method: "DMNSP"                          # Method name
+ ```
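The entry point that consumes these keys lives in `main.py`; the Hydra wiring looks roughly like this sketch (the config path and field access are assumptions based on the commands above, not a copy of `main.py`):

```python
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(config_path="configs/class", config_name="cifar100_10-10")
def main(cfg: DictConfig) -> None:
    # cfg fields map 1:1 onto the YAML keys, with CLI overrides already applied
    print(OmegaConf.to_yaml(cfg))
    print(cfg.model_name, cfg.lr, cfg.increment)

if __name__ == "__main__":
    main()
```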
+
+ ## 🔧 Advanced Usage
+
+ ### Custom Datasets
+
+ To add support for new datasets (a hypothetical example follows the list):
+
+ 1. Add the dataset configuration in `continual_clip/datasets.py`
+ 2. Create a corresponding class order file in `class_orders/`
+ 3. Add a configuration YAML in `configs/class/`
+
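A hypothetical illustration of steps 2 and 3 — the file names and the `dataset` key are invented to mirror the existing CIFAR100 files, not taken from this repo:

```yaml
# class_orders/mydataset.yaml  -- step 2: a fixed permutation of class ids
class_order: [3, 17, 42, 0, 99]   # one entry per class, each id exactly once

# configs/class/mydataset_10-10.yaml  -- step 3
dataset: "mydataset"              # assumed key; must match the entry added in datasets.py
initial_increment: 10
increment: 10
class_order: "./class_orders/mydataset.yaml"
```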
+ ## 📄 License
+
+ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
+
+ ## 📚 Citation
+
+ If you find this work useful in your research, please consider citing:
+
+ ```bibtex
+ @inproceedings{Kang2025DMNSP,
+   title={Dynamic Multi-Layer Null Space Projection for Vision-Language Continual Learning},
+   author={Borui Kang and Lei Wang and Zhiping Wu and Tao Feng and Yawen Li and Yang Gao and Wenbin Li},
+   booktitle={ICCV},
+   year={2025}
+ }
+ ```
+
+ ## 📞 Contact
+
+ For questions or issues, please:
+ - Open an issue on GitHub
+ - Contact the authors at kangborui.cn@gmail.com
+
+ ---
+
+ **Note**: This implementation is for research purposes. Please ensure you comply with the respective licenses of the datasets and models used.
__init__.py ADDED
File without changes
class_orders/cifar100.yaml ADDED
@@ -0,0 +1 @@
+ class_order: [87, 0, 52, 58, 44, 91, 68, 97, 51, 15, 94, 92, 10, 72, 49, 78, 61, 14, 8, 86, 84, 96, 18, 24, 32, 45, 88, 11, 4, 67, 69, 66, 77, 47, 79, 93, 29, 50, 57, 83, 17, 81, 41, 12, 37, 59, 25, 20, 80, 73, 1, 28, 6, 46, 62, 82, 53, 9, 31, 75, 38, 63, 33, 74, 27, 22, 36, 3, 16, 21, 60, 19, 70, 90, 89, 43, 5, 42, 65, 76, 40, 30, 23, 85, 2, 95, 56, 48, 71, 64, 98, 13, 99, 7, 34, 55, 54, 26, 35, 39]
class_orders/tinyimagenet.yaml ADDED
@@ -0,0 +1,17 @@
+ class_order: [
+     131, 181, 22, 172, 144, 92, 97, 187, 58, 93, 6, 70, 106, 68,
+     153, 168, 179, 199, 29, 46, 9, 142, 134, 88, 193, 110, 26,
+     32, 117, 112, 17, 39, 166, 13, 94, 138, 109, 147, 51, 101,
+     59, 188, 116, 5, 170, 99, 100, 167, 180, 146, 65, 1, 104,
+     43, 38, 184, 123, 171, 137, 162, 71, 44, 95, 174, 12, 7,
+     54, 152, 21, 47, 28, 176, 34, 2, 132, 118, 42, 189, 150,
+     14, 165, 41, 192, 45, 82, 128, 63, 57, 197, 160, 53, 75,
+     108, 135, 121, 159, 183, 67, 169, 50, 87, 69, 89, 196,
+     115, 19, 148, 96, 86, 11, 8, 60, 33, 173, 78, 4, 119, 105,
+     182, 127, 177, 30, 186, 40, 49, 178, 76, 157, 161, 73, 164,
+     151, 31, 74, 191, 27, 125, 198, 81, 20, 155, 114, 139, 36,
+     61, 56, 145, 48, 16, 83, 62, 85, 126, 0, 102, 23, 3, 140,
+     15, 195, 133, 113, 190, 141, 52, 163, 156, 80, 111, 90, 175,
+     143, 120, 84, 18, 25, 79, 37, 154, 136, 64, 158, 24, 185,
+     72, 35, 129, 55, 149, 91, 122, 77, 103, 124, 130, 66, 10, 107, 194, 98
+ ]
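Class order files pin a fixed permutation of class ids so that incremental splits are reproducible across runs and methods. A sketch of how such a file can be consumed (illustrative; the repo's own loading code may differ):

```python
import yaml

with open("class_orders/tinyimagenet.yaml") as f:
    order = yaml.safe_load(f)["class_order"]

assert sorted(order) == list(range(200))  # a true permutation of the 200 class ids
# classes seen per task, with 100 initial classes and steps of 10:
task0_classes = order[:100]
task1_classes = order[100:110]
```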
clip/README.md ADDED
@@ -0,0 +1 @@
+ This folder is a lightly modified version of https://github.com/openai/CLIP.
clip/__init__.py ADDED
@@ -0,0 +1 @@
+ from .clip import *
clip/adapter.py ADDED
@@ -0,0 +1,75 @@
+ # --------------------------------------------------------
+ # References:
+ # https://github.com/jxhe/unify-parameter-efficient-tuning
+ # --------------------------------------------------------
+
+ import math
+ import torch
+ import torch.nn as nn
+
+
+ class Adapter(nn.Module):
+     def __init__(self,
+                  d_model=None,
+                  bottleneck=None,
+                  dropout=0.0,
+                  init_option="lora",
+                  adapter_scalar="1.0",
+                  adapter_layernorm_option="in"):
+         super().__init__()
+         self.n_embd = d_model
+         self.down_size = bottleneck
+
+         # _before
+         self.adapter_layernorm_option = adapter_layernorm_option
+
+         self.adapter_layer_norm_before = None
+         if adapter_layernorm_option == "in" or adapter_layernorm_option == "out":
+             self.adapter_layer_norm_before = nn.LayerNorm(self.n_embd)
+
+         if adapter_scalar == "learnable_scalar":
+             self.scale = nn.Parameter(torch.ones(1))
+         else:
+             self.scale = float(adapter_scalar)
+
+         # self.linear = nn.Linear(self.n_embd, self.n_embd)
+
+         self.down_proj = nn.Linear(self.n_embd, self.down_size)
+         self.non_linear_func = nn.ReLU()
+         self.up_proj = nn.Linear(self.down_size, self.n_embd)
+
+         self.dropout = dropout
+         if init_option == "bert":
+             raise NotImplementedError
+         elif init_option == "lora":
+             with torch.no_grad():
+                 nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))
+                 nn.init.zeros_(self.up_proj.weight)
+                 nn.init.zeros_(self.down_proj.bias)
+                 nn.init.zeros_(self.up_proj.bias)
+         elif init_option == "linear":
+             with torch.no_grad():
+                 nn.init.zeros_(self.linear.weight)
+
+     def forward(self, x, add_residual=True, residual=None):
+
+         residual = x if residual is None else residual
+         if self.adapter_layernorm_option == 'in':  # none
+             x = self.adapter_layer_norm_before(x)
+
+         down = self.down_proj(x)
+         down = self.non_linear_func(down)
+         down = nn.functional.dropout(down, p=self.dropout, training=self.training)
+         up = self.up_proj(down)
+
+         up = up * self.scale
+
+         if self.adapter_layernorm_option == 'out':  # none
+             up = self.adapter_layer_norm_before(up)
+
+         if add_residual:
+             output = up + residual
+         else:
+             output = up
+         return down, output, \
+             self.up_proj.weight, self.down_proj.weight, self.up_proj.bias, self.down_proj.bias
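A quick shape check for the adapter above (values are illustrative; with the LoRA-style init the zeroed `up_proj` makes the adapter output exactly zero at initialization):

```python
import torch
from clip.adapter import Adapter

adapter = Adapter(d_model=768, bottleneck=64, dropout=0.1,
                  init_option="lora", adapter_scalar="0.1",
                  adapter_layernorm_option="none")

x = torch.randn(2, 197, 768)               # (batch, tokens, d_model), ViT-B/16 sizes
down, out, w_up, w_down, b_up, b_down = adapter(x, add_residual=False)
print(down.shape)                          # torch.Size([2, 197, 64])  bottleneck activations
print(out.shape, out.abs().max().item())   # torch.Size([2, 197, 768]) 0.0 at init
```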
clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+ size 1356917
clip/clip.py ADDED
@@ -0,0 +1,310 @@
+ # Code ported from https://github.com/openai/CLIP
+
+ import hashlib
+ import os
+ import urllib
+ import warnings
+ from typing import Union, List
+
+ import torch
+ from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, RandomResizedCrop, InterpolationMode
+ from tqdm import tqdm
+
+ from clip.model import build_model
+ from clip.tokenizer import SimpleTokenizer as _Tokenizer
+
+ __all__ = ["available_models", "load", "tokenize"]
+ _tokenizer = _Tokenizer()
+
+ _MODELS = {
+     "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
+     "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
+     "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
+     "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
+     "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+     "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+ }
+
+
+ def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
+     os.makedirs(root, exist_ok=True)
+     filename = os.path.basename(url)
+
+     expected_sha256 = url.split("/")[-2]
+     download_target = os.path.join(root, filename)
+
+     if os.path.exists(download_target) and not os.path.isfile(download_target):
+         raise RuntimeError(f"{download_target} exists and is not a regular file")
+
+     if os.path.isfile(download_target):
+         if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+             return download_target
+         else:
+             warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+
+     with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+         with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
+             while True:
+                 buffer = source.read(8192)
+                 if not buffer:
+                     break
+
+                 output.write(buffer)
+                 loop.update(len(buffer))
+
+     if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
+         raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
+
+     return download_target
+
+ def _convert_to_rgb(image):
+     return image.convert('RGB')
+
+ def _transform(n_px: int, is_train: bool):
+     normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+     if is_train:
+         return Compose([
+             RandomResizedCrop(n_px, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC),
+             _convert_to_rgb,
+             ToTensor(),
+             normalize,
+         ])
+     else:
+         return Compose([
+             Resize(n_px, interpolation=InterpolationMode.BICUBIC),
+             CenterCrop(n_px),
+             _convert_to_rgb,
+             ToTensor(),
+             normalize,
+         ])
+
+
+ def available_models() -> List[str]:
+     """Returns the names of available CLIP models"""
+     return list(_MODELS.keys())
+
+ # def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
+ #     """Load a CLIP model
+ #
+ #     Parameters
+ #     ----------
+ #     name : str
+ #         A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
+ #
+ #     device : Union[str, torch.device]
+ #         The device to put the loaded model
+ #
+ #     jit : bool
+ #         Whether to load the optimized JIT model or more hackable non-JIT model (default).
+ #
+ #     download_root: str
+ #         path to download the model files; by default, it uses "~/.cache/clip"
+ #
+ #     Returns
+ #     -------
+ #     model : torch.nn.Module
+ #         The CLIP model
+ #
+ #     preprocess : Callable[[PIL.Image], torch.Tensor]
+ #         A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
+ #     """
+ #     if name in _MODELS:
+ #         model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
+ #     elif os.path.isfile(name):
+ #         model_path = name
+ #     else:
+ #         raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
+ #
+ #     try:
+ #         # loading JIT archive
+ #         model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
+ #         state_dict = None
+ #     except RuntimeError:
+ #         # loading saved state dict
+ #         if jit:
+ #             warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
+ #             jit = False
+ #         state_dict = torch.load(model_path, map_location="cpu")
+ #
+ #     if not jit:
+ #         model = build_model(state_dict or model.state_dict()).to(device)
+ #         if str(device) == "cpu":
+ #             model.float()
+ #         return model, _transform(model.visual.input_resolution)
+ #
+ #     # patch the device names
+ #     device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
+ #     device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
+ #
+ #     def patch_device(module):
+ #         try:
+ #             graphs = [module.graph] if hasattr(module, "graph") else []
+ #         except RuntimeError:
+ #             graphs = []
+ #
+ #         if hasattr(module, "forward1"):
+ #             graphs.append(module.forward1.graph)
+ #
+ #         for graph in graphs:
+ #             for node in graph.findAllNodes("prim::Constant"):
+ #                 if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
+ #                     node.copyAttributes(device_node)
+ #
+ #     model.apply(patch_device)
+ #     patch_device(model.encode_image)
+ #     patch_device(model.encode_text)
+ #
+ #     # patch dtype to float32 on CPU
+ #     if str(device) == "cpu":
+ #         float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
+ #         float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
+ #         float_node = float_input.node()
+ #
+ #         def patch_float(module):
+ #             try:
+ #                 graphs = [module.graph] if hasattr(module, "graph") else []
+ #             except RuntimeError:
+ #                 graphs = []
+ #
+ #             if hasattr(module, "forward1"):
+ #                 graphs.append(module.forward1.graph)
+ #
+ #             for graph in graphs:
+ #                 for node in graph.findAllNodes("aten::to"):
+ #                     inputs = list(node.inputs())
+ #                     for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
+ #                         if inputs[i].node()["value"] == 5:
+ #                             inputs[i].node().copyAttributes(float_node)
+ #
+ #         model.apply(patch_float)
+ #         patch_float(model.encode_image)
+ #         patch_float(model.encode_text)
+ #
+ #         model.float()
+ #
+ #     return model, _transform(model.input_resolution.item())
+
+
+ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True, is_train=False, pretrained=True):
+     """Load a CLIP model
+     Parameters
+     ----------
+     name : str
+         A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
+     device : Union[str, torch.device]
+         The device to put the loaded model
+     jit : bool
+         Whether to load the optimized JIT model (default) or more hackable non-JIT model.
+     Returns
+     -------
+     model : torch.nn.Module
+         The CLIP model
+     preprocess : Callable[[PIL.Image], torch.Tensor]
+         A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
+     """
+     if name in _MODELS:
+         model_path = _download(_MODELS[name])
+     elif os.path.isfile(name):
+         model_path = name
+     else:
+         raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
+
+     try:
+         # loading JIT archive
+         model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
+         state_dict = None
+     except RuntimeError:
+         # loading saved state dict
+         if jit:
+             warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
+             jit = False
+         state_dict = torch.load(model_path, map_location="cpu")
+
+     if not jit:
+         try:
+             model = build_model(state_dict or model.state_dict()).to(device)
+         except KeyError:
+             sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
+             model = build_model(sd).to(device)
+
+         if str(device) == "cpu":
+             model.float()
+         return model, \
+             _transform(model.visual.input_resolution, is_train=True), \
+             _transform(model.visual.input_resolution, is_train=False)
+
+     # patch the device names
+     device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
+     device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
+
+     def patch_device(module):
+         graphs = [module.graph] if hasattr(module, "graph") else []
+         if hasattr(module, "forward1"):
+             graphs.append(module.forward1.graph)
+
+         for graph in graphs:
+             for node in graph.findAllNodes("prim::Constant"):
+                 if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
+                     node.copyAttributes(device_node)
+
+     model.apply(patch_device)
+     patch_device(model.encode_image)
+     patch_device(model.encode_text)
+
+     # patch dtype to float32 on CPU
+     if str(device) == "cpu":
+         float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
+         float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
+         float_node = float_input.node()
+
+         def patch_float(module):
+             graphs = [module.graph] if hasattr(module, "graph") else []
+             if hasattr(module, "forward1"):
+                 graphs.append(module.forward1.graph)
+
+             for graph in graphs:
+                 for node in graph.findAllNodes("aten::to"):
+                     inputs = list(node.inputs())
+                     for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
+                         if inputs[i].node()["value"] == 5:
+                             inputs[i].node().copyAttributes(float_node)
+
+         model.apply(patch_float)
+         patch_float(model.encode_image)
+         patch_float(model.encode_text)
+
+         model.float()
+
+     return model, \
+         _transform(model.input_resolution.item(), is_train=True), \
+         _transform(model.input_resolution.item(), is_train=False)
+
+
+ def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
+     """
+     Returns the tokenized representation of given input string(s)
+     Parameters
+     ----------
+     texts : Union[str, List[str]]
+         An input string or a list of input strings to tokenize
+     context_length : int
+         The context length to use; all CLIP models use 77 as the context length
+     Returns
+     -------
+     A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
+     """
+     if isinstance(texts, str):
+         texts = [texts]
+
+     sot_token = _tokenizer.encoder["<start_of_text>"]
+     eot_token = _tokenizer.encoder["<end_of_text>"]
+     all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
+     result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+
+     for i, tokens in enumerate(all_tokens):
+         if len(tokens) > context_length:  # Truncate
+             tokens = tokens[:context_length]
+         result[i, :len(tokens)] = torch.tensor(tokens)
+
+     return result
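A typical call against the interface above, with the repo root on `PYTHONPATH` (note that this fork of `load` returns a train and an eval transform, and that `encode_text` in this fork returns a tuple; weights download to `~/.cache/clip` on first use):

```python
import torch
import clip

model, preprocess_train, preprocess_val = clip.load("ViT-B/16", device="cpu", jit=False)

tokens = clip.tokenize(["a photo of a dog", "a photo of a cat"])
print(tokens.shape)         # torch.Size([2, 77])

with torch.no_grad():
    text_features, _ = model.encode_text(tokens)   # (features, pre-projection states)
print(text_features.shape)  # torch.Size([2, 512]) for ViT-B/16
```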
clip/model.py ADDED
@@ -0,0 +1,486 @@
+ from collections import OrderedDict
+ from typing import Tuple, Union
+
+ import os
+ import json
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from .adapter import Adapter
+ from torch.distributions.normal import Normal
+ from collections import Counter
+
+ global_taskid = 0
+ global_is_train = True
+
+ class Bottleneck(nn.Module):
+     expansion = 4
+
+     def __init__(self, inplanes, planes, stride=1):
+         super().__init__()
+
+         # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+         self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+         self.bn1 = nn.BatchNorm2d(planes)
+
+         self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(planes)
+
+         self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+
+         self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = None
+         self.stride = stride
+
+         if stride > 1 or inplanes != planes * Bottleneck.expansion:
+             # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+             self.downsample = nn.Sequential(OrderedDict([
+                 ("-1", nn.AvgPool2d(stride)),
+                 ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
+                 ("1", nn.BatchNorm2d(planes * self.expansion))
+             ]))
+
+     def forward(self, x: torch.Tensor):
+         identity = x
+
+         out = self.relu(self.bn1(self.conv1(x)))
+         out = self.relu(self.bn2(self.conv2(out)))
+         out = self.avgpool(out)
+         out = self.bn3(self.conv3(out))
+
+         if self.downsample is not None:
+             identity = self.downsample(x)
+
+         out += identity
+         out = self.relu(out)
+         return out
+
+
+ class AttentionPool2d(nn.Module):
+     def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+         super().__init__()
+         self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+         self.k_proj = nn.Linear(embed_dim, embed_dim)
+         self.q_proj = nn.Linear(embed_dim, embed_dim)
+         self.v_proj = nn.Linear(embed_dim, embed_dim)
+         self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+         self.num_heads = num_heads
+
+     def forward(self, x):
+         x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1)  # NCHW -> (HW)NC
+         x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+         x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+         x, _ = F.multi_head_attention_forward(
+             query=x, key=x, value=x,
+             embed_dim_to_check=x.shape[-1],
+             num_heads=self.num_heads,
+             q_proj_weight=self.q_proj.weight,
+             k_proj_weight=self.k_proj.weight,
+             v_proj_weight=self.v_proj.weight,
+             in_proj_weight=None,
+             in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+             bias_k=None,
+             bias_v=None,
+             add_zero_attn=False,
+             dropout_p=0,
+             out_proj_weight=self.c_proj.weight,
+             out_proj_bias=self.c_proj.bias,
+             use_separate_proj_weight=True,
+             training=self.training,
+             need_weights=False
+         )
+
+         return x[0]
+
+
+ class ModifiedResNet(nn.Module):
+     """
+     A ResNet class that is similar to torchvision's but contains the following changes:
+     - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+     - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+     - The final pooling layer is a QKV attention instead of an average pool
+     """
+
+     def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
+         super().__init__()
+         self.output_dim = output_dim
+         self.input_resolution = input_resolution
+
+         # the 3-layer stem
+         self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(width // 2)
+         self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(width // 2)
+         self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
+         self.bn3 = nn.BatchNorm2d(width)
+         self.avgpool = nn.AvgPool2d(2)
+         self.relu = nn.ReLU(inplace=True)
+
+         # residual layers
+         self._inplanes = width  # this is a *mutable* variable used during construction
+         self.layer1 = self._make_layer(width, layers[0])
+         self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+         self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+         self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+         embed_dim = width * 32  # the ResNet feature dimension
+         self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
+
+     def _make_layer(self, planes, blocks, stride=1):
+         layers = [Bottleneck(self._inplanes, planes, stride)]
+
+         self._inplanes = planes * Bottleneck.expansion
+         for _ in range(1, blocks):
+             layers.append(Bottleneck(self._inplanes, planes))
+
+         return nn.Sequential(*layers)
+
+     def forward(self, x):
+         def stem(x):
+             for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
+                 x = self.relu(bn(conv(x)))
+             x = self.avgpool(x)
+             return x
+
+         x = x.type(self.conv1.weight.dtype)
+         x = stem(x)
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+         x = self.attnpool(x)
+
+         return x
+
+
+ class LayerNorm(nn.LayerNorm):
+     """Subclass torch's LayerNorm to handle fp16."""
+
+     def forward(self, x: torch.Tensor):
+         orig_type = x.dtype
+         ret = super().forward(x.type(torch.float32))
+         return ret.type(orig_type)
+
+
+ class QuickGELU(nn.Module):
+     def forward(self, x: torch.Tensor):
+         return x * torch.sigmoid(1.702 * x)
+
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, text_or_image=None, flag=False):
+         super().__init__()
+         self.register_buffer("mean", torch.tensor([0.0]))
+         self.register_buffer("std", torch.tensor([1.0]))
+         self.attn = nn.MultiheadAttention(d_model, n_head)
+         self.ln_1 = LayerNorm(d_model)
+         self.mlp = nn.Sequential(OrderedDict([
+             ("c_fc", nn.Linear(d_model, d_model * 4)),
+             ("gelu", QuickGELU()),
+             ("c_proj", nn.Linear(d_model * 4, d_model))
+         ]))
+         self.ln_2 = LayerNorm(d_model)
+         self.attn_mask = attn_mask
+         self.is_train = global_is_train
+         self.ffn_num = 64
+         self.softmax = nn.Softmax(1)
+         self.softplus = nn.Softplus()
+         self.noisy_gating = True
+         self.adaptmlp_list = nn.ModuleList()
+         self.text_or_image = text_or_image
+         self.flag = flag
+         self.adaptmlp = Adapter(d_model=d_model, dropout=0.1, bottleneck=self.ffn_num,
+                                 init_option='lora',
+                                 adapter_scalar=0.1,
+                                 adapter_layernorm_option='none',
+                                 )
+
+     def attention(self, x: torch.Tensor):
+         self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+         return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+     def forward(self, x: torch.Tensor):
+         x = x + self.attention(self.ln_1(x))
+         x_re = x.permute(1, 0, 2)
+         down, adapt_x, up_proj_weight, down_proj_weight, up_proj_bias, down_proj_bias = self.adaptmlp(x_re, add_residual=False)
+         adapt_x = adapt_x.permute(1, 0, 2)
+         down = down.permute(1, 0, 2)
+         x = x + self.mlp(self.ln_2(x)) + adapt_x
+
+         return x, down, adapt_x, up_proj_weight, down_proj_weight, up_proj_bias, down_proj_bias
+
+
+ class Transformer(nn.Module):
+     def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, text_or_image=None,
+                  flag=True):
+         super().__init__()
+         self.width = width
+         self.layers = layers
+         self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, text_or_image, flag) for _ in range(layers)])
+         self.lora_feature = {}
+
+     def forward(self, x: torch.Tensor):
+         for i in range(len(self.resblocks)):
+             x, down, adapt_x, up_proj_weight, down_proj_weight, up_proj_bias, down_proj_bias = self.resblocks[i](x)
+             self.lora_feature[i] = adapt_x
+         return x
+
+
+ class VisualTransformer(nn.Module):
+     def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int, text_or_image=None):
+         super().__init__()
+         self.input_resolution = input_resolution
+         self.output_dim = output_dim
+         # Added so this info is available. Should not change anything.
+         self.patch_size = patch_size
+         self.width = width
+         self.layers = layers
+         self.heads = heads
+
+         self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
+
+         scale = width ** -0.5
+         self.class_embedding = nn.Parameter(scale * torch.randn(width))
+         self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
+         self.ln_pre = LayerNorm(width)
+
+         self.transformer = Transformer(width, layers, heads, text_or_image=text_or_image, flag=True)
+
+         self.ln_post = LayerNorm(width)
+         self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+     def forward(self, x: torch.Tensor):
+         x = self.conv1(x)
+         x = x.reshape(x.shape[0], x.shape[1], -1)
+         x = x.permute(0, 2, 1)
+         x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+         x = x + self.positional_embedding.to(x.dtype)
+         x = self.ln_pre(x)
+
+         x = x.permute(1, 0, 2)  # NLD -> LND
+         x = self.transformer(x)
+         x_before_fusion = x
+         x = x.permute(1, 0, 2)  # LND -> NLD
+
+         x = self.ln_post(x[:, 0, :])
+
+         if self.proj is not None:
+             x = x @ self.proj
+
+         return x, x_before_fusion
+
+
+ class CLIP(nn.Module):
+     def __init__(self,
+                  embed_dim: int,
+                  # vision
+                  image_resolution: int,
+                  vision_layers: Union[Tuple[int, int, int, int], int],
+                  vision_width: int,
+                  vision_patch_size: int,
+                  # text
+                  context_length: int,
+                  vocab_size: int,
+                  transformer_width: int,
+                  transformer_heads: int,
+                  transformer_layers: int,
+                  baseline=False
+                  ):
+         super().__init__()
+         self.baseline = baseline
+
+         self.context_length = context_length
+
+         if isinstance(vision_layers, (tuple, list)):
+             vision_heads = vision_width * 32 // 64
+             self.visual = ModifiedResNet(
+                 layers=vision_layers,
+                 output_dim=embed_dim,
+                 heads=vision_heads,
+                 input_resolution=image_resolution,
+                 width=vision_width
+             )
+         else:
+             vision_heads = vision_width // 64
+             self.visual = VisualTransformer(
+                 input_resolution=image_resolution,
+                 patch_size=vision_patch_size,
+                 width=vision_width,
+                 layers=vision_layers,
+                 heads=vision_heads,
+                 output_dim=embed_dim,
+                 text_or_image='image',
+             )
+
+         self.transformer = Transformer(
+             width=transformer_width,
+             layers=transformer_layers,
+             heads=transformer_heads,
+             attn_mask=self.build_attention_mask(),
+             text_or_image='text',
+             flag=True,
+         )
+
+         self.vocab_size = vocab_size
+         self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+         self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
+         self.ln_final = LayerNorm(transformer_width)
+
+         self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
+         self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+         # self.adapt_lamda = [torch.nn.Parameter(30 * torch.rand(1)) for _ in range(12)]
+
+         self.initialize_parameters()
+
+     def initialize_parameters(self):
+         nn.init.normal_(self.token_embedding.weight, std=0.02)
+         nn.init.normal_(self.positional_embedding, std=0.01)
+         self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+         if isinstance(self.visual, ModifiedResNet):
+             if self.visual.attnpool is not None:
+                 std = self.visual.attnpool.c_proj.in_features ** -0.5
+                 nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                 nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                 nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                 nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+
+             for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                 for name, param in resnet_block.named_parameters():
+                     if name.endswith("bn3.weight"):
+                         nn.init.zeros_(param)
+
+         proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
+         attn_std = self.transformer.width ** -0.5
+         fc_std = (2 * self.transformer.width) ** -0.5
+         for block in self.transformer.resblocks:
+             nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+             nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+             nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+             nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+         if self.text_projection is not None:
+             nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+
+     def build_attention_mask(self):
+         # lazily create causal attention mask, with full attention between the vision tokens
+         # pytorch uses additive attention mask; fill with -inf
+         mask = torch.empty(self.context_length, self.context_length)
+         mask.fill_(float("-inf"))
+         mask.triu_(1)  # zero out the lower diagonal
+         return mask
+
+     @property
+     def dtype(self):
+         return self.visual.conv1.weight.dtype
+
+     def encode_image(self, image):
+         return self.visual(image.type(self.dtype))
+
+     def encode_text(self, text):
+
+         x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+
+         x = x + self.positional_embedding.type(self.dtype)
+         x = x.permute(1, 0, 2)  # NLD -> LND
+         x = self.transformer(x)
+         x_before_fusion = x
+         x = x.permute(1, 0, 2)  # LND -> NLD
+         x = self.ln_final(x).type(self.dtype)
+
+         # take features from the eot embedding (eot_token is the highest number in each sequence)
+         x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+
+         return x, x_before_fusion
+
+     def forward(self, image, text, taskid, is_train):
+         global global_taskid, global_is_train
+         global_taskid = taskid
+         global_is_train = is_train
+         if image is None:
+             return self.encode_text(text)
+         elif text is None:
+             return self.encode_image(image)
+         image_features, x_img_before_fusion = self.encode_image(image)
+         text_features, x_txt_before_fusion = self.encode_text(text)
+         image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+         text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+
+         # if self.baseline:
+         logit_scale = self.logit_scale.exp()
+         logits_per_image = logit_scale * image_features @ text_features.t()
+         logits_per_text = logits_per_image.t()
+         return logits_per_image, logits_per_text
+
+
+ def convert_weights(model: nn.Module):
+     """Convert applicable model parameters to fp16"""
+
+     def _convert_weights_to_fp16(l):
+         if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+             l.weight.data = l.weight.data.half()
+             if l.bias is not None:
+                 l.bias.data = l.bias.data.half()
+
+         if isinstance(l, nn.MultiheadAttention):
+             for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+                 tensor = getattr(l, attr)
+                 if tensor is not None:
+                     tensor.data = tensor.data.half()
+
+         for name in ["text_projection", "proj"]:
+             if hasattr(l, name):
+                 attr = getattr(l, name)
+                 if attr is not None:
+                     attr.data = attr.data.half()
+
+     model.apply(_convert_weights_to_fp16)
+
+
+ def build_model(state_dict: dict):
+     vit = "visual.proj" in state_dict
+
+     if vit:
+         vision_width = state_dict["visual.conv1.weight"].shape[0]
+         vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
+         vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+         grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
+         image_resolution = vision_patch_size * grid_size
+     else:
+         counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
+         vision_layers = tuple(counts)
+         vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+         output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
+         vision_patch_size = None
+         assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
+         image_resolution = output_width * 32
+
+     embed_dim = state_dict["text_projection"].shape[1]
+     context_length = state_dict["positional_embedding"].shape[0]
+     vocab_size = state_dict["token_embedding.weight"].shape[0]
+     transformer_width = state_dict["ln_final.weight"].shape[0]
+     transformer_heads = transformer_width // 64
+     transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
+
+     model = CLIP(
+         embed_dim,
+         image_resolution, vision_layers, vision_width, vision_patch_size,
+         context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
+     )
+
+     for key in ["input_resolution", "context_length", "vocab_size"]:
+         if key in state_dict:
+             del state_dict[key]
+
+     model.load_state_dict(state_dict, strict=False)
+     for p in model.parameters():
+         p.data = p.data.float()
+     return model.eval()
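The modified `CLIP.forward` threads `taskid` and `is_train` through the module-level globals above and returns image-text logits; passing `None` for one modality returns that branch's output instead. A minimal call, with shapes assuming ViT-B/16 and a batch of 4 (the random image tensor is a stand-in for preprocessed batches):

```python
import torch
import clip

model, _, preprocess_val = clip.load("ViT-B/16", device="cpu", jit=False)

images = torch.randn(4, 3, 224, 224)             # stand-in for preprocess_val(...) batches
texts = clip.tokenize(["a photo of a dog"] * 4)

with torch.no_grad():
    logits_per_image, logits_per_text = model(images, texts, taskid=0, is_train=False)
print(logits_per_image.shape)                    # torch.Size([4, 4])

with torch.no_grad():
    text_feats, _ = model(None, texts, taskid=0, is_train=False)   # text branch only
```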
clip/tokenizer.py ADDED
@@ -0,0 +1,140 @@
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
+ @lru_cache()
11
+ def default_bpe():
12
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
+ @lru_cache()
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a significant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ This also avoids mapping to whitespace/control characters that the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+
38
+ def get_pairs(word):
39
+ """Return set of symbol pairs in a word.
40
+ Word is represented as tuple of symbols (symbols being variable-length strings).
41
+ """
42
+ pairs = set()
43
+ prev_char = word[0]
44
+ for char in word[1:]:
45
+ pairs.add((prev_char, char))
46
+ prev_char = char
47
+ return pairs
48
+
49
+
50
+ def basic_clean(text):
51
+ text = ftfy.fix_text(text)
52
+ text = html.unescape(html.unescape(text))
53
+ return text.strip()
54
+
55
+
56
+ def whitespace_clean(text):
57
+ text = re.sub(r'\s+', ' ', text)
58
+ text = text.strip()
59
+ return text
60
+
61
+
62
+ class SimpleTokenizer(object):
63
+ def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
64
+ self.byte_encoder = bytes_to_unicode()
65
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
+ merges = merges[1:49152-256-2+1]
68
+ merges = [tuple(merge.split()) for merge in merges]
69
+ vocab = list(bytes_to_unicode().values())
70
+ vocab = vocab + [v+'</w>' for v in vocab]
71
+ for merge in merges:
72
+ vocab.append(''.join(merge))
73
+ if not special_tokens:
74
+ special_tokens = ['<start_of_text>', '<end_of_text>']
75
+ else:
76
+ special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
77
+ vocab.extend(special_tokens)
78
+ self.encoder = dict(zip(vocab, range(len(vocab))))
79
+ self.decoder = {v: k for k, v in self.encoder.items()}
80
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
81
+ self.cache = {t:t for t in special_tokens}
82
+ special = "|".join(special_tokens)
83
+ self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
84
+
85
+ self.vocab_size = len(self.encoder)
86
+ self.all_special_ids = [self.encoder[t] for t in special_tokens]
87
+
88
+ def bpe(self, token):
89
+ if token in self.cache:
90
+ return self.cache[token]
91
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
92
+ pairs = get_pairs(word)
93
+
94
+ if not pairs:
95
+ return token+'</w>'
96
+
97
+ while True:
98
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
99
+ if bigram not in self.bpe_ranks:
100
+ break
101
+ first, second = bigram
102
+ new_word = []
103
+ i = 0
104
+ while i < len(word):
105
+ try:
106
+ j = word.index(first, i)
107
+ new_word.extend(word[i:j])
108
+ i = j
109
+ except ValueError:  # `first` does not occur after position i
110
+ new_word.extend(word[i:])
111
+ break
112
+
113
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
114
+ new_word.append(first+second)
115
+ i += 2
116
+ else:
117
+ new_word.append(word[i])
118
+ i += 1
119
+ new_word = tuple(new_word)
120
+ word = new_word
121
+ if len(word) == 1:
122
+ break
123
+ else:
124
+ pairs = get_pairs(word)
125
+ word = ' '.join(word)
126
+ self.cache[token] = word
127
+ return word
128
+
129
+ def encode(self, text):
130
+ bpe_tokens = []
131
+ text = whitespace_clean(basic_clean(text)).lower()
132
+ for token in re.findall(self.pat, text):
133
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
134
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
135
+ return bpe_tokens
136
+
137
+ def decode(self, tokens):
138
+ text = ''.join([self.decoder[token] for token in tokens])
139
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
140
+ return text
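Note: a quick round-trip sketch for SimpleTokenizer above. encode lowercases and BPE-splits the input; decode re-joins byte-level tokens, so output whitespace is normalized rather than preserved:

    from clip.tokenizer import SimpleTokenizer

    tok = SimpleTokenizer()  # loads bpe_simple_vocab_16e6.txt.gz from this folder
    ids = tok.encode("A photo of a dog.")  # plain BPE ids, no <start_of_text>/<end_of_text>
    print(tok.decode(ids))                 # roughly "a photo of a dog . "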
configs/class/cifar100_10-10.yaml ADDED
@@ -0,0 +1,33 @@
1
+ hydra:
2
+ run:
3
+ dir: ./experiments/${scenario}/${dataset}_${initial_increment}-${increment}-${method}
4
+ job:
5
+ chdir: true
6
+
7
+ job_logging:
8
+ version: 1
9
+ formatters:
10
+ simple:
11
+ format: '%(message)s'
12
+
13
+ class_order: ""
14
+ dataset_root: ""
15
+ workdir: ""
16
+ log_path: "metrics.json"
17
+ model_name: "ViT-B/16"
18
+ prompt_template: "a bad photo of a {}."
19
+
20
+ batch_size: 128
21
+ increment: ${initial_increment}
22
+ initial_increment: 10
23
+ scenario: "class"
24
+ dataset: "cifar100"
25
+
26
+ weight_decay: 0.0
27
+ l2: 0
28
+ ce_method: 0
29
+
30
+ method: "DMNSP"
31
+
32
+ lr: 1e-3
33
+ ls: 0.0
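Note: these Hydra configs are composed at launch; a sketch of loading one programmatically, assuming Hydra >= 1.2 and values such as dataset_root supplied as overrides:

    from hydra import compose, initialize

    with initialize(config_path="configs/class", version_base=None):
        cfg = compose(config_name="cifar100_10-10",
                      overrides=["dataset_root=/data", "workdir=."])
    print(cfg.increment)  # ${initial_increment} resolves to 10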
configs/class/cifar100_2-2.yaml ADDED
@@ -0,0 +1,33 @@
1
+ hydra:
2
+ run:
3
+ dir: ./experiments/${scenario}/${dataset}_${initial_increment}-${increment}-${method}-${ls}
4
+ job:
5
+ chdir: true
6
+
7
+ job_logging:
8
+ version: 1
9
+ formatters:
10
+ simple:
11
+ format: '%(message)s'
12
+
13
+ class_order: ""
14
+ dataset_root: ""
15
+ workdir: ""
16
+ log_path: "metrics.json"
17
+ model_name: "ViT-B/16"
18
+ prompt_template: "a bad photo of a {}."
19
+
20
+ batch_size: 128
21
+ increment: ${initial_increment}
22
+ initial_increment: 2
23
+ scenario: "class"
24
+ dataset: "cifar100"
25
+
26
+ weight_decay: 0.0
27
+ l2: 0
28
+ ce_method: 0
29
+
30
+ method: "DMNSP"
31
+ lr: 1e-3
32
+ ls: 0.0
33
+
configs/class/cifar100_5-5.yaml ADDED
@@ -0,0 +1,37 @@
1
+ hydra:
2
+ run:
3
+ dir: ./experiments/${scenario}/${dataset}_${initial_increment}-${increment}-${method}
4
+ job:
5
+ chdir: true
6
+
7
+ job_logging:
8
+ version: 1
9
+ formatters:
10
+ simple:
11
+ format: '%(message)s'
12
+
13
+ class_order: ""
14
+ dataset_root: ""
15
+ workdir: ""
16
+ log_path: "metrics.json"
17
+ model_name: "ViT-B/16"
18
+ prompt_template: "a bad photo of a {}."
19
+
20
+ batch_size: 128
21
+ increment: ${initial_increment}
22
+ initial_increment: 5
23
+ scenario: "class"
24
+ dataset: "cifar100"
25
+
26
+
27
+
28
+ weight_decay: 0.0
29
+ l2: 0
30
+ ce_method: 0
31
+
32
+ method: "DMNSP"
33
+ lr: 1e-3
34
+ ls: 0.0
35
+
36
+
37
+
configs/class/tinyimagenet_100-10.yaml ADDED
@@ -0,0 +1,36 @@
1
+ hydra:
2
+ run:
3
+ dir: ./experiments/${scenario}/${dataset}_${initial_increment}-${increment}-${method}
4
+ job:
5
+ chdir: true
6
+
7
+ job_logging:
8
+ version: 1
9
+ formatters:
10
+ simple:
11
+ format: '%(message)s'
12
+
13
+ class_order: ""
14
+ dataset_root: ""
15
+ workdir: ""
16
+ log_path: "metrics.json"
17
+ model_name: "ViT-B/16"
18
+ prompt_template: "a bad photo of a {}."
19
+
20
+ batch_size: 128
21
+ initial_increment: 100
22
+ increment: 10
23
+ scenario: "class"
24
+ dataset: "tinyimagenet"
25
+
26
+ weight_decay: 0.0
27
+ l2: 0
28
+ ce_method: 0
29
+
30
+ method: "DMNSP"
31
+ lr: 1e-3
32
+ ls: 0.0
33
+ we:
34
+ avg_freq:
35
+ ref_dataset:
36
+ ref_sentences: random
configs/class/tinyimagenet_100-20.yaml ADDED
@@ -0,0 +1,36 @@
1
+ hydra:
2
+ run:
3
+ dir: ./experiments/${scenario}/${dataset}_${initial_increment}-${increment}-${method}
4
+ job:
5
+ chdir: true
6
+
7
+ job_logging:
8
+ version: 1
9
+ formatters:
10
+ simple:
11
+ format: '%(message)s'
12
+
13
+ class_order: ""
14
+ dataset_root: ""
15
+ workdir: ""
16
+ log_path: "metrics.json"
17
+ model_name: "ViT-B/16"
18
+ prompt_template: "a bad photo of a {}."
19
+
20
+ batch_size: 128
21
+ initial_increment: 100
22
+ increment: 20
23
+ scenario: "class"
24
+ dataset: "tinyimagenet"
25
+
26
+ weight_decay: 0.0
27
+ l2: 0
28
+ ce_method: 0
29
+
30
+ method: "DMNSP"
31
+ lr: 1e-3
32
+ ls: 0.0
33
+ we:
34
+ avg_freq:
35
+ ref_dataset:
36
+ ref_sentences: random
configs/class/tinyimagenet_100-5.yaml ADDED
@@ -0,0 +1,36 @@
1
+ hydra:
2
+ run:
3
+ dir: ./experiments/${scenario}/${dataset}_${initial_increment}-${increment}-${method}
4
+ job:
5
+ chdir: true
6
+
7
+ job_logging:
8
+ version: 1
9
+ formatters:
10
+ simple:
11
+ format: '%(message)s'
12
+
13
+ class_order: ""
14
+ dataset_root: ""
15
+ workdir: ""
16
+ log_path: "metrics.json"
17
+ model_name: "ViT-B/16"
18
+ prompt_template: "a bad photo of a {}."
19
+
20
+ batch_size: 128
21
+ initial_increment: 100
22
+ increment: 5
23
+ scenario: "class"
24
+ dataset: "tinyimagenet"
25
+
26
+ weight_decay: 0.0
27
+ l2: 0
28
+ ce_method: 0
29
+
30
+ method: "DMNSP"
31
+ lr: 1e-3
32
+ ls: 0.0
33
+ we:
34
+ avg_freq:
35
+ ref_dataset:
36
+ ref_sentences: random
continual_clip/__init__.py ADDED
File without changes
continual_clip/cc.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import torch
5
+ from torch.utils.data import (
6
+ DataLoader,
7
+ Dataset,
8
+ IterableDataset,
9
+ SubsetRandomSampler,
10
+ get_worker_info,
11
+ )
12
+ import clip.clip as clip
13
+
14
+
15
+ class CsvDataset(Dataset):
16
+ def __init__(self, input_filename, transforms, img_key, caption_key, sep="\t"):
17
+ df = pd.read_csv(input_filename, sep=sep)
18
+
19
+ self.location = os.path.dirname(input_filename)
20
+ self.images = df[img_key].tolist()
21
+ self.captions = df[caption_key].tolist()
22
+ self.transforms = transforms
23
+
24
+ def __len__(self):
25
+ return len(self.captions)
26
+
27
+ def __getitem__(self, idx):
28
+ image_path = os.path.join(self.location, str(self.images[idx]))
29
+ images = self.transforms(Image.open(image_path))
30
+ texts = clip.tokenize([str(self.captions[idx])])[0]
31
+ return images, texts
32
+
33
+
34
+ class conceptual_captions(Dataset):
35
+ def __init__(
36
+ self, transforms, location, batch_size, *args, num_workers=16, **kwargs
37
+ ):
38
+ file_name = "Validation_GCC-1.1.0-Validation_output.csv"
39
+ file_path = os.path.join(location, file_name)
40
+ self.template = lambda c: f"a photo of a {c}."
41
+ self.train_dataset = CsvDataset(
42
+ input_filename=file_path,
43
+ transforms=transforms,
44
+ img_key="filepath",
45
+ caption_key="title",
46
+ )
47
+
48
+ self.train_loader = torch.utils.data.DataLoader(
49
+ self.train_dataset,
50
+ batch_size=batch_size,
51
+ shuffle=True,
52
+ num_workers=num_workers,
53
+ )
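Note: a sketch of wiring conceptual_captions to CLIP preprocessing; the CSV file name is fixed above, so `location` only needs to contain it (the /data/cc path is hypothetical):

    import clip.clip as clip
    from continual_clip.cc import conceptual_captions

    _, _, preprocess = clip.load("ViT-B/16", jit=False)  # (model, train_tf, test_tf)
    cc = conceptual_captions(preprocess, location="/data/cc", batch_size=128, num_workers=4)
    images, texts = next(iter(cc.train_loader))          # (B, 3, 224, 224), (B, 77)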
continual_clip/clip_original/README.md ADDED
@@ -0,0 +1 @@
1
+ This folder is a lightly modified version of https://github.com/openai/CLIP.
continual_clip/clip_original/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .clip import *
continual_clip/clip_original/adapter.py ADDED
@@ -0,0 +1,75 @@
1
+ # --------------------------------------------------------
2
+ # References:
3
+ # https://github.com/jxhe/unify-parameter-efficient-tuning
4
+ # --------------------------------------------------------
5
+
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+
11
+ class Adapter(nn.Module):
12
+ def __init__(self,
13
+ d_model=None,
14
+ bottleneck=None,
15
+ dropout=0.0,
16
+ init_option="lora",
17
+ adapter_scalar="1.0",
18
+ adapter_layernorm_option="in"):
19
+ super().__init__()
20
+ self.n_embd = d_model
21
+ self.down_size = bottleneck
22
+
23
+ # apply layer norm before ("in") or after ("out") the bottleneck
24
+ self.adapter_layernorm_option = adapter_layernorm_option
25
+
26
+ self.adapter_layer_norm_before = None
27
+ if adapter_layernorm_option == "in" or adapter_layernorm_option == "out":
28
+ self.adapter_layer_norm_before = nn.LayerNorm(self.n_embd)
29
+
30
+ if adapter_scalar == "learnable_scalar":
31
+ self.scale = nn.Parameter(torch.ones(1))
32
+ else:
33
+ self.scale = float(adapter_scalar)
34
+
35
+ self.linear = nn.Linear(self.n_embd, self.n_embd)
36
+
37
+ self.down_proj = nn.Linear(self.n_embd, self.down_size)  # bottleneck width must match up_proj's input
38
+ self.non_linear_func = nn.ReLU()
39
+ self.up_proj = nn.Linear(self.down_size, self.n_embd)
40
+
41
+ self.dropout = dropout
42
+ if init_option == "bert":
43
+ raise NotImplementedError
44
+ elif init_option == "lora":
45
+ with torch.no_grad():
46
+ nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))
47
+ nn.init.zeros_(self.up_proj.weight)
48
+ nn.init.zeros_(self.down_proj.bias)
49
+ nn.init.zeros_(self.up_proj.bias)
50
+ elif init_option == "linear":
51
+ with torch.no_grad():
52
+ nn.init.zeros_(self.linear.weight)
53
+
54
+ def forward(self, x, add_residual=True, residual=None):
55
+
56
+ residual = x if residual is None else residual
57
+ if self.adapter_layernorm_option == 'in': # none
58
+ x = self.adapter_layer_norm_before(x)
59
+
60
+ down = self.down_proj(x)
61
+ down = self.non_linear_func(down)
62
+ down = nn.functional.dropout(down, p=self.dropout, training=self.training)
63
+ up = self.up_proj(down)
64
+
65
+
66
+ up = up * self.scale
67
+
68
+ if self.adapter_layernorm_option == 'out': # none
69
+ up = self.adapter_layer_norm_before(up)
70
+
71
+ if add_residual:
72
+ output = up + residual
73
+ else:
74
+ output = up
75
+ return output
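Note: the Adapter above is a bottleneck (down-project, ReLU, up-project) with a LoRA-style zero init, so with add_residual=True it starts as an identity mapping. A minimal sketch:

    import torch
    from continual_clip.clip_original.adapter import Adapter

    adapter = Adapter(d_model=768, bottleneck=64, adapter_layernorm_option="in")
    x = torch.randn(10, 4, 768)    # (seq_len, batch, d_model)
    out = adapter(x)               # up_proj is zero-initialized at construction,
    print(torch.allclose(out, x))  # so the residual passes through unchanged: True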
continual_clip/clip_original/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
continual_clip/clip_original/clip.py ADDED
@@ -0,0 +1,208 @@
1
+ # Code ported from https://github.com/openai/CLIP
2
+
3
+ import hashlib
4
+ import os
5
+ import urllib
6
+ import warnings
7
+ from typing import Union, List
8
+
9
+ import torch
10
+ from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, RandomResizedCrop, InterpolationMode
11
+ from tqdm import tqdm
12
+ from .model import build_model
13
+ from clip.tokenizer import SimpleTokenizer as _Tokenizer
14
+
15
+ __all__ = ["available_models", "load", "tokenize"]
16
+ _tokenizer = _Tokenizer()
17
+
18
+ _MODELS = {
19
+ "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
20
+ "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
21
+ "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
22
+ "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
23
+ "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
24
+ "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
25
+ }
26
+
27
+
28
+ def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
29
+ os.makedirs(root, exist_ok=True)
30
+ filename = os.path.basename(url)
31
+
32
+ expected_sha256 = url.split("/")[-2]
33
+ download_target = os.path.join(root, filename)
34
+
35
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
36
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
37
+
38
+ if os.path.isfile(download_target):
39
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
40
+ return download_target
41
+ else:
42
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
43
+
44
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
45
+ with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
46
+ while True:
47
+ buffer = source.read(8192)
48
+ if not buffer:
49
+ break
50
+
51
+ output.write(buffer)
52
+ loop.update(len(buffer))
53
+
54
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
55
+ raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
56
+
57
+ return download_target
58
+
59
+ def _convert_to_rgb(image):
60
+ return image.convert('RGB')
61
+
62
+ def _transform(n_px: int, is_train: bool):
63
+ normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
64
+ if is_train:
65
+ return Compose([
66
+ RandomResizedCrop(n_px, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC),
67
+ _convert_to_rgb,
68
+ ToTensor(),
69
+ normalize,
70
+ ])
71
+ else:
72
+ return Compose([
73
+ Resize(n_px, interpolation=InterpolationMode.BICUBIC),
74
+ CenterCrop(n_px),
75
+ _convert_to_rgb,
76
+ ToTensor(),
77
+ normalize,
78
+ ])
79
+
80
+
81
+
82
+ def available_models() -> List[str]:
83
+ """Returns the names of available CLIP models"""
84
+ return list(_MODELS.keys())
85
+
86
+
87
+ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True, is_train=False, pretrained=True):
88
+ """Load a CLIP model
89
+ Parameters
90
+ ----------
91
+ name : str
92
+ A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
93
+ device : Union[str, torch.device]
94
+ The device to put the loaded model
95
+ jit : bool
96
+ Whether to load the optimized JIT model (default) or more hackable non-JIT model.
97
+ Returns
98
+ -------
99
+ model : torch.nn.Module
100
+ The CLIP model
101
+ preprocess : Callable[[PIL.Image], torch.Tensor]
102
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
103
+ """
104
+ if name in _MODELS:
105
+ model_path = _download(_MODELS[name])
106
+ elif os.path.isfile(name):
107
+ model_path = name
108
+ else:
109
+ raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
110
+
111
+ try:
112
+ # loading JIT archive
113
+ model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
114
+ state_dict = None
115
+ except RuntimeError:
116
+ # loading saved state dict
117
+ if jit:
118
+ warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
119
+ jit = False
120
+ state_dict = torch.load(model_path, map_location="cpu")
121
+
122
+ if not jit:
123
+ try:
124
+ model = build_model(state_dict or model.state_dict()).to(device)
125
+ except KeyError:
126
+ sd = {k[7:]: v for k,v in state_dict["state_dict"].items()}
127
+ model = build_model(sd).to(device)
128
+
129
+ if str(device) == "cpu":
130
+ model.float()
131
+ return model, \
132
+ _transform(model.visual.input_resolution, is_train=True), \
133
+ _transform(model.visual.input_resolution, is_train=False)
134
+
135
+ # patch the device names
136
+ device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
137
+ device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
138
+
139
+ def patch_device(module):
140
+ graphs = [module.graph] if hasattr(module, "graph") else []
141
+ if hasattr(module, "forward1"):
142
+ graphs.append(module.forward1.graph)
143
+
144
+ for graph in graphs:
145
+ for node in graph.findAllNodes("prim::Constant"):
146
+ if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
147
+ node.copyAttributes(device_node)
148
+
149
+ model.apply(patch_device)
150
+ patch_device(model.encode_image)
151
+ patch_device(model.encode_text)
152
+
153
+ # patch dtype to float32 on CPU
154
+ if str(device) == "cpu":
155
+ float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
156
+ float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
157
+ float_node = float_input.node()
158
+
159
+ def patch_float(module):
160
+ graphs = [module.graph] if hasattr(module, "graph") else []
161
+ if hasattr(module, "forward1"):
162
+ graphs.append(module.forward1.graph)
163
+
164
+ for graph in graphs:
165
+ for node in graph.findAllNodes("aten::to"):
166
+ inputs = list(node.inputs())
167
+ for i in [1, 2]: # dtype can be the second or third argument to aten::to()
168
+ if inputs[i].node()["value"] == 5:
169
+ inputs[i].node().copyAttributes(float_node)
170
+
171
+ model.apply(patch_float)
172
+ patch_float(model.encode_image)
173
+ patch_float(model.encode_text)
174
+
175
+ model.float()
176
+
177
+ return model, \
178
+ _transform(model.input_resolution.item(), is_train=True), \
179
+ _transform(model.input_resolution.item(), is_train=False)
180
+
181
+
182
+ def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
183
+ """
184
+ Returns the tokenized representation of given input string(s)
185
+ Parameters
186
+ ----------
187
+ texts : Union[str, List[str]]
188
+ An input string or a list of input strings to tokenize
189
+ context_length : int
190
+ The context length to use; all CLIP models use 77 as the context length
191
+ Returns
192
+ -------
193
+ A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
194
+ """
195
+ if isinstance(texts, str):
196
+ texts = [texts]
197
+
198
+ sot_token = _tokenizer.encoder["<start_of_text>"]
199
+ eot_token = _tokenizer.encoder["<end_of_text>"]
200
+ all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
201
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
202
+
203
+ for i, tokens in enumerate(all_tokens):
204
+ if len(tokens) > context_length: # Truncate
205
+ tokens = tokens[:context_length]
206
+ result[i, :len(tokens)] = torch.tensor(tokens)
207
+
208
+ return result
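Note: unlike upstream OpenAI CLIP, load here returns three values (model, train transform, test transform), and tokenize pads/truncates to the fixed 77-token context. A short sketch:

    import torch
    from continual_clip.clip_original import clip

    model, train_tf, test_tf = clip.load("ViT-B/16", jit=False)
    tokens = clip.tokenize(["a photo of a cat", "a photo of a dog"])
    print(tokens.shape)  # torch.Size([2, 77])
    with torch.no_grad():
        text_features = model.encode_text(tokens.to(next(model.parameters()).device))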
continual_clip/clip_original/model.py ADDED
@@ -0,0 +1,568 @@
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import os
5
+ import json
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+ from .adapter import Adapter
11
+ from torch.distributions.normal import Normal
12
+ from collections import Counter
13
+
14
+ global_taskid = 0
15
+ global_is_train = True
16
+
17
+ class Bottleneck(nn.Module):
18
+ expansion = 4
19
+
20
+ def __init__(self, inplanes, planes, stride=1):
21
+ super().__init__()
22
+
23
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
24
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
25
+ self.bn1 = nn.BatchNorm2d(planes)
26
+
27
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
28
+ self.bn2 = nn.BatchNorm2d(planes)
29
+
30
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
31
+
32
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
33
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
34
+
35
+ self.relu = nn.ReLU(inplace=True)
36
+ self.downsample = None
37
+ self.stride = stride
38
+
39
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
40
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
41
+ self.downsample = nn.Sequential(OrderedDict([
42
+ ("-1", nn.AvgPool2d(stride)),
43
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
44
+ ("1", nn.BatchNorm2d(planes * self.expansion))
45
+ ]))
46
+
47
+ def forward(self, x: torch.Tensor):
48
+ identity = x
49
+
50
+ out = self.relu(self.bn1(self.conv1(x)))
51
+ out = self.relu(self.bn2(self.conv2(out)))
52
+ out = self.avgpool(out)
53
+ out = self.bn3(self.conv3(out))
54
+
55
+ if self.downsample is not None:
56
+ identity = self.downsample(x)
57
+
58
+ out += identity
59
+ out = self.relu(out)
60
+ return out
61
+
62
+
63
+ class AttentionPool2d(nn.Module):
64
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
65
+ super().__init__()
66
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
67
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
68
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
69
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
70
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
71
+ self.num_heads = num_heads
72
+
73
+ def forward(self, x):
74
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
75
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
76
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
77
+ x, _ = F.multi_head_attention_forward(
78
+ query=x, key=x, value=x,
79
+ embed_dim_to_check=x.shape[-1],
80
+ num_heads=self.num_heads,
81
+ q_proj_weight=self.q_proj.weight,
82
+ k_proj_weight=self.k_proj.weight,
83
+ v_proj_weight=self.v_proj.weight,
84
+ in_proj_weight=None,
85
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
86
+ bias_k=None,
87
+ bias_v=None,
88
+ add_zero_attn=False,
89
+ dropout_p=0,
90
+ out_proj_weight=self.c_proj.weight,
91
+ out_proj_bias=self.c_proj.bias,
92
+ use_separate_proj_weight=True,
93
+ training=self.training,
94
+ need_weights=False
95
+ )
96
+
97
+ return x[0]
98
+
99
+
100
+ class ModifiedResNet(nn.Module):
101
+ """
102
+ A ResNet class that is similar to torchvision's but contains the following changes:
103
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
104
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
105
+ - The final pooling layer is a QKV attention instead of an average pool
106
+ """
107
+
108
+ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
109
+ super().__init__()
110
+ self.output_dim = output_dim
111
+ self.input_resolution = input_resolution
112
+
113
+ # the 3-layer stem
114
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
115
+ self.bn1 = nn.BatchNorm2d(width // 2)
116
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
117
+ self.bn2 = nn.BatchNorm2d(width // 2)
118
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
119
+ self.bn3 = nn.BatchNorm2d(width)
120
+ self.avgpool = nn.AvgPool2d(2)
121
+ self.relu = nn.ReLU(inplace=True)
122
+
123
+ # residual layers
124
+ self._inplanes = width # this is a *mutable* variable used during construction
125
+ self.layer1 = self._make_layer(width, layers[0])
126
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
127
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
128
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
129
+
130
+ embed_dim = width * 32 # the ResNet feature dimension
131
+ self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
132
+
133
+ def _make_layer(self, planes, blocks, stride=1):
134
+ layers = [Bottleneck(self._inplanes, planes, stride)]
135
+
136
+ self._inplanes = planes * Bottleneck.expansion
137
+ for _ in range(1, blocks):
138
+ layers.append(Bottleneck(self._inplanes, planes))
139
+
140
+ return nn.Sequential(*layers)
141
+
142
+ def forward(self, x):
143
+ def stem(x):
144
+ for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
145
+ x = self.relu(bn(conv(x)))
146
+ x = self.avgpool(x)
147
+ return x
148
+
149
+ x = x.type(self.conv1.weight.dtype)
150
+ x = stem(x)
151
+ x = self.layer1(x)
152
+ x = self.layer2(x)
153
+ x = self.layer3(x)
154
+ x = self.layer4(x)
155
+ x = self.attnpool(x)
156
+
157
+ return x
158
+
159
+
160
+ class LayerNorm(nn.LayerNorm):
161
+ """Subclass torch's LayerNorm to handle fp16."""
162
+
163
+ def forward(self, x: torch.Tensor):
164
+ orig_type = x.dtype
165
+ ret = super().forward(x.type(torch.float32))
166
+ return ret.type(orig_type)
167
+
168
+
169
+ class QuickGELU(nn.Module):
170
+ def forward(self, x: torch.Tensor):
171
+ return x * torch.sigmoid(1.702 * x)
172
+
173
+
174
+ class ResidualAttentionBlock(nn.Module):
175
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, text_or_image=None, flag=False):
176
+ super().__init__()
177
+ self.register_buffer("mean", torch.tensor([0.0]))
178
+ self.register_buffer("std", torch.tensor([1.0]))
179
+ self.attn = nn.MultiheadAttention(d_model, n_head)
180
+ self.ln_1 = LayerNorm(d_model)
181
+ self.mlp = nn.Sequential(OrderedDict([
182
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
183
+ ("gelu", QuickGELU()),
184
+ ("c_proj", nn.Linear(d_model * 4, d_model))
185
+ ]))
186
+ self.ln_2 = LayerNorm(d_model)
187
+ self.attn_mask = attn_mask
188
+ self.is_train = global_is_train
189
+ self.step = 1
190
+ self.top_k = 2
191
+ self.ffn_num = 64
192
+ self.experts_num = 1
193
+ self.softmax = nn.Softmax(1)
194
+ self.softplus = nn.Softplus()
195
+ self.noisy_gating = True
196
+ self.adaptmlp_list = nn.ModuleList()
197
+ self.text_or_image = text_or_image
198
+ self.flag = flag
199
+ if text_or_image == 'text':
200
+ print('vanilla text transformer')
201
+ self.choose_map_text = torch.zeros([self.experts_num])
202
+ else:
203
+ print('vanilla image transformer')
204
+ self.choose_map_image = torch.zeros([self.experts_num])
205
+
206
+ # self.taskid = None
207
+ def attention(self, x: torch.Tensor):
208
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
209
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
210
+
211
+ def cv_squared(self, x):
212
+ """The squared coefficient of variation of a sample.
213
+ Useful as a loss to encourage a positive distribution to be more uniform.
214
+ Epsilons added for numerical stability.
215
+ Returns 0 for an empty Tensor.
216
+ Args:
217
+ x: a `Tensor`.
218
+ Returns:
219
+ a `Scalar`.
220
+ """
221
+ eps = 1e-10
222
+ # with a single element there is no variance to measure; return 0
223
+
224
+ if x.shape[0] == 1:
225
+ return torch.tensor([0], device=x.device, dtype=x.dtype)
226
+ return x.float().var() / (x.float().mean()**2 + eps)
227
+
228
+ def _gates_to_load(self, gates):
229
+ """Compute the true load per expert, given the gates.
230
+ The load is the number of examples for which the corresponding gate is >0.
231
+ Args:
232
+ gates: a `Tensor` of shape [batch_size, n]
233
+ Returns:
234
+ a float32 `Tensor` of shape [n]
235
+ """
236
+ return (gates > 0).sum(0)
237
+
238
+ def _prob_in_top_k(self, clean_values, noisy_values, noise_stddev, noisy_top_values):
239
+ """Helper function to NoisyTopKGating.
240
+ Computes the probability that value is in top k, given different random noise.
241
+ This gives us a way of backpropagating from a loss that balances the number
242
+ of times each expert is in the top k experts per example.
243
+ In the case of no noise, pass in None for noise_stddev, and the result will
244
+ not be differentiable.
245
+ Args:
246
+ clean_values: a `Tensor` of shape [batch, n].
247
+ noisy_values: a `Tensor` of shape [batch, n]. Equal to clean values plus
248
+ normally distributed noise with standard deviation noise_stddev.
249
+ noise_stddev: a `Tensor` of shape [batch, n], or None
250
+ noisy_top_values: a `Tensor` of shape [batch, m].
251
+ "values" Output of tf.top_k(noisy_top_values, m). m >= k+1
252
+ Returns:
253
+ a `Tensor` of shape [batch, n].
254
+ """
255
+ # print('1231', clean_values)  # all NaN
256
+ batch = clean_values.size(0)
257
+ m = noisy_top_values.size(1)
258
+ top_values_flat = noisy_top_values.flatten()
259
+
260
+ threshold_positions_if_in = torch.arange(batch, device=clean_values.device) * m + self.top_k
261
+ threshold_if_in = torch.unsqueeze(torch.gather(top_values_flat, 0, threshold_positions_if_in), 1)
262
+ is_in = torch.gt(noisy_values, threshold_if_in)
263
+ threshold_positions_if_out = threshold_positions_if_in - 1
264
+ threshold_if_out = torch.unsqueeze(torch.gather(top_values_flat, 0, threshold_positions_if_out), 1)
265
+ # is each value currently in the top k.
266
+ normal = Normal(self.mean, self.std)
267
+ #
268
+
269
+ prob_if_in = normal.cdf((clean_values - threshold_if_in)/noise_stddev)
270
+ prob_if_out = normal.cdf((clean_values - threshold_if_out)/noise_stddev)
271
+ prob = torch.where(is_in, prob_if_in, prob_if_out)
272
+ return prob
273
+
274
+ def noisy_top_k_gating(self, x, train, w_gate, w_noise, noise_epsilon=1e-2):
275
+ """Noisy top-k gating.
276
+ See paper: https://arxiv.org/abs/1701.06538.
277
+ Args:
278
+ x: input Tensor with shape [batch_size, input_size]
279
+ train: a boolean - we only add noise at training time.
280
+ noise_epsilon: a float
281
+ Returns:
282
+ gates: a Tensor with shape [batch_size, num_experts]
283
+ load: a Tensor with shape [num_experts]
284
+ """
285
+
286
+ clean_logits = x @ w_gate.to(x)
287
+ if self.noisy_gating and train:
288
+ raw_noise_stddev = x @ w_noise.to(x)
289
+ noise_stddev = ((self.softplus(raw_noise_stddev) + noise_epsilon))
290
+ noisy_logits = clean_logits + (torch.randn_like(clean_logits) * noise_stddev)
291
+ logits = noisy_logits
292
+ else:
293
+ logits = clean_logits
294
+ # calculate topk + 1 that will be needed for the noisy gates
295
+ top_logits, top_indices = logits.topk(min(self.top_k + 1, self.experts_num), dim=1)
296
+ top_k_logits = top_logits[:, :self.top_k]
297
+ top_k_indices = top_indices[:, :self.top_k]
298
+ top_k_gates = self.softmax(top_k_logits)
299
+ zeros = torch.zeros_like(logits)
300
+ gates = zeros.scatter(1, top_k_indices, top_k_gates)
301
+ if self.noisy_gating and self.top_k < self.experts_num and train:  # currently unused
302
+ load = (self._prob_in_top_k(clean_logits, noisy_logits, noise_stddev, top_logits)).sum(0)
303
+ else:
304
+ load = self._gates_to_load(gates)
305
+ return gates, load
306
+
307
+ def forward(self, x: torch.Tensor):
308
+ x = x + self.attention(self.ln_1(x))
309
+ x = x + self.mlp(self.ln_2(x))
310
+ return x
311
+
312
+
313
+ class Transformer(nn.Module):
314
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, text_or_image=None,
315
+ flag=True):
316
+ super().__init__()
317
+ self.width = width
318
+ self.layers = layers
319
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, text_or_image, flag) for _ in range(layers)])
320
+
321
+ def forward(self, x: torch.Tensor):
322
+ return self.resblocks(x)
323
+
324
+
325
+ class VisualTransformer(nn.Module):
326
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int, text_or_image=None):
327
+ super().__init__()
328
+ self.input_resolution = input_resolution
329
+ self.output_dim = output_dim
330
+ # Added so this info is available. should not change anything.
331
+ self.patch_size = patch_size
332
+ self.width = width
333
+ self.layers = layers
334
+ self.heads = heads
335
+
336
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
337
+
338
+ scale = width ** -0.5
339
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
340
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
341
+ self.ln_pre = LayerNorm(width)
342
+
343
+ self.transformer = Transformer(width, layers, heads, text_or_image=text_or_image, flag=True)
344
+
345
+ self.ln_post = LayerNorm(width)
346
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
347
+
348
+ def forward(self, x: torch.Tensor):
349
+ x = self.conv1(x)
350
+ x = x.reshape(x.shape[0], x.shape[1], -1)
351
+ x = x.permute(0, 2, 1)
352
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
353
+ x = x + self.positional_embedding.to(x.dtype)
354
+ x = self.ln_pre(x)
355
+
356
+ x = x.permute(1, 0, 2) # NLD -> LND
357
+ x = self.transformer(x)
358
+ x = x.permute(1, 0, 2) # LND -> NLD
359
+
360
+ x = self.ln_post(x[:, 0, :])
361
+
362
+ if self.proj is not None:
363
+ x = x @ self.proj
364
+
365
+ return x
366
+
367
+
368
+ class CLIP(nn.Module):
369
+ def __init__(self,
370
+ embed_dim: int,
371
+ # vision
372
+ image_resolution: int,
373
+ vision_layers: Union[Tuple[int, int, int, int], int],
374
+ vision_width: int,
375
+ vision_patch_size: int,
376
+ # text
377
+ context_length: int,
378
+ vocab_size: int,
379
+ transformer_width: int,
380
+ transformer_heads: int,
381
+ transformer_layers: int,
382
+ baseline = False
383
+ ):
384
+ super().__init__()
385
+ self.baseline = baseline
386
+
387
+ self.context_length = context_length
388
+
389
+ if isinstance(vision_layers, (tuple, list)):
390
+ vision_heads = vision_width * 32 // 64
391
+ self.visual = ModifiedResNet(
392
+ layers=vision_layers,
393
+ output_dim=embed_dim,
394
+ heads=vision_heads,
395
+ input_resolution=image_resolution,
396
+ width=vision_width
397
+ )
398
+ else:
399
+ vision_heads = vision_width // 64
400
+ self.visual = VisualTransformer(
401
+ input_resolution=image_resolution,
402
+ patch_size=vision_patch_size,
403
+ width=vision_width,
404
+ layers=vision_layers,
405
+ heads=vision_heads,
406
+ output_dim=embed_dim,
407
+ text_or_image='image',
408
+ )
409
+
410
+ # self.transformer = Transformer(
411
+ # width=transformer_width,
412
+ # layers=transformer_layers,
413
+ # heads=transformer_heads,
414
+ # attn_mask=self.build_attention_mask(),
415
+ # text_or_image='text'
416
+ # )
417
+ self.transformer = Transformer(
418
+ width=transformer_width,
419
+ layers=transformer_layers,
420
+ heads=transformer_heads,
421
+ attn_mask=self.build_attention_mask(),
422
+ text_or_image='text',
423
+ flag = True,
424
+ )
425
+
426
+
427
+ self.vocab_size = vocab_size
428
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
429
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
430
+ self.ln_final = LayerNorm(transformer_width)
431
+
432
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
433
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
434
+
435
+
436
+ self.initialize_parameters()
437
+
438
+ def initialize_parameters(self):
439
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
440
+ nn.init.normal_(self.positional_embedding, std=0.01)
441
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
442
+
443
+ if isinstance(self.visual, ModifiedResNet):
444
+ if self.visual.attnpool is not None:
445
+ std = self.visual.attnpool.c_proj.in_features ** -0.5
446
+ nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
447
+ nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
448
+ nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
449
+ nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
450
+
451
+ for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
452
+ for name, param in resnet_block.named_parameters():
453
+ if name.endswith("bn3.weight"):
454
+ nn.init.zeros_(param)
455
+
456
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
457
+ attn_std = self.transformer.width ** -0.5
458
+ fc_std = (2 * self.transformer.width) ** -0.5
459
+ for block in self.transformer.resblocks:
460
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
461
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
462
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
463
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
464
+
465
+ if self.text_projection is not None:
466
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
467
+
468
+ def build_attention_mask(self):
469
+ # lazily create causal attention mask, with full attention between the vision tokens
470
+ # pytorch uses additive attention mask; fill with -inf
471
+ mask = torch.empty(self.context_length, self.context_length)
472
+ mask.fill_(float("-inf"))
473
+ mask.triu_(1) # zero out the lower diagonal
474
+ return mask
475
+
476
+ @property
477
+ def dtype(self):
478
+ return self.visual.conv1.weight.dtype
479
+
480
+ def encode_image(self, image):
481
+ return self.visual(image.type(self.dtype))
482
+
483
+ def encode_text(self, text):
484
+
485
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
486
+
487
+ x = x + self.positional_embedding.type(self.dtype)
488
+ x = x.permute(1, 0, 2) # NLD -> LND
489
+ x = self.transformer(x)
490
+ x = x.permute(1, 0, 2) # LND -> NLD
491
+ x = self.ln_final(x).type(self.dtype)
492
+
493
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
494
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
495
+
496
+ return x
497
+
498
+ def forward(self, image, text, taskid, is_train):
499
+ global global_taskid, global_is_train
500
+ global_taskid = taskid
501
+ global_is_train = is_train
502
+ return self.encode_image(image)
503
+
504
+
505
+
506
+ def convert_weights(model: nn.Module):
507
+ """Convert applicable model parameters to fp16"""
508
+
509
+ def _convert_weights_to_fp16(l):
510
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
511
+ l.weight.data = l.weight.data.half()
512
+ if l.bias is not None:
513
+ l.bias.data = l.bias.data.half()
514
+
515
+ if isinstance(l, nn.MultiheadAttention):
516
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
517
+ tensor = getattr(l, attr)
518
+ if tensor is not None:
519
+ tensor.data = tensor.data.half()
520
+
521
+ for name in ["text_projection", "proj"]:
522
+ if hasattr(l, name):
523
+ attr = getattr(l, name)
524
+ if attr is not None:
525
+ attr.data = attr.data.half()
526
+
527
+ model.apply(_convert_weights_to_fp16)
528
+
529
+
530
+ def build_model(state_dict: dict):
531
+ vit = "visual.proj" in state_dict
532
+
533
+ if vit:
534
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
535
+ vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
536
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
537
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
538
+ image_resolution = vision_patch_size * grid_size
539
+ else:
540
+ counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
541
+ vision_layers = tuple(counts)
542
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
543
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
544
+ vision_patch_size = None
545
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
546
+ image_resolution = output_width * 32
547
+
548
+ embed_dim = state_dict["text_projection"].shape[1]
549
+ context_length = state_dict["positional_embedding"].shape[0]
550
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
551
+ transformer_width = state_dict["ln_final.weight"].shape[0]
552
+ transformer_heads = transformer_width // 64
553
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
554
+
555
+ model = CLIP(
556
+ embed_dim,
557
+ image_resolution, vision_layers, vision_width, vision_patch_size,
558
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
559
+ )
560
+
561
+ for key in ["input_resolution", "context_length", "vocab_size"]:
562
+ if key in state_dict:
563
+ del state_dict[key]
564
+
565
+ model.load_state_dict(state_dict, strict=False)
566
+ for p in model.parameters():
567
+ p.data = p.data.float()
568
+ return model.eval()
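Note: noisy_top_k_gating above follows the sparsely-gated MoE recipe of Shazeer et al. (arXiv:1701.06538): learned Gaussian noise is added to the gate logits at train time, only the top-k experts receive softmax mass, and cv_squared can serve as a load-balancing loss. A standalone sketch under stated assumptions (the block defaults to a single expert, so we widen it to 8 hypothetical experts and supply caller-owned w_gate/w_noise):

    import torch
    from continual_clip.clip_original.model import ResidualAttentionBlock

    block = ResidualAttentionBlock(d_model=768, n_head=12, text_or_image="image")
    block.experts_num, block.top_k = 8, 2          # class defaults are 1 expert / top-2
    w_gate = torch.nn.Parameter(torch.zeros(768, 8))
    w_noise = torch.nn.Parameter(torch.zeros(768, 8))

    x = torch.randn(16, 768)                       # (batch, d_model)
    gates, load = block.noisy_top_k_gating(x, train=True, w_gate=w_gate, w_noise=w_noise)
    balance_loss = block.cv_squared(gates.sum(0)) + block.cv_squared(load)
    print(gates.shape)                             # (16, 8), at most 2 non-zeros per row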
continual_clip/clip_original/tokenizer.py ADDED
@@ -0,0 +1,140 @@
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
+ @lru_cache()
11
+ def default_bpe():
12
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
+ @lru_cache()
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a significant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ This also avoids mapping to whitespace/control characters that the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+
38
+ def get_pairs(word):
39
+ """Return set of symbol pairs in a word.
40
+ Word is represented as tuple of symbols (symbols being variable-length strings).
41
+ """
42
+ pairs = set()
43
+ prev_char = word[0]
44
+ for char in word[1:]:
45
+ pairs.add((prev_char, char))
46
+ prev_char = char
47
+ return pairs
48
+
49
+
50
+ def basic_clean(text):
51
+ text = ftfy.fix_text(text)
52
+ text = html.unescape(html.unescape(text))
53
+ return text.strip()
54
+
55
+
56
+ def whitespace_clean(text):
57
+ text = re.sub(r'\s+', ' ', text)
58
+ text = text.strip()
59
+ return text
60
+
61
+
62
+ class SimpleTokenizer(object):
63
+ def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
64
+ self.byte_encoder = bytes_to_unicode()
65
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
+ merges = merges[1:49152-256-2+1]
68
+ merges = [tuple(merge.split()) for merge in merges]
69
+ vocab = list(bytes_to_unicode().values())
70
+ vocab = vocab + [v+'</w>' for v in vocab]
71
+ for merge in merges:
72
+ vocab.append(''.join(merge))
73
+ if not special_tokens:
74
+ special_tokens = ['<start_of_text>', '<end_of_text>']
75
+ else:
76
+ special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
77
+ vocab.extend(special_tokens)
78
+ self.encoder = dict(zip(vocab, range(len(vocab))))
79
+ self.decoder = {v: k for k, v in self.encoder.items()}
80
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
81
+ self.cache = {t:t for t in special_tokens}
82
+ special = "|".join(special_tokens)
83
+ self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
84
+
85
+ self.vocab_size = len(self.encoder)
86
+ self.all_special_ids = [self.encoder[t] for t in special_tokens]
87
+
88
+ def bpe(self, token):
89
+ if token in self.cache:
90
+ return self.cache[token]
91
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
92
+ pairs = get_pairs(word)
93
+
94
+ if not pairs:
95
+ return token+'</w>'
96
+
97
+ while True:
98
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
99
+ if bigram not in self.bpe_ranks:
100
+ break
101
+ first, second = bigram
102
+ new_word = []
103
+ i = 0
104
+ while i < len(word):
105
+ try:
106
+ j = word.index(first, i)
107
+ new_word.extend(word[i:j])
108
+ i = j
109
+ except ValueError:  # `first` does not occur after position i
110
+ new_word.extend(word[i:])
111
+ break
112
+
113
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
114
+ new_word.append(first+second)
115
+ i += 2
116
+ else:
117
+ new_word.append(word[i])
118
+ i += 1
119
+ new_word = tuple(new_word)
120
+ word = new_word
121
+ if len(word) == 1:
122
+ break
123
+ else:
124
+ pairs = get_pairs(word)
125
+ word = ' '.join(word)
126
+ self.cache[token] = word
127
+ return word
128
+
129
+ def encode(self, text):
130
+ bpe_tokens = []
131
+ text = whitespace_clean(basic_clean(text)).lower()
132
+ for token in re.findall(self.pat, text):
133
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
134
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
135
+ return bpe_tokens
136
+
137
+ def decode(self, tokens):
138
+ text = ''.join([self.decoder[token] for token in tokens])
139
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
140
+ return text
continual_clip/datasets.py ADDED
@@ -0,0 +1,124 @@
1
+
2
+
3
+ import os
4
+ import torch.nn as nn
5
+
6
+ from continuum import ClassIncremental, InstanceIncremental
7
+ from continuum.datasets import (
8
+ CIFAR100, ImageNet100, TinyImageNet200, ImageFolderDataset, Core50,
9
+ fgvc_aircraft, Caltech101, DTD, EuroSAT, flowers102, food101,
10
+ MNIST, OxfordPet, SUN397
11
+
12
+ )
13
+ from .utils import get_dataset_class_names
14
+
15
+
16
+ class ImageNet1000(ImageFolderDataset):
17
+ """Continuum dataset for datasets with a tree-like structure.
18
+ :param train_folder: The folder of the train data.
19
+ :param test_folder: The folder of the test data.
20
+ :param download: Dummy parameter.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ data_path: str,
26
+ train: bool = True,
27
+ download: bool = False,
28
+ ):
29
+ super().__init__(data_path=data_path, train=train, download=download)
30
+
31
+ def get_data(self):
32
+ if self.train:
33
+ self.data_path = os.path.join(self.data_path, "train")
34
+ else:
35
+ self.data_path = os.path.join(self.data_path, "val")
36
+ return super().get_data()
37
+
38
+
39
+ def get_dataset(cfg, is_train, transforms=None):
40
+ if cfg.dataset == "cifar100":
41
+ data_path = os.path.join(cfg.dataset_root, cfg.dataset)
42
+ dataset = CIFAR100(
43
+ data_path=data_path,
44
+ download=True,
45
+ train=is_train,
46
+ # transforms=transforms
47
+ )
48
+ classes_names = dataset.dataset.classes
49
+
50
+ # elif cfg.dataset == "tiny-imagenet-200":
51
+ elif cfg.dataset == "tinyimagenet":
52
+
53
+ data_path = os.path.join(cfg.dataset_root, cfg.dataset)
54
+ dataset = TinyImageNet200(
55
+ data_path,
56
+ train=is_train,
57
+ download=True
58
+ )
59
+ classes_names = get_dataset_class_names(cfg.workdir, cfg.dataset)
60
+
61
+ elif cfg.dataset == "imagenet100":
62
+ data_path = cfg.dataset_root
63
+ # data_path = os.path.join(cfg.dataset_root, "ImageNet")
64
+ dataset = ImageNet100(
65
+ data_path,
66
+ train=is_train,
67
+ data_subset=os.path.join(cfg.workdir, "dataset_reqs/imagenet100_splits", "train_100.txt" if is_train else "val_100.txt")  # split lists shipped in dataset_reqs/
68
+ )
69
+ classes_names = get_dataset_class_names(cfg.workdir, cfg.dataset)
70
+
71
+ elif cfg.dataset == "imagenet1000":
72
+ data_path = os.path.join(cfg.dataset_root, cfg.dataset)
73
+ dataset = ImageNet1000(
74
+ data_path,
75
+ train=is_train
76
+ )
77
+ classes_names = get_dataset_class_names(cfg.workdir, cfg.dataset)
78
+
79
+ elif cfg.dataset == "core50":
80
+ data_path = os.path.join(cfg.dataset_root, cfg.dataset)
81
+ dataset = Core50(
82
+ data_path,
83
+ scenario="domains",
84
+ classification="category",
85
+ train=is_train
86
+ )
87
+ classes_names = [
88
+ "plug adapters", "mobile phones", "scissors", "light bulbs", "cans",
89
+ "glasses", "balls", "markers", "cups", "remote controls"
90
+ ]
91
+
92
+ else:
93
+ raise ValueError(f"'{cfg.dataset}' is an invalid dataset.")
94
+
95
+ return dataset, classes_names
96
+
97
+
98
+ def build_cl_scenarios(cfg, is_train, transforms):  # returns (scenario, classes_names)
99
+
100
+ dataset, classes_names = get_dataset(cfg, is_train)
101
+
102
+ if cfg.scenario == "class":
103
+ scenario = ClassIncremental(
104
+ dataset,
105
+ initial_increment=cfg.initial_increment,
106
+ increment=cfg.increment,
107
+ transformations=transforms.transforms, # Convert Compose into list
108
+ class_order=cfg.class_order,
109
+ )
110
+
111
+ elif cfg.scenario == "domain":
112
+ scenario = InstanceIncremental(
113
+ dataset,
114
+ transformations=transforms.transforms,
115
+ )
116
+
117
+ elif cfg.scenario == "task-agnostic":
118
+ raise NotImplementedError("The task-agnostic scenario has not been implemented yet.")
119
+
120
+ else:
121
+ raise ValueError(f"You have entered `{cfg.scenario}`, which is not a defined scenario; "
122
+ "please choose from {{'class', 'domain', 'task-agnostic'}}.")
123
+
124
+ return scenario, classes_names
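Note: build_cl_scenarios returns a continuum scenario indexable by task; a sketch of iterating the class-incremental stream (cfg and train_transform are assumed from the caller, e.g. an OmegaConf config like configs/class/cifar100_10-10.yaml and the transform returned by clip.load):

    from torch.utils.data import DataLoader
    from continual_clip.datasets import build_cl_scenarios

    scenario, classes_names = build_cl_scenarios(cfg, is_train=True, transforms=train_transform)
    for task_id, task_set in enumerate(scenario):
        loader = DataLoader(task_set, batch_size=cfg.batch_size, shuffle=True)
        for images, labels, task_ids in loader:  # continuum TaskSets yield (x, y, t)
            pass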
continual_clip/dynamic_dataset.py ADDED
@@ -0,0 +1,108 @@
1
+ import os
2
+ import re
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ # from torchvision import datasets
8
+ from . import datasets, utils
9
+ # import clip.clip as clip
10
+ import clip
11
+
12
+
13
+ class DynamicDataset():
14
+
15
+ def __init__(self, cfg):
16
+ self.ref_database = {}  # key = dataset/task id, value = 4D tensor (num_images, 3, 224, 224)
17
+ self.ref_names = [] # collect the name of the dataset
18
+ self.ref_model, _, self.test_preprocess = clip.load(cfg.model_name, jit=False)
19
+ self.cur_dataset = None
20
+ self.memory_size = 5000
21
+ self.batch_id = 0
22
+
23
+ def update(self, dataset, load):
24
+ # load is a model directly
25
+ self.cur_dataset = dataset
26
+ if not load:  # first round (not used for class-incremental CL)
27
+ new_dataset = self.getNewDataset()
28
+ self.ref_database[self.cur_dataset] = new_dataset[:self.memory_size]
29
+ self.ref_names.append(self.cur_dataset)
30
+ else: # other rounds
31
+ self.ref_model = load
32
+ self.reduceExampleSet()
33
+ self.constructExampleSet()
34
+
35
+ def reduceExampleSet(self):
36
+ print("Reducing Example Set")
37
+ K, t = self.memory_size, len(self.ref_names)+1
38
+ m = K // t
39
+ for dataset in self.ref_names:
40
+ self.ref_database[dataset] = self.ref_database[dataset][:m]
41
+
42
+ def constructExampleSet(self):
43
+ # breakpoint()
44
+ print("Constructing Example Set")
45
+ self.ref_names.append(self.batch_id)
46
+ new_dataset = torch.tensor(self.getNewDataset())
47
+ image_feature = []
48
+ num = new_dataset.shape[0]
49
+
50
+ print("[Constructing] Calculating Distance")
51
+ for ndx in tqdm(np.arange(num)):
52
+ img = torch.unsqueeze(new_dataset[ndx], dim=0)
53
+ img = img.cuda()
54
+ img_feature = self.ref_model(img, None)
55
+ image_feature.append(img_feature.cpu().detach().tolist())
56
+ image_feature = torch.tensor(image_feature)
57
+ image_feature = torch.squeeze(image_feature, dim=1)
58
+ image_feature = image_feature / image_feature.norm(dim=-1, keepdim=True)
59
+ image_feature = np.array(image_feature.cpu().detach())
60
+ image_feature_average = image_feature.mean(axis=0)
61
+
62
+ K, t = self.memory_size, len(self.ref_names)
63
+ m = K - K // t
64
+ update_dataset = []
65
+ if not m:
66
+ m = self.memory_size
67
+ cur_embedding_sum = None
68
+ print("[Constructing] Collecting Examples")
69
+ for k in tqdm(np.arange(min(m, len(image_feature)))):
70
+ if not k:
71
+ index = np.argmin(
72
+ np.sum((image_feature_average - image_feature)**2, axis=1)
73
+ )
74
+ cur_embedding_sum = image_feature[index]
75
+ update_dataset.append((new_dataset.cpu())[index].tolist())
76
+ image_feature = np.delete(image_feature, index, axis=0)
77
+ else:
78
+ index = np.argmin(
79
+ np.sum((
80
+ image_feature_average - (1/(k+1))*(image_feature + cur_embedding_sum)
81
+ )**2, axis=1)
82
+ )
83
+ cur_embedding_sum += image_feature[index]
84
+ update_dataset.append((new_dataset.cpu())[index].tolist())
85
+ image_feature = np.delete(image_feature, index, axis=0)
86
+
87
+ self.ref_database[self.batch_id] = update_dataset
88
+ print("finishing current task", self.batch_id)
89
+ self.batch_id = self.batch_id + 1
90
+
91
+ def getNewDataset(self):
92
+ samples = []
93
+ count = 0
94
+ for sample in tqdm(self.cur_dataset):
95
+ if count == 10000:
96
+ return samples
97
+ count += 1
98
+ samples.append(sample[0].tolist())
99
+ return samples
100
+
101
+ def get(self):
102
+ print("Getting Reference Images")
103
+ value = list(self.ref_database.values())
104
+ out = []
105
+ for i in tqdm(value):
106
+ out += i
107
+ return torch.tensor(out)
108
+
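
The selection rule in constructExampleSet is the herding criterion from iCaRL: at step k, pick the candidate that keeps the mean of the chosen exemplars closest to the overall feature mean. A standalone sketch of that rule on random features (no CLIP model required):

    import numpy as np

    rng = np.random.default_rng(0)
    feats = rng.normal(size=(50, 8))
    feats /= np.linalg.norm(feats, axis=1, keepdims=True)
    mu = feats.mean(axis=0)

    chosen, running_sum = [], np.zeros(8)
    pool = feats.copy()
    for k in range(10):
        # distance between the overall mean and the mean if each candidate joined
        d = np.sum((mu - (pool + running_sum) / (k + 1)) ** 2, axis=1)
        idx = int(np.argmin(d))
        running_sum += pool[idx]
        chosen.append(pool[idx])
        pool = np.delete(pool, idx, axis=0)
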
continual_clip/models.py ADDED
@@ -0,0 +1,228 @@
+ from omegaconf import DictConfig
+ from tqdm import tqdm
+ import torch.nn.functional as F
+ import clip.clip as clip
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader
+
+ from .utils import get_class_ids_per_task, get_class_names
+ from . import utils
+ from .dynamic_dataset import DynamicDataset
+
+ DEFAULT_THRESHOLD = 0.985
+ TOP_SELECT = 1       # number of leading singular vectors kept per update
+ EPOCH_NUM = 4
+ TOP_K_RATIO = 0.1    # fraction of most-similar directions used for scaling
+ LAMBDA_SCALE = 30
+ LAYER_NUM = 12       # transformer blocks in the ViT visual encoder
+
+
+ class ClassIncremental(nn.Module):
+     def __init__(self, cfg, device, origin_flag, jit=False):
+         super().__init__()
+         self.prompt_template = cfg.prompt_template
+         self.device = device
+         self.classes_names = None
+         self.origin_flag = origin_flag
+         self.model, self.transforms, _ = clip.load(cfg.model_name, device=device, jit=jit)
+         self.ref_model = None
+         self.class_ids_per_task = list(get_class_ids_per_task(cfg))
+         self.current_class_names = []
+         self.text_tokens = None
+         self.dynamic_dataset = DynamicDataset(cfg)
+         self.prev_gradients = None
+         self.visual_cur_matrix = {}  # per-layer accumulated activation covariance
+         self.visual_U = {}           # per-layer basis of previous-task feature subspaces
+         self.loss_list = []
+
+     def forward(self, image, taskid):
+         with torch.no_grad():
+             logits_per_image, _ = self.model(image, self.text_tokens, 0, is_train=False)
+             probs = logits_per_image.softmax(dim=-1)
+         return probs
+
+     def adaptation(self, task_id, cfg, train_dataset, train_classes_names, world):
+         self.current_class_names += get_class_names(self.classes_names, self.class_ids_per_task[task_id])
+         self.text_tokens = clip.tokenize(
+             [self.prompt_template.format(c) for c in self.current_class_names]
+         ).to(self.device)
+         if cfg.method != "zeroshot":
+             self.train(task_id, cfg, train_dataset, train_classes_names, world)
+
+     def train(self, task_id, cfg, train_dataset, train_classes_names, world):
+
+         train_loader = DataLoader(train_dataset[task_id:task_id + 1],
+                                   batch_size=cfg.batch_size,
+                                   shuffle=True, num_workers=8)
+
+         train_iter = iter(train_loader)
+         num_batches = len(train_loader)
+         total_iterations = EPOCH_NUM * num_batches
+
+         # Only the adapter parameters are trainable; the CLIP backbone is frozen.
+         for k, v in self.model.named_parameters():
+             if "adapt" not in k:
+                 v.requires_grad = False
+
+         params = [v for k, v in self.model.named_parameters() if "adapt" in k]
+         params_name = [k for k, v in self.model.named_parameters() if "adapt" in k]
+         print('======== trainable params ============', params_name)
+
+         optimizer = torch.optim.AdamW(params, lr=cfg.lr, weight_decay=cfg.weight_decay)
+         scheduler = utils.cosine_lr(optimizer, cfg.lr, 30, total_iterations)
+         self.model = self.model.to(self.device)
+
+         classnames = get_class_names(self.classes_names, self.class_ids_per_task[task_id])
+         print(classnames)
+         texts = [self.prompt_template.format(c) for c in classnames]
+         texts = clip.tokenize(texts).to(self.device)
+
+         self.model.train()
+
+         batch_count = 0
+         lamda = [[0 for _ in range(LAYER_NUM)] for _ in range(LAYER_NUM)]
+         for iteration in tqdm(range(total_iterations + 1)):
+             scheduler(iteration)
+             try:
+                 inputs, targets, task_ids = next(train_iter)
+             except StopIteration:
+                 train_iter = iter(train_loader)
+                 inputs, targets, task_ids = next(train_iter)
+
+             # Remap the global class ids of the current task to [0, increment).
+             if cfg.dataset == "tinyimagenet" and task_id != 0:
+                 shift = 100 + (task_id - 1) * cfg.increment
+             elif cfg.dataset == "imagenet100" and task_id != 0:
+                 shift = cfg.initial_increment + (task_id - 1) * cfg.increment
+             else:
+                 shift = task_id * cfg.increment
+             targets -= shift
+
+             inputs, targets = inputs.to(self.device), targets.to(self.device)
+             # Image and text branches are encoded separately inside the model.
+             logits_per_image, _ = self.model(inputs, texts, 0, is_train=True)
+
+             loss = F.cross_entropy(logits_per_image, targets, label_smoothing=cfg.ls)
+             self.loss_list.append(loss.item())  # store the scalar, not the graph
+             print('CELoss: {}'.format(loss.item()))
+             optimizer.zero_grad()
+             loss.backward()
+
+             if task_id != 0:
+                 if batch_count == 0:
+                     # Once per task: compare the current task's dominant feature
+                     # direction of every layer against the stored subspaces and
+                     # derive a per-(layer, subspace) scaling factor.
+                     for j in range(LAYER_NUM):
+                         activation_visual = self.model.visual.transformer.lora_feature[j]
+                         activation_visual = torch.bmm(activation_visual.detach().permute(1, 2, 0),
+                                                       activation_visual.detach().permute(1, 0, 2)).sum(dim=0)
+                         U_visual, S, Vh = torch.linalg.svd(activation_visual, full_matrices=False)
+                         U_visual = U_visual[:, :TOP_SELECT]
+
+                         for k in range(LAYER_NUM):
+                             v_visual = self.visual_U[k]
+
+                             normalized_vector_visual = U_visual / torch.norm(U_visual)
+                             similarities_visual = []
+                             for column_visual in v_visual.t():
+                                 normalized_column_visual = column_visual / torch.norm(column_visual)
+                                 cos_sim_visual = torch.dot(normalized_vector_visual.squeeze(),
+                                                            normalized_column_visual.squeeze())
+                                 similarities_visual.append(cos_sim_visual)
+
+                             dot_products_visual = torch.mean(
+                                 torch.topk(torch.stack(similarities_visual),
+                                            int(len(similarities_visual) * TOP_K_RATIO))[0])
+                             lamda[j][k] = torch.exp(-dot_products_visual) * LAMBDA_SCALE
+
+                 batch_count = batch_count + 1
+                 # Project the adapter gradients onto the stored subspaces and
+                 # rescale them so updates interfere less with previous tasks.
+                 for name, params in self.model.named_parameters():
+                     for i in range(LAYER_NUM):
+                         if 'visual' in name and 'adapt' in name and 'down' in name and 'weight' in name:
+                             v = self.visual_U[i]
+                             v_ = torch.mm(params.grad.data, v)
+                             params.grad.data = torch.mm(v_, v.T) * lamda[int(name.split(".")[3])][i]
+                         elif 'visual' in name and 'adapt' in name and 'up' in name and 'weight' in name:
+                             v = self.visual_U[i]
+                             v_ = torch.mm(v.T, params.grad.data)
+                             params.grad.data = torch.mm(v, v_) * lamda[int(name.split(".")[3])][i]
+
+             optimizer.step()
+
+         torch.cuda.empty_cache()
+
+         # After training the task, update the per-layer feature subspaces from
+         # one batch of the task's data.
+         train_loader_ = DataLoader(train_dataset[task_id:task_id + 1],
+                                    batch_size=128,
+                                    shuffle=True, num_workers=8)
+         counts = 0
+         models = self.model.to(self.device)
+         for inputs, targets, task_ids in tqdm(train_loader_):
+             inputs = inputs.to(self.device)
+             with torch.no_grad():
+                 outputs = models(inputs, texts, 0, is_train=False)
+
+             for i in range(LAYER_NUM):
+                 activation = models.visual.transformer.lora_feature[i]
+                 activation = torch.bmm(activation.detach().permute(1, 2, 0),
+                                        activation.detach().permute(1, 0, 2)).sum(dim=0)
+                 if len(self.visual_cur_matrix) == i:
+                     # First task: initialize the covariance and its basis.
+                     self.visual_cur_matrix[i] = activation
+                     U, S, Vh = torch.linalg.svd(activation, full_matrices=False)
+                     self.visual_U[i] = U[:, TOP_SELECT:]
+                 else:
+                     # Later tasks: append the new task's basis vectors.
+                     U1, S1, Vh1 = torch.linalg.svd(activation, full_matrices=False)
+                     self.visual_U[i] = torch.cat((self.visual_U[i], U1[:, TOP_SELECT:]), dim=1)
+
+             counts = counts + 1
+             if counts == 1:
+                 break  # a single batch is enough to estimate the subspace
+
+         torch.cuda.empty_cache()
+         self.model.eval()
+
+
+ class DomainIncremental(nn.Module):
+     pass
+
+
+ class TaskAgnostic(nn.Module):
+     pass
+
+
+ def load_model(cfg: DictConfig, device: torch.device, origin_flag) -> nn.Module:
+     r"""Load a CLIP model for the different continual scenarios.
+
+     Arguments:
+         cfg (DictConfig): Experiment configuration.
+         device (torch.device): Device to train (or) evaluate the model on.
+
+     Returns:
+         nn.Module: The scenario-specific CLIP model.
+     """
+     if cfg.scenario == "class":
+         return ClassIncremental(cfg, device, origin_flag)
+     elif cfg.scenario == "domain":
+         return DomainIncremental(cfg, device)
+     elif cfg.scenario == "task-agnostic":
+         return TaskAgnostic(cfg, device)
+     else:
+         raise ValueError(f"""
+             `{cfg.scenario}` is not a valid scenario,
+             please choose from ['class', 'domain', 'task-agnostic'].
+         """)
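
The gradient update above has the form grad <- (grad U) U^T * lambda: each adapter gradient is projected onto the column space of a stored basis U and rescaled. A standalone sketch of that projection on random matrices; the dimensions, rank, and scale below are illustrative assumptions, not values from the repository:

    import torch

    torch.manual_seed(0)
    d, r, k = 64, 8, 5          # feature dim, adapter rank, basis size (assumed)
    grad = torch.randn(r, d)    # gradient of a 'down' adapter weight
    U = torch.linalg.qr(torch.randn(d, k)).Q   # orthonormal stored basis
    lam = 0.5                   # similarity-derived scale, like lamda[j][k]

    projected = (grad @ U) @ U.T * lam   # same form as the params.grad.data update
    # The projection keeps only the components of grad inside span(U):
    assert torch.allclose(projected @ U, (grad @ U) * lam, atol=1e-5)
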
continual_clip/utils.py ADDED
@@ -0,0 +1,210 @@
+ import os
+ import random
+
+ import yaml
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from omegaconf import DictConfig, OmegaConf
+
+ from clip.tokenizer import SimpleTokenizer as _Tokenizer
+
+ _tokenizer = _Tokenizer()
+
+
+ def get_class_order(file_name: str) -> list:
+     r"""Read the class order for the incremental scenario from a YAML file."""
+     with open(file_name, "r") as f:
+         data = yaml.safe_load(f)
+     return data["class_order"]
+
+
+ def get_class_ids_per_task(args):
+     yield args.class_order[:args.initial_increment]
+     for i in range(args.initial_increment, len(args.class_order), args.increment):
+         yield args.class_order[i:i + args.increment]
+
+
+ def get_class_names(classes_names, class_ids_per_task):
+     return [classes_names[class_id] for class_id in class_ids_per_task]
+
+
+ def get_dataset_class_names(workdir, dataset_name, long=False):
+     with open(os.path.join(workdir, "dataset_reqs", f"{dataset_name}_classes.txt"), "r") as f:
+         lines = f.read().splitlines()
+     return [line.split("\t")[-1] for line in lines]
+
+
+ def save_config(config: DictConfig) -> None:
+     OmegaConf.save(config, "config.yaml")
+
+
+ def get_workdir(path):
+     split_path = path.split("/")
+     workdir_idx = split_path.index("cil")
+     return "/".join(split_path[:workdir_idx + 1])
+
+
+ # ---------- optimizer and evaluation helpers ----------
+
+ def assign_learning_rate(param_group, new_lr):
+     param_group["lr"] = new_lr
+
+
+ def _warmup_lr(base_lr, warmup_length, step):
+     return base_lr * (step + 1) / warmup_length
+
+
+ def cosine_lr(optimizer, base_lrs, warmup_length, steps):
+     # Linear warmup followed by a cosine decay to zero.
+     if not isinstance(base_lrs, list):
+         base_lrs = [base_lrs for _ in optimizer.param_groups]
+     assert len(base_lrs) == len(optimizer.param_groups)
+
+     def _lr_adjuster(step):
+         for param_group, base_lr in zip(optimizer.param_groups, base_lrs):
+             if step < warmup_length:
+                 lr = _warmup_lr(base_lr, warmup_length, step)
+             else:
+                 e = step - warmup_length
+                 es = steps - warmup_length
+                 lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
+             assign_learning_rate(param_group, lr)
+
+     return _lr_adjuster
+
+
+ def accuracy(output, target, topk=(1,)):
+     pred = output.topk(max(topk), 1, True, True)[1].t()
+     correct = pred.eq(target.view(1, -1).expand_as(pred))
+     return [
+         float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy())
+         for k in topk
+     ]
+
+
+ def torch_save(classifier, save_path):
+     if os.path.dirname(save_path) != "":
+         os.makedirs(os.path.dirname(save_path), exist_ok=True)
+     torch.save({"state_dict": classifier.state_dict()}, save_path)
+     print("Checkpoint saved to", save_path)
+
+
+ def torch_load(classifier, save_path, device=None):
+     checkpoint = torch.load(save_path)
+     missing_keys, unexpected_keys = classifier.load_state_dict(
+         checkpoint["state_dict"], strict=False
+     )
+     if len(missing_keys) > 0 or len(unexpected_keys) > 0:
+         print("Missing keys:", missing_keys)
+         print("Unexpected keys:", unexpected_keys)
+     print("Checkpoint loaded from", save_path)
+     if device is not None:
+         classifier = classifier.to(device)
+     return classifier
+
+
+ def get_logits(inputs, classifier):
+     assert callable(classifier)
+     if hasattr(classifier, "to"):
+         classifier = classifier.to(inputs.device)
+     return classifier(inputs)
+
+
+ def get_probs(inputs, classifier):
+     if hasattr(classifier, "predict_proba"):
+         probs = classifier.predict_proba(inputs.detach().cpu().numpy())
+         return torch.from_numpy(probs)
+     logits = get_logits(inputs, classifier)
+     return logits.softmax(dim=1)
+
+
+ class LabelSmoothing(torch.nn.Module):
+     def __init__(self, smoothing=0.0):
+         super().__init__()
+         self.confidence = 1.0 - smoothing
+         self.smoothing = smoothing
+
+     def forward(self, x, target):
+         logprobs = torch.nn.functional.log_softmax(x, dim=-1)
+         nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
+         smooth_loss = -logprobs.mean(dim=-1)
+         loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+         return loss.mean()
+
+
+ def seed_all(seed):
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     np.random.seed(seed)
+     random.seed(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+
+ def num_parameters(model):
+     return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+ def batch(iterable, n=64):
+     l = len(iterable)
+     for ndx in range(0, l, n):
+         yield iterable[ndx:min(ndx + n, l)]
+
+
+ def merge_we(model_0, model_1, sma_count):
+     # Running weight average of model_0 into model_1.
+     for param_q, param_k in zip(model_0.parameters(), model_1.parameters()):
+         param_k.data = (param_k.data * sma_count + param_q.data) / (1.0 + sma_count)
+     return model_1
+
+
+ def wise_we(model_0, model_1, sma_count, model_n, alpha=0.95):
+     # WiSE-style merge: average model_0 into model_1, then interpolate with model_n.
+     for param_q, param_k, param_n in zip(model_0.parameters(), model_1.parameters(), model_n.parameters()):
+         param_k.data = (
+             (param_k.data * sma_count + param_q.data) / (1.0 + sma_count)
+         ) * alpha + param_n.data * (1 - alpha)
+     return model_1
+
+
+ def merge_we_router(model_0, model_1, sma_count):
+     # Average only the router / noise parameters.
+     for param_q, param_k, name_q, name_k in zip(model_0.parameters(), model_1.parameters(),
+                                                 model_0.named_parameters(), model_1.named_parameters()):
+         if "router" in name_k[0] or "noise" in name_k[0]:
+             param_k.data = (param_k.data * sma_count + param_q.data) / (1.0 + sma_count)
+     return model_1
+
+
+ def moving_avg(model_0, model_1, alpha=0.999):
+     for param_q, param_k in zip(model_0.parameters(), model_1.parameters()):
+         param_q.data = param_q.data * alpha + param_k.data * (1 - alpha)
+
+
+ def l2_loss(model, model_ref):
+     loss = 0.0
+     for param_q, param_k in zip(model.parameters(), model_ref.parameters()):
+         loss += F.mse_loss(param_q, param_k.detach(), reduction="sum")
+     return loss
+
+
+ def virtual_vocab(length=10, n_class=1000):
+     # Random token sequences wrapped with start/end tokens, padded to 77 slots.
+     voc_len = len(_tokenizer.encoder)
+     texts = torch.randint(0, voc_len, (n_class, length))
+     start = torch.full((n_class, 1), _tokenizer.encoder["<start_of_text>"])
+     end = torch.full((n_class, 1), _tokenizer.encoder["<end_of_text>"])
+     zeros = torch.zeros((n_class, 75 - length), dtype=torch.long)
+     texts = torch.cat([start, texts, end, zeros], dim=1)
+     return texts
+
+
+ def distillation(t, s, T=2):
+     # Distillation loss with soft teacher targets p at temperature T.
+     p = F.softmax(t / T, dim=1)
+     loss = F.cross_entropy(s / T, p, reduction="mean") * (T ** 2)
+     return loss
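
As a quick check of the schedule above, a minimal sketch that drives cosine_lr (as defined in this file) on a dummy optimizer; the learning rate, warmup length, and step count below are illustrative values only:

    import torch

    params = [torch.nn.Parameter(torch.zeros(1))]
    opt = torch.optim.AdamW(params, lr=1e-3)
    adjust = cosine_lr(opt, base_lrs=1e-3, warmup_length=5, steps=20)

    for step in range(20):
        adjust(step)
        # linear warmup for 5 steps, then cosine decay toward zero
        print(step, opt.param_groups[0]["lr"])
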
dataset_reqs/imagenet1000_classes.txt ADDED
@@ -0,0 +1,1000 @@
1
+ 0 tench
2
+ 1 goldfish
3
+ 2 great white shark
4
+ 3 tiger shark
5
+ 4 hammerhead shark
6
+ 5 electric ray
7
+ 6 stingray
8
+ 7 rooster
9
+ 8 hen
10
+ 9 ostrich
11
+ 10 brambling
12
+ 11 goldfinch
13
+ 12 house finch
14
+ 13 junco
15
+ 14 indigo bunting
16
+ 15 American robin
17
+ 16 bulbul
18
+ 17 jay
19
+ 18 magpie
20
+ 19 chickadee
21
+ 20 American dipper
22
+ 21 kite (bird of prey)
23
+ 22 bald eagle
24
+ 23 vulture
25
+ 24 great grey owl
26
+ 25 fire salamander
27
+ 26 smooth newt
28
+ 27 newt
29
+ 28 spotted salamander
30
+ 29 axolotl
31
+ 30 American bullfrog
32
+ 31 tree frog
33
+ 32 tailed frog
34
+ 33 loggerhead sea turtle
35
+ 34 leatherback sea turtle
36
+ 35 mud turtle
37
+ 36 terrapin
38
+ 37 box turtle
39
+ 38 banded gecko
40
+ 39 green iguana
41
+ 40 Carolina anole
42
+ 41 desert grassland whiptail lizard
43
+ 42 agama
44
+ 43 frilled-necked lizard
45
+ 44 alligator lizard
46
+ 45 Gila monster
47
+ 46 European green lizard
48
+ 47 chameleon
49
+ 48 Komodo dragon
50
+ 49 Nile crocodile
51
+ 50 American alligator
52
+ 51 triceratops
53
+ 52 worm snake
54
+ 53 ring-necked snake
55
+ 54 eastern hog-nosed snake
56
+ 55 smooth green snake
57
+ 56 kingsnake
58
+ 57 garter snake
59
+ 58 water snake
60
+ 59 vine snake
61
+ 60 night snake
62
+ 61 boa constrictor
63
+ 62 African rock python
64
+ 63 Indian cobra
65
+ 64 green mamba
66
+ 65 sea snake
67
+ 66 Saharan horned viper
68
+ 67 eastern diamondback rattlesnake
69
+ 68 sidewinder rattlesnake
70
+ 69 trilobite
71
+ 70 harvestman
72
+ 71 scorpion
73
+ 72 yellow garden spider
74
+ 73 barn spider
75
+ 74 European garden spider
76
+ 75 southern black widow
77
+ 76 tarantula
78
+ 77 wolf spider
79
+ 78 tick
80
+ 79 centipede
81
+ 80 black grouse
82
+ 81 ptarmigan
83
+ 82 ruffed grouse
84
+ 83 prairie grouse
85
+ 84 peafowl
86
+ 85 quail
87
+ 86 partridge
88
+ 87 african grey parrot
89
+ 88 macaw
90
+ 89 sulphur-crested cockatoo
91
+ 90 lorikeet
92
+ 91 coucal
93
+ 92 bee eater
94
+ 93 hornbill
95
+ 94 hummingbird
96
+ 95 jacamar
97
+ 96 toucan
98
+ 97 duck
99
+ 98 red-breasted merganser
100
+ 99 goose
101
+ 100 black swan
102
+ 101 tusker
103
+ 102 echidna
104
+ 103 platypus
105
+ 104 wallaby
106
+ 105 koala
107
+ 106 wombat
108
+ 107 jellyfish
109
+ 108 sea anemone
110
+ 109 brain coral
111
+ 110 flatworm
112
+ 111 nematode
113
+ 112 conch
114
+ 113 snail
115
+ 114 slug
116
+ 115 sea slug
117
+ 116 chiton
118
+ 117 chambered nautilus
119
+ 118 Dungeness crab
120
+ 119 rock crab
121
+ 120 fiddler crab
122
+ 121 red king crab
123
+ 122 American lobster
124
+ 123 spiny lobster
125
+ 124 crayfish
126
+ 125 hermit crab
127
+ 126 isopod
128
+ 127 white stork
129
+ 128 black stork
130
+ 129 spoonbill
131
+ 130 flamingo
132
+ 131 little blue heron
133
+ 132 great egret
134
+ 133 bittern bird
135
+ 134 crane bird
136
+ 135 limpkin
137
+ 136 common gallinule
138
+ 137 American coot
139
+ 138 bustard
140
+ 139 ruddy turnstone
141
+ 140 dunlin
142
+ 141 common redshank
143
+ 142 dowitcher
144
+ 143 oystercatcher
145
+ 144 pelican
146
+ 145 king penguin
147
+ 146 albatross
148
+ 147 grey whale
149
+ 148 killer whale
150
+ 149 dugong
151
+ 150 sea lion
152
+ 151 Chihuahua
153
+ 152 Japanese Chin
154
+ 153 Maltese
155
+ 154 Pekingese
156
+ 155 Shih Tzu
157
+ 156 King Charles Spaniel
158
+ 157 Papillon
159
+ 158 toy terrier
160
+ 159 Rhodesian Ridgeback
161
+ 160 Afghan Hound
162
+ 161 Basset Hound
163
+ 162 Beagle
164
+ 163 Bloodhound
165
+ 164 Bluetick Coonhound
166
+ 165 Black and Tan Coonhound
167
+ 166 Treeing Walker Coonhound
168
+ 167 English foxhound
169
+ 168 Redbone Coonhound
170
+ 169 borzoi
171
+ 170 Irish Wolfhound
172
+ 171 Italian Greyhound
173
+ 172 Whippet
174
+ 173 Ibizan Hound
175
+ 174 Norwegian Elkhound
176
+ 175 Otterhound
177
+ 176 Saluki
178
+ 177 Scottish Deerhound
179
+ 178 Weimaraner
180
+ 179 Staffordshire Bull Terrier
181
+ 180 American Staffordshire Terrier
182
+ 181 Bedlington Terrier
183
+ 182 Border Terrier
184
+ 183 Kerry Blue Terrier
185
+ 184 Irish Terrier
186
+ 185 Norfolk Terrier
187
+ 186 Norwich Terrier
188
+ 187 Yorkshire Terrier
189
+ 188 Wire Fox Terrier
190
+ 189 Lakeland Terrier
191
+ 190 Sealyham Terrier
192
+ 191 Airedale Terrier
193
+ 192 Cairn Terrier
194
+ 193 Australian Terrier
195
+ 194 Dandie Dinmont Terrier
196
+ 195 Boston Terrier
197
+ 196 Miniature Schnauzer
198
+ 197 Giant Schnauzer
199
+ 198 Standard Schnauzer
200
+ 199 Scottish Terrier
201
+ 200 Tibetan Terrier
202
+ 201 Australian Silky Terrier
203
+ 202 Soft-coated Wheaten Terrier
204
+ 203 West Highland White Terrier
205
+ 204 Lhasa Apso
206
+ 205 Flat-Coated Retriever
207
+ 206 Curly-coated Retriever
208
+ 207 Golden Retriever
209
+ 208 Labrador Retriever
210
+ 209 Chesapeake Bay Retriever
211
+ 210 German Shorthaired Pointer
212
+ 211 Vizsla
213
+ 212 English Setter
214
+ 213 Irish Setter
215
+ 214 Gordon Setter
216
+ 215 Brittany dog
217
+ 216 Clumber Spaniel
218
+ 217 English Springer Spaniel
219
+ 218 Welsh Springer Spaniel
220
+ 219 Cocker Spaniel
221
+ 220 Sussex Spaniel
222
+ 221 Irish Water Spaniel
223
+ 222 Kuvasz
224
+ 223 Schipperke
225
+ 224 Groenendael dog
226
+ 225 Malinois
227
+ 226 Briard
228
+ 227 Australian Kelpie
229
+ 228 Komondor
230
+ 229 Old English Sheepdog
231
+ 230 Shetland Sheepdog
232
+ 231 collie
233
+ 232 Border Collie
234
+ 233 Bouvier des Flandres dog
235
+ 234 Rottweiler
236
+ 235 German Shepherd Dog
237
+ 236 Dobermann
238
+ 237 Miniature Pinscher
239
+ 238 Greater Swiss Mountain Dog
240
+ 239 Bernese Mountain Dog
241
+ 240 Appenzeller Sennenhund
242
+ 241 Entlebucher Sennenhund
243
+ 242 Boxer
244
+ 243 Bullmastiff
245
+ 244 Tibetan Mastiff
246
+ 245 French Bulldog
247
+ 246 Great Dane
248
+ 247 St. Bernard
249
+ 248 husky
250
+ 249 Alaskan Malamute
251
+ 250 Siberian Husky
252
+ 251 Dalmatian
253
+ 252 Affenpinscher
254
+ 253 Basenji
255
+ 254 pug
256
+ 255 Leonberger
257
+ 256 Newfoundland dog
258
+ 257 Great Pyrenees dog
259
+ 258 Samoyed
260
+ 259 Pomeranian
261
+ 260 Chow Chow
262
+ 261 Keeshond
263
+ 262 brussels griffon
264
+ 263 Pembroke Welsh Corgi
265
+ 264 Cardigan Welsh Corgi
266
+ 265 Toy Poodle
267
+ 266 Miniature Poodle
268
+ 267 Standard Poodle
269
+ 268 Mexican hairless dog (xoloitzcuintli)
270
+ 269 grey wolf
271
+ 270 Alaskan tundra wolf
272
+ 271 red wolf or maned wolf
273
+ 272 coyote
274
+ 273 dingo
275
+ 274 dhole
276
+ 275 African wild dog
277
+ 276 hyena
278
+ 277 red fox
279
+ 278 kit fox
280
+ 279 Arctic fox
281
+ 280 grey fox
282
+ 281 tabby cat
283
+ 282 tiger cat
284
+ 283 Persian cat
285
+ 284 Siamese cat
286
+ 285 Egyptian Mau
287
+ 286 cougar
288
+ 287 lynx
289
+ 288 leopard
290
+ 289 snow leopard
291
+ 290 jaguar
292
+ 291 lion
293
+ 292 tiger
294
+ 293 cheetah
295
+ 294 brown bear
296
+ 295 American black bear
297
+ 296 polar bear
298
+ 297 sloth bear
299
+ 298 mongoose
300
+ 299 meerkat
301
+ 300 tiger beetle
302
+ 301 ladybug
303
+ 302 ground beetle
304
+ 303 longhorn beetle
305
+ 304 leaf beetle
306
+ 305 dung beetle
307
+ 306 rhinoceros beetle
308
+ 307 weevil
309
+ 308 fly
310
+ 309 bee
311
+ 310 ant
312
+ 311 grasshopper
313
+ 312 cricket insect
314
+ 313 stick insect
315
+ 314 cockroach
316
+ 315 praying mantis
317
+ 316 cicada
318
+ 317 leafhopper
319
+ 318 lacewing
320
+ 319 dragonfly
321
+ 320 damselfly
322
+ 321 red admiral butterfly
323
+ 322 ringlet butterfly
324
+ 323 monarch butterfly
325
+ 324 small white butterfly
326
+ 325 sulphur butterfly
327
+ 326 gossamer-winged butterfly
328
+ 327 starfish
329
+ 328 sea urchin
330
+ 329 sea cucumber
331
+ 330 cottontail rabbit
332
+ 331 hare
333
+ 332 Angora rabbit
334
+ 333 hamster
335
+ 334 porcupine
336
+ 335 fox squirrel
337
+ 336 marmot
338
+ 337 beaver
339
+ 338 guinea pig
340
+ 339 common sorrel horse
341
+ 340 zebra
342
+ 341 pig
343
+ 342 wild boar
344
+ 343 warthog
345
+ 344 hippopotamus
346
+ 345 ox
347
+ 346 water buffalo
348
+ 347 bison
349
+ 348 ram (adult male sheep)
350
+ 349 bighorn sheep
351
+ 350 Alpine ibex
352
+ 351 hartebeest
353
+ 352 impala (antelope)
354
+ 353 gazelle
355
+ 354 arabian camel
356
+ 355 llama
357
+ 356 weasel
358
+ 357 mink
359
+ 358 European polecat
360
+ 359 black-footed ferret
361
+ 360 otter
362
+ 361 skunk
363
+ 362 badger
364
+ 363 armadillo
365
+ 364 three-toed sloth
366
+ 365 orangutan
367
+ 366 gorilla
368
+ 367 chimpanzee
369
+ 368 gibbon
370
+ 369 siamang
371
+ 370 guenon
372
+ 371 patas monkey
373
+ 372 baboon
374
+ 373 macaque
375
+ 374 langur
376
+ 375 black-and-white colobus
377
+ 376 proboscis monkey
378
+ 377 marmoset
379
+ 378 white-headed capuchin
380
+ 379 howler monkey
381
+ 380 titi monkey
382
+ 381 Geoffroy's spider monkey
383
+ 382 common squirrel monkey
384
+ 383 ring-tailed lemur
385
+ 384 indri
386
+ 385 Asian elephant
387
+ 386 African bush elephant
388
+ 387 red panda
389
+ 388 giant panda
390
+ 389 snoek fish
391
+ 390 eel
392
+ 391 silver salmon
393
+ 392 rock beauty fish
394
+ 393 clownfish
395
+ 394 sturgeon
396
+ 395 gar fish
397
+ 396 lionfish
398
+ 397 pufferfish
399
+ 398 abacus
400
+ 399 abaya
401
+ 400 academic gown
402
+ 401 accordion
403
+ 402 acoustic guitar
404
+ 403 aircraft carrier
405
+ 404 airliner
406
+ 405 airship
407
+ 406 altar
408
+ 407 ambulance
409
+ 408 amphibious vehicle
410
+ 409 analog clock
411
+ 410 apiary
412
+ 411 apron
413
+ 412 trash can
414
+ 413 assault rifle
415
+ 414 backpack
416
+ 415 bakery
417
+ 416 balance beam
418
+ 417 balloon
419
+ 418 ballpoint pen
420
+ 419 Band-Aid
421
+ 420 banjo
422
+ 421 baluster / handrail
423
+ 422 barbell
424
+ 423 barber chair
425
+ 424 barbershop
426
+ 425 barn
427
+ 426 barometer
428
+ 427 barrel
429
+ 428 wheelbarrow
430
+ 429 baseball
431
+ 430 basketball
432
+ 431 bassinet
433
+ 432 bassoon
434
+ 433 swimming cap
435
+ 434 bath towel
436
+ 435 bathtub
437
+ 436 station wagon
438
+ 437 lighthouse
439
+ 438 beaker
440
+ 439 military hat (bearskin or shako)
441
+ 440 beer bottle
442
+ 441 beer glass
443
+ 442 bell tower
444
+ 443 baby bib
445
+ 444 tandem bicycle
446
+ 445 bikini
447
+ 446 ring binder
448
+ 447 binoculars
449
+ 448 birdhouse
450
+ 449 boathouse
451
+ 450 bobsleigh
452
+ 451 bolo tie
453
+ 452 poke bonnet
454
+ 453 bookcase
455
+ 454 bookstore
456
+ 455 bottle cap
457
+ 456 hunting bow
458
+ 457 bow tie
459
+ 458 brass memorial plaque
460
+ 459 bra
461
+ 460 breakwater
462
+ 461 breastplate
463
+ 462 broom
464
+ 463 bucket
465
+ 464 buckle
466
+ 465 bulletproof vest
467
+ 466 high-speed train
468
+ 467 butcher shop
469
+ 468 taxicab
470
+ 469 cauldron
471
+ 470 candle
472
+ 471 cannon
473
+ 472 canoe
474
+ 473 can opener
475
+ 474 cardigan
476
+ 475 car mirror
477
+ 476 carousel
478
+ 477 tool kit
479
+ 478 cardboard box / carton
480
+ 479 car wheel
481
+ 480 automated teller machine
482
+ 481 cassette
483
+ 482 cassette player
484
+ 483 castle
485
+ 484 catamaran
486
+ 485 CD player
487
+ 486 cello
488
+ 487 mobile phone
489
+ 488 chain
490
+ 489 chain-link fence
491
+ 490 chain mail
492
+ 491 chainsaw
493
+ 492 storage chest
494
+ 493 chiffonier
495
+ 494 bell or wind chime
496
+ 495 china cabinet
497
+ 496 Christmas stocking
498
+ 497 church
499
+ 498 movie theater
500
+ 499 cleaver
501
+ 500 cliff dwelling
502
+ 501 cloak
503
+ 502 clogs
504
+ 503 cocktail shaker
505
+ 504 coffee mug
506
+ 505 coffeemaker
507
+ 506 spiral or coil
508
+ 507 combination lock
509
+ 508 computer keyboard
510
+ 509 candy store
511
+ 510 container ship
512
+ 511 convertible
513
+ 512 corkscrew
514
+ 513 cornet
515
+ 514 cowboy boot
516
+ 515 cowboy hat
517
+ 516 cradle
518
+ 517 construction crane
519
+ 518 crash helmet
520
+ 519 crate
521
+ 520 infant bed
522
+ 521 Crock Pot
523
+ 522 croquet ball
524
+ 523 crutch
525
+ 524 cuirass
526
+ 525 dam
527
+ 526 desk
528
+ 527 desktop computer
529
+ 528 rotary dial telephone
530
+ 529 diaper
531
+ 530 digital clock
532
+ 531 digital watch
533
+ 532 dining table
534
+ 533 dishcloth
535
+ 534 dishwasher
536
+ 535 disc brake
537
+ 536 dock
538
+ 537 dog sled
539
+ 538 dome
540
+ 539 doormat
541
+ 540 drilling rig
542
+ 541 drum
543
+ 542 drumstick
544
+ 543 dumbbell
545
+ 544 Dutch oven
546
+ 545 electric fan
547
+ 546 electric guitar
548
+ 547 electric locomotive
549
+ 548 entertainment center
550
+ 549 envelope
551
+ 550 espresso machine
552
+ 551 face powder
553
+ 552 feather boa
554
+ 553 filing cabinet
555
+ 554 fireboat
556
+ 555 fire truck
557
+ 556 fire screen
558
+ 557 flagpole
559
+ 558 flute
560
+ 559 folding chair
561
+ 560 football helmet
562
+ 561 forklift
563
+ 562 fountain
564
+ 563 fountain pen
565
+ 564 four-poster bed
566
+ 565 freight car
567
+ 566 French horn
568
+ 567 frying pan
569
+ 568 fur coat
570
+ 569 garbage truck
571
+ 570 gas mask or respirator
572
+ 571 gas pump
573
+ 572 goblet
574
+ 573 go-kart
575
+ 574 golf ball
576
+ 575 golf cart
577
+ 576 gondola
578
+ 577 gong
579
+ 578 gown
580
+ 579 grand piano
581
+ 580 greenhouse
582
+ 581 radiator grille
583
+ 582 grocery store
584
+ 583 guillotine
585
+ 584 hair clip
586
+ 585 hair spray
587
+ 586 half-track
588
+ 587 hammer
589
+ 588 hamper
590
+ 589 hair dryer
591
+ 590 hand-held computer
592
+ 591 handkerchief
593
+ 592 hard disk drive
594
+ 593 harmonica
595
+ 594 harp
596
+ 595 combine harvester
597
+ 596 hatchet
598
+ 597 holster
599
+ 598 home theater
600
+ 599 honeycomb
601
+ 600 hook
602
+ 601 hoop skirt
603
+ 602 gymnastic horizontal bar
604
+ 603 horse-drawn vehicle
605
+ 604 hourglass
606
+ 605 iPod
607
+ 606 clothes iron
608
+ 607 carved pumpkin
609
+ 608 jeans
610
+ 609 jeep
611
+ 610 T-shirt
612
+ 611 jigsaw puzzle
613
+ 612 rickshaw
614
+ 613 joystick
615
+ 614 kimono
616
+ 615 knee pad
617
+ 616 knot
618
+ 617 lab coat
619
+ 618 ladle
620
+ 619 lampshade
621
+ 620 laptop computer
622
+ 621 lawn mower
623
+ 622 lens cap
624
+ 623 letter opener
625
+ 624 library
626
+ 625 lifeboat
627
+ 626 lighter
628
+ 627 limousine
629
+ 628 ocean liner
630
+ 629 lipstick
631
+ 630 slip-on shoe
632
+ 631 lotion
633
+ 632 music speaker
634
+ 633 loupe magnifying glass
635
+ 634 sawmill
636
+ 635 magnetic compass
637
+ 636 messenger bag
638
+ 637 mailbox
639
+ 638 tights
640
+ 639 one-piece bathing suit
641
+ 640 manhole cover
642
+ 641 maraca
643
+ 642 marimba
644
+ 643 mask
645
+ 644 matchstick
646
+ 645 maypole
647
+ 646 maze
648
+ 647 measuring cup
649
+ 648 medicine cabinet
650
+ 649 megalith
651
+ 650 microphone
652
+ 651 microwave oven
653
+ 652 military uniform
654
+ 653 milk can
655
+ 654 minibus
656
+ 655 miniskirt
657
+ 656 minivan
658
+ 657 missile
659
+ 658 mitten
660
+ 659 mixing bowl
661
+ 660 mobile home
662
+ 661 ford model t
663
+ 662 modem
664
+ 663 monastery
665
+ 664 monitor
666
+ 665 moped
667
+ 666 mortar and pestle
668
+ 667 graduation cap
669
+ 668 mosque
670
+ 669 mosquito net
671
+ 670 vespa
672
+ 671 mountain bike
673
+ 672 tent
674
+ 673 computer mouse
675
+ 674 mousetrap
676
+ 675 moving van
677
+ 676 muzzle
678
+ 677 metal nail
679
+ 678 neck brace
680
+ 679 necklace
681
+ 680 baby pacifier
682
+ 681 notebook computer
683
+ 682 obelisk
684
+ 683 oboe
685
+ 684 ocarina
686
+ 685 odometer
687
+ 686 oil filter
688
+ 687 pipe organ
689
+ 688 oscilloscope
690
+ 689 overskirt
691
+ 690 bullock cart
692
+ 691 oxygen mask
693
+ 692 product packet / packaging
694
+ 693 paddle
695
+ 694 paddle wheel
696
+ 695 padlock
697
+ 696 paintbrush
698
+ 697 pajamas
699
+ 698 palace
700
+ 699 pan flute
701
+ 700 paper towel
702
+ 701 parachute
703
+ 702 parallel bars
704
+ 703 park bench
705
+ 704 parking meter
706
+ 705 railroad car
707
+ 706 patio
708
+ 707 payphone
709
+ 708 pedestal
710
+ 709 pencil case
711
+ 710 pencil sharpener
712
+ 711 perfume
713
+ 712 Petri dish
714
+ 713 photocopier
715
+ 714 plectrum
716
+ 715 Pickelhaube
717
+ 716 picket fence
718
+ 717 pickup truck
719
+ 718 pier
720
+ 719 piggy bank
721
+ 720 pill bottle
722
+ 721 pillow
723
+ 722 ping-pong ball
724
+ 723 pinwheel
725
+ 724 pirate ship
726
+ 725 drink pitcher
727
+ 726 block plane
728
+ 727 planetarium
729
+ 728 plastic bag
730
+ 729 plate rack
731
+ 730 farm plow
732
+ 731 plunger
733
+ 732 Polaroid camera
734
+ 733 pole
735
+ 734 police van
736
+ 735 poncho
737
+ 736 pool table
738
+ 737 soda bottle
739
+ 738 plant pot
740
+ 739 potter's wheel
741
+ 740 power drill
742
+ 741 prayer rug
743
+ 742 printer
744
+ 743 prison
745
+ 744 missile
746
+ 745 projector
747
+ 746 hockey puck
748
+ 747 punching bag
749
+ 748 purse
750
+ 749 quill
751
+ 750 quilt
752
+ 751 race car
753
+ 752 racket
754
+ 753 radiator
755
+ 754 radio
756
+ 755 radio telescope
757
+ 756 rain barrel
758
+ 757 recreational vehicle
759
+ 758 fishing casting reel
760
+ 759 reflex camera
761
+ 760 refrigerator
762
+ 761 remote control
763
+ 762 restaurant
764
+ 763 revolver
765
+ 764 rifle
766
+ 765 rocking chair
767
+ 766 rotisserie
768
+ 767 eraser
769
+ 768 rugby ball
770
+ 769 ruler measuring stick
771
+ 770 sneaker
772
+ 771 safe
773
+ 772 safety pin
774
+ 773 salt shaker
775
+ 774 sandal
776
+ 775 sarong
777
+ 776 saxophone
778
+ 777 scabbard
779
+ 778 weighing scale
780
+ 779 school bus
781
+ 780 schooner
782
+ 781 scoreboard
783
+ 782 CRT monitor
784
+ 783 screw
785
+ 784 screwdriver
786
+ 785 seat belt
787
+ 786 sewing machine
788
+ 787 shield
789
+ 788 shoe store
790
+ 789 shoji screen / room divider
791
+ 790 shopping basket
792
+ 791 shopping cart
793
+ 792 shovel
794
+ 793 shower cap
795
+ 794 shower curtain
796
+ 795 ski
797
+ 796 balaclava ski mask
798
+ 797 sleeping bag
799
+ 798 slide rule
800
+ 799 sliding door
801
+ 800 slot machine
802
+ 801 snorkel
803
+ 802 snowmobile
804
+ 803 snowplow
805
+ 804 soap dispenser
806
+ 805 soccer ball
807
+ 806 sock
808
+ 807 solar thermal collector
809
+ 808 sombrero
810
+ 809 soup bowl
811
+ 810 keyboard space bar
812
+ 811 space heater
813
+ 812 space shuttle
814
+ 813 spatula
815
+ 814 motorboat
816
+ 815 spider web
817
+ 816 spindle
818
+ 817 sports car
819
+ 818 spotlight
820
+ 819 stage
821
+ 820 steam locomotive
822
+ 821 through arch bridge
823
+ 822 steel drum
824
+ 823 stethoscope
825
+ 824 scarf
826
+ 825 stone wall
827
+ 826 stopwatch
828
+ 827 stove
829
+ 828 strainer
830
+ 829 tram
831
+ 830 stretcher
832
+ 831 couch
833
+ 832 stupa
834
+ 833 submarine
835
+ 834 suit
836
+ 835 sundial
837
+ 836 sunglasses
838
+ 837 sunglasses
839
+ 838 sunscreen
840
+ 839 suspension bridge
841
+ 840 mop
842
+ 841 sweatshirt
843
+ 842 swim trunks / shorts
844
+ 843 swing
845
+ 844 electrical switch
846
+ 845 syringe
847
+ 846 table lamp
848
+ 847 tank
849
+ 848 tape player
850
+ 849 teapot
851
+ 850 teddy bear
852
+ 851 television
853
+ 852 tennis ball
854
+ 853 thatched roof
855
+ 854 front curtain
856
+ 855 thimble
857
+ 856 threshing machine
858
+ 857 throne
859
+ 858 tile roof
860
+ 859 toaster
861
+ 860 tobacco shop
862
+ 861 toilet seat
863
+ 862 torch
864
+ 863 totem pole
865
+ 864 tow truck
866
+ 865 toy store
867
+ 866 tractor
868
+ 867 semi-trailer truck
869
+ 868 tray
870
+ 869 trench coat
871
+ 870 tricycle
872
+ 871 trimaran
873
+ 872 tripod
874
+ 873 triumphal arch
875
+ 874 trolleybus
876
+ 875 trombone
877
+ 876 hot tub
878
+ 877 turnstile
879
+ 878 typewriter keyboard
880
+ 879 umbrella
881
+ 880 unicycle
882
+ 881 upright piano
883
+ 882 vacuum cleaner
884
+ 883 vase
885
+ 884 vaulted or arched ceiling
886
+ 885 velvet fabric
887
+ 886 vending machine
888
+ 887 vestment
889
+ 888 viaduct
890
+ 889 violin
891
+ 890 volleyball
892
+ 891 waffle iron
893
+ 892 wall clock
894
+ 893 wallet
895
+ 894 wardrobe
896
+ 895 military aircraft
897
+ 896 sink
898
+ 897 washing machine
899
+ 898 water bottle
900
+ 899 water jug
901
+ 900 water tower
902
+ 901 whiskey jug
903
+ 902 whistle
904
+ 903 hair wig
905
+ 904 window screen
906
+ 905 window shade
907
+ 906 Windsor tie
908
+ 907 wine bottle
909
+ 908 airplane wing
910
+ 909 wok
911
+ 910 wooden spoon
912
+ 911 wool
913
+ 912 split-rail fence
914
+ 913 shipwreck
915
+ 914 sailboat
916
+ 915 yurt
917
+ 916 website
918
+ 917 comic book
919
+ 918 crossword
920
+ 919 traffic or street sign
921
+ 920 traffic light
922
+ 921 dust jacket
923
+ 922 menu
924
+ 923 plate
925
+ 924 guacamole
926
+ 925 consomme
927
+ 926 hot pot
928
+ 927 trifle
929
+ 928 ice cream
930
+ 929 popsicle
931
+ 930 baguette
932
+ 931 bagel
933
+ 932 pretzel
934
+ 933 cheeseburger
935
+ 934 hot dog
936
+ 935 mashed potatoes
937
+ 936 cabbage
938
+ 937 broccoli
939
+ 938 cauliflower
940
+ 939 zucchini
941
+ 940 spaghetti squash
942
+ 941 acorn squash
943
+ 942 butternut squash
944
+ 943 cucumber
945
+ 944 artichoke
946
+ 945 bell pepper
947
+ 946 cardoon
948
+ 947 mushroom
949
+ 948 Granny Smith apple
950
+ 949 strawberry
951
+ 950 orange
952
+ 951 lemon
953
+ 952 fig
954
+ 953 pineapple
955
+ 954 banana
956
+ 955 jackfruit
957
+ 956 cherimoya (custard apple)
958
+ 957 pomegranate
959
+ 958 hay
960
+ 959 carbonara
961
+ 960 chocolate syrup
962
+ 961 dough
963
+ 962 meatloaf
964
+ 963 pizza
965
+ 964 pot pie
966
+ 965 burrito
967
+ 966 red wine
968
+ 967 espresso
969
+ 968 tea cup
970
+ 969 eggnog
971
+ 970 mountain
972
+ 971 bubble
973
+ 972 cliff
974
+ 973 coral reef
975
+ 974 geyser
976
+ 975 lakeshore
977
+ 976 promontory
978
+ 977 sandbar
979
+ 978 beach
980
+ 979 valley
981
+ 980 volcano
982
+ 981 baseball player
983
+ 982 bridegroom
984
+ 983 scuba diver
985
+ 984 rapeseed
986
+ 985 daisy
987
+ 986 yellow lady's slipper
988
+ 987 corn
989
+ 988 acorn
990
+ 989 rose hip
991
+ 990 horse chestnut seed
992
+ 991 coral fungus
993
+ 992 agaric
994
+ 993 gyromitra
995
+ 994 stinkhorn mushroom
996
+ 995 earth star fungus
997
+ 996 hen of the woods mushroom
998
+ 997 bolete
999
+ 998 corn cob
1000
+ 999 toilet paper
dataset_reqs/imagenet100_classes.txt ADDED
@@ -0,0 +1,100 @@
1
+ 0 n01440764 tench
2
+ 1 n01443537 goldfish
3
+ 2 n01484850 great white shark
4
+ 3 n01491361 tiger shark
5
+ 4 n01494475 hammerhead shark
6
+ 5 n01496331 electric ray
7
+ 6 n01498041 stingray
8
+ 7 n01514668 rooster
9
+ 8 n01514859 hen
10
+ 9 n01518878 ostrich
11
+ 10 n01530575 brambling
12
+ 11 n01531178 goldfinch
13
+ 12 n01532829 house finch
14
+ 13 n01534433 junco
15
+ 14 n01537544 indigo bunting
16
+ 15 n01558993 American robin
17
+ 16 n01560419 bulbul
18
+ 17 n01580077 jay
19
+ 18 n01582220 magpie
20
+ 19 n01592084 chickadee
21
+ 20 n01601694 American dipper
22
+ 21 n01608432 kite (bird of prey)
23
+ 22 n01614925 bald eagle
24
+ 23 n01616318 vulture
25
+ 24 n01622779 great grey owl
26
+ 25 n01629819 fire salamander
27
+ 26 n01630670 smooth newt
28
+ 27 n01631663 newt
29
+ 28 n01632458 spotted salamander
30
+ 29 n01632777 axolotl
31
+ 30 n01641577 American bullfrog
32
+ 31 n01644373 tree frog
33
+ 32 n01644900 tailed frog
34
+ 33 n01664065 loggerhead sea turtle
35
+ 34 n01665541 leatherback sea turtle
36
+ 35 n01667114 mud turtle
37
+ 36 n01667778 terrapin
38
+ 37 n01669191 box turtle
39
+ 38 n01675722 banded gecko
40
+ 39 n01677366 green iguana
41
+ 40 n01682714 Carolina anole
42
+ 41 n01685808 desert grassland whiptail lizard
43
+ 42 n01687978 agama
44
+ 43 n01688243 frilled-necked lizard
45
+ 44 n01689811 alligator lizard
46
+ 45 n01692333 Gila monster
47
+ 46 n01693334 European green lizard
48
+ 47 n01694178 chameleon
49
+ 48 n01695060 Komodo dragon
50
+ 49 n01697457 Nile crocodile
51
+ 50 n01698640 American alligator
52
+ 51 n01704323 triceratops
53
+ 52 n01728572 worm snake
54
+ 53 n01728920 ring-necked snake
55
+ 54 n01729322 eastern hog-nosed snake
56
+ 55 n01729977 smooth green snake
57
+ 56 n01734418 kingsnake
58
+ 57 n01735189 garter snake
59
+ 58 n01737021 water snake
60
+ 59 n01739381 vine snake
61
+ 60 n01740131 night snake
62
+ 61 n01742172 boa constrictor
63
+ 62 n01744401 African rock python
64
+ 63 n01748264 Indian cobra
65
+ 64 n01749939 green mamba
66
+ 65 n01751748 sea snake
67
+ 66 n01753488 Saharan horned viper
68
+ 67 n01755581 eastern diamondback rattlesnake
69
+ 68 n01756291 sidewinder rattlesnake
70
+ 69 n01768244 trilobite
71
+ 70 n01770081 harvestman
72
+ 71 n01770393 scorpion
73
+ 72 n01773157 yellow garden spider
74
+ 73 n01773549 barn spider
75
+ 74 n01773797 European garden spider
76
+ 75 n01774384 southern black widow
77
+ 76 n01774750 tarantula
78
+ 77 n01775062 wolf spider
79
+ 78 n01776313 tick
80
+ 79 n01784675 centipede
81
+ 80 n01795545 black grouse
82
+ 81 n01796340 ptarmigan
83
+ 82 n01797886 ruffed grouse
84
+ 83 n01798484 prairie grouse
85
+ 84 n01806143 peafowl
86
+ 85 n01806567 quail
87
+ 86 n01807496 partridge
88
+ 87 n01817953 african grey parrot
89
+ 88 n01818515 macaw
90
+ 89 n01819313 sulphur-crested cockatoo
91
+ 90 n01820546 lorikeet
92
+ 91 n01824575 coucal
93
+ 92 n01828970 bee eater
94
+ 93 n01829413 hornbill
95
+ 94 n01833805 hummingbird
96
+ 95 n01843065 jacamar
97
+ 96 n01843383 toucan
98
+ 97 n01847000 duck
99
+ 98 n01855032 red-breasted merganser
100
+ 99 n01855672 goose
dataset_reqs/imagenet100_splits/train_100.txt ADDED
The diff for this file is too large to render. See raw diff
 
dataset_reqs/imagenet100_splits/val_100.txt ADDED
The diff for this file is too large to render. See raw diff
 
dataset_reqs/tinyimagenet_classes.txt ADDED
@@ -0,0 +1,200 @@
1
+ 0 n02124075 Egyptian Mau
2
+ 1 n04067472 fishing casting reel
3
+ 2 n04540053 volleyball
4
+ 3 n04099969 rocking chair
5
+ 4 n07749582 lemon
6
+ 5 n01641577 American bullfrog
7
+ 6 n02802426 basketball
8
+ 7 n09246464 cliff
9
+ 8 n07920052 espresso
10
+ 9 n03970156 plunger
11
+ 10 n03891332 parking meter
12
+ 11 n02106662 German Shepherd Dog
13
+ 12 n03201208 dining table
14
+ 13 n02279972 monarch butterfly
15
+ 14 n02132136 brown bear
16
+ 15 n04146614 school bus
17
+ 16 n07873807 pizza
18
+ 17 n02364673 guinea pig
19
+ 18 n04507155 umbrella
20
+ 19 n03854065 pipe organ
21
+ 20 n03838899 oboe
22
+ 21 n03733131 maypole
23
+ 22 n01443537 goldfish
24
+ 23 n07875152 pot pie
25
+ 24 n03544143 hourglass
26
+ 25 n09428293 beach
27
+ 26 n03085013 computer keyboard
28
+ 27 n02437312 arabian camel
29
+ 28 n07614500 ice cream
30
+ 29 n03804744 metal nail
31
+ 30 n04265275 space heater
32
+ 31 n02963159 cardigan
33
+ 32 n02486410 baboon
34
+ 33 n01944390 snail
35
+ 34 n09256479 coral reef
36
+ 35 n02058221 albatross
37
+ 36 n04275548 spider web
38
+ 37 n02321529 sea cucumber
39
+ 38 n02769748 backpack
40
+ 39 n02099712 Labrador Retriever
41
+ 40 n07695742 pretzel
42
+ 41 n02056570 king penguin
43
+ 42 n02281406 sulphur butterfly
44
+ 43 n01774750 tarantula
45
+ 44 n02509815 red panda
46
+ 45 n03983396 soda bottle
47
+ 46 n07753592 banana
48
+ 47 n04254777 sock
49
+ 48 n02233338 cockroach
50
+ 49 n04008634 missile
51
+ 50 n02823428 beer bottle
52
+ 51 n02236044 praying mantis
53
+ 52 n03393912 freight car
54
+ 53 n07583066 guacamole
55
+ 54 n04074963 remote control
56
+ 55 n01629819 fire salamander
57
+ 56 n09332890 lakeshore
58
+ 57 n02481823 chimpanzee
59
+ 58 n03902125 payphone
60
+ 59 n03404251 fur coat
61
+ 60 n09193705 mountain
62
+ 61 n03637318 lampshade
63
+ 62 n04456115 torch
64
+ 63 n02666196 abacus
65
+ 64 n03796401 moving van
66
+ 65 n02795169 barrel
67
+ 66 n02123045 tabby cat
68
+ 67 n01855672 goose
69
+ 68 n01882714 koala
70
+ 69 n02917067 high-speed train
71
+ 70 n02988304 CD player
72
+ 71 n04398044 teapot
73
+ 72 n02843684 birdhouse
74
+ 73 n02423022 gazelle
75
+ 74 n02669723 academic gown
76
+ 75 n04465501 tractor
77
+ 76 n02165456 ladybug
78
+ 77 n03770439 miniskirt
79
+ 78 n02099601 Golden Retriever
80
+ 79 n04486054 triumphal arch
81
+ 80 n02950826 cannon
82
+ 81 n03814639 neck brace
83
+ 82 n04259630 sombrero
84
+ 83 n03424325 gas mask or respirator
85
+ 84 n02948072 candle
86
+ 85 n03179701 desk
87
+ 86 n03400231 frying pan
88
+ 87 n02206856 bee
89
+ 88 n03160309 dam
90
+ 89 n01984695 spiny lobster
91
+ 90 n03977966 police van
92
+ 91 n03584254 iPod
93
+ 92 n04023962 punching bag
94
+ 93 n02814860 lighthouse
95
+ 94 n01910747 jellyfish
96
+ 95 n04596742 wok
97
+ 96 n03992509 potter's wheel
98
+ 97 n04133789 sandal
99
+ 98 n03937543 pill bottle
100
+ 99 n02927161 butcher shop
101
+ 100 n01945685 slug
102
+ 101 n02395406 pig
103
+ 102 n02125311 cougar
104
+ 103 n03126707 construction crane
105
+ 104 n04532106 vestment
106
+ 105 n02268443 dragonfly
107
+ 106 n02977058 automated teller machine
108
+ 107 n07734744 mushroom
109
+ 108 n03599486 rickshaw
110
+ 109 n04562935 water tower
111
+ 110 n03014705 storage chest
112
+ 111 n04251144 snorkel
113
+ 112 n04356056 sunglasses
114
+ 113 n02190166 fly
115
+ 114 n03670208 limousine
116
+ 115 n02002724 black stork
117
+ 116 n02074367 dugong
118
+ 117 n04285008 sports car
119
+ 118 n04560804 water jug
120
+ 119 n04366367 suspension bridge
121
+ 120 n02403003 ox
122
+ 121 n07615774 popsicle
123
+ 122 n04501370 turnstile
124
+ 123 n03026506 Christmas stocking
125
+ 124 n02906734 broom
126
+ 125 n01770393 scorpion
127
+ 126 n04597913 wooden spoon
128
+ 127 n03930313 picket fence
129
+ 128 n04118538 rugby ball
130
+ 129 n04179913 sewing machine
131
+ 130 n04311004 through arch bridge
132
+ 131 n02123394 Persian cat
133
+ 132 n04070727 refrigerator
134
+ 133 n02793495 barn
135
+ 134 n02730930 apron
136
+ 135 n02094433 Yorkshire Terrier
137
+ 136 n04371430 swim trunks / shorts
138
+ 137 n04328186 stopwatch
139
+ 138 n03649909 lawn mower
140
+ 139 n04417672 thatched roof
141
+ 140 n03388043 fountain
142
+ 141 n01774384 southern black widow
143
+ 142 n02837789 bikini
144
+ 143 n07579787 plate
145
+ 144 n04399382 teddy bear
146
+ 145 n02791270 barbershop
147
+ 146 n03089624 candy store
148
+ 147 n02814533 station wagon
149
+ 148 n04149813 scoreboard
150
+ 149 n07747607 orange
151
+ 150 n03355925 flagpole
152
+ 151 n01983481 American lobster
153
+ 152 n04487081 trolleybus
154
+ 153 n03250847 drumstick
155
+ 154 n03255030 dumbbell
156
+ 155 n02892201 brass memorial plaque
157
+ 156 n02883205 bow tie
158
+ 157 n03100240 convertible
159
+ 158 n02415577 bighorn sheep
160
+ 159 n02480495 orangutan
161
+ 160 n01698640 American alligator
162
+ 161 n01784675 centipede
163
+ 162 n04376876 syringe
164
+ 163 n03444034 go-kart
165
+ 164 n01917289 brain coral
166
+ 165 n01950731 sea slug
167
+ 166 n03042490 cliff dwelling
168
+ 167 n07711569 mashed potatoes
169
+ 168 n04532670 viaduct
170
+ 169 n03763968 military uniform
171
+ 170 n07768694 pomegranate
172
+ 171 n02999410 chain
173
+ 172 n03617480 kimono
174
+ 173 n06596364 comic book
175
+ 174 n01768244 trilobite
176
+ 175 n02410509 bison
177
+ 176 n03976657 pole
178
+ 177 n01742172 boa constrictor
179
+ 178 n03980874 poncho
180
+ 179 n02808440 bathtub
181
+ 180 n02226429 grasshopper
182
+ 181 n02231487 stick insect
183
+ 182 n02085620 Chihuahua
184
+ 183 n01644900 tailed frog
185
+ 184 n02129165 lion
186
+ 185 n02699494 altar
187
+ 186 n03837869 obelisk
188
+ 187 n02815834 beaker
189
+ 188 n07720875 bell pepper
190
+ 189 n02788148 baluster / handrail
191
+ 190 n02909870 bucket
192
+ 191 n03706229 magnetic compass
193
+ 192 n07871810 meatloaf
194
+ 193 n03447447 gondola
195
+ 194 n02113799 Standard Poodle
196
+ 195 n12267677 acorn
197
+ 196 n03662601 lifeboat
198
+ 197 n02841315 binoculars
199
+ 198 n07715103 cauliflower
200
+ 199 n02504458 African bush elephant
main.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ import json
+ import random
+ import logging
+ import statistics
+
+ import hydra
+ import numpy as np
+ import torch
+ from omegaconf import DictConfig
+ from tqdm import tqdm
+ from continuum.metrics import Logger
+ from torch.utils.data import DataLoader, DistributedSampler
+
+ from continual_clip import utils
+ from continual_clip.models import load_model
+ from continual_clip.datasets import build_cl_scenarios
+
+ WORLD_NUM = 1
+ # Seeds: 386, 2345, 157 (performance might vary slightly across machines)
+ RANDOM_SEED = 386
+
+
+ def set_seed(seed):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     os.environ['PYTHONHASHSEED'] = str(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+
+ @hydra.main(config_path=None, config_name=None, version_base="1.1")
+ def continual_clip(cfg: DictConfig) -> None:
+
+     set_seed(RANDOM_SEED)
+
+     cfg.workdir = "/***/DMNSP/cil"
+     cfg.dataset_root = os.path.join(cfg.workdir, cfg.dataset_root)
+
+     utils.save_config(cfg)
+     cfg.class_order = utils.get_class_order(os.path.join(cfg.workdir, cfg.class_order))
+     origin_flag = False
+     devices = [0]
+     model = load_model(cfg, devices[0], origin_flag)
+
+     eval_dataset, classes_names = build_cl_scenarios(
+         cfg, is_train=False, transforms=model.transforms
+     )
+     print(eval_dataset)
+     train_dataset, train_classes_names = build_cl_scenarios(
+         cfg, is_train=True, transforms=model.transforms
+     )
+     model.classes_names = classes_names
+
+     print("Using devices", devices)
+     model = torch.nn.DataParallel(model, device_ids=devices)
+
+     # Truncate the log file before the run starts.
+     with open(cfg.log_path, 'w+') as f:
+         pass
+
+     acc_list = []
+     forgetting_list = []
+     metric_logger = Logger(list_subsets=["test"])
+     world = WORLD_NUM
+
+     for task_id, _ in enumerate(eval_dataset):
+
+         logging.info(f"Evaluation for task {task_id} has started.")
+
+         # The task id is passed into the model through adaptation().
+         model.module.adaptation(task_id, cfg, train_dataset, train_classes_names, world)
+         eval_sampler = DistributedSampler(eval_dataset[:task_id + 1], num_replicas=world, rank=0)
+         eval_loader = DataLoader(eval_dataset[:task_id + 1], batch_size=64,
+                                  sampler=eval_sampler, num_workers=8)
+
+         for inputs, targets, task_ids in tqdm(eval_loader):
+             inputs, targets = inputs.cuda(device=devices[0]), targets.cuda(device=devices[0])
+             outputs = model.module.cuda(devices[0])(inputs, task_ids)
+             metric_logger.add([outputs.cpu().argmax(dim=1), targets.cpu(), task_ids], subset="test")
+
+         acc_list.append(100 * metric_logger.accuracy)
+         forgetting_list.append(100 * metric_logger.forgetting)
+
+         with open(cfg.log_path, 'a+') as f:
+             f.write(json.dumps({
+                 'task': task_id,
+                 'acc': round(100 * metric_logger.accuracy, 2),
+                 'avg_acc': round(100 * metric_logger.average_incremental_accuracy, 2),
+                 'forgetting': round(100 * metric_logger.forgetting, 6),
+                 'acc_per_task': [round(100 * acc_t, 2) for acc_t in metric_logger.accuracy_per_task],
+                 'bwt': round(100 * metric_logger.backward_transfer, 2),
+                 'fwt': round(100 * metric_logger.forward_transfer, 2),
+             }) + '\n')
+         metric_logger.end_task()
+
+     with open(cfg.log_path, 'a+') as f:
+         f.write(json.dumps({
+             'last_Cifar100': round(acc_list[-1], 2),
+             'avg_Cifar100': round(statistics.mean(acc_list), 2),
+             'avg_forgetting': round(statistics.mean(forgetting_list), 2)
+         }) + '\n')
+
+
+ if __name__ == "__main__":
+     continual_clip()
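
The metric bookkeeping above relies on continuum's Logger, fed with (predictions, targets, task ids) triples exactly as in the eval loop. A toy sketch on synthetic predictions (the class and task ids are made up for illustration):

    import torch
    from continuum.metrics import Logger

    logger = Logger(list_subsets=["test"])
    preds = torch.tensor([0, 1, 1, 2])      # hypothetical argmax outputs
    targets = torch.tensor([0, 1, 2, 2])
    task_ids = torch.tensor([0, 0, 0, 0])
    logger.add([preds, targets, task_ids], subset="test")
    print(logger.accuracy)  # 0.75 on this toy batch
    logger.end_task()
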
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ continuum
+ hydra-core==1.2.0
+ numpy
+ oauthlib==3.2.1
+ omegaconf==2.2.3
+ open-clip-torch==1.3.0
+ pandas==1.4.3
+ Pillow==9.2.0
+ pipreqs==0.4.11
+ scikit-image==0.19.3
+ scikit-learn==1.1.1
+ scipy==1.8.1
+ tensorboard==2.10.0
+ timm @ git+https://github.com/Arnav0400/pytorch-image-models.git@ceea7127c1ef608179ba06eaeddc22ad3ef22de0
+ tokenizers==0.12.1
+ tqdm==4.64.0
+ transformers==4.21.1
+ ftfy
+ regex
run_cifar100-10-10.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+
+ python main.py \
+     --config-path /DMNSP/configs/class \
+     --config-name cifar100_10-10.yaml \
+     dataset_root="/data/**/" \
+     class_order="/DMNSP/class_orders/cifar100.yaml"
templates/__init__.py ADDED
File without changes
templates/fmow_template.py ADDED
@@ -0,0 +1,20 @@
1
+ from .template_utils import append_proper_article
2
+
3
+ fmow_template = [
4
+ lambda c : f"satellite photo of a {c}.",
5
+ lambda c : f"aerial photo of a {c}.",
6
+ lambda c : f"satellite photo of {append_proper_article(c)}.",
7
+ lambda c : f"aerial photo of {append_proper_article(c)}.",
8
+ lambda c : f"satellite photo of a {c} in asia.",
9
+ lambda c : f"aerial photo of a {c} in asia.",
10
+ lambda c : f"satellite photo of a {c} in africa.",
11
+ lambda c : f"aerial photo of a {c} in africa.",
12
+ lambda c : f"satellite photo of a {c} in the americas.",
13
+ lambda c : f"aerial photo of a {c} in the americas.",
14
+ lambda c : f"satellite photo of a {c} in europe.",
15
+ lambda c : f"aerial photo of a {c} in europe.",
16
+ lambda c : f"satellite photo of a {c} in oceania.",
17
+ lambda c : f"aerial photo of a {c} in oceania.",
18
+ lambda c: f"a photo of a {c}.",
19
+ lambda c: f"{c}.",
20
+ ]
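Every template module here is just a list of `str -> str` callables, so expanding one class name into its full prompt set is a single comprehension. A minimal sketch; "park" is an arbitrary illustrative class name, not a real FMoW label:

    from templates.fmow_template import fmow_template

    prompts = [template("park") for template in fmow_template]
    print(len(prompts))  # 16
    print(prompts[0])    # satellite photo of a park.
    print(prompts[2])    # satellite photo of a park.  (via append_proper_article)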
templates/iwildcam_template.py ADDED
@@ -0,0 +1,4 @@
+ iwildcam_template = [
+     lambda c: f"a photo of {c}.",
+     lambda c: f"{c} in the wild.",
+ ]
templates/openai_imagenet_template.py ADDED
@@ -0,0 +1,82 @@
+ openai_imagenet_template = [
+     lambda c: f'a bad photo of a {c}.',
+     lambda c: f'a photo of many {c}.',
+     lambda c: f'a sculpture of a {c}.',
+     lambda c: f'a photo of the hard to see {c}.',
+     lambda c: f'a low resolution photo of the {c}.',
+     lambda c: f'a rendering of a {c}.',
+     lambda c: f'graffiti of a {c}.',
+     lambda c: f'a bad photo of the {c}.',
+     lambda c: f'a cropped photo of the {c}.',
+     lambda c: f'a tattoo of a {c}.',
+     lambda c: f'the embroidered {c}.',
+     lambda c: f'a photo of a hard to see {c}.',
+     lambda c: f'a bright photo of a {c}.',
+     lambda c: f'a photo of a clean {c}.',
+     lambda c: f'a photo of a dirty {c}.',
+     lambda c: f'a dark photo of the {c}.',
+     lambda c: f'a drawing of a {c}.',
+     lambda c: f'a photo of my {c}.',
+     lambda c: f'the plastic {c}.',
+     lambda c: f'a photo of the cool {c}.',
+     lambda c: f'a close-up photo of a {c}.',
+     lambda c: f'a black and white photo of the {c}.',
+     lambda c: f'a painting of the {c}.',
+     lambda c: f'a painting of a {c}.',
+     lambda c: f'a pixelated photo of the {c}.',
+     lambda c: f'a sculpture of the {c}.',
+     lambda c: f'a bright photo of the {c}.',
+     lambda c: f'a cropped photo of a {c}.',
+     lambda c: f'a plastic {c}.',
+     lambda c: f'a photo of the dirty {c}.',
+     lambda c: f'a jpeg corrupted photo of a {c}.',
+     lambda c: f'a blurry photo of the {c}.',
+     lambda c: f'a photo of the {c}.',
+     lambda c: f'a good photo of the {c}.',
+     lambda c: f'a rendering of the {c}.',
+     lambda c: f'a {c} in a video game.',
+     lambda c: f'a photo of one {c}.',
+     lambda c: f'a doodle of a {c}.',
+     lambda c: f'a close-up photo of the {c}.',
+     lambda c: f'a photo of a {c}.',
+     lambda c: f'the origami {c}.',
+     lambda c: f'the {c} in a video game.',
+     lambda c: f'a sketch of a {c}.',
+     lambda c: f'a doodle of the {c}.',
+     lambda c: f'a origami {c}.',
+     lambda c: f'a low resolution photo of a {c}.',
+     lambda c: f'the toy {c}.',
+     lambda c: f'a rendition of the {c}.',
+     lambda c: f'a photo of the clean {c}.',
+     lambda c: f'a photo of a large {c}.',
+     lambda c: f'a rendition of a {c}.',
+     lambda c: f'a photo of a nice {c}.',
+     lambda c: f'a photo of a weird {c}.',
+     lambda c: f'a blurry photo of a {c}.',
+     lambda c: f'a cartoon {c}.',
+     lambda c: f'art of a {c}.',
+     lambda c: f'a sketch of the {c}.',
+     lambda c: f'a embroidered {c}.',
+     lambda c: f'a pixelated photo of a {c}.',
+     lambda c: f'itap of the {c}.',
+     lambda c: f'a jpeg corrupted photo of the {c}.',
+     lambda c: f'a good photo of a {c}.',
+     lambda c: f'a plushie {c}.',
+     lambda c: f'a photo of the nice {c}.',
+     lambda c: f'a photo of the small {c}.',
+     lambda c: f'a photo of the weird {c}.',
+     lambda c: f'the cartoon {c}.',
+     lambda c: f'art of the {c}.',
+     lambda c: f'a drawing of the {c}.',
+     lambda c: f'a photo of the large {c}.',
+     lambda c: f'a black and white photo of a {c}.',
+     lambda c: f'the plushie {c}.',
+     lambda c: f'a dark photo of a {c}.',
+     lambda c: f'itap of a {c}.',
+     lambda c: f'graffiti of the {c}.',
+     lambda c: f'a toy {c}.',
+     lambda c: f'itap of my {c}.',
+     lambda c: f'a photo of a cool {c}.',
+     lambda c: f'a photo of a small {c}.',
+     lambda c: f'a tattoo of the {c}.',
+ ]
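These 80 prompt variants are typically averaged into a single text embedding per class, the prompt-ensembling recipe from the CLIP paper. A minimal sketch of that step, assuming the vendored `clip` package mirrors OpenAI's reference API (`clip.load`, `clip.tokenize`, `encode_text`); this is an illustration, not this repo's exact pipeline:

    import torch
    import clip  # vendored copy in this repo; assumed to follow OpenAI's API
    from templates.openai_imagenet_template import openai_imagenet_template

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, _preprocess = clip.load("ViT-B/16", device=device)

    @torch.no_grad()
    def build_zeroshot_weights(classnames):
        weights = []
        for name in classnames:
            # Tokenize all 80 prompt variants for this class at once.
            texts = clip.tokenize([t(name) for t in openai_imagenet_template]).to(device)
            emb = model.encode_text(texts)
            emb = emb / emb.norm(dim=-1, keepdim=True)
            mean = emb.mean(dim=0)              # average over prompts...
            weights.append(mean / mean.norm())  # ...then re-normalize
        return torch.stack(weights, dim=1)      # (embed_dim, num_classes)

Dotting normalized image features against this matrix then yields zero-shot logits.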
templates/simple_template.py ADDED
@@ -0,0 +1,3 @@
+ simple_template = [
+     lambda c: f"a photo of a {c}."
+ ]
templates/template_utils.py ADDED
@@ -0,0 +1,28 @@
+ def get_plural(name):
+     name = name.replace('_', ' ')  # class names may use underscores
+     if name[-2:] == 'sh':          # brush -> brushes
+         name = name + 'es'
+     elif name[-2:] == 'ch':        # church -> churches
+         name = name + 'es'
+     elif name[-1:] == 'y':         # berry -> berries
+         name = name[:-1] + 'ies'
+     elif name[-1:] == 's':         # bus -> buses
+         name = name + 'es'
+     elif name[-1:] == 'x':         # box -> boxes
+         name = name + 'es'
+     elif name[-3:] == 'man':       # policeman -> policemen
+         name = name[:-3] + 'men'
+     elif name == 'mouse':
+         name = 'mice'
+     elif name[-1:] == 'f':         # wolf -> wolves
+         name = name[:-1] + 'ves'
+     else:
+         name = name + 's'
+     return name
+
+
+ def append_proper_article(name):
+     name = name.replace('_', ' ')
+     if name[0] in 'aeiou':         # vowel-initial -> "an"
+         return 'an ' + name
+     return 'a ' + name
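Both helpers are pure string functions, so a few spot checks pin down their behavior. A minimal sketch:

    from templates.template_utils import get_plural, append_proper_article

    assert get_plural("wolf") == "wolves"          # -f   -> -ves
    assert get_plural("bus") == "buses"            # -s   -> -es
    assert get_plural("berry") == "berries"        # -y   -> -ies
    assert get_plural("policeman") == "policemen"  # -man -> -men
    assert append_proper_article("orange") == "an orange"
    assert append_proper_article("polar_bear") == "a polar bear"

Note that the `-y` rule also fires after vowels (`get_plural("monkey")` returns `"monkies"`), a known limitation of heuristics this simple.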
templates/testing_template.py ADDED
@@ -0,0 +1,83 @@
+ testing_template = [
+     lambda c: f'a photo of the number: "{c}".',
+     lambda c: f'a bad photo of a {c}.',
+     lambda c: f'a photo of many {c}.',
+     lambda c: f'a sculpture of a {c}.',
+     lambda c: f'a photo of the hard to see {c}.',
+     lambda c: f'a low resolution photo of the {c}.',
+     lambda c: f'a rendering of a {c}.',
+     lambda c: f'graffiti of a {c}.',
+     lambda c: f'a bad photo of the {c}.',
+     lambda c: f'a cropped photo of the {c}.',
+     lambda c: f'a tattoo of a {c}.',
+     lambda c: f'the embroidered {c}.',
+     lambda c: f'a photo of a hard to see {c}.',
+     lambda c: f'a bright photo of a {c}.',
+     lambda c: f'a photo of a clean {c}.',
+     lambda c: f'a photo of a dirty {c}.',
+     lambda c: f'a dark photo of the {c}.',
+     lambda c: f'a drawing of a {c}.',
+     lambda c: f'a photo of my {c}.',
+     lambda c: f'the plastic {c}.',
+     lambda c: f'a photo of the cool {c}.',
+     lambda c: f'a close-up photo of a {c}.',
+     lambda c: f'a black and white photo of the {c}.',
+     lambda c: f'a painting of the {c}.',
+     lambda c: f'a painting of a {c}.',
+     lambda c: f'a pixelated photo of the {c}.',
+     lambda c: f'a sculpture of the {c}.',
+     lambda c: f'a bright photo of the {c}.',
+     lambda c: f'a cropped photo of a {c}.',
+     lambda c: f'a plastic {c}.',
+     lambda c: f'a photo of the dirty {c}.',
+     lambda c: f'a jpeg corrupted photo of a {c}.',
+     lambda c: f'a blurry photo of the {c}.',
+     lambda c: f'a photo of the {c}.',
+     lambda c: f'a good photo of the {c}.',
+     lambda c: f'a rendering of the {c}.',
+     lambda c: f'a {c} in a video game.',
+     lambda c: f'a photo of one {c}.',
+     lambda c: f'a doodle of a {c}.',
+     lambda c: f'a close-up photo of the {c}.',
+     lambda c: f'a photo of a {c}.',
+     lambda c: f'the origami {c}.',
+     lambda c: f'the {c} in a video game.',
+     lambda c: f'a sketch of a {c}.',
+     lambda c: f'a doodle of the {c}.',
+     lambda c: f'a origami {c}.',
+     lambda c: f'a low resolution photo of a {c}.',
+     lambda c: f'the toy {c}.',
+     lambda c: f'a rendition of the {c}.',
+     lambda c: f'a photo of the clean {c}.',
+     lambda c: f'a photo of a large {c}.',
+     lambda c: f'a rendition of a {c}.',
+     lambda c: f'a photo of a nice {c}.',
+     lambda c: f'a photo of a weird {c}.',
+     lambda c: f'a blurry photo of a {c}.',
+     lambda c: f'a cartoon {c}.',
+     lambda c: f'art of a {c}.',
+     lambda c: f'a sketch of the {c}.',
+     lambda c: f'a embroidered {c}.',
+     lambda c: f'a pixelated photo of a {c}.',
+     lambda c: f'itap of the {c}.',
+     lambda c: f'a jpeg corrupted photo of the {c}.',
+     lambda c: f'a good photo of a {c}.',
+     lambda c: f'a plushie {c}.',
+     lambda c: f'a photo of the nice {c}.',
+     lambda c: f'a photo of the small {c}.',
+     lambda c: f'a photo of the weird {c}.',
+     lambda c: f'the cartoon {c}.',
+     lambda c: f'art of the {c}.',
+     lambda c: f'a drawing of the {c}.',
+     lambda c: f'a photo of the large {c}.',
+     lambda c: f'a black and white photo of a {c}.',
+     lambda c: f'the plushie {c}.',
+     lambda c: f'a dark photo of a {c}.',
+     lambda c: f'itap of a {c}.',
+     lambda c: f'graffiti of the {c}.',
+     lambda c: f'a toy {c}.',
+     lambda c: f'itap of my {c}.',
+     lambda c: f'a photo of a cool {c}.',
+     lambda c: f'a photo of a small {c}.',
+     lambda c: f'a tattoo of the {c}.',
+ ]