| import json |
| from PIL import Image |
| import requests |
| from transformers import CLIPProcessor, CLIPModel |
| from transformers.models.clip.modeling_clip import _get_vector_norm |
| import torch |
| import numpy as np |
| import platform |
| import sys |
| import os |
|
|
# Preprocessing pipeline matching the checkpoint that is exported below.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sample image used as the example input for tracing and for every backend check.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of letting PIL fail
# later on a non-image response body.
_response = requests.get(url, stream=True, timeout=30)
_response.raise_for_status()
image = Image.open(_response.raw)

# One text/image pair, returned as PyTorch tensors.
inputs = processor(
    text="two cats on a pink blanket",
    images=image,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
)
# The same tensors exposed as NumPy arrays for the runtimes that take NumPy input.
np_inputs = {k: v.numpy() for k, v in inputs.data.items()}
|
|
class VisionModel(torch.nn.Module):
    """Image branch of the wrapped model, packaged as a standalone module.

    The wrapped ``model`` must expose ``vision_model`` and
    ``visual_projection`` attributes.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Return the projected pooled vision output divided by its vector norm."""
        clip = self.model
        pooled = clip.vision_model.forward(x).pooler_output
        projected = clip.visual_projection(pooled)
        return projected / _get_vector_norm(projected)

    def eval(self):
        """Put the wrapped model, its vision parts and this wrapper in eval mode."""
        wrapped = self.model
        for module in (wrapped, wrapped.vision_model, wrapped.visual_projection):
            module.eval()
        return super().eval()
|
|
class TextModel(torch.nn.Module):
    """Text branch of the wrapped model, packaged as a standalone module.

    The wrapped ``model`` must expose ``text_model`` and ``text_projection``
    attributes.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Return the projected pooled text output divided by its vector norm."""
        clip = self.model
        pooled = clip.text_model.forward(input_ids, attention_mask).pooler_output
        projected = clip.text_projection(pooled)
        return projected / _get_vector_norm(projected)

    def eval(self):
        """Put the wrapped model, its text parts and this wrapper in eval mode."""
        wrapped = self.model
        for module in (wrapped, wrapped.text_model, wrapped.text_projection):
            module.eval()
        return super().eval()
|
|
# Inference only: autograd is switched off globally before loading and tracing.
torch.set_grad_enabled(False)
ptmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

with torch.no_grad():
    # Image branch, traced against the example pixel tensor.
    vision = VisionModel(ptmodel).eval()
    traced_vision_model = torch.jit.trace(vision, inputs.data['pixel_values'])

    # Text branch, traced against the example (input_ids, attention_mask) pair.
    text = TextModel(ptmodel).eval()
    _text_example = (inputs.data['input_ids'], inputs.data['attention_mask'])
    traced_text_model = torch.jit.trace(text, _text_example)
|
|
def convert_coreml():
    """Convert both traced models to Core ML and save them as .mlpackage bundles."""
    import coremltools as ct

    tensors = inputs.data

    # Vision model: one input shaped like the example pixel tensor.
    pixel_spec = ct.TensorType(shape=tensors['pixel_values'].shape)
    ct.convert(traced_vision_model, inputs=[pixel_spec]).save('vision.mlpackage')

    # Text model: two inputs, in the order the traced forward() takes them.
    text_specs = [
        ct.TensorType(shape=tensors[key].shape)
        for key in ('input_ids', 'attention_mask')
    ]
    ct.convert(traced_text_model, inputs=text_specs).save('text.mlpackage')
|
|
| |
|
|
def infer_coreml():
    """Run the saved Core ML packages on the sample inputs and print the similarity.

    Loads ``vision.mlpackage`` and ``text.mlpackage``, feeds them the NumPy
    sample inputs and prints the dot product of the text and image embeddings.

    Raises:
        ValueError: if a model returns anything other than exactly one output.
    """
    import coremltools as ct

    def _single_output(predictions):
        # Each wrapper's forward() returns one tensor, so the prediction dict
        # has a single entry. Reading it by position avoids depending on the
        # auto-generated output name (e.g. 'var_877'), which changes whenever
        # the traced graph changes.
        if len(predictions) != 1:
            raise ValueError(
                f"expected exactly one model output, got {sorted(predictions)}"
            )
        return next(iter(predictions.values()))

    coreml_vision_model = ct.models.MLModel('vision.mlpackage')
    coreml_text_model = ct.models.MLModel('text.mlpackage')

    # NOTE(review): the input names below must match those stored in the saved
    # packages; they are presumably derived from the traced graph - confirm
    # after re-exporting.
    vision_predictions = coreml_vision_model.predict({'x': np_inputs['pixel_values']})
    # The integer token arrays are cast to float32 before being passed in,
    # since the conversion did not declare an integer dtype for these inputs.
    text_predictions = coreml_text_model.predict({
        'input_ids_1': np_inputs['input_ids'].astype(np.float32),
        'attention_mask_1': np_inputs['attention_mask'].astype(np.float32),
    })

    image_embeds = _single_output(vision_predictions)
    text_embeds = _single_output(text_predictions)

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())
|
|
def convert_onnx():
    """Export both traced models to ``vision.onnx`` and ``text.onnx``."""
    torch.onnx.export(traced_vision_model, inputs.data['pixel_values'], "vision.onnx")
    # The text model takes (input_ids, attention_mask). The second example
    # input must be the attention mask, not a second copy of the token ids.
    torch.onnx.export(
        traced_text_model,
        (inputs.data['input_ids'], inputs.data['attention_mask']),
        "text.onnx",
    )
|
|
| |
|
|
def _select_onnx_providers():
    """Return the ONNX Runtime execution providers to request, in priority order."""
    providers = []
    if sys.platform == "darwin":
        providers.append("CoreMLExecutionProvider")

    # Prefix match instead of substring match: "win" is a substring of
    # "darwin", so the previous check also requested CUDA on Intel Macs.
    on_linux_or_windows = sys.platform.startswith(("linux", "win"))
    if on_linux_or_windows and platform.machine() in ("x86_64", "AMD64"):
        providers.append(("CUDAExecutionProvider", {"device_id": 0}))

    # CPU is always listed last as the fallback.
    providers.append("CPUExecutionProvider")
    return providers


def infer_onnx():
    """Run the exported ONNX models on the sample inputs and print the similarity.

    Loads ``vision.onnx`` and ``text.onnx``, feeds them the NumPy sample inputs
    and prints the dot product of the text and image embeddings.
    """
    import onnxruntime as ort

    providers = _select_onnx_providers()

    vision_session = ort.InferenceSession("vision.onnx", providers=providers)
    text_session = ort.InferenceSession("text.onnx", providers=providers)

    # Inputs are bound by position, using whatever names the export assigned.
    vision_inputs = {vision_session.get_inputs()[0].name: np_inputs['pixel_values']}
    text_graph_inputs = text_session.get_inputs()
    text_inputs = {
        text_graph_inputs[0].name: np_inputs['input_ids'],
        text_graph_inputs[1].name: np_inputs['attention_mask'],
    }

    # run(None, ...) returns every output; each model has its embedding first.
    image_embeds = vision_session.run(None, vision_inputs)[0]
    text_embeds = text_session.run(None, text_inputs)[0]

    logits_per_text = text_embeds @ image_embeds.T

    print("similarity:", logits_per_text.item())
|
|
| |
|
|
def convert_openvino():
    """Convert both traced models to OpenVINO IR files under ``openvino/``."""
    import openvino as ov

    tensors = inputs.data
    # (traced module, example input, destination), vision first and then text.
    jobs = (
        (traced_vision_model, tensors['pixel_values'], "openvino/vision.xml"),
        (
            traced_text_model,
            (tensors['input_ids'], tensors['attention_mask']),
            "openvino/text.xml",
        ),
    )
    for traced, example, xml_path in jobs:
        converted = ov.convert_model(traced, example_input=example)
        ov.save_model(converted, xml_path)
|
|
| |
|
|
def infer_openvino():
    """Run the OpenVINO IR models on CPU and print the text/image similarity."""
    import openvino as ov

    def compile_for_cpu(xml_path):
        # Read the IR from disk, then compile it for the CPU device.
        loaded = ov.Core().read_model(xml_path)
        return ov.Core().compile_model(loaded, "CPU")

    tensors = inputs.data
    vision_runner = compile_for_cpu("openvino/vision.xml")
    text_runner = compile_for_cpu("openvino/text.xml")

    # Index 0 selects the first output of each compiled model.
    image_embeds = vision_runner(tensors['pixel_values'])[0]
    text_embeds = text_runner((tensors['input_ids'], tensors['attention_mask']))[0]

    similarity = text_embeds @ image_embeds.T
    print("similarity:", similarity.item())
|
|
| |
|
|
def export_openvino_int8():
    """Quantize the saved OpenVINO models with NNCF and write the ``*_int8.xml`` files.

    Calibration samples come from the project-local ``image_calibration`` and
    ``text_calibration`` modules.
    """
    import openvino as ov
    import text_calibration
    import image_calibration
    import nncf

    # The same quantization settings are applied to both models.
    quantize_options = dict(
        preset=nncf.QuantizationPreset.MIXED,
        model_type=nncf.ModelType.TRANSFORMER,
    )

    fp_vision = ov.Core().read_model("openvino/vision.xml")
    fp_text = ov.Core().read_model("openvino/text.xml")

    image_samples = image_calibration.get_image_calibration_data()
    text_samples = text_calibration.get_text_calibration_data()

    int8_vision = nncf.quantize(fp_vision, nncf.Dataset(image_samples), **quantize_options)
    int8_text = nncf.quantize(fp_text, nncf.Dataset(text_samples), **quantize_options)

    ov.save_model(int8_vision, "openvino/vision_int8.xml")
    ov.save_model(int8_text, "openvino/text_int8.xml")
|
|
# Run the INT8 export only when this file is executed as a script, so that
# importing the module does not start a quantization run.
if __name__ == "__main__":
    export_openvino_int8()
|
|
def infer_openvino_int8():
    """Run the INT8 OpenVINO models on CPU and print the text/image similarity."""
    import openvino as ov

    def compile_for_cpu(xml_path):
        # Read the IR from disk, then compile it for the CPU device.
        loaded = ov.Core().read_model(xml_path)
        return ov.Core().compile_model(loaded, "CPU")

    tensors = inputs.data
    vision_runner = compile_for_cpu("openvino/vision_int8.xml")
    text_runner = compile_for_cpu("openvino/text_int8.xml")

    # Index 0 selects the first output of each compiled model.
    image_embeds = vision_runner(tensors['pixel_values'])[0]
    text_embeds = text_runner((tensors['input_ids'], tensors['attention_mask']))[0]

    similarity = text_embeds @ image_embeds.T
    print("similarity:", similarity.item())
|
|
# Run the INT8 inference check only when this file is executed as a script,
# not as a side effect of importing the module.
if __name__ == "__main__":
    infer_openvino_int8()
|
|
def export_ncnn():
    """Save both traced models as TorchScript and convert them with ``pnnx``.

    Writes ``vision.pt`` and ``text.pt`` and runs the ``pnnx`` command-line
    tool on each, passing the example input shapes.

    Raises:
        FileNotFoundError: if the ``pnnx`` executable is not on PATH.
        subprocess.CalledProcessError: if ``pnnx`` exits with a non-zero status.
    """
    import subprocess

    def _shape_arg(tensor):
        # Compact JSON list without spaces, e.g. "[1,2,3]".
        return json.dumps(list(tensor.shape), separators=(",", ":"))

    tensors = inputs.data

    traced_vision_model.save("vision.pt")
    pixel_shape = _shape_arg(tensors['pixel_values'])
    # An argument list (no shell) removes the dependency on POSIX quoting, and
    # check=True stops the export when the converter fails instead of
    # silently continuing.
    subprocess.run(
        ["pnnx", "vision.pt", f"inputshape={pixel_shape}"],
        check=True,
    )

    traced_text_model.save("text.pt")
    ids_shape = _shape_arg(tensors['input_ids'])
    mask_shape = _shape_arg(tensors['attention_mask'])
    # The "i64" suffix marks both text inputs as 64-bit integers for pnnx.
    subprocess.run(
        ["pnnx", "text.pt", f"inputshape={ids_shape}i64,{mask_shape}i64"],
        check=True,
    )
|
|
| |
|
|
def infer_ncnn():
    """Run the ncnn models produced by ``pnnx`` and print the similarity.

    Loads ``vision.ncnn.*`` and ``text.ncnn.*``, feeds them the sample inputs
    and prints the dot product of the text and image embeddings.

    Raises:
        RuntimeError: if extracting the ``out0`` blob fails for either model.
    """
    import ncnn

    def _load_net(param_path, bin_path):
        # Load the network structure and then its weights.
        net = ncnn.Net()
        net.load_param(param_path)
        net.load_model(bin_path)
        return net

    def _run(net, mats):
        # Blobs are fed and read through an extractor created from the net;
        # ncnn.Net itself has no input()/extract() methods.
        extractor = net.create_extractor()
        for blob_name, mat in zip(net.input_names(), mats):
            extractor.input(blob_name, mat)
        # extract() returns a (status, Mat) pair in the Python binding.
        status, out_mat = extractor.extract("out0")
        if status != 0:
            raise RuntimeError(f"ncnn failed to extract 'out0' (status {status})")
        # Flatten to a single row so the matrix product below is well defined
        # whatever dimensionality ncnn reports for the output blob.
        return np.array(out_mat).reshape(1, -1)

    vision_net = _load_net("vision.ncnn.param", "vision.ncnn.bin")
    text_net = _load_net("text.ncnn.param", "text.ncnn.bin")

    # NOTE(review): the arrays are passed with their batch dimension and
    # original dtypes, as before. pnnx-converted models may expect the batch
    # axis to be removed - confirm against the generated *_ncnn.py.
    vision_mat = ncnn.Mat(inputs.data['pixel_values'].numpy())
    text_input_ids_mat = ncnn.Mat(inputs.data['input_ids'].numpy())
    text_attention_mask_mat = ncnn.Mat(inputs.data['attention_mask'].numpy())

    image_embeds = _run(vision_net, [vision_mat])
    text_embeds = _run(text_net, [text_input_ids_mat, text_attention_mask_mat])

    logits_per_text = text_embeds @ image_embeds.T

    # Printed as a scalar, like the other infer_* functions.
    print("similarity:", logits_per_text.item())
|
|
| |
|
|
def infer_torch():
    """Run the PyTorch model on the sample inputs and print the softmax of its image logits."""
    image_logits = ptmodel(**inputs).logits_per_image
    # Softmax over dim 1 of logits_per_image.
    print(image_logits.softmax(dim=1))