# MobileCLIP / run_axmodel.py
import argparse

import axengine as ort
import numpy as np
import torch
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, InterpolationMode, Normalize, Resize, ToTensor

from tokenizer import SimpleTokenizer
# Normalization statistics used by OpenAI CLIP (and MobileCLIP) image encoders.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
def image_transform_v2():
    """Bicubic resize, center crop, and CLIP normalization."""
    resolution = 256
    resize_size = resolution
    centercrop_size = resolution
    mean = OPENAI_DATASET_MEAN
    std = OPENAI_DATASET_STD
    aug_list = [
        Resize(resize_size, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(centercrop_size),
        ToTensor(),
        Normalize(mean=mean, std=std),
    ]
    preprocess = Compose(aug_list)
    return preprocess
def image_transform_v1():
    """Bilinear resize and center crop, without CLIP normalization (see the note in __main__)."""
    resolution = 256
    resize_size = resolution
    centercrop_size = resolution
    aug_list = [
        Resize(resize_size, interpolation=InterpolationMode.BILINEAR),
        CenterCrop(centercrop_size),
        ToTensor(),
    ]
    preprocess = Compose(aug_list)
    return preprocess
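
# For reference, image_transform_v1 is roughly equivalent to the following
# PIL/numpy-only sketch (not used below; note that torchvision's Resize, when
# given a single int, scales the *shorter* side to that size):
def preprocess_numpy(img: Image.Image, size: int = 256) -> np.ndarray:
    w, h = img.size
    scale = size / min(w, h)
    img = img.resize((round(w * scale), round(h * scale)), Image.BILINEAR)
    left = (img.width - size) // 2
    top = (img.height - size) // 2
    img = img.crop((left, top, left + size, top + size))
    # HWC uint8 -> 1x3xHxW float32 in [0, 1], matching ToTensor + unsqueeze(0).
    arr = np.asarray(img, dtype=np.float32) / 255.0
    return arr.transpose(2, 0, 1)[None, ...]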
def softmax(x, axis=-1):
    """Apply softmax to a numpy array along the given axis.

    Args:
        x: input numpy array.
        axis: axis along which to compute softmax (default: last axis, -1).

    Returns:
        A numpy array of the same shape as the input.
    """
    # Subtract the max for numerical stability (avoids overflow in exp).
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    # Normalize each exponentiated element by the sum along the reduction axis.
    return e_x / np.sum(e_x, axis=axis, keepdims=True)
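
# Quick sanity check (doctest style, not executed here): equal logits give a
# uniform distribution, and shifting all logits by a constant changes nothing.
#   >>> softmax(np.array([0.0, 0.0, 0.0]))
#   array([0.33333333, 0.33333333, 0.33333333])
#   >>> np.allclose(softmax(np.array([1.0, 2.0])), softmax(np.array([101.0, 102.0])))
#   True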
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-ie", "--image_encoder_path", type=str, default="./mobileclip2_s4_image_encoder.axmodel",
                        help="image encoder axmodel path")
    parser.add_argument("-te", "--text_encoder_path", type=str, default="./mobileclip2_s4_text_encoder.axmodel",
                        help="text encoder axmodel path")
    parser.add_argument("-i", "--image", type=str, default="./zebra.jpg",
                        help="input image path")
    parser.add_argument("-t", "--class_text", type=str, nargs='+', default=["a zebra", "a dog", "two zebras"],
                        help='List of captions, e.g.: "a zebra" "a dog" "two zebras"')
    args = parser.parse_args()

    image_encoder_path = args.image_encoder_path
    text_encoder_path = args.text_encoder_path
    # NOTICE: use the v1 preprocessing; the v2 preprocessing leads to large
    # quantization error in pulsar2.
    preprocess = image_transform_v1()
    tokenizer = SimpleTokenizer(context_length=77)

    image = preprocess(Image.open(args.image).convert('RGB')).unsqueeze(0)
    # Tokenize the captions and cast the token ids to int32 for the axmodel input.
    text = tokenizer(args.class_text)
    text = text.to(torch.int32)

    # axengine exposes an onnxruntime-style API (hence the `ort` alias).
    image_encoder = ort.InferenceSession(image_encoder_path)
    text_encoder = ort.InferenceSession(text_encoder_path)
    image_features = image_encoder.run(["unnorm_image_features"], {"image": image.numpy()})[0]
    # Alternative per-caption path, kept for reference (e.g. if a compiled text
    # encoder only accepts batch size 1):
    # text_features = []
    # for i in range(text.shape[0]):
    #     text_feature = text_encoder.run(["unnorm_text_features"], {"text": np.array([text[i]])})[0]
    #     text_features.append(text_feature)
    # text_features = np.array([t[0] for t in text_features])
    text_features = text_encoder.run(["unnorm_text_features"], {"text": text.numpy()})[0]

    # L2-normalize both embeddings so the dot product below is cosine similarity,
    # then scale by 100 (the CLIP logit scale) before the softmax.
    image_features /= np.linalg.norm(image_features, ord=2, axis=-1, keepdims=True)
    text_features /= np.linalg.norm(text_features, ord=2, axis=-1, keepdims=True)
    text_probs = softmax(100.0 * image_features @ text_features.T)

    print("Label probs:", text_probs)