Spaces:

pdufour
/

Qwen2-VL-2B-Instruct-ONNX-Q4-F16

Running

App Files Community

pdufour committed on Nov 19, 2024

Commit

abaea80

verified ·

1 Parent(s): 5c17b02

Update index.js

Browse files

Files changed (1) hide show

index.js +154 -67

index.js CHANGED Viewed

@@ -68,21 +68,31 @@ async function parse(img, txt) {
   status.textContent = output;
 }
-async function imageTextToText(imagePath, query, vision = true) {
-  const config = await getModelJSON(BASE_MODEL, "config.json");
   const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
   let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);
-  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
-  let attention_mask = new ort.Tensor("float16", new Uint16Array([0xfbff]), [1]);
   let past_key_states = new ort.Tensor(
     "float16",
     new Uint16Array(
       config.num_hidden_layers *
-      config.num_key_value_heads *
-      MAX_SEQ_LENGTH *
-      (config.hidden_size / config.num_attention_heads)
     ).fill(0),
     [
       config.num_hidden_layers,
@@ -91,8 +101,19 @@ async function imageTextToText(imagePath, query, vision = true) {
       config.hidden_size / config.num_attention_heads,
     ]
   );
   let past_value_states = past_key_states;
   const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
   const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
   const token = await tokenizer(prompt, {
@@ -101,72 +122,112 @@ async function imageTextToText(imagePath, query, vision = true) {
     tokenize: true,
   }).input_ids;
-  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(token.dims[1])]), [1]);
   let input_ids = new ort.Tensor(
     "int32",
     new Int32Array(MAX_SEQ_LENGTH).fill(0),
     [MAX_SEQ_LENGTH]
   );
-  input_ids.data.set(Array.from(token.data.slice(0, token.dims[1]), Number));
   let { hidden_states } = await ortSessionB.run({
     input_ids: input_ids,
     ids_len: ids_len,
   });
-  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
-  let { position_ids } = await ortSessionC.run({ dummy });
   if (vision) {
     let image = await RawImage.fromURL(imagePath);
     image = await image.resize(INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1]);
-    image = image.rgb().toTensor("CHW").to("float32").div_(255.0);
     const pixel_values = image.unsqueeze(0);
-    console.log('run session a');
-    const { image_embed } = await ortSessionA.run({ pixel_values });
-    console.log('finished session a');
     ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));
-    const ortSessionD = await ort.InferenceSession.create(
       await getModelFile(ONNX_MODEL, `onnx/QwenVL_D_${QUANT}.onnx`),
-      { executionProviders: ["webgpu"] }
     );
-    console.log('run session d');
-    const result = await ortSessionD.run({
-       "hidden_states.1": hidden_states,
       image_embed,
       ids_len,
-      "ids_len_minus": new Tensor(
-        "int32",
-        new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
-        [1]
-      ),
-      "split_factor": new Tensor(
-        "int32",
-        new Int32Array([MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE]),
-        [1]
-      ),
-    });
-    console.log('finished session d');
-    past_key_states = result.hidden_states;
-    position_ids = result.position_ids;
   }
-  let num_decode = 0;
   let output = '';
-  while (num_decode < MAX_SINGLE_CHAT_LENGTH && Number(history_len.data[0]) < MAX_SEQ_LENGTH) {
-    const ortSessionE = await ort.InferenceSession.create(
-      await getModelFile(ONNX_MODEL, `onnx/QwenVL_E_${QUANT}.onnx`),
-      { executionProviders: ["wasm"] }
-    );
-    const result = await ortSessionE.run({
-      hidden_states: past_key_states,
       attention_mask,
       "past_key_states.1": past_key_states,
       "past_value_states.1": past_value_states,
@@ -174,35 +235,61 @@ async function imageTextToText(imagePath, query, vision = true) {
       ids_len,
       position_ids,
       pos_factor,
-    });
-    console.log('finished session e');
-    const token_id = result.max_logit_ids;
-    if (token_id === 151643 || token_id === 151645) break;
-    output += tokenizer.decode([...token_id.data]);
     num_decode++;
-    history_len = history_len.add(BigInt(1));
-    pos_factor = new Tensor(
-      "float16",
-      new Uint16Array([Number(pos_factor.data[0]) + 1]),
-      [1]
-    );
-    past_key_states = result.past_key_states;
-    past_value_states = result.past_value_states;
-    input_ids.data[0] = Number(token_id.data[0]);
-    const { hidden_states } = await ortSessionB.run({
-      input_ids,
-      ids_len,
     });
-    past_key_states = hidden_states;
   }
-  return output;
 }
 await initializeSessions();

   status.textContent = output;
 }
+export async function imageTextToText(
+  imagePath,
+  query,
+  vision = true
+) {
+  let ortSessionA, ortSessionB, ortSessionC, ortSessionD, ortSessionE;
   const prompt_head_len = new Tensor("int64", new BigInt64Array([5n]), [1]);
+  logger.tensor("prompt_head_len", prompt_head_len);
+  let position_ids;
+  let num_decode = 0;
   let history_len = new Tensor("int64", new BigInt64Array([0n]), [1]);
+  logger.tensor("history_len", history_len);
+  var pos_factor_v = BigInt(1 - IMAGE_EMBED_SIZE + WIDTH_FACTOR);
   let past_key_states = new ort.Tensor(
     "float16",
     new Uint16Array(
       config.num_hidden_layers *
+        config.num_key_value_heads *
+        MAX_SEQ_LENGTH *
+        (config.hidden_size / config.num_attention_heads)
     ).fill(0),
     [
       config.num_hidden_layers,
       config.hidden_size / config.num_attention_heads,
     ]
   );
   let past_value_states = past_key_states;
+  let attention_mask = new ort.Tensor(
+    "float16",
+    new Uint16Array([0xfbff]),
+    [1]
+  );
+  let pos_factor = new Tensor("float16", new Uint16Array([0]), [1]);
+  logger.tensor("pos_factor", pos_factor);
+  logger.groupCollapsed("[TOKENIZATION] Processing prompt...");
   const tokenizer = await AutoTokenizer.from_pretrained(BASE_MODEL);
   const prompt = `\n<|im_start|>user\n<|vision_start|><|vision_end|>${query}<|im_end|>\n<|im_start|>assistant\n`;
   const token = await tokenizer(prompt, {
     tokenize: true,
   }).input_ids;
+  const seq_length = token.dims[1];
+  let ids_len = new Tensor("int64", new BigInt64Array([BigInt(seq_length)]), [
+    1,
+  ]);
   let input_ids = new ort.Tensor(
     "int32",
     new Int32Array(MAX_SEQ_LENGTH).fill(0),
     [MAX_SEQ_LENGTH]
   );
+  input_ids.data.set(Array.from(token.data.slice(0, seq_length), Number));
+  const dummy = new ort.Tensor("int32", new Int32Array([0]), []);
+  if (!ortSessionB) {
+  }
   let { hidden_states } = await ortSessionB.run({
     input_ids: input_ids,
     ids_len: ids_len,
   });
+  ({ position_ids } = await ortSessionC.run({
+    dummy: dummy,
+  }));
+  // Process image
   if (vision) {
     let image = await RawImage.fromURL(imagePath);
     image = await image.resize(INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1]);
+    image = image.rgb();
+    image = image.toTensor("CHW");
+    image = image.to("float32");
+    image = image.div_(255.0);
     const pixel_values = image.unsqueeze(0);
+    const { image_embed } = await ortSessionA.run({
+      pixel_values: pixel_values,
+    });
     ids_len = ids_len.add(BigInt(IMAGE_EMBED_SIZE));
+    const split_factor = new Tensor(
+      "int32",
+      new Int32Array([
+        MAX_SEQ_LENGTH - Number(ids_len.item()) - IMAGE_EMBED_SIZE,
+      ]),
+      [1]
+    );
+    const ids_len_minus = new Tensor(
+      "int32",
+      new Int32Array([Number(ids_len.item()) - Number(prompt_head_len.item())]),
+      [1]
+    );
+    await ortSessionA.release();
+    ortSessionA = null;
+    logger.log("session d create");
+    ortSessionD = await ort.InferenceSession.create(
       await getModelFile(ONNX_MODEL, `onnx/QwenVL_D_${QUANT}.onnx`),
+      {
+        executionProviders: ["webgpu"],
+      }
     );
+    ({ hidden_states, position_ids } = await ortSessionD.run({
+      "hidden_states.1": hidden_states,
       image_embed,
       ids_len,
+      ids_len_minus,
+      split_factor,
+    }));
+    await ortSessionD.release();
+    ortSessionD = null;
   }
   let output = '';
+  while (
+    num_decode < MAX_SINGLE_CHAT_LENGTH &&
+    Number(history_len.data[0]) < MAX_SEQ_LENGTH
+  ) {
+    let token_id;
+    if (!ortSessionE) {
+      console.log("Create ortSessionE");
+      ortSessionE = await ort.InferenceSession.create(
+        await getModelFile(ONNX_MODEL, `onnx/QwenVL_E_${QUANT}.onnx`),
+        {
+          executionProviders: ["wasm"],
+        },
+      );
+    }
+    ({
+      max_logit_ids: token_id,
+      past_key_states: past_key_states,
+      past_value_states: past_value_states,
+    } = await ortSessionE.run({
+      hidden_states,
       attention_mask,
       "past_key_states.1": past_key_states,
       "past_value_states.1": past_value_states,
       ids_len,
       position_ids,
       pos_factor,
+    }));
+    if (token_id === 151643 || token_id === 151645) {
+      break;
+    }
     num_decode++;
+    if (num_decode < 2) {
+      history_len = history_len.add(BigInt(ids_len.data[0]));
+      ids_len = new ort.Tensor("int64", new BigInt64Array([1n]), [1]);
+      attention_mask = new ort.Tensor("float16", new Uint16Array([0]), [1]);
+      if (vision) {
+        pos_factor = new Tensor(
+          "float16",
+          new Uint16Array([int64ToFloat16(pos_factor_v + ids_len.data[0])]),
+          [1]
+        );
+      } else {
+        pos_factor = new Tensor(
+          "float16",
+          new Uint16Array([int64ToFloat16(history_len.data[0] + BigInt(1))]),
+          [1]
+        );
+      }
+    } else {
+      history_len = history_len.add(BigInt(1));
+      pos_factor = pos_factor.map((v) =>
+        int64ToFloat16(float16ToInt64(v) + BigInt(1))
+      );
+      logger.tensor("Updated history_len", history_len);
+      logger.tensor("Updated pos_factor", pos_factor);
+      logger.groupEnd();
+    }
+    (input_ids.data)[0] = Number(token_id.data[0]);
+    const result_B = await ortSessionB.run({
+      input_ids: input_ids,
+      ids_len: ids_len,
     });
+    hidden_states = result_B.hidden_states;
+    if (
+      !Number.isInteger(token_id.data[0]) &&
+      !["bigint", "number"].includes(typeof token_id.data[0])
+    ) {
+      throw new Error(`Token ID is not an integer`);
+    } else {
+      const decoded = tokenizer.decode([...token_id.data])
+      output += decoded;
+    }
   }
 }
 await initializeSessions();