Commit 8bce163
1 Parent(s): c40d27f
init

Files changed:
- .gitignore +5 -0
- app.py +6 -4
- llava/__pycache__/__init__.cpython-310.pyc +0 -0
- llava/__pycache__/chat.cpython-310.pyc +0 -0
- llava/__pycache__/constants.cpython-310.pyc +0 -0
- llava/__pycache__/conversation.cpython-310.pyc +0 -0
- llava/__pycache__/mm_utils.cpython-310.pyc +0 -0
- llava/__pycache__/utils.cpython-310.pyc +0 -0
- llava/chat.py +11 -12
- llava/model/__pycache__/__init__.cpython-310.pyc +0 -0
- llava/model/__pycache__/builder.cpython-310.pyc +0 -0
- llava/model/__pycache__/llava_arch.cpython-310.pyc +0 -0
- llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc +0 -0
- llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc +0 -0
- llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
- llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
- llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
+# Python
+__pycache__
+*.pyc
+*.egg-info
+dist
app.py
CHANGED
@@ -49,6 +49,7 @@ def parse_args():
     parser.add_argument("--num_beams", type=int, default=1)
     parser.add_argument("--max_new_tokens", type=int, default=512)
     parser.add_argument("--num-visual-tokens", type=int, default=256)
+    parser.add_argument("--gpu-id", type=int, default=0)
     args = parser.parse_args()
     return args
 
@@ -68,7 +69,7 @@ disable_torch_init()
 
 model_name = get_model_name_from_path(args.model_path)
 tokenizer, model, image_processor, context_len = load_pretrained_model(
-    args.model_path, args.model_base, model_name
+    args.model_path, args.model_base, model_name, device_map=device, device=device
 )
 
 # vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
@@ -109,13 +110,14 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature, num_vis
         num_beams=num_beams,
         temperature=temperature,
         num_visual_tokens=num_visual_tokens,
-    )[0]
+    ) #[0]
     chatbot[-1][1] = llm_message[0]
     return chatbot, chat_state, img_list
 
 title = """<h1 align="center">Demo of MQT-LLaVA</h1>"""
-description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting
-example questions, click example image, hit upload, and press enter in the chatbox
+description = """<h3>This is the demo of MQT-LLaVA. Upload your images and start chatting! <br> To use
+example questions, click example image, hit upload & start chat, and press enter on your keyboard in the chatbox.
+<br> Due to limited memory constraint, we only support single turn conversation. To ask multiple questions, hit Restart and upload your image! </h3>"""
 article = """<p><a href='https://gordonhu608.github.io/mqt-llava/'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/gordonhu608/MQT-LLaVA'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/Paper-ArXiv-red'></a></p>
 """
 
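Note: in the updated gradio_answer, chat.answer(...) now returns the full (output_text, '') tuple (the trailing [0] on the call is commented out), so the caller indexes llm_message[0] to get the text. The `device` value passed as device_map=device, device=device to load_pretrained_model is not defined in the hunks shown here; presumably it is derived from the new --gpu-id argument. A minimal sketch of that wiring, as an assumption rather than the actual app.py code:

    import torch

    args = parse_args()

    # Assumed wiring (not shown in this commit): build the device string from
    # the new --gpu-id flag, falling back to CPU when CUDA is unavailable.
    device = f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu"

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name, device_map=device, device=device
    )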
llava/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/__init__.cpython-310.pyc and b/llava/__pycache__/__init__.cpython-310.pyc differ

llava/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (13.3 kB).

llava/__pycache__/constants.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/constants.cpython-310.pyc and b/llava/__pycache__/constants.cpython-310.pyc differ

llava/__pycache__/conversation.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/conversation.cpython-310.pyc and b/llava/__pycache__/conversation.cpython-310.pyc differ

llava/__pycache__/mm_utils.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/mm_utils.cpython-310.pyc and b/llava/__pycache__/mm_utils.cpython-310.pyc differ

llava/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/llava/__pycache__/utils.cpython-310.pyc and b/llava/__pycache__/utils.cpython-310.pyc differ

llava/chat.py
CHANGED
@@ -442,20 +442,21 @@ def load_images(image_files):
 class Chat:
     def __init__(self, model, tokenizer, image_processor, args, device='cuda:0'):
         self.device = device
-        self.model = model
+        self.model = model.to(device)
         self.tokenizer = tokenizer
         self.image_processor = image_processor
         self.args = args
 
     def ask(self, text, conv):
         #conv.messages = [] #hack not keeping history.
+        text = DEFAULT_IMAGE_TOKEN + "\n" + text
         conv.append_message(conv.roles[0], text)
 
     def answer(self, conv, img_list, num_visual_tokens=256, max_new_tokens=512, num_beams=1, temperature=0.0):
         conv.append_message(conv.roles[1], None)
 
         question = conv.get_prompt()
-        images = img_list[0] #torch.stack(img_list).to(self.device)
+        images = img_list #[0] #torch.stack(img_list).to(self.device)
 
         images_tensor = process_images(
             images,
@@ -466,7 +467,7 @@ class Chat:
         input_ids = (
             tokenizer_image_token(question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
             .unsqueeze(0)
-            .cuda()
+            .to(self.device) #cuda()
         )
 
         with torch.inference_mode():
@@ -488,21 +489,19 @@ class Chat:
         return output_text, ''
 
     def upload_img(self, image, conv, img_list):
-
-
-
-
-
-
-        # raw_image = raw_image.convert('RGB')
-        # image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+
+        if isinstance(image, str):  # is a image path
+            raw_image = Image.open(image).convert('RGB')
+        elif isinstance(image, Image.Image):
+            raw_image = image
+            raw_image = raw_image.convert('RGB')
         # elif isinstance(image, torch.Tensor):
         # if len(image.shape) == 3:
         # image = image.unsqueeze(0)
         # image = image.to(self.device)
 
         #image_emb, _ = self.model.encode_img(image)
-        img_list.append(
+        img_list.append(raw_image)
         #conv.append_message(conv.roles[0], "")
         msg = "Received."
         # self.conv.append_message(self.conv.roles[1], msg)
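Taken together, the Chat changes mean upload_img now appends a raw PIL image to img_list, ask prepends DEFAULT_IMAGE_TOKEN to the user text, and answer runs process_images over the whole list and moves inputs to self.device instead of hard-coding .cuda(). A rough usage sketch under those assumptions; the conversation template name and the model/tokenizer/args/device variables are assumed to come from the app.py setup above, not shown in this commit:

    from llava.chat import Chat
    from llava.conversation import conv_templates

    # model, tokenizer, image_processor, args, device are assumed to come from
    # load_pretrained_model / parse_args as in app.py above.
    chat = Chat(model, tokenizer, image_processor, args, device=device)

    conv = conv_templates["llava_v1"].copy()  # template name is an assumption
    img_list = []

    chat.upload_img("example.jpg", conv, img_list)  # appends a PIL.Image to img_list
    chat.ask("What is shown in this image?", conv)  # prepends DEFAULT_IMAGE_TOKEN + "\n"
    output_text, _ = chat.answer(conv, img_list, num_visual_tokens=256)
    print(output_text)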
llava/model/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/llava/model/__pycache__/__init__.cpython-310.pyc and b/llava/model/__pycache__/__init__.cpython-310.pyc differ

llava/model/__pycache__/builder.cpython-310.pyc
CHANGED
Binary files a/llava/model/__pycache__/builder.cpython-310.pyc and b/llava/model/__pycache__/builder.cpython-310.pyc differ

llava/model/__pycache__/llava_arch.cpython-310.pyc
CHANGED
Binary files a/llava/model/__pycache__/llava_arch.cpython-310.pyc and b/llava/model/__pycache__/llava_arch.cpython-310.pyc differ

llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc
CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc differ

llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc
CHANGED
Binary files a/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc and b/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc differ

llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc
CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc differ

llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc
CHANGED
Binary files a/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc and b/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc differ

llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc
CHANGED
Binary files a/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc and b/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc differ