zai-org/cogvlm-chat-hf

@@ -29,9 +29,37 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).to('cuda').eval()
 query = 'Describe this image'
 image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/1.png?raw=true', stream=True).raw).convert('RGB')
-inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image])
 inputs = {
     'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
     'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
@@ -45,9 +73,7 @@ with torch.no_grad():
     outputs = outputs[:, inputs['input_ids'].shape[1]:]
     print(tokenizer.decode(outputs[0]))
-# Two professional basketball players are playing against each other. On the left side, there is Kobe Bryant wearing a yellow jersey with the
-# number 24 on it. He is holding a brown basketball. On the right side, there is another player wearing a blue and red jersey, blocking Kobe's
-# movement. Behind them, there are many spectators watching the game.</s>
 ```
 # 方法（Method）

     trust_remote_code=True
 ).to('cuda').eval()
+# chat example
 query = 'Describe this image'
 image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/1.png?raw=true', stream=True).raw).convert('RGB')
+inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image])  # chat mode
+inputs = {
+    'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
+    'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
+    'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
+    'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
+}
+gen_kwargs = {"max_length": 2048, "do_sample": False}
+with torch.no_grad():
+    outputs = model.generate(**inputs, **gen_kwargs)
+    outputs = outputs[:, inputs['input_ids'].shape[1]:]
+    print(tokenizer.decode(outputs[0]))
+# This image captures a moment from a basketball game. Two players are prominently featured: one wearing a yellow jersey with the number
+# 24 and the word 'Lakers' written on it, and the other wearing a navy blue jersey with the word 'Washington' and the number 34. The player
+# in yellow is holding a basketball and appears to be dribbling it, while the player in navy blue is reaching out with his arm, possibly
+# trying to block or defend. The background shows a filled stadium with spectators, indicating that this is a professional game.</s>
+# vqa example
+query = 'How many houses are there in this cartoon?'
+image = Image.open(requests.get('https://github.com/THUDM/CogVLM/blob/main/examples/4.jpg?raw=true', stream=True).raw).convert('RGB')
+inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image], template_version='vqa')   # vqa mode
 inputs = {
     'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
     'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
     outputs = outputs[:, inputs['input_ids'].shape[1]:]
     print(tokenizer.decode(outputs[0]))
+# 4
 ```
 # 方法（Method）