Support batched chat
Add a chat_batch method that can run several multi-turn conversations at once.

- modeling_chatglm.py +30 -13
modeling_chatglm.py
CHANGED
@@ -721,7 +721,6 @@ CHATGLM_6B_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.
-
    Parameters:
        config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.

@@ -732,37 +731,28 @@ CHATGLM_6B_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.
-
            Indices can be obtained using [`ChatGLM6BTokenizer`].
            See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
-
            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
-
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
-
            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range `[0, config.max_position_embeddings - 1]`.
-
            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
-
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert *input_ids* indices into associated vectors

@@ -784,13 +774,11 @@ CHATGLM_6B_INPUTS_DOCSTRING = r"""
)
class ChatGLMModel(ChatGLMPreTrainedModel):
    """
-
    The model can behave as an encoder (with only self-attention) as well
    as a decoder, in which case a layer of cross-attention is added between
    the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
-
    To behave as a decoder the model needs to be initialized with the
    `is_decoder` argument of the configuration set to `True`.
    To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder`

@@ -1237,7 +1225,6 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
-
        Output shares the same memory storage as `past`.
        """
        return tuple(

@@ -1288,6 +1275,36 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
        response = self.process_response(response)
        history = history + [(query, response)]
        return response, history
+
+    @torch.no_grad()
+    def chat_batch(self, tokenizer, querys: List[str], historys: List[List[Tuple[str, str]]], max_length: int = 2048,
+                   num_beams=1, do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        prompts = []
+        for query, history in zip(querys, historys):
+            if history is None:
+                history = []
+            if not history:
+                prompt = query
+            else:
+                prompt = ""
+                for i, (old_query, response) in enumerate(history):
+                    prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+                prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+            prompts.append(prompt)
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True)
+        inputs = inputs.to(self.device)
+        outputs = self.generate(**inputs, **gen_kwargs)
+        outputs = outputs.tolist()
+        # Every padded row shares one prompt length, so slice it off uniformly.
+        outputs = [x[len(inputs["input_ids"][0]):] for x in outputs]
+        responses = [tokenizer.decode(output) for output in outputs]
+        responses = [self.process_response(response) for response in responses]
+        return responses

    @torch.no_grad()
    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,