| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| DiffusionVL Processor - Combines image processor and tokenizer. |
| """ |
|
|
| import re |
| from typing import List, Optional, Union |
|
|
| import torch |
|
|
| from transformers.feature_extraction_utils import BatchFeature |
| from transformers.image_utils import ImageInput |
| from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack |
| from transformers.tokenization_utils_base import PreTokenizedInput, TextInput |
| from transformers.video_utils import VideoInput |
|
|
|
|
# Sentinel id written into ``input_ids`` wherever the prompt contains an image
# placeholder. It is negative, so it is presumably chosen not to collide with
# real vocabulary ids (NOTE(review): confirm against the model code).
IMAGE_TOKEN_INDEX = -200
# Literal placeholder string that prompts use to mark an image position.
DEFAULT_IMAGE_TOKEN = "<image>"
|
|
|
|
class DiffusionVL_Qwen2_5_VL_ProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema used by DiffusionVL_Qwen2_5_VL_Processor when merging call kwargs."""

    # Default values per modality group; text padding is disabled by default.
    _defaults = {"text_kwargs": {"padding": False}}
|
|
|
|
def tokenizer_image_token(
    prompt: str,
    tokenizer,
    image_token_index: int = IMAGE_TOKEN_INDEX,
    return_tensors: Optional[str] = None,
) -> Union[List[int], torch.Tensor]:
    """
    Encode a prompt, emitting ``image_token_index`` for every ``<image>`` placeholder.

    The prompt is split on ``DEFAULT_IMAGE_TOKEN``; each text piece is tokenized
    on its own and the pieces are stitched back together with a single
    ``image_token_index`` between consecutive pieces. If the first piece starts
    with the tokenizer's BOS id, that BOS is kept once at the very front and the
    leading token of every piece is dropped.

    According to the original author's note, this is intended to mirror the
    training-side helper ``llava/mm_utils.py::tokenizer_image_token``.

    Args:
        prompt: Input text containing ``<image>`` placeholders.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            which has a ``bos_token_id`` attribute.
        image_token_index: Id inserted at each placeholder position.
        return_tensors: ``None`` to get a plain list, ``"pt"`` for a
            ``torch.long`` tensor.

    Returns:
        The token ids as a list, or as a 1-D PyTorch tensor when
        ``return_tensors == "pt"``.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``"pt"``.
    """
    pieces = prompt.split(DEFAULT_IMAGE_TOKEN)
    chunk_ids = [tokenizer(piece).input_ids for piece in pieces]

    token_ids = []
    skip = 0

    # ``str.split`` with a non-empty separator always yields at least one piece,
    # but keep the emptiness checks so the BOS lookup stays short-circuited.
    starts_with_bos = (
        len(chunk_ids) > 0
        and len(chunk_ids[0]) > 0
        and chunk_ids[0][0] == tokenizer.bos_token_id
    )
    if starts_with_bos:
        # Keep one BOS up front; every piece then has its first token stripped.
        skip = 1
        token_ids.append(chunk_ids[0][0])

    for position, ids in enumerate(chunk_ids):
        if position > 0:
            # Exactly one image sentinel between two neighbouring text pieces.
            token_ids.append(image_token_index)
        token_ids.extend(ids[skip:])

    if return_tensors is None:
        return token_ids
    if return_tensors == "pt":
        return torch.tensor(token_ids, dtype=torch.long)
    raise ValueError(f"Unsupported tensor type: {return_tensors}")
|
|
|
|
class DiffusionVL_Qwen2_5_VL_Processor(ProcessorMixin):
    r"""
    Constructs a DiffusionVL processor which wraps an image processor and a tokenizer into a single processor.

    [`DiffusionVL_Qwen2_5_VL_Processor`] combines a [`Qwen2VLImageProcessor`] and a [`Qwen2TokenizerFast`].
    See the [`~DiffusionVL_Qwen2_5_VL_Processor.__call__`] and [`~DiffusionVL_Qwen2_5_VL_Processor.decode`] for more information.

    This processor uses LLaVA-style image token handling:
    - `<image>` in text is replaced with `IMAGE_TOKEN_INDEX` (-200) in input_ids
    - The model's `prepare_inputs_labels_for_multimodal` is expected to replace -200 with actual image features

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.

    Example:

    ```python
    >>> from transformers import AutoProcessor
    >>> from PIL import Image

    >>> processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)

    >>> # Prepare text with image placeholder
    >>> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
    >>> text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    >>> # Process image and text
    >>> image = Image.open("image.jpg")
    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
    ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Qwen2VLImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template: Optional[str] = None,
        **kwargs,
    ):
        """
        Store the image-placeholder settings and register the wrapped components.

        Args:
            image_processor: Image processor instance handed to `ProcessorMixin`.
            tokenizer: Tokenizer instance handed to `ProcessorMixin`.
            chat_template: Optional chat template forwarded to `ProcessorMixin`.
            **kwargs: Accepted for call compatibility; not used by this constructor.
        """
        self.image_token = DEFAULT_IMAGE_TOKEN
        self.image_token_index = IMAGE_TOKEN_INDEX

        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        videos: Optional[VideoInput] = None,
        **kwargs: Unpack[DiffusionVL_Qwen2_5_VL_ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences and image(s).

        If `images` is not `None`, they are passed to the image processor together with the merged
        `images_kwargs`. If `text` is not `None`, every string is encoded with `tokenizer_image_token`,
        so each `<image>` placeholder becomes `IMAGE_TOKEN_INDEX` (-200) in `input_ids`. The encoded
        sequences are then right-padded to the longest sequence in the batch using the tokenizer's
        `pad_token_id` (or `0` when the tokenizer has none), and a matching attention mask is built.

        Of the merged text kwargs only `return_tensors` is consumed here; other tokenizer options
        (e.g. `padding`, `truncation`) are not applied by this method.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, *optional*):
                The image or batch of images to be prepared.
            text (`str`, `List[str]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence should be a string containing
                `<image>` placeholders where images will be inserted. A non-list value is treated as a
                batch of one. An empty list produces empty `input_ids` and `attention_mask`.
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, *optional*):
                Accepted for interface compatibility; this method does not process videos.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- Padded token ids. Returned when `text` is not `None`.
            - **attention_mask** -- `1` for real tokens and `0` for padding. Returned when `text` is not `None`.
            - Whatever the image processor returned (presumably **pixel_values** and **image_grid_thw**
              for `Qwen2VLImageProcessor`). Present when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            DiffusionVL_Qwen2_5_VL_ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Vision branch: run the image processor only when images were supplied.
        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(
                images=images, **output_kwargs.get("images_kwargs", {})
            )

        # Image-only call: nothing to tokenize.
        if text is None:
            return BatchFeature(data=image_inputs)

        if not isinstance(text, list):
            text = [text]

        # Tensor conversion is deferred to BatchFeature so padding can happen on plain lists first.
        return_tensors = output_kwargs.get("text_kwargs", {}).pop("return_tensors", None)

        all_input_ids = [
            tokenizer_image_token(t, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors=None)
            for t in text
        ]

        # `default=0` keeps an empty batch from raising "max() arg is an empty sequence".
        max_len = max((len(ids) for ids in all_input_ids), default=0)

        pad_token_id = (
            self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
        )

        # Right-pad every sequence to the batch maximum and mark padding in the mask.
        padded_input_ids = []
        attention_masks = []
        for ids in all_input_ids:
            padding_length = max_len - len(ids)
            padded_input_ids.append(ids + [pad_token_id] * padding_length)
            attention_masks.append([1] * len(ids) + [0] * padding_length)

        text_inputs = {
            "input_ids": padded_input_ids,
            "attention_mask": attention_masks,
        }

        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

    def build_conversation_input_ids(
        self,
        messages: List[dict],
        images: Optional[List] = None,
        add_generation_prompt: bool = True,
    ) -> dict:
        """
        Render conversation messages into a prompt string with `<image>` placeholders.

        Each message becomes `<|im_start|>{role}\\n{content}<|im_end|>\\n`. Despite the method name,
        no tokenization happens here; the result is the prompt text only.

        Args:
            messages: List of message dicts with 'role' and 'content' keys. A missing role defaults
                to "user" and missing content to an empty string. Content can be a string or a list.
                In a list, dict items of type 'image' emit `<image>`, dict items of type 'text' emit
                their 'text' value, dict items of any other type are skipped, and non-dict items are
                converted with `str()`. Content that is neither a string nor a list contributes
                nothing.
            images: Currently unused; kept for interface compatibility.
            add_generation_prompt: Whether to append `<|im_start|>assistant\\n` at the end.

        Returns:
            dict with a single 'text' key containing the prompt string.
        """
        text_parts = []

        for message in messages:
            role = message.get("role", "user")
            content = message.get("content", "")

            text_parts.append(f"<|im_start|>{role}\n")

            if isinstance(content, str):
                text_parts.append(content)
            elif isinstance(content, list):
                for item in content:
                    if isinstance(item, dict):
                        if item.get("type") == "image":
                            text_parts.append(DEFAULT_IMAGE_TOKEN)
                        elif item.get("type") == "text":
                            text_parts.append(item.get("text", ""))
                    else:
                        text_parts.append(str(item))

            text_parts.append("<|im_end|>\n")

        if add_generation_prompt:
            text_parts.append("<|im_start|>assistant\n")

        text = "".join(text_parts)
        return {"text": text}

    def batch_decode(self, *args, **kwargs):
        """
        Decode a batch of token IDs to text.

        This method forwards all its arguments to the wrapped tokenizer's `batch_decode`.
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Decode token IDs to text.

        This method forwards all its arguments to the wrapped tokenizer's `decode`.
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self) -> List[str]:
        """Return the tokenizer's and image processor's input names, de-duplicated in first-seen order."""
        tokenizer_names = self.tokenizer.model_input_names
        image_processor_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_names + image_processor_names))
|
|
|
|
# Names exported by ``from <module> import *``.
__all__ = ["DiffusionVL_Qwen2_5_VL_Processor", "tokenizer_image_token"]
|
|