DiffusionVL-Qwen2.5VL-3B / processing_diffusionvl_qwen2_5_vl.py

Upload folder using huggingface_hub

39fee1d verified 4 months ago

12.8 kB

	# coding=utf-8
	# Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
	#
	# This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
	# and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	DiffusionVL Processor - Combines image processor and tokenizer.
	"""

	import re
	from typing import List, Optional, Union

	import torch

	from transformers.feature_extraction_utils import BatchFeature
	from transformers.image_utils import ImageInput
	from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
	from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
	from transformers.video_utils import VideoInput


	# Placeholder id written into `input_ids` at every image position. It is the
	# default `image_token_index` of `tokenizer_image_token` and is what the
	# processor passes when tokenizing text.
	IMAGE_TOKEN_INDEX = -200
	# Literal marker that prompts use to indicate where an image goes; prompts are
	# split on this string before tokenization.
	DEFAULT_IMAGE_TOKEN = "<image>"


	class DiffusionVL_Qwen2_5_VL_ProcessorKwargs(ProcessingKwargs, total=False):
	    """Keyword arguments for DiffusionVL_Qwen2_5_VL_Processor.

	    Only the text modality declares a default here: ``padding`` is off.
	    """

	    # Per-modality defaults handed to `_merge_kwargs` by the processor.
	    # NOTE(review): the processor's `__call__` pads batches itself and does not
	    # appear to read this `padding` value - confirm whether it is still needed.
	    _defaults = {
	        "text_kwargs": {
	            "padding": False,
	        },
	    }


	def tokenizer_image_token(
	    prompt: str,
	    tokenizer,
	    image_token_index: int = IMAGE_TOKEN_INDEX,
	    return_tensors: Optional[str] = None,
	) -> Union[List[int], torch.Tensor]:
	    """
	    Tokenize text with image placeholders, replacing <image> with IMAGE_TOKEN_INDEX.

	    The prompt is split on ``DEFAULT_IMAGE_TOKEN``; every piece is tokenized on its
	    own and the pieces are re-joined with a single ``image_token_index`` between
	    each pair. This implementation matches the training code
	    (llava/mm_utils.py::tokenizer_image_token).

	    Args:
	        prompt: Input text containing <image> placeholders.
	        tokenizer: The tokenizer to use for encoding text. It is called as
	            ``tokenizer(chunk).input_ids`` and its ``bos_token_id`` is read.
	        image_token_index: The token index to use for image placeholders.
	        return_tensors: If "pt", return a PyTorch tensor. If None, return a list.

	    Returns:
	        List of token IDs or a 1-D ``torch.long`` tensor.

	    Raises:
	        ValueError: If ``return_tensors`` is neither None nor "pt".
	    """
	    # Tokenize each chunk separately (matching training code behavior).
	    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(DEFAULT_IMAGE_TOKEN)]

	    input_ids: List[int] = []
	    offset = 0

	    # If the tokenizer prepends BOS, every chunk starts with it. Keep exactly one
	    # BOS at the very front and strip the leading token from each chunk below.
	    if prompt_chunks and prompt_chunks[0] and prompt_chunks[0][0] == tokenizer.bos_token_id:
	        offset = 1
	        input_ids.append(prompt_chunks[0][0])

	    # The separator is one element longer when BOS is present so that slicing it
	    # with the same `offset` as the text chunks leaves exactly one image token.
	    separator = [image_token_index] * (offset + 1)
	    for position, chunk_ids in enumerate(prompt_chunks):
	        if position > 0:
	            input_ids.extend(separator[offset:])
	        input_ids.extend(chunk_ids[offset:])

	    if return_tensors is None:
	        return input_ids
	    if return_tensors == "pt":
	        return torch.tensor(input_ids, dtype=torch.long)
	    raise ValueError(f"Unsupported tensor type: {return_tensors}")


	class DiffusionVL_Qwen2_5_VL_Processor(ProcessorMixin):
	    r"""
	    Constructs a DiffusionVL processor which wraps an image processor and a tokenizer into a single processor.

	    [`DiffusionVL_Qwen2_5_VL_Processor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
	    See the [`~DiffusionVL_Qwen2_5_VL_Processor.__call__`] and [`~DiffusionVL_Qwen2_5_VL_Processor.decode`] for more information.

	    This processor uses LLaVA-style image token handling:
	    - `<image>` in text is replaced with `IMAGE_TOKEN_INDEX` (-200) in input_ids
	    - The model's `prepare_inputs_labels_for_multimodal` replaces -200 with actual image features

	    Args:
	        image_processor ([`Qwen2VLImageProcessor`], optional):
	            The image processor is a required input.
	        tokenizer ([`Qwen2TokenizerFast`], optional):
	            The tokenizer is a required input.
	        chat_template (`str`, optional):
	            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.

	    Example:

	    ```python
	    >>> from transformers import AutoProcessor
	    >>> from PIL import Image

	    >>> processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)

	    >>> # Prepare text with image placeholder
	    >>> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
	    >>> text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

	    >>> # Process image and text
	    >>> image = Image.open("image.jpg")
	    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
	    ```
	    """

	    attributes = ["image_processor", "tokenizer"]
	    image_processor_class = "Qwen2VLImageProcessor"
	    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

	    def __init__(
	        self,
	        image_processor=None,
	        tokenizer=None,
	        chat_template: Optional[str] = None,
	        **kwargs,
	    ):
	        """
	        Store the image placeholder settings and initialize the base processor.

	        Extra keyword arguments are accepted and ignored; they are not forwarded
	        to the base class.
	        """
	        self.image_token = DEFAULT_IMAGE_TOKEN
	        self.image_token_index = IMAGE_TOKEN_INDEX

	        super().__init__(image_processor, tokenizer, chat_template=chat_template)

	    def __call__(
	        self,
	        images: Optional[ImageInput] = None,
	        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
	        videos: Optional[VideoInput] = None,
	        **kwargs: Unpack[DiffusionVL_Qwen2_5_VL_ProcessorKwargs],
	    ) -> BatchFeature:
	        """
	        Main method to prepare for the model one or several sequences and image(s).

	        Images, when given, are passed to the image processor together with the merged
	        `images_kwargs`. Text, when given, is tokenized with `tokenizer_image_token`, one
	        sequence at a time, and the batch is then right-padded to its longest sequence.

	        The text should contain `<image>` placeholders where images should be inserted.
	        These will be replaced with `IMAGE_TOKEN_INDEX` (-200) in the output input_ids.

	        Args:
	            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, optional):
	                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array, or PyTorch
	                tensor. Both channels-first and channels-last formats are supported.
	            text (`str`, `List[str]`, optional):
	                The sequence or batch of sequences to be encoded. Each sequence should be a string containing
	                `<image>` placeholders where images will be inserted. A single value is treated as a batch of one.
	            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, optional):
	                Accepted for signature compatibility only; this method does not read it.
	            return_tensors (`str` or [`~utils.TensorType`], optional):
	                If set, will return tensors of a particular framework. Acceptable values are:
	                - `'pt'`: Return PyTorch `torch.Tensor` objects.
	                - `'np'`: Return NumPy `np.ndarray` objects.

	        Returns:
	            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

	            - input_ids -- List of token ids to be fed to a model. Returned when `text` is not `None`.
	            - attention_mask -- List of indices specifying which tokens should be attended to by the model
	              (1 for real tokens, 0 for padding). Returned when `text` is not `None`.
	            - pixel_values -- Pixel values to be fed to a model. Returned when `images` is not `None`.
	            - image_grid_thw -- List of image 3D grid dimensions. Returned when `images` is not `None`.

	            When `text` is `None`, only the image processor output is returned and no
	            tensor conversion is requested by this method.
	        """
	        output_kwargs = self._merge_kwargs(
	            DiffusionVL_Qwen2_5_VL_ProcessorKwargs,
	            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
	            **kwargs,
	        )

	        # Process images
	        image_inputs = {}
	        if images is not None:
	            image_inputs = self.image_processor(
	                images=images, **output_kwargs.get("images_kwargs", {})
	            )

	        # Handle text input
	        if text is None:
	            return BatchFeature(data=image_inputs)

	        if not isinstance(text, list):
	            text = [text]

	        # Tensor conversion is deferred to the final BatchFeature so that padding
	        # can be done on plain Python lists first.
	        return_tensors = output_kwargs.get("text_kwargs", {}).pop("return_tensors", None)

	        # Tokenize with LLaVA-style image token handling
	        all_input_ids = [
	            tokenizer_image_token(t, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors=None)
	            for t in text
	        ]

	        # Pad sequences on the right up to the longest one in the batch.
	        # `default=0` keeps an empty batch from raising inside max().
	        max_len = max((len(ids) for ids in all_input_ids), default=0)

	        # Fall back to id 0 when the tokenizer defines no pad token.
	        pad_token_id = (
	            self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
	        )

	        padded_input_ids = []
	        attention_masks = []
	        for ids in all_input_ids:
	            padding_length = max_len - len(ids)
	            padded_input_ids.append(ids + [pad_token_id] * padding_length)
	            attention_masks.append([1] * len(ids) + [0] * padding_length)

	        text_inputs = {
	            "input_ids": padded_input_ids,
	            "attention_mask": attention_masks,
	        }

	        # Merge text and image features into one mapping; a set literal of the two
	        # dicts would raise TypeError because dicts are unhashable.
	        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

	    def build_conversation_input_ids(
	        self,
	        messages: List[dict],
	        images: Optional[List] = None,
	        add_generation_prompt: bool = True,
	    ) -> dict:
	        """
	        Build a prompt string from conversation messages in LLaVA format.

	        This method converts a list of messages into a prompt string with `<image>` placeholders.
	        Uses LLaVA-style chat template format (trained format). Despite the name, no
	        tokenization happens here; only the text is produced.

	        Args:
	            messages: List of message dicts with 'role' and 'content' keys.
	                Content can be a string or a list of dicts with 'type' key ('text' or 'image').
	                A missing role defaults to "user"; a missing content defaults to "".
	                List items that are not dicts are appended as `str(item)`; dict items
	                with any other 'type' are skipped.
	            images: Optional list of images. Currently unused by this method.
	            add_generation_prompt: Whether to add generation prompt at the end.

	        Returns:
	            dict with 'text' key containing the prompt string with `<image>` placeholders.
	        """
	        # Build LLaVA-style prompt directly
	        # Format: <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nPrompt<|im_end|>\n<|im_start|>assistant\n
	        # NOTE(review): the format above shows a newline after <image>, but no newline
	        # is inserted between content items below - confirm against the training format.

	        text_parts = []

	        for message in messages:
	            role = message.get("role", "user")
	            content = message.get("content", "")

	            text_parts.append(f"<|im_start|>{role}\n")

	            # Handle content - can be string or list of content items
	            if isinstance(content, str):
	                text_parts.append(content)
	            elif isinstance(content, list):
	                for item in content:
	                    if isinstance(item, dict):
	                        if item.get("type") == "image":
	                            text_parts.append(DEFAULT_IMAGE_TOKEN)
	                        elif item.get("type") == "text":
	                            text_parts.append(item.get("text", ""))
	                    else:
	                        text_parts.append(str(item))

	            text_parts.append("<|im_end|>\n")

	        if add_generation_prompt:
	            text_parts.append("<|im_start|>assistant\n")

	        text = "".join(text_parts)
	        return {"text": text}

	    def batch_decode(self, *args, **kwargs):
	        """
	        Decode a batch of token IDs to text.

	        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
	        Please refer to the docstring of this method for more information.
	        """
	        return self.tokenizer.batch_decode(*args, **kwargs)

	    def decode(self, *args, **kwargs):
	        """
	        Decode token IDs to text.

	        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`].
	        Please refer to the docstring of this method for more information.
	        """
	        return self.tokenizer.decode(*args, **kwargs)

	    @property
	    def model_input_names(self) -> List[str]:
	        """Return the tokenizer and image processor input names, de-duplicated in order."""
	        tokenizer_names = self.tokenizer.model_input_names
	        image_processor_names = self.image_processor.model_input_names
	        # dict.fromkeys drops duplicates while keeping first-seen order.
	        return list(dict.fromkeys(tokenizer_names + image_processor_names))


	# Names exported by `from <module> import *`: the processor class and the
	# standalone tokenization helper.
	__all__ = ["DiffusionVL_Qwen2_5_VL_Processor", "tokenizer_image_token"]