Commit
Add: num_additional_image_tokens to models (#35052)
* Add: num_additional_image_tokens to models

* docs: update docstring for num_additional_image_tokens in configuration files

* Add num_additional_image_tokens to LlavaNextVideo model and update feature selection logic

* revert

* Fix: adjust num_image_tokens calculation in LlavaProcessor

* Remove num_additional_image_tokens initialization from configuration files

* Fix test error

* revert

* Fix: adjust num_image_tokens calculation in LlavaNextVideoProcessor

* fix conflict

* Fix: adjust num_image_tokens calculation in VideoLlavaProcessor

* make style

---------

Co-authored-by: Raushan Turganbay <[email protected]>
jp1924 and zucchini-nlp authored Jan 8, 2025
1 parent 657bb14 commit 29e74b7
Showing 6 changed files with 14 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/transformers/models/llava/modeling_llava.py
@@ -240,6 +240,7 @@ def __init__(self, config: LlavaConfig):
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AutoModelForCausalLM.from_config(config.text_config)
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+
         self.post_init()

     def get_input_embeddings(self):
4 changes: 3 additions & 1 deletion src/transformers/models/llava/processing_llava.py
@@ -157,7 +157,9 @@ def __call__(
             # Replace the image token with the expanded image token sequence
             pixel_values = image_inputs["pixel_values"]
             height, width = get_image_size(to_numpy_array(pixel_values[0]))
-            num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
+            num_image_tokens = (height // self.patch_size) * (
+                width // self.patch_size
+            ) + self.num_additional_image_tokens
             if self.vision_feature_select_strategy == "default":
                 num_image_tokens -= 1
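For context, a minimal sketch of the token arithmetic this hunk changes, using hypothetical CLIP-ViT-L/14-336-style values (336x336 processed images, patch_size=14) and num_additional_image_tokens=1 for the CLS token; the numbers are illustrative, not read from a real checkpoint:

    patch_size = 14                     # hypothetical vision-tower patch size
    num_additional_image_tokens = 1     # e.g. one CLS token
    vision_feature_select_strategy = "default"
    height = width = 336                # hypothetical processed image size

    num_image_tokens = (height // patch_size) * (
        width // patch_size
    ) + num_additional_image_tokens     # 24 * 24 + 1 = 577
    if vision_feature_select_strategy == "default":
        num_image_tokens -= 1           # the default strategy drops the CLS feature
    assert num_image_tokens == 576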
3 changes: 3 additions & 0 deletions src/transformers/models/llava_next/processing_llava_next.py
@@ -155,6 +155,9 @@ def __call__(
             for sample in text:
                 while self.image_token in sample:
                     image_size = next(image_sizes)
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
                     orig_height, orig_width = image_size
                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                     if self.vision_feature_select_strategy == "default":
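A toy illustration of what the new isinstance guard handles, assuming the image sizes arrive as a torch tensor (an assumption; any array type with a .tolist() method behaves similarly). Unpacking a tensor yields 0-dim tensors whose dtype follows into the unpadding math, whereas .tolist() yields exact Python ints:

    import torch

    image_size = torch.tensor([641, 480])   # hypothetical (height, width)

    orig_height, orig_width = image_size     # 0-dim tensors, not plain numbers
    print(type(orig_height))                 # <class 'torch.Tensor'>

    if not isinstance(image_size, (list, tuple)):
        image_size = image_size.tolist()     # plain Python ints from here on
    orig_height, orig_width = image_size
    print(type(orig_height))                 # <class 'int'>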
5 changes: 5 additions & 0 deletions src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -180,6 +180,9 @@ def __call__(
             for sample in text:
                 while self.image_token in sample:
                     image_size = next(image_sizes)
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
                     orig_height, orig_width = image_size
                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                     if self.vision_feature_select_strategy == "default":
@@ -193,6 +196,8 @@ def __call__(
             one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
             height, width = get_image_size(one_video[0])
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
+
+            # no `self.num_additional_image_tokens` added because video always has a default feature selection strategy
             num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
             num_video_tokens = num_image_tokens // 4 * num_frames  # divide by 4 needed for avg pooling layer
             prompt_strings = []
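A worked example of the video-token arithmetic in the second hunk, with hypothetical values (336x336 frames, patch_size=14, an 8-frame clip). No additional tokens are counted because videos always use the default feature selection strategy:

    patch_size = 14        # hypothetical vision-tower patch size
    height = width = 336   # hypothetical frame size
    num_frames = 8         # hypothetical clip length

    num_image_tokens = (height // patch_size) * (width // patch_size)  # 24 * 24 = 576
    num_video_tokens = num_image_tokens // 4 * num_frames              # avg pooling: 144 * 8
    assert num_video_tokens == 1152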
2 changes: 1 addition & 1 deletion src/transformers/models/video_llava/processing_video_llava.py
@@ -179,7 +179,7 @@ def __call__(
             ) + self.num_additional_image_tokens
             num_video_tokens = num_image_tokens * num_frames
             if self.vision_feature_select_strategy == "default":
-                num_image_tokens -= self.num_additional_image_tokens
+                num_image_tokens -= 1

             prompt_strings = []
             for sample in text:
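The fix here: the default strategy always drops exactly one vision feature (the CLS token), however many extra tokens were counted, so the subtraction is a literal 1 rather than self.num_additional_image_tokens. A sketch with hypothetical values, preserving the order from the diff in which num_video_tokens is taken from the unadjusted per-frame count:

    patch_size, num_frames = 14, 8            # hypothetical values
    num_additional_image_tokens = 1           # e.g. one CLS token
    vision_feature_select_strategy = "default"
    height = width = 336

    num_image_tokens = (height // patch_size) * (
        width // patch_size
    ) + num_additional_image_tokens                   # 577
    num_video_tokens = num_image_tokens * num_frames  # 577 * 8 = 4616 (unadjusted count)
    if vision_feature_select_strategy == "default":
        num_image_tokens -= 1                         # drop exactly one feature (CLS)
    assert (num_image_tokens, num_video_tokens) == (576, 4616)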
1 change: 1 addition & 0 deletions src/transformers/models/vipllava/modeling_vipllava.py
@@ -243,6 +243,7 @@ def __init__(self, config: VipLlavaConfig):
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AutoModelForCausalLM.from_config(config.text_config)
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+
         self.post_init()

     def get_input_embeddings(self):
