[WIP] add florence2 #5424

Open · wants to merge 1 commit into base: main
3 changes: 2 additions & 1 deletion README.md
@@ -161,7 +161,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
## Supported Models

| Model | Model size | Template |
|-------------------------------------------------------------------|----------------------------------| --------- |
| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
@@ -188,6 +188,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
| [Florence 2](https://huggingface.co/microsoft/Florence-2-large) | 0.23B/0.77B | florence2 |

> [!NOTE]
> For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
1 change: 1 addition & 0 deletions README_zh.md
@@ -189,6 +189,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
| [Florence 2](https://huggingface.co/microsoft/Florence-2-large) | 0.23B/0.77B | florence2 |

> [!NOTE]
> For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
(Two binary image files, the demo pages under `TF-ID-arxiv-papers_demo_data/`, are added in this PR but cannot be displayed in the diff view.)
32 changes: 32 additions & 0 deletions data/TF-ID-arxiv-papers_example.json
@@ -0,0 +1,32 @@
[
{
"messages": [
{
"content": "<OD>",
"role": "user"
},
{
"content": "table<loc_92><loc_84><loc_911><loc_245>",
"role": "assistant"
}
],
"images": [
"TF-ID-arxiv-papers_demo_data/arxiv_2407_03169_2.png"
]
},
{
"messages": [
{
"content": "<OD>",
"role": "user"
},
{
"content": "table<loc_173><loc_507><loc_837><loc_656>figure<loc_170><loc_80><loc_827><loc_403>",
"role": "assistant"
}
],
"images": [
"TF-ID-arxiv-papers_demo_data/arxiv_2407_02687_7.png"
]
}
]
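For reference, the `<loc_*>` tokens above encode bounding-box coordinates quantized into 1,000 bins, in x1, y1, x2, y2 order. A minimal decoding sketch (bin count and ordering are assumptions taken from Florence-2's post-processing code, not part of this PR):

```python
import re

def locs_to_boxes(text: str, width: int, height: int):
    """Decode Florence-2 <loc_*> tokens into pixel-space boxes.

    Assumes 1,000 coordinate bins and (x1, y1, x2, y2) ordering, as in
    Florence-2's BoxQuantizer; verify against processing_florence2.py.
    """
    bins = [int(b) for b in re.findall(r"<loc_(\d+)>", text)]
    return [
        (x1 / 1000 * width, y1 / 1000 * height,
         x2 / 1000 * width, y2 / 1000 * height)
        for x1, y1, x2, y2 in zip(*[iter(bins)] * 4)
    ]

# The first demo sample above, assuming a 768x768 page image:
print(locs_to_boxes("table<loc_92><loc_84><loc_911><loc_245>", 768, 768))
```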
14 changes: 14 additions & 0 deletions data/dataset_info.json
@@ -622,5 +622,19 @@
"prompt": "content"
},
"folder": "python"
},
"TF-ID-arxiv-papers_example": {
"file_name": "TF-ID-arxiv-papers_example.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
}
}
42 changes: 42 additions & 0 deletions examples/train_full/florence2_full_sft.yaml
@@ -0,0 +1,42 @@
### model
model_name_or_path: microsoft/Florence-2-large-ft

### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z0_config.json

### dataset
dataset: TF-ID-arxiv-papers_example
template: florence2
cutoff_len: 1024
max_samples: 3000
overwrite_cache: true
preprocessing_num_workers: 8

### output
output_dir: saves/florence2-large/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

### train
per_device_train_batch_size: 4
gradient_accumulation_steps: 1
learning_rate: 5.0e-6
num_train_epochs: 5.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
# the florence2 image processor resizes images to 768x768, so do not resize images too small beforehand
image_resolution: 2048
freeze_vision_tower: false

### eval
val_size: 0.01
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
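Assuming LLaMA-Factory's standard entry point, this config would be launched with `llamafactory-cli train examples/train_full/florence2_full_sft.yaml`. The referenced `ds_z0_config.json` applies DeepSpeed ZeRO stage 0, i.e. plain data parallelism across GPUs.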
45 changes: 42 additions & 3 deletions src/llamafactory/data/mm_plugin.py
@@ -14,11 +14,9 @@
from PIL import Image
from PIL.Image import Image as ImageObject


if is_pyav_available():
import av


if TYPE_CHECKING:
import torch
from av.stream import Stream
@@ -147,7 +145,7 @@ def _get_mm_inputs(
r"""
Processes visual inputs.

Returns: (llava, paligemma and florence2)
pixel_values: tensor with shape (B, C, H, W)

Returns: (qwen2-vl)
@@ -417,11 +415,52 @@ def get_mm_inputs(
return self._get_mm_inputs(images, videos, processor)


class Florence2Plugin(BasePlugin):
@override
def process_messages(
self,
messages: Sequence[Dict[str, str]],
images: Sequence["ImageInput"],
videos: Sequence["VideoInput"],
processor: Optional["ProcessorMixin"],
) -> List[Dict[str, str]]:
r"""
Pre-processes input messages before tokenization for VLMs.
"""
# Florence-2's predefined tasks use specific prompt strings: https://huggingface.co/microsoft/Florence-2-base-ft/blob/9803f52844ec1ae5df004e6089262e9a23e527fd/processing_florence2.py#L112
# modeling_florence2.py concatenates the image features with the text features: https://huggingface.co/microsoft/Florence-2-base-ft/blob/9803f52844ec1ae5df004e6089262e9a23e527fd/modeling_florence2.py#L2737
self._validate_input(images, videos)
for message in messages:
if message["role"] == "system":
raise ValueError("florence2 does not support system messages.")

if message["role"] == "user":
message["content"] = processor._construct_prompts([message["content"]])[0]

if len(messages) != 2:
raise ValueError(f"florence2 only support two messages(1 round): {len(messages)}")
return messages

@override
def get_mm_inputs(
self,
images: Sequence["ImageInput"],
videos: Sequence["VideoInput"],
imglens: Sequence[int],
vidlens: Sequence[int],
seqlens: Sequence[int],
processor: Optional["ProcessorMixin"],
) -> Dict[str, Union[List[int], "torch.Tensor"]]:
self._validate_input(images, videos)
return self._get_mm_inputs(images, videos, processor)


PLUGINS = {
"base": BasePlugin,
"llava": LlavaPlugin,
"paligemma": PaliGemmaPlugin,
"qwen2_vl": Qwen2vlPlugin,
"florence2": Florence2Plugin,
}


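To illustrate the plugin's effect on one of the demo samples, a sketch (the expanded prompt wording is assumed from the upstream `processing_florence2.py` and may differ):

```python
# One round of messages, as Florence2Plugin requires:
messages = [
    {"role": "user", "content": "<OD>"},
    {"role": "assistant", "content": "table<loc_92><loc_84><loc_911><loc_245>"},
]

# After process_messages, the user content has been expanded by
# processor._construct_prompts into the task's full prompt, e.g.:
#   {"role": "user", "content": "Locate the objects with category name in the image."}
# The assistant content is left untouched.
```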
11 changes: 8 additions & 3 deletions src/llamafactory/data/processors/supervised.py
@@ -77,8 +77,14 @@ def _encode_supervised_example(
input_ids = source_ids + target_ids + input_ids
labels = source_label + target_label + labels
else:
if template.name == "florence2":
# Florence2Plugin ensures that messages contain exactly one round
# florence2 is an encoder-decoder model
input_ids = source_ids
labels = target_ids
else:
input_ids += source_ids + target_ids
labels += source_label + target_label

if template.efficient_eos:
input_ids += [tokenizer.eos_token_id]
@@ -101,7 +107,6 @@ def preprocess_supervised_dataset(
if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) != 1:
logger.warning("Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i]))
continue

input_ids, labels = _encode_supervised_example(
prompt=examples["_prompt"][i],
response=examples["_response"][i],
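The layout difference the new branch encodes, as a minimal sketch with toy token ids:

```python
IGNORE_INDEX = -100
source_ids = [101, 102, 103]  # prompt tokens
target_ids = [201, 202, 2]    # answer tokens (incl. eos)

# Decoder-only models: one concatenated sequence, prompt masked out of the loss.
input_ids_causal = source_ids + target_ids
labels_causal = [IGNORE_INDEX] * len(source_ids) + target_ids

# Encoder-decoder models such as Florence-2: the prompt feeds the encoder,
# and the decoder is supervised directly on the target.
input_ids_seq2seq = source_ids
labels_seq2seq = target_ids
```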
10 changes: 10 additions & 0 deletions src/llamafactory/data/template.py
@@ -37,6 +37,7 @@

@dataclass
class Template:
name: str
format_user: "Formatter"
format_assistant: "Formatter"
format_system: "Formatter"
@@ -251,6 +252,7 @@ def _register_template(
default_separator_formatter = EmptyFormatter()
default_prefix_formatter = EmptyFormatter()
TEMPLATES[name] = template_class(
name=name,
format_user=format_user or default_user_formatter,
format_assistant=format_assistant or default_assistant_formatter,
format_system=format_system or default_user_formatter,
@@ -984,3 +986,11 @@ def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args:
format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
format_separator=EmptyFormatter(slots=["\n"]),
)


_register_template(
name="florence2",
format_user=StringFormatter(slots=["{{content}}"]),
stop_words=["</s>"],
mm_plugin=get_mm_plugin(name="florence2", image_token=""),
)
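The template is deliberately minimal: `format_user` emits the content unchanged because `Florence2Plugin` already builds the full task prompt via `_construct_prompts`, and `image_token` is empty because Florence-2's modeling code injects image features directly rather than replacing placeholder tokens (see the `image_seqlen` note in `visual.py` below).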
23 changes: 23 additions & 0 deletions src/llamafactory/extras/constants.py
@@ -1736,3 +1736,26 @@ def register_model_group(
},
template="zephyr",
)

register_model_group(
models={
"Florence-2-base": {
DownloadSource.DEFAULT: "microsoft/Florence-2-base",
DownloadSource.MODELSCOPE: "AI-ModelScope/Florence-2-base",
},
"Florence-2-base-ft": {
DownloadSource.DEFAULT: "microsoft/Florence-2-base-ft",
DownloadSource.MODELSCOPE: "AI-ModelScope/Florence-2-base-ft",
},
"Florence-2-large": {
DownloadSource.DEFAULT: "microsoft/Florence-2-large",
DownloadSource.MODELSCOPE: "AI-ModelScope/Florence-2-large",
},
"Florence-2-large-ft": {
DownloadSource.DEFAULT: "microsoft/Florence-2-large-ft",
DownloadSource.MODELSCOPE: "AI-ModelScope/Florence-2-large-ft",
},
},
template="florence2",
vision=True,
)
37 changes: 22 additions & 15 deletions src/llamafactory/model/loader.py
@@ -67,21 +67,28 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
"""
init_kwargs = _get_init_kwargs(model_args)
config = load_config(model_args)
is_florence2 = getattr(config, "model_type", None) == "florence2"

if is_florence2:
# Florence-2 has a custom tokenizer, see:
# https://huggingface.co/microsoft/Florence-2-base-ft/blob/main/processing_florence2.py#L85
tokenizer = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs).tokenizer
else:
try:
tokenizer = AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
use_fast=model_args.use_fast_tokenizer,
split_special_tokens=model_args.split_special_tokens,
padding_side="right",
**init_kwargs,
)
except ValueError: # try the fast one
tokenizer = AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
use_fast=True,
padding_side="right",
**init_kwargs,
)

if model_args.new_special_tokens is not None:
num_added_tokens = tokenizer.add_special_tokens(
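Note that Florence-2 ships custom processor code, so this branch presumably relies on `init_kwargs` carrying `trust_remote_code=True` and on `AutoProcessor` being imported in `loader.py`.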
13 changes: 12 additions & 1 deletion src/llamafactory/model/model_utils/visual.py
@@ -30,7 +30,6 @@

from ...hparams import FinetuningArguments, ModelArguments


logger = get_logger(__name__)
transformers_logger = logging.get_logger(__name__)

@@ -136,6 +135,13 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni
if finetuning_args.train_mm_proj_only:
raise ValueError("Qwen2-VL models do not support `train_mm_proj_only`.")

elif model_type == "florence2":
if finetuning_args.freeze_vision_tower:
forbidden_modules.add("vision_tower")

if finetuning_args.train_mm_proj_only:
raise ValueError("florence2 models do not support `train_mm_proj_only`.")

return forbidden_modules


@@ -152,6 +158,11 @@ def get_image_seqlen(config: "PretrainedConfig") -> int:
image_seqlen = config.vision_config.num_image_tokens
elif model_type == "qwen2_vl": # variable length
image_seqlen = -1
elif model_type == "florence2":
# Florence-2 does not need an image_token placeholder in the prompt, since modeling_florence2.py merges the image features with the text features:
# https://huggingface.co/microsoft/Florence-2-large/blob/39ddb416a9819d9fa1bacad7b7899099ae4b0a59/modeling_florence2.py#L2737
# Hard-coded: image_seqlen cannot be derived from the config.
image_seqlen = 577

return image_seqlen

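The value 577 appears to correspond to a 24x24 feature grid (768 input resolution over DaViT's total stride of 32) plus one pooled token, i.e. 24^2 + 1 = 577, though this is an inference rather than anything the config exposes.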