
Commit b4e097b

Merge remote-tracking branch 'origin/raymond/gelu_act' into distributed_load_debug

2 parents: 74fc4eb + 48fc888

3 files changed: +10 -16 lines

fast_llm/models/multimodal/conversion/llava.py (8 additions, 11 deletions)

@@ -167,7 +167,7 @@ class LlavaVisionAdapterConverter:
     @classmethod
     def import_config(cls, config: dict) -> dict:
         return {
-            "intermediate_size": config["vision_config"]["hidden_size"],
+            "intermediate_size": config["text_config"]["hidden_size"],
             "add_linear_biases": config["multimodal_projector_bias"],
             "gated": False,
             "activation": ActivationType.from_hf_name(config["projector_hidden_act"]),
@@ -183,8 +183,6 @@ def export_config(cls, config: MLPConfig) -> dict:
         return {
             "projector_hidden_act": config.activation.hf_name,
             "multimodal_projector_bias": config.add_linear_biases,
-            # Not in LlavaConfig, but needed for consistency check in LlavaBaseModelConverter.
-            "projector_intermediate_size": config.intermediate_size,
         }

     @classmethod
@@ -243,13 +241,13 @@ def export_config(cls, config: VisionEncoderConfig) -> dict:
     def get_converters(cls, config: VisionEncoderConfig) -> list[WeightConverter]:
         return [
             *cls.embeddings_converter_class.get_converters(
-                config.embeddings, "vision_encoder.embeddings", "model.vision_tower"
+                config.embeddings, "vision_encoder.embeddings", "vision_tower"
             ),
             *cls.encoder_converter_class.get_converters(
-                config.encoder, "vision_encoder.encoder", "model.vision_tower.transformer.layers"
+                config.encoder, "vision_encoder.encoder", "vision_tower.transformer.layers"
             ),
             *cls.vision_adapter_converter_class.get_converters(
-                config.adapter, "vision_encoder.adapter", "model.multi_modal_projector"
+                config.adapter, "vision_encoder.adapter", "multi_modal_projector"
             ),
         ]

@@ -266,11 +264,11 @@ def get_converters(
             *cls.normalization_converter_class.get_converters(
                 config.normalization,
                 f"{fast_llm_prefix}.final_norm",
-                f"model.language_model.norm",
+                f"language_model.model.norm",
             ),
             get_parameter_converter(
                 f"{fast_llm_prefix}.output_weights",
-                "lm_head.weight",
+                "language_model.lm_head.weight",
                 drop_on_import=exported_config["tie_word_embeddings"],
             ),
         ]
@@ -309,18 +307,17 @@ def export_config(cls, config: MultiModalBaseModelConfig) -> dict:
                 "vision_feature_layer": -1,
             },
         )
-        Assert.eq(out.pop("projector_intermediate_size"), out["text_config"]["hidden_size"])
         return out

     @classmethod
     def get_converters(cls, config: MultiModalBaseModelConfig, exported_config: dict) -> list[WeightConverter]:
         return [
             *cls.vision_model_converter_class.get_converters(config.vision_encoder),
             *cls.language_model_converter_class.embeddings_converter_class.get_converters(
-                config.embeddings, "embeddings", "model.language_model"
+                config.embeddings, "embeddings", "language_model.model"
             ),
             *cls.language_model_converter_class.decoder_converter_class.get_converters(
-                config.decoder, "decoder", "model.language_model.layers"
+                config.decoder, "decoder", "language_model.model.layers"
             ),
             *cls.language_model_converter_class.head_converter_class.get_converters(
                 config.head, {"tie_word_embeddings": False}, "head"
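
Taken together, the converter changes above re-root the HF-side parameter names: the vision weights drop the leading "model." prefix, and the language-model weights move under "language_model.model" / "language_model.lm_head". As a rough, hypothetical illustration only (the actual conversion is driven by the WeightConverter classes, not by string rewriting):

```python
# Hypothetical sketch of the checkpoint-key renaming implied by the converter
# changes above; the prefix table mirrors the strings shown in the diff.
OLD_TO_NEW_HF_PREFIX = {
    "model.vision_tower": "vision_tower",
    "model.multi_modal_projector": "multi_modal_projector",
    "model.language_model.layers": "language_model.model.layers",
    "model.language_model.norm": "language_model.model.norm",
    "model.language_model": "language_model.model",
    "lm_head.weight": "language_model.lm_head.weight",
}


def remap_key(key: str) -> str:
    """Rewrite one old-style checkpoint key to the new naming, longest prefix first."""
    for old, new in sorted(OLD_TO_NEW_HF_PREFIX.items(), key=lambda item: -len(item[0])):
        if key == old or key.startswith(old + "."):
            return new + key[len(old):]
    return key


if __name__ == "__main__":
    print(remap_key("model.language_model.layers.0.mlp.up_proj.weight"))
    # language_model.model.layers.0.mlp.up_proj.weight
    print(remap_key("lm_head.weight"))
    # language_model.lm_head.weight
```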

fast_llm_external_models/llava_hybrid/configuration_llava_hybrid.py (0 additions, 3 deletions)

@@ -59,7 +59,6 @@ def __init__(
         text_config=None,
         image_token_index=32000,
         projector_hidden_act="gelu",
-        projector_intermediate_size=4096,
         vision_feature_select_strategy="default",
         vision_feature_layer=-2,
         image_seq_length=576,
@@ -68,8 +67,6 @@ def __init__(
     ):
         self.image_token_index = image_token_index
         self.projector_hidden_act = projector_hidden_act
-        # projector_intermediate_size is an addition to the original Llava config
-        self.projector_intermediate_size = projector_intermediate_size
         self.image_seq_length = image_seq_length

         if vision_feature_select_strategy not in ["default", "full"]:
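
With projector_intermediate_size gone from the config, the projector width is no longer an independent knob; it is implied by the text model's hidden size. A minimal, hypothetical sketch of that relationship (not the real LlavaHybridConfig API):

```python
# Illustrative stand-in config: only the fields relevant to the projector width
# are kept, and the names below are simplified for the sketch.
from dataclasses import dataclass, field


@dataclass
class TextConfigSketch:
    hidden_size: int = 4096


@dataclass
class LlavaHybridConfigSketch:
    text_config: TextConfigSketch = field(default_factory=TextConfigSketch)
    image_token_index: int = 32000
    projector_hidden_act: str = "gelu"
    image_seq_length: int = 576

    @property
    def projector_width(self) -> int:
        # Previously a separate projector_intermediate_size (default 4096);
        # now it always equals the text model's hidden size.
        return self.text_config.hidden_size


if __name__ == "__main__":
    print(LlavaHybridConfigSketch().projector_width)  # 4096
```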

fast_llm_external_models/llava_hybrid/modeling_llava_hybrid.py (2 additions, 2 deletions)

@@ -22,12 +22,12 @@ def __init__(self, config: LlavaHybridConfig):
         num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
         self.linear_1 = nn.Linear(
             config.vision_config.hidden_size * num_feature_layers,
-            config.projector_intermediate_size,
+            config.text_config.hidden_size,
             bias=config.multimodal_projector_bias,
         )
         self.act = ACT2FN[config.projector_hidden_act]
         self.linear_2 = nn.Linear(
-            config.projector_intermediate_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
         )

     def forward(self, image_features):
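
After this change, linear_1 projects vision features directly to the text hidden size and linear_2 keeps that width, matching the config simplification above. A self-contained sketch with illustrative sizes, using nn.GELU in place of the ACT2FN["gelu"] lookup:

```python
# Minimal sketch of the projector shape after this commit; sizes are examples.
import torch
import torch.nn as nn


class MultiModalProjectorSketch(nn.Module):
    def __init__(self, vision_hidden_size: int, text_hidden_size: int, bias: bool = True):
        super().__init__()
        # vision features -> text hidden size (was projector_intermediate_size)
        self.linear_1 = nn.Linear(vision_hidden_size, text_hidden_size, bias=bias)
        self.act = nn.GELU()
        # stays at the text hidden size (was projector_intermediate_size -> text hidden size)
        self.linear_2 = nn.Linear(text_hidden_size, text_hidden_size, bias=bias)

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        return self.linear_2(self.act(self.linear_1(image_features)))


if __name__ == "__main__":
    projector = MultiModalProjectorSketch(vision_hidden_size=1024, text_hidden_size=4096)
    print(projector(torch.randn(1, 576, 1024)).shape)  # torch.Size([1, 576, 4096])
```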
