@@ -167,7 +167,7 @@ class LlavaVisionAdapterConverter:
167167 @classmethod
168168 def import_config (cls , config : dict ) -> dict :
169169 return {
170-         "intermediate_size": config["vision_config"]["hidden_size"],
170+         "intermediate_size": config["text_config"]["hidden_size"],
171171 "add_linear_biases" : config ["multimodal_projector_bias" ],
172172 "gated" : False ,
173173 "activation" : ActivationType .from_hf_name (config ["projector_hidden_act" ]),
@@ -183,8 +183,6 @@ def export_config(cls, config: MLPConfig) -> dict:
183183 return {
184184 "projector_hidden_act" : config .activation .hf_name ,
185185 "multimodal_projector_bias" : config .add_linear_biases ,
186- # Not in LlavaConfig, but needed for consistency check in LlavaBaseModelConverter.
187- "projector_intermediate_size" : config .intermediate_size ,
188186 }
189187
190188 @classmethod
@@ -243,13 +241,13 @@ def export_config(cls, config: VisionEncoderConfig) -> dict:
243241 def get_converters (cls , config : VisionEncoderConfig ) -> list [WeightConverter ]:
244242 return [
245243 * cls .embeddings_converter_class .get_converters (
246-             config.embeddings, "vision_encoder.embeddings", "model.vision_tower"
244+             config.embeddings, "vision_encoder.embeddings", "vision_tower"
247245 ),
248246 * cls .encoder_converter_class .get_converters (
249-             config.encoder, "vision_encoder.encoder", "model.vision_tower.transformer.layers"
247+             config.encoder, "vision_encoder.encoder", "vision_tower.transformer.layers"
250248 ),
251249 * cls .vision_adapter_converter_class .get_converters (
252-             config.adapter, "vision_encoder.adapter", "model.multi_modal_projector"
250+             config.adapter, "vision_encoder.adapter", "multi_modal_projector"
253251 ),
254252 ]
255253
@@ -266,11 +264,11 @@ def get_converters(
266264 * cls .normalization_converter_class .get_converters (
267265 config .normalization ,
268266 f"{ fast_llm_prefix } .final_norm" ,
269-             f"model.language_model.norm",
267+             f"language_model.model.norm",
270268 ),
271269 get_parameter_converter (
272270 f"{ fast_llm_prefix } .output_weights" ,
273-             "lm_head.weight",
271+             "language_model.lm_head.weight",
274272 drop_on_import = exported_config ["tie_word_embeddings" ],
275273 ),
276274 ]
@@ -309,18 +307,17 @@ def export_config(cls, config: MultiModalBaseModelConfig) -> dict:
309307 "vision_feature_layer" : - 1 ,
310308 },
311309 )
312- Assert .eq (out .pop ("projector_intermediate_size" ), out ["text_config" ]["hidden_size" ])
313310 return out
314311
315312 @classmethod
316313 def get_converters (cls , config : MultiModalBaseModelConfig , exported_config : dict ) -> list [WeightConverter ]:
317314 return [
318315 * cls .vision_model_converter_class .get_converters (config .vision_encoder ),
319316 * cls .language_model_converter_class .embeddings_converter_class .get_converters (
320-             config.embeddings, "embeddings", "model.language_model"
317+             config.embeddings, "embeddings", "language_model.model"
321318 ),
322319 * cls .language_model_converter_class .decoder_converter_class .get_converters (
323-             config.decoder, "decoder", "model.language_model.layers"
320+             config.decoder, "decoder", "language_model.model.layers"
324321 ),
325322 * cls .language_model_converter_class .head_converter_class .get_converters (
326323 config .head , {"tie_word_embeddings" : False }, "head"
0 commit comments