
Commit 11eca61: rename ppo -> mpoppo
Parent: 0371df4

12,567 files changed: 113 additions and 106 deletions

README.md

Lines changed: 10 additions & 10 deletions
@@ -3,18 +3,18 @@
 
 ## What’s Been Implemented?
 
-- **Main script** for launching MPO training on top of PPO: `examples/scripts/mpo.py`
-- **`MPOTrainer`**: Located in `trl/trainer/mpo_trainer.py`, this extends `PPOTrainer` to implement the full MPO procedure as described in the paper.
-- **`MPOConfig`**: Defined in `trl/trainer/mpo_config.py`, this contains all hyperparameters for MPO training.
-- **Processed corpora** for four tasks (essay writing, summarization, ethical reasoning, and mathematical reasoning) are provided in `trl/extras/mpo/corpora`.
-- **Initial prompts and meta-prompts** for each task are located in `trl/extras/mpo/prompts`.
-- **LLM-based reward models (RMs)** and **meta-reward models (MRMs)** are implemented in task-specific files under `trl/extras/mpo/rm_{task_name}.py`, and dataset loading/processing is handled in `trl/extras/mpo/mpo_datasets.py`.
-- **Utility functions** for MPO training are implemented in `trl/trainer/utils.py`.
+- **Main script** for launching MPOPPO training on top of PPO: `examples/scripts/mpoppo.py`
+- **`MPOPPOTrainer`**: Located in `trl/trainer/mpoppo_trainer.py`, this extends `PPOTrainer` to implement the full MPO procedure as described in the paper.
+- **`MPOPPOConfig`**: Defined in `trl/trainer/mpoppo_config.py`, this contains all hyperparameters for MPOPPO training.
+- **Processed corpora** for four tasks (essay writing, summarization, ethical reasoning, and mathematical reasoning) are provided in `trl/extras/mpoppo/corpora`.
+- **Initial prompts and meta-prompts** for each task are located in `trl/extras/mpoppo/prompts`.
+- **LLM-based reward models (RMs)** and **meta-reward models (MRMs)** are implemented in task-specific files under `trl/extras/mpoppo/rm_{task_name}.py`, and dataset loading/processing is handled in `trl/extras/mpoppo/mpoppo_datasets.py`.
+- **Utility functions** for MPOPPO training are implemented in `trl/trainer/utils.py`.
 - **Additional scripts** for launching remote LLM servers and evaluating trained models are provided in `scripts/mpo_experiments`.
 
 ## Installation & Execution Requirements
 
-- Running MPO requires two components:
+- Running MPOPPO requires two components:
 1. **A primary node or subset of GPUs** dedicated to RL training.
 2. **A separate node or the remaining GPUs** dedicated to serving reward scores in an online fashion.
 - For the former, install this repository using `virtualenv` and `uv` (recommended for clean and reproducible environments):
@@ -36,8 +36,8 @@
 $ uv pip install vllm==0.8.4
 ```
 - Refer to the [SGLang documentation](https://docs.sglang.ai/) for more details.
-- Training start and end notifications are currently sent via [Pushover](https://pushover.net/api). If you do not wish to use this feature, you can simply comment out the relevant lines in the launch script: `examples/scripts/mpo.py`.
-- The `launch_mpo.sh` script in `scripts/mpo_experiments` demonstrates how to train models using MPO with different parameter configurations.
+- Training start and end notifications are currently sent via [Pushover](https://pushover.net/api). If you do not wish to use this feature, you can simply comment out the relevant lines in the launch script: `examples/scripts/mpoppo.py`.
+- The `launch_mpoppo.sh` script in `scripts/mpo_experiments` demonstrates how to train models using MPOPPO with different parameter configurations.
 - The `launch_rm_mrm.sh` script in `scripts/mpo_experiments` shows how to instantiate and serve LLMs via SGLang over an SSH connection.
 
 Below is the README from trl repository.
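
For orientation, the snippet below is a minimal sketch of the renamed entry points described in the README above, assuming the patched `trl` from this repository is installed. It only covers argument parsing; the policy model, value head, reward-model endpoints, and datasets are wired up in `examples/scripts/mpoppo.py` and omitted here.

```python
# Minimal sketch (not the full training script): parse MPOPPO arguments the same
# way examples/scripts/mpoppo.py does. Expects the usual CLI flags (e.g. --output_dir).
from transformers import HfArgumentParser

from trl import ModelConfig, MPOPPOConfig, ScriptArguments

parser = HfArgumentParser((ScriptArguments, MPOPPOConfig, ModelConfig))
script_args, training_args, model_args = parser.parse_args_into_dataclasses()

# training_args is an MPOPPOConfig instance; it carries the MPOPPO hyperparameters
# (e.g. num_mpo_interval, num_mpo_samples) and is what MPOPPOTrainer consumes.
print(type(training_args).__name__)
```

The full construction of the policy (`AutoModelForCausalLMWithValueHead`) and of `MPOPPOTrainer` appears in the script diff that follows.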
Lines changed: 12 additions & 5 deletions
@@ -9,14 +9,21 @@
 from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
 
-from trl import ModelConfig, MPOConfig, MPOTrainer, ScriptArguments, get_kbit_device_map, get_quantization_config
-from trl.extras.mpo import get_task_dataset
+from trl import (
+    MPOPPOConfig,
+    MPOPPOTrainer,
+    ModelConfig,
+    ScriptArguments,
+    get_kbit_device_map,
+    get_quantization_config,
+)
+from trl.extras.mpoppo import get_task_dataset
 from trl.models.modeling_value_head import AutoModelForCausalLMWithValueHead
 from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
 """
-See launch script in scripts/mpo_experiments/launch_mpo.sh
+See launch script in scripts/mpo_experiments/launch_mpoppo.sh
 """
 
 
@@ -38,7 +45,7 @@ def seed_everything(seed: int = 42):
 pushover = Pushover(user=os.environ["PUSHOVER_USER"], token=os.environ["PUSHOVER_TOKEN"])
 
 seed_everything(42)
-parser = HfArgumentParser((ScriptArguments, MPOConfig, ModelConfig))
+parser = HfArgumentParser((ScriptArguments, MPOPPOConfig, ModelConfig))
 script_args, training_args, model_args = parser.parse_args_into_dataclasses()
 if os.path.exists(training_args.output_dir):
     raise ValueError(
@@ -149,7 +156,7 @@ def seed_everything(seed: int = 42):
     sound="magic",
 )
 try:
-    trainer = MPOTrainer(
+    trainer = MPOPPOTrainer(
         args=training_args,
         processing_class=tokenizer,
         model=policy,

scripts/mpo_experiments/dgx/launch_sglang_dgx.sbatch

Lines changed: 7 additions & 7 deletions
@@ -7,7 +7,7 @@
 #SBATCH --mem=100GB
 #SBATCH --gpus-per-node=8
 #SBATCH --cpus-per-task=32
-#SBATCH -t 72:00:00
+#SBATCH -t 96:00:00
 #SBATCH -o %x.%j.out
 
 set -euo pipefail
@@ -17,20 +17,20 @@ set -euo pipefail
 ##############################
 
 # Your project root
-export MPO_ROOT="/lustre/fs0/scratch/zkim/Development/mpo-old"
+export MPOPPO_ROOT="/lustre/fs0/scratch/zkim/Development/mpo-old"
 
 # Load .env (expects a line like: HF_TOKEN=xxxx)
-if [ -f "${MPO_ROOT}/.env" ]; then
+if [ -f "${MPOPPO_ROOT}/.env" ]; then
   # Export variables defined inside .env
   set -a
   # shellcheck source=/dev/null
-  source "${MPO_ROOT}/.env"
+  source "${MPOPPO_ROOT}/.env"
   set +a
 else
-  echo "[WARN] ${MPO_ROOT}/.env not found; HF_TOKEN may be unset" >&2
+  echo "[WARN] ${MPOPPO_ROOT}/.env not found; HF_TOKEN may be unset" >&2
 fi
 
-: "${HF_TOKEN:?HF_TOKEN must be set in ${MPO_ROOT}/.env}"
+: "${HF_TOKEN:?HF_TOKEN must be set in ${MPOPPO_ROOT}/.env}"
 
 # Container image (your SQSH)
 export SGLANG_IMAGE="/lustre/fs0/scratch/zkim/sqsh-files/lmsysorg+sglang+latest.sqsh"
@@ -140,4 +140,4 @@ echo " curl http://localhost:30000/v1/chat/completions ..."
 echo
 
 # Keep job alive as long as servers run
-wait "${RM_PID}" "${MRM_PID}"
+wait "${RM_PID}" "${MRM_PID}"

scripts/mpo_experiments/dgx/launch_train.sbatch

Lines changed: 6 additions & 6 deletions
@@ -24,23 +24,23 @@ mrm_address=$2
 ###############################################################################
 
 # Point to this repo (overrides the default in mpoppo_train.sh)
-export MPO_ROOT="/lustre/fs0/scratch/zkim/Development/mpo-old"
+export MPOPPO_ROOT="/lustre/fs0/scratch/zkim/Development/mpo-old"
 
 # Load secrets (e.g., HF_TOKEN) if present
-if [ -f "${MPO_ROOT}/.env" ]; then
+if [ -f "${MPOPPO_ROOT}/.env" ]; then
   set -a
   # shellcheck source=/dev/null
-  source "${MPO_ROOT}/.env"
+  source "${MPOPPO_ROOT}/.env"
   set +a
 fi
 
-: "${HF_TOKEN:?HF_TOKEN must be set (place it in ${MPO_ROOT}/.env)}"
+: "${HF_TOKEN:?HF_TOKEN must be set (place it in ${MPOPPO_ROOT}/.env)}"
 
-cd "${MPO_ROOT}"
+cd "${MPOPPO_ROOT}"
 
 # Force single-node, 8 GPU layout
 export CUDA_DEVICES_OVERRIDE="${CUDA_DEVICES_OVERRIDE:-0,1,2,3,4,5,6,7}"
-export ACC_CONFIG_OVERRIDE="${ACC_CONFIG_OVERRIDE:-${MPO_ROOT}/examples/accelerate_configs/deepspeed_zero2.yaml}"
+export ACC_CONFIG_OVERRIDE="${ACC_CONFIG_OVERRIDE:-${MPOPPO_ROOT}/examples/accelerate_configs/deepspeed_zero2.yaml}"
 
 echo "Job ID: ${SLURM_JOB_ID}"
 echo "Node list: ${SLURM_NODELIST}"

scripts/mpo_experiments/dgx/mpoppo_train.sh

Lines changed: 7 additions & 7 deletions
@@ -25,21 +25,21 @@ prompt="evaluation_rubric_real_iter_0.txt"
 # Paths & constants
 ###############################################################################
 
-# Use MPO_ROOT if set; fallback to your explicit path
-trl_dir="${MPO_ROOT:-/lustre/fs0/scratch/zkim/Development/mpo}"
-SCRIPT="$trl_dir/examples/scripts/mpo.py"
+# Use MPOPPO_ROOT if set; fallback to your explicit path
+trl_dir="${MPOPPO_ROOT:-/lustre/fs0/scratch/zkim/Development/mpo-old}"
+SCRIPT="$trl_dir/examples/scripts/mpoppo.py"
 
 WANDB_ENTITY="iterater"
 WANDB_PROJECT="mpoppo-new"
 DATASET="essay_writing"
 TASK="essay_writing"
-PROMPT_DIR="$trl_dir/trl/extras/mpo/prompts/essay_writing"
+PROMPT_DIR="$trl_dir/trl/extras/mpoppo/prompts/essay_writing"
 
 ###############################################################################
 # Main runner
 ###############################################################################
 run_experiment() {
-  local exp_type=$1     # mpogrpo / ppo …
+  local exp_type=$1     # mpoppo / mpogrpo / ppo …
   local rubric_type=$2  # e.g. iter0
   local rm_params=$3    # reward-model size
   local mrm_params=$4   # meta-reward-model size
@@ -70,7 +70,7 @@ run_experiment() {
   # gradient accumulation scaling
   local grad_acc_steps=16
 
-  # MPOGRPO interval
+  # MPOPPO/MPOGRPO interval
   local num_mpo_interval=99999999
   [[ "$exp_type" == "mpogrpo" || "$exp_type" == "mpoppo" ]] && num_mpo_interval=2
 
@@ -106,7 +106,7 @@ run_experiment() {
     --learning_rate 3e-6 \
     --num_ppo_epochs 4 \
     --num_mpo_interval "$num_mpo_interval" \
-    --save_n_updates 20 \
+    --save_n_updates 2 \
     --num_mpo_samples 10 \
     --num_mini_batches 1 \
     --per_device_train_batch_size 2 \

scripts/mpo_experiments/elo_simulation.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 
 load_dotenv()  # take environment variables from .env.
 
-exp_name = sys.argv[1]  # "mpo_variations" "rm_32" "rm_72" "32b_32bvs32b_72b" ""72b_32bvs72b_72b""
+exp_name = sys.argv[1]  # "mpoppo_variations" "rm_32" "rm_72" "32b_32bvs32b_72b" ""72b_32bvs72b_72b""
 num_matches = 2000
 task_name = "summarization"  # "essay_writing"
 print(f"exp_name is: {exp_name}")
@@ -36,7 +36,7 @@
         "base-1.5b": "ModelD",
         "iter0-72b": "ModelE",
     }
-elif exp_name == "mpo_variations":
+elif exp_name == "mpoppo_variations":
     model_names_to_annon = {
         "32b_32b": "ModelA",
         "32b_72b": "ModelB",
@@ -53,7 +53,7 @@
         "72b_32b": "ModelA",
         "72b_72b": "ModelB",
     }
-elif exp_name == "mpo_vs_oracle":
+elif exp_name == "mpoppo_vs_oracle":
     model_names_to_annon = {
         "32b_72b": "ModelB",
         "72b_72b": "ModelD",
Lines changed: 10 additions & 10 deletions
@@ -21,20 +21,20 @@ remote_host=$4 # default unchanged
 ###############################################################################
 # Paths & constants
 ###############################################################################
-trl_dir="$HOME/Development/trl"
-SCRIPT="$trl_dir/examples/scripts/mpo.py"
+trl_dir="${MPOPPO_ROOT:-$HOME/Development/trl}"
+SCRIPT="$trl_dir/examples/scripts/mpoppo.py"
 
 WANDB_ENTITY="iterater"
-WANDB_PROJECT="mpo-new"
+WANDB_PROJECT="mpoppo-new"
 DATASET="math_reasoning"
 TASK="math_reasoning"
-PROMPT_DIR="$trl_dir/trl/extras/mpo/prompts/math_reasoning"
+PROMPT_DIR="$trl_dir/trl/extras/mpoppo/prompts/math_reasoning"
 
 ###############################################################################
 # Main runner
 ###############################################################################
 run_experiment() {
-  local exp_type=$1     # mpo / ppo …
+  local exp_type=$1     # mpoppo / ppo …
   local rubric_type=$2  # e.g. iter0
   local rm=$3           # reward-model size (e.g. 1.5b)
   local mrm=$4          # meta-reward-model size (e.g. 3b)
@@ -51,7 +51,7 @@ run_experiment() {
   # ------------------------------------------------------------------------
   local policy_model="policy-1.5b"
   local model_name
-  if [[ "$exp_type" == "mpo" ]]; then
+  if [[ "$exp_type" == "mpoppo" ]]; then
     model_name="${rubric_type}-${rm}_${mrm}"
   else
     model_name="${rubric_type}-${rm}"
@@ -67,9 +67,9 @@ run_experiment() {
   # gradient accumulation scaling
   local grad_acc_steps=8
 
-  # MPO interval
+  # MPOPPO interval
   local num_mpo_interval=99999999
-  [[ "$exp_type" == "mpo" ]] && num_mpo_interval=20
+  [[ "$exp_type" == "mpoppo" ]] && num_mpo_interval=20
 
   local _mrm_address=$mrm_address
   [[ $rm == $mrm ]] && _mrm_address=$rm_address
@@ -132,7 +132,7 @@ run_experiment() {
 ###############################################################################
 # Sweep
 ###############################################################################
-exp_type="mpo"
+exp_type="mpoppo"
 rubric_type="iter0"
 prompt="evaluation_rubric_real_iter_0.txt"
 # rubric_type="autoprompt"
@@ -146,4 +146,4 @@ for rm in "${rms[@]}"; do
     # run_experiment "$exp_type" "$rubric_type" "$rm" "$mrm" "$prompt"
     run_experiment "$exp_type" "$rubric_type" "$rm" "$rm" "$prompt"
   done
-done
+done

scripts/mpo_experiments/llm_generations.py

Lines changed: 3 additions & 3 deletions
@@ -13,8 +13,8 @@
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, HfArgumentParser, TrainingArguments
 
-from trl.extras.mpo import get_task_dataset
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE, MPODataCollatorWithPadding, generate
+from trl.extras.mpoppo import get_task_dataset
+from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE, MPOPPODataCollatorWithPadding, generate
 
 
 """Example
@@ -153,7 +153,7 @@ class InferenceConfig(TrainingArguments):
     dataloader = DataLoader(
         dataset,
         batch_size=args.batch_size,
-        collate_fn=MPODataCollatorWithPadding(tokenizer),
+        collate_fn=MPOPPODataCollatorWithPadding(tokenizer),
         drop_last=False,
     )
 
trl/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -72,8 +72,8 @@
     "LogCompletionsCallback",
     "MergeModelCallback",
     "ModelConfig",
-    "MPOConfig",
-    "MPOTrainer",
+    "MPOPPOConfig",
+    "MPOPPOTrainer",
     "NashMDConfig",
     "NashMDTrainer",
     "OnlineDPOConfig",
@@ -169,8 +169,8 @@
     LogCompletionsCallback,
     MergeModelCallback,
     ModelConfig,
-    MPOConfig,
-    MPOTrainer,
+    MPOPPOConfig,
+    MPOPPOTrainer,
     NashMDConfig,
     NashMDTrainer,
     OnlineDPOConfig,

trl/extras/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -19,10 +19,12 @@
 
 _import_structure = {
     "best_of_n_sampler": ["BestOfNSampler"],
+    "mpoppo": [],
 }
 
 if TYPE_CHECKING:
     from .best_of_n_sampler import BestOfNSampler
+    from .mpoppo import *  # noqa: F401,F403
 else:
     import sys
 
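
A quick way to verify the renamed public surface after this commit; this is a usage sketch that assumes the patched repository is installed, not part of the diff.

```python
# Sanity check: the renamed classes are exported from trl, and the new
# trl.extras.mpoppo subpackage (registered above) resolves and exposes
# get_task_dataset, which the training and generation scripts import.
import importlib

from trl import MPOPPOConfig, MPOPPOTrainer

print(MPOPPOConfig.__name__, MPOPPOTrainer.__name__)
mpoppo_extras = importlib.import_module("trl.extras.mpoppo")
print(hasattr(mpoppo_extras, "get_task_dataset"))  # expected: True
```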