diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
index 39f60de40b..38224af803 100644
--- a/docs/source/paper_index.md
+++ b/docs/source/paper_index.md
@@ -5,8 +5,25 @@
 ## Group Relative Policy Optimization
 
-Papers relating to the [`GRPOTrainer`]
+### DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
+**📜 Paper**: https://huggingface.co/papers/2402.03300
+
+Introduces **GRPO** and shows strong math-reasoning gains from math-centric pretraining combined with group-relative, PPO-style optimization.
+
+**Used in TRL via:** [`GRPOTrainer`]
+
+```python
+# Minimal GRPO setup (mirrors the style used for other papers on this page).
+from trl import GRPOConfig
+
+training_args = GRPOConfig(
+    loss_type="grpo",
+    epsilon=0.2,                    # PPO-style clip range (adjust if you mirror the paper's settings)
+    gradient_accumulation_steps=1,
+    num_generations=8,              # completions per prompt (adjust to your compute)
+    max_prompt_length=1024,
+    max_completion_length=1024,
+)
+```
 
 ### Group Sequence Policy Optimization
 
 **📜 Paper**: https://huggingface.co/papers/2507.18071
 
@@ -479,6 +496,88 @@ training_args = DPOConfig(
 These parameters only appear in the [published version](https://aclanthology.org/2025.tacl-1.22.pdf)
 
+### Statistical Rejection Sampling Improves Preference Optimization
+**📜 Paper**: https://huggingface.co/papers/2309.06657
+
+Proposes **RSO**, which selects stronger preference pairs via statistical rejection sampling to improve offline preference optimization; it complements DPO and SLiC.
+```python
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import DPOConfig, DPOTrainer
+
+# 1) Model & tokenizer (keep the example self-contained)
+model = AutoModelForCausalLM.from_pretrained("...")
+tokenizer = AutoTokenizer.from_pretrained("...")
+
+# 2) Load and (optionally) filter your training data
+train_dataset = load_dataset(...)  # e.g., ("json", data_files="rso_pairs.jsonl")
+
+def rso_accept(example):  # replace with your actual rejection-sampling filter/score logic
+    return example.get("rso_keep", True)
+
+train_dataset = train_dataset.filter(rso_accept)
+
+# 3) Configure DPO
+training_args = DPOConfig(
+    loss_type="sigmoid",
+    beta=0.1,
+)
+
+# 4) Train
+trainer = DPOTrainer(
+    model=model,
+    args=training_args,
+    processing_class=tokenizer,
+    train_dataset=train_dataset,
+)
+trainer.train()
+```
+
+### Nash Learning from Human Feedback
+**📜 Paper**: https://huggingface.co/papers/2312.00886
+
+Frames alignment as a **two-player game** and learns Nash policies from human feedback; it connects to multi-objective and competitive preference training. TRL's [`NashMDTrainer`] implements Nash-MD from this line of work; the sketch below trains a plain DPO policy as one side of the game-theoretic setup.
+```python
+# Train a DPO policy (one side of the game-theoretic setup)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import DPOConfig, DPOTrainer
+
+model = AutoModelForCausalLM.from_pretrained("...")
+tok = AutoTokenizer.from_pretrained("...")
+
+args = DPOConfig()
+trainer = DPOTrainer(model=model, args=args, processing_class=tok, train_dataset=...)
+trainer.train()
+```
+
+### Direct Language Model Alignment from Online AI Feedback
+**📜 Paper**: https://huggingface.co/papers/2402.04792
+
+Uses **online AI feedback (OAIF)** to supply real-time preference signals, improving direct alignment beyond purely offline pairs; TRL's [`OnlineDPOTrainer`] with a judge provides a ready-made pipeline in this spirit.
+```python
+# Sketch: collect online AI feedback -> extend the DPO dataset -> train
+from datasets import Dataset, concatenate_datasets
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import DPOConfig, DPOTrainer
+
+policy = AutoModelForCausalLM.from_pretrained("...")
+tok = AutoTokenizer.from_pretrained("...")
+
+def ai_feedback_score(prompt, response) -> float:
+    # plug a judge / reward model / heuristic here (return a scalar)
+    ...
+
+# `prompts_ds` (prompts plus baseline responses) and `dpo_pairs` (existing preference
+# pairs) are placeholders for your own datasets.
+new_pairs = []
+for ex in prompts_ds:
+    out = policy.generate(**tok(ex["prompt"], return_tensors="pt").to(policy.device), max_new_tokens=256)
+    resp = tok.decode(out[0], skip_special_tokens=True)
+    # use the AI feedback score to decide which response is chosen and which is rejected
+    if ai_feedback_score(ex["prompt"], resp) >= ai_feedback_score(ex["prompt"], ex["baseline"]):
+        chosen, rejected = resp, ex["baseline"]
+    else:
+        chosen, rejected = ex["baseline"], resp
+    new_pairs.append({"prompt": ex["prompt"], "chosen": chosen, "rejected": rejected})
+
+augmented_pairs = concatenate_datasets([dpo_pairs, Dataset.from_list(new_pairs)])
+
+training_args = DPOConfig()
+trainer = DPOTrainer(model=policy, args=training_args, processing_class=tok, train_dataset=augmented_pairs)
+trainer.train()
+```
 
 ## Kahneman–Tversky Optimization
 
 Papers relating to the [`KTOTrainer`]
@@ -557,6 +656,43 @@ SFTConfig(
     per_device_train_batch_size=8,
     gradient_accumulation_steps=32,
 )
+```
+
+## Parameter-Efficient Fine-Tuning (PEFT)
+
+### LoRA: Low-Rank Adaptation of Large Language Models
+**📜 Paper**: https://huggingface.co/papers/2106.09685
+
+Parameter-efficient fine-tuning via **low-rank adapters**, cutting trainable parameters and memory while preserving quality (see the PEFT integration in TRL).
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import SFTTrainer, SFTConfig
+from peft import LoraConfig
+
+# minimal, self-contained setup
+model_id = "meta-llama/Llama-3.1-8B-Instruct"  # any causal LM
+tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+
+# only the essential PEFT info; keep the rest at defaults
+peft_cfg = LoraConfig(task_type="CAUSAL_LM")
+# If a model needs explicit targets, uncomment and adjust:
+# peft_cfg = LoraConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "k_proj", "v_proj", "o_proj"])
+
+training_args = SFTConfig(  # lean config; defaults otherwise
+    max_length=2048,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=8,
+)
+
+trainer = SFTTrainer(
+    model=model,
+    args=training_args,
+    processing_class=tok,
+    peft_config=peft_cfg,
+    train_dataset=...,  # your dataset
+)
+trainer.train()
+
 ```
 
 ## Reinforce Leave-One-Out
@@ -654,6 +790,101 @@ def add_margin(example):
 dataset = dataset.map(add_margin)
 ```
 
+### Solving Math Word Problems with Process- and Outcome-Based Feedback
+**📜 Paper**: https://huggingface.co/papers/2211.14275
+
+Shows the benefits of **process supervision** (step-level feedback) alongside outcome labels for math reasoning, motivating richer reward signals for alignment.
+```python
+# Combine step-level (process) and final-answer (outcome) rewards.
+# GRPO reward functions receive the batched completions plus any extra dataset
+# columns as keyword arguments and return one float per completion.
+from trl import GRPOConfig, GRPOTrainer
+
+def fraction_of_verified_steps(completion) -> float:
+    # step-level verifier (task-specific); e.g., share of reasoning steps that check out
+    ...
+
+def final_answer_matches(completion, reference) -> bool:
+    # outcome verifier; e.g., compare the extracted final answer to the reference solution
+    ...
+
+def fused_reward(completions, solution, **kwargs) -> list[float]:
+    # `solution` is an extra dataset column with reference answers (standard, non-conversational format)
+    rewards = []
+    for completion, reference in zip(completions, solution):
+        process = fraction_of_verified_steps(completion)                       # process signal
+        outcome = 1.0 if final_answer_matches(completion, reference) else 0.0  # outcome signal
+        rewards.append(0.5 * process + 0.5 * outcome)
+    return rewards
+
+args = GRPOConfig(loss_type="grpo", beta=0.0, steps_per_generation=4, num_generations=4)
+trainer = GRPOTrainer(model=..., reward_funcs=[fused_reward], args=args, train_dataset=...)
+trainer.train()
+```
+
+## Distillation / Post-training (Background)
+
+### On-Policy Distillation of Language Models
+**📜 Paper**: https://huggingface.co/papers/2306.13649
+
+Introduces **GKD**, which trains the student on its own on-policy generations against the teacher's distribution, stabilizing instruction tuning and integrating cleanly with RLHF pipelines. Implemented in TRL as [`GKDTrainer`].
+```python
+# Distill a teacher into a student with generalized knowledge distillation (GKD)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import GKDConfig, GKDTrainer
+
+student = AutoModelForCausalLM.from_pretrained("student-model")
+teacher = AutoModelForCausalLM.from_pretrained("teacher-model")
+tok = AutoTokenizer.from_pretrained("student-model")
+
+training_args = GKDConfig(
+    lmbda=0.5,  # fraction of on-policy (student-generated) batches
+    beta=0.5,   # interpolation coefficient of the generalized JSD loss
+)
+
+trainer = GKDTrainer(
+    model=student,
+    teacher_model=teacher,
+    args=training_args,
+    processing_class=tok,
+    train_dataset=...,  # prompt-completion (conversational) dataset
+)
+trainer.train()
+```
+
+## Foundations & Systems (Background)
+
+### Proximal Policy Optimization Algorithms
+**📜 Paper**: https://huggingface.co/papers/1707.06347
+
+Foundational **PPO** objective with clipped ratios and minibatch epochs; it is the baseline for many RL and RLHF variants in TRL (see [`PPOTrainer`]).
+```python
+# PPO-style clipping options expressed through GRPOConfig (TRL also provides PPOTrainer).
+from trl import GRPOConfig, GRPOTrainer
+
+args = GRPOConfig(
+    loss_type="grpo",
+    epsilon=0.2, epsilon_high=0.2,  # classic PPO-style symmetric clip range
+    beta=0.01,                      # KL coefficient, if you want an explicit KL term
+    steps_per_generation=4, num_generations=8,
+)
+trainer = GRPOTrainer(model=..., reward_funcs=..., args=args, train_dataset=...)
+trainer.train()
+```
+
+### ZeRO: Memory Optimizations Toward Training Trillion-Parameter Models
+**📜 Paper**: https://huggingface.co/papers/1910.02054
+
+**ZeRO** partitions optimizer states, gradients, and parameters to scale training efficiently; relevant when configuring DeepSpeed or Accelerate with TRL.
+```python
+# TRL configs subclass 🤗 Transformers TrainingArguments, so DeepSpeed options apply directly.
+# Supply a DeepSpeed ZeRO config via the "deepspeed" argument (a path or a dict).
+from trl import DPOConfig, DPOTrainer
+
+args = DPOConfig(
+    loss_type="sigmoid",
+    beta=0.1,
+    deepspeed="ds_zero_stage2.json",  # path to your ZeRO config
+    bf16=True,
+)
+
+# ds_zero_stage2.json (very small sketch)
+# {
+#   "zero_optimization": { "stage": 2 },
+#   "train_micro_batch_size_per_gpu": 4,
+#   "gradient_accumulation_steps": 8
+# }
+
+trainer = DPOTrainer(model=..., args=args, processing_class=..., train_dataset=...)
+trainer.train()
+```
+
 ## Distillation
 
 Papers relating to training a student model with the help of a teacher model.