|
1 | 1 | import os |
| 2 | +from pathlib import Path |
2 | 3 |
|
3 | 4 | from pydantic_settings import SettingsConfigDict |
4 | 5 |
|
5 | | -from rdagent.app.data_science.conf import DS_RD_SETTING |
6 | 6 | from rdagent.core.conf import RD_AGENT_SETTINGS, ExtendedBaseSettings |
7 | 7 |
|
8 | 8 |
|
9 | | -class LLMFinetuneScen(ExtendedBaseSettings): |
| 9 | +class LLMFinetunePropSetting(ExtendedBaseSettings): |
| 10 | + """LLM Fine-tune dedicated property settings. |
| 11 | +
|
| 12 | + - Adjust timeouts and the app template path |
| 13 | + - Use FT_ env prefix for overrides |
| 14 | + """ |
| 15 | + |
10 | 16 | model_config = SettingsConfigDict(env_prefix="FT_", protected_namespaces=()) |
11 | | - scen: str = "rdagent.app.finetune.llm.scen.LLMFinetuneScen" |
| 17 | + |
| 18 | + # Main Components |
| 19 | + scen: str = "rdagent.scenarios.finetune.scen.scenario.LLMFinetuneScen" |
| 20 | + """Scenario class for LLM fine-tuning tasks.""" |
| 21 | + |
| 22 | + hypothesis_gen: str = "rdagent.scenarios.finetune.proposal.proposal.LLMFinetuneExpGen" |
| 23 | + """Hypothesis generation class for LLM fine-tuning tasks.""" |
| 24 | + |
| 25 | + hypothesis2experiment: str = "rdagent.scenarios.finetune.proposal.proposal.FTHypothesis2Experiment" |
| 26 | + """Hypothesis to experiment converter. |
| 27 | + Function: Convert abstract LLM fine-tuning hypotheses into concrete experiment configurations. |
12 | 28 | """ |
13 | | - Scenario class for data science tasks. |
14 | | - - For Kaggle competitions, use: "rdagent.scenarios.data_science.scen.KaggleScen" |
15 | | - - For custom data science scenarios, use: "rdagent.scenarios.data_science.scen.DataScienceScen" |
16 | | - - For LLM finetune scenarios, use: "rdagent.app.finetune.llm.scen.LLMFinetuneScen" |
17 | | - - For Data science finetune scenarios, use: "rdagent.app.finetune.data_science.scen.DSFinetuneScen" |
| 29 | + |
| 30 | + coder: str = "rdagent.components.coder.finetune.LLMFinetuneCoSTEER" |
| 31 | + """Code generator. |
| 32 | + Function: Generate LLM fine-tuning code based on experiment design. |
18 | 33 | """ |
19 | 34 |
|
20 | | - hypothesis_gen: str = "rdagent.app.finetune.llm.proposal.FinetuneExpGen" |
21 | | - """Hypothesis generation class""" |
| 35 | + runner: str = "rdagent.scenarios.finetune.train.runner.LLMFinetuneRunner" # TODO |
| 36 | + """Code runner. |
| 37 | + Function: Execute LLM fine-tuning code in a Docker environment. |
| 38 | + """ |
22 | 39 |
|
| 40 | + summarizer: str = "rdagent.scenarios.finetune.dev.feedback.LLMExperiment2Feedback" |
| 41 | + """Result summarizer - To be implemented. |
| 42 | + Function: Analyze fine-tuning results and generate feedback, including performance metrics and error analysis. |
| 43 | + """ |
| 44 | + |
| 45 | + # Timeouts (longer for LLM training) |
23 | 46 | debug_timeout: int = 36000 |
24 | | - """The timeout limit for running on debugging data""" |
| 47 | + debug_recommend_timeout: int = 36000 |
25 | 48 | full_timeout: int = 360000 |
26 | | - """The timeout limit for running on full data""" |
| 49 | + full_recommend_timeout: int = 360000 |
27 | 50 |
|
| 51 | + # Pipeline behavior |
28 | 52 | coder_on_whole_pipeline: bool = True |
29 | 53 | enable_model_dump: bool = True |
30 | | - app_tpl: str = "app/finetune/llm/tpl" |
| 54 | + app_tpl: str = "scenarios/finetune" |
31 | 55 |
|
| 56 | + # Data paths and processing |
| 57 | + file_path: str | None = None # FT_FILE_PATH/datasets/<dataset>/, FT_FILE_PATH/models/<base_model>/ |
| 58 | + show_nan_columns: bool = False |
| 59 | + sample_data_by_LLM: bool = True |
32 | 60 |
|
33 | | -def update_settings(competition: str): |
34 | | - """ |
35 | | - Update the RD_AGENT_SETTINGS with the values from LLM_FINETUNE_SETTINGS. |
36 | | - """ |
37 | | - LLM_FINETUNE_SETTINGS = LLMFinetuneScen() |
38 | | - RD_AGENT_SETTINGS.app_tpl = LLM_FINETUNE_SETTINGS.app_tpl |
39 | | - os.environ["DS_CODER_COSTEER_EXTRA_EVALUATOR"] = '["rdagent.app.finetune.share.eval.PrevModelLoadEvaluator"]' |
40 | | - for field_name, new_value in LLM_FINETUNE_SETTINGS.model_dump().items(): |
41 | | - if hasattr(DS_RD_SETTING, field_name): |
42 | | - setattr(DS_RD_SETTING, field_name, new_value) |
43 | | - DS_RD_SETTING.competition = competition |
| 61 | + # LLM-specific fields |
| 62 | + base_model: str | None = None |
| 63 | + dataset: str = "" |
| 64 | + |
| 65 | + # LLaMA Factory |
| 66 | + update_llama_factory: bool = True |
| 67 | + |
| 68 | + # Docker settings |
| 69 | + docker_enable_cache: bool = False |
| 70 | + """Enable Docker cache for training (set via FT_DOCKER_ENABLE_CACHE)""" |
| 71 | + |
| 72 | + @property |
| 73 | + def task(self) -> str: |
| 74 | + """Generate task name from base model and dataset.""" |
| 75 | + if self.base_model and self.dataset: |
| 76 | + return f"{self.base_model}@{self.dataset}".replace("/", "_").replace("\\", "_") |
| 77 | + return "" |
| 78 | + |
| 79 | + |
| 80 | +# Global setting instance for LLM finetuning scenario |
| 81 | +FT_RD_SETTING = LLMFinetunePropSetting() |
0 commit comments