
Commit 89ffba7

add qizheng's necessary parts

1 parent d689f19

4 files changed: +207 -26 lines


rdagent/app/finetune/llm/conf.py

Lines changed: 62 additions & 24 deletions
@@ -1,43 +1,81 @@
 import os
+from pathlib import Path

 from pydantic_settings import SettingsConfigDict

-from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.core.conf import RD_AGENT_SETTINGS, ExtendedBaseSettings


-class LLMFinetuneScen(ExtendedBaseSettings):
+class LLMFinetunePropSetting(ExtendedBaseSettings):
+    """LLM Fine-tune dedicated property settings.
+
+    - Adjust timeouts and template
+    - Use FT_ env prefix for overrides
+    """
+
     model_config = SettingsConfigDict(env_prefix="FT_", protected_namespaces=())
-    scen: str = "rdagent.app.finetune.llm.scen.LLMFinetuneScen"
+
+    # Main Components
+    scen: str = "rdagent.scenarios.finetune.scen.scenario.LLMFinetuneScen"
+    """Scenario class for LLM fine-tuning tasks."""
+
+    hypothesis_gen: str = "rdagent.scenarios.finetune.proposal.proposal.LLMFinetuneExpGen"
+    """Hypothesis generation class for LLM fine-tuning tasks."""
+
+    hypothesis2experiment: str = "rdagent.scenarios.finetune.proposal.proposal.FTHypothesis2Experiment"
+    """Hypothesis to experiment converter.
+    Function: Convert abstract LLM fine-tuning hypotheses into concrete experiment configurations.
     """
-    Scenario class for data science tasks.
-    - For Kaggle competitions, use: "rdagent.scenarios.data_science.scen.KaggleScen"
-    - For custom data science scenarios, use: "rdagent.scenarios.data_science.scen.DataScienceScen"
-    - For LLM finetune scenarios, use: "rdagent.app.finetune.llm.scen.LLMFinetuneScen"
-    - For Data science finetune scenarios, use: "rdagent.app.finetune.data_science.scen.DSFinetuneScen"
+
+    coder: str = "rdagent.components.coder.finetune.LLMFinetuneCoSTEER"
+    """Code generator.
+    Function: Generate LLM fine-tuning code based on experiment design.
     """

-    hypothesis_gen: str = "rdagent.app.finetune.llm.proposal.FinetuneExpGen"
-    """Hypothesis generation class"""
+    runner: str = "rdagent.scenarios.finetune.train.runner.LLMFinetuneRunner"  # TODO
+    """Code runner.
+    Function: Execute LLM fine-tuning code in a Docker environment.
+    """

+    summarizer: str = "rdagent.scenarios.finetune.dev.feedback.LLMExperiment2Feedback"
+    """Result summarizer - To be implemented.
+    Function: Analyze fine-tuning results and generate feedback, including performance metrics and error analysis.
+    """
+
+    # Timeouts (longer for LLM training)
     debug_timeout: int = 36000
-    """The timeout limit for running on debugging data"""
+    debug_recommend_timeout: int = 36000
     full_timeout: int = 360000
-    """The timeout limit for running on full data"""
+    full_recommend_timeout: int = 360000

+    # Pipeline behavior
     coder_on_whole_pipeline: bool = True
     enable_model_dump: bool = True
-    app_tpl: str = "app/finetune/llm/tpl"
+    app_tpl: str = "scenarios/finetune"

+    # Data paths and processing
+    file_path: str | None = None  # FT_FILE_PATH/datasets/<dataset>/, FT_FILE_PATH/models/<baseModel>/
+    show_nan_columns: bool = False
+    sample_data_by_LLM: bool = True

-def update_settings(competition: str):
-    """
-    Update the RD_AGENT_SETTINGS with the values from LLM_FINETUNE_SETTINGS.
-    """
-    LLM_FINETUNE_SETTINGS = LLMFinetuneScen()
-    RD_AGENT_SETTINGS.app_tpl = LLM_FINETUNE_SETTINGS.app_tpl
-    os.environ["DS_CODER_COSTEER_EXTRA_EVALUATOR"] = '["rdagent.app.finetune.share.eval.PrevModelLoadEvaluator"]'
-    for field_name, new_value in LLM_FINETUNE_SETTINGS.model_dump().items():
-        if hasattr(DS_RD_SETTING, field_name):
-            setattr(DS_RD_SETTING, field_name, new_value)
-    DS_RD_SETTING.competition = competition
+    # LLM-specific fields
+    base_model: str | None = None
+    dataset: str = ""
+
+    # LLaMA Factory
+    update_llama_factory: bool = True
+
+    # Docker settings
+    docker_enable_cache: bool = False
+    """Enable Docker cache for training (set via FT_DOCKER_ENABLE_CACHE)"""
+
+    @property
+    def task(self) -> str:
+        """Generate task name from base model and dataset."""
+        if self.base_model and self.dataset:
+            return f"{self.base_model}@{self.dataset}".replace("/", "_").replace("\\", "_")
+        return ""
+
+
+# Global setting instance for LLM finetuning scenario
+FT_RD_SETTING = LLMFinetunePropSetting()
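
The new settings class is configured entirely through FT_-prefixed environment variables, and its task property derives a filesystem-safe run name from base_model and dataset. A minimal sketch of that behavior, assuming the module path shown in the diff and using placeholder values that are not part of the commit:

    import os

    # Placeholder values; any FT_* variable maps onto a field of LLMFinetunePropSetting
    # because of env_prefix="FT_".
    os.environ["FT_BASE_MODEL"] = "Qwen/Qwen2.5-7B"
    os.environ["FT_DATASET"] = "shibing624/alpaca-zh"
    os.environ["FT_FILE_PATH"] = "/data/ft_workspace"

    from rdagent.app.finetune.llm.conf import LLMFinetunePropSetting

    settings = LLMFinetunePropSetting()
    print(settings.base_model)  # Qwen/Qwen2.5-7B
    print(settings.file_path)   # /data/ft_workspace
    # task joins model and dataset and replaces path separators with underscores:
    print(settings.task)        # Qwen_Qwen2.5-7B@shibing624_alpaca-zh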

rdagent/components/data/dataset_agent.py

Lines changed: 1 addition & 2 deletions
@@ -5,11 +5,10 @@
 import json
 from typing import Any, Dict, List, Optional

-from rdagent.scenarios.finetune.download.hf import download_dataset
-
 from rdagent.components.data.search_api import HuggingFaceSearchAPI
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.finetune.download.hf import download_dataset
 from rdagent.utils.agent.tpl import T


Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+"""
+Hugging Face download utility module
+
+Provides convenient functions to download models and datasets from the Hugging Face Hub.
+Uses FT_RD_SETTING for unified path management in finetune scenarios.
+
+Main functions:
+- download_dataset: Download datasets
+- download_model: Download models
+
+Environment variable configuration:
+- HF_TOKEN / HUGGINGFACE_TOKEN / HUGGING_FACE_HUB_TOKEN: Hugging Face access token
+- FT_FILE_PATH: Root directory for finetuning files (managed by FT_RD_SETTING)
+
+Usage example:
+    from rdagent.scenarios.finetune.download import download_dataset, download_model
+
+    # Download dataset (uses FT_RD_SETTING.dataset_path by default)
+    ds_path = download_dataset("shibing624/alpaca-zh", force=True)
+
+    # Download model to specified directory (overrides default path)
+    model_path = download_model("Qwen/Qwen2.5-7B", out_dir_root="/path/to/models")
+
+    # Download model using default path (FT_RD_SETTING.model_path)
+    model_path = download_model("Qwen/Qwen2.5-7B")
+
+    # Download private model with token
+    model_path = download_model("private/model", token="hf_xxx")
+
+    # Download specific revision
+    model_path = download_model("model/repo", revision="main")
+"""
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+import os
+import shutil
+from pathlib import Path
+from typing import Optional
+
+
+def _ensure_parent(path: Path) -> None:
+    os.makedirs(path.parent, mode=0o777, exist_ok=True)
+
+
+def download_dataset(
+    repo_id: str,
+    out_dir_root: Optional[str] = None,
+    token: Optional[str] = None,
+    revision: Optional[str] = None,
+    force: bool = False,
+) -> str:
+    """
+    Download Hugging Face dataset to a subdirectory under the specified root: <out_dir_root>/<repo_id>
+    Returns the actual download directory path as a string.
+    """
+    if out_dir_root:
+        save_root = Path(out_dir_root)
+    else:
+        # Use FT_RD_SETTING for default root directory
+        from rdagent.app.finetune.llm.conf import FT_RD_SETTING
+
+        if not FT_RD_SETTING.file_path:
+            raise ValueError("No out_dir_root specified and FT_FILE_PATH not set")
+        save_root = Path(FT_RD_SETTING.file_path) / "datasets"
+
+    save_path = save_root / repo_id
+    _ensure_parent(save_path)
+
+    if force and save_path.exists():
+        shutil.rmtree(save_path)
+
+    try:
+        from huggingface_hub import snapshot_download
+    except Exception as e:
+        raise ImportError(
+            "huggingface_hub is missing. Please install it first: pip install -U 'huggingface_hub[cli]'"
+        ) from e
+
+    effective_token = (
+        token
+        or os.environ.get("HF_TOKEN")
+        or os.environ.get("HUGGINGFACE_TOKEN")
+        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+    )
+
+    snapshot_download(
+        repo_id=repo_id,
+        repo_type="dataset",
+        local_dir=str(save_path),
+        local_dir_use_symlinks=False,
+        token=effective_token,
+        revision=revision,
+    )
+    return str(save_path)
+
+
+def download_model(
+    repo_id: str,
+    out_dir_root: Optional[str] = None,
+    token: Optional[str] = None,
+    revision: Optional[str] = None,
+    force: bool = False,
+) -> str:
+    """
+    Download Hugging Face model to a subdirectory under the specified root: <out_dir_root>/<repo_id>
+    Returns the actual download directory path as a string.
+    """
+    if out_dir_root:
+        save_root = Path(out_dir_root)
+    else:
+        # Use FT_RD_SETTING for default root directory
+        from rdagent.app.finetune.llm.conf import FT_RD_SETTING
+
+        if not FT_RD_SETTING.file_path:
+            raise ValueError("No out_dir_root specified and FT_FILE_PATH not set")
+        save_root = Path(FT_RD_SETTING.file_path) / "model"
+
+    save_path = save_root / repo_id
+    _ensure_parent(save_path)
+
+    if force and save_path.exists():
+        shutil.rmtree(save_path)
+
+    try:
+        from huggingface_hub import snapshot_download
+    except Exception as e:
+        raise ImportError(
+            "huggingface_hub is missing. Please install it first: pip install -U 'huggingface_hub[cli]'"
+        ) from e
+
+    effective_token = (
+        token
+        or os.environ.get("HF_TOKEN")
+        or os.environ.get("HUGGINGFACE_TOKEN")
+        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+    )
+
+    snapshot_download(
+        repo_id=repo_id,
+        repo_type="model",
+        local_dir=str(save_path),
+        local_dir_use_symlinks=False,
+        token=effective_token,
+        revision=revision,
+    )
+    return str(save_path)
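
Both helpers follow the same pattern: resolve a save root (the caller's out_dir_root, or FT_RD_SETTING.file_path plus a datasets/ or model/ subfolder), optionally wipe an existing copy when force=True, then delegate to huggingface_hub.snapshot_download with a token taken from the token argument or, failing that, from HF_TOKEN, HUGGINGFACE_TOKEN, or HUGGING_FACE_HUB_TOKEN. A small usage sketch of the default-path branch, with placeholder environment values and the repo ids from the docstring above (illustrative only, not part of the commit):

    import os

    # Placeholder workspace root; FT_FILE_PATH backs FT_RD_SETTING.file_path (see conf.py above).
    os.environ["FT_FILE_PATH"] = "/data/ft_workspace"
    os.environ["HF_TOKEN"] = "hf_xxx"  # fallback token when no token argument is passed

    from rdagent.scenarios.finetune.download.hf import download_dataset, download_model

    # Files land under <FT_FILE_PATH>/datasets/<repo_id> and <FT_FILE_PATH>/model/<repo_id>.
    ds_path = download_dataset("shibing624/alpaca-zh")
    model_path = download_model("Qwen/Qwen2.5-7B", force=True)  # force re-downloads if a copy exists
    print(ds_path)     # /data/ft_workspace/datasets/shibing624/alpaca-zh
    print(model_path)  # /data/ft_workspace/model/Qwen/Qwen2.5-7B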
