diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index ef3ae0fe..407dc327 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,41 +9,29 @@ }, "plugins": [ { - "name": "hugging-face-model-trainer", - "source": "./skills/hugging-face-model-trainer", + "name": "huggingface-llm-trainer", + "source": "./skills/huggingface-llm-trainer", "skills": "./", "description": "Train or fine-tune language models using TRL on Hugging Face Jobs infrastructure. Covers SFT, DPO, GRPO and reward modeling training methods, plus GGUF conversion for local deployment. Includes hardware selection, cost estimation, Trackio monitoring, and Hub persistence." }, { - "name": "hugging-face-paper-publisher", - "source": "./skills/hugging-face-paper-publisher", + "name": "huggingface-paper-publisher", + "source": "./skills/huggingface-paper-publisher", "skills": "./", "description": "Publish and manage research papers on Hugging Face Hub. Supports creating paper pages, linking papers to models/datasets, claiming authorship, and generating professional markdown-based research articles." }, { - "name": "hugging-face-paper-pages", - "source": "./skills/hugging-face-paper-pages", + "name": "huggingface-papers", + "source": "./skills/huggingface-papers", "skills": "./", "description": "Look up and read Hugging Face paper pages in markdown, and use the papers API for structured metadata like authors, linked models, datasets, Spaces, and media URLs when needed." }, { - "name": "hugging-face-datasets", - "source": "./skills/hugging-face-datasets", - "skills": "./", - "description": "Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation." 
- }, - { - "name": "hugging-face-evaluation", - "source": "./skills/hugging-face-evaluation", + "name": "huggingface-community-evals", + "source": "./skills/huggingface-community-evals", "skills": "./", "description": "Add and manage evaluation results in Hugging Face model cards. Supports extracting eval tables from README content, importing scores from Artificial Analysis API, and running custom evaluations with vLLM/lighteval." }, - { - "name": "hugging-face-tool-builder", - "source": "./skills/hugging-face-tool-builder", - "skills": "./", - "description": "Build reusable scripts for Hugging Face API operations. Useful for chaining API calls or automating repeated tasks." - }, { "name": "hf-cli", "source": "./skills/hf-cli", @@ -51,39 +39,39 @@ "description": "Execute Hugging Face Hub operations using the hf CLI. Download models/datasets, upload files, manage repos, and run cloud compute jobs." }, { - "name": "hugging-face-jobs", - "source": "./skills/hugging-face-jobs", + "name": "huggingface-jobs", + "source": "./skills/huggingface-jobs", "skills": "./", "description": "Run compute jobs on Hugging Face infrastructure. Execute Python scripts, manage scheduled jobs, and monitor job status." }, { - "name": "hugging-face-trackio", - "source": "./skills/hugging-face-trackio", + "name": "huggingface-trackio", + "source": "./skills/huggingface-trackio", "skills": "./", "description": "Track and visualize ML training experiments with Trackio. Log metrics via Python API and retrieve them via CLI. Supports real-time dashboards synced to HF Spaces." }, { - "name": "hugging-face-dataset-viewer", - "source": "./skills/hugging-face-dataset-viewer", + "name": "huggingface-datasets", + "source": "./skills/huggingface-datasets", "skills": "./", "description": "Explore, query, and extract data from any Hugging Face dataset using the Dataset Viewer REST API and npx tooling. 
Zero Python dependencies — covers split/config discovery, row pagination, text search, filtering, SQL via parquetlens, and dataset upload via CLI." }, { - "name": "gradio", + "name": "huggingface-gradio", "source": "./skills/huggingface-gradio", "skills": "./", "description": "Build Gradio web UIs and demos in Python. Use when creating or editing Gradio apps, components, event listeners, layouts, or chatbots." }, { "name": "transformers-js", - "source": "./skills/transformers.js", + "source": "./skills/transformers-js", "skills": "./", "description": "Run state-of-the-art machine learning models directly in JavaScript/TypeScript for NLP, computer vision, audio processing, and multimodal tasks. Works in Node.js and browsers with WebGPU/WASM using Hugging Face models." }, { - "name": "hugging-face-vision-trainer", - "source": "./skills/hugging-face-vision-trainer", + "name": "huggingface-vision-trainer", + "source": "./skills/huggingface-vision-trainer", "skills": "./", "description": "Train and fine-tune object detection models (RTDETRv2, YOLOS, DETR and others) and image classification models (timm and transformers models — MobileNetV3, MobileViT, ResNet, ViT/DINOv3) using Transformers Trainer API on Hugging Face Jobs infrastructure or locally. Includes COCO dataset format support, Albumentations augmentation, mAP/mAR metrics, trackio tracking, hardware selection, and Hub persistence." } diff --git a/README.md b/README.md index c48dbfac..bf072e5f 100644 --- a/README.md +++ b/README.md @@ -89,19 +89,17 @@ This repository contains a few skills to get you started. You can also contribut | Name | Description | Documentation | |------|-------------|---------------| -| `gradio` | Build Gradio web UIs and demos in Python. Use when creating or editing Gradio apps, components, event listeners, layouts, or chatbots. | [SKILL.md](skills/huggingface-gradio/SKILL.md) | | `hf-cli` | Execute Hugging Face Hub operations using the hf CLI. 
Download models/datasets, upload files, manage repos, and run cloud compute jobs. | [SKILL.md](skills/hf-cli/SKILL.md) | -| `hugging-face-dataset-viewer` | Explore, query, and extract data from any Hugging Face dataset using the Dataset Viewer REST API and npx tooling. Zero Python dependencies — covers split/config discovery, row pagination, text search, filtering, SQL via parquetlens, and dataset upload via CLI. | [SKILL.md](skills/hugging-face-dataset-viewer/SKILL.md) | -| `hugging-face-datasets` | Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. | [SKILL.md](skills/hugging-face-datasets/SKILL.md) | -| `hugging-face-evaluation` | Add and manage evaluation results in Hugging Face model cards. Supports extracting eval tables from README content, importing scores from Artificial Analysis API, and running custom evaluations with vLLM/lighteval. | [SKILL.md](skills/hugging-face-evaluation/SKILL.md) | -| `hugging-face-jobs` | Run compute jobs on Hugging Face infrastructure. Execute Python scripts, manage scheduled jobs, and monitor job status. | [SKILL.md](skills/hugging-face-jobs/SKILL.md) | -| `hugging-face-model-trainer` | Train or fine-tune language models using TRL on Hugging Face Jobs infrastructure. Covers SFT, DPO, GRPO and reward modeling training methods, plus GGUF conversion for local deployment. Includes hardware selection, cost estimation, Trackio monitoring, and Hub persistence. | [SKILL.md](skills/hugging-face-model-trainer/SKILL.md) | -| `hugging-face-paper-pages` | Look up and read Hugging Face paper pages in markdown, and use the papers API for structured metadata like authors, linked models, datasets, Spaces, and media URLs when needed. | [SKILL.md](skills/hugging-face-paper-pages/SKILL.md) | -| `hugging-face-paper-publisher` | Publish and manage research papers on Hugging Face Hub. 
Supports creating paper pages, linking papers to models/datasets, claiming authorship, and generating professional markdown-based research articles. | [SKILL.md](skills/hugging-face-paper-publisher/SKILL.md) | -| `hugging-face-tool-builder` | Build reusable scripts for Hugging Face API operations. Useful for chaining API calls or automating repeated tasks. | [SKILL.md](skills/hugging-face-tool-builder/SKILL.md) | -| `hugging-face-trackio` | Track and visualize ML training experiments with Trackio. Log metrics via Python API and retrieve them via CLI. Supports real-time dashboards synced to HF Spaces. | [SKILL.md](skills/hugging-face-trackio/SKILL.md) | -| `hugging-face-vision-trainer` | Train and fine-tune object detection models (RTDETRv2, YOLOS, DETR and others) and image classification models (timm and transformers models — MobileNetV3, MobileViT, ResNet, ViT/DINOv3) using Transformers Trainer API on Hugging Face Jobs infrastructure or locally. Includes COCO dataset format support, Albumentations augmentation, mAP/mAR metrics, trackio tracking, hardware selection, and Hub persistence. | [SKILL.md](skills/hugging-face-vision-trainer/SKILL.md) | -| `transformers-js` | Run state-of-the-art machine learning models directly in JavaScript/TypeScript for NLP, computer vision, audio processing, and multimodal tasks. Works in Node.js and browsers with WebGPU/WASM using Hugging Face models. | [SKILL.md](skills/transformers.js/SKILL.md) | +| `huggingface-community-evals` | Add and manage evaluation results in Hugging Face model cards. Supports extracting eval tables from README content, importing scores from Artificial Analysis API, and running custom evaluations with vLLM/lighteval. | [SKILL.md](skills/huggingface-community-evals/SKILL.md) | +| `huggingface-datasets` | Explore, query, and extract data from any Hugging Face dataset using the Dataset Viewer REST API and npx tooling. 
Zero Python dependencies — covers split/config discovery, row pagination, text search, filtering, SQL via parquetlens, and dataset upload via CLI. | [SKILL.md](skills/huggingface-datasets/SKILL.md) | +| `huggingface-gradio` | Build Gradio web UIs and demos in Python. Use when creating or editing Gradio apps, components, event listeners, layouts, or chatbots. | [SKILL.md](skills/huggingface-gradio/SKILL.md) | +| `huggingface-jobs` | Run compute jobs on Hugging Face infrastructure. Execute Python scripts, manage scheduled jobs, and monitor job status. | [SKILL.md](skills/huggingface-jobs/SKILL.md) | +| `huggingface-llm-trainer` | Train or fine-tune language models using TRL on Hugging Face Jobs infrastructure. Covers SFT, DPO, GRPO and reward modeling training methods, plus GGUF conversion for local deployment. Includes hardware selection, cost estimation, Trackio monitoring, and Hub persistence. | [SKILL.md](skills/huggingface-llm-trainer/SKILL.md) | +| `huggingface-paper-publisher` | Publish and manage research papers on Hugging Face Hub. Supports creating paper pages, linking papers to models/datasets, claiming authorship, and generating professional markdown-based research articles. | [SKILL.md](skills/huggingface-paper-publisher/SKILL.md) | +| `huggingface-papers` | Look up and read Hugging Face paper pages in markdown, and use the papers API for structured metadata like authors, linked models, datasets, Spaces, and media URLs when needed. | [SKILL.md](skills/huggingface-papers/SKILL.md) | +| `huggingface-trackio` | Track and visualize ML training experiments with Trackio. Log metrics via Python API and retrieve them via CLI. Supports real-time dashboards synced to HF Spaces. 
| [SKILL.md](skills/huggingface-trackio/SKILL.md) | +| `huggingface-vision-trainer` | Train and fine-tune object detection models (RTDETRv2, YOLOS, DETR and others) and image classification models (timm and transformers models — MobileNetV3, MobileViT, ResNet, ViT/DINOv3) using Transformers Trainer API on Hugging Face Jobs infrastructure or locally. Includes COCO dataset format support, Albumentations augmentation, mAP/mAR metrics, trackio tracking, hardware selection, and Hub persistence. | [SKILL.md](skills/huggingface-vision-trainer/SKILL.md) | +| `transformers-js` | Run state-of-the-art machine learning models directly in JavaScript/TypeScript for NLP, computer vision, audio processing, and multimodal tasks. Works in Node.js and browsers with WebGPU/WASM using Hugging Face models. | [SKILL.md](skills/transformers-js/SKILL.md) | ### Using skills in your coding agent diff --git a/agents/AGENTS.md b/agents/AGENTS.md index fca1d877..e0df1812 100644 --- a/agents/AGENTS.md +++ b/agents/AGENTS.md @@ -3,36 +3,32 @@ You have additional SKILLs documented in directories containing a "SKILL.md" file. 
These skills are: - - gradio -> "skills/huggingface-gradio/SKILL.md" - hf-cli -> "skills/hf-cli/SKILL.md" - - hugging-face-dataset-viewer -> "skills/hugging-face-dataset-viewer/SKILL.md" - - hugging-face-datasets -> "skills/hugging-face-datasets/SKILL.md" - - hugging-face-evaluation -> "skills/hugging-face-evaluation/SKILL.md" - - hugging-face-jobs -> "skills/hugging-face-jobs/SKILL.md" - - hugging-face-model-trainer -> "skills/hugging-face-model-trainer/SKILL.md" - - hugging-face-paper-pages -> "skills/hugging-face-paper-pages/SKILL.md" - - hugging-face-paper-publisher -> "skills/hugging-face-paper-publisher/SKILL.md" - - hugging-face-tool-builder -> "skills/hugging-face-tool-builder/SKILL.md" - - hugging-face-trackio -> "skills/hugging-face-trackio/SKILL.md" - - hugging-face-vision-trainer -> "skills/hugging-face-vision-trainer/SKILL.md" - - transformers-js -> "skills/transformers.js/SKILL.md" + - huggingface-community-evals -> "skills/huggingface-community-evals/SKILL.md" + - huggingface-datasets -> "skills/huggingface-datasets/SKILL.md" + - huggingface-gradio -> "skills/huggingface-gradio/SKILL.md" + - huggingface-jobs -> "skills/huggingface-jobs/SKILL.md" + - huggingface-llm-trainer -> "skills/huggingface-llm-trainer/SKILL.md" + - huggingface-paper-publisher -> "skills/huggingface-paper-publisher/SKILL.md" + - huggingface-papers -> "skills/huggingface-papers/SKILL.md" + - huggingface-trackio -> "skills/huggingface-trackio/SKILL.md" + - huggingface-vision-trainer -> "skills/huggingface-vision-trainer/SKILL.md" + - transformers-js -> "skills/transformers-js/SKILL.md" IMPORTANT: You MUST read the SKILL.md file whenever the description of the skills matches the user intent, or may help accomplish their task. -gradio: `Build Gradio web UIs and demos in Python. 
Use when creating or editing Gradio apps, components, event listeners, layouts, or chatbots.` hf-cli: `"Hugging Face Hub CLI (`hf`) for downloading, uploading, and managing repositories, models, datasets, and Spaces on the Hugging Face Hub. Replaces now deprecated `huggingface-cli` command."` -hugging-face-dataset-viewer: `Use this skill for Hugging Face Dataset Viewer API workflows that fetch subset/split metadata, paginate rows, search text, apply filters, download parquet URLs, and read size or statistics.` -hugging-face-datasets: `Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. Designed to work alongside HF MCP server for comprehensive dataset workflows.` -hugging-face-evaluation: `Add and manage evaluation results in Hugging Face model cards. Supports extracting eval tables from README content, importing scores from Artificial Analysis API, and running custom model evaluations with vLLM/lighteval. Works with the model-index metadata format.` -hugging-face-jobs: `This skill should be used when users want to run any workload on Hugging Face Jobs infrastructure. Covers UV scripts, Docker-based jobs, hardware selection, cost estimation, authentication with tokens, secrets management, timeout configuration, and result persistence. Designed for general-purpose compute workloads including data processing, inference, experiments, batch jobs, and any Python-based tasks. Should be invoked for tasks involving cloud compute, GPU workloads, or when users mention running jobs on Hugging Face infrastructure without local setup.` -hugging-face-model-trainer: `This skill should be used when users want to train or fine-tune language models using TRL (Transformer Reinforcement Learning) on Hugging Face Jobs infrastructure. Covers SFT, DPO, GRPO and reward modeling training methods, plus GGUF conversion for local deployment. 
Includes guidance on the TRL Jobs package, UV scripts with PEP 723 format, dataset preparation and validation, hardware selection, cost estimation, Trackio monitoring, Hub authentication, and model persistence. Should be invoked for tasks involving cloud GPU training, GGUF conversion, or when users mention training on Hugging Face Jobs without local GPU setup.` -hugging-face-paper-pages: `Look up and read Hugging Face paper pages in markdown, and use the papers API for structured metadata such as authors, linked models/datasets/spaces, Github repo and project page. Use when the user shares a Hugging Face paper page URL, an arXiv URL or ID, or asks to summarize, explain, or analyze an AI research paper.` -hugging-face-paper-publisher: `Publish and manage research papers on Hugging Face Hub. Supports creating paper pages, linking papers to models/datasets, claiming authorship, and generating professional markdown-based research articles.` -hugging-face-tool-builder: `Use this skill when the user wants to build tool/scripts or achieve a task where using data from the Hugging Face API would help. This is especially useful when chaining or combining API calls or the task will be repeated/automated. This Skill creates a reusable script to fetch, enrich or process data.` -hugging-face-trackio: `Track and visualize ML training experiments with Trackio. Use when logging metrics during training (Python API), firing alerts for training diagnostics, or retrieving/analyzing logged metrics (CLI). Supports real-time dashboard visualization, alerts with webhooks, HF Space syncing, and JSON output for automation.` -hugging-face-vision-trainer: `Trains and fine-tunes vision models for object detection (D-FINE, RT-DETR v2, DETR, YOLOS), image classification (timm models — MobileNetV3, MobileViT, ResNet, ViT/DINOv3 — plus any Transformers classifier), and SAM/SAM2 segmentation using Hugging Face Transformers on Hugging Face Jobs cloud GPUs. 
Covers COCO-format dataset preparation, Albumentations augmentation, mAP/mAR evaluation, accuracy metrics, SAM segmentation with bbox/point prompts, DiceCE loss, hardware selection, cost estimation, Trackio monitoring, and Hub persistence. Use when users mention training object detection, image classification, SAM, SAM2, segmentation, image matting, DETR, D-FINE, RT-DETR, ViT, timm, MobileNet, ResNet, bounding box models, or fine-tuning vision models on Hugging Face Jobs.` +huggingface-community-evals: `Run evaluations for Hugging Face Hub models using inspect-ai and lighteval on local hardware. Use for backend selection, local GPU evals, and choosing between vLLM / Transformers / accelerate. Not for HF Jobs orchestration, model-card PRs, .eval_results publication, or community-evals automation.` +huggingface-datasets: `Use this skill for Hugging Face Dataset Viewer API workflows that fetch subset/split metadata, paginate rows, search text, apply filters, download parquet URLs, and read size or statistics.` +huggingface-gradio: `Build Gradio web UIs and demos in Python. Use when creating or editing Gradio apps, components, event listeners, layouts, or chatbots.` +huggingface-jobs: `This skill should be used when users want to run any workload on Hugging Face Jobs infrastructure. Covers UV scripts, Docker-based jobs, hardware selection, cost estimation, authentication with tokens, secrets management, timeout configuration, and result persistence. Designed for general-purpose compute workloads including data processing, inference, experiments, batch jobs, and any Python-based tasks. Should be invoked for tasks involving cloud compute, GPU workloads, or when users mention running jobs on Hugging Face infrastructure without local setup.` +huggingface-llm-trainer: `This skill should be used when users want to train or fine-tune language models using TRL (Transformer Reinforcement Learning) on Hugging Face Jobs infrastructure. 
Covers SFT, DPO, GRPO and reward modeling training methods, plus GGUF conversion for local deployment. Includes guidance on the TRL Jobs package, UV scripts with PEP 723 format, dataset preparation and validation, hardware selection, cost estimation, Trackio monitoring, Hub authentication, and model persistence. Should be invoked for tasks involving cloud GPU training, GGUF conversion, or when users mention training on Hugging Face Jobs without local GPU setup.` +huggingface-paper-publisher: `Publish and manage research papers on Hugging Face Hub. Supports creating paper pages, linking papers to models/datasets, claiming authorship, and generating professional markdown-based research articles.` +huggingface-papers: `Look up and read Hugging Face paper pages in markdown, and use the papers API for structured metadata such as authors, linked models/datasets/spaces, Github repo and project page. Use when the user shares a Hugging Face paper page URL, an arXiv URL or ID, or asks to summarize, explain, or analyze an AI research paper.` +huggingface-trackio: `Track and visualize ML training experiments with Trackio. Use when logging metrics during training (Python API), firing alerts for training diagnostics, or retrieving/analyzing logged metrics (CLI). Supports real-time dashboard visualization, alerts with webhooks, HF Space syncing, and JSON output for automation.` +huggingface-vision-trainer: `Trains and fine-tunes vision models for object detection (D-FINE, RT-DETR v2, DETR, YOLOS), image classification (timm models — MobileNetV3, MobileViT, ResNet, ViT/DINOv3 — plus any Transformers classifier), and SAM/SAM2 segmentation using Hugging Face Transformers on Hugging Face Jobs cloud GPUs. Covers COCO-format dataset preparation, Albumentations augmentation, mAP/mAR evaluation, accuracy metrics, SAM segmentation with bbox/point prompts, DiceCE loss, hardware selection, cost estimation, Trackio monitoring, and Hub persistence. 
Use when users mention training object detection, image classification, SAM, SAM2, segmentation, image matting, DETR, D-FINE, RT-DETR, ViT, timm, MobileNet, ResNet, bounding box models, or fine-tuning vision models on Hugging Face Jobs.` transformers-js: `Use Transformers.js to run state-of-the-art machine learning models directly in JavaScript/TypeScript. Supports NLP (text classification, translation, summarization), computer vision (image classification, object detection), audio (speech recognition, audio classification), and multimodal tasks. Works in Node.js and browsers (with WebGPU/WASM) using pre-trained models from Hugging Face Hub.` diff --git a/skills/hugging-face-datasets/SKILL.md b/skills/hugging-face-datasets/SKILL.md deleted file mode 100644 index 4075d7f8..00000000 --- a/skills/hugging-face-datasets/SKILL.md +++ /dev/null @@ -1,542 +0,0 @@ ---- -name: hugging-face-datasets -description: Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. Designed to work alongside HF MCP server for comprehensive dataset workflows. ---- - -# Overview -This skill provides tools to manage datasets on the Hugging Face Hub with a focus on creation, configuration, content management, and SQL-based data manipulation. It is designed to complement the existing Hugging Face MCP server by providing dataset editing and querying capabilities. 
- -## Integration with HF MCP Server -- **Use HF MCP Server for**: Dataset discovery, search, and metadata retrieval -- **Use This Skill for**: Dataset creation, content editing, SQL queries, data transformation, and structured data formatting - -# Version -2.1.0 - -# Dependencies -# This skill uses PEP 723 scripts with inline dependency management -# Scripts auto-install requirements when run with: uv run scripts/script_name.py - -- uv (Python package manager) -- Getting Started: See "Usage Instructions" below for PEP 723 usage - -# Core Capabilities - -## 1. Dataset Lifecycle Management -- **Initialize**: Create new dataset repositories with proper structure -- **Configure**: Store detailed configuration including system prompts and metadata -- **Stream Updates**: Add rows efficiently without downloading entire datasets - -## 2. SQL-Based Dataset Querying (NEW) -Query any Hugging Face dataset using DuckDB SQL via `scripts/sql_manager.py`: -- **Direct Queries**: Run SQL on datasets using the `hf://` protocol -- **Schema Discovery**: Describe dataset structure and column types -- **Data Sampling**: Get random samples for exploration -- **Aggregations**: Count, histogram, unique values analysis -- **Transformations**: Filter, join, reshape data with SQL -- **Export & Push**: Save results locally or push to new Hub repos - -## 3. Multi-Format Dataset Support -Supports diverse dataset types through template system: -- **Chat/Conversational**: Chat templating, multi-turn dialogues, tool usage examples -- **Text Classification**: Sentiment analysis, intent detection, topic classification -- **Question-Answering**: Reading comprehension, factual QA, knowledge bases -- **Text Completion**: Language modeling, code completion, creative writing -- **Tabular Data**: Structured data for regression/classification tasks -- **Custom Formats**: Flexible schema definition for specialized needs - -## 4. 
Quality Assurance Features -- **JSON Validation**: Ensures data integrity during uploads -- **Batch Processing**: Efficient handling of large datasets -- **Error Recovery**: Graceful handling of upload failures and conflicts - -# Usage Instructions - -The skill includes two Python scripts that use PEP 723 inline dependency management: - -> **All paths are relative to the directory containing this SKILL.md -file.** -> Scripts are run with: `uv run scripts/script_name.py [arguments]` - -- `scripts/dataset_manager.py` - Dataset creation and management -- `scripts/sql_manager.py` - SQL-based dataset querying and transformation - -### Prerequisites -- `uv` package manager installed -- `HF_TOKEN` environment variable must be set with a Write-access token - ---- - -# SQL Dataset Querying (sql_manager.py) - -Query, transform, and push Hugging Face datasets using DuckDB SQL. The `hf://` protocol provides direct access to any public dataset (or private with token). - -## Quick Start - -```bash -# Query a dataset -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10" - -# Get dataset schema -uv run scripts/sql_manager.py describe --dataset "cais/mmlu" - -# Sample random rows -uv run scripts/sql_manager.py sample --dataset "cais/mmlu" --n 5 - -# Count rows with filter -uv run scripts/sql_manager.py count --dataset "cais/mmlu" --where "subject='nutrition'" -``` - -## SQL Query Syntax - -Use `data` as the table name in your SQL - it gets replaced with the actual `hf://` path: - -```sql --- Basic select -SELECT * FROM data LIMIT 10 - --- Filtering -SELECT * FROM data WHERE subject='nutrition' - --- Aggregations -SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject ORDER BY cnt DESC - --- Column selection and transformation -SELECT question, choices[answer] AS correct_answer FROM data - --- Regex matching -SELECT * FROM data WHERE regexp_matches(question, 'nutrition|diet') - --- String functions 
-SELECT regexp_replace(question, '\n', '') AS cleaned FROM data -``` - -## Common Operations - -### 1. Explore Dataset Structure -```bash -# Get schema -uv run scripts/sql_manager.py describe --dataset "cais/mmlu" - -# Get unique values in column -uv run scripts/sql_manager.py unique --dataset "cais/mmlu" --column "subject" - -# Get value distribution -uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject" --bins 20 -``` - -### 2. Filter and Transform -```bash -# Complex filtering with SQL -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --sql "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject HAVING cnt > 100" - -# Using transform command -uv run scripts/sql_manager.py transform \ - --dataset "cais/mmlu" \ - --select "subject, COUNT(*) as cnt" \ - --group-by "subject" \ - --order-by "cnt DESC" \ - --limit 10 -``` - -### 3. Create Subsets and Push to Hub -```bash -# Query and push to new dataset -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --sql "SELECT * FROM data WHERE subject='nutrition'" \ - --push-to "username/mmlu-nutrition-subset" \ - --private - -# Transform and push -uv run scripts/sql_manager.py transform \ - --dataset "ibm/duorc" \ - --config "ParaphraseRC" \ - --select "question, answers" \ - --where "LENGTH(question) > 50" \ - --push-to "username/duorc-long-questions" -``` - -### 4. Export to Local Files -```bash -# Export to Parquet -uv run scripts/sql_manager.py export \ - --dataset "cais/mmlu" \ - --sql "SELECT * FROM data WHERE subject='nutrition'" \ - --output "nutrition.parquet" \ - --format parquet - -# Export to JSONL -uv run scripts/sql_manager.py export \ - --dataset "cais/mmlu" \ - --sql "SELECT * FROM data LIMIT 100" \ - --output "sample.jsonl" \ - --format jsonl -``` - -### 5. 
Working with Dataset Configs/Splits -```bash -# Specify config (subset) -uv run scripts/sql_manager.py query \ - --dataset "ibm/duorc" \ - --config "ParaphraseRC" \ - --sql "SELECT * FROM data LIMIT 5" - -# Specify split -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --split "test" \ - --sql "SELECT COUNT(*) FROM data" - -# Query all splits -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --split "*" \ - --sql "SELECT * FROM data LIMIT 10" -``` - -### 6. Raw SQL with Full Paths -For complex queries or joining datasets: -```bash -uv run scripts/sql_manager.py raw --sql " - SELECT a.*, b.* - FROM 'hf://datasets/dataset1@~parquet/default/train/*.parquet' a - JOIN 'hf://datasets/dataset2@~parquet/default/train/*.parquet' b - ON a.id = b.id - LIMIT 100 -" -``` - -## Python API Usage - -```python -from sql_manager import HFDatasetSQL - -sql = HFDatasetSQL() - -# Query -results = sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10") - -# Get schema -schema = sql.describe("cais/mmlu") - -# Sample -samples = sql.sample("cais/mmlu", n=5, seed=42) - -# Count -count = sql.count("cais/mmlu", where="subject='nutrition'") - -# Histogram -dist = sql.histogram("cais/mmlu", "subject") - -# Filter and transform -results = sql.filter_and_transform( - "cais/mmlu", - select="subject, COUNT(*) as cnt", - group_by="subject", - order_by="cnt DESC", - limit=10 -) - -# Push to Hub -url = sql.push_to_hub( - "cais/mmlu", - "username/nutrition-subset", - sql="SELECT * FROM data WHERE subject='nutrition'", - private=True -) - -# Export locally -sql.export_to_parquet("cais/mmlu", "output.parquet", sql="SELECT * FROM data LIMIT 100") - -sql.close() -``` - -## HF Path Format - -DuckDB uses the `hf://` protocol to access datasets: -``` -hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet -``` - -Examples: -- `hf://datasets/cais/mmlu@~parquet/default/train/*.parquet` -- 
`hf://datasets/ibm/duorc@~parquet/ParaphraseRC/test/*.parquet` - -The `@~parquet` revision provides auto-converted Parquet files for any dataset format. - -## Useful DuckDB SQL Functions - -```sql --- String functions -LENGTH(column) -- String length -regexp_replace(col, '\n', '') -- Regex replace -regexp_matches(col, 'pattern') -- Regex match -LOWER(col), UPPER(col) -- Case conversion - --- Array functions -choices[0] -- Array indexing (0-based) -array_length(choices) -- Array length -unnest(choices) -- Expand array to rows - --- Aggregations -COUNT(*), SUM(col), AVG(col) -GROUP BY col HAVING condition - --- Sampling -USING SAMPLE 10 -- Random sample -USING SAMPLE 10 (RESERVOIR, 42) -- Reproducible sample - --- Window functions -ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2) -``` - ---- - -# Dataset Creation (dataset_manager.py) - -### Recommended Workflow - -**1. Discovery (Use HF MCP Server):** -```python -# Use HF MCP tools to find existing datasets -search_datasets("conversational AI training") -get_dataset_details("username/dataset-name") -``` - -**2. Creation (Use This Skill):** -```bash -# Initialize new dataset -uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private] - -# Configure with detailed system prompt -uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "$(cat system_prompt.txt)" -``` - -**3. Content Management (Use This Skill):** -```bash -# Quick setup with any template -uv run scripts/dataset_manager.py quick_setup \ - --repo_id "your-username/dataset-name" \ - --template classification - -# Add data with template validation -uv run scripts/dataset_manager.py add_rows \ - --repo_id "your-username/dataset-name" \ - --template qa \ - --rows_json "$(cat your_qa_data.json)" -``` - -### Template-Based Data Structures - -**1. 
Chat Template (`--template chat`)** -```json -{ - "messages": [ - {"role": "user", "content": "Natural user request"}, - {"role": "assistant", "content": "Response with tool usage"}, - {"role": "tool", "content": "Tool response", "tool_call_id": "call_123"} - ], - "scenario": "Description of use case", - "complexity": "simple|intermediate|advanced" -} -``` - -**2. Classification Template (`--template classification`)** -```json -{ - "text": "Input text to be classified", - "label": "classification_label", - "confidence": 0.95, - "metadata": {"domain": "technology", "language": "en"} -} -``` - -**3. QA Template (`--template qa`)** -```json -{ - "question": "What is the question being asked?", - "answer": "The complete answer", - "context": "Additional context if needed", - "answer_type": "factual|explanatory|opinion", - "difficulty": "easy|medium|hard" -} -``` - -**4. Completion Template (`--template completion`)** -```json -{ - "prompt": "The beginning text or context", - "completion": "The expected continuation", - "domain": "code|creative|technical|conversational", - "style": "description of writing style" -} -``` - -**5. Tabular Template (`--template tabular`)** -```json -{ - "columns": [ - {"name": "feature1", "type": "numeric", "description": "First feature"}, - {"name": "target", "type": "categorical", "description": "Target variable"} - ], - "data": [ - {"feature1": 123, "target": "class_a"}, - {"feature1": 456, "target": "class_b"} - ] -} -``` - -### Advanced System Prompt Template - -For high-quality training data generation: -```text -You are an AI assistant expert at using MCP tools effectively. 
- -## MCP SERVER DEFINITIONS -[Define available servers and tools] - -## TRAINING EXAMPLE STRUCTURE -[Specify exact JSON schema for chat templating] - -## QUALITY GUIDELINES -[Detail requirements for realistic scenarios, progressive complexity, proper tool usage] - -## EXAMPLE CATEGORIES -[List development workflows, debugging scenarios, data management tasks] -``` - -### Example Categories & Templates - -The skill includes diverse training examples beyond just MCP usage: - -**Available Example Sets:** -- `training_examples.json` - MCP tool usage examples (debugging, project setup, database analysis) -- `diverse_training_examples.json` - Broader scenarios including: - - **Educational Chat** - Explaining programming concepts, tutorials - - **Git Workflows** - Feature branches, version control guidance - - **Code Analysis** - Performance optimization, architecture review - - **Content Generation** - Professional writing, creative brainstorming - - **Codebase Navigation** - Legacy code exploration, systematic analysis - - **Conversational Support** - Problem-solving, technical discussions - -**Using Different Example Sets:** -```bash -# Add MCP-focused examples -uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \ - --rows_json "$(cat examples/training_examples.json)" - -# Add diverse conversational examples -uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \ - --rows_json "$(cat examples/diverse_training_examples.json)" - -# Mix both for comprehensive training data -uv run scripts/dataset_manager.py add_rows --repo_id "your-username/dataset-name" \ - --rows_json "$(jq -s '.[0] + .[1]' examples/training_examples.json examples/diverse_training_examples.json)" -``` - -### Commands Reference - -**List Available Templates:** -```bash -uv run scripts/dataset_manager.py list_templates -``` - -**Quick Setup (Recommended):** -```bash -uv run scripts/dataset_manager.py quick_setup --repo_id 
"your-username/dataset-name" --template classification -``` - -**Manual Setup:** -```bash -# Initialize repository -uv run scripts/dataset_manager.py init --repo_id "your-username/dataset-name" [--private] - -# Configure with system prompt -uv run scripts/dataset_manager.py config --repo_id "your-username/dataset-name" --system_prompt "Your prompt here" - -# Add data with validation -uv run scripts/dataset_manager.py add_rows \ - --repo_id "your-username/dataset-name" \ - --template qa \ - --rows_json '[{"question": "What is AI?", "answer": "Artificial Intelligence..."}]' -``` - -**View Dataset Statistics:** -```bash -uv run scripts/dataset_manager.py stats --repo_id "your-username/dataset-name" -``` - -### Error Handling -- **Repository exists**: Script will notify and continue with configuration -- **Invalid JSON**: Clear error message with parsing details -- **Network issues**: Automatic retry for transient failures -- **Token permissions**: Validation before operations begin - ---- - -# Combined Workflow Examples - -## Example 1: Create Training Subset from Existing Dataset -```bash -# 1. Explore the source dataset -uv run scripts/sql_manager.py describe --dataset "cais/mmlu" -uv run scripts/sql_manager.py histogram --dataset "cais/mmlu" --column "subject" - -# 2. 
Query and create subset -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --sql "SELECT * FROM data WHERE subject IN ('nutrition', 'anatomy', 'clinical_knowledge')" \ - --push-to "username/mmlu-medical-subset" \ - --private -``` - -## Example 2: Transform and Reshape Data -```bash -# Transform MMLU to QA format with correct answers extracted -# (answer is a 0-based index; DuckDB list indexing is 1-based, hence answer + 1) -uv run scripts/sql_manager.py query \ - --dataset "cais/mmlu" \ - --sql "SELECT question, choices[answer + 1] as correct_answer, subject FROM data" \ - --push-to "username/mmlu-qa-format" -``` - -## Example 3: Merge Multiple Dataset Splits -```bash -# Export multiple splits and combine -uv run scripts/sql_manager.py export \ - --dataset "cais/mmlu" \ - --split "*" \ - --output "mmlu_all.parquet" -``` - -## Example 4: Quality Filtering -```bash -# Filter for high-quality examples -uv run scripts/sql_manager.py query \ - --dataset "squad" \ - --sql "SELECT * FROM data WHERE LENGTH(context) > 500 AND LENGTH(question) > 20" \ - --push-to "username/squad-filtered" -``` - -## Example 5: Create Custom Training Dataset -```bash -# 1. Query source data -uv run scripts/sql_manager.py export \ - --dataset "cais/mmlu" \ - --sql "SELECT question, subject FROM data WHERE subject='nutrition'" \ - --output "nutrition_source.jsonl" \ - --format jsonl - -# 2. Process with your pipeline (add answers, format, etc.) - -# 3.
Push processed data -uv run scripts/dataset_manager.py init --repo_id "username/nutrition-training" -uv run scripts/dataset_manager.py add_rows \ - --repo_id "username/nutrition-training" \ - --template qa \ - --rows_json "$(cat processed_data.json)" -``` diff --git a/skills/hugging-face-datasets/examples/diverse_training_examples.json b/skills/hugging-face-datasets/examples/diverse_training_examples.json deleted file mode 100644 index b736bc43..00000000 --- a/skills/hugging-face-datasets/examples/diverse_training_examples.json +++ /dev/null @@ -1,239 +0,0 @@ -[ - { - "messages": [ - { - "role": "user", - "content": "Hi there! I'm working on a Python project and getting stuck with some async programming concepts. Can you help explain how to use async/await properly?" - }, - { - "role": "assistant", - "content": "I'd be happy to help you understand async/await in Python! These are powerful tools for writing concurrent code. Let me show you some practical examples and explain the key concepts.\n\n**Basic async/await syntax:**\n```python\nimport asyncio\n\nasync def fetch_data():\n # Simulate an async operation\n await asyncio.sleep(1)\n return \"Data fetched!\"\n\nasync def main():\n result = await fetch_data()\n print(result)\n\n# Run the async function\nasyncio.run(main())\n```\n\n**Key concepts:**\n- `async def` defines an asynchronous function (coroutine)\n- `await` pauses execution until the awaited operation completes\n- `asyncio.run()` runs the main async function\n\n**When to use async/await:**\n- I/O operations (file reads, network requests, database queries)\n- Operations that involve waiting\n- When you want to handle multiple tasks concurrently\n\nWould you like me to show you a more complex example with multiple concurrent operations, or do you have a specific use case you're working on?" 
- } - ], - "scenario": "Educational chat about Python async programming concepts", - "complexity": "simple", - "tools_used": [], - "learning_objectives": ["conversational teaching", "code explanation", "technical mentoring", "educational chat patterns"] - }, - { - "messages": [ - { - "role": "user", - "content": "I need to create a new feature branch for my project, make some changes, and then create a pull request. Can you walk me through the Git workflow?" - }, - { - "role": "assistant", - "content": "I'll help you with the complete Git workflow for creating a feature branch and pull request. Let me first check your current Git status and then guide you through each step.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "git_status", - "arguments": {} - } - } - ] - }, - { - "role": "tool", - "content": "On branch main\nYour branch is up to date with 'origin/main'.\n\nnothing to commit, working tree clean", - "tool_call_id": "call_301" - }, - { - "role": "assistant", - "content": "Perfect! Your working tree is clean and you're on the main branch. Now let's create your feature branch and walk through the workflow.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "git_branch", - "arguments": { - "branch_name": "feature/user-authentication", - "create": true - } - } - } - ] - }, - { - "role": "tool", - "content": "Switched to a new branch 'feature/user-authentication'", - "tool_call_id": "call_302" - }, - { - "role": "assistant", - "content": "Great! I've created and switched to your new feature branch. 
Here's the complete workflow:\n\n**✅ Step 1: Created feature branch** (Done)\n```bash\ngit checkout -b feature/user-authentication\n```\n\n**Next steps for you:**\n\n**Step 2: Make your changes**\n- Edit your files as needed\n- Add your new feature code\n\n**Step 3: Stage and commit changes**\n```bash\ngit add .\ngit commit -m \"Add user authentication feature\"\n```\n\n**Step 4: Push branch to remote**\n```bash\ngit push -u origin feature/user-authentication\n```\n\n**Step 5: Create Pull Request**\n- Go to your repository on GitHub/GitLab\n- Click \"Create Pull Request\" or \"Merge Request\"\n- Fill in description and request review\n\nWould you like me to help you with any of these steps once you've made your changes?" - } - ], - "scenario": "Git workflow guidance for feature development", - "complexity": "intermediate", - "tools_used": ["git_status", "git_branch"], - "learning_objectives": ["Git workflow patterns", "branch management", "collaborative development", "step-by-step guidance"] - }, - { - "messages": [ - { - "role": "user", - "content": "I have a Python function that's running really slowly. Can you help me analyze the code and suggest optimizations?" - }, - { - "role": "assistant", - "content": "I'd be happy to help optimize your Python code! Let me analyze the function to identify potential performance bottlenecks and suggest improvements.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "find_functions", - "arguments": { - "pattern": "def.*process.*" - } - } - } - ] - }, - { - "role": "tool", - "content": "Found functions:\n- def process_user_data(users_list): line 45 in utils.py\n- def process_payments(payment_records): line 78 in billing.py", - "tool_call_id": "call_401" - }, - { - "role": "assistant", - "content": "I found some processing functions. 
Let me examine the code to identify performance issues.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "read_file", - "arguments": { - "path": "utils.py", - "start_line": 45, - "end_line": 60 - } - } - } - ] - }, - { - "role": "tool", - "content": "def process_user_data(users_list):\n results = []\n for user in users_list:\n # Multiple database calls in loop - PERFORMANCE ISSUE!\n profile = db.get_user_profile(user.id)\n settings = db.get_user_settings(user.id)\n preferences = db.get_user_preferences(user.id)\n \n # String concatenation in loop - PERFORMANCE ISSUE!\n summary = \"\"\n for field in user.fields:\n summary += f\"{field.name}: {field.value}\\n\"\n \n results.append({\n 'user': user,\n 'profile': profile,\n 'settings': settings,\n 'preferences': preferences,\n 'summary': summary\n })\n return results", - "tool_call_id": "call_402" - }, - { - "role": "assistant", - "content": "I've identified several performance issues in your `process_user_data` function. Here's my analysis and optimization suggestions:\n\n**🔍 Performance Issues Found:**\n\n1. **N+1 Database Query Problem** (Lines 48-50)\n - Making 3 separate DB calls per user in a loop\n - This scales linearly with user count (very slow!)\n\n2. 
**String Concatenation in Loop** (Lines 52-55)\n - Using `+=` for string building is inefficient\n - Creates new string objects each iteration\n\n**⚡ Optimization Recommendations:**\n\n```python\ndef process_user_data_optimized(users_list):\n # Fix 1: Batch database queries\n user_ids = [user.id for user in users_list]\n profiles = db.get_user_profiles_batch(user_ids) # Single query\n settings = db.get_user_settings_batch(user_ids) # Single query \n preferences = db.get_user_preferences_batch(user_ids) # Single query\n \n results = []\n for user in users_list:\n # Fix 2: Use join() for string concatenation\n summary = \"\\n\".join(f\"{field.name}: {field.value}\" \n for field in user.fields)\n \n results.append({\n 'user': user,\n 'profile': profiles.get(user.id),\n 'settings': settings.get(user.id), \n 'preferences': preferences.get(user.id),\n 'summary': summary\n })\n return results\n```\n\n**Expected Performance Gain:** 10-100x faster depending on user count!\n\nWould you like me to help implement the batch database queries or analyze any other functions?" - } - ], - "scenario": "Code performance analysis and optimization guidance", - "complexity": "advanced", - "tools_used": ["find_functions", "read_file"], - "learning_objectives": ["performance optimization", "code analysis", "database optimization patterns", "Python best practices"] - }, - { - "messages": [ - { - "role": "user", - "content": "I need to write a professional email to a client explaining a project delay. Can you help me draft something that's apologetic but confident?" - }, - { - "role": "assistant", - "content": "I'll help you draft a professional email that strikes the right balance between taking responsibility and maintaining client confidence. Here's a template that addresses the delay while focusing on solutions:\n\n**Subject:** Project Update - Revised Timeline for [Project Name]\n\n**Email Draft:**\n\nDear [Client Name],\n\nI hope this email finds you well. 
I'm writing to provide you with an important update regarding the [Project Name] timeline.\n\nAfter a thorough review of our current progress, I need to inform you that we will require an additional [X weeks/days] to complete the project to our mutual quality standards. This adjustment is primarily due to [brief, honest reason - e.g., \"unexpected technical complexities in the integration phase\" or \"additional testing requirements to ensure optimal performance\"].\n\nI want to be completely transparent about this delay and take full responsibility. However, I'm confident that this additional time will allow us to deliver a solution that exceeds your expectations and provides long-term value.\n\n**Here's our revised timeline:**\n- [Key milestone 1]: [New date]\n- [Key milestone 2]: [New date] \n- Final delivery: [New date]\n\nTo ensure we stay on track, I've implemented [specific action - e.g., \"daily progress reviews\" or \"additional resources to the development team\"].\n\nI understand this may impact your planning, and I'm available to discuss how we can minimize any inconvenience. Would you be available for a brief call this week to address any concerns?\n\nThank you for your patience and continued trust in our partnership.\n\nBest regards,\n[Your name]\n\n**Key elements this email includes:**\n- Direct, honest communication\n- Takes responsibility without excessive apologizing\n- Provides specific reasons and solutions\n- Shows proactive management\n- Offers further discussion\n\nWould you like me to adjust the tone or focus on any particular aspects?" - } - ], - "scenario": "Professional communication assistance and content generation", - "complexity": "simple", - "tools_used": [], - "learning_objectives": ["professional writing", "client communication", "content generation", "business correspondence patterns"] - }, - { - "messages": [ - { - "role": "user", - "content": "I'm trying to understand this legacy codebase I inherited. 
There are so many files and I don't know where to start. Can you help me map out the structure?" - }, - { - "role": "assistant", - "content": "I completely understand - navigating a new codebase can be overwhelming! Let me help you systematically explore and understand the structure. I'll start by getting an overview and then drill down into the important parts.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "list_directory", - "arguments": { - "path": "." - } - } - } - ] - }, - { - "role": "tool", - "content": "README.md\npackage.json\nsrc/\ntest/\nconfig/\ndocs/\n.env.example\n.gitignore\nDockerfile\nnode_modules/", - "tool_call_id": "call_501" - }, - { - "role": "assistant", - "content": "Great! This looks like a Node.js project with a well-organized structure. Let me examine the key areas to give you a roadmap.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "read_file", - "arguments": { - "path": "package.json" - } - } - } - ] - }, - { - "role": "tool", - "content": "{\n \"name\": \"legacy-api-service\",\n \"version\": \"2.3.1\",\n \"description\": \"Customer management API with billing integration\",\n \"main\": \"src/app.js\",\n \"scripts\": {\n \"start\": \"node src/app.js\",\n \"dev\": \"nodemon src/app.js\",\n \"test\": \"jest test/\"\n },\n \"dependencies\": {\n \"express\": \"^4.18.0\",\n \"mongoose\": \"^6.2.0\",\n \"stripe\": \"^8.0.0\"\n }\n}", - "tool_call_id": "call_502" - }, - { - "role": "assistant", - "content": "Now let me explore the source directory to understand the application structure.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "list_directory", - "arguments": { - "path": "src" - } - } - } - ] - }, - { - "role": "tool", - "content": "app.js\nroutes/\nmodels/\nservices/\nmiddleware/\nutils/\ncontrollers/", - "tool_call_id": "call_503" - }, - { - "role": "assistant", - "content": "Perfect! Now I have a clear picture. 
Here's your **Codebase Navigation Guide**:\n\n## 📋 Project Overview\n**Type:** Customer Management API with billing (Node.js/Express)\n**Main Entry:** `src/app.js`\n**Database:** MongoDB (Mongoose)\n**Payment:** Stripe integration\n\n## 🗺️ Architecture Map\n\n**Start Here:**\n1. 📖 `README.md` - Project documentation\n2. ⚙️ `src/app.js` - Application entry point\n3. 🛣️ `src/routes/` - API endpoints definition\n\n**Core Application Logic:**\n- `src/controllers/` - Request handling logic\n- `src/models/` - Database schemas (MongoDB)\n- `src/services/` - Business logic layer\n- `src/middleware/` - Authentication, validation, etc.\n\n**Supporting Code:**\n- `src/utils/` - Helper functions\n- `config/` - Configuration files\n- `test/` - Unit/integration tests\n\n## 🎯 Recommended Exploration Order:\n\n**Phase 1 - Foundation (Start here!):**\n1. Read `README.md` for setup instructions\n2. Check `src/app.js` to see how everything connects\n3. Browse `src/routes/` to understand available APIs\n\n**Phase 2 - Business Logic:**\n4. Explore `src/controllers/` for main functionality\n5. Review `src/models/` for data structure\n6. Study `src/services/` for business rules\n\n**Phase 3 - Details:**\n7. Check `src/middleware/` for security/validation\n8. Look at `config/` for environment setup\n9. Review `test/` to understand expected behavior\n\nWould you like me to dive deeper into any specific area, like showing you the main routes or examining the data models?" - } - ], - "scenario": "Codebase exploration and architecture understanding", - "complexity": "intermediate", - "tools_used": ["list_directory", "read_file"], - "learning_objectives": ["codebase navigation", "architecture analysis", "systematic exploration", "legacy code understanding"] - }, - { - "messages": [ - { - "role": "user", - "content": "Can you help me brainstorm some creative names for my new productivity app? It's focused on time blocking and helps people focus on deep work." 
- }, - { - "role": "assistant", - "content": "I'd love to help you brainstorm creative names for your time blocking and deep work app! Let me generate some ideas across different themes and styles.\n\n## 🎯 Focus & Deep Work Theme\n- **FlowBlocks** - Combines time blocking with flow state\n- **DeepDive** - Simple, suggests deep work immersion\n- **FocusForge** - Creating focused time periods\n- **ZenBlocks** - Peaceful, mindful time management\n- **MindVault** - Protecting mental space for deep work\n\n## ⏰ Time & Productivity Theme\n- **ChronoFlow** - Time management with smooth workflow\n- **TimeWeaver** - Weaving together productive time blocks\n- **ClockCraft** - Skillfully crafting your schedule\n- **TempoLock** - Locking in productive tempos\n- **PulseBlocks** - Rhythmic time blocking\n\n## 🏗️ Building & Structure Theme\n- **TaskArchitect** - Building your productive day\n- **FocusFoundry** - Where focused time is created\n- **ProductivityBricks** - Building blocks of productivity\n- **WorkStack** - Stacking focused work sessions\n- **MindGrid** - Organized mental workspace\n\n## 🎨 Creative & Unique\n- **Concentrix** - Focus + matrix of time\n- **Quietude** - Emphasizes calm, focused environment\n- **DeepSpace** - Space for deep work (tech feel)\n- **FlowChamber** - Protected space for flow state\n- **ThinkTank** - Tank of focused thinking time\n\n## 💡 Short & Catchy\n- **Blokr** - Modern, minimal spelling\n- **Focux** - Focus + flux\n- **Zenith** - Peak productivity\n- **Prism** - Breaking time into focused segments\n- **Vault** - Simple, suggests protection/security\n\n**My top recommendations:**\n1. **FlowBlocks** - Perfect blend of your core features\n2. **DeepVault** - Strong, memorable, suggests protection of deep work\n3. **FocusForge** - Active, suggests creating something valuable\n\nDo any of these resonate with your vision? Would you like me to explore variations of your favorites or brainstorm in a different direction?" 
- } - ], - "scenario": "Creative brainstorming and idea generation assistance", - "complexity": "simple", - "tools_used": [], - "learning_objectives": ["creative thinking", "brainstorming techniques", "branding assistance", "idea generation patterns"] - } -] \ No newline at end of file diff --git a/skills/hugging-face-datasets/examples/system_prompt_template.txt b/skills/hugging-face-datasets/examples/system_prompt_template.txt deleted file mode 100644 index 2a71c944..00000000 --- a/skills/hugging-face-datasets/examples/system_prompt_template.txt +++ /dev/null @@ -1,196 +0,0 @@ -You are an AI assistant expert at using MCP (Model Context Protocol) tools effectively. You have access to various MCP servers and tools that allow you to interact with external systems, databases, APIs, and services. - -## MCP SERVER DEFINITIONS - -### Available MCP Servers and Tools: - -**1. Filesystem Server (mcp-server-filesystem)** -- `read_file`: Read file contents -- `write_file`: Create or overwrite files -- `edit_file`: Make precise edits to existing files -- `list_directory`: List directory contents -- `create_directory`: Create new directories -- `move_file`: Move/rename files -- `delete_file`: Remove files -- `search_files`: Search for files by name pattern - -**2. Git Server (mcp-server-git)** -- `git_status`: Check repository status -- `git_log`: View commit history -- `git_diff`: Show changes between commits -- `git_commit`: Create commits -- `git_branch`: Manage branches -- `git_push`: Push changes to remote -- `git_pull`: Pull changes from remote - -**3. Database Server (mcp-server-sqlite)** -- `execute_query`: Run SQL queries -- `describe_table`: Get table schema -- `list_tables`: Show all tables -- `create_table`: Create new tables -- `insert_data`: Add records -- `backup_database`: Create database backup - -**4. 
Web Server (mcp-server-web)** -- `fetch_url`: Get web page content -- `post_request`: Send POST requests -- `search_web`: Search the internet -- `download_file`: Download files from URLs - -**5. Code Analysis Server (mcp-server-code)** -- `analyze_code`: Static code analysis -- `find_functions`: Locate function definitions -- `get_dependencies`: Extract imports/dependencies -- `format_code`: Auto-format source code -- `run_tests`: Execute test suites - -## TRAINING EXAMPLE STRUCTURE - -Generate training examples using this exact structure: - -```json -{ - "messages": [ - { - "role": "user", - "content": "[Natural user request describing a real problem]" - }, - { - "role": "assistant", - "content": "[Conversational response with embedded MCP tool usage]", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "[mcp_tool_name]", - "arguments": { - "[param1]": "[value1]", - "[param2]": "[value2]" - } - } - } - ] - }, - { - "role": "tool", - "content": "[Simulated tool response/output]", - "tool_call_id": "call_123" - }, - { - "role": "assistant", - "content": "[Follow-up response analyzing results and next steps]" - } - ], - "scenario": "[Brief description of the use case]", - "complexity": "[simple|intermediate|advanced]", - "tools_used": ["[tool1]", "[tool2]"], - "learning_objectives": ["[objective1]", "[objective2]"] -} -``` - -## QUALITY GUIDELINES - -**1. Realistic Scenarios**: Create examples based on actual developer workflows: -- Debugging application errors -- Setting up new projects -- Code refactoring and optimization -- Database management tasks -- API integration challenges -- Version control operations -- File organization and cleanup - -**2. Progressive Complexity**: -- **Simple**: Single tool usage for straightforward tasks -- **Intermediate**: Multiple tools working together -- **Advanced**: Complex workflows with error handling and iteration - -**3. 
Tool Usage Patterns**: -- Always explain tool selection rationale -- Show proper parameter usage -- Demonstrate error handling -- Include realistic tool outputs -- Show iterative problem-solving - -**4. Conversational Quality**: -- Natural language flow -- User context awareness -- Helpful explanations without being verbose -- Professional but approachable tone -- Clear next steps and follow-up options - -**5. Technical Accuracy**: -- Correct MCP tool syntax -- Realistic file paths and data -- Valid code snippets -- Proper error messages -- Authentic command outputs - -## EXAMPLE CATEGORIES TO COVER - -**Development Workflows**: -- Project setup and initialization -- Code review and quality checks -- Dependency management -- Build and deployment processes - -**Git & Version Control**: -- Feature branch workflows -- Merge conflict resolution -- Repository management -- Collaborative development patterns - -**Code Analysis & Optimization**: -- Performance profiling and optimization -- Code quality assessment -- Refactoring guidance -- Architecture analysis - -**Debugging & Troubleshooting**: -- Error investigation -- Log analysis -- Performance profiling -- Configuration issues - -**Data Management**: -- Database queries and updates -- File processing and transformation -- Backup and migration tasks -- Data validation and cleanup - -**Integration Tasks**: -- API testing and integration -- External service connections -- Webhook setup and testing -- Authentication implementation - -**Educational & Mentoring**: -- Concept explanations -- Technical tutorials -- Best practices guidance -- Learning path recommendations - -**Content Generation**: -- Professional writing assistance -- Documentation creation -- Creative brainstorming -- Technical communication - -**Conversational Support**: -- Problem-solving discussions -- Project guidance -- Technical Q&A -- Collaborative planning - -**Codebase Navigation**: -- Legacy code exploration -- Architecture understanding -- 
Documentation analysis -- Systematic code review - -**Maintenance & Operations**: -- Code refactoring -- Documentation updates -- Security audits -- System monitoring - -Generate examples that demonstrate sophisticated MCP usage while maintaining natural conversational flow and practical applicability. \ No newline at end of file diff --git a/skills/hugging-face-datasets/examples/training_examples.json b/skills/hugging-face-datasets/examples/training_examples.json deleted file mode 100644 index 99db376d..00000000 --- a/skills/hugging-face-datasets/examples/training_examples.json +++ /dev/null @@ -1,176 +0,0 @@ -[ - { - "messages": [ - { - "role": "user", - "content": "My Node.js app is crashing on startup. Can you help me debug this? The error message mentions something about a missing config file." - }, - { - "role": "assistant", - "content": "I'll help you debug the startup crash. Let me first check your application structure and then look at the logs to understand the missing config file issue.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "list_directory", - "arguments": { - "path": "." - } - } - } - ] - }, - { - "role": "tool", - "content": "package.json\nserver.js\nsrc/\nnode_modules/\nREADME.md\nlogs/", - "tool_call_id": "call_001" - }, - { - "role": "assistant", - "content": "I can see your project structure. Now let me check the main server file to understand what config file it's expecting.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "read_file", - "arguments": { - "path": "server.js" - } - } - } - ] - }, - { - "role": "tool", - "content": "const express = require('express');\nconst config = require('./config/database.json');\nconst app = express();\n\napp.listen(config.port, () => {\n console.log(`Server running on port ${config.port}`);\n});", - "tool_call_id": "call_002" - }, - { - "role": "assistant", - "content": "Found the issue! 
Your server.js is trying to load './config/database.json' but I don't see a config directory. Let me check if it exists and create it if needed.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "list_directory", - "arguments": { - "path": "config" - } - } - } - ] - }, - { - "role": "tool", - "content": "Error: Directory 'config' does not exist", - "tool_call_id": "call_003" - }, - { - "role": "assistant", - "content": "Perfect! The config directory is missing. I'll create it and add a basic database configuration file to fix your startup crash.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "create_directory", - "arguments": { - "path": "config" - } - } - } - ] - }, - { - "role": "tool", - "content": "Directory 'config' created successfully", - "tool_call_id": "call_004" - }, - { - "role": "assistant", - "content": "Now I'll create the missing database.json configuration file with sensible defaults.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "write_file", - "arguments": { - "path": "config/database.json", - "content": "{\n \"port\": 3000,\n \"database\": {\n \"host\": \"localhost\",\n \"port\": 5432,\n \"name\": \"myapp\",\n \"user\": \"postgres\"\n }\n}" - } - } - } - ] - }, - { - "role": "tool", - "content": "File 'config/database.json' created successfully", - "tool_call_id": "call_005" - }, - { - "role": "assistant", - "content": "Great! I've fixed the missing config file issue. Your Node.js app should now start without crashing. The server will run on port 3000 by default. You may want to adjust the database connection settings in config/database.json to match your actual database setup." 
- } - ], - "scenario": "Debugging Node.js startup crash due to missing configuration file", - "complexity": "intermediate", - "tools_used": ["list_directory", "read_file", "create_directory", "write_file"], - "learning_objectives": ["systematic debugging approach", "file system navigation", "configuration file management", "error root cause analysis"] - }, - { - "messages": [ - { - "role": "user", - "content": "I need to set up a new React project with TypeScript and add some basic components. Can you help me get started?" - }, - { - "role": "assistant", - "content": "I'll help you set up a new React TypeScript project from scratch. Let me start by creating the project structure and configuration files.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "create_directory", - "arguments": { - "path": "my-react-app" - } - } - } - ] - }, - { - "role": "tool", - "content": "Directory 'my-react-app' created successfully", - "tool_call_id": "call_101" - }, - { - "role": "assistant", - "content": "Now I'll create the package.json with React and TypeScript dependencies.", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "write_file", - "arguments": { - "path": "my-react-app/package.json", - "content": "{\n \"name\": \"my-react-app\",\n \"version\": \"1.0.0\",\n \"scripts\": {\n \"dev\": \"vite\",\n \"build\": \"tsc && vite build\",\n \"preview\": \"vite preview\"\n },\n \"dependencies\": {\n \"react\": \"^18.2.0\",\n \"react-dom\": \"^18.2.0\"\n },\n \"devDependencies\": {\n \"@types/react\": \"^18.2.0\",\n \"@types/react-dom\": \"^18.2.0\",\n \"@vitejs/plugin-react\": \"^4.0.0\",\n \"typescript\": \"^5.0.0\",\n \"vite\": \"^4.3.0\"\n }\n}" - } - } - } - ] - }, - { - "role": "tool", - "content": "File 'my-react-app/package.json' created successfully", - "tool_call_id": "call_102" - }, - { - "role": "assistant", - "content": "Perfect! I've set up your React TypeScript project with proper configuration. 
The project includes package.json with all necessary dependencies, and a clean structure. You can now run `npm install` to install dependencies and start developing your components. Would you like me to create any specific components or add additional configuration?" - } - ], - "scenario": "Setting up new React TypeScript project from scratch", - "complexity": "simple", - "tools_used": ["create_directory", "write_file"], - "learning_objectives": ["project initialization", "React TypeScript setup", "configuration management", "directory structure planning"] - } -] \ No newline at end of file diff --git a/skills/hugging-face-datasets/scripts/dataset_manager.py b/skills/hugging-face-datasets/scripts/dataset_manager.py deleted file mode 100644 index 39127a6e..00000000 --- a/skills/hugging-face-datasets/scripts/dataset_manager.py +++ /dev/null @@ -1,522 +0,0 @@ -#!/usr/bin/env -S uv run -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "huggingface_hub>=0.20.0", -# ] -# /// -""" -Hugging Face Dataset Manager - -Enhanced dataset creation and management tool designed to work alongside -the HF MCP server. Provides dataset creation, configuration, and content -management capabilities optimized for conversational AI training data. 
- -Version: 2.0.0 - -Usage: - uv run dataset_manager.py init --repo_id username/dataset-name - uv run dataset_manager.py quick_setup --repo_id username/dataset-name --template chat - uv run dataset_manager.py add_rows --repo_id username/dataset-name --rows_json '[{"messages": [...]}]' - uv run dataset_manager.py stats --repo_id username/dataset-name - uv run dataset_manager.py list_templates -""" - -import os -import json -import time -import argparse -from pathlib import Path -from typing import List, Dict, Any, Optional -from huggingface_hub import HfApi, create_repo -from huggingface_hub.utils import HfHubHTTPError - -# Configuration -HF_TOKEN = os.environ.get("HF_TOKEN") -EXAMPLES_DIR = Path(__file__).parent.parent / "examples" - - -def init_dataset(repo_id, token=None, private=True): - """ - Initialize a new dataset repository on Hugging Face Hub. - """ - api = HfApi(token=token) - try: - create_repo(repo_id, repo_type="dataset", private=private, token=token) - print(f"Created dataset repository: {repo_id}") - except HfHubHTTPError as e: - if "409" in str(e): - print(f"Repository {repo_id} already exists.") - else: - raise e - - # Create a basic README.md with metadata if it doesn't exist - readme_content = f"""--- -license: mit ---- - -# {repo_id.split("/")[-1]} - -This dataset was created using the Claude Dataset Skill. -""" - try: - api.upload_file( - path_or_fileobj=readme_content.encode("utf-8"), - path_in_repo="README.md", - repo_id=repo_id, - repo_type="dataset", - commit_message="Initialize dataset README", - ) - except Exception as e: - print(f"Note: README might already exist or failed to update: {e}") - - -def define_config(repo_id, system_prompt=None, token=None): - """ - Define a configuration for the dataset, including a system prompt. - This saves a config.json file to the repository. 
- """ - api = HfApi(token=token) - - config_data = {"dataset_config": {"version": "1.0", "created_at": time.time()}} - - if system_prompt: - config_data["system_prompt"] = system_prompt - - # Upload config.json - api.upload_file( - path_or_fileobj=json.dumps(config_data, indent=2).encode("utf-8"), - path_in_repo="config.json", - repo_id=repo_id, - repo_type="dataset", - commit_message="Update dataset configuration", - ) - print(f"Configuration updated for {repo_id}") - - -def load_dataset_template(template_name: str) -> Dict[str, Any]: - """Load dataset template configuration from templates directory.""" - template_path = EXAMPLES_DIR.parent / "templates" / f"{template_name}.json" - if not template_path.exists(): - available_templates = [f.stem for f in (EXAMPLES_DIR.parent / "templates").glob("*.json")] - print(f"❌ Template '{template_name}' not found.") - print(f"Available templates: {', '.join(available_templates)}") - return {} - - with open(template_path) as f: - return json.load(f) - - -def validate_by_template(rows: List[Dict[str, Any]], template: Dict[str, Any]) -> bool: - """Validate data according to template schema.""" - if not template: - return False - - schema = template.get("validation_schema", {}) - required_fields = set(schema.get("required_fields", [])) - recommended_fields = set(schema.get("recommended_fields", [])) - field_types = schema.get("field_types", {}) - - for i, row in enumerate(rows): - # Check required fields - if not all(field in row for field in required_fields): - missing = required_fields - set(row.keys()) - print(f"Row {i}: Missing required fields: {missing}") - return False - - # Validate field types - for field, expected_type in field_types.items(): - if field in row: - if not _validate_field_type(row[field], expected_type, f"Row {i}, field '{field}'"): - return False - - # Template-specific validation - if template["type"] == "chat": - if not _validate_chat_format(row, i): - return False - elif template["type"] == 
"classification": - if not _validate_classification_format(row, i): - return False - elif template["type"] == "tabular": - if not _validate_tabular_format(row, i): - return False - - # Warn about missing recommended fields - missing_recommended = recommended_fields - set(row.keys()) - if missing_recommended: - print(f"Row {i}: Recommended to include: {missing_recommended}") - - print(f"✓ Validated {len(rows)} examples for {template['type']} dataset") - return True - - -def _validate_field_type(value: Any, expected_type: str, context: str) -> bool: - """Validate individual field type.""" - if expected_type.startswith("enum:"): - valid_values = expected_type[5:].split(",") - if value not in valid_values: - print(f"{context}: Invalid value '{value}'. Must be one of: {valid_values}") - return False - elif expected_type == "array" and not isinstance(value, list): - print(f"{context}: Expected array, got {type(value).__name__}") - return False - elif expected_type == "object" and not isinstance(value, dict): - print(f"{context}: Expected object, got {type(value).__name__}") - return False - elif expected_type == "string" and not isinstance(value, str): - print(f"{context}: Expected string, got {type(value).__name__}") - return False - elif expected_type == "number" and not isinstance(value, (int, float)): - print(f"{context}: Expected number, got {type(value).__name__}") - return False - - return True - - -def _validate_chat_format(row: Dict[str, Any], row_index: int) -> bool: - """Validate chat-specific format.""" - messages = row.get("messages", []) - if not isinstance(messages, list) or len(messages) == 0: - print(f"Row {row_index}: 'messages' must be a non-empty list") - return False - - valid_roles = {"user", "assistant", "tool", "system"} - for j, msg in enumerate(messages): - if not isinstance(msg, dict): - print(f"Row {row_index}, message {j}: Must be an object") - return False - if "role" not in msg or msg["role"] not in valid_roles: - print(f"Row {row_index}, 
message {j}: Invalid role. Use: {valid_roles}") - return False - if "content" not in msg: - print(f"Row {row_index}, message {j}: Missing 'content' field") - return False - - return True - - -def _validate_classification_format(row: Dict[str, Any], row_index: int) -> bool: - """Validate classification-specific format.""" - if "text" not in row: - print(f"Row {row_index}: Missing 'text' field") - return False - if "label" not in row: - print(f"Row {row_index}: Missing 'label' field") - return False - - return True - - -def _validate_tabular_format(row: Dict[str, Any], row_index: int) -> bool: - """Validate tabular-specific format.""" - if "data" not in row: - print(f"Row {row_index}: Missing 'data' field") - return False - if "columns" not in row: - print(f"Row {row_index}: Missing 'columns' field") - return False - - data = row["data"] - columns = row["columns"] - - if not isinstance(data, list): - print(f"Row {row_index}: 'data' must be an array") - return False - if not isinstance(columns, list): - print(f"Row {row_index}: 'columns' must be an array") - return False - - return True - - -def validate_training_data(rows: List[Dict[str, Any]], template_name: str = "chat") -> bool: - """ - Validate training data structure according to template. - Supports multiple dataset types with appropriate validation. 
- """ - template = load_dataset_template(template_name) - if not template: - print(f"❌ Could not load template '{template_name}', falling back to basic validation") - return _basic_validation(rows) - - return validate_by_template(rows, template) - - -def _basic_validation(rows: List[Dict[str, Any]]) -> bool: - """Basic validation when no template is available.""" - for i, row in enumerate(rows): - if not isinstance(row, dict): - print(f"Row {i}: Must be a dictionary/object") - return False - print(f"✓ Basic validation passed for {len(rows)} rows") - return True - - -def add_rows( - repo_id: str, - rows: List[Dict[str, Any]], - split: str = "train", - validate: bool = True, - template: str = "chat", - token: Optional[str] = None, -) -> None: - """ - Stream updates to the dataset by uploading a new chunk of rows. - Enhanced with validation for multiple dataset types. - - Args: - repo_id: Repository identifier (username/dataset-name) - rows: List of training examples - split: Dataset split name (train, test, validation) - validate: Whether to validate data structure before upload - template: Dataset template type (chat, classification, qa, completion, tabular, custom) - token: HuggingFace API token - """ - api = HfApi(token=token) - - if not rows: - print("No rows to add.") - return - - # Validate training data structure - if validate and not validate_training_data(rows, template): - print("❌ Validation failed. 
Use --no-validate to skip validation.") - return - - # Create a newline-delimited JSON string - jsonl_content = "\n".join(json.dumps(row) for row in rows) - - # Generate a unique filename for this chunk - timestamp = int(time.time() * 1000) - filename = f"data/{split}-{timestamp}.jsonl" - - try: - api.upload_file( - path_or_fileobj=jsonl_content.encode("utf-8"), - path_in_repo=filename, - repo_id=repo_id, - repo_type="dataset", - commit_message=f"Add {len(rows)} rows to {split} split", - ) - print(f"✅ Added {len(rows)} rows to {repo_id} (split: {split})") - except Exception as e: - print(f"❌ Upload failed: {e}") - return - - -def load_template(template_name: str = "system_prompt_template.txt") -> str: - """Load a template file from the examples directory.""" - template_path = EXAMPLES_DIR / template_name - if template_path.exists(): - return template_path.read_text() - else: - print(f"⚠️ Template {template_name} not found at {template_path}") - return "" - - -def quick_setup(repo_id: str, template_type: str = "chat", token: Optional[str] = None) -> None: - """ - Quick setup for different dataset types using templates. - - Args: - repo_id: Repository identifier - template_type: Dataset template (chat, classification, qa, completion, tabular, custom) - token: HuggingFace API token - """ - print(f"🚀 Quick setup for {repo_id} with '{template_type}' template...") - - # Load template configuration - template_config = load_dataset_template(template_type) - if not template_config: - print(f"❌ Could not load template '{template_type}'. 
Setup cancelled.") - return - - # Initialize repository - init_dataset(repo_id, token=token, private=True) - - # Configure with template system prompt - system_prompt = template_config.get("system_prompt", "") - if system_prompt: - define_config(repo_id, system_prompt=system_prompt, token=token) - - # Add template examples - examples = template_config.get("examples", []) - if examples: - add_rows(repo_id, examples, template=template_type, token=token) - print(f"✅ Added {len(examples)} example(s) from template") - - print(f"✅ Quick setup complete for {repo_id}") - print(f"📊 Dataset type: {template_config.get('description', 'No description')}") - - # Show next steps - print(f"\n📋 Next steps:") - print( - f"1. Add more data: uv run scripts/dataset_manager.py add_rows --repo_id {repo_id} --template {template_type} --rows_json 'your_data.json'" - ) - print(f"2. View stats: uv run scripts/dataset_manager.py stats --repo_id {repo_id}") - print(f"3. Explore at: https://huggingface.co/datasets/{repo_id}") - - -def show_stats(repo_id: str, token: Optional[str] = None) -> None: - """Display statistics about the dataset.""" - api = HfApi(token=token) - - try: - # Get repository info - repo_info = api.repo_info(repo_id, repo_type="dataset") - print(f"\n📊 Dataset Stats: {repo_id}") - print(f"Created: {repo_info.created_at}") - print(f"Updated: {repo_info.last_modified}") - print(f"Private: {repo_info.private}") - - # List files - files = api.list_repo_files(repo_id, repo_type="dataset") - data_files = [f for f in files if f.startswith("data/")] - print(f"Data files: {len(data_files)}") - - if "config.json" in files: - print("✅ Configuration present") - else: - print("⚠️ No configuration found") - - except Exception as e: - print(f"❌ Failed to get stats: {e}") - - -def list_available_templates() -> None: - """List all available dataset templates with descriptions.""" - templates_dir = EXAMPLES_DIR.parent / "templates" - - if not templates_dir.exists(): - print("❌ Templates 
directory not found") - return - - print("\n📋 Available Dataset Templates:") - print("=" * 50) - - for template_file in templates_dir.glob("*.json"): - try: - with open(template_file) as f: - template = json.load(f) - - name = template_file.stem - desc = template.get("description", "No description available") - template_type = template.get("type", name) - - print(f"\n🏷️ {name}") - print(f" Type: {template_type}") - print(f" Description: {desc}") - - # Show required fields - schema = template.get("validation_schema", {}) - required = schema.get("required_fields", []) - if required: - print(f" Required fields: {', '.join(required)}") - - except Exception as e: - print(f"❌ Error loading template {template_file.name}: {e}") - - print( - f"\n💡 Usage: uv run scripts/dataset_manager.py quick_setup --repo_id your-username/dataset-name --template TEMPLATE_NAME" - ) - print(f"📚 Example templates directory: {templates_dir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Hugging Face Dataset Manager") - subparsers = parser.add_subparsers(dest="command", required=True) - - # Init command - init_parser = subparsers.add_parser("init", help="Initialize a new dataset") - init_parser.add_argument("--repo_id", required=True, help="Repository ID (user/repo_name)") - init_parser.add_argument("--private", action="store_true", help="Make repository private") - - # Config command - config_parser = subparsers.add_parser("config", help="Setup dataset config") - config_parser.add_argument("--repo_id", required=True, help="Repository ID") - config_parser.add_argument("--system_prompt", help="System prompt to store in config") - - # Add rows command - add_parser = subparsers.add_parser("add_rows", help="Add rows to the dataset") - add_parser.add_argument("--repo_id", required=True, help="Repository ID") - add_parser.add_argument("--split", default="train", help="Dataset split (e.g., train, test)") - add_parser.add_argument( - "--template", - default="chat", - 
choices=[ - "chat", - "classification", - "qa", - "completion", - "tabular", - "custom", - ], - help="Dataset template type for validation", - ) - add_parser.add_argument( - "--rows_json", - required=True, - help="JSON string containing a list of rows", - ) - add_parser.add_argument( - "--no-validate", - dest="validate", - action="store_false", - help="Skip data validation", - ) - - # Quick setup command - setup_parser = subparsers.add_parser("quick_setup", help="Quick setup with template") - setup_parser.add_argument("--repo_id", required=True, help="Repository ID") - setup_parser.add_argument( - "--template", - default="chat", - choices=[ - "chat", - "classification", - "qa", - "completion", - "tabular", - "custom", - ], - help="Dataset template type", - ) - - # Stats command - stats_parser = subparsers.add_parser("stats", help="Show dataset statistics") - stats_parser.add_argument("--repo_id", required=True, help="Repository ID") - - # List templates command - templates_parser = subparsers.add_parser("list_templates", help="List available dataset templates") - - args = parser.parse_args() - - token = HF_TOKEN - if not token: - print("Warning: HF_TOKEN environment variable not set.") - - if args.command == "init": - init_dataset(args.repo_id, token=token, private=args.private) - elif args.command == "config": - define_config(args.repo_id, system_prompt=args.system_prompt, token=token) - elif args.command == "add_rows": - try: - rows = json.loads(args.rows_json) - if not isinstance(rows, list): - raise ValueError("rows_json must be a JSON list of objects") - add_rows( - args.repo_id, - rows, - split=args.split, - template=args.template, - validate=args.validate, - token=token, - ) - except json.JSONDecodeError: - print("Error: Invalid JSON provided for --rows_json") - elif args.command == "quick_setup": - quick_setup(args.repo_id, template_type=args.template, token=token) - elif args.command == "stats": - show_stats(args.repo_id, token=token) - elif args.command 
== "list_templates": - list_available_templates() diff --git a/skills/hugging-face-datasets/scripts/sql_manager.py b/skills/hugging-face-datasets/scripts/sql_manager.py deleted file mode 100644 index f63635e0..00000000 --- a/skills/hugging-face-datasets/scripts/sql_manager.py +++ /dev/null @@ -1,872 +0,0 @@ -#!/usr/bin/env -S uv run -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "duckdb>=1.0.0", -# "huggingface_hub>=0.20.0", -# "datasets>=2.14.0", -# "pandas>=2.0.0", -# ] -# /// -""" -Hugging Face Dataset SQL Manager - -Query, transform, and push Hugging Face datasets using DuckDB's SQL interface. -Supports the hf:// protocol for direct dataset access, data wrangling, and -pushing results back to the Hub. - -Version: 1.0.0 - -Usage: - # Query a dataset - uv run sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data LIMIT 10" - - # Query and push to new dataset - uv run sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data WHERE subject='nutrition'" \ - --push-to "username/nutrition-subset" - - # Describe dataset schema - uv run sql_manager.py describe --dataset "cais/mmlu" - - # List available splits/configs - uv run sql_manager.py info --dataset "cais/mmlu" - - # Get random sample - uv run sql_manager.py sample --dataset "cais/mmlu" --n 5 - - # Export to parquet - uv run sql_manager.py export --dataset "cais/mmlu" --output "data.parquet" -""" - -import os -import json -import re -import argparse -from typing import Optional, List, Dict, Any, Union - -import duckdb -from huggingface_hub import HfApi - -# Regex for valid SQL identifiers (column names, view names) -_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") - - -# Configuration -HF_TOKEN = os.environ.get("HF_TOKEN") - - -class HFDatasetSQL: - """ - Query Hugging Face datasets using DuckDB SQL. 
- - Examples: - >>> sql = HFDatasetSQL() - >>> results = sql.query("cais/mmlu", "SELECT * FROM data LIMIT 5") - >>> schema = sql.describe("cais/mmlu") - >>> sql.query_and_push("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition'", "user/nutrition-qa") - """ - - def __init__(self, token: Optional[str] = None): - """Initialize the SQL manager with optional HF token.""" - self.token = token or HF_TOKEN - self.conn = duckdb.connect() - self._setup_connection() - - @staticmethod - def _quote_identifier(name: str) -> str: - """Quote a SQL identifier, escaping embedded double-quotes.""" - return '"' + name.replace('"', '""') + '"' - - @staticmethod - def _validate_identifier(name: str) -> None: - """Raise ValueError if *name* is not a safe SQL identifier.""" - if not _IDENTIFIER_RE.match(name): - raise ValueError( - f"Invalid identifier: {name!r}. " - "Identifiers must start with a letter or underscore and contain only " - "alphanumeric characters and underscores." - ) - - def _setup_connection(self): - """Configure DuckDB connection for HF access.""" - # Set HF token if available (for private datasets) - if self.token: - self.conn.execute("CREATE SECRET hf_token (TYPE HUGGINGFACE, TOKEN $1);", [self.token]) - - def _build_hf_path( - self, dataset_id: str, split: str = "*", config: Optional[str] = None, revision: str = "~parquet" - ) -> str: - """ - Build the hf:// path for a dataset. 
- - Args: - dataset_id: Dataset ID (e.g., "cais/mmlu") - split: Split name or "*" for all splits - config: Optional config/subset name - revision: Revision, defaults to ~parquet for auto-converted parquet - - Returns: - hf:// path string - """ - if config: - return f"hf://datasets/{dataset_id}@{revision}/{config}/{split}/*.parquet" - else: - return f"hf://datasets/{dataset_id}@{revision}/default/{split}/*.parquet" - - def _build_hf_path_flexible( - self, - dataset_id: str, - split: Optional[str] = None, - config: Optional[str] = None, - ) -> str: - """ - Build flexible hf:// path with wildcards for discovery. - - Args: - dataset_id: Dataset ID - split: Optional specific split - config: Optional config name - - Returns: - hf:// path with appropriate wildcards - """ - base = f"hf://datasets/{dataset_id}@~parquet" - - if config and split: - return f"{base}/{config}/{split}/*.parquet" - elif config: - return f"{base}/{config}/*/*.parquet" - elif split: - return f"{base}/*/{split}/*.parquet" - else: - return f"{base}/*/*/*.parquet" - - def query( - self, - dataset_id: str, - sql: str, - split: str = "train", - config: Optional[str] = None, - limit: Optional[int] = None, - output_format: str = "dict", - ) -> Union[List[Dict], Any]: - """ - Execute SQL query on a Hugging Face dataset. - - Args: - dataset_id: Dataset ID (e.g., "cais/mmlu", "ibm/duorc") - sql: SQL query. 
Use 'data' as table name (will be replaced with actual path) - split: Dataset split (train, test, validation, or * for all) - config: Optional dataset config/subset - limit: Optional limit override - output_format: Output format - "dict", "df" (pandas), "arrow", "raw" - - Returns: - Query results in specified format - - Examples: - >>> sql.query("cais/mmlu", "SELECT * FROM data WHERE subject='nutrition' LIMIT 10") - >>> sql.query("cais/mmlu", "SELECT subject, COUNT(*) as cnt FROM data GROUP BY subject") - """ - # Build the HF path - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - # Replace 'data' placeholder with actual path - # Handle various SQL patterns - processed_sql = sql.replace("FROM data", f"FROM '{hf_path}'") - processed_sql = processed_sql.replace("from data", f"FROM '{hf_path}'") - processed_sql = processed_sql.replace("JOIN data", f"JOIN '{hf_path}'") - processed_sql = processed_sql.replace("join data", f"JOIN '{hf_path}'") - - # If user provides raw path, use as-is - if "hf://" in sql: - processed_sql = sql - - # Apply limit if specified and not already in query - if limit and "LIMIT" not in processed_sql.upper(): - processed_sql += f" LIMIT {limit}" - - try: - result = self.conn.execute(processed_sql) - - if output_format == "df": - return result.fetchdf() - elif output_format == "arrow": - return result.fetch_arrow_table() - elif output_format == "raw": - return result.fetchall() - else: # dict - columns = [desc[0] for desc in result.description] - rows = result.fetchall() - return [dict(zip(columns, row)) for row in rows] - - except Exception as e: - print(f"❌ Query error: {e}") - print(f" SQL: {processed_sql[:200]}...") - raise - - def query_raw(self, sql: str, output_format: str = "dict") -> Union[List[Dict], Any]: - """ - Execute raw SQL query without path substitution. - - Useful for queries that already contain full hf:// paths or for - multi-dataset queries. 
- - Args: - sql: Complete SQL query - output_format: Output format - - Returns: - Query results - """ - result = self.conn.execute(sql) - - if output_format == "df": - return result.fetchdf() - elif output_format == "arrow": - return result.fetch_arrow_table() - elif output_format == "raw": - return result.fetchall() - else: - columns = [desc[0] for desc in result.description] - rows = result.fetchall() - return [dict(zip(columns, row)) for row in rows] - - def describe(self, dataset_id: str, split: str = "train", config: Optional[str] = None) -> List[Dict[str, str]]: - """ - Get schema/structure of a dataset. - - Args: - dataset_id: Dataset ID - split: Dataset split - config: Optional config - - Returns: - List of column definitions with name, type, nullable info - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - sql = f"DESCRIBE SELECT * FROM '{hf_path}' LIMIT 1" - result = self.conn.execute(sql) - - columns = [desc[0] for desc in result.description] - rows = result.fetchall() - - return [dict(zip(columns, row)) for row in rows] - - def sample( - self, - dataset_id: str, - n: int = 10, - split: str = "train", - config: Optional[str] = None, - seed: Optional[int] = None, - ) -> List[Dict]: - """ - Get a random sample from a dataset. - - Args: - dataset_id: Dataset ID - n: Number of samples - split: Dataset split - config: Optional config - seed: Random seed for reproducibility - - Returns: - List of sampled rows - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - if seed is not None: - sql = f"SELECT * FROM '{hf_path}' USING SAMPLE {n} (RESERVOIR, {seed})" - else: - sql = f"SELECT * FROM '{hf_path}' USING SAMPLE {n}" - - return self.query_raw(sql) - - def count( - self, dataset_id: str, split: str = "train", config: Optional[str] = None, where: Optional[str] = None - ) -> int: - """ - Count rows in a dataset, optionally with filter. 
- - Args: - dataset_id: Dataset ID - split: Dataset split - config: Optional config - where: Optional WHERE clause (without WHERE keyword) - - Returns: - Row count - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - sql = f"SELECT COUNT(*) FROM '{hf_path}'" - if where: - sql += f" WHERE {where}" - - result = self.conn.execute(sql).fetchone() - return result[0] if result else 0 - - def unique_values( - self, dataset_id: str, column: str, split: str = "train", config: Optional[str] = None, limit: int = 100 - ) -> List[Any]: - """ - Get unique values in a column. - - Args: - dataset_id: Dataset ID - column: Column name - split: Dataset split - config: Optional config - limit: Max unique values to return - - Returns: - List of unique values - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - quoted_col = self._quote_identifier(column) - sql = f"SELECT DISTINCT {quoted_col} FROM '{hf_path}' LIMIT {limit}" - result = self.conn.execute(sql).fetchall() - - return [row[0] for row in result] - - def histogram( - self, dataset_id: str, column: str, split: str = "train", config: Optional[str] = None, bins: int = 10 - ) -> List[Dict]: - """ - Get value distribution/histogram for a column. 
- - Args: - dataset_id: Dataset ID - column: Column name - split: Dataset split - config: Optional config - bins: Number of bins for numeric columns - - Returns: - Distribution data - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - quoted_col = self._quote_identifier(column) - sql = f""" - SELECT - {quoted_col}, - COUNT(*) as count - FROM '{hf_path}' - GROUP BY {quoted_col} - ORDER BY count DESC - LIMIT {bins} - """ - - return self.query_raw(sql) - - def filter_and_transform( - self, - dataset_id: str, - select: str = "*", - where: Optional[str] = None, - group_by: Optional[str] = None, - order_by: Optional[str] = None, - split: str = "train", - config: Optional[str] = None, - limit: Optional[int] = None, - ) -> List[Dict]: - """ - Filter and transform dataset with SQL clauses. - - Args: - dataset_id: Dataset ID - select: SELECT clause (columns, expressions, aggregations) - where: WHERE clause (filter conditions) - group_by: GROUP BY clause - order_by: ORDER BY clause - split: Dataset split - config: Optional config - limit: Row limit - - Returns: - Transformed data - - Examples: - >>> sql.filter_and_transform( - ... "cais/mmlu", - ... select="subject, COUNT(*) as cnt", - ... group_by="subject", - ... order_by="cnt DESC", - ... limit=10 - ... 
) - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - sql_parts = [f"SELECT {select}", f"FROM '{hf_path}'"] - - if where: - sql_parts.append(f"WHERE {where}") - if group_by: - sql_parts.append(f"GROUP BY {group_by}") - if order_by: - sql_parts.append(f"ORDER BY {order_by}") - if limit: - sql_parts.append(f"LIMIT {limit}") - - sql = " ".join(sql_parts) - return self.query_raw(sql) - - def join_datasets( - self, - left_dataset: str, - right_dataset: str, - on: str, - select: str = "*", - join_type: str = "INNER", - left_split: str = "train", - right_split: str = "train", - left_config: Optional[str] = None, - right_config: Optional[str] = None, - limit: Optional[int] = None, - ) -> List[Dict]: - """ - Join two datasets. - - Args: - left_dataset: Left dataset ID - right_dataset: Right dataset ID - on: JOIN condition (e.g., "left.id = right.id") - select: SELECT clause - join_type: Type of join (INNER, LEFT, RIGHT, FULL) - left_split: Split for left dataset - right_split: Split for right dataset - left_config: Config for left dataset - right_config: Config for right dataset - limit: Row limit - - Returns: - Joined data - """ - left_path = self._build_hf_path(left_dataset, split=left_split, config=left_config) - right_path = self._build_hf_path(right_dataset, split=right_split, config=right_config) - - sql = f""" - SELECT {select} - FROM '{left_path}' AS left_table - {join_type} JOIN '{right_path}' AS right_table - ON {on} - """ - - if limit: - sql += f" LIMIT {limit}" - - return self.query_raw(sql) - - def export_to_parquet( - self, - dataset_id: str, - output_path: str, - sql: Optional[str] = None, - split: str = "train", - config: Optional[str] = None, - ) -> str: - """ - Export query results to a local Parquet file. 
- - Args: - dataset_id: Source dataset ID - output_path: Local path for output Parquet file - sql: Optional SQL query (uses SELECT * if not provided) - split: Dataset split - config: Optional config - - Returns: - Path to created file - """ - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - - if sql: - # Process the query - processed_sql = sql.replace("FROM data", f"FROM '{hf_path}'") - processed_sql = processed_sql.replace("from data", f"FROM '{hf_path}'") - else: - processed_sql = f"SELECT * FROM '{hf_path}'" - - if "'" in output_path: - raise ValueError(f"Invalid output path: paths must not contain single quotes") - export_sql = f"COPY ({processed_sql}) TO '{output_path}' (FORMAT PARQUET)" - self.conn.execute(export_sql) - - print(f"✅ Exported to {output_path}") - return output_path - - def export_to_jsonl( - self, - dataset_id: str, - output_path: str, - sql: Optional[str] = None, - split: str = "train", - config: Optional[str] = None, - ) -> str: - """ - Export query results to JSONL format. - - Args: - dataset_id: Source dataset ID - output_path: Local path for output JSONL file - sql: Optional SQL query - split: Dataset split - config: Optional config - - Returns: - Path to created file - """ - results = self.query(dataset_id, sql or "SELECT * FROM data", split=split, config=config) - - with open(output_path, "w") as f: - for row in results: - f.write(json.dumps(row) + "\n") - - print(f"✅ Exported {len(results)} rows to {output_path}") - return output_path - - def push_to_hub( - self, - dataset_id: str, - target_repo: str, - sql: Optional[str] = None, - split: str = "train", - config: Optional[str] = None, - target_split: str = "train", - private: bool = True, - commit_message: Optional[str] = None, - ) -> str: - """ - Query a dataset and push results to a new Hub repository. 
- - Args: - dataset_id: Source dataset ID - target_repo: Target repository ID (e.g., "username/new-dataset") - sql: SQL query to transform data (optional, defaults to SELECT *) - split: Source split - config: Source config - target_split: Target split name - private: Whether to create private repo - commit_message: Commit message - - Returns: - URL of created dataset - """ - try: - from datasets import Dataset - except ImportError: - raise ImportError( - "datasets library required for push_to_hub. " - "Run with `uv run ...` or install with `uv pip install datasets`." - ) - - # Execute query - results = self.query(dataset_id, sql or "SELECT * FROM data", split=split, config=config) - - if not results: - print("❌ No results to push") - return "" - - # Convert to HF Dataset - ds = Dataset.from_list(results) - - # Push to Hub - ds.push_to_hub( - target_repo, - split=target_split, - private=private, - commit_message=commit_message or f"Created from {dataset_id} via SQL query", - token=self.token, - ) - - url = f"https://huggingface.co/datasets/{target_repo}" - print(f"✅ Pushed {len(results)} rows to {url}") - return url - - def create_view(self, name: str, dataset_id: str, split: str = "train", config: Optional[str] = None): - """ - Create a DuckDB view for easier querying. - - Args: - name: View name - dataset_id: Dataset ID - split: Dataset split - config: Optional config - """ - self._validate_identifier(name) - hf_path = self._build_hf_path(dataset_id, split=split, config=config) - quoted_name = self._quote_identifier(name) - self.conn.execute(f"CREATE OR REPLACE VIEW {quoted_name} AS SELECT * FROM '{hf_path}'") - print(f"✅ Created view '{name}' for {dataset_id}") - - def info(self, dataset_id: str) -> Dict[str, Any]: - """ - Get information about a dataset including available configs and splits. 
- - Args: - dataset_id: Dataset ID - - Returns: - Dataset information - """ - api = HfApi(token=self.token) - - try: - info = api.dataset_info(dataset_id) - - result = { - "id": info.id, - "author": info.author, - "private": info.private, - "downloads": info.downloads, - "likes": info.likes, - "tags": info.tags, - "created_at": str(info.created_at) if info.created_at else None, - "last_modified": str(info.last_modified) if info.last_modified else None, - } - - # Try to get config/split info from card data - if info.card_data: - result["configs"] = getattr(info.card_data, "configs", None) - - return result - - except Exception as e: - print(f"❌ Failed to get info: {e}") - return {} - - def close(self): - """Close the database connection.""" - self.conn.close() - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser( - description="Query Hugging Face datasets with SQL", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Query dataset with SQL - python sql_manager.py query --dataset "cais/mmlu" --sql "SELECT * FROM data WHERE subject='nutrition' LIMIT 10" - - # Get random sample - python sql_manager.py sample --dataset "cais/mmlu" --n 5 - - # Describe schema - python sql_manager.py describe --dataset "cais/mmlu" - - # Get value counts - python sql_manager.py histogram --dataset "cais/mmlu" --column "subject" - - # Filter and transform - python sql_manager.py transform --dataset "cais/mmlu" \\ - --select "subject, COUNT(*) as cnt" \\ - --group-by "subject" \\ - --order-by "cnt DESC" - - # Query and push to Hub - python sql_manager.py query --dataset "cais/mmlu" \\ - --sql "SELECT * FROM data WHERE subject='nutrition'" \\ - --push-to "username/nutrition-subset" - - # Export to Parquet - python sql_manager.py export --dataset "cais/mmlu" \\ - --sql "SELECT * FROM data WHERE subject='nutrition'" \\ - --output "nutrition.parquet" - """, - ) - - subparsers = parser.add_subparsers(dest="command", required=True) - - # 
Common arguments - def add_common_args(p): - p.add_argument("--dataset", "-d", required=True, help="Dataset ID (e.g., cais/mmlu)") - p.add_argument("--split", "-s", default="train", help="Dataset split (default: train)") - p.add_argument("--config", "-c", help="Dataset config/subset") - - # Query command - query_parser = subparsers.add_parser("query", help="Execute SQL query on dataset") - add_common_args(query_parser) - query_parser.add_argument("--sql", required=True, help="SQL query (use 'data' as table name)") - query_parser.add_argument("--limit", "-l", type=int, help="Limit results") - query_parser.add_argument("--format", choices=["json", "table", "csv"], default="json", help="Output format") - query_parser.add_argument("--push-to", help="Push results to this Hub repo") - query_parser.add_argument("--private", action="store_true", help="Make pushed repo private") - - # Sample command - sample_parser = subparsers.add_parser("sample", help="Get random sample from dataset") - add_common_args(sample_parser) - sample_parser.add_argument("--n", type=int, default=10, help="Number of samples") - sample_parser.add_argument("--seed", type=int, help="Random seed") - - # Describe command - describe_parser = subparsers.add_parser("describe", help="Get dataset schema") - add_common_args(describe_parser) - - # Count command - count_parser = subparsers.add_parser("count", help="Count rows in dataset") - add_common_args(count_parser) - count_parser.add_argument("--where", "-w", help="WHERE clause for filtering") - - # Histogram command - histogram_parser = subparsers.add_parser("histogram", help="Get value distribution") - add_common_args(histogram_parser) - histogram_parser.add_argument("--column", required=True, help="Column name") - histogram_parser.add_argument("--bins", type=int, default=20, help="Number of bins") - - # Unique command - unique_parser = subparsers.add_parser("unique", help="Get unique values in column") - add_common_args(unique_parser) - 
unique_parser.add_argument("--column", required=True, help="Column name") - unique_parser.add_argument("--limit", "-l", type=int, default=100, help="Max values") - - # Transform command - transform_parser = subparsers.add_parser("transform", help="Filter and transform dataset") - add_common_args(transform_parser) - transform_parser.add_argument("--select", default="*", help="SELECT clause") - transform_parser.add_argument("--where", "-w", help="WHERE clause") - transform_parser.add_argument("--group-by", help="GROUP BY clause") - transform_parser.add_argument("--order-by", help="ORDER BY clause") - transform_parser.add_argument("--limit", "-l", type=int, help="LIMIT") - transform_parser.add_argument("--push-to", help="Push results to Hub repo") - - # Export command - export_parser = subparsers.add_parser("export", help="Export query results to file") - add_common_args(export_parser) - export_parser.add_argument("--sql", help="SQL query (defaults to SELECT *)") - export_parser.add_argument("--output", "-o", required=True, help="Output file path") - export_parser.add_argument("--format", choices=["parquet", "jsonl"], default="parquet", help="Output format") - - # Info command - info_parser = subparsers.add_parser("info", help="Get dataset information") - info_parser.add_argument("--dataset", "-d", required=True, help="Dataset ID") - - # Raw SQL command - raw_parser = subparsers.add_parser("raw", help="Execute raw SQL with full hf:// paths") - raw_parser.add_argument("--sql", required=True, help="Complete SQL query") - raw_parser.add_argument("--format", choices=["json", "table", "csv"], default="json", help="Output format") - - args = parser.parse_args() - - # Initialize SQL manager - sql = HFDatasetSQL() - - try: - if args.command == "query": - results = sql.query(args.dataset, args.sql, split=args.split, config=args.config, limit=args.limit) - - if getattr(args, "push_to", None): - sql.push_to_hub( - args.dataset, args.push_to, sql=args.sql, split=args.split, 
config=args.config, private=args.private - ) - else: - _print_results(results, args.format) - - elif args.command == "sample": - results = sql.sample(args.dataset, n=args.n, split=args.split, config=args.config, seed=args.seed) - _print_results(results, "json") - - elif args.command == "describe": - schema = sql.describe(args.dataset, split=args.split, config=args.config) - _print_results(schema, "table") - - elif args.command == "count": - count = sql.count(args.dataset, split=args.split, config=args.config, where=args.where) - print(f"Count: {count:,}") - - elif args.command == "histogram": - results = sql.histogram(args.dataset, args.column, split=args.split, config=args.config, bins=args.bins) - _print_results(results, "table") - - elif args.command == "unique": - values = sql.unique_values( - args.dataset, args.column, split=args.split, config=args.config, limit=args.limit - ) - for v in values: - print(v) - - elif args.command == "transform": - results = sql.filter_and_transform( - args.dataset, - select=args.select, - where=args.where, - group_by=args.group_by, - order_by=args.order_by, - split=args.split, - config=args.config, - limit=args.limit, - ) - - if getattr(args, "push_to", None): - # Build SQL for push - query_sql = f"SELECT {args.select} FROM data" - if args.where: - query_sql += f" WHERE {args.where}" - if args.group_by: - query_sql += f" GROUP BY {args.group_by}" - if args.order_by: - query_sql += f" ORDER BY {args.order_by}" - if args.limit: - query_sql += f" LIMIT {args.limit}" - - sql.push_to_hub(args.dataset, args.push_to, sql=query_sql, split=args.split, config=args.config) - else: - _print_results(results, "json") - - elif args.command == "export": - if args.format == "parquet": - sql.export_to_parquet(args.dataset, args.output, sql=args.sql, split=args.split, config=args.config) - else: - sql.export_to_jsonl(args.dataset, args.output, sql=args.sql, split=args.split, config=args.config) - - elif args.command == "info": - info = 
sql.info(args.dataset) - _print_results([info], "json") - - elif args.command == "raw": - results = sql.query_raw(args.sql) - _print_results(results, args.format) - - finally: - sql.close() - - -def _print_results(results: List[Dict], format: str): - """Print results in specified format.""" - if not results: - print("No results") - return - - if format == "json": - print(json.dumps(results, indent=2, default=str)) - - elif format == "csv": - if results: - keys = results[0].keys() - print(",".join(str(k) for k in keys)) - for row in results: - print(",".join(str(row.get(k, "")) for k in keys)) - - elif format == "table": - if results: - keys = list(results[0].keys()) - # Calculate column widths - widths = {k: max(len(str(k)), max(len(str(r.get(k, ""))) for r in results)) for k in keys} - - # Header - header = " | ".join(str(k).ljust(widths[k]) for k in keys) - print(header) - print("-" * len(header)) - - # Rows - for row in results: - print(" | ".join(str(row.get(k, "")).ljust(widths[k]) for k in keys)) - - -if __name__ == "__main__": - main() diff --git a/skills/hugging-face-datasets/templates/chat.json b/skills/hugging-face-datasets/templates/chat.json deleted file mode 100644 index 4ee645c1..00000000 --- a/skills/hugging-face-datasets/templates/chat.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "type": "chat", - "description": "Conversational AI training data with chat templating support", - "system_prompt": "You are an AI assistant expert at using tools and engaging in helpful conversations. 
Generate training examples that demonstrate natural dialogue patterns, tool usage, and problem-solving interactions.", - "validation_schema": { - "required_fields": ["messages"], - "recommended_fields": ["scenario", "complexity", "tools_used", "learning_objectives"], - "field_types": { - "messages": "array", - "scenario": "string", - "complexity": "enum:simple,intermediate,advanced", - "tools_used": "array", - "learning_objectives": "array" - } - }, - "example_structure": { - "messages": [ - { - "role": "user|assistant|tool|system", - "content": "Message content", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "tool_name", - "arguments": {"param": "value"} - } - } - ] - } - ], - "scenario": "Brief description of the use case", - "complexity": "simple|intermediate|advanced", - "tools_used": ["tool1", "tool2"], - "learning_objectives": ["objective1", "objective2"] - }, - "examples": [ - { - "messages": [ - { - "role": "user", - "content": "Hello! Can you help me understand how to use APIs?" - }, - { - "role": "assistant", - "content": "I'd be happy to help you understand APIs! An API (Application Programming Interface) is like a bridge that allows different software applications to communicate with each other.\n\nThink of it like ordering at a restaurant:\n- You (the client) look at the menu (API documentation)\n- You tell the waiter (the API) what you want\n- The waiter takes your order to the kitchen (the server)\n- The kitchen prepares your food (processes the request)\n- The waiter brings your food back (returns the response)\n\nWould you like me to show you a simple example of making an API call?" 
- } - ], - "scenario": "API education and explanation", - "complexity": "simple", - "tools_used": [], - "learning_objectives": ["API concepts", "educational conversation", "analogical explanation"] - } - ] -} \ No newline at end of file diff --git a/skills/hugging-face-datasets/templates/classification.json b/skills/hugging-face-datasets/templates/classification.json deleted file mode 100644 index 09569144..00000000 --- a/skills/hugging-face-datasets/templates/classification.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "type": "classification", - "description": "Text classification datasets for sentiment analysis, topic classification, intent detection, etc.", - "system_prompt": "Create high-quality text classification training data. Each example should have clear, unambiguous text and appropriate labels. Ensure balanced representation across all classes and include diverse language patterns.", - "validation_schema": { - "required_fields": ["text", "label"], - "recommended_fields": ["confidence", "metadata", "source"], - "field_types": { - "text": "string", - "label": "string|array", - "confidence": "number", - "metadata": "object", - "source": "string" - } - }, - "example_structure": { - "text": "Input text to be classified", - "label": "classification_label", - "confidence": 0.95, - "metadata": { - "length": 42, - "language": "en", - "domain": "technology" - }, - "source": "web_scraping" - }, - "examples": [ - { - "text": "I absolutely love this new smartphone! The camera quality is outstanding and the battery lasts all day.", - "label": "positive", - "confidence": 0.98, - "metadata": { - "length": 98, - "language": "en", - "domain": "product_reviews" - }, - "source": "customer_feedback" - }, - { - "text": "The software keeps crashing and customer support is unresponsive. 
Very disappointed with this purchase.", - "label": "negative", - "confidence": 0.92, - "metadata": { - "length": 96, - "language": "en", - "domain": "product_reviews" - }, - "source": "customer_feedback" - }, - { - "text": "Book a table for two at 7 PM tonight", - "label": "restaurant_reservation", - "confidence": 0.95, - "metadata": { - "length": 35, - "language": "en", - "domain": "intent_detection" - }, - "source": "voice_assistant" - } - ] -} \ No newline at end of file diff --git a/skills/hugging-face-datasets/templates/completion.json b/skills/hugging-face-datasets/templates/completion.json deleted file mode 100644 index ef931fd3..00000000 --- a/skills/hugging-face-datasets/templates/completion.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "type": "completion", - "description": "Text completion and generation datasets for language modeling, code completion, creative writing", - "system_prompt": "Generate diverse text completion examples that demonstrate natural language patterns, creativity, and contextual understanding. 
Include various domains like code, creative writing, technical documentation, and conversational text.", - "validation_schema": { - "required_fields": ["prompt", "completion"], - "recommended_fields": ["domain", "style", "length", "difficulty"], - "field_types": { - "prompt": "string", - "completion": "string", - "domain": "enum:code,creative,technical,conversational,academic", - "style": "string", - "length": "enum:short,medium,long", - "difficulty": "enum:beginner,intermediate,advanced" - } - }, - "example_structure": { - "prompt": "The beginning text or context", - "completion": "The expected continuation or completion", - "domain": "code|creative|technical|conversational|academic", - "style": "description of writing style", - "length": "short|medium|long", - "difficulty": "beginner|intermediate|advanced" - }, - "examples": [ - { - "prompt": "def fibonacci(n):\n \"\"\"Calculate the nth Fibonacci number.\"\"\"\n if n <= 1:\n return n\n else:", - "completion": " return fibonacci(n-1) + fibonacci(n-2)", - "domain": "code", - "style": "python_function", - "length": "short", - "difficulty": "beginner" - }, - { - "prompt": "The old lighthouse stood silently on the cliff, its beacon long extinguished. Sarah approached slowly, wondering", - "completion": "if the stories her grandmother had told her were true. As she pushed open the weathered door, a gust of salt air carried whispers of the past, and she could almost see the ghostly figure of the lighthouse keeper who had vanished mysteriously fifty years ago.", - "domain": "creative", - "style": "narrative_fiction", - "length": "medium", - "difficulty": "intermediate" - }, - { - "prompt": "To configure SSL/TLS encryption for your web server, you'll need to:", - "completion": "1. Obtain an SSL certificate from a trusted Certificate Authority (CA)\n2. Install the certificate on your web server\n3. Configure your server to use HTTPS (port 443)\n4. Update your application to redirect HTTP traffic to HTTPS\n5. 
Test the implementation using SSL testing tools\n\nMake sure to backup your server configuration before making changes.", - "domain": "technical", - "style": "instructional", - "length": "medium", - "difficulty": "intermediate" - } - ] -} \ No newline at end of file diff --git a/skills/hugging-face-datasets/templates/custom.json b/skills/hugging-face-datasets/templates/custom.json deleted file mode 100644 index 5ad1f096..00000000 --- a/skills/hugging-face-datasets/templates/custom.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "type": "custom", - "description": "Flexible template for custom dataset formats - define your own schema and validation rules", - "system_prompt": "Generate data according to the custom schema provided. Follow the field definitions, data types, and validation rules specified in the schema configuration.", - "validation_schema": { - "required_fields": ["data"], - "recommended_fields": ["schema", "metadata"], - "field_types": { - "data": "any", - "schema": "object", - "metadata": "object" - } - }, - "example_structure": { - "data": "Your custom data structure goes here", - "schema": { - "fields": [ - { - "name": "field_name", - "type": "string|number|boolean|array|object", - "required": true, - "description": "Field description" - } - ], - "validation_rules": { - "custom_rule_1": "validation logic", - "custom_rule_2": "additional validation" - } - }, - "metadata": { - "created_by": "user", - "purpose": "dataset purpose", - "version": "1.0" - } - }, - "examples": [ - { - "data": { - "id": "unique_identifier", - "features": { - "text_feature": "sample text", - "numeric_features": [1.0, 2.5, -0.3], - "categorical_feature": "category_a" - }, - "labels": { - "primary_label": "positive", - "confidence_score": 0.87, - "secondary_labels": ["helpful", "informative"] - }, - "annotations": { - "annotator_id": "expert_1", - "annotation_time": "2024-01-15T10:30:00Z", - "quality_score": 9 - } - }, - "schema": { - "fields": [ - {"name": "id", "type": "string", 
"required": true, "description": "Unique identifier"}, - {"name": "features", "type": "object", "required": true, "description": "Input features"}, - {"name": "labels", "type": "object", "required": true, "description": "Target labels"}, - {"name": "annotations", "type": "object", "required": false, "description": "Annotation metadata"} - ], - "validation_rules": { - "id_format": "must be non-empty string", - "confidence_range": "must be between 0 and 1" - } - }, - "metadata": { - "created_by": "data_scientist", - "purpose": "multi_modal_classification", - "version": "1.0" - } - } - ] -} \ No newline at end of file diff --git a/skills/hugging-face-datasets/templates/qa.json b/skills/hugging-face-datasets/templates/qa.json deleted file mode 100644 index 9abba144..00000000 --- a/skills/hugging-face-datasets/templates/qa.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "type": "qa", - "description": "Question-answering datasets for reading comprehension, knowledge QA, and factual question answering", - "system_prompt": "Generate high-quality question-answer pairs with clear, informative answers. Include context when needed and ensure questions are well-formed and answerable. 
Cover diverse topics and question types.", - "validation_schema": { - "required_fields": ["question", "answer"], - "recommended_fields": ["context", "answer_type", "difficulty", "topic", "source"], - "field_types": { - "question": "string", - "answer": "string|array", - "context": "string", - "answer_type": "enum:factual,explanatory,opinion,yes_no,multiple_choice", - "difficulty": "enum:easy,medium,hard", - "topic": "string", - "source": "string" - } - }, - "example_structure": { - "question": "What is the question being asked?", - "answer": "The complete answer to the question", - "context": "Additional context or passage (if needed)", - "answer_type": "factual|explanatory|opinion|yes_no|multiple_choice", - "difficulty": "easy|medium|hard", - "topic": "subject_area", - "source": "where_this_came_from" - }, - "examples": [ - { - "question": "What is the capital of France?", - "answer": "Paris", - "answer_type": "factual", - "difficulty": "easy", - "topic": "geography", - "source": "general_knowledge" - }, - { - "question": "How does photosynthesis work?", - "answer": "Photosynthesis is the process by which plants use sunlight, carbon dioxide, and water to produce glucose and oxygen. Chlorophyll in plant cells captures light energy, which drives chemical reactions that convert CO2 and H2O into C6H12O6 (glucose) and O2. This process occurs primarily in the chloroplasts of leaf cells.", - "answer_type": "explanatory", - "difficulty": "medium", - "topic": "biology", - "source": "educational_content" - }, - { - "question": "Based on the passage, what was the main cause of the economic downturn?", - "answer": "According to the passage, the main cause was the sudden drop in consumer confidence following the bank failures, which led to reduced spending and business investment.", - "context": "The economic downturn of 2008 began when several major banks failed due to subprime mortgage losses. 
This triggered a crisis of confidence among consumers and businesses, leading to decreased spending and investment. The ripple effects spread throughout the global economy.", - "answer_type": "explanatory", - "difficulty": "medium", - "topic": "economics", - "source": "reading_comprehension" - } - ] -} \ No newline at end of file diff --git a/skills/hugging-face-datasets/templates/tabular.json b/skills/hugging-face-datasets/templates/tabular.json deleted file mode 100644 index 00793f77..00000000 --- a/skills/hugging-face-datasets/templates/tabular.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "type": "tabular", - "description": "Structured tabular data for regression, classification, or analysis tasks", - "system_prompt": "Generate structured tabular data with clear column definitions and appropriate data types. Ensure realistic value ranges and relationships between variables. Include proper metadata for each column.", - "validation_schema": { - "required_fields": ["data", "columns"], - "recommended_fields": ["target", "metadata", "description"], - "field_types": { - "data": "array", - "columns": "array", - "target": "string", - "metadata": "object", - "description": "string" - } - }, - "example_structure": { - "columns": [ - { - "name": "column_name", - "type": "numeric|categorical|text|datetime", - "description": "What this column represents", - "nullable": true - } - ], - "data": [ - {"column1": "value1", "column2": "value2"}, - {"column1": "value3", "column2": "value4"} - ], - "target": "target_column_name", - "metadata": { - "rows": 1000, - "features": 10, - "task": "classification|regression|clustering" - }, - "description": "Description of the dataset purpose" - }, - "examples": [ - { - "columns": [ - { - "name": "age", - "type": "numeric", - "description": "Customer age in years", - "nullable": false - }, - { - "name": "income", - "type": "numeric", - "description": "Annual income in USD", - "nullable": false - }, - { - "name": "education", - "type": 
"categorical", - "description": "Education level", - "nullable": false - }, - { - "name": "purchased", - "type": "categorical", - "description": "Whether customer made a purchase", - "nullable": false - } - ], - "data": [ - {"age": 25, "income": 45000, "education": "bachelor", "purchased": "yes"}, - {"age": 34, "income": 67000, "education": "master", "purchased": "yes"}, - {"age": 19, "income": 23000, "education": "high_school", "purchased": "no"}, - {"age": 42, "income": 85000, "education": "bachelor", "purchased": "yes"}, - {"age": 28, "income": 52000, "education": "bachelor", "purchased": "no"} - ], - "target": "purchased", - "metadata": { - "rows": 5, - "features": 3, - "task": "classification" - }, - "description": "Customer purchase prediction based on demographics" - } - ] -} \ No newline at end of file diff --git a/skills/hugging-face-tool-builder/SKILL.md b/skills/hugging-face-tool-builder/SKILL.md deleted file mode 100644 index ffe3983a..00000000 --- a/skills/hugging-face-tool-builder/SKILL.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -name: hugging-face-tool-builder -description: Use this skill when the user wants to build tool/scripts or achieve a task where using data from the Hugging Face API would help. This is especially useful when chaining or combining API calls or the task will be repeated/automated. This Skill creates a reusable script to fetch, enrich or process data. ---- - -# Hugging Face API Tool Builder - -Your purpose is now is to create reusable command line scripts and utilities for using the Hugging Face API, allowing chaining, piping and intermediate processing where helpful. You can access the API directly, as well as use the `hf` command line tool. Model and Dataset cards can be accessed from repositories directly. 
- -## Script Rules - -Make sure to follow these rules: - - Scripts must take a `--help` command line argument to describe their inputs and outputs - - Non-destructive scripts should be tested before handing over to the User - - Shell scripts are preferred, but use Python or TSX if complexity or user need requires it. - - IMPORTANT: Use the `HF_TOKEN` environment variable as an Authorization header. For example: `curl -H "Authorization: Bearer ${HF_TOKEN}" https://huggingface.co/api/`. This provides higher rate limits and appropriate authorization for data access. - - Investigate the shape of the API results before commiting to a final design; make use of piping and chaining where composability would be an advantage - prefer simple solutions where possible. - - Share usage examples once complete. - -Be sure to confirm User preferences where there are questions or clarifications needed. - -## Sample Scripts - -Paths below are relative to this skill directory. - -Reference examples: -- `references/hf_model_papers_auth.sh` — uses `HF_TOKEN` automatically and chains trending → model metadata → model card parsing with fallbacks; it demonstrates multi-step API usage plus auth hygiene for gated/private content. -- `references/find_models_by_paper.sh` — optional `HF_TOKEN` usage via `--token`, consistent authenticated search, and a retry path when arXiv-prefixed searches are too narrow; it shows resilient query strategy and clear user-facing help. -- `references/hf_model_card_frontmatter.sh` — uses the `hf` CLI to download model cards, extracts YAML frontmatter, and emits NDJSON summaries (license, pipeline tag, tags, gated prompt flag) for easy filtering. 
- -Baseline examples (ultra-simple, minimal logic, raw JSON output with `HF_TOKEN` header): -- `references/baseline_hf_api.sh` — bash -- `references/baseline_hf_api.py` — python -- `references/baseline_hf_api.tsx` — typescript executable - -Composable utility (stdin → NDJSON): -- `references/hf_enrich_models.sh` — reads model IDs from stdin, fetches metadata per ID, emits one JSON object per line for streaming pipelines. - -Composability through piping (shell-friendly JSON output): -- `references/baseline_hf_api.sh 25 | jq -r '.[].id' | references/hf_enrich_models.sh | jq -s 'sort_by(.downloads) | reverse | .[:10]'` -- `references/baseline_hf_api.sh 50 | jq '[.[] | {id, downloads}] | sort_by(.downloads) | reverse | .[:10]'` -- `printf '%s\n' openai/gpt-oss-120b meta-llama/Meta-Llama-3.1-8B | references/hf_model_card_frontmatter.sh | jq -s 'map({id, license, has_extra_gated_prompt})'` - -## High Level Endpoints - -The following are the main API endpoints available at `https://huggingface.co` - -``` -/api/datasets -/api/models -/api/spaces -/api/collections -/api/daily_papers -/api/notifications -/api/settings -/api/whoami-v2 -/api/trending -/oauth/userinfo -``` - -## Accessing the API - -The API is documented with the OpenAPI standard at `https://huggingface.co/.well-known/openapi.json`. - -**IMPORTANT:** DO NOT ATTEMPT to read `https://huggingface.co/.well-known/openapi.json` directly as it is too large to process. - -**IMPORTANT** Use `jq` to query and extract relevant parts. For example, - - Command to Get All 160 Endpoints - -```bash -curl -s "https://huggingface.co/.well-known/openapi.json" | jq '.paths | keys | sort' -``` - -Model Search Endpoint Details - -```bash -curl -s "https://huggingface.co/.well-known/openapi.json" | jq '.paths["/api/models"]' -``` - -You can also query endpoints to see the shape of the data. When doing so constrain results to low numbers to make them easy to process, yet representative. 
- -## Using the HF command line tool - -The `hf` command line tool gives you further access to Hugging Face repository content and infrastructure. - -```bash -❯ hf --help -Usage: hf [OPTIONS] COMMAND [ARGS]... - - Hugging Face Hub CLI - -Options: - --help Show this message and exit. - -Commands: - auth Manage authentication (login, logout, etc.). - buckets Commands to interact with buckets. - cache Manage local cache directory. - collections Interact with collections on the Hub. - datasets Interact with datasets on the Hub. - discussions Manage discussions and pull requests on the Hub. - download Download files from the Hub. - endpoints Manage Hugging Face Inference Endpoints. - env Print information about the environment. - extensions Manage hf CLI extensions. - jobs Run and manage Jobs on the Hub. - models Interact with models on the Hub. - papers Interact with papers on the Hub. - repos Manage repos on the Hub. - skills Manage skills for AI assistants. - spaces Interact with spaces on the Hub. - sync Sync files between local directory and a bucket. - upload Upload a file or a folder to the Hub. - upload-large-folder Upload a large folder to the Hub. - version Print information about the hf version. - webhooks Manage webhooks on the Hub. -``` - -The `hf` CLI command has replaced the now deprecated `huggingface-cli` command. diff --git a/skills/hugging-face-tool-builder/references/baseline_hf_api.py b/skills/hugging-face-tool-builder/references/baseline_hf_api.py deleted file mode 100644 index fa5b9bdf..00000000 --- a/skills/hugging-face-tool-builder/references/baseline_hf_api.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -""" -Ultra-simple Hugging Face API example (Python). - -Fetches a small list of models from the HF API and prints raw JSON. -Uses HF_TOKEN for auth if the environment variable is set. 
-""" - -from __future__ import annotations - -import os -import sys -import urllib.request - - -def show_help() -> None: - print( - """Ultra-simple Hugging Face API example (Python) - -Usage: - baseline_hf_api.py [limit] - baseline_hf_api.py --help - -Description: - Fetches a small list of models from the HF API and prints raw JSON. - Uses HF_TOKEN for auth if the environment variable is set. - -Examples: - baseline_hf_api.py - baseline_hf_api.py 5 - HF_TOKEN=your_token baseline_hf_api.py 10 -""" - ) - - -def main() -> int: - if len(sys.argv) > 1 and sys.argv[1] == "--help": - show_help() - return 0 - - limit = sys.argv[1] if len(sys.argv) > 1 else "3" - if not limit.isdigit(): - print("Error: limit must be a number", file=sys.stderr) - return 1 - - token = os.getenv("HF_TOKEN") - headers = {"Authorization": f"Bearer {token}"} if token else {} - url = f"https://huggingface.co/api/models?limit={limit}" - - req = urllib.request.Request(url, headers=headers) - with urllib.request.urlopen(req) as resp: - sys.stdout.write(resp.read().decode("utf-8")) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/skills/hugging-face-tool-builder/references/baseline_hf_api.sh b/skills/hugging-face-tool-builder/references/baseline_hf_api.sh deleted file mode 100644 index 2d4d5f28..00000000 --- a/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -show_help() { - cat << EOF -Ultra-simple Hugging Face API example (Shell) - -Usage: - $0 [limit] - $0 --help - -Description: - Fetches a small list of models from the HF API and prints raw JSON. - Uses HF_TOKEN for auth if the environment variable is set. - -Examples: - $0 - $0 5 - HF_TOKEN=your_token $0 10 -EOF -} - -if [[ "${1:-}" == "--help" ]]; then - show_help - exit 0 -fi - -LIMIT="${1:-3}" -if ! 
[[ "$LIMIT" =~ ^[0-9]+$ ]]; then - echo "Error: limit must be a number" >&2 - exit 1 -fi - -headers=() -if [[ -n "${HF_TOKEN:-}" ]]; then - headers=(-H "Authorization: Bearer ${HF_TOKEN}") -fi - -curl -s "${headers[@]}" "https://huggingface.co/api/models?limit=${LIMIT}" diff --git a/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx b/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx deleted file mode 100644 index 3f273718..00000000 --- a/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env tsx - -/** - * Ultra-simple Hugging Face API example (TSX). - * - * Fetches a small list of models from the HF API and prints raw JSON. - * Uses HF_TOKEN for auth if the environment variable is set. - */ - -const showHelp = () => { - console.log(`Ultra-simple Hugging Face API example (TSX) - -Usage: - baseline_hf_api.tsx [limit] - baseline_hf_api.tsx --help - -Description: - Fetches a small list of models from the HF API and prints raw JSON. - Uses HF_TOKEN for auth if the environment variable is set. - -Examples: - baseline_hf_api.tsx - baseline_hf_api.tsx 5 - HF_TOKEN=your_token baseline_hf_api.tsx 10 -`); -}; - -const arg = process.argv[2]; -if (arg === "--help") { - showHelp(); - process.exit(0); -} - -const limit = arg ?? "3"; -if (!/^\d+$/.test(limit)) { - console.error("Error: limit must be a number"); - process.exit(1); -} - -const token = process.env.HF_TOKEN; -const headers: Record = token - ? 
{ Authorization: `Bearer ${token}` } - : {}; - -const url = `https://huggingface.co/api/models?limit=${limit}`; - -(async () => { - const res = await fetch(url, { headers }); - - if (!res.ok) { - console.error(`Error: ${res.status} ${res.statusText}`); - process.exit(1); - } - - const text = await res.text(); - process.stdout.write(text); -})(); diff --git a/skills/hugging-face-tool-builder/references/find_models_by_paper.sh b/skills/hugging-face-tool-builder/references/find_models_by_paper.sh deleted file mode 100644 index 93e923a9..00000000 --- a/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +++ /dev/null @@ -1,230 +0,0 @@ -#!/bin/bash - -# Find models associated with papers on Hugging Face -# Usage: ./find_models_by_paper.sh [arXiv_id|search_term] -# Optional: Set HF_TOKEN environment variable for private/gated models - -set -e - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# Help function -show_help() { - echo -e "${BLUE}Find models associated with papers on Hugging Face${NC}" - echo "" - echo -e "${YELLOW}Usage:${NC}" - echo " $0 [OPTIONS] [search_term|arXiv_id]" - echo "" - echo -e "${YELLOW}Options:${NC}" - echo " --help Show this help message" - echo " --token Use HF_TOKEN environment variable (if set)" - echo "" - echo -e "${YELLOW}Environment:${NC}" - echo " HF_TOKEN Optional: Hugging Face token for private/gated models" - echo "" - echo -e "${YELLOW}Examples:${NC}" - echo " $0 1910.01108 # Search by arXiv ID" - echo " $0 distilbert # Search by model name" - echo " $0 transformer # Search by keyword" - echo " HF_TOKEN=your_token $0 1910.01108 # Use authentication" - echo "" - echo -e "${YELLOW}Description:${NC}" - echo "This script finds Hugging Face models that are associated with research papers." - echo "It searches for models that have arXiv IDs in their tags or mentions papers in their metadata." 
- echo "" - echo -e "${YELLOW}Notes:${NC}" - echo "• HF_TOKEN is optional for public models" - echo "• Use HF_TOKEN for private repositories or gated models" - echo "• HF_TOKEN enables higher rate limits for heavy usage" -} - -# Parse arguments -USE_TOKEN=false -POSITIONAL_ARGS=() - -while [[ $# -gt 0 ]]; do - case $1 in - --help) - show_help - exit 0 - ;; - --token) - USE_TOKEN=true - shift - ;; - -*) - echo -e "${RED}Unknown option: $1${NC}" - show_help - exit 1 - ;; - *) - POSITIONAL_ARGS+=("$1") - shift - ;; - esac -done - -set -- "${POSITIONAL_ARGS[@]}" - -if [[ $# -eq 0 ]]; then - echo -e "${RED}Error: Please provide a search term or arXiv ID${NC}" - echo -e "Use ${YELLOW}$0 --help${NC} for usage information" - exit 1 -fi - -SEARCH_TERM="$1" - -# Set up authentication header if HF_TOKEN is available -if [[ -n "$HF_TOKEN" ]] && [[ "$USE_TOKEN" == true || -n "$HF_TOKEN" ]]; then - AUTH_HEADER="-H \"Authorization: Bearer $HF_TOKEN\"" - echo -e "${BLUE}Using HF_TOKEN for authentication${NC}" -else - AUTH_HEADER="" - if [[ -n "$HF_TOKEN" ]]; then - echo -e "${YELLOW}HF_TOKEN found but not using it (add --token flag to use)${NC}" - fi -fi - -# Check if the input looks like an arXiv ID (format: YYYY.NNNNN or YYYY.NNNNNNN) -if [[ "$SEARCH_TERM" =~ ^[0-9]{4}\.[0-9]{4,7}$ ]]; then - echo -e "${BLUE}Searching for models associated with arXiv paper: $SEARCH_TERM${NC}" - SEARCH_QUERY="arxiv%3A$SEARCH_TERM" - IS_ARXIV_SEARCH=true -else - echo -e "${BLUE}Searching for models related to: $SEARCH_TERM${NC}" - SEARCH_QUERY="$SEARCH_TERM" - IS_ARXIV_SEARCH=false -fi - -# Function to extract arXiv IDs from tags -extract_arxiv_ids() { - local tags="$1" - echo "$tags" | jq -r '.[] | select(. 
| startswith("arxiv:")) | split(":")[1]' 2>/dev/null || true -} - -# Function to get paper title from arXiv ID -get_paper_title() { - local arxiv_id="$1" - # Try to get paper title from Hugging Face tags if available - # This is a simplified approach - in practice, you might want to call arXiv API - echo "Paper Title (arXiv:$arxiv_id)" -} - -# Search for models -API_URL="https://huggingface.co/api/models" -echo -e "${YELLOW}Searching Hugging Face API...${NC}" - -# Build curl command with authentication if available -CURL_CMD="curl -s $AUTH_HEADER \"$API_URL?search=$SEARCH_QUERY&limit=50\"" -echo -e "${BLUE}API Query: $API_URL?search=$SEARCH_QUERY&limit=50${NC}" - -# Execute the API call -if [[ -n "$HF_TOKEN" ]]; then - RESPONSE=$(curl -s -H "Authorization: Bearer $HF_TOKEN" "$API_URL?search=$SEARCH_QUERY&limit=50" || true) -else - RESPONSE=$(curl -s "$API_URL?search=$SEARCH_QUERY&limit=50" || true) -fi - -# Check if we got a valid response -if [[ -z "$RESPONSE" ]] || [[ "$RESPONSE" == "[]" ]]; then - echo -e "${RED}No models found for search term: $SEARCH_TERM${NC}" - - # If arXiv search failed, try without arxiv: prefix - if [[ "$IS_ARXIV_SEARCH" == true ]]; then - echo -e "${YELLOW}Trying broader search without arxiv: prefix...${NC}" - SEARCH_QUERY="$SEARCH_TERM" - IS_ARXIV_SEARCH=false - - if [[ -n "$HF_TOKEN" ]]; then - RESPONSE=$(curl -s -H "Authorization: Bearer $HF_TOKEN" "$API_URL?search=$SEARCH_QUERY&limit=50" || true) - else - RESPONSE=$(curl -s "$API_URL?search=$SEARCH_QUERY&limit=50" || true) - fi - - if [[ -z "$RESPONSE" ]] || [[ "$RESPONSE" == "[]" ]]; then - echo -e "${RED}Still no results found. Try a different search term.${NC}" - exit 1 - fi - else - exit 1 - fi -fi - -# Process the results -echo -e "${GREEN}Found models! 
Processing results...${NC}" - -# Use jq to process the JSON response and find models with paper associations -MODELS_WITH_PAPERS=$(echo "$RESPONSE" | jq -r ' - .[] | - select(.id != null) | - { - id: .id, - arxiv_tags: [.tags[] | select(. | startswith("arxiv:"))] | join("; "), - downloads: (.downloads // 0), - likes: (.likes // 0), - task: (.pipeline_tag // "unknown"), - library: (.library_name // "unknown") - } - | @base64' 2>/dev/null || true) - -# Count total results -TOTAL_MODELS=$(echo "$RESPONSE" | jq 'length' 2>/dev/null || echo "0") -MODELS_WITH_PAPERS_COUNT=$(echo "$MODELS_WITH_PAPERS" | wc -l) - -echo -e "${BLUE}Results Summary:${NC}" -echo -e " Total models found: $TOTAL_MODELS" -echo -e " Models with paper associations: $MODELS_WITH_PAPERS_COUNT" -echo "" - -if [[ -z "$MODELS_WITH_PAPERS" ]]; then - # Show all models even if no paper associations found - echo -e "${YELLOW}No explicit paper associations found. Showing all matching models:${NC}" - echo "$RESPONSE" | jq -r ' - .[] | - select(.id != null) | - "📦 \(.id) - Task: \(.pipeline_tag // "unknown") - Downloads: \(.downloads // 0) - Likes: \(.likes // 0) - Library: \(.library_name // "unknown") - ---" - ' 2>/dev/null || echo "Failed to parse response" -else - # Show models with paper associations - echo -e "${GREEN}Models with paper associations:${NC}" - echo "$MODELS_WITH_PAPERS" | while read -r model_data; do - if [[ -n "$model_data" ]]; then - # Decode base64 and show formatted - echo "$model_data" | base64 -d | jq -r ' - "📄 \(.id) - arXiv: \(.arxiv_tags) - Task: \(.task) - Downloads: \(.downloads) - Likes: \(.likes) - Library: \(.library) - ---" - ' 2>/dev/null || echo "Failed to parse model data" - fi - done -fi - -# Additional search tips -echo "" -echo -e "${BLUE}Search Tips:${NC}" -echo "• Try searching with the full arXiv ID (e.g., 1910.01108)" -echo "• Try searching with the paper title keywords" -echo "• Try searching with the model name" -echo "• Use HF_TOKEN for private models or higher 
rate limits" -echo "" -echo -e "${BLUE}Examples to try:${NC}" -echo " $0 1910.01108 # DistilBERT paper" -echo " $0 1810.04805 # BERT paper" -echo " $0 1706.03762 # Attention is All You Need paper" -echo " $0 roberta # RoBERTa models" -echo " $0 transformer # Transformer models" -echo " HF_TOKEN=your_token $0 1910.01108 # Use authentication" diff --git a/skills/hugging-face-tool-builder/references/hf_enrich_models.sh b/skills/hugging-face-tool-builder/references/hf_enrich_models.sh deleted file mode 100644 index 87708816..00000000 --- a/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -show_help() { - cat << 'USAGE' -Stream model IDs on stdin, emit one JSON object per line (NDJSON). - -Usage: - hf_enrich_models.sh [MODEL_ID ...] - cat ids.txt | hf_enrich_models.sh - baseline_hf_api.sh 50 | jq -r '.[].id' | hf_enrich_models.sh - -Description: - Reads newline-separated model IDs and fetches basic metadata for each. - Outputs NDJSON with id, downloads, likes, pipeline_tag, tags. - Uses HF_TOKEN for auth if the environment variable is set. - -Examples: - hf_enrich_models.sh gpt2 distilbert-base-uncased - baseline_hf_api.sh 50 | jq -r '.[].id' | hf_enrich_models.sh | jq -s 'sort_by(.downloads)' - HF_TOKEN=your_token hf_enrich_models.sh microsoft/DialoGPT-medium -USAGE -} - -if [[ "${1:-}" == "--help" ]]; then - show_help - exit 0 -fi - -if ! 
command -v jq >/dev/null 2>&1; then - echo "Error: jq is required but not installed" >&2 - exit 1 -fi - -headers=() -if [[ -n "${HF_TOKEN:-}" ]]; then - headers=(-H "Authorization: Bearer ${HF_TOKEN}") -fi - -emit_error() { - local model_id="$1" - local message="$2" - jq -cn --arg id "$model_id" --arg error "$message" '{id: $id, error: $error}' -} - -process_id() { - local model_id="$1" - - if [[ -z "$model_id" ]]; then - return 0 - fi - - local url="https://huggingface.co/api/models/${model_id}" - local response - response=$(curl -s "${headers[@]}" "$url" 2>/dev/null || true) - - if [[ -z "$response" ]]; then - emit_error "$model_id" "request_failed" - return 0 - fi - - if ! jq -e . >/dev/null 2>&1 <<<"$response"; then - emit_error "$model_id" "invalid_json" - return 0 - fi - - if jq -e '.error' >/dev/null 2>&1 <<<"$response"; then - emit_error "$model_id" "not_found" - return 0 - fi - - jq -c --arg id "$model_id" '{ - id: (.id // $id), - downloads: (.downloads // 0), - likes: (.likes // 0), - pipeline_tag: (.pipeline_tag // "unknown"), - tags: (.tags // []) - }' <<<"$response" 2>/dev/null || emit_error "$model_id" "parse_failed" -} - -if [[ $# -gt 0 ]]; then - for model_id in "$@"; do - process_id "$model_id" - done - exit 0 -fi - -if [[ -t 0 ]]; then - show_help - exit 1 -fi - -while IFS= read -r model_id; do - process_id "$model_id" -done diff --git a/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh b/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh deleted file mode 100644 index ded41c12..00000000 --- a/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -show_help() { - cat << 'USAGE' -Fetch Hugging Face model cards via the hf CLI and summarize frontmatter. - -Usage: - hf_model_card_frontmatter.sh [MODEL_ID ...] 
- cat ids.txt | hf_model_card_frontmatter.sh - -Description: - Downloads README.md for each model via `hf download`, extracts YAML - frontmatter, and emits one JSON object per line (NDJSON) with key fields. - Uses HF_TOKEN if set (passed to the hf CLI). - -Output fields: - id, license, pipeline_tag, library_name, tags, language, - new_version, has_extra_gated_prompt - -Examples: - hf_model_card_frontmatter.sh openai/gpt-oss-120b - cat ids.txt | hf_model_card_frontmatter.sh | jq -s '.' - hf_model_card_frontmatter.sh meta-llama/Meta-Llama-3-8B \ - | jq -s 'map({id, license, has_extra_gated_prompt})' -USAGE -} - -if [[ "${1:-}" == "--help" ]]; then - show_help - exit 0 -fi - -if ! command -v hf >/dev/null 2>&1; then - echo "Error: hf CLI is required but not installed" >&2 - exit 1 -fi - -if ! command -v python3 >/dev/null 2>&1; then - echo "Error: python3 is required but not installed" >&2 - exit 1 -fi - -token_args=() -if [[ -n "${HF_TOKEN:-}" ]]; then - token_args=(--token "$HF_TOKEN") -fi - -tmp_dir=$(mktemp -d) -cleanup() { - rm -rf "$tmp_dir" -} -trap cleanup EXIT - -emit_error() { - local model_id="$1" - local message="$2" - python3 - << 'PY' "$model_id" "$message" -import json -import sys - -model_id = sys.argv[1] -message = sys.argv[2] -print(json.dumps({"id": model_id, "error": message})) -PY -} - -parse_readme() { - local model_id="$1" - local readme_path="$2" - - MODEL_ID="$model_id" README_PATH="$readme_path" python3 - << 'PY' -import json -import os -import sys - -model_id = os.environ.get("MODEL_ID", "") -readme_path = os.environ.get("README_PATH", "") - -try: - with open(readme_path, "r", encoding="utf-8") as f: - lines = f.read().splitlines() -except OSError: - print(json.dumps({"id": model_id, "error": "readme_missing"})) - sys.exit(0) - -frontmatter = [] -in_block = False -for line in lines: - if line.strip() == "---": - if in_block: - break - in_block = True - continue - if in_block: - frontmatter.append(line) - -if not frontmatter: - 
print(json.dumps({"id": model_id, "error": "frontmatter_missing"})) - sys.exit(0) - -key = None -out = {} - -for line in frontmatter: - stripped = line.strip() - if not stripped or line.lstrip().startswith("#"): - continue - - if ":" in line and not line.lstrip().startswith("- "): - key_candidate, value = line.split(":", 1) - key_candidate = key_candidate.strip() - value = value.strip() - if key_candidate and all(c.isalnum() or c in "_-" for c in key_candidate): - key = key_candidate - if value in ("|", "|-", ">", ">-") or value == "": - out[key] = None - continue - if value.startswith("[") and value.endswith("]"): - items = [v.strip() for v in value.strip("[]").split(",") if v.strip()] - out[key] = items - else: - out[key] = value - continue - - if line.lstrip().startswith("- ") and key: - item = line.strip()[2:] - if key not in out or out[key] is None: - out[key] = [] - if isinstance(out[key], list): - out[key].append(item) - -result = { - "id": model_id, - "license": out.get("license"), - "pipeline_tag": out.get("pipeline_tag"), - "library_name": out.get("library_name"), - "tags": out.get("tags", []), - "language": out.get("language", []), - "new_version": out.get("new_version"), - "has_extra_gated_prompt": "extra_gated_prompt" in out, -} - -print(json.dumps(result)) -PY -} - -process_id() { - local model_id="$1" - - if [[ -z "$model_id" ]]; then - return 0 - fi - - local safe_id - safe_id=$(printf '%s' "$model_id" | tr '/' '_') - local local_dir="$tmp_dir/$safe_id" - - if ! hf download "$model_id" README.md --repo-type model --local-dir "$local_dir" "${token_args[@]}" >/dev/null 2>&1; then - emit_error "$model_id" "download_failed" - return 0 - fi - - local readme_path="$local_dir/README.md" - if [[ ! 
-f "$readme_path" ]]; then - emit_error "$model_id" "readme_missing" - return 0 - fi - - parse_readme "$model_id" "$readme_path" -} - -if [[ $# -gt 0 ]]; then - for model_id in "$@"; do - process_id "$model_id" - done - exit 0 -fi - -if [[ -t 0 ]]; then - show_help - exit 1 -fi - -while IFS= read -r model_id; do - process_id "$model_id" -done diff --git a/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh b/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh deleted file mode 100644 index fcd5b1f8..00000000 --- a/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env bash - -# Hugging Face Model Papers Tool with Authentication -# Fetches papers referenced by Hugging Face models using HF_TOKEN if available - -set -euo pipefail - -# Help function -show_help() { - cat << EOF -Hugging Face Model Papers Tool with Authentication - -This tool fetches papers referenced by Hugging Face models. -Supports authentication via HF_TOKEN environment variable. 
- -Usage: - $0 [OPTIONS] - -Options: - MODEL_ID Specific model to analyze (e.g., microsoft/DialoGPT-medium) - --trending [N] Show papers for top N trending models (default: 5) - --help Show this help message - -Environment Variables: - HF_TOKEN Hugging Face API token (optional, for private models) - -Examples: - # Get papers for a specific model - $0 microsoft/DialoGPT-medium - - # Get papers with authentication - HF_TOKEN=your_token_here $0 your-private-model - - # Get papers for top 3 trending models - $0 --trending 3 - -EOF -} - -# Function to make authenticated API calls -hf_api_call() { - local url="$1" - local headers=() - - # Add authentication header if HF_TOKEN is set - if [[ -n "${HF_TOKEN:-}" ]]; then - headers+=(-H "Authorization: Bearer $HF_TOKEN") - fi - - curl -s "${headers[@]}" "$url" 2>/dev/null || echo '{"error": "Network error"}' -} - -# Function to extract papers from text -extract_papers() { - local text="$1" - local title="$2" - - echo "$title" - - # Find ArXiv URLs - local arxiv_urls=$(echo "$text" | grep -oE 'https?://arxiv\.org/[^[:space:]\])]+' | head -5) - if [[ -n "$arxiv_urls" ]]; then - echo "ArXiv Papers:" - echo "$arxiv_urls" | sed 's/^/ • /' - fi - - # Find DOI URLs - local doi_urls=$(echo "$text" | grep -oE 'https?://doi\.org/[^[:space:]\])]+' | head -3) - if [[ -n "$doi_urls" ]]; then - echo "DOI Papers:" - echo "$doi_urls" | sed 's/^/ • /' - fi - - # Find arxiv IDs in format YYYY.NNNNN - local arxiv_ids=$(echo "$text" | grep -oE 'arXiv:[0-9]{4}\.[0-9]{4,5}' | head -5) - if [[ -n "$arxiv_ids" ]]; then - echo "ArXiv IDs:" - echo "$arxiv_ids" | sed 's/^/ • /' - fi - - # Check for paper mentions - if echo "$text" | grep -qi "paper\|publication\|citation"; then - local paper_mentions=$(echo "$text" | grep -i -A1 -B1 "paper\|publication" | head -6) - if [[ -n "$paper_mentions" ]]; then - echo "Paper mentions:" - echo "$paper_mentions" | sed 's/^/ /' - fi - fi - - if [[ -z "$arxiv_urls" && -z "$doi_urls" && -z "$arxiv_ids" ]]; then - 
echo "No papers found in model card" - fi -} - -# Function to get model papers -get_model_papers() { - local model_id="$1" - - echo "=== $model_id ===" - - # Get model info from API with authentication - local api_url="https://huggingface.co/api/models/$model_id" - local response=$(hf_api_call "$api_url") - - if echo "$response" | grep -q '"error"'; then - echo "Error: Could not fetch model '$model_id'" - if [[ -z "${HF_TOKEN:-}" ]]; then - echo "Note: This might be a private model. Try setting HF_TOKEN environment variable." - fi - return 1 - fi - - # Parse basic info - local downloads=$(echo "$response" | jq -r '.downloads // 0') - local likes=$(echo "$response" | jq -r '.likes // 0') - echo "Downloads: $downloads | Likes: $likes" - - # Get model card - local card_url="https://huggingface.co/$model_id/raw/main/README.md" - local card_content=$(curl -s "$card_url" 2>/dev/null || echo "") - - if [[ -n "$card_content" ]]; then - extract_papers "$card_content" "Papers from model card:" - else - echo "Could not fetch model card" - fi - - # Check tags for arxiv references - local arxiv_tag=$(echo "$response" | jq -r '.tags[]' 2>/dev/null | grep arxiv || true) - if [[ -n "$arxiv_tag" ]]; then - echo "ArXiv from tags: $arxiv_tag" - fi - - echo -} - -# Function to get trending models -get_trending_models() { - local limit="${1:-5}" - - echo "Fetching top $limit trending models..." 
- - local trending_url="https://huggingface.co/api/trending?type=model&limit=$limit" - local response=$(hf_api_call "$trending_url") - - echo "$response" | jq -r '.recentlyTrending[] | .repoData.id' | head -"$limit" | while read -r model_id; do - if [[ -n "$model_id" ]]; then - get_model_papers "$model_id" - fi - done -} - -# Main -if [[ $# -eq 0 ]]; then - echo "Error: No arguments provided" - show_help - exit 1 -fi - -if [[ "$1" == "--help" ]]; then - show_help - exit 0 -elif [[ "$1" == "--trending" ]]; then - if [[ -n "${2:-}" ]] && [[ "$2" =~ ^[0-9]+$ ]]; then - get_trending_models "$2" - else - get_trending_models 5 - fi -else - get_model_papers "$1" -fi diff --git a/skills/hugging-face-evaluation/SKILL.md b/skills/huggingface-community-evals/SKILL.md similarity index 99% rename from skills/hugging-face-evaluation/SKILL.md rename to skills/huggingface-community-evals/SKILL.md index 3034a11a..a68b2e79 100644 --- a/skills/hugging-face-evaluation/SKILL.md +++ b/skills/huggingface-community-evals/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-evaluation +name: huggingface-community-evals description: Run evaluations for Hugging Face Hub models using inspect-ai and lighteval on local hardware. Use for backend selection, local GPU evals, and choosing between vLLM / Transformers / accelerate. Not for HF Jobs orchestration, model-card PRs, .eval_results publication, or community-evals automation. 
--- diff --git a/skills/hugging-face-evaluation/examples/.env.example b/skills/huggingface-community-evals/examples/.env.example similarity index 100% rename from skills/hugging-face-evaluation/examples/.env.example rename to skills/huggingface-community-evals/examples/.env.example diff --git a/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md b/skills/huggingface-community-evals/examples/USAGE_EXAMPLES.md similarity index 100% rename from skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md rename to skills/huggingface-community-evals/examples/USAGE_EXAMPLES.md diff --git a/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py b/skills/huggingface-community-evals/scripts/inspect_eval_uv.py similarity index 100% rename from skills/hugging-face-evaluation/scripts/inspect_eval_uv.py rename to skills/huggingface-community-evals/scripts/inspect_eval_uv.py diff --git a/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py b/skills/huggingface-community-evals/scripts/inspect_vllm_uv.py similarity index 100% rename from skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py rename to skills/huggingface-community-evals/scripts/inspect_vllm_uv.py diff --git a/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py b/skills/huggingface-community-evals/scripts/lighteval_vllm_uv.py similarity index 100% rename from skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py rename to skills/huggingface-community-evals/scripts/lighteval_vllm_uv.py diff --git a/skills/hugging-face-dataset-viewer/SKILL.md b/skills/huggingface-datasets/SKILL.md similarity index 96% rename from skills/hugging-face-dataset-viewer/SKILL.md rename to skills/huggingface-datasets/SKILL.md index 30854ffc..16ff9665 100644 --- a/skills/hugging-face-dataset-viewer/SKILL.md +++ b/skills/huggingface-datasets/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-dataset-viewer +name: huggingface-datasets description: Use this skill for Hugging Face Dataset Viewer API workflows that fetch 
subset/split metadata, paginate rows, search text, apply filters, download parquet URLs, and read size or statistics. --- diff --git a/skills/huggingface-gradio/SKILL.md b/skills/huggingface-gradio/SKILL.md index eaa6732e..08c45355 100644 --- a/skills/huggingface-gradio/SKILL.md +++ b/skills/huggingface-gradio/SKILL.md @@ -1,5 +1,5 @@ --- -name: gradio +name: huggingface-gradio description: Build Gradio web UIs and demos in Python. Use when creating or editing Gradio apps, components, event listeners, layouts, or chatbots. --- diff --git a/skills/hugging-face-jobs/SKILL.md b/skills/huggingface-jobs/SKILL.md similarity index 99% rename from skills/hugging-face-jobs/SKILL.md rename to skills/huggingface-jobs/SKILL.md index 068ca2fd..280ecc1b 100644 --- a/skills/hugging-face-jobs/SKILL.md +++ b/skills/huggingface-jobs/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-jobs +name: huggingface-jobs description: This skill should be used when users want to run any workload on Hugging Face Jobs infrastructure. Covers UV scripts, Docker-based jobs, hardware selection, cost estimation, authentication with tokens, secrets management, timeout configuration, and result persistence. Designed for general-purpose compute workloads including data processing, inference, experiments, batch jobs, and any Python-based tasks. Should be invoked for tasks involving cloud compute, GPU workloads, or when users mention running jobs on Hugging Face infrastructure without local setup. 
license: Complete terms in LICENSE.txt --- diff --git a/skills/hugging-face-jobs/index.html b/skills/huggingface-jobs/index.html similarity index 100% rename from skills/hugging-face-jobs/index.html rename to skills/huggingface-jobs/index.html diff --git a/skills/hugging-face-jobs/references/hardware_guide.md b/skills/huggingface-jobs/references/hardware_guide.md similarity index 100% rename from skills/hugging-face-jobs/references/hardware_guide.md rename to skills/huggingface-jobs/references/hardware_guide.md diff --git a/skills/hugging-face-jobs/references/hub_saving.md b/skills/huggingface-jobs/references/hub_saving.md similarity index 100% rename from skills/hugging-face-jobs/references/hub_saving.md rename to skills/huggingface-jobs/references/hub_saving.md diff --git a/skills/hugging-face-jobs/references/token_usage.md b/skills/huggingface-jobs/references/token_usage.md similarity index 100% rename from skills/hugging-face-jobs/references/token_usage.md rename to skills/huggingface-jobs/references/token_usage.md diff --git a/skills/hugging-face-jobs/references/troubleshooting.md b/skills/huggingface-jobs/references/troubleshooting.md similarity index 100% rename from skills/hugging-face-jobs/references/troubleshooting.md rename to skills/huggingface-jobs/references/troubleshooting.md diff --git a/skills/hugging-face-jobs/scripts/cot-self-instruct.py b/skills/huggingface-jobs/scripts/cot-self-instruct.py similarity index 100% rename from skills/hugging-face-jobs/scripts/cot-self-instruct.py rename to skills/huggingface-jobs/scripts/cot-self-instruct.py diff --git a/skills/hugging-face-jobs/scripts/finepdfs-stats.py b/skills/huggingface-jobs/scripts/finepdfs-stats.py similarity index 100% rename from skills/hugging-face-jobs/scripts/finepdfs-stats.py rename to skills/huggingface-jobs/scripts/finepdfs-stats.py diff --git a/skills/hugging-face-jobs/scripts/generate-responses.py b/skills/huggingface-jobs/scripts/generate-responses.py similarity index 100% rename 
from skills/hugging-face-jobs/scripts/generate-responses.py rename to skills/huggingface-jobs/scripts/generate-responses.py diff --git a/skills/hugging-face-model-trainer/SKILL.md b/skills/huggingface-llm-trainer/SKILL.md similarity index 99% rename from skills/hugging-face-model-trainer/SKILL.md rename to skills/huggingface-llm-trainer/SKILL.md index 77551944..aa74347d 100644 --- a/skills/hugging-face-model-trainer/SKILL.md +++ b/skills/huggingface-llm-trainer/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-model-trainer +name: huggingface-llm-trainer description: This skill should be used when users want to train or fine-tune language models using TRL (Transformer Reinforcement Learning) on Hugging Face Jobs infrastructure. Covers SFT, DPO, GRPO and reward modeling training methods, plus GGUF conversion for local deployment. Includes guidance on the TRL Jobs package, UV scripts with PEP 723 format, dataset preparation and validation, hardware selection, cost estimation, Trackio monitoring, Hub authentication, and model persistence. Should be invoked for tasks involving cloud GPU training, GGUF conversion, or when users mention training on Hugging Face Jobs without local GPU setup. 
license: Complete terms in LICENSE.txt --- diff --git a/skills/hugging-face-model-trainer/references/gguf_conversion.md b/skills/huggingface-llm-trainer/references/gguf_conversion.md similarity index 100% rename from skills/hugging-face-model-trainer/references/gguf_conversion.md rename to skills/huggingface-llm-trainer/references/gguf_conversion.md diff --git a/skills/hugging-face-model-trainer/references/hardware_guide.md b/skills/huggingface-llm-trainer/references/hardware_guide.md similarity index 100% rename from skills/hugging-face-model-trainer/references/hardware_guide.md rename to skills/huggingface-llm-trainer/references/hardware_guide.md diff --git a/skills/hugging-face-model-trainer/references/hub_saving.md b/skills/huggingface-llm-trainer/references/hub_saving.md similarity index 100% rename from skills/hugging-face-model-trainer/references/hub_saving.md rename to skills/huggingface-llm-trainer/references/hub_saving.md diff --git a/skills/hugging-face-model-trainer/references/local_training_macos.md b/skills/huggingface-llm-trainer/references/local_training_macos.md similarity index 100% rename from skills/hugging-face-model-trainer/references/local_training_macos.md rename to skills/huggingface-llm-trainer/references/local_training_macos.md diff --git a/skills/hugging-face-model-trainer/references/reliability_principles.md b/skills/huggingface-llm-trainer/references/reliability_principles.md similarity index 100% rename from skills/hugging-face-model-trainer/references/reliability_principles.md rename to skills/huggingface-llm-trainer/references/reliability_principles.md diff --git a/skills/hugging-face-model-trainer/references/trackio_guide.md b/skills/huggingface-llm-trainer/references/trackio_guide.md similarity index 100% rename from skills/hugging-face-model-trainer/references/trackio_guide.md rename to skills/huggingface-llm-trainer/references/trackio_guide.md diff --git a/skills/hugging-face-model-trainer/references/training_methods.md 
b/skills/huggingface-llm-trainer/references/training_methods.md similarity index 100% rename from skills/hugging-face-model-trainer/references/training_methods.md rename to skills/huggingface-llm-trainer/references/training_methods.md diff --git a/skills/hugging-face-model-trainer/references/training_patterns.md b/skills/huggingface-llm-trainer/references/training_patterns.md similarity index 100% rename from skills/hugging-face-model-trainer/references/training_patterns.md rename to skills/huggingface-llm-trainer/references/training_patterns.md diff --git a/skills/hugging-face-model-trainer/references/troubleshooting.md b/skills/huggingface-llm-trainer/references/troubleshooting.md similarity index 100% rename from skills/hugging-face-model-trainer/references/troubleshooting.md rename to skills/huggingface-llm-trainer/references/troubleshooting.md diff --git a/skills/hugging-face-model-trainer/references/unsloth.md b/skills/huggingface-llm-trainer/references/unsloth.md similarity index 100% rename from skills/hugging-face-model-trainer/references/unsloth.md rename to skills/huggingface-llm-trainer/references/unsloth.md diff --git a/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py b/skills/huggingface-llm-trainer/scripts/convert_to_gguf.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/convert_to_gguf.py rename to skills/huggingface-llm-trainer/scripts/convert_to_gguf.py diff --git a/skills/hugging-face-model-trainer/scripts/dataset_inspector.py b/skills/huggingface-llm-trainer/scripts/dataset_inspector.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/dataset_inspector.py rename to skills/huggingface-llm-trainer/scripts/dataset_inspector.py diff --git a/skills/hugging-face-model-trainer/scripts/estimate_cost.py b/skills/huggingface-llm-trainer/scripts/estimate_cost.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/estimate_cost.py rename to 
skills/huggingface-llm-trainer/scripts/estimate_cost.py diff --git a/skills/hugging-face-model-trainer/scripts/train_dpo_example.py b/skills/huggingface-llm-trainer/scripts/train_dpo_example.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/train_dpo_example.py rename to skills/huggingface-llm-trainer/scripts/train_dpo_example.py diff --git a/skills/hugging-face-model-trainer/scripts/train_grpo_example.py b/skills/huggingface-llm-trainer/scripts/train_grpo_example.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/train_grpo_example.py rename to skills/huggingface-llm-trainer/scripts/train_grpo_example.py diff --git a/skills/hugging-face-model-trainer/scripts/train_sft_example.py b/skills/huggingface-llm-trainer/scripts/train_sft_example.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/train_sft_example.py rename to skills/huggingface-llm-trainer/scripts/train_sft_example.py diff --git a/skills/hugging-face-model-trainer/scripts/unsloth_sft_example.py b/skills/huggingface-llm-trainer/scripts/unsloth_sft_example.py similarity index 100% rename from skills/hugging-face-model-trainer/scripts/unsloth_sft_example.py rename to skills/huggingface-llm-trainer/scripts/unsloth_sft_example.py diff --git a/skills/hugging-face-paper-publisher/SKILL.md b/skills/huggingface-paper-publisher/SKILL.md similarity index 99% rename from skills/hugging-face-paper-publisher/SKILL.md rename to skills/huggingface-paper-publisher/SKILL.md index 9f81433d..01b7db0c 100644 --- a/skills/hugging-face-paper-publisher/SKILL.md +++ b/skills/huggingface-paper-publisher/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-paper-publisher +name: huggingface-paper-publisher description: Publish and manage research papers on Hugging Face Hub. Supports creating paper pages, linking papers to models/datasets, claiming authorship, and generating professional markdown-based research articles. 
--- diff --git a/skills/hugging-face-paper-publisher/examples/example_usage.md b/skills/huggingface-paper-publisher/examples/example_usage.md similarity index 100% rename from skills/hugging-face-paper-publisher/examples/example_usage.md rename to skills/huggingface-paper-publisher/examples/example_usage.md diff --git a/skills/hugging-face-paper-publisher/references/quick_reference.md b/skills/huggingface-paper-publisher/references/quick_reference.md similarity index 100% rename from skills/hugging-face-paper-publisher/references/quick_reference.md rename to skills/huggingface-paper-publisher/references/quick_reference.md diff --git a/skills/hugging-face-paper-publisher/scripts/paper_manager.py b/skills/huggingface-paper-publisher/scripts/paper_manager.py similarity index 100% rename from skills/hugging-face-paper-publisher/scripts/paper_manager.py rename to skills/huggingface-paper-publisher/scripts/paper_manager.py diff --git a/skills/hugging-face-paper-publisher/templates/arxiv.md b/skills/huggingface-paper-publisher/templates/arxiv.md similarity index 100% rename from skills/hugging-face-paper-publisher/templates/arxiv.md rename to skills/huggingface-paper-publisher/templates/arxiv.md diff --git a/skills/hugging-face-paper-publisher/templates/ml-report.md b/skills/huggingface-paper-publisher/templates/ml-report.md similarity index 100% rename from skills/hugging-face-paper-publisher/templates/ml-report.md rename to skills/huggingface-paper-publisher/templates/ml-report.md diff --git a/skills/hugging-face-paper-publisher/templates/modern.md b/skills/huggingface-paper-publisher/templates/modern.md similarity index 100% rename from skills/hugging-face-paper-publisher/templates/modern.md rename to skills/huggingface-paper-publisher/templates/modern.md diff --git a/skills/hugging-face-paper-publisher/templates/standard.md b/skills/huggingface-paper-publisher/templates/standard.md similarity index 100% rename from 
skills/hugging-face-paper-publisher/templates/standard.md rename to skills/huggingface-paper-publisher/templates/standard.md diff --git a/skills/hugging-face-paper-pages/SKILL.md b/skills/huggingface-papers/SKILL.md similarity index 99% rename from skills/hugging-face-paper-pages/SKILL.md rename to skills/huggingface-papers/SKILL.md index ed63abd8..e65fc90a 100644 --- a/skills/hugging-face-paper-pages/SKILL.md +++ b/skills/huggingface-papers/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-paper-pages +name: huggingface-papers description: Look up and read Hugging Face paper pages in markdown, and use the papers API for structured metadata such as authors, linked models/datasets/spaces, Github repo and project page. Use when the user shares a Hugging Face paper page URL, an arXiv URL or ID, or asks to summarize, explain, or analyze an AI research paper. --- diff --git a/skills/hugging-face-trackio/.claude-plugin/plugin.json b/skills/huggingface-trackio/.claude-plugin/plugin.json similarity index 100% rename from skills/hugging-face-trackio/.claude-plugin/plugin.json rename to skills/huggingface-trackio/.claude-plugin/plugin.json diff --git a/skills/hugging-face-trackio/SKILL.md b/skills/huggingface-trackio/SKILL.md similarity index 99% rename from skills/hugging-face-trackio/SKILL.md rename to skills/huggingface-trackio/SKILL.md index 506768d9..58ae095f 100644 --- a/skills/hugging-face-trackio/SKILL.md +++ b/skills/huggingface-trackio/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-trackio +name: huggingface-trackio description: Track and visualize ML training experiments with Trackio. Use when logging metrics during training (Python API), firing alerts for training diagnostics, or retrieving/analyzing logged metrics (CLI). Supports real-time dashboard visualization, alerts with webhooks, HF Space syncing, and JSON output for automation. 
--- diff --git a/skills/hugging-face-trackio/references/alerts.md b/skills/huggingface-trackio/references/alerts.md similarity index 100% rename from skills/hugging-face-trackio/references/alerts.md rename to skills/huggingface-trackio/references/alerts.md diff --git a/skills/hugging-face-trackio/references/logging_metrics.md b/skills/huggingface-trackio/references/logging_metrics.md similarity index 100% rename from skills/hugging-face-trackio/references/logging_metrics.md rename to skills/huggingface-trackio/references/logging_metrics.md diff --git a/skills/hugging-face-trackio/references/retrieving_metrics.md b/skills/huggingface-trackio/references/retrieving_metrics.md similarity index 100% rename from skills/hugging-face-trackio/references/retrieving_metrics.md rename to skills/huggingface-trackio/references/retrieving_metrics.md diff --git a/skills/hugging-face-vision-trainer/SKILL.md b/skills/huggingface-vision-trainer/SKILL.md similarity index 99% rename from skills/hugging-face-vision-trainer/SKILL.md rename to skills/huggingface-vision-trainer/SKILL.md index 93574e54..5a2c554b 100644 --- a/skills/hugging-face-vision-trainer/SKILL.md +++ b/skills/huggingface-vision-trainer/SKILL.md @@ -1,5 +1,5 @@ --- -name: hugging-face-vision-trainer +name: huggingface-vision-trainer description: Trains and fine-tunes vision models for object detection (D-FINE, RT-DETR v2, DETR, YOLOS), image classification (timm models — MobileNetV3, MobileViT, ResNet, ViT/DINOv3 — plus any Transformers classifier), and SAM/SAM2 segmentation using Hugging Face Transformers on Hugging Face Jobs cloud GPUs. Covers COCO-format dataset preparation, Albumentations augmentation, mAP/mAR evaluation, accuracy metrics, SAM segmentation with bbox/point prompts, DiceCE loss, hardware selection, cost estimation, Trackio monitoring, and Hub persistence. 
Use when users mention training object detection, image classification, SAM, SAM2, segmentation, image matting, DETR, D-FINE, RT-DETR, ViT, timm, MobileNet, ResNet, bounding box models, or fine-tuning vision models on Hugging Face Jobs. --- diff --git a/skills/hugging-face-vision-trainer/references/finetune_sam2_trainer.md b/skills/huggingface-vision-trainer/references/finetune_sam2_trainer.md similarity index 100% rename from skills/hugging-face-vision-trainer/references/finetune_sam2_trainer.md rename to skills/huggingface-vision-trainer/references/finetune_sam2_trainer.md diff --git a/skills/hugging-face-vision-trainer/references/hub_saving.md b/skills/huggingface-vision-trainer/references/hub_saving.md similarity index 100% rename from skills/hugging-face-vision-trainer/references/hub_saving.md rename to skills/huggingface-vision-trainer/references/hub_saving.md diff --git a/skills/hugging-face-vision-trainer/references/image_classification_training_notebook.md b/skills/huggingface-vision-trainer/references/image_classification_training_notebook.md similarity index 100% rename from skills/hugging-face-vision-trainer/references/image_classification_training_notebook.md rename to skills/huggingface-vision-trainer/references/image_classification_training_notebook.md diff --git a/skills/hugging-face-vision-trainer/references/object_detection_training_notebook.md b/skills/huggingface-vision-trainer/references/object_detection_training_notebook.md similarity index 100% rename from skills/hugging-face-vision-trainer/references/object_detection_training_notebook.md rename to skills/huggingface-vision-trainer/references/object_detection_training_notebook.md diff --git a/skills/hugging-face-vision-trainer/references/reliability_principles.md b/skills/huggingface-vision-trainer/references/reliability_principles.md similarity index 100% rename from skills/hugging-face-vision-trainer/references/reliability_principles.md rename to 
skills/huggingface-vision-trainer/references/reliability_principles.md diff --git a/skills/hugging-face-vision-trainer/references/timm_trainer.md b/skills/huggingface-vision-trainer/references/timm_trainer.md similarity index 100% rename from skills/hugging-face-vision-trainer/references/timm_trainer.md rename to skills/huggingface-vision-trainer/references/timm_trainer.md diff --git a/skills/hugging-face-vision-trainer/scripts/dataset_inspector.py b/skills/huggingface-vision-trainer/scripts/dataset_inspector.py similarity index 100% rename from skills/hugging-face-vision-trainer/scripts/dataset_inspector.py rename to skills/huggingface-vision-trainer/scripts/dataset_inspector.py diff --git a/skills/hugging-face-vision-trainer/scripts/estimate_cost.py b/skills/huggingface-vision-trainer/scripts/estimate_cost.py similarity index 100% rename from skills/hugging-face-vision-trainer/scripts/estimate_cost.py rename to skills/huggingface-vision-trainer/scripts/estimate_cost.py diff --git a/skills/hugging-face-vision-trainer/scripts/image_classification_training.py b/skills/huggingface-vision-trainer/scripts/image_classification_training.py similarity index 100% rename from skills/hugging-face-vision-trainer/scripts/image_classification_training.py rename to skills/huggingface-vision-trainer/scripts/image_classification_training.py diff --git a/skills/hugging-face-vision-trainer/scripts/object_detection_training.py b/skills/huggingface-vision-trainer/scripts/object_detection_training.py similarity index 100% rename from skills/hugging-face-vision-trainer/scripts/object_detection_training.py rename to skills/huggingface-vision-trainer/scripts/object_detection_training.py diff --git a/skills/hugging-face-vision-trainer/scripts/sam_segmentation_training.py b/skills/huggingface-vision-trainer/scripts/sam_segmentation_training.py similarity index 100% rename from skills/hugging-face-vision-trainer/scripts/sam_segmentation_training.py rename to 
skills/huggingface-vision-trainer/scripts/sam_segmentation_training.py diff --git a/skills/transformers.js/SKILL.md b/skills/transformers-js/SKILL.md similarity index 100% rename from skills/transformers.js/SKILL.md rename to skills/transformers-js/SKILL.md diff --git a/skills/transformers.js/references/CACHE.md b/skills/transformers-js/references/CACHE.md similarity index 100% rename from skills/transformers.js/references/CACHE.md rename to skills/transformers-js/references/CACHE.md diff --git a/skills/transformers.js/references/CONFIGURATION.md b/skills/transformers-js/references/CONFIGURATION.md similarity index 100% rename from skills/transformers.js/references/CONFIGURATION.md rename to skills/transformers-js/references/CONFIGURATION.md diff --git a/skills/transformers.js/references/EXAMPLES.md b/skills/transformers-js/references/EXAMPLES.md similarity index 100% rename from skills/transformers.js/references/EXAMPLES.md rename to skills/transformers-js/references/EXAMPLES.md diff --git a/skills/transformers.js/references/MODEL_ARCHITECTURES.md b/skills/transformers-js/references/MODEL_ARCHITECTURES.md similarity index 100% rename from skills/transformers.js/references/MODEL_ARCHITECTURES.md rename to skills/transformers-js/references/MODEL_ARCHITECTURES.md diff --git a/skills/transformers.js/references/PIPELINE_OPTIONS.md b/skills/transformers-js/references/PIPELINE_OPTIONS.md similarity index 100% rename from skills/transformers.js/references/PIPELINE_OPTIONS.md rename to skills/transformers-js/references/PIPELINE_OPTIONS.md diff --git a/skills/transformers.js/references/TEXT_GENERATION.md b/skills/transformers-js/references/TEXT_GENERATION.md similarity index 100% rename from skills/transformers.js/references/TEXT_GENERATION.md rename to skills/transformers-js/references/TEXT_GENERATION.md