تصدير التوسيمات إلى مجموعات بيانات Hugging Face

تعد Hugging Face Datasets الصيغة المعيارية لمشاركة وتحميل مجموعات بيانات تعلم الآلة. يوضح هذا الدليل كيفية تحويل توسيمات Potato لتدريب النماذج ومشاركة مجموعات البيانات باستخدام سكريبتات Python.

لماذا صيغة Hugging Face؟

صيغة معيارية: تعمل مع جميع أدوات HF
تخزين فعال: صيغة Arrow للتحميل السريع
مشاركة سهلة: رفع مباشر إلى Hub
جاهزة للتدريب: تكامل مباشر مع Transformers

التصدير الأساسي باستخدام Python

يحفظ Potato التوسيمات بصيغة JSONL. يمكنك تحويلها إلى مجموعات بيانات Hugging Face باستخدام مكتبة datasets.

تحميل توسيمات Potato

python

import json
from datasets import Dataset
 
# Load Potato annotation output.
# The encoding is passed explicitly so non-ASCII annotation text is decoded as
# UTF-8 on every platform instead of with the locale's default codec.
with open("annotation_output/annotated_instances.jsonl", "r", encoding="utf-8") as f:
    # One JSON object per line; blank lines (e.g. a trailing newline) are
    # skipped because json.loads("") raises JSONDecodeError.
    annotations = [json.loads(line) for line in f if line.strip()]

# Convert to Hugging Face Dataset, keeping only the text and its sentiment label
dataset = Dataset.from_list([
    {
        "text": ann["text"],
        "label": ann["label_annotations"]["sentiment"]["label"],
    }
    for ann in annotations
])

# Save locally
dataset.save_to_disk("my_dataset")

# Or push to Hub
dataset.push_to_hub("username/my-dataset")

إنشاء تقسيمات التدريب/الاختبار

python

from datasets import DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the annotations, then cut the held-out part in half
# to obtain the validation and test portions (fixed seed for repeatability).
train_data, temp_data = train_test_split(annotations, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Wrap each portion in its own Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_list(rows) for rows in (train_data, val_data, test_data)
)

# Bundle the three splits into one DatasetDict keyed by split name
dataset = DatasetDict(
    dict(train=train_dataset, validation=val_dataset, test=test_dataset)
)

التصدير حسب المهمة

تصنيف النصوص

python

from datasets import ClassLabel, Dataset

# Collect the text column and the sentiment label column
texts = [ann["text"] for ann in annotations]
sentiments = [ann["label_annotations"]["sentiment"]["label"] for ann in annotations]

dataset = Dataset.from_dict({"text": texts, "label": sentiments})

# Declare the label set and cast the "label" column to that ClassLabel feature
sentiment_feature = ClassLabel(names=["Positive", "Negative", "Neutral"])
dataset = dataset.cast_column("label", sentiment_feature)

التعرف على الكيانات المسماة

python

# Convert span annotations to IOB format
def convert_to_iob(text, spans):
    """Tokenize *text* on whitespace and tag each token with an IOB label.

    Args:
        text: The raw annotated string.
        spans: Iterable of dicts with "start" and "end" character offsets and
            an "annotation" label name. NOTE(review): "end" is treated as
            exclusive -- confirm against your Potato output.

    Returns:
        A ``(tokens, labels)`` tuple of equal-length lists. Tokens outside
        every span are labelled "O"; the first token of a span gets
        "B-<annotation>" and the remaining tokens get "I-<annotation>".
    """
    import re

    # Keep each token's character offsets so spans can be mapped onto tokens
    # without relying on an external char_to_token helper.
    matches = list(re.finditer(r"\S+", text))
    tokens = [m.group() for m in matches]
    labels = ["O"] * len(tokens)

    for span in spans:
        # A token belongs to the span when their character ranges overlap.
        covered = [
            i
            for i, m in enumerate(matches)
            if m.start() < span["end"] and m.end() > span["start"]
        ]
        if not covered:
            # Span touches no token (e.g. whitespace only); nothing to tag.
            continue
        labels[covered[0]] = f"B-{span['annotation']}"
        for i in covered[1:]:
            labels[i] = f"I-{span['annotation']}"

    return tokens, labels
 
# Potato stores span annotations in span_annotations field
# Convert each annotation once and reuse the (tokens, labels) pair for both
# columns instead of running the conversion twice per annotation.
iob_pairs = [
    convert_to_iob(a["text"], a.get("span_annotations", {}).get("entities", []))
    for a in annotations
]
dataset = Dataset.from_dict({
    "tokens": [tokens for tokens, _ in iob_pairs],
    "ner_tags": [tags for _, tags in iob_pairs],
})

تصنيف الصوت

python

from datasets import Audio

# For audio annotation tasks: pair each record's "audio" field with its emotion label
audio_column = [ann["audio"] for ann in annotations]
emotion_column = [ann["label_annotations"]["emotion"]["label"] for ann in annotations]
dataset = Dataset.from_dict({"audio": audio_column, "label": emotion_column})

# Cast the "audio" column to an Audio feature with sampling_rate=16000
audio_feature = Audio(sampling_rate=16000)
dataset = dataset.cast_column("audio", audio_feature)

تصنيف الصور

python

from datasets import Image

# For image annotation tasks: pair each record's "image" field with its category label
image_column = [ann["image"] for ann in annotations]
category_column = [ann["label_annotations"]["category"]["label"] for ann in annotations]
dataset = Dataset.from_dict({"image": image_column, "label": category_column})

# Cast the "image" column to an Image feature
image_feature = Image()
dataset = dataset.cast_column("image", image_feature)

تصدير المُوسِّمين المتعددين

عندما يكون لديك عدة مُوسِّمين لكل عنصر، يمكنك التصدير بصيغ مختلفة:

python

# Long format (one row per annotation)
# Each annotator's work is saved in a separate file: annotator_{id}.jsonl
import glob
import os

records = []
# sorted() keeps the row order reproducible; glob returns paths in arbitrary order.
for filepath in sorted(glob.glob("annotation_output/annotator_*.jsonl")):
    # Strip the fixed "annotator_" prefix and ".jsonl" suffix from the file name
    # so that ids which themselves contain "_" (e.g. "user_42") stay intact.
    filename = os.path.basename(filepath)
    annotator_id = filename[len("annotator_"):-len(".jsonl")]
    # Explicit encoding: do not depend on the platform's default codec.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines such as a trailing newline.
                continue
            ann = json.loads(line)
            records.append({
                "id": ann["id"],
                "text": ann["text"],
                "label": ann["label_annotations"]["sentiment"]["label"],
                "annotator": annotator_id,
            })

dataset = Dataset.from_list(records)
 
# Or aggregate annotations per item
from collections import defaultdict
from statistics import mode

# Group the long-format records by item id
items = defaultdict(list)
for record in records:
    items[record["id"]].append(record)

# One row per item: text taken from the item's first record, label chosen
# as the most common label among its annotators.
aggregated = [
    {
        "id": item_id,
        "text": anns[0]["text"],
        "label": mode([a["label"] for a in anns]),  # Majority vote
        "num_annotators": len(anns),
    }
    for item_id, anns in items.items()
]

dataset = Dataset.from_list(aggregated)

الرفع إلى Hugging Face Hub

python

from huggingface_hub import login

# Login (or use HF_TOKEN env var)
login()

repo_id = "username/my-sentiment-dataset"

# Push dataset as a non-private repo; token=None uses the cached token
dataset.push_to_hub(repo_id, private=False, token=None)

# Push again, this time with an explicit commit message
dataset.push_to_hub(repo_id, commit_message="Initial upload of sentiment annotations")

بطاقة مجموعة البيانات

أنشئ README.md لمجموعة بياناتك:

markdown

---
license: cc-by-4.0
task_categories:
  - text-classification
language:
  - en
size_categories:
  - 1K<n<10K
---
 
# My Sentiment Dataset
 
## Dataset Description
 
Sentiment annotations collected using [Potato](https://potato.iro.umich.edu).
 
## Dataset Structure
 
- **train**: 8,000 examples
- **validation**: 1,000 examples
- **test**: 1,000 examples
 
### Labels
 
- Positive
- Negative
- Neutral
 
## Annotation Process
 
Annotated by 3 workers per item on Prolific.
Inter-annotator agreement (Fleiss' Kappa): 0.75
 
## Citation
 
@article{...}

تحميل مجموعة بياناتك

python

from datasets import load_dataset, load_from_disk

# From Hub
dataset = load_dataset("username/my-sentiment-dataset")

# From local: a directory written by save_to_disk() has to be read back with
# load_from_disk(); load_dataset() does not accept that on-disk layout.
# It returns what was saved, so save the DatasetDict if you need named splits.
dataset = load_from_disk("my_dataset/")

# Use for training
from transformers import Trainer

# `model` is a placeholder for the model you have already instantiated.
trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # ... add the remaining Trainer arguments here (a bare `...` after
    # keyword arguments is a SyntaxError, so the placeholder is a comment)
)

أفضل الممارسات

ضمّن البيانات الوصفية: المصدر وعملية التوسيم والاتفاق
وثّق التسميات: تعريفات واضحة للتسميات
أنشئ إصدارات لمجموعات البيانات: تتبع التغييرات بمرور الوقت
أضف الاقتباسات: انسب منهجية التوسيم
حدد الترخيص بوضوح: حدد شروط الاستخدام

وثائق التصدير الكاملة في /docs/core-concepts/data-formats.