يقيس اتفاق المُوسِّمين (Inter-Annotator Agreement، ويُختصر IAA) مدى اتساق المُوسِّمين المختلفين في توسيم العناصر نفسها. يشير الاتفاق العالي إلى توسيمات موثوقة، بينما يشير الاتفاق المنخفض إلى إرشادات غير واضحة أو مهام ذاتية بطبيعتها.

لماذا نقيس الاتفاق؟

التحقق من الإرشادات: اتفاق منخفض = تعليمات غير واضحة
تقييم صعوبة المهمة: بعض المهام ذاتية بطبيعتها
تأهيل المُوسِّمين: تحديد من يحتاج مزيداً من التدريب
الإبلاغ عن الموثوقية: مطلوب للمنشورات العلمية
تجميع التسميات: تحديد كيفية دمج التوسيمات

مقاييس الاتفاق

معامل كابا لكوهين (مُوسِّمَين)

لمقارنة مُوسِّمَين على بيانات فئوية:

text

κ = (Po - Pe) / (1 - Pe)

حيث:

Po = الاتفاق الملاحظ
Pe = الاتفاق المتوقع بالصدفة

التفسير:

كابا	التفسير
< 0	أقل من الصدفة
0.00-0.20	طفيف
0.21-0.40	معقول
0.41-0.60	متوسط
0.61-0.80	كبير
0.81-1.00	شبه مثالي

معامل كابا لفليس (3+ مُوسِّمين)

لعدة مُوسِّمين على بيانات فئوية:

yaml

quality_control:
  agreement:
    metrics:
      - fleiss_kappa

يُفسَّر معامل كابا لفليس وفق مقياس التفسير نفسه المستخدم لمعامل كابا لكوهين.

معامل ألفا لكريبندورف

الأكثر مرونة - يعمل مع:

أي عدد من المُوسِّمين
بيانات مفقودة
أنواع بيانات متنوعة (اسمية، ترتيبية، فترية، نسبية)

yaml

quality_control:
  agreement:
    metrics:
      - krippendorff_alpha
    alpha_level: nominal  # or ordinal, interval, ratio

التفسير:

α ≥ 0.80: موثوق
0.67 ≤ α < 0.80: مقبول بشكل مبدئي
α < 0.67: غير موثوق

تكوين الاتفاق في Potato

الإعداد الأساسي

yaml

quality_control:
  agreement:
    enabled: true
    calculate_on_overlap: true
 
    metrics:
      - cohens_kappa
      - fleiss_kappa
      - krippendorff_alpha
 
    # Per annotation scheme
    per_scheme: true
 
    # Reporting
    report_interval: 100  # Every 100 annotations
    export_file: agreement_report.json

تكوين التداخل

yaml

quality_control:
  redundancy:
    # How many annotators per item
    annotations_per_item: 3
 
    # Minimum overlap for calculations
    min_overlap_for_agreement: 2
 
    # Sampling for agreement
    agreement_sample_size: 100  # Calculate on 100 items
    agreement_sample_method: random  # or stratified, all

حساب الاتفاق

في لوحة التحكم

يعرض Potato مقاييس الاتفاق في لوحة تحكم المسؤول:

yaml

quality_control:
  dashboard:
    show_agreement: true
    agreement_chart: true
    update_frequency: 60  # seconds

عبر API

bash

# Get current agreement metrics
curl http://localhost:8000/api/quality/agreement
 
# Response:
{
  "overall": {
    "fleiss_kappa": 0.72,
    "krippendorff_alpha": 0.75
  },
  "per_scheme": {
    "sentiment": {
      "fleiss_kappa": 0.78,
      "krippendorff_alpha": 0.80
    },
    "topic": {
      "fleiss_kappa": 0.65,
      "krippendorff_alpha": 0.68
    }
  },
  "sample_size": 150,
  "annotator_pairs": 10
}

عبر سطر الأوامر

bash

# Calculate agreement from output files
potato agreement --annotations annotation_output/ --output agreement_report.json
 
# With specific metric
potato agreement --annotations annotation_output/ --metric krippendorff --level ordinal

الاتفاق لأنواع التوسيم المختلفة

الفئوي (Radio، Multiselect)

yaml

quality_control:
  agreement:
    schemes:
      sentiment:
        type: nominal
        metrics: [cohens_kappa, fleiss_kappa]
 
      urgency:
        type: ordinal  # Low < Medium < High
        metrics: [krippendorff_alpha]

مقاييس ليكرت

yaml

quality_control:
  agreement:
    schemes:
      quality_rating:
        type: ordinal
        metrics: [krippendorff_alpha, weighted_kappa]
 
        # Weighted kappa for ordinal
        weighting: linear  # or quadratic

توسيمات النطاق

لمهام التعرف على الكيانات المسماة (NER)، تتطلب النطاقات معالجة خاصة:

yaml

quality_control:
  agreement:
    schemes:
      entities:
        type: span
        span_matching: overlap  # or exact, token
 
        # What to compare
        compare: label_and_span  # or label_only, span_only
 
        # Overlap threshold for "match"
        overlap_threshold: 0.5
 
        metrics:
          - span_f1
          - span_precision
          - span_recall

الترتيبات

yaml

quality_control:
  agreement:
    schemes:
      preference_rank:
        type: ranking
        metrics:
          - kendall_tau
          - spearman_rho

الاتفاق الثنائي مقابل الكلي

الثنائي (كل زوج)

yaml

quality_control:
  agreement:
    pairwise: true
    output_matrix: true  # Agreement matrix
 
# Output:
# annotator1 × annotator2: κ = 0.75
# annotator1 × annotator3: κ = 0.68
# annotator2 × annotator3: κ = 0.82

الكلي (جميع المُوسِّمين)

yaml

quality_control:
  agreement:
    overall: true
    metrics:
      - fleiss_kappa  # Designed for 3+ annotators
      - krippendorff_alpha

التعامل مع الاتفاق المنخفض

تحديد مناطق المشكلة

yaml

quality_control:
  agreement:
    diagnostics:
      enabled: true
 
      # Items with most disagreement
      show_disagreed_items: true
      disagreement_threshold: 0.5
 
      # Labels with most confusion
      confusion_matrix: true
 
      # Annotators with low agreement
      per_annotator_agreement: true

الإجراءات عند الاتفاق المنخفض

yaml

quality_control:
  agreement:
    alerts:
      - threshold: 0.6
        action: notify
        message: "Agreement below 0.6 - review guidelines"
 
      - threshold: 0.4
        action: pause
        message: "Agreement critically low - pausing task"
 
    # Automatic guideline reminders
    show_guidelines_on_low_agreement: true
    guideline_threshold: 0.5

التكوين الكامل

yaml

annotation_task_name: "Agreement-Tracked Annotation"
 
quality_control:
  # Redundancy setup
  redundancy:
    annotations_per_item: 3
    assignment_method: random
 
  # Agreement calculation
  agreement:
    enabled: true
 
    # Metrics
    metrics:
      - fleiss_kappa
      - krippendorff_alpha
 
    # Per-scheme configuration
    schemes:
      sentiment:
        type: nominal
        metrics: [fleiss_kappa, cohens_kappa]
 
      intensity:
        type: ordinal
        metrics: [krippendorff_alpha]
        alpha_level: ordinal
 
      entities:
        type: span
        span_matching: overlap
        overlap_threshold: 0.5
        metrics: [span_f1]
 
    # Calculation settings
    calculate_on_overlap: true
    min_overlap: 2
    sample_size: all  # or number
 
    # Pairwise analysis
    pairwise: true
    pairwise_output: agreement_matrix.csv
 
    # Diagnostics
    diagnostics:
      confusion_matrix: true
      disagreed_items: true
      per_annotator: true
 
    # Alerts
    alerts:
      - metric: fleiss_kappa
        threshold: 0.6
        action: notify
 
    # Reporting
    report_file: agreement_report.json
    report_interval: 50
 
  # Dashboard
  dashboard:
    show_agreement: true
    charts:
      - agreement_over_time
      - per_scheme_agreement
      - annotator_comparison

تقرير الإخراج

json

{
  "timestamp": "2024-10-25T15:30:00Z",
  "sample_size": 500,
  "annotators": ["ann1", "ann2", "ann3"],
 
  "overall_agreement": {
    "fleiss_kappa": 0.72,
    "krippendorff_alpha": 0.75
  },
 
  "per_scheme": {
    "sentiment": {
      "fleiss_kappa": 0.78,
      "confusion_matrix": {
        "Positive": {"Positive": 180, "Negative": 5, "Neutral": 15},
        "Negative": {"Positive": 8, "Negative": 165, "Neutral": 12},
        "Neutral": {"Positive": 12, "Negative": 10, "Neutral": 93}
      }
    }
  },
 
  "pairwise": {
    "ann1_ann2": 0.75,
    "ann1_ann3": 0.70,
    "ann2_ann3": 0.72
  },
 
  "per_annotator": {
    "ann1": {"avg_agreement": 0.73, "items_annotated": 500},
    "ann2": {"avg_agreement": 0.74, "items_annotated": 500},
    "ann3": {"avg_agreement": 0.71, "items_annotated": 500}
  },
 
  "most_disagreed_items": [
    {"id": "item_234", "disagreement_rate": 1.0},
    {"id": "item_567", "disagreement_rate": 0.67}
  ]
}

أفضل الممارسات

احسب مبكراً: لا تنتظر حتى النهاية
استخدم المقاييس المناسبة: اسمية مقابل ترتيبية مقابل نطاق
حقق في الاتفاق المنخفض: غالباً يكشف مشاكل في الإرشادات
أبلغ في المنشورات: مطلوب للعمل العلمي
حدد عتبات: عرّف المستويات المقبولة مسبقاً

الخطوات التالية

حسّن الاتفاق من خلال ضبط الجودة
أضف مراحل تدريبية للمعايرة
تعلّم كيفية تصدير البيانات مع معلومات الاتفاق

وثائق الاتفاق الكاملة في /docs/core-concepts/user-management.