Content Moderation

Protect your users and platform by implementing content moderation for both user inputs and AI-generated outputs.

Why Moderate?

Protect Users

Shield users from harmful, offensive, or inappropriate content

Platform Safety

Maintain community standards and brand reputation

Legal Compliance

Meet regulatory requirements (GDPR, DSA, etc.)

Reduce Abuse

Prevent misuse of your AI-powered features

Moderation Endpoint

Use the /v1/moderate endpoint:
curl https://api.assisters.dev/v1/moderate \
  -H "Authorization: Bearer ask_your_api_key" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-guard-3",
    "input": "Content to check"
  }'
Response:
{
  "results": [{
    "flagged": false,
    "categories": {
      "hate": false,
      "harassment": false,
      "self-harm": false,
      "sexual": false,
      "violence": false
    },
    "category_scores": {
      "hate": 0.0001,
      "harassment": 0.0002,
      "self-harm": 0.0001,
      "sexual": 0.0001,
      "violence": 0.0001
    }
  }]
}
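
If you are not using an SDK, the same request can be made directly over HTTP. A minimal sketch using Python's requests library, assuming the endpoint and response shape shown above (replace the placeholder key with your own):
import requests

API_KEY = "ask_your_api_key"  # placeholder: use your real key

resp = requests.post(
    "https://api.assisters.dev/v1/moderate",
    headers={
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    },
    json={"model": "llama-guard-3", "input": "Content to check"},
    timeout=30,
)
resp.raise_for_status()

# Same shape as the JSON response above
result = resp.json()["results"][0]
print("Flagged:", result["flagged"])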

Moderation Categories

Category | Description
hate | Content expressing hatred toward protected groups
hate/threatening | Hateful content with violence threats
harassment | Content meant to harass or bully
harassment/threatening | Harassment with explicit threats
self-harm | Content promoting self-harm
self-harm/intent | Expression of self-harm intent
self-harm/instructions | Instructions for self-harm
sexual | Sexually explicit content
sexual/minors | Sexual content involving minors
violence | Content depicting violence
violence/graphic | Graphic depictions of violence
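
To see which categories triggered a flag, read the boolean categories map from a result. A minimal sketch, assuming sub-categories such as hate/threatening are returned as additional keys in the same map (the sample values below are illustrative):
def flagged_categories(result: dict) -> list[str]:
    # Collect every category (including sub-categories like "hate/threatening")
    # whose boolean flag is set in a single moderation result.
    return [name for name, flagged in result["categories"].items() if flagged]

# Example, using the response format shown above
result = {
    "flagged": True,
    "categories": {"hate": False, "hate/threatening": False, "violence": True},
    "category_scores": {"hate": 0.01, "hate/threatening": 0.001, "violence": 0.92},
}
print(flagged_categories(result))  # ['violence']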

Implementation Patterns

1. Moderate User Inputs (Pre-moderation)

Check all user messages before processing:
from openai import OpenAI

client = OpenAI(api_key="ask_...", base_url="https://api.assisters.dev/v1")

def moderate_input(text):
    result = client.moderations.create(
        model="llama-guard-3",
        input=text
    ).results[0]

    return not result.flagged, result

def process_message(user_message):
    is_safe, moderation = moderate_input(user_message)

    if not is_safe:
        # Log for review
        log_violation(user_message, moderation)
        return "Your message violates our content policy."

    # Process normally
    return generate_response(user_message)

2. Moderate AI Outputs (Post-moderation)

Verify AI responses before showing them to users:
def safe_generate(user_message):
    # Generate response
    response = client.chat.completions.create(
        model="llama-3.1-8b",
        messages=[{"role": "user", "content": user_message}]
    )
    content = response.choices[0].message.content

    # Moderate output
    is_safe, moderation = moderate_input(content)

    if not is_safe:
        log_unsafe_output(content, moderation)
        return "I apologize, but I cannot provide that response."

    return content

3. Bi-directional Moderation

Check both inputs AND outputs:
def fully_moderated_chat(user_message):
    # 1. Check input
    input_safe, input_mod = moderate_input(user_message)
    if not input_safe:
        return "Your message violates our content policy."

    # 2. Generate response
    response = generate_response(user_message)

    # 3. Check output
    output_safe, output_mod = moderate_input(response)
    if not output_safe:
        # Try regenerating or return safe fallback
        return "I cannot provide a response to that."

    return response
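
As the comment in step 3 suggests, you can retry generation before falling back to a canned reply. A minimal sketch, reusing moderate_input and generate_response from the earlier examples:
def generate_with_retry(user_message, max_attempts=2):
    # Regenerate up to max_attempts times; return the first output that
    # passes moderation, otherwise fall back to a safe canned reply.
    for _ in range(max_attempts):
        candidate = generate_response(user_message)
        output_safe, _ = moderate_input(candidate)
        if output_safe:
            return candidate
    return "I cannot provide a response to that."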

4. Custom Thresholds

Use category_scores for fine-grained control:
def custom_moderation(text, thresholds=None):
    if thresholds is None:
        thresholds = {
            "hate": 0.5,
            "harassment": 0.5,
            "violence": 0.3,  # Stricter on violence
            "sexual": 0.7,    # More lenient
        }

    result = client.moderations.create(
        model="llama-guard-3",
        input=text
    ).results[0]

    violations = []
    for category, threshold in thresholds.items():
        score = result.category_scores.get(category, 0)
        if score > threshold:
            violations.append({
                "category": category,
                "score": score,
                "threshold": threshold
            })

    return len(violations) == 0, violations

5. Batch Moderation

Moderate multiple items efficiently:
def moderate_batch(texts):
    result = client.moderations.create(
        model="llama-guard-3",
        input=texts  # Up to 100 items
    )

    return [
        {"text": text, "flagged": r.flagged, "categories": r.categories}
        for text, r in zip(texts, result.results)
    ]

# Example: Moderate comments
comments = ["comment 1", "comment 2", "comment 3"]
results = moderate_batch(comments)

for r in results:
    if r["flagged"]:
        print(f"Flagged: {r['text'][:50]}...")

Handling Violations

1. Block and Notify

def handle_violation(user_id, content, moderation):
    # Block the content
    blocked = True

    # Notify user
    notification = f"Your message was blocked: {get_violation_reason(moderation)}"

    # Log for review
    log_to_moderation_queue(user_id, content, moderation)

    return blocked, notification

2. Review Queue

from enum import Enum

class ModerationAction(Enum):
    APPROVE = "approve"
    REJECT = "reject"
    ESCALATE = "escalate"

def add_to_review_queue(content, moderation, priority="normal"):
    # High scores = higher priority
    max_score = max(moderation.category_scores.values())
    if max_score > 0.8:
        priority = "high"

    review_item = {
        "content": content,
        "categories": moderation.categories,
        "scores": moderation.category_scores,
        "priority": priority,
        "status": "pending"
    }

    save_to_queue(review_item)

3. User Warnings

def handle_user_violation(user_id, violation_type):
    # Get violation history
    history = get_user_violations(user_id)

    if len(history) == 0:
        # First violation: warn
        return send_warning(user_id)
    elif len(history) < 3:
        # Repeat violations: temporary restriction
        return restrict_user(user_id, hours=24)
    else:
        # Multiple violations: escalate
        return escalate_to_admin(user_id)

Best Practices

Moderate Both Directions

Check user inputs AND AI outputs

Use Custom Thresholds

Tune sensitivity based on your use case

Log Everything

Keep audit trails for compliance (see the logging sketch after this list)

Human Review

Have humans review borderline cases
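
A minimal audit-logging sketch for the "Log Everything" practice, using Python's standard logging module; the record fields are illustrative, not a required schema:
import json
import logging
from datetime import datetime, timezone

audit_logger = logging.getLogger("moderation.audit")

def log_moderation_decision(user_id, direction, flagged, scores):
    # direction: "input" (user message) or "output" (model response).
    # Emit structured JSON so audit records are easy to query later.
    audit_logger.info(json.dumps({
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "user_id": user_id,
        "direction": direction,
        "flagged": flagged,
        "category_scores": scores,
    }))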

Complete Example

from openai import OpenAI
from dataclasses import dataclass
from typing import Optional

@dataclass
class ModerationResult:
    is_safe: bool
    reason: Optional[str]
    scores: dict

class ContentModerator:
    def __init__(self, api_key: str, thresholds: Optional[dict] = None):
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.assisters.dev/v1"
        )
        self.thresholds = thresholds or {
            "hate": 0.5,
            "harassment": 0.5,
            "self-harm": 0.3,
            "sexual": 0.5,
            "violence": 0.5,
        }

    def check(self, text: str) -> ModerationResult:
        result = self.client.moderations.create(
            model="llama-guard-3",
            input=text
        ).results[0]

        # Check against custom thresholds; category names contain hyphens,
        # so read them from the scores dict rather than via getattr
        scores = result.category_scores.__dict__
        for category, threshold in self.thresholds.items():
            score = scores.get(category, 0)
            if score > threshold:
                return ModerationResult(
                    is_safe=False,
                    reason=f"Content flagged for {category} (score: {score:.2f})",
                    scores=result.category_scores.__dict__
                )

        return ModerationResult(
            is_safe=True,
            reason=None,
            scores=result.category_scores.__dict__
        )

    def safe_chat(self, user_message: str) -> str:
        # Check input
        input_check = self.check(user_message)
        if not input_check.is_safe:
            return f"Message blocked: {input_check.reason}"

        # Generate response
        response = self.client.chat.completions.create(
            model="llama-3.1-8b",
            messages=[{"role": "user", "content": user_message}]
        )
        content = response.choices[0].message.content

        # Check output
        output_check = self.check(content)
        if not output_check.is_safe:
            return "I cannot provide an appropriate response."

        return content

# Usage
moderator = ContentModerator(api_key="ask_...")
response = moderator.safe_chat("Hello, how are you?")
print(response)

Pricing

Model | Price per Million Tokens
llama-guard-3 | $0.20
shieldgemma | $0.15
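
As a rough estimate from the prices above, moderating 10 million tokens with llama-guard-3 costs 10 × $0.20 = $2.00. A minimal sketch:
PRICE_PER_MILLION = {"llama-guard-3": 0.20, "shieldgemma": 0.15}

def moderation_cost(model: str, tokens: int) -> float:
    # Cost in USD for moderating the given number of tokens.
    return tokens / 1_000_000 * PRICE_PER_MILLION[model]

print(moderation_cost("llama-guard-3", 10_000_000))  # 2.0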

View Moderation Models

Compare moderation model capabilities