Run NIM Inference

To run inference with a deployed NIM container, use NIMPredictor. It wraps the container's OpenAI-compatible API and returns structured results for each task type.

Basic inference

Create a predictor

Pass a NIMConfig to NIMPredictor and the task type is read from model metadata automatically:

from vi.deployment.nim import NIMPredictor, NIMConfig

config = NIMConfig(
    nvidia_api_key="nvapi-...",
    run_id="your-run-id",
    port=8000
)

predictor = NIMPredictor(config=config)

Alternatively, construct a predictor without a NIMConfig by passing the model name and task type explicitly:

predictor = NIMPredictor(
    model_name="cosmos-reason2-2b",
    task_type="phrase-grounding",
    port=8000
)

Run inference on an image

result = predictor(source="image.jpg", stream=False)

print(f"Caption: {result.caption}")
print(f"Result: {result.result}")

# Phrase grounding: check for bounding boxes
if hasattr(result, "phrase_grounding"):
    for phrase in result.phrase_grounding:
        print(f"Phrase: {phrase.phrase}")
        print(f"Box: {phrase.bounding_box}")

Use a custom prompt

Override the default task prompt for the request:

result = predictor(
    source="image.jpg",
    user_prompt="What objects are visible in this image?",
    stream=False
)

print(result.result)

Streaming inference

Streaming returns tokens as they are generated. This is useful for real-time user interfaces where you want to show partial output progressively.

gen = predictor(source="image.jpg", stream=True)

# Iterate manually so the generator's return value can be captured from
# StopIteration. (A for loop consumes StopIteration internally and discards
# the value, so a later next() call would return None.)
result = None
try:
    while True:
        token = next(gen)
        print(token, end="", flush=True)
except StopIteration as e:
    result = e.value

print(f"\n\nFinal result: {result.caption}")
Streaming also works with a custom prompt:

gen = predictor(
    source="image.jpg",
    user_prompt="Describe this image in detail",
    stream=True
)

print("Generating: ", end="")
result = None
try:
    while True:
        print(next(gen), end="", flush=True)
except StopIteration as e:
    result = e.value

print("\n\nDone!")

Sampling parameters

Pass a NIMSamplingParams instance to control generation behavior. See the configuration reference for all available parameters.

Basic parameters

from vi.deployment.nim import NIMSamplingParams

params = NIMSamplingParams(
    temperature=0.7,   # 0.0 = deterministic, 1.0+ = more creative
    max_tokens=1024,
    top_p=0.95,
    top_k=50
)

result = predictor(
    source="image.jpg",
    stream=False,
    sampling_params=params
)

Advanced parameters

params = NIMSamplingParams(
    # Core sampling
    temperature=0.7,
    top_p=0.95,
    top_k=50,
    min_p=0.05,

    # Length
    max_tokens=2048,
    min_tokens=100,

    # Repetition
    presence_penalty=0.1,
    frequency_penalty=0.1,
    repetition_penalty=1.05,

    # Stop sequences
    stop=["\n\n", "END"],

    # Reproducibility
    seed=42,

    # Log probabilities
    logprobs=5,
    prompt_logprobs=5
)

result = predictor(
    source="image.jpg",
    stream=False,
    sampling_params=params
)

Parameter quick reference

| Name | Type | Description | Required |
| --- | --- | --- | --- |
| temperature | float | Range: 0.0–2.0. `0.0` = greedy. `0.2–0.5` = focused. `0.7–1.0` = balanced | Optional |
| top_p | float | Range: 0.0–1.0. `0.95` recommended. `1.0` = all tokens | Optional |
| top_k | int | Range: -1 or ≥1. `50` balanced. `-1` = no filtering | Optional |
| max_tokens | int | Range: ≥1. Cap on output length | Optional |

Guided decoding

Guided decoding constrains the model's output to a specific format. This is useful when you need structured data from the model.

Choice constraint

params = NIMSamplingParams(
    temperature=0.2,
    guided_choice=["yes", "no", "maybe"]
)

result = predictor(
    source="image.jpg",
    user_prompt="Does this image contain a car?",
    stream=False,
    sampling_params=params
)

print(result.result)  # "yes", "no", or "maybe"

JSON schema

import json

schema = {
    "type": "object",
    "properties": {
        "objects": {"type": "array", "items": {"type": "string"}},
        "count": {"type": "integer"},
        "confidence": {"type": "number"}
    },
    "required": ["objects", "count"]
}

params = NIMSamplingParams(temperature=0.3, guided_json=schema)

result = predictor(
    source="image.jpg",
    user_prompt="List all objects in this image",
    stream=False,
    sampling_params=params
)

output = json.loads(result.result)
print(f"Found {output['count']} objects: {output['objects']}")

Regex pattern

params = NIMSamplingParams(
    temperature=0.1,
    guided_regex=r"\d{4}-\d{2}-\d{2}"  # YYYY-MM-DD
)

result = predictor(
    source="document.jpg",
    user_prompt="Extract the date from this document",
    stream=False,
    sampling_params=params
)

print(result.result)  # e.g. "2024-03-15"
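
Because the output is constrained to the pattern, it can be handed straight to a parser. A minimal sketch using only the standard library; the literal string below stands in for `result.result`:

```python
from datetime import datetime

# Stand-in for result.result from the regex-constrained request above
raw = "2024-03-15"

# guided_regex guarantees the YYYY-MM-DD shape, so strptime will not fail
# on format mismatches (it can still reject impossible dates like 2024-13-40)
parsed = datetime.strptime(raw, "%Y-%m-%d").date()
print(parsed.year, parsed.month, parsed.day)
```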

Grammar constraint

grammar = """
root ::= "The image contains " number " " object "."
number ::= [0-9]+
object ::= "person" | "car" | "dog" | "cat"
"""

params = NIMSamplingParams(temperature=0.2, guided_grammar=grammar)

result = predictor(source="image.jpg", stream=False, sampling_params=params)
print(result.result)  # e.g. "The image contains 3 car."

Video inference

Cosmos-Reason2 models (cosmos-reason2-2b and cosmos-reason2-8b) accept video files. The predictor detects video automatically based on file extension.
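The exact extension list is internal to the library, but the detection idea can be sketched as a simple suffix check; the extensions below are an assumption for illustration:

```python
from pathlib import Path

# Hypothetical sketch of extension-based media detection; the predictor's
# actual supported-extension list may differ.
VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv", ".webm"}

def is_video(source: str) -> bool:
    # Compare the lowercased suffix so "clip.MP4" is detected too
    return Path(source).suffix.lower() in VIDEO_EXTENSIONS

print(is_video("clip.MP4"), is_video("image.jpg"))
```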

Video Model Support

Video inference requires a Cosmos-Reason2 model. cosmos-reason1-7b does not support video input.

Basic video processing

from vi.deployment.nim import NIMPredictor

predictor = NIMPredictor(
    model_name="cosmos-reason2-2b",
    task_type="vqa",
    port=8000
)

result = predictor(
    source="video.mp4",
    user_prompt="Describe what happens in this video",
    stream=False
)

print(result.result)

Video sampling parameters

from vi.deployment.nim import NIMSamplingParams

video_params = NIMSamplingParams(
    temperature=0.2,
    max_tokens=4096,
    media_io_kwargs={"fps": 2.0},             # 2 frames per second
    mm_processor_kwargs={
        "shortest_edge": 336,
        "longest_edge": 672
    }
)

result = predictor(
    source="video.mp4",
    user_prompt="What activities are shown in this video?",
    stream=False,
    sampling_params=video_params
)

Alternatively, sample a fixed number of frames regardless of video length:

video_params = NIMSamplingParams(
    temperature=0.2,
    max_tokens=4096,
    media_io_kwargs={"num_frames": 16},       # exactly 16 frames
    mm_processor_kwargs={
        "shortest_edge": 336,
        "longest_edge": 672
    }
)

Use either fps or num_frames in media_io_kwargs, not both.

Frame resolution options

| Resolution | `shortest_edge` | `longest_edge` | Use when |
| --- | --- | --- | --- |
| Low (faster) | 168 | 336 | Quick analysis, large videos |
| Standard | 336 | 672 | Default for most cases |
| High | 672 | 1344 | Detail-sensitive analysis |

Video analysis examples

Action recognition at a low frame rate:

result = predictor(
    source="action.mp4",
    user_prompt="What actions are performed in this video?",
    stream=False,
    sampling_params=NIMSamplingParams(
        temperature=0.2,
        media_io_kwargs={"fps": 1.0}
    )
)

Object tracking with a higher frame rate and resolution:

result = predictor(
    source="tracking.mp4",
    user_prompt="Track the movement of objects across frames",
    stream=False,
    sampling_params=NIMSamplingParams(
        temperature=0.1,
        media_io_kwargs={"fps": 4.0},
        mm_processor_kwargs={"shortest_edge": 672}
    )
)

Task types

Auto-detection

When you pass a NIMConfig with a run_id, the task type is read from model metadata automatically:

config = NIMConfig(nvidia_api_key="nvapi-...", run_id="your-run-id")
predictor = NIMPredictor(config=config)  # task type inferred from run metadata

Explicit task types

# Visual question answering
predictor = NIMPredictor(model_name="cosmos-reason2-2b", task_type="vqa", port=8000)

# Phrase grounding
predictor = NIMPredictor(model_name="cosmos-reason2-2b", task_type="phrase-grounding", port=8000)

# Freeform text (open-ended)
predictor = NIMPredictor(model_name="cosmos-reason2-2b", task_type="freeform-text", port=8000)

Task-specific response fields

VQA:

predictor = NIMPredictor(task_type="vqa", config=config)
result = predictor(source="image.jpg", stream=False)

print(result.result)  # Answer text

Phrase grounding:

predictor = NIMPredictor(task_type="phrase-grounding", config=config)
result = predictor(source="image.jpg", stream=False)

print(result.caption)
for phrase in result.phrase_grounding:
    print(f"{phrase.phrase}: {phrase.bounding_box}")

Freeform text:

predictor = NIMPredictor(task_type="freeform-text", config=config)
result = predictor(source="image.jpg", stream=False)

print(result.result)  # Raw JSON string

Handling results

Access response fields

result = predictor(source="image.jpg", stream=False)

# Common fields
print(f"Prompt: {result.prompt}")
print(f"Caption: {result.caption}")
print(f"Result: {result.result}")

# Phrase grounding fields
if hasattr(result, "phrase_grounding"):
    for phrase in result.phrase_grounding:
        print(f"Object: {phrase.phrase}")
        print(f"Box: {phrase.bounding_box}")
        print(f"Confidence: {phrase.confidence}")

Work with bounding boxes

result = predictor(source="image.jpg", stream=False)

for phrase in result.phrase_grounding:
    bbox = phrase.bounding_box
    width = bbox.x2 - bbox.x1
    height = bbox.y2 - bbox.y1

    print(f"Object: {phrase.phrase}")
    print(f"Location: x1={bbox.x1}, y1={bbox.y1}, x2={bbox.x2}, y2={bbox.y2}")
    print(f"Size: {width}x{height}")
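
Box geometry like this also supports post-processing, for example filtering overlapping detections. A minimal intersection-over-union helper, sketched over plain (x1, y1, x2, y2) tuples rather than the library's bounding-box type:

```python
def iou(a, b):
    """Intersection-over-union of two (x1, y1, x2, y2) boxes."""
    # Overlap rectangle: max of the top-left corners, min of the bottom-right
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union else 0.0

print(iou((0, 0, 10, 10), (5, 5, 15, 15)))  # 25 / 175 ≈ 0.143
```

To use it with grounding results, pass `(bbox.x1, bbox.y1, bbox.x2, bbox.y2)` tuples built from each phrase's bounding box.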

Visualize predictions

from vi.inference.utils.visualize import visualize_prediction

result = predictor(source="image.jpg", stream=False)

output_image = visualize_prediction(
    image_path="image.jpg",
    prediction=result
)

output_image.save("output.jpg")

Batch processing

Process multiple images

from pathlib import Path

image_paths = list(Path("./images").glob("*.jpg"))
results = []

for image_path in image_paths:
    result = predictor(source=str(image_path), stream=False)
    results.append({"image": image_path.name, "result": result.result})

print(f"Processed {len(results)} images")

To track progress on long batches, wrap the loop with tqdm:

from tqdm import tqdm
from pathlib import Path

image_paths = list(Path("./images").glob("*.jpg"))
results = []

for image_path in tqdm(image_paths, desc="Processing"):
    result = predictor(source=str(image_path), stream=False)
    results.append(result)
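
To keep batch outputs for later analysis, one option is writing a JSON Lines file. A sketch assuming each record is a plain dict like those collected in the batch loop; the filenames and values here are illustrative:

```python
import json
from pathlib import Path

# Stand-ins for {"image": ..., "result": result.result} records
records = [
    {"image": "a.jpg", "result": "a red car"},
    {"image": "b.jpg", "result": "two dogs"},
]

out_path = Path("results.jsonl")
with out_path.open("w") as f:
    for record in records:
        # One JSON object per line keeps the file append-friendly
        f.write(json.dumps(record) + "\n")

print(out_path.read_text().count("\n"))  # one line per record
```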

Parallel processing

NIM containers process one request at a time. For true parallel throughput, deploy multiple containers on different ports and route requests across them.

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

def process_image(image_path):
    result = predictor(source=str(image_path), stream=False)
    return {"image": image_path.name, "result": result.result}

image_paths = list(Path("./images").glob("*.jpg"))

with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_image, image_paths))
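
The pool pattern above extends naturally to multiple containers: deploy one predictor per port and distribute images across them round-robin. A sketch of that routing; the round-robin assignment is illustrative, not a library feature, and the predictors are passed in as plain callables:

```python
from concurrent.futures import ThreadPoolExecutor

def run_round_robin(predict_fns, sources):
    """Distribute sources across predictors round-robin, one worker per predictor.

    predict_fns: callables taking a source path, e.g. wrappers around
    predictors deployed on different ports.
    """
    # Pair each source with a predictor, cycling through the pool
    assignments = [(predict_fns[i % len(predict_fns)], src)
                   for i, src in enumerate(sources)]
    with ThreadPoolExecutor(max_workers=len(predict_fns)) as executor:
        # executor.map preserves input order in its results
        return list(executor.map(lambda pair: pair[0](pair[1]), assignments))
```

With real predictors this might be called as `run_round_robin([lambda s, p=pred: p(source=s, stream=False) for pred in predictors], image_paths)`, where each predictor in `predictors` was constructed with a different port.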

Complete example

from vi.deployment.nim import NIMDeployer, NIMPredictor, NIMConfig, NIMSamplingParams
import os

# Deploy
config = NIMConfig(
    nvidia_api_key=os.getenv("NGC_API_KEY"),
    secret_key=os.getenv("DATATURE_VI_SECRET_KEY"),
    organization_id=os.getenv("DATATURE_VI_ORGANIZATION_ID"),
    run_id="your-run-id",
    port=8000
)

deployer = NIMDeployer(config)
deploy_result = deployer.deploy()
print(f"Container deployed on port {deploy_result.port}")

# Configure inference
predictor = NIMPredictor(config=config)

params = NIMSamplingParams(
    temperature=0.7,
    max_tokens=1024,
    top_p=0.95,
    seed=42
)

try:
    # Image inference
    image_result = predictor(
        source="image.jpg",
        user_prompt="Describe this image in detail",
        stream=False,
        sampling_params=params
    )
    print(f"\nImage: {image_result.result}")

    # Video inference (Cosmos-Reason2 only)
    if "cosmos-reason2" in config.image_name:
        video_params = NIMSamplingParams(
            temperature=0.2,
            max_tokens=2048,
            media_io_kwargs={"fps": 2.0},
            mm_processor_kwargs={"shortest_edge": 336}
        )

        video_result = predictor(
            source="video.mp4",
            user_prompt="What happens in this video?",
            stream=False,
            sampling_params=video_params
        )
        print(f"\nVideo: {video_result.result}")

finally:
    NIMDeployer.stop(deploy_result.container_name)
    print("\nContainer stopped")

Related resources

Deploy A Container

Pull NIM images and mount custom weights before running inference.

Configuration Reference

All NIMConfig and NIMSamplingParams parameters in one place.

Troubleshooting

Fix connection errors, slow inference, and video processing failures.