Run NIM Inference
To run inference with a deployed NIM container, use NIMPredictor. It wraps the container's OpenAI-compatible API and returns structured results for each task type.
Basic inference
Create a predictor
Pass a NIMConfig to NIMPredictor and the task type is read from model metadata automatically:
from vi.deployment.nim import NIMPredictor, NIMConfig
config = NIMConfig(
nvidia_api_key="nvapi-...",
run_id="your-run-id",
port=8000
)
predictor = NIMPredictor(config=config)

Alternatively, configure the predictor manually when you know the model and task:

predictor = NIMPredictor(
    model_name="cosmos-reason2-2b",
    task_type="phrase-grounding",
    port=8000
)

Run inference on an image
result = predictor(source="image.jpg", stream=False)
print(f"Caption: {result.caption}")
print(f"Result: {result.result}")
# Phrase grounding: check for bounding boxes
if hasattr(result, "phrase_grounding"):
    for phrase in result.phrase_grounding:
        print(f"Phrase: {phrase.phrase}")
        print(f"Box: {phrase.bounding_box}")

Use a custom prompt
Override the default task prompt for the request:
result = predictor(
source="image.jpg",
user_prompt="What objects are visible in this image?",
stream=False
)
print(result.result)

Streaming inference
Streaming returns tokens as they are generated. This is useful for real-time user interfaces where you want to show partial output progressively.
gen = predictor(source="image.jpg", stream=True)
result = None
# Iterate with next() rather than a for loop: the final structured result
# travels on the generator's StopIteration, which a for loop would discard.
try:
    while True:
        token = next(gen)
        print(token, end="", flush=True)
except StopIteration as e:
    result = e.value
print(f"\n\nFinal result: {result.caption}")

Streaming with a custom prompt:

gen = predictor(
    source="image.jpg",
    user_prompt="Describe this image in detail",
    stream=True
)
print("Generating: ", end="")
result = None
try:
    while True:
        token = next(gen)
        print(token, end="", flush=True)
except StopIteration as e:
    result = e.value
print("\n\nDone!")

Sampling parameters
Pass a NIMSamplingParams instance to control generation behavior. See the configuration reference for all available parameters.
Basic parameters
from vi.deployment.nim import NIMSamplingParams
params = NIMSamplingParams(
temperature=0.7, # 0.0 = deterministic, 1.0+ = more creative
max_tokens=1024,
top_p=0.95,
top_k=50
)
result = predictor(
source="image.jpg",
stream=False,
sampling_params=params
)

Advanced parameters
params = NIMSamplingParams(
# Core sampling
temperature=0.7,
top_p=0.95,
top_k=50,
min_p=0.05,
# Length
max_tokens=2048,
min_tokens=100,
# Repetition
presence_penalty=0.1,
frequency_penalty=0.1,
repetition_penalty=1.05,
# Stop sequences
stop=["\n\n", "END"],
# Reproducibility
seed=42,
# Log probabilities
logprobs=5,
prompt_logprobs=5
)
result = predictor(
source="image.jpg",
stream=False,
sampling_params=params
)

Parameter quick reference
Guided decoding
Guided decoding constrains the model's output to a specific format. This is useful when you need structured data from the model.
Choice constraint
params = NIMSamplingParams(
temperature=0.2,
guided_choice=["yes", "no", "maybe"]
)
result = predictor(
source="image.jpg",
user_prompt="Does this image contain a car?",
stream=False,
sampling_params=params
)
print(result.result)  # "yes", "no", or "maybe"

JSON schema
import json
schema = {
"type": "object",
"properties": {
"objects": {"type": "array", "items": {"type": "string"}},
"count": {"type": "integer"},
"confidence": {"type": "number"}
},
"required": ["objects", "count"]
}
params = NIMSamplingParams(temperature=0.3, guided_json=schema)
result = predictor(
source="image.jpg",
user_prompt="List all objects in this image",
stream=False,
sampling_params=params
)
output = json.loads(result.result)
print(f"Found {output['count']} objects: {output['objects']}")

Regex pattern
params = NIMSamplingParams(
temperature=0.1,
guided_regex=r"\d{4}-\d{2}-\d{2}" # YYYY-MM-DD
)
result = predictor(
source="document.jpg",
user_prompt="Extract the date from this document",
stream=False,
sampling_params=params
)
print(result.result)  # e.g. "2024-03-15"

Grammar constraint
grammar = """
root ::= "The image contains " number " " object "."
number ::= [0-9]+
object ::= "person" | "car" | "dog" | "cat"
"""
params = NIMSamplingParams(temperature=0.2, guided_grammar=grammar)
result = predictor(source="image.jpg", stream=False, sampling_params=params)
print(result.result)  # e.g. "The image contains 3 car."

Video inference
Cosmos-Reason2 models (cosmos-reason2-2b and cosmos-reason2-8b) accept video files. The predictor detects video automatically based on file extension.
Video inference requires a Cosmos-Reason2 model. cosmos-reason1-7b does not support video input.
Basic video processing
from vi.deployment.nim import NIMPredictor
predictor = NIMPredictor(
model_name="cosmos-reason2-2b",
task_type="vqa",
port=8000
)
result = predictor(
source="video.mp4",
user_prompt="Describe what happens in this video",
stream=False
)
print(result.result)

Video sampling parameters
from vi.deployment.nim import NIMSamplingParams
video_params = NIMSamplingParams(
temperature=0.2,
max_tokens=4096,
media_io_kwargs={"fps": 2.0}, # 2 frames per second
mm_processor_kwargs={
"shortest_edge": 336,
"longest_edge": 672
}
)
result = predictor(
source="video.mp4",
user_prompt="What activities are shown in this video?",
stream=False,
sampling_params=video_params
)

Alternatively, request an exact number of frames instead of a frame rate:

video_params = NIMSamplingParams(
temperature=0.2,
max_tokens=4096,
media_io_kwargs={"num_frames": 16}, # exactly 16 frames
mm_processor_kwargs={
"shortest_edge": 336,
"longest_edge": 672
}
)

Use either fps or num_frames in media_io_kwargs, not both.
Frame resolution options
Video analysis examples
result = predictor(
source="action.mp4",
user_prompt="What actions are performed in this video?",
stream=False,
sampling_params=NIMSamplingParams(
temperature=0.2,
media_io_kwargs={"fps": 1.0}
)
)

result = predictor(
source="tracking.mp4",
user_prompt="Track the movement of objects across frames",
stream=False,
sampling_params=NIMSamplingParams(
temperature=0.1,
media_io_kwargs={"fps": 4.0},
mm_processor_kwargs={"shortest_edge": 672}
)
)

Task types
Auto-detection
When you pass a NIMConfig with a run_id, the task type is read from model metadata automatically:
config = NIMConfig(nvidia_api_key="nvapi-...", run_id="your-run-id")
predictor = NIMPredictor(config=config)  # task type inferred from run metadata

Explicit task types
# Visual question answering
predictor = NIMPredictor(model_name="cosmos-reason2-2b", task_type="vqa", port=8000)
# Phrase grounding
predictor = NIMPredictor(model_name="cosmos-reason2-2b", task_type="phrase-grounding", port=8000)
# Freeform text (open-ended)
predictor = NIMPredictor(model_name="cosmos-reason2-2b", task_type="freeform-text", port=8000)

Task-specific response fields
VQA:
predictor = NIMPredictor(task_type="vqa", config=config)
result = predictor(source="image.jpg", stream=False)
print(result.result)  # Answer text

Phrase grounding:
predictor = NIMPredictor(task_type="phrase-grounding", config=config)
result = predictor(source="image.jpg", stream=False)
print(result.caption)
for phrase in result.phrase_grounding:
    print(f"{phrase.phrase}: {phrase.bounding_box}")

Freeform text:
predictor = NIMPredictor(task_type="freeform-text", config=config)
result = predictor(source="image.jpg", stream=False)
print(result.result)  # Raw JSON string

Handling results
Access response fields
result = predictor(source="image.jpg", stream=False)
# Common fields
print(f"Prompt: {result.prompt}")
print(f"Caption: {result.caption}")
print(f"Result: {result.result}")
# Phrase grounding fields
if hasattr(result, "phrase_grounding"):
    for phrase in result.phrase_grounding:
        print(f"Object: {phrase.phrase}")
        print(f"Box: {phrase.bounding_box}")
        print(f"Confidence: {phrase.confidence}")

Work with bounding boxes
result = predictor(source="image.jpg", stream=False)
for phrase in result.phrase_grounding:
    bbox = phrase.bounding_box
    width = bbox.x2 - bbox.x1
    height = bbox.y2 - bbox.y1
    print(f"Object: {phrase.phrase}")
    print(f"Location: x1={bbox.x1}, y1={bbox.y1}, x2={bbox.x2}, y2={bbox.y2}")
    print(f"Size: {width}x{height}")

Visualize predictions
from vi.inference.utils.visualize import visualize_prediction
result = predictor(source="image.jpg", stream=False)
output_image = visualize_prediction(
image_path="image.jpg",
prediction=result
)
output_image.save("output.jpg")

Batch processing
Process multiple images
from pathlib import Path
image_paths = list(Path("./images").glob("*.jpg"))
results = []
for image_path in image_paths:
    result = predictor(source=str(image_path), stream=False)
    results.append({"image": image_path.name, "result": result.result})
print(f"Processed {len(results)} images")

With a progress bar:

from tqdm import tqdm
from pathlib import Path
image_paths = list(Path("./images").glob("*.jpg"))
results = []
for image_path in tqdm(image_paths, desc="Processing"):
    result = predictor(source=str(image_path), stream=False)
    results.append(result)

Parallel processing
NIM containers process one request at a time. For true parallel throughput, deploy multiple containers on different ports and route requests across them.
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
def process_image(image_path):
    result = predictor(source=str(image_path), stream=False)
    return {"image": image_path.name, "result": result.result}

image_paths = list(Path("./images").glob("*.jpg"))
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_image, image_paths))

Complete example
from vi.deployment.nim import NIMDeployer, NIMPredictor, NIMConfig, NIMSamplingParams
import os
# Deploy
config = NIMConfig(
nvidia_api_key=os.getenv("NGC_API_KEY"),
secret_key=os.getenv("DATATURE_VI_SECRET_KEY"),
organization_id=os.getenv("DATATURE_VI_ORGANIZATION_ID"),
run_id="your-run-id",
port=8000
)
deployer = NIMDeployer(config)
deploy_result = deployer.deploy()
print(f"Container deployed on port {deploy_result.port}")
# Configure inference
predictor = NIMPredictor(config=config)
params = NIMSamplingParams(
temperature=0.7,
max_tokens=1024,
top_p=0.95,
seed=42
)
try:
    # Image inference
    image_result = predictor(
        source="image.jpg",
        user_prompt="Describe this image in detail",
        stream=False,
        sampling_params=params
    )
    print(f"\nImage: {image_result.result}")

    # Video inference (Cosmos-Reason2 only)
    if "cosmos-reason2" in config.image_name:
        video_params = NIMSamplingParams(
            temperature=0.2,
            max_tokens=2048,
            media_io_kwargs={"fps": 2.0},
            mm_processor_kwargs={"shortest_edge": 336}
        )
        video_result = predictor(
            source="video.mp4",
            user_prompt="What happens in this video?",
            stream=False,
            sampling_params=video_params
        )
        print(f"\nVideo: {video_result.result}")
finally:
    NIMDeployer.stop(deploy_result.container_name)
    print("\nContainer stopped")

Related resources
Updated 7 days ago
