Prediction Result Schemas
The Datature Vi SDK returns one of three response types depending on the task type: VQAResponse, PhraseGroundingResponse, or GenericResponse. All three inherit from a base PredictionResponse class that provides common fields like prompt, raw_output, and thinking.
Use isinstance() to check which type you received before accessing task-specific fields.
Base response fields
All response types share these fields from PredictionResponse:
from vi.inference import ViModel
model = ViModel(run_id="your-run-id")
result, error = model(source="image.jpg", user_prompt="Describe this image")
if error is None:
print(f"Prompt: {result.prompt}")
print(f"Raw output length: {len(result.raw_output) if result.raw_output else 0}")
if result.thinking:
print(f"Model reasoning: {result.thinking}")VQA response
Returned by Visual Question Answering inference. Contains a text answer to the user's question.
Schema
class VQAResponse(PredictionResponse):
result: VQAAnswer
class VQAAnswer:
answer: str # minimum 1 character
Example
from vi.inference import ViModel
model = ViModel(run_id="your-run-id")
result, error = model(
source="image.jpg",
user_prompt="What color is the car in this image?"
)
if error is None:
answer = result.result.answer
print(f"Answer: {answer}")
print(f"Question: {result.prompt}")
if result.thinking:
print(f"Reasoning: {result.thinking}")Streaming mode
stream = model(
source="image.jpg",
user_prompt="Describe this image in detail",
stream=True
)
for token in stream:
print(token, end="", flush=True)
result = stream.get_final_completion()
print(f"\n\nFinal answer: {result.result.answer}")Phrase grounding response
Returned by Phrase Grounding inference. Contains a caption sentence and a list of detected objects with bounding boxes.
Schema
class PhraseGroundingResponse(PredictionResponse):
result: PhraseGrounding
class PhraseGrounding:
sentence: str # minimum 1 character
groundings: list[GroundedPhrase] # minimum 1
class GroundedPhrase:
phrase: str # text label, minimum 1 character
grounding: list[list[int]] # each box: [xmin, ymin, xmax, ymax]
Bounding box format
Bounding boxes use normalized coordinates in the range [0, 1024]:
- Format: [x_min, y_min, x_max, y_max]
- Top-left corner: (0, 0)
- Bottom-right corner: (1024, 1024)
- Independent of actual image dimensions
To convert to pixel coordinates, scale each value by image_width / 1024 (x) or image_height / 1024 (y).
from PIL import Image
image = Image.open("image.jpg")
width, height = image.size
x_min, y_min, x_max, y_max = bbox # [0-1024] range
pixel_x_min = int(x_min / 1024 * width)
pixel_y_min = int(y_min / 1024 * height)
pixel_x_max = int(x_max / 1024 * width)
pixel_y_max = int(y_max / 1024 * height)
Use the built-in visualize_prediction() utility for automatic coordinate conversion and rendering.
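A minimal usage sketch is shown below. The import path and keyword arguments for visualize_prediction() are assumptions for illustration and may differ from the actual SDK signature; consult the SDK reference before relying on them.
# Hypothetical sketch: the module path and parameters below are assumptions, not confirmed API.
from vi.inference import ViModel
from vi.inference.utils import visualize_prediction  # assumed import path

model = ViModel(run_id="your-run-id")
result, error = model(source="image.jpg", user_prompt="Describe the objects in this image")
if error is None:
    # Assumed to convert the [0, 1024] coordinates to pixels and draw the boxes on the image
    visualize_prediction(result, source="image.jpg")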
Example
from vi.inference import ViModel
model = ViModel(run_id="your-run-id")
result, error = model(
source="image.jpg",
user_prompt="Describe the objects in this image"
)
if error is None:
caption = result.result.sentence
print(f"Caption: {caption}")
for grounded_phrase in result.result.groundings:
phrase = grounded_phrase.phrase
bboxes = grounded_phrase.grounding
print(f"\nObject: {phrase}")
print(f" Bounding boxes: {len(bboxes)}")
for i, bbox in enumerate(bboxes):
x_min, y_min, x_max, y_max = bbox
print(f" Box {i+1}: [{x_min}, {y_min}, {x_max}, {y_max}]")Pixel coordinate conversion
from PIL import Image
def convert_bbox_to_pixels(bbox, image_path):
"""Convert normalized bbox [0-1024] to pixel coordinates."""
image = Image.open(image_path)
width, height = image.size
x_min, y_min, x_max, y_max = bbox
return [
int(x_min / 1024 * width),
int(y_min / 1024 * height),
int(x_max / 1024 * width),
int(y_max / 1024 * height)
]
result, error = model(source="image.jpg")
if error is None:
for grounded_phrase in result.result.groundings:
print(f"Object: {grounded_phrase.phrase}")
for bbox in grounded_phrase.grounding:
pixel_bbox = convert_bbox_to_pixels(bbox, "image.jpg")
print(f" Pixel coordinates: {pixel_bbox}")Filter groundings by object type
result, error = model(source="image.jpg")
if error is None:
people = [
g for g in result.result.groundings
if "person" in g.phrase.lower()
]
vehicles = [
g for g in result.result.groundings
if any(v in g.phrase.lower() for v in ["car", "truck", "vehicle"])
]
print(f"Found {len(people)} people and {len(vehicles)} vehicles")Generic response
Returned as a fallback when:
- The model output cannot be parsed into VQA or phrase grounding format
- JSON parsing fails
- Task type is explicitly set to GENERIC
Contains raw text output without structured parsing.
Schema
class GenericResponse(PredictionResponse):
result: str # complete raw output text
Example
from vi.inference import ViModel
model = ViModel(run_id="your-run-id")
result, error = model(source="image.jpg", user_prompt="Analyze this image")
if error is None:
output = result.result
print(f"Model output: {output}")
if result.raw_output != result.result:
print("Note: Structured parsing failed; using raw output")You receive a GenericResponse when:
- The model's JSON output is malformed or incomplete
- The output does not match the expected schema
- Task type is explicitly set to GENERIC
To debug: check result.raw_output for the full model output. Verify the model is trained for the task type you expect, and try adjusting generation config parameters.
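As a quick diagnostic, the sketch below (assuming the model instance from the earlier examples) checks whether the raw output is at least valid JSON; the json-based check is illustrative rather than a required SDK step.
import json

from vi.inference.task_types import GenericResponse

result, error = model(source="image.jpg", user_prompt="Analyze this image")
if error is None and isinstance(result, GenericResponse):
    print("Fell back to GenericResponse; inspecting raw output")
    try:
        # If this succeeds, the output is valid JSON that simply did not match
        # the expected VQA or phrase grounding schema
        parsed = json.loads(result.raw_output)
        print(f"Raw output is valid JSON ({type(parsed).__name__}) but did not match the expected schema")
    except (json.JSONDecodeError, TypeError):
        # Otherwise the model emitted free-form text, truncated JSON, or no output at all
        print("Raw output is not valid JSON:")
        print(result.raw_output)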
Type checking and handling
Checking response type
from vi.inference.task_types import GenericResponse
from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse
result, error = model(source="image.jpg")
if error is None:
if isinstance(result, VQAResponse):
print(f"VQA Answer: {result.result.answer}")
elif isinstance(result, PhraseGroundingResponse):
print(f"Caption: {result.result.sentence}")
print(f"Objects: {len(result.result.groundings)}")
elif isinstance(result, GenericResponse):
print(f"Raw output: {result.result}")
print("Warning: structured parsing may have failed")Safe attribute access
def extract_text(result):
"""Extract text from any response type."""
if hasattr(result, "result"):
if hasattr(result.result, "answer"):
return result.result.answer
if hasattr(result.result, "sentence"):
return result.result.sentence
if isinstance(result.result, str):
return result.result
return None
result, error = model(source="image.jpg")
if error is None:
text = extract_text(result)
print(f"Extracted text: {text}")Batch inference schemas
Batch inference returns a list of (result, error) tuples, one per image:
from vi.inference.task_types.vqa import VQAResponse
images = ["img1.jpg", "img2.jpg", "img3.jpg"]
results = model(source=images, user_prompt="What's in this image?")
for i, (result, error) in enumerate(results):
if error is None:
if isinstance(result, VQAResponse):
print(f"Image {i+1}: {result.result.answer}")
else:
print(f"Image {i+1} failed: {error}")successful = []
failed = []
for img, (result, error) in zip(images, results):
if error is None:
successful.append((img, result))
else:
failed.append((img, error))
print(f"Successful: {len(successful)}/{len(images)}")
for img_path, result in successful:
if isinstance(result, VQAResponse):
print(f"{img_path}: {result.result.answer}")Advanced usage
Accessing raw output for debugging
result, error = model(source="image.jpg")
if error is None:
print("=== Raw Model Output ===")
print(result.raw_output)
if result.thinking:
print("\n=== Model Reasoning ===")
print(result.thinking)
if isinstance(result, VQAResponse):
print("\n=== Parsed Answer ===")
print(result.result.answer)
Chain-of-thought (CoT) responses
At inference time, pass cot=True to model(...) to request chain-of-thought decoding. The model emits <think> and <answer> segments; the SDK exposes reasoning in result.thinking (when present) alongside the parsed task result.
from vi.inference import ViModel
model = ViModel(run_id="your-run-id")
result, error = model(
source="image.jpg",
user_prompt="Count the number of cars",
cot=True,
stream=False,
)
if error is None:
if result.thinking:
print("Model reasoning:")
print(result.thinking)
print("\nFinal answer:")
print(result.result.answer)
Export schema to JSON
import json
from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse
result, error = model(source="image.jpg")
if error is None:
if isinstance(result, VQAResponse):
output = {
"type": "vqa",
"prompt": result.prompt,
"answer": result.result.answer,
"thinking": result.thinking
}
elif isinstance(result, PhraseGroundingResponse):
output = {
"type": "phrase_grounding",
"prompt": result.prompt,
"sentence": result.result.sentence,
"objects": [
{"phrase": g.phrase, "bounding_boxes": g.grounding}
for g in result.result.groundings
],
"thinking": result.thinking
}
with open("result.json", "w") as f:
json.dump(output, f, indent=2)