Handle Results
After Datature Vi returns inference results, you access the data through the response object's fields. The structure differs by task type: VQA responses contain a text answer, while phrase grounding responses contain a caption plus bounding boxes.

Before you begin, you should have:

- A loaded model with completed inference
- An understanding of task types (VQA or phrase grounding)
- Familiarity with the prediction schemas for field names
Accessing results
Basic result access
Non-streaming inference returns a (result, error) tuple. Check that error is None before accessing any fields.
```python
from vi.inference import ViModel
from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

model = ViModel(run_id="your-run-id")

result, error = model(
    source="image.jpg",
    user_prompt="Describe this image"
)

if error is None:
    if isinstance(result, VQAResponse):
        print(f"Answer: {result.result.answer}")
    elif isinstance(result, PhraseGroundingResponse):
        print(f"Caption: {result.result.sentence}")
else:
    print(f"Error: {error}")
```

See complete schema reference →
Safe field access for mixed response types
When your code processes results from different task types, use a helper function to extract the text regardless of response type:
```python
from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def get_text(result):
    if isinstance(result, VQAResponse):
        return result.result.answer
    elif isinstance(result, PhraseGroundingResponse):
        return result.result.sentence
    return result.result  # GenericResponse fallback

result, error = model(source="image.jpg")
if error is None:
    print(get_text(result))
```

Working with captions
VQA answers
VQA responses expose the answer text at result.result.answer:
```python
from vi.inference.task_types.vqa import VQAResponse

result, error = model(
    source="image.jpg",
    user_prompt="What's in this image?"
)

if error is None and isinstance(result, VQAResponse):
    text = result.result.answer
    print(f"Answer: {text}")
```

Phrase grounding captions
Phrase grounding responses expose the sentence at result.result.sentence:
```python
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

result, error = model(source="image.jpg")
if error is None and isinstance(result, PhraseGroundingResponse):
    text = result.result.sentence
    print(f"Caption: {text}")
```

Save captions to file
```python
from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def get_text(result):
    if isinstance(result, VQAResponse):
        return result.result.answer
    elif isinstance(result, PhraseGroundingResponse):
        return result.result.sentence
    return result.result

# Save results from a batch
results = model(source="./images/")

texts = []
for result, error in results:
    if error is None:
        texts.append(get_text(result))

with open("outputs.txt", "w") as f:
    f.write("\n".join(texts))
```

Working with grounded phrases
Accessing bounding boxes
Phrase grounding results include a list of GroundedPhrase objects. Each has a phrase (text label) and a grounding (list of bounding boxes).
```python
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

result, error = model(source="image.jpg")
if error is None and isinstance(result, PhraseGroundingResponse):
    for grounding in result.result.groundings:
        print(f"Phrase: {grounding.phrase}")
        print(f"Bounding boxes: {grounding.grounding}")
```

View complete phrase grounding schema →
Coordinate system
Bounding boxes use normalized coordinates in the range [0, 1024] with the format [x_min, y_min, x_max, y_max], where (0, 0) is the top-left corner and (1024, 1024) is the bottom-right corner. Values are independent of actual image dimensions. Convert to pixel coordinates before drawing or calculating areas.
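As a quick worked example with hypothetical numbers, here is the arithmetic for one normalized box on a 1920×1080 image; the conversion helper below generalizes this:

```python
# Hypothetical normalized box and image size, for illustration only.
bbox = [256, 128, 768, 640]        # [x_min, y_min, x_max, y_max] in [0, 1024]
width, height = 1920, 1080

x_min = bbox[0] / 1024 * width     # 256 / 1024 * 1920 = 480
y_min = bbox[1] / 1024 * height    # 128 / 1024 * 1080 = 135
x_max = bbox[2] / 1024 * width     # 768 / 1024 * 1920 = 1440
y_max = bbox[3] / 1024 * height    # 640 / 1024 * 1080 = 675

area = (x_max - x_min) * (y_max - y_min)  # 960 * 540 = 518400 pixels
```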
Converting to pixel coordinates
```python
from PIL import Image
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def bbox_to_pixels(bbox, image_path):
    """Convert normalized bbox [0-1024] to pixel coordinates."""
    image = Image.open(image_path)
    width, height = image.size
    x_min, y_min, x_max, y_max = bbox
    return [
        int(x_min / 1024 * width),
        int(y_min / 1024 * height),
        int(x_max / 1024 * width),
        int(y_max / 1024 * height)
    ]

result, error = model(source="image.jpg")
if error is None and isinstance(result, PhraseGroundingResponse):
    for grounding in result.result.groundings:
        for bbox in grounding.grounding:
            pixel_bbox = bbox_to_pixels(bbox, "image.jpg")
            print(f"{grounding.phrase}: {pixel_bbox}")
```

Filter by object type
```python
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

result, error = model(source="image.jpg")
if error is None and isinstance(result, PhraseGroundingResponse):
    people = [g for g in result.result.groundings if "person" in g.phrase.lower()]
    vehicles = [
        g for g in result.result.groundings
        if any(v in g.phrase.lower() for v in ["car", "truck", "vehicle"])
    ]
    print(f"Found {len(people)} people and {len(vehicles)} vehicles")
```
Visualization

Built-in visualization
The Vi SDK includes a visualize_prediction() utility that renders bounding boxes, phrase labels, and VQA panels automatically:
```python
from pathlib import Path

from vi.inference import ViModel
from vi.inference.utils.visualize import visualize_prediction

model = ViModel(run_id="your-run-id")
result, error = model(source="image.jpg")

if error is None:
    image = visualize_prediction(
        image_path=Path("image.jpg"),
        prediction=result
    )
    image.show()
    image.save("prediction_visualization.png")
```

visualize_prediction() handles:

- Bounding boxes with phrase labels (phrase grounding)
- Question and answer panels (VQA)
- Coordinate conversion from [0, 1024] to pixel space
- Text wrapping and font sizing
Note: visualize_prediction() only works with PhraseGroundingResponse and VQAResponse. It does not support GenericResponse. For GenericResponse outputs, implement custom visualization using PIL, OpenCV, or matplotlib.
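As a starting point, here is a minimal sketch of such a fallback using PIL. The helper name overlay_generic_text is ours, not part of the SDK, and it assumes result.result is plain text:

```python
from PIL import Image, ImageDraw

def overlay_generic_text(image_path, result, output_path="generic_output.jpg"):
    """Hypothetical helper: draw a GenericResponse's text onto the image."""
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    # Truncate long outputs so the overlay stays readable.
    text = str(result.result)[:200]
    draw.text((10, 10), text, fill="white")
    image.save(output_path)
```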
Custom visualization
Use PIL/Pillow when you need custom colors, fonts, or layouts:
```python
from PIL import Image, ImageDraw
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def visualize_result(image_path, result, output_path="output.jpg"):
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    width, height = image.size
    if isinstance(result, PhraseGroundingResponse):
        for grounding in result.result.groundings:
            for bbox in grounding.grounding:
                x_min = bbox[0] / 1024 * width
                y_min = bbox[1] / 1024 * height
                x_max = bbox[2] / 1024 * width
                y_max = bbox[3] / 1024 * height
                draw.rectangle([(x_min, y_min), (x_max, y_max)], outline="red", width=3)
                draw.text((x_min, y_min - 10), grounding.phrase, fill="red")
        draw.text((10, 10), result.result.sentence[:100], fill="white")
    image.save(output_path)

result, error = model(source="image.jpg")
if error is None:
    visualize_result("image.jpg", result, "output.jpg")
```

To distinguish multiple phrases, assign each grounding its own color:

```python
from PIL import Image, ImageDraw
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def visualize_with_colors(image_path, result, output_path="output.jpg"):
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    width, height = image.size
    colors = ["red", "blue", "green", "yellow", "purple", "orange"]
    if isinstance(result, PhraseGroundingResponse):
        for i, grounding in enumerate(result.result.groundings):
            color = colors[i % len(colors)]
            for bbox in grounding.grounding:
                x_min = bbox[0] / 1024 * width
                y_min = bbox[1] / 1024 * height
                x_max = bbox[2] / 1024 * width
                y_max = bbox[3] / 1024 * height
                draw.rectangle([(x_min, y_min), (x_max, y_max)], outline=color, width=3)
                draw.text((x_min, y_min - 10), grounding.phrase, fill=color)
    image.save(output_path)

result, error = model(source="image.jpg")
if error is None:
    visualize_with_colors("image.jpg", result, "colored_output.jpg")
```

Exporting results
Export to JSON
```python
import json

from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def export_to_json(image_path, result, output_path="result.json"):
    data = {"image": image_path, "text": None, "objects": []}
    if isinstance(result, VQAResponse):
        data["text"] = result.result.answer
        data["type"] = "vqa"
    elif isinstance(result, PhraseGroundingResponse):
        data["text"] = result.result.sentence
        data["type"] = "phrase_grounding"
        for grounding in result.result.groundings:
            data["objects"].append({
                "phrase": grounding.phrase,
                "bounding_boxes": grounding.grounding
            })
    else:
        data["text"] = result.result
        data["type"] = "generic"
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)

result, error = model(source="image.jpg")
if error is None:
    export_to_json("image.jpg", result, "result.json")
```

Export batch results to CSV
```python
import csv

from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def export_batch_to_csv(results, image_paths, output_path="results.csv"):
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Image", "Text", "Object Count", "Objects"])
        for img_path, (result, error) in zip(image_paths, results):
            # Images whose inference returned an error are skipped.
            if error is None:
                if isinstance(result, VQAResponse):
                    text = result.result.answer
                    obj_count, objects = 0, []
                elif isinstance(result, PhraseGroundingResponse):
                    text = result.result.sentence
                    obj_count = len(result.result.groundings)
                    objects = [g.phrase for g in result.result.groundings]
                else:
                    text = result.result
                    obj_count, objects = 0, []
                writer.writerow([img_path, text, obj_count, "; ".join(objects)])

results = model(source="./images/")
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
export_batch_to_csv(results, image_paths, "results.csv")
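Rather than hardcoding image_paths, you can derive them from the source directory. A minimal sketch, assuming the batch iterator yields results in the same order the SDK discovers the files; verify this against your SDK version:

```python
from pathlib import Path

# Assumption: batch results arrive in the same order as these sorted paths.
image_paths = [str(p) for p in sorted(Path("./images/").glob("*.jpg"))]
results = model(source="./images/")
export_batch_to_csv(results, image_paths, "results.csv")
```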
Common workflows

Dataset annotation
```python
import json

from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def annotate_dataset(model, image_dir, output_file):
    results = model(
        source=image_dir,
        user_prompt="Describe this image concisely",
        recursive=True,
        show_progress=True
    )
    annotations = []
    for result, error in results:
        if error is not None:
            continue
        annotation = {}
        if isinstance(result, VQAResponse):
            annotation["text"] = result.result.answer
            annotation["type"] = "vqa"
        elif isinstance(result, PhraseGroundingResponse):
            annotation["text"] = result.result.sentence
            annotation["type"] = "phrase_grounding"
            annotation["objects"] = [
                {"phrase": g.phrase, "bounding_boxes": g.grounding}
                for g in result.result.groundings
            ]
        else:
            annotation["text"] = result.result
            annotation["type"] = "generic"
        annotations.append(annotation)
    with open(output_file, "w") as f:
        json.dump(annotations, f, indent=2)
    print(f"Generated {len(annotations)} annotations")

annotate_dataset(model, "./images", "annotations.json")
```

Quality control validation
```python
from vi.inference.task_types.vqa import VQAResponse
from vi.inference.task_types.phrase_grounding import PhraseGroundingResponse

def validate_predictions(model, test_cases):
    results = []
    for test in test_cases:
        result, error = model(
            source=test["image"],
            user_prompt=test["prompt"]
        )
        if error is None:
            if isinstance(result, VQAResponse):
                prediction_text = result.result.answer
            elif isinstance(result, PhraseGroundingResponse):
                prediction_text = result.result.sentence
            else:
                prediction_text = result.result
            match = test["expected"].lower() in prediction_text.lower()
            results.append({
                "image": test["image"],
                "prediction": prediction_text,
                "expected": test["expected"],
                "match": match
            })
        else:
            results.append({"image": test["image"], "error": str(error), "match": False})
    matches = sum(1 for r in results if r.get("match", False))
    print(f"Accuracy: {matches / len(results):.2%}")
    return results

test_cases = [
    {"image": "defect1.jpg", "prompt": "Any defects?", "expected": "defect"},
    {"image": "good1.jpg", "prompt": "Any defects?", "expected": "no defect"}
]
validate_predictions(model, test_cases)
```