22 ONNX Runtime Inference
Model exported. Let’s run it!
22.1 Loading and Running
import onnxruntime as ort
import numpy as np
# Load model
session = ort.InferenceSession("iris_classifier.onnx")
# Inspect inputs/outputs
print("Inputs:", [i.name for i in session.get_inputs()])
print("Outputs:", [o.name for o in session.get_outputs()])
# Prepare input
input_name = session.get_inputs()[0].name
test_data = np.array([[5.1, 3.5, 1.4, 0.2]], dtype=np.float32)
# Run inference
outputs = session.run(None, {input_name: test_data})
logits = outputs[0]
print(f"Logits: {logits}")
print(f"Predicted class: {logits.argmax()}")22.2 Verifying Export Correctness
Compare TensorWeaver and ONNX outputs:
def verify_export(tw_model, onnx_path, test_input):
    """Verify ONNX model matches TensorWeaver model."""
    # TensorWeaver inference
    tw_model.eval()
    tw_input = Tensor(test_input)
    tw_output = tw_model(tw_input).data

    # ONNX inference
    session = ort.InferenceSession(onnx_path)
    input_name = session.get_inputs()[0].name
    ort_output = session.run(None, {input_name: test_input})[0]

    # Compare
    max_diff = np.abs(tw_output - ort_output).max()
    print(f"Max difference: {max_diff}")

    if max_diff < 1e-5:
        print("✓ Export verified!")
        return True
    else:
        print("✗ Outputs don't match!")
        return False
# Test
test_data = np.random.randn(10, 4).astype(np.float32)
verify_export(model, "iris_classifier.onnx", test_data)
22.3 Batch Inference
# Single sample
single = np.array([[5.1, 3.5, 1.4, 0.2]], dtype=np.float32)
result = session.run(None, {input_name: single})[0]
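# A model exported with a dynamic batch axis accepts any batch size; the
# symbolic dimension shows up as a string in the input shape (the name
# 'batch_size' below is just an example of what the exporter may have used).
print(session.get_inputs()[0].shape)  # e.g. ['batch_size', 4]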
# Batch of samples
batch = np.random.randn(100, 4).astype(np.float32)
results = session.run(None, {input_name: batch})[0]
print(f"Batch output shape: {results.shape}") # (100, 3)22.4 Performance Optimization
22.4.1 Session Options
sess_options = ort.SessionOptions()
# Enable all optimizations
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# Set thread count
sess_options.intra_op_num_threads = 4
# Enable memory optimization
sess_options.enable_mem_pattern = True
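# Optionally persist the optimized graph so the optimization work is done once;
# ONNX Runtime writes the optimized model to this path when the session is
# created (the filename here is illustrative).
sess_options.optimized_model_filepath = "model_optimized.onnx"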
session = ort.InferenceSession(
    "model.onnx",
    sess_options,
    providers=['CPUExecutionProvider']
)
22.4.2 Execution Providers
# Check available providers
print(ort.get_available_providers())
# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
# Use GPU if available
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession("model.onnx", providers=providers)
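Provider-specific settings can be passed as (name, options) tuples. A minimal sketch pinning CUDA to a particular device; the device_id value is illustrative:
session = ort.InferenceSession(
    "model.onnx",
    providers=[("CUDAExecutionProvider", {"device_id": 0}), "CPUExecutionProvider"]
)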
# Check which provider is used
print(session.get_providers())
22.5 Benchmarking
import time
def benchmark(session, input_data, num_warmup=10, num_runs=100):
    """Benchmark inference speed."""
    input_name = session.get_inputs()[0].name

    # Warmup
    for _ in range(num_warmup):
        session.run(None, {input_name: input_data})

    # Benchmark
    start = time.perf_counter()
    for _ in range(num_runs):
        session.run(None, {input_name: input_data})
    elapsed = time.perf_counter() - start

    avg_ms = (elapsed / num_runs) * 1000
    throughput = num_runs / elapsed
    print(f"Average latency: {avg_ms:.2f} ms")
    print(f"Throughput: {throughput:.1f} inferences/sec")
# Benchmark
test_batch = np.random.randn(32, 4).astype(np.float32)
benchmark(session, test_batch)
22.6 Complete Deployment Example
import onnxruntime as ort
import numpy as np
class IrisClassifier:
    """Production Iris classifier using ONNX Runtime."""

    def __init__(self, model_path):
        self.session = ort.InferenceSession(model_path)
        self.input_name = self.session.get_inputs()[0].name
        self.classes = ['setosa', 'versicolor', 'virginica']

    def predict(self, features):
        """
        Predict iris class.

        Args:
            features: Array of shape (n, 4) with
                [sepal_length, sepal_width, petal_length, petal_width]

        Returns:
            List of predicted class names
        """
        features = np.array(features, dtype=np.float32)
        if features.ndim == 1:
            features = features.reshape(1, -1)
        logits = self.session.run(None, {self.input_name: features})[0]
        class_ids = logits.argmax(axis=-1)
        return [self.classes[i] for i in class_ids]

    def predict_proba(self, features):
        """Return class probabilities."""
        features = np.array(features, dtype=np.float32)
        if features.ndim == 1:
            features = features.reshape(1, -1)
        logits = self.session.run(None, {self.input_name: features})[0]
        # Softmax
        exp_logits = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
        return probs
# Usage
classifier = IrisClassifier("iris_classifier.onnx")
# Single prediction
result = classifier.predict([5.1, 3.5, 1.4, 0.2])
print(f"Prediction: {result[0]}") # 'setosa'
# Batch prediction
batch = [[5.1, 3.5, 1.4, 0.2], [6.2, 2.9, 4.3, 1.3]]
results = classifier.predict(batch)
print(f"Batch predictions: {results}")
# With probabilities
probs = classifier.predict_proba([5.1, 3.5, 1.4, 0.2])
print(f"Probabilities: {dict(zip(classifier.classes, probs[0]))}")22.7 Part VI Complete!
Tip
Milestone: You can now deploy models to production!
- ✓ ONNX format understanding
- ✓ Export from TensorWeaver
- ✓ ONNX Runtime inference
- ✓ Performance optimization
Your models are now portable and production-ready!
22.8 Summary
- ONNX Runtime runs exported models
- Verify export with TensorWeaver comparison
- Use execution providers for hardware acceleration
- Session options for optimization
- Benchmark for production requirements
Next: GPU acceleration with cuNumeric.