22 ONNX Runtime Inference

With the model exported, it's time to run it with ONNX Runtime.

22.1 Loading and Running

import onnxruntime as ort
import numpy as np

# Load model
session = ort.InferenceSession("iris_classifier.onnx")

# Inspect inputs/outputs
print("Inputs:", [i.name for i in session.get_inputs()])
print("Outputs:", [o.name for o in session.get_outputs()])

# Prepare input
input_name = session.get_inputs()[0].name
test_data = np.array([[5.1, 3.5, 1.4, 0.2]], dtype=np.float32)

# Run inference
outputs = session.run(None, {input_name: test_data})
logits = outputs[0]

print(f"Logits: {logits}")
print(f"Predicted class: {logits.argmax()}")

22.2 Verifying Export Correctness

Before trusting the exported file, compare TensorWeaver and ONNX Runtime outputs on the same input:

def verify_export(tw_model, onnx_path, test_input):
    """Verify ONNX model matches TensorWeaver model."""

    # TensorWeaver inference
    tw_model.eval()
    tw_input = Tensor(test_input)
    tw_output = tw_model(tw_input).data

    # ONNX inference
    session = ort.InferenceSession(onnx_path)
    input_name = session.get_inputs()[0].name
    ort_output = session.run(None, {input_name: test_input})[0]

    # Compare
    max_diff = np.abs(tw_output - ort_output).max()
    print(f"Max difference: {max_diff}")

    if max_diff < 1e-5:
        print("✓ Export verified!")
        return True
    else:
        print("✗ Outputs don't match!")
        return False

# Test
test_data = np.random.randn(10, 4).astype(np.float32)
verify_export(model, "iris_classifier.onnx", test_data)
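If verification runs inside a test suite, np.testing.assert_allclose is a stricter alternative to the manual max-difference check: it raises an AssertionError with a detailed mismatch report when the outputs drift. A minimal sketch, assuming tw_output and ort_output have been computed as inside verify_export; the tolerances are illustrative, not prescribed values:

# Drop-in replacement for the manual threshold comparison in verify_export.
# rtol/atol here are example tolerances, not framework requirements.
np.testing.assert_allclose(tw_output, ort_output, rtol=1e-4, atol=1e-5)
print("✓ Export verified (within rtol=1e-4, atol=1e-5)")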

22.3 Batch Inference

# Single sample
single = np.array([[5.1, 3.5, 1.4, 0.2]], dtype=np.float32)
result = session.run(None, {input_name: single})[0]

# Batch of samples
batch = np.random.randn(100, 4).astype(np.float32)
results = session.run(None, {input_name: batch})[0]
print(f"Batch output shape: {results.shape}")  # (100, 3)

22.4 Performance Optimization

22.4.1 Session Options

sess_options = ort.SessionOptions()

# Enable all optimizations
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Set thread count
sess_options.intra_op_num_threads = 4

# Enable memory pattern optimization (reuse allocation plans across runs)
sess_options.enable_mem_pattern = True

session = ort.InferenceSession(
    "model.onnx",
    sess_options,
    providers=['CPUExecutionProvider']
)
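To see what those optimizations actually produced, SessionOptions can write the optimized graph back to disk, where it can be inspected (for example in Netron) or reloaded directly. A short sketch; the output filename is just an example:

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Save the graph as it looks after the optimization passes have run
sess_options.optimized_model_filepath = "iris_classifier_optimized.onnx"

session = ort.InferenceSession(
    "iris_classifier.onnx",
    sess_options,
    providers=['CPUExecutionProvider']
)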

22.4.2 Execution Providers

# Check available providers
print(ort.get_available_providers())
# e.g. ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
# (the default CPU-only package lists just 'CPUExecutionProvider')

# Use GPU if available
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession("model.onnx", providers=providers)

# Check which provider is used
print(session.get_providers())
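Providers can also carry per-provider options, passed as (name, options) tuples. A hedged sketch that pins the CUDA provider to a specific GPU; device_id is a CUDA provider option in the ONNX Runtime builds I'm aware of, so check the documentation for your installed version:

providers = [
    ("CUDAExecutionProvider", {"device_id": 0}),  # prefer GPU 0
    "CPUExecutionProvider",                       # fall back to CPU
]
session = ort.InferenceSession("iris_classifier.onnx", providers=providers)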

22.5 Benchmarking

import time

def benchmark(session, input_data, num_warmup=10, num_runs=100):
    """Benchmark inference speed."""
    input_name = session.get_inputs()[0].name

    # Warmup
    for _ in range(num_warmup):
        session.run(None, {input_name: input_data})

    # Benchmark
    start = time.perf_counter()
    for _ in range(num_runs):
        session.run(None, {input_name: input_data})
    elapsed = time.perf_counter() - start

    avg_ms = (elapsed / num_runs) * 1000
    throughput = num_runs / elapsed

    print(f"Average latency: {avg_ms:.2f} ms")
    print(f"Throughput: {throughput:.1f} inferences/sec")

# Benchmark
test_batch = np.random.randn(32, 4).astype(np.float32)
benchmark(session, test_batch)
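Average latency hides tail behavior, which is often what production requirements are written against. The same warmup-then-measure pattern can record per-run timings and report percentiles; a sketch reusing the session and test_batch from above:

def benchmark_percentiles(session, input_data, num_warmup=10, num_runs=100):
    """Report p50/p95/p99 latency instead of just the mean."""
    input_name = session.get_inputs()[0].name

    # Warmup
    for _ in range(num_warmup):
        session.run(None, {input_name: input_data})

    # Record each run individually
    timings = []
    for _ in range(num_runs):
        start = time.perf_counter()
        session.run(None, {input_name: input_data})
        timings.append((time.perf_counter() - start) * 1000)  # ms

    p50, p95, p99 = np.percentile(timings, [50, 95, 99])
    print(f"p50: {p50:.2f} ms, p95: {p95:.2f} ms, p99: {p99:.2f} ms")

benchmark_percentiles(session, test_batch)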

22.6 Complete Deployment Example

import onnxruntime as ort
import numpy as np

class IrisClassifier:
    """Production Iris classifier using ONNX Runtime."""

    def __init__(self, model_path):
        self.session = ort.InferenceSession(model_path)
        self.input_name = self.session.get_inputs()[0].name
        self.classes = ['setosa', 'versicolor', 'virginica']

    def predict(self, features):
        """
        Predict iris class.

        Args:
            features: Array of shape (n, 4) with
                      [sepal_length, sepal_width, petal_length, petal_width]

        Returns:
            List of predicted class names
        """
        features = np.array(features, dtype=np.float32)
        if features.ndim == 1:
            features = features.reshape(1, -1)

        logits = self.session.run(None, {self.input_name: features})[0]
        class_ids = logits.argmax(axis=-1)

        return [self.classes[i] for i in class_ids]

    def predict_proba(self, features):
        """Return class probabilities."""
        features = np.array(features, dtype=np.float32)
        if features.ndim == 1:
            features = features.reshape(1, -1)

        logits = self.session.run(None, {self.input_name: features})[0]

        # Softmax
        exp_logits = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

        return probs

# Usage
classifier = IrisClassifier("iris_classifier.onnx")

# Single prediction
result = classifier.predict([5.1, 3.5, 1.4, 0.2])
print(f"Prediction: {result[0]}")  # 'setosa'

# Batch prediction
batch = [[5.1, 3.5, 1.4, 0.2], [6.2, 2.9, 4.3, 1.3]]
results = classifier.predict(batch)
print(f"Batch predictions: {results}")

# With probabilities
probs = classifier.predict_proba([5.1, 3.5, 1.4, 0.2])
print(f"Probabilities: {dict(zip(classifier.classes, probs[0]))}")

22.7 Part VI Complete!

Tip

Milestone: You can now deploy models to production!

  • ✓ ONNX format understanding
  • ✓ Export from TensorWeaver
  • ✓ ONNX Runtime inference
  • ✓ Performance optimization

Your models are now portable and production-ready!

22.8 Summary

  • ONNX Runtime loads and runs exported models through InferenceSession
  • Verify every export by comparing outputs against the original TensorWeaver model
  • Execution providers select the hardware backend (CPU, CUDA, TensorRT, ...)
  • Session options control graph optimization, threading, and memory behavior
  • Benchmark latency and throughput against your production requirements

Next: GPU acceleration with cuNumeric.