Compare commits

..

5 Commits

Author SHA1 Message Date
robviren 7de23ac5be Made work on windows 2026-06-01 22:05:15 -05:00
robviren c480902306 More live opt 2026-06-01 10:37:25 -05:00
robviren bee1ed65a4 Finding a good balance 2026-05-31 00:50:46 -05:00
robviren 5578b84fd8 Better live handling 2026-05-30 18:26:38 -05:00
robviren 626d4a5a56 Better live handling 2026-05-30 18:25:42 -05:00
9 changed files with 736 additions and 180 deletions
+3 -1
View File
@@ -1 +1,3 @@
outputs/
outputs/
.venv/
__pycache__/
View File
+3 -3
View File
@@ -416,10 +416,10 @@ if __name__ == "__main__":
p.add_argument("--weights")
p.add_argument("--out-dir", default="outputs")
p.add_argument("--chunk", type=int, default=6)
p.add_argument("--enc-left", type=int, default=48)
p.add_argument("--enc-right", type=int, default=2)
p.add_argument("--enc-left", type=int, default=32)
p.add_argument("--enc-right", type=int, default=4)
p.add_argument("--dec-left", type=int, default=32)
p.add_argument("--dec-right", type=int, default=3)
p.add_argument("--dec-right", type=int, default=4)
p.add_argument("--mode", choices=["all", "ssl", "encode", "decode", "global"], default="all")
args = p.parse_args()
+413
View File
@@ -0,0 +1,413 @@
import argparse
import math
import queue
import threading
import time
from pathlib import Path
import numpy as np
import sounddevice as sd
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from miocodec.model import MioCodecModel
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import gc
class StreamingISTFT:
    """Chunk-wise inverse STFT based on windowed overlap-add.

    Each call to :meth:`process` converts a block of complex spectrogram
    frames to audio. The last ``win - hop`` samples of every block are not
    final yet (later frames still overlap them), so they are held back in
    ``tail_y`` / ``tail_e`` and merged into the next block.
    """

    def __init__(self, n_fft, hop, device):
        """Set up the Hann window and empty carry-over buffers.

        Args:
            n_fft: FFT size; also used as the window length.
            hop: Hop size in samples between consecutive frames.
            device: Torch device on which the window and buffers live.
        """
        self.n_fft = n_fft
        self.win = n_fft
        self.hop = hop
        # Samples trimmed from the very first emitted block.
        self.pad = (self.win - hop) // 2
        self.window = torch.hann_window(self.win, device=device)
        self.win_sq = (self.window**2).view(1, -1, 1)
        # Number of trailing samples that still await future frames.
        self.carry = self.win - self.hop
        self.tail_y = torch.zeros(1, 0, device=device)
        self.tail_e = torch.zeros(1, 0, device=device)
        self.started = False

    def reset(self):
        """Drop the carried tails and re-arm the first-block trimming."""
        for attr in ("tail_y", "tail_e"):
            setattr(self, attr, getattr(self, attr)[:, :0])
        self.started = False

    def _overlap_add(self, columns, span):
        """Sum window-length columns into a ``(batch, span)`` signal."""
        folded = F.fold(columns, (1, span), (1, self.win), stride=(1, self.hop))
        return folded[:, 0, 0, :]

    def process(self, spec):
        """Synthesize the finished samples for a block of spectrogram frames.

        Args:
            spec: Complex spectrogram; frequency on dim 1, frames on the
                last dim (as implied by the ``irfft``/``fold`` calls).

        Returns:
            Audio tensor with the leading batch dim squeezed away. The first
            call after construction/reset additionally drops ``pad`` samples.
        """
        n_frames = spec.shape[-1]
        span = (n_frames - 1) * self.hop + self.win

        frames = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
        frames = frames * self.window.view(1, -1, 1)
        signal = self._overlap_add(frames, span)
        # Squared-window envelope used to normalise the overlap-add result.
        envelope = self._overlap_add(self.win_sq.expand(1, self.win, n_frames), span)

        carried = self.tail_y.shape[-1]
        if carried:
            signal[:, :carried] += self.tail_y
            envelope[:, :carried] += self.tail_e

        ready = span - self.carry
        out = signal[:, :ready] / envelope[:, :ready].clamp(min=1e-8)
        self.tail_y = signal[:, ready:].clone()
        self.tail_e = envelope[:, ready:].clone()

        if not self.started:
            self.started = True
            out = out[:, self.pad:]
        return out.squeeze(0)
class StreamingVC:
    """Chunked (streaming) voice conversion on top of a MioCodec-style model.

    Audio is encoded into content tokens window by window, the tokens are
    accumulated in ``self.tokens``, and ``_drain`` decodes them ``chunk``
    tokens at a time with left/right token context, feeding the resulting
    spectrogram frames through a :class:`StreamingISTFT`.
    """

    def __init__(self, model, device, *, chunk=6, enc_left=48, enc_right=2,
                 dec_left=32, dec_right=3, ema_alpha=0.9):
        """Move the model to ``device`` and derive the token/frame/sample rates.

        Args:
            model: Codec model exposing the sub-modules used below
                (``ssl_feature_extractor``, ``local_encoder``, ...).
            device: Torch device used for all inference.
            chunk: Number of tokens emitted per decode step.
            enc_left, enc_right: Encoder context, in tokens, around a chunk.
            dec_left, dec_right: Decoder context, in tokens, around a chunk.
            ema_alpha: Weight of the current features in the EMA smoothing.
        """
        self.m = model.to(device).eval()
        self.dev = device
        c = model.config
        ssl_fps = self.m.ssl_feature_extractor.ssl_sample_rate // self.m.ssl_feature_extractor.hop_size
        self.token_hz = ssl_fps // c.downsample_factor
        self.sr = c.sample_rate
        self.tok_samples = self.sr // self.token_hz
        ups_total = self.m.wave_upsampler.total_upsample_factor
        self.frames_per_tok = c.wave_upsample_factor * ups_total
        # Sanity check: frames-per-token times the STFT hop must equal the
        # number of output samples per token, or slicing in _decode is wrong.
        assert self.frames_per_tok * c.hop_length == self.tok_samples, "token/frame/sample ratios disagree"
        self.chunk = chunk
        self.enc_left, self.enc_right = enc_left, enc_right
        self.dec_left, self.dec_right = dec_left, dec_right
        self.local_layers = list(self.m.local_ssl_layers)
        self.istft = StreamingISTFT(c.n_fft, c.hop_length, device)
        self.global_emb = None
        self.src_mean = self.src_std = None
        self.tokens = None
        self.decoded = 0
        self.ema_alpha = ema_alpha
        self.prev_local_feats = None

    def _raw_local(self, audio):
        """Return the (averaged) SSL features of the configured local layers."""
        feats = self.m.ssl_feature_extractor(audio.to(self.dev))
        # local_layers are 1-based indices into the extractor's outputs.
        sel = [feats[i - 1] for i in self.local_layers]
        return torch.stack(sel, 0).mean(0) if len(sel) > 1 else sel[0]

    def apply_ema(self, local_feats):
        """Blend ``local_feats`` with the previous call's (smoothed) features.

        Smoothing is applied only when the previous features exist and have
        the same shape; the result is remembered for the next call.
        """
        if self.prev_local_feats is not None and local_feats.shape == self.prev_local_feats.shape:
            local_feats = self.ema_alpha * local_feats + (1.0 - self.ema_alpha) * self.prev_local_feats
        self.prev_local_feats = local_feats.clone()
        return local_feats

    @torch.inference_mode()
    def set_target(self, ref_audio):
        """Compute and store the global embedding of the target reference."""
        feats = self.m.encode(ref_audio.to(self.dev), return_content=False, return_global=True)
        self.global_emb = feats.global_embedding.view(1, -1)

    def _encode_features(self, loc):
        """Normalise local features with the source statistics and quantise.

        Requires ``seed`` to have populated ``src_mean`` / ``src_std``.
        """
        loc_norm = (loc - self.src_mean) / (self.src_std + 1e-8)
        enc = self.m.local_encoder(loc_norm)
        enc = self.m.conv_downsample(enc.transpose(1, 2)).transpose(1, 2)
        _, idx = self.m.local_quantizer.encode(enc)
        return idx

    @torch.inference_mode()
    def seed(self, seed_audio):
        """Calibrate source statistics from ``seed_audio`` and prime the context.

        The seed's tokens become the initial token history and are marked as
        already decoded, so they serve only as left context.
        """
        self.reset()
        if seed_audio.dim() == 1:
            seed_audio = seed_audio.unsqueeze(0)
        loc = self._raw_local(seed_audio)
        self.src_mean = loc.mean(dim=1, keepdim=True).clone()
        self.src_std = loc.std(dim=1, keepdim=True).clone()
        idx = self._encode_features(loc)
        self.tokens = idx.clone()
        self.decoded = idx.shape[1]

    def reset(self):
        """Clear token history, iSTFT carry-over and EMA state."""
        self.istft.reset()
        self.tokens = None
        self.decoded = 0
        self.prev_local_feats = None

    @torch.inference_mode()
    def _encode(self, window_audio):
        """Encode one audio window to token indices (with EMA smoothing)."""
        loc = self._raw_local(window_audio)
        loc = self.apply_ema(loc)
        return self._encode_features(loc)

    @torch.inference_mode()
    def _wave_stages(self, tok_window):
        """Run the decoder stack from token indices up to the upsampler output."""
        Tw = tok_window.shape[1]
        emb = self.m.local_quantizer.decode(tok_window)
        x = self.m.wave_prenet(emb)
        x = self.m.wave_conv_upsample(x.transpose(1, 2)).transpose(1, 2)
        # NOTE(review): the factor 2 presumably mirrors config.wave_upsample_factor — confirm.
        x = F.interpolate(x.transpose(1, 2), size=2 * Tw, mode=self.m.config.wave_interpolation_mode).transpose(1, 2)
        x = self.m.wave_prior_net(x.transpose(1, 2)).transpose(1, 2)
        x = self.m.wave_decoder(x, condition=self.global_emb.unsqueeze(1))
        x = self.m.wave_post_net(x.transpose(1, 2)).transpose(1, 2)
        return self.m.wave_upsampler(x.transpose(1, 2))

    @torch.inference_mode()
    def _decode(self, tok_window, keep_left, keep_n):
        """Decode a token window and synthesize only its ``keep_n`` core tokens.

        Args:
            tok_window: Token indices including left/right context.
            keep_left: Number of leading context tokens to discard.
            keep_n: Number of tokens whose frames are passed to the iSTFT.
        """
        x = self._wave_stages(tok_window)
        h = self.m.istft_head.out(x).transpose(1, 2)
        mag, phase = h.chunk(2, dim=1)
        mag = torch.exp(mag).clamp(max=1e2)
        spec = torch.complex(mag * torch.cos(phase), mag * torch.sin(phase))
        f0 = keep_left * self.frames_per_tok
        f1 = (keep_left + keep_n) * self.frames_per_tok
        return self.istft.process(spec[..., f0:f1])

    def _commit_tokens(self, new_idx):
        """Append newly encoded tokens to the token history."""
        self.tokens = new_idx if self.tokens is None else torch.cat([self.tokens, new_idx], dim=1)

    def _drain(self, final=False):
        """Decode every chunk that has enough committed tokens.

        Args:
            final: When true, decode the remaining tokens even if fewer than
                ``chunk + dec_right`` are available.

        Returns:
            Concatenated audio of all decoded chunks, or an empty tensor on
            ``self.dev`` when nothing could be decoded (including the case
            where no tokens have been committed yet).
        """
        out = []
        # Fix: tokens is None right after reset(); treat that as "nothing committed".
        committed = 0 if self.tokens is None else self.tokens.shape[1]
        while True:
            d0 = self.decoded
            avail = committed - d0
            if avail <= 0 or (not final and avail < self.chunk + self.dec_right):
                break
            keep_n = min(self.chunk, avail) if final else self.chunk
            left = min(self.dec_left, d0)
            right = min(self.dec_right, committed - (d0 + keep_n))
            win = self.tokens[:, d0 - left: d0 + keep_n + right]
            out.append(self._decode(win, left, keep_n))
            self.decoded += keep_n
        return torch.cat(out) if out else torch.zeros(0, device=self.dev)
def list_devices():
    """Print one table row per audio device reported by ``sounddevice``."""
    header = f"{'idx':>4} {'name':<50} {'in':>3} {'out':>3} {'sr':>7}"
    print(header)
    print("-" * 76)
    for index, info in enumerate(sd.query_devices()):
        name = info["name"]
        n_in = info["max_input_channels"]
        n_out = info["max_output_channels"]
        rate = int(info["default_samplerate"])
        print(f"{index:>4} {name:<50} {n_in:>3} {n_out:>3} {rate:>7}")
def sync_time(fn):
    """Call ``fn()`` and return ``(result, elapsed_milliseconds)``.

    When the module-level ``DEVICE`` is CUDA, the device is synchronised
    before starting and before stopping the clock so queued GPU work is
    included in the measurement.
    """
    def _sync_if_cuda():
        if DEVICE.type == "cuda":
            torch.cuda.synchronize()

    _sync_if_cuda()
    started = time.perf_counter()
    result = fn()
    _sync_if_cuda()
    elapsed_ms = (time.perf_counter() - started) * 1000
    return result, elapsed_ms
def load_audio(path, target_sr):
    """Load an audio file as a peak-normalised mono float32 tensor.

    Args:
        path: File to read; a ``pathlib.Path`` or a plain string path.
        target_sr: Sample rate the returned audio should have.

    Returns:
        1-D ``torch.Tensor`` at ``target_sr``. The signal is divided by its
        absolute peak unless that peak is at most ``1e-8`` (near-silence),
        in which case it is returned unscaled.
    """
    samples, sr = sf.read(path, dtype="float32", always_2d=True)
    # always_2d gives (frames, channels); average the channels down to mono.
    mono = torch.from_numpy(samples.mean(axis=1))
    if sr != target_sr:
        # Fix: the original used ``path.name``, which fails for ``str`` paths.
        label = getattr(path, "name", None) or Path(str(path)).name
        print(f"Resampling {label} from {sr} Hz to {target_sr} Hz...")
        mono = torchaudio.functional.resample(mono, orig_freq=sr, new_freq=target_sr)
    peak = torch.abs(mono).max()
    return mono / peak if peak > 1e-8 else mono
def main():
    """Run live microphone-to-speaker voice conversion.

    Parses the command line, loads the codec model and the target voice,
    calibrates source statistics from a seed recording (file or 3 s of
    microphone input), then loops: read one chunk from the input stream,
    apply an RMS silence gate, encode/decode the sliding window, and queue
    the resulting stereo PCM for playback. Stops on Ctrl+C.
    """
    # Freeze existing objects and turn the cyclic GC off; presumably to avoid
    # collection pauses inside the real-time loop — NOTE(review): confirm.
    gc.collect()
    gc.freeze()
    gc.disable()

    parser = argparse.ArgumentParser()
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--input", type=int)
    parser.add_argument("--output", type=int)
    parser.add_argument("--target", type=Path, help="Target voice reference WAV")
    parser.add_argument("--seed-audio", type=Path, help="Seed speaker calibration WAV (optional)")
    parser.add_argument("--chunk", type=int, default=6)
    parser.add_argument("--enc-left", type=int, default=48)
    parser.add_argument("--enc-right", type=int, default=4)
    parser.add_argument("--dec-left", type=int, default=32)
    parser.add_argument("--dec-right", type=int, default=4)
    parser.add_argument("--ema-alpha", type=float, default=0.9,
                        help="EMA smoothing on local SSL features (0=full smoothing, 1=no smoothing)")
    parser.add_argument("--rms-floor", type=float, default=0.0035,
                        help="RMS threshold below which audio chunk is evaluated as silence")
    parser.add_argument("--hangover-chunks", type=int, default=5,
                        help="Number of chunks to hold the gate open after RMS drop")
    parser.add_argument("--silence-fade-ms", type=float, default=10.0,
                        help="Ramp-down duration in ms at silence boundary (0 to disable)")
    args = parser.parse_args()

    if args.list_devices:
        list_devices()
        return
    if args.input is None or args.output is None:
        parser.error("--input and --output required")
    # Fix: --target is optional for --list-devices but mandatory for streaming;
    # fail early instead of crashing later inside the audio loader.
    if args.target is None:
        parser.error("--target required")

    model = MioCodecModel.from_pretrained("Aratako/MioCodec-25Hz-44.1kHz-v2")
    vc = StreamingVC(
        model, DEVICE, chunk=args.chunk, enc_left=args.enc_left, enc_right=args.enc_right,
        dec_left=args.dec_left, dec_right=args.dec_right, ema_alpha=args.ema_alpha
    )
    sr = vc.sr
    ts = vc.tok_samples
    chunk_samples = vc.chunk * ts
    left_pad = vc.enc_left * ts
    right_pad = vc.enc_right * ts
    required_samples = left_pad + chunk_samples + right_pad
    budget_ms = (vc.chunk / vc.token_hz) * 1000
    # Fix: clamp the fade to [0, chunk] so it can never spill into the
    # right-context samples or exceed the window length.
    fade_samples = max(0, min(int(args.silence_fade_ms * 0.001 * sr), chunk_samples))
    ramp_down = np.linspace(1.0, 0.0, fade_samples, dtype=np.float32)

    print(f"Sample Rate: {sr} Hz | Chunk: {args.chunk} tokens ({budget_ms:.1f}ms budget)")
    print(f"EMA alpha: {args.ema_alpha} | Silence fade: {args.silence_fade_ms:.0f}ms")
    print(f"Loading target speaker profile: {args.target}...")
    target_audio = load_audio(args.target, sr)
    vc.set_target(target_audio)

    in_info = sd.query_devices(args.input)
    n_in_ch = min(in_info["max_input_channels"], 2)
    if n_in_ch < 1:
        parser.error("--input device has no input channels")

    if args.seed_audio:
        print(f"Loading speaker calibration profile: {args.seed_audio}...")
        seed_audio = load_audio(args.seed_audio, sr)
    else:
        print("\n" + "=" * 60)
        print("No seed-audio provided. Recording 3 seconds for normalization calibration.")
        print("Please speak into your microphone...")
        print("=" * 60)
        # Fix: record from the device selected with --input, not the system default.
        recorded = sd.rec(int(3.0 * sr), samplerate=sr, channels=n_in_ch,
                          dtype="float32", device=args.input)
        sd.wait()
        print("Recording complete. Calibrating feature scaling...")
        recorded_mono = recorded.mean(axis=1) if recorded.shape[1] > 1 else recorded[:, 0]
        seed_audio = torch.from_numpy(recorded_mono)

    print("Seeding streaming context from speaker profile...")
    vc.seed(seed_audio)

    # Initial left context: the last ``left_pad`` seed samples, zero-padded on
    # the left when the seed is shorter than that.
    n_seed = seed_audio.numel()
    if n_seed >= left_pad:
        # Fix: ``seed_audio[-left_pad:]`` returns the whole seed when left_pad == 0.
        raw_input_accum = seed_audio[n_seed - left_pad:].numpy()
    else:
        raw_input_accum = np.pad(seed_audio.numpy(), (left_pad - n_seed, 0))

    in_q = queue.Queue(maxsize=8)
    out_q = queue.Queue(maxsize=2)
    stop_event = threading.Event()

    def input_cb(indata, frames, time_info, status):
        """Push the newest mono block, dropping the oldest one when full."""
        if in_q.full():
            try:
                in_q.get_nowait()
            except queue.Empty:
                # Fix: the consumer may have emptied the queue between the
                # full() check and get_nowait(); that is not an error.
                pass
        mono = indata.mean(axis=1) if indata.shape[1] > 1 else indata[:, 0]
        in_q.put_nowait(mono.copy())

    def write_thread(out_stream):
        """Forward queued PCM blocks to the output stream until stopped."""
        while not stop_event.is_set():
            try:
                pcm = out_q.get(timeout=0.5)
                out_stream.write(pcm)
            except queue.Empty:
                continue

    print(f"\n{'chunk':>6} {'q_in':>4} {'q_out':>5} {'enc':>7} {'dec':>7} {'total':>7} {'budget':>7} {'gap':>7}")
    print("-" * 76)

    chunk_n = 0
    t_last = None
    hangover_counter = 0

    with sd.InputStream(device=args.input, channels=n_in_ch, samplerate=sr,
                        blocksize=chunk_samples, dtype="float32",
                        callback=input_cb, latency="low"):
        with sd.OutputStream(device=args.output, channels=2, samplerate=sr,
                             dtype="float32", latency="low") as out_stream:
            writer = threading.Thread(target=write_thread, args=(out_stream,), daemon=True)
            writer.start()
            try:
                while True:
                    raw = in_q.get()
                    t_now = time.perf_counter()
                    gap_ms = (t_now - t_last) * 1000 if t_last is not None else 0.0
                    t_last = t_now

                    # RMS gate with hangover: stay "open" for a few chunks
                    # after the level drops below the floor.
                    rms = float(np.sqrt(np.mean(raw ** 2)))
                    if rms >= args.rms_floor:
                        hangover_counter = args.hangover_chunks
                        is_silence = False
                    elif hangover_counter > 0:
                        hangover_counter -= 1
                        is_silence = False
                    else:
                        is_silence = True

                    raw_input_accum = np.concatenate([raw_input_accum, raw])
                    if len(raw_input_accum) >= required_samples:
                        window_np = raw_input_accum[:required_samples]
                        raw_input_accum = raw_input_accum[chunk_samples:]
                        if is_silence:
                            # Soft gate: fade the active chunk out, then zero
                            # the rest; the context around it is left intact.
                            window_np = window_np.copy()
                            active_start = left_pad
                            active_end = left_pad + chunk_samples
                            fade_end = active_start + fade_samples
                            window_np[active_start:fade_end] *= ramp_down
                            window_np[fade_end:active_end] = 0.0
                        window_torch = torch.from_numpy(window_np).unsqueeze(0).to(DEVICE)
                        with torch.no_grad():
                            idx, t_enc = sync_time(lambda: vc._encode(window_torch))
                        chunk_tokens = idx[:, vc.enc_left : vc.enc_left + vc.chunk]
                        vc._commit_tokens(chunk_tokens)
                        audio_out, t_dec = sync_time(lambda: vc._drain(final=False))
                        if audio_out.numel() == 0:
                            pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
                        else:
                            pcm = audio_out.cpu().numpy()
                            pcm = np.clip(pcm, -1.0, 1.0)
                            pcm_out = np.stack([pcm, pcm], axis=1)
                    else:
                        # Not enough look-ahead yet: emit silence for this slot.
                        pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
                        t_enc, t_dec = 0.0, 0.0

                    out_q.put(pcm_out)
                    total = t_enc + t_dec
                    chunk_n += 1
                    if is_silence:
                        print(
                            f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
                            f"{'--silence--':>31} rms={rms:.4f}",
                            flush=True,
                        )
                    else:
                        print(
                            f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
                            f"{t_enc:>6.1f}ms {t_dec:>6.1f}ms "
                            f"{total:>6.1f}ms {budget_ms:>6.0f}ms {gap_ms:>6.1f}ms",
                            flush=True,
                        )
            except KeyboardInterrupt:
                pass
            finally:
                stop_event.set()
                writer.join()
    print("stopped")


if __name__ == "__main__":
    main()
+105 -86
View File
@@ -8,7 +8,7 @@ import json
import numpy as np
import onnxruntime as ort
# Workaround: explicitly preload the native DLLs ONNX Runtime depends on.
ort.preload_dlls()
import sounddevice as sd
import soundfile as sf
@@ -53,7 +53,20 @@ class StreamingISTFT:
self.win_sq = self.window ** 2
self.tail_y = np.zeros(0, dtype=np.float32)
self.tail_e = np.zeros(0, dtype=np.float32)
self.started = False
self.started = False\
def block(self, real, imag):
    """Synthesize a self-contained audio block from spectrogram frames.

    Unlike a streaming call, nothing is carried over from or to other
    blocks: the frames are overlap-added and normalised by the summed
    squared window on their own.

    Args:
        real, imag: Real and imaginary spectrogram parts; frequency on
            axis 0 and frames on axis 1 (as implied by the ``irfft`` axis).

    Returns:
        float32 array of ``(frames - 1) * hop + win`` samples.
    """
    spectrum = real + 1j * imag
    n_frames = spectrum.shape[1]
    windowed = np.fft.irfft(spectrum, self.n_fft, axis=0) * self.window[:, None]
    windowed = windowed.astype(np.float32)

    total = (n_frames - 1) * self.hop + self.win
    signal = np.zeros(total, dtype=np.float32)
    envelope = np.zeros(total, dtype=np.float32)
    for index, frame in enumerate(windowed.T):
        start = index * self.hop
        stop = start + self.win
        signal[start:stop] += frame
        envelope[start:stop] += self.win_sq

    # Guard against division by (near) zero where the window envelope vanishes.
    normalised = signal / np.maximum(envelope, 1e-8)
    return normalised.astype(np.float32)
def process(self, real, imag):
spec = real + 1j * imag
@@ -100,25 +113,33 @@ class StreamingVCONNX:
opts = ort.SessionOptions()
opts.inter_op_num_threads = 1
opts.intra_op_num_threads = 1
opts.intra_op_num_threads = 4
opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
prov = ["CUDAExecutionProvider", "CPUExecutionProvider"] if args.cuda else ["CPUExecutionProvider"]
if getattr(args, "openvino", False):
prov = [("OpenVINOExecutionProvider", {"device_type": "CPU"}), "CPUExecutionProvider"]
elif args.cuda:
prov = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
prov = ["CPUExecutionProvider"]
self.ssl = ort.InferenceSession(args.ssl, sess_options=opts, providers=prov)
self.enc = ort.InferenceSession(args.encode, sess_options=opts, providers=prov)
self.dec = ort.InferenceSession(args.decode, sess_options=opts, providers=prov)
self.glb = ort.InferenceSession(args.global_path, sess_options=opts, providers=prov)
self.istft = StreamingISTFT(meta["n_fft"], meta["hop_length"])
self.xfade_frames = 9
self.istft_margin = int(np.ceil(meta["n_fft"] / meta["hop_length"]))
self.xfade_tail = None
self.global_emb = None
self.src_mean = None
self.src_std = None
self.tokens = None
self.decoded = 0
self.prev_local_feats = None
self.ema_alpha = 0.8 # Adjust between 0.5 (heavy smoothing) and 1.0 (no smoothing)
self.ema_alpha = 0.9
def _ssl(self, win16):
w = take(win16, 0, self.ssl_in).reshape(1, -1)
@@ -154,37 +175,46 @@ class StreamingVCONNX:
frames = np.concatenate([l[c : c + keep * self.ds] for keep, l in locals_], axis=0)
self.src_mean = frames.mean(axis=0).astype(np.float32)
self.src_std = frames.std(axis=0, ddof=1).astype(np.float32)
seed_tokens = np.concatenate(
[self._encode(l, self.src_mean, self.src_std)[self.enc_left : self.enc_left + keep] for keep, l in locals_]
) if locals_ else np.zeros(0, dtype=np.int64)
self.tokens = seed_tokens.astype(np.int64)
self.decoded = len(self.tokens)
def reset(self):
self.istft = StreamingISTFT(self.meta["n_fft"], self.meta["hop_length"])
self.xfade_tail = None
self.tokens = None
self.decoded = 0
def _encode_window(self, win16):
local_feats, _ = self._ssl(win16)
# Apply temporal smoothing to the continuous representations
if self.prev_local_feats is not None and local_feats.shape == self.prev_local_feats.shape:
local_feats = self.ema_alpha * local_feats + (1.0 - self.ema_alpha) * self.prev_local_feats
def apply_ema(self, local_feats):
shift = self.chunk * self.ds
if self.prev_local_feats is not None:
n = local_feats.shape[0] - shift
if n > 0:
local_feats[:n] = (self.ema_alpha * local_feats[:n]
+ (1 - self.ema_alpha) * self.prev_local_feats[shift:shift + n])
self.prev_local_feats = local_feats.copy()
return self._encode(local_feats, self.src_mean, self.src_std)
return local_feats
def _decode(self, win_tokens, keep_left, keep_n):
def _decode(self, win_tokens, keep_left, keep_n, right_tokens):
real, imag = self.dec.run(
["spec_real", "spec_imag"],
{"content_token_indices": win_tokens, "global_embedding": self.global_emb}
{"content_token_indices": win_tokens, "global_embedding": self.global_emb},
)
f0 = keep_left * self.fpt
f1 = (keep_left + keep_n) * self.fpt
return self.istft.process(real[:, f0:f1], imag[:, f0:f1])
fpt, hop = self.fpt, self.istft.hop
a = keep_left * fpt
b = (keep_left + keep_n) * fpt
right_frames = right_tokens * fpt
ov = min(self.xfade_frames, max(0, right_frames))
m = min(self.istft_margin, a, max(0, right_frames - ov))
F0, F1 = a - m, b + ov + m
audio = self.istft.block(real[:, F0:F1], imag[:, F0:F1])
start = (a - F0) * hop
seg = audio[start : start + (keep_n * fpt + ov) * hop]
return seg, ov * hop
def _commit_tokens(self, new_idx):
if self.tokens is None:
@@ -194,6 +224,7 @@ class StreamingVCONNX:
def _drain(self, final=False):
out = []
hop = self.istft.hop
committed = len(self.tokens) if self.tokens is not None else 0
while True:
d0 = self.decoded
@@ -203,13 +234,23 @@ class StreamingVCONNX:
keep_n = min(self.chunk, avail) if final else self.chunk
left = min(self.dec_left, d0)
right = min(self.dec_right, committed - (d0 + keep_n))
lo = d0 - left
hi = d0 + keep_n + right
win_idx = np.clip(np.arange(lo, hi), 0, committed - 1)
win = self.tokens[win_idx].astype(np.int64)
out.append(self._decode(win, left, keep_n))
lo, hi = d0 - left, d0 + keep_n + right
win = self.tokens[np.clip(np.arange(lo, hi), 0, committed - 1)].astype(np.int64)
seg, h = self._decode(win, left, keep_n, right)
body_end = keep_n * self.fpt * hop
head, body, tail = seg[:h], seg[h:body_end], seg[body_end:]
if self.xfade_tail is not None and len(self.xfade_tail) == h and h > 0:
t = np.linspace(0.0, 1.0, h, dtype=np.float32)
out.append((1.0 - t) * self.xfade_tail + t * head)
else:
out.append(head)
out.append(body)
self.xfade_tail = None if final else tail
if final and tail.size:
out.append(tail)
self.decoded += keep_n
return np.concatenate(out) if out else np.zeros(0, dtype=np.float32)
@@ -232,18 +273,17 @@ def main():
parser.add_argument("--list-devices", action="store_true")
parser.add_argument("--input", type=int)
parser.add_argument("--output", type=int)
parser.add_argument("--target", type=Path, required=True, help="Target voice reference WAV")
parser.add_argument("--seed-audio", type=Path, help="Seed speaker calibration WAV (optional)")
parser.add_argument("--encode", required=True, help="Path to encode.onnx")
parser.add_argument("--decode", help="Path to decode.onnx (defaults to encode.onnx parent folder)")
parser.add_argument("--global", dest="global_path", help="Path to global.onnx (defaults to encode.onnx parent folder)")
parser.add_argument("--ssl", help="Path to ssl.onnx (defaults to encode.onnx parent folder)")
parser.add_argument("--meta", help="Path to meta.json (defaults to encode.onnx parent folder)")
parser.add_argument("--cuda", action="store_true", help="Enable CUDA execution provider")
parser.add_argument("--rms-floor", type=float, default=0.0035,
help="RMS threshold below which audio chunk is evaluated as silence")
parser.add_argument("--hangover-chunks", type=int, default=3,
help="Number of chunks to hold the gate open after RMS drop to prevent trailing cutoffs")
parser.add_argument("--target", type=Path, required=True)
parser.add_argument("--seed-audio", type=Path)
parser.add_argument("--encode", required=True)
parser.add_argument("--decode")
parser.add_argument("--global", dest="global_path")
parser.add_argument("--ssl")
parser.add_argument("--meta")
parser.add_argument("--cuda", action="store_true")
parser.add_argument("--openvino", action="store_true")
parser.add_argument("--rms-floor", type=float, default=0.0035)
parser.add_argument("--hangover-chunks", type=int, default=3)
args = parser.parse_args()
if args.list_devices:
@@ -264,25 +304,24 @@ def main():
sr = vc.sr
sr16 = vc.sr16
# Calculate sample sizes based on target (playback) sample rate
# token_hz is standard (usually 25 Hz), tok_samples is usually 1764 for 44.1 kHz
token_hz = meta["token_hz"]
tok_samples = sr // token_hz
chunk_samples = vc.chunk * tok_samples
budget_ms = (vc.chunk / token_hz) * 1000
# Calculated parameters for processing 16 kHz streams
tok16 = vc.tok16
chunk_samples_16k = vc.chunk * tok16
left_pad_16k = vc.enc_left * tok16
right_pad_16k = vc.enc_right * tok16
ssl_in_16k = vc.ssl_in
required_samples_16k = left_pad_16k + chunk_samples_16k + right_pad_16k
fade_len = int(0.01 * sr16)
ramp_down = np.linspace(1.0, 0.0, fade_len, dtype=np.float32)
print(f"Sample Rate: {sr} Hz (target) | 16000 Hz (SSL internal)")
print(f"Chunk Size: {vc.chunk} tokens ({budget_ms:.1f}ms budget)")
print(f"Loading target speaker profile: {args.target}...")
target_audio = load_16k(args.target, sr16)
vc.set_target(target_audio)
@@ -290,27 +329,19 @@ def main():
n_in_ch = min(in_info["max_input_channels"], 2)
if args.seed_audio:
print(f"Loading speaker calibration profile: {args.seed_audio}...")
seed_audio = load_16k(args.seed_audio, sr16)
else:
print("\n" + "=" * 60)
print("No seed-audio provided. Recording 3 seconds for normalization calibration.")
print("Please speak into your microphone...")
print("=" * 60)
recorded = sd.rec(int(3.0 * sr), samplerate=sr, channels=n_in_ch, dtype="float32")
sd.wait()
print("Recording complete. Calibrating feature scaling...")
recorded_mono = recorded.mean(axis=1) if recorded.shape[1] > 1 else recorded[:, 0]
seed_audio = resample(recorded_mono, sr, sr16)
print("Seeding streaming context from speaker profile...")
vc.seed(seed_audio)
# Establish initial left-side padding context buffer in 16 kHz
if len(seed_audio) >= left_pad_16k:
raw_input_accum_16k = seed_audio[-left_pad_16k:]
accum_16k = seed_audio[-left_pad_16k:]
else:
raw_input_accum_16k = np.pad(seed_audio, (left_pad_16k - len(seed_audio), 0))
accum_16k = np.pad(seed_audio, (left_pad_16k - len(seed_audio), 0))
in_q = queue.Queue(maxsize=8)
out_q = queue.Queue(maxsize=2)
@@ -330,8 +361,8 @@ def main():
except queue.Empty:
continue
print(f"\n{'chunk':>6} {'q_in':>4} {'q_out':>5} {'enc':>7} {'dec':>7} {'total':>7} {'budget':>7} {'gap':>7}")
print("-" * 76)
print(f"\n{'chunk':>6} {'q_in':>4} {'q_out':>5} {'ssl':>7} {'enc':>7} {'dec':>7} {'total':>7} {'budget':>7} {'gap':>7}")
print("-" * 80)
chunk_n = 0
t_last = None
@@ -354,44 +385,32 @@ def main():
t_last = t_now
rms = float(np.sqrt(np.mean(raw ** 2)))
if rms >= args.rms_floor:
hangover_counter = args.hangover_chunks
is_silence = False
elif hangover_counter > 0:
hangover_counter -= 1
is_silence = False
else:
if hangover_counter > 0:
hangover_counter -= 1
is_silence = False
else:
is_silence = True
is_silence = True
# Resample current input chunk to 16 kHz
raw_16k = resample(raw, sr, sr16)
raw_input_accum_16k = np.concatenate([raw_input_accum_16k, raw_16k])
required_samples_16k = left_pad_16k + chunk_samples_16k + right_pad_16k
accum_16k = np.concatenate([accum_16k, raw_16k])
if len(raw_input_accum_16k) >= required_samples_16k:
window_16k = raw_input_accum_16k[:required_samples_16k]
raw_input_accum_16k = raw_input_accum_16k[chunk_samples_16k:]
if len(accum_16k) >= required_samples_16k:
window_16k = accum_16k[:required_samples_16k]
accum_16k = accum_16k[chunk_samples_16k:]
# Create a simple linear ramp at the beginning of your script or class
fade_len = int(0.01 * sr16) # 10ms ramp
ramp_down = np.linspace(1.0, 0.0, fade_len, dtype=np.float32)
ramp_up = np.linspace(0.0, 1.0, fade_len, dtype=np.float32)
# Apply a soft gate instead of hard zeroing
if is_silence:
window_16k = window_16k.copy()
# Smoothly ramp down the boundary before zeroing
active_start = left_pad_16k
active_end = left_pad_16k + chunk_samples_16k
# Apply fade out
window_16k[active_start : active_start + fade_len] *= ramp_down
window_16k[active_start + fade_len : active_end] = 0.0
# Run inference via ONNX models
idx, t_enc = sync_time(lambda: vc._encode_window(window_16k))
local_feats, t_ssl = sync_time(lambda: vc._ssl(window_16k)[0])
local_feats = vc.apply_ema(local_feats)
idx, t_enc = sync_time(lambda: vc._encode(local_feats, vc.src_mean, vc.src_std))
chunk_tokens = idx[vc.enc_left : vc.enc_left + vc.chunk]
vc._commit_tokens(chunk_tokens)
audio_out, t_dec = sync_time(lambda: vc._drain(final=False))
@@ -403,23 +422,23 @@ def main():
pcm_out = np.stack([pcm, pcm], axis=1)
else:
pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
t_enc, t_dec = 0.0, 0.0
t_ssl, t_enc, t_dec = 0.0, 0.0, 0.0
out_q.put(pcm_out)
total = t_enc + t_dec
total = t_ssl + t_enc + t_dec
chunk_n += 1
if is_silence:
print(
f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
f"{'--silence--':>31} rms={rms:.4f}",
f"{'--silence--':>41} rms={rms:.4f}",
flush=True,
)
else:
print(
f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
f"{t_enc:>6.1f}ms {t_dec:>6.1f}ms "
f"{t_ssl:>6.1f}ms {t_enc:>6.1f}ms {t_dec:>6.1f}ms "
f"{total:>6.1f}ms {budget_ms:>6.0f}ms {gap_ms:>6.1f}ms",
flush=True,
)
+34
View File
@@ -0,0 +1,34 @@
# optimize_models.py
from onnxruntime.transformers.optimizer import optimize_model
from onnxruntime.transformers.fusion_options import FusionOptions
def optimize_custom(input_path, output_path):
    """Optimize one ONNX model with LayerNorm fusions switched off.

    Args:
        input_path: Path of the ONNX model to optimize.
        output_path: Path the optimized model is written to.
    """
    print(f"Optimizing {input_path}...")

    # Start from the default BERT fusion settings, then turn off the
    # LayerNorm fusions that break on AdaLN / dynamic biases.
    fusion = FusionOptions("bert")
    fusion.enable_skip_layer_norm = False
    fusion.enable_layer_norm = False

    optimized = optimize_model(
        input=input_path,
        model_type="bert",
        optimization_options=fusion,
    )
    optimized.save_model_to_file(output_path)
    print(f"Saved optimized model to {output_path}\n")
if __name__ == "__main__":
    # encode/decode go through the restricted-fusion path.
    for stem in ("encode", "decode"):
        optimize_custom(f"outputs/{stem}.onnx", f"outputs/{stem}_opt.onnx")

    # Per the original author's note, ssl.onnx (WavLM) is treated as a
    # standard BERT-style architecture, so all default fusions stay enabled.
    ssl_src = "outputs/ssl.onnx"
    ssl_dst = "outputs/ssl_opt.onnx"
    print(f"Optimizing {ssl_src}...")
    optimize_model(ssl_src, model_type="bert").save_model_to_file(ssl_dst)
    print(f"Saved optimized model to {ssl_dst}")
+2 -3
View File
@@ -1,5 +1,5 @@
[project]
name = "dovc"
name = "mioonnx"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
@@ -7,11 +7,10 @@ requires-python = ">=3.12"
dependencies = [
"miocodec",
"numpy>=2.4.6",
"onnxruntime>=1.26.0",
"onnxruntime-gpu>=1.26.0",
"onnxscript>=0.7.0",
"sounddevice>=0.5.5",
"torch>=2.11.0",
"soundfile>=0.13.1",
]
[tool.uv.sources]
+153 -29
View File
@@ -1,44 +1,168 @@
import argparse
import json
from pathlib import Path
import numpy as np
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime.quantization.shape_inference import quant_pre_process
ort.set_default_logger_severity(3)
OPS = ["Conv", "Gemm", "MatMul"]
def quantize_model(input_path: Path, output_path: Path):
# Create temporary path for the pre-processed model
preprocessed_path = input_path.with_name(f"{input_path.stem}_preprocessed.onnx")
print(f"Pre-processing {input_path.name}...")
def has_external(path):
    """Return True if any initializer of the model stores its data externally.

    The model is loaded without pulling in the external data itself.
    """
    model = onnx.load(str(path), load_external_data=False)
    for tensor in model.graph.initializer:
        if tensor.data_location == onnx.TensorProto.EXTERNAL:
            return True
    return False
def grouped_or_nonconst_convs(path):
    """List the names of Conv nodes that are grouped or lack a constant weight.

    A Conv is reported when its ``group`` attribute exceeds 1, or when its
    second input (the weight) is not one of the graph initializers.
    """
    model = onnx.load(str(path), load_external_data=False)
    const_names = {init.name for init in model.graph.initializer}

    def _is_flagged(node):
        groups = 1  # ``group`` attribute absent -> treated as 1
        for attr in node.attribute:
            if attr.name == "group":
                groups = attr.i
                break
        weight_is_const = len(node.input) > 1 and node.input[1] in const_names
        return groups > 1 or not weight_is_const

    return [
        node.name
        for node in model.graph.node
        if node.op_type == "Conv" and _is_flagged(node)
    ]
def quantize_one(path, weight_type, reduce_range):
stem = path.stem
out = path.with_name(f"{stem}_quant.onnx")
pre = path.with_name(f"{stem}_pre.onnx")
target = path
try:
quant_pre_process(str(input_path), str(preprocessed_path))
target_input = preprocessed_path
except Exception as e:
print(f"Pre-processing skipped or failed: {e}")
target_input = input_path
quant_pre_process(str(path), str(pre), skip_optimization=False,
skip_onnx_shape=False, skip_symbolic_shape=False, auto_merge=True)
target = pre
except Exception as e1:
try:
quant_pre_process(str(path), str(pre), skip_optimization=False,
skip_onnx_shape=False, skip_symbolic_shape=True)
target = pre
print(" preprocess: symbolic shape skipped")
except Exception as e2:
print(f" preprocess failed, quantizing raw: {e2}")
print(f"Quantizing {target_input.name}...")
try:
quantize_dynamic(
model_input=str(target_input),
model_output=str(output_path),
weight_type=QuantType.QUInt8,
# Limit quantization to MatMul. This bypasses the Conv layers
# that cause weight initialization errors, while still optimizing
# the heavy transformer layers.
op_types_to_quantize=["MatMul"]
)
print(f"Quantization complete: {output_path}")
finally:
# Clean up temporary preprocessed file if it was created
if preprocessed_path.exists() and preprocessed_path != input_path:
preprocessed_path.unlink()
exclude = grouped_or_nonconst_convs(target) if stem == "ssl" else []
if exclude:
print(f" excluding {len(exclude)} grouped/non-const conv(s)")
quantize_dynamic(
model_input=str(target), model_output=str(out),
weight_type=weight_type, op_types_to_quantize=OPS,
nodes_to_exclude=exclude, reduce_range=reduce_range,
)
pre.unlink(missing_ok=True)
b = out.stat().st_size
if has_external(path):
print(f" {path.name} -> {out.name} {b/1e6:.3g} MB int8 self-contained (fp32 weights were external)")
else:
a = path.stat().st_size
print(f" {path.name} -> {out.name} {a/1e6:.3g} -> {b/1e6:.3g} MB ({100*(1-b/a):.0f}% smaller)")
return out
def feeds_for(sess, meta, rng):
    """Build synthetic input feeds for every input of an inference session.

    Dynamic dimensions are resolved to 1 for axis 0 of inputs with rank >= 2
    and to ``meta["enc_ssl_frames"]`` (default 100) otherwise. Integer
    inputs are zero-filled; float inputs are standard-normal noise, except
    that inputs whose name contains "std" are made >= 0.5 and inputs whose
    name contains "mean" are zeroed.

    Args:
        sess: Object exposing ``get_inputs()`` with ``name``/``type``/``shape``.
        meta: Mapping consulted for ``enc_ssl_frames``.
        rng: NumPy random generator used for the float inputs.

    Returns:
        Dict mapping input name to a NumPy array.
    """
    def _resolve(axis, dim, rank):
        if isinstance(dim, int) and dim > 0:
            return dim
        if axis == 0 and rank >= 2:
            return 1
        return meta.get("enc_ssl_frames", 100)

    feeds = {}
    for spec in sess.get_inputs():
        if "int64" in spec.type:
            dtype = np.int64
        elif "int32" in spec.type:
            dtype = np.int32
        else:
            dtype = np.float32

        rank = len(spec.shape)
        dims = [_resolve(axis, dim, rank) for axis, dim in enumerate(spec.shape)]
        lowered = spec.name.lower()

        if np.issubdtype(dtype, np.integer):
            feeds[spec.name] = np.zeros(dims, dtype=dtype)
            continue

        noise = rng.standard_normal(dims).astype(np.float32)
        if "std" in lowered:
            feeds[spec.name] = np.abs(noise) + 0.5
        elif "mean" in lowered:
            feeds[spec.name] = noise * 0.0
        else:
            feeds[spec.name] = noise
    return feeds
def check(fp32, quant, meta):
    """Compare an fp32 model with its quantized counterpart on synthetic feeds.

    Both models are opened on the CPU execution provider and run on the same
    feeds, which are generated by ``feeds_for`` from the fp32 session's inputs
    with a generator seeded at 0. One line is printed per fp32 output: the
    percentage of differing elements for integer outputs, or the max and mean
    absolute difference otherwise.
    """
    def open_on_cpu(model_path):
        return ort.InferenceSession(str(model_path), providers=["CPUExecutionProvider"])

    generator = np.random.default_rng(0)
    reference = open_on_cpu(fp32)
    candidate = open_on_cpu(quant)
    feeds = feeds_for(reference, meta, generator)

    # Output names come from the fp32 graph and are requested from both sessions.
    wanted = [node.name for node in reference.get_outputs()]
    reference_outs = reference.run(wanted, feeds)
    candidate_outs = candidate.run(wanted, feeds)

    for name, a, b in zip(wanted, reference_outs, candidate_outs):
        if not np.issubdtype(a.dtype, np.integer):
            d = np.abs(a - b)
            print(f" {name}: max|d|={d.max():.3g} mean|d|={d.mean():.3g}")
            continue
        print(f" {name}: {100*(a != b).mean():.2f}% tokens changed")
def check_real(d, meta, source, target):
    """Compare the quantized models in ``d`` with the fp32 pipeline on real audio.

    Args:
        d: directory (path object) holding ``ssl.onnx``, ``encode.onnx``,
            ``decode.onnx``, ``global.onnx`` and any ``*_quant.onnx`` siblings.
        meta: model metadata; ``meta["ssl_sample_rate"]`` is read here and the
            whole mapping is handed to ``infer.Infer``.
        source: source audio, loaded through ``infer.load_16k``.
        target: target audio for the decode comparison; when falsy the decode
            comparison is skipped.

    Only the stages that have a ``<stage>_quant.onnx`` file are compared, and
    each comparison prints a single summary line.
    """
    import infer

    fp32_paths = argparse.Namespace(
        ssl=str(d / "ssl.onnx"),
        encode=str(d / "encode.onnx"),
        decode=str(d / "decode.onnx"),
        global_path=str(d / "global.onnx"),
        cuda=False,
    )
    vc = infer.Infer(fp32_paths, meta)
    sample_rate = meta["ssl_sample_rate"]
    src16 = infer.load_16k(source, sample_rate)
    mean, std, _ = vc.calibrate(src16)

    # One CPU session per quantized stage that actually exists on disk.
    qs = {}
    for stage in ["ssl", "encode", "decode", "global"]:
        if (d / f"{stage}_quant.onnx").exists():
            qs[stage] = ort.InferenceSession(
                str(d / f"{stage}_quant.onnx"), providers=["CPUExecutionProvider"]
            )

    # Only the first window produced for the source is inspected.
    keep, win = next(vc._windows(src16))
    win1 = infer.take(win, 0, vc.ssl_in).reshape(1, -1)
    local_real = vc._ssl(win)[0]

    if "ssl" in qs:
        ssl_outputs = ["local_features", "global_features"]
        ssl_feed = {"audio_16k": win1}
        l1, g1 = qs["ssl"].run(ssl_outputs, ssl_feed)
        l0, g0 = vc.ssl.run(ssl_outputs, ssl_feed)
        print(f" ssl local max|d|={np.abs(l0 - l1).max():.3g} global max|d|={np.abs(g0 - g1).max():.3g}")

    if "encode" in qs:
        enc_feed = {"local_ssl_features": local_real, "mean": mean, "std": std}
        t0 = vc.enc.run(["content_token_indices"], enc_feed)[0]
        t1 = qs["encode"].run(["content_token_indices"], enc_feed)[0]
        # Agreement is measured on the `keep` tokens that follow the left context.
        k = slice(vc.enc_left, vc.enc_left + keep)
        print(f" encode tokens (real, center {keep}): {100 * (t0[k] == t1[k]).mean():.1f}% agree")

    if "decode" in qs and target:
        emb = vc.embed(infer.load_16k(target, sample_rate))
        toks = vc.tokens(src16, mean, std)
        start = vc.dec_left
        # Indices past either end of the token sequence are clamped into range.
        positions = np.clip(np.arange(start, start + vc.dec_tokens), 0, len(toks) - 1)
        token_window = toks[positions].astype(np.int64)
        dec_feed = {"content_token_indices": token_window, "global_embedding": emb}
        r0 = vc.dec.run(["spec_real", "spec_imag"], dec_feed)
        r1 = qs["decode"].run(["spec_real", "spec_imag"], dec_feed)
        print(f" decode spec_real max|d|={np.abs(r0[0] - r1[0]).max():.3g} "
              f"spec_imag max|d|={np.abs(r0[1] - r1[1]).max():.3g}")
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Batch mode (always runs): every "<name>.onnx" listed in --models that
    # exists under --dir is quantized with quantize_one(); --check then
    # compares it with the fp32 model on synthetic feeds, and --source
    # triggers one real-audio comparison after all models are processed.
    #
    # Single-file mode (only when --model is given): that one model is
    # quantized first through quantize_model(). --model used to be mandatory,
    # which made batch mode unusable without a throwaway model path; it is
    # now optional and the call is skipped when it is absent.
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the ONNX model to quantize")
    p.add_argument("--dir", default="outputs")
    p.add_argument("--models", nargs="*", default=["ssl", "encode", "decode", "global"])
    p.add_argument("--weight-type", choices=["int8", "uint8"], default="int8")
    p.add_argument("--no-reduce-range", action="store_true")
    p.add_argument("--check", action="store_true")
    p.add_argument("--source")
    p.add_argument("--target")
    args = p.parse_args()

    if args.model:
        # NOTE(review): quantize_model is not defined in the visible part of
        # this file - confirm it still exists, or drop --model entirely.
        in_path = Path(args.model)
        out_path = in_path.with_name(f"{in_path.stem}_quant.onnx")
        quantize_model(in_path, out_path)

    d = Path(args.dir)
    wt = QuantType.QInt8 if args.weight_type == "int8" else QuantType.QUInt8
    # meta.json is optional; an empty mapping is used when it is missing.
    meta = json.loads((d / "meta.json").read_text()) if (d / "meta.json").exists() else {}
    for name in args.models:
        f = d / f"{name}.onnx"
        if not f.exists():
            # Models that were never exported are skipped silently.
            continue
        print(f"{name}:")
        q = quantize_one(f, wt, not args.no_reduce_range)
        if args.check:
            check(f, q, meta)
    if args.source:
        print("real-audio check:")
        check_real(d, meta, args.source, args.target)
Generated
+23 -58
View File
@@ -201,32 +201,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
]
[[package]]
name = "dovc"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "miocodec" },
{ name = "numpy" },
{ name = "onnxruntime" },
{ name = "onnxruntime-gpu" },
{ name = "onnxscript" },
{ name = "sounddevice" },
{ name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "torch", version = "2.12.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
]
[package.metadata]
requires-dist = [
{ name = "miocodec", git = "https://github.com/Aratako/MioCodec" },
{ name = "numpy", specifier = ">=2.4.6" },
{ name = "onnxruntime", specifier = ">=1.26.0" },
{ name = "onnxruntime-gpu", specifier = ">=1.26.0" },
{ name = "onnxscript", specifier = ">=0.7.0" },
{ name = "sounddevice", specifier = ">=0.5.5" },
{ name = "torch", specifier = ">=2.11.0" },
]
[[package]]
name = "einops"
version = "0.8.2"
@@ -523,6 +497,29 @@ dependencies = [
{ name = "torchaudio", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
[[package]]
name = "mioonnx"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "miocodec" },
{ name = "numpy" },
{ name = "onnxruntime-gpu" },
{ name = "onnxscript" },
{ name = "sounddevice" },
{ name = "soundfile" },
]
[package.metadata]
requires-dist = [
{ name = "miocodec", git = "https://github.com/Aratako/MioCodec" },
{ name = "numpy", specifier = ">=2.4.6" },
{ name = "onnxruntime-gpu", specifier = ">=1.26.0" },
{ name = "onnxscript", specifier = ">=0.7.0" },
{ name = "sounddevice", specifier = ">=0.5.5" },
{ name = "soundfile", specifier = ">=0.13.1" },
]
[[package]]
name = "ml-dtypes"
version = "0.5.4"
@@ -833,38 +830,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/8c/aa/f7a53321c60b9ad9ee184b6018292ed6b5389947592a2c8c09c736bb7f9e/onnx_ir-0.2.1-py3-none-any.whl", hash = "sha256:c7285da889312f91882de2092e298a9eeeefbfc1d1951c49d983992967eb09a7", size = 166792, upload-time = "2026-04-20T20:21:46.357Z" },
]
[[package]]
name = "onnxruntime"
version = "1.26.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "flatbuffers" },
{ name = "numpy" },
{ name = "packaging" },
{ name = "protobuf" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/b1/d111b1df656761f980d9e298a60039a9cb66036b1d039e777537743d0ac3/onnxruntime-1.26.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05b028781b322ad74b57ce5b50aa5280bb1fe96ceec334628ade681e0b24c1ac", size = 18016624, upload-time = "2026-05-12T00:41:01.735Z" },
{ url = "https://files.pythonhosted.org/packages/f6/a0/3f9d896a0385a36bd04345d6d0b802821a5782adde562e7e135f6bb71c73/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91f2bb870a4b9224eba0a6728c1fa7a9e552b8e59e1083c51fbbc3d013f2b5c0", size = 16052692, upload-time = "2026-05-08T19:07:13.829Z" },
{ url = "https://files.pythonhosted.org/packages/7c/43/2a4e04f8dbeffad19bbcced4bcd4289bf478921518437404d6b92bdf213b/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b6dd70599005bd1bf29779f04a91978b92b5e719c11a20068a8f8e535f725b6", size = 18185439, upload-time = "2026-05-08T19:07:36.299Z" },
{ url = "https://files.pythonhosted.org/packages/44/fc/026d0a7162b9c2153dac292baea9e027c42304dc1d9dc6f8ff5b4cfbaedd/onnxruntime-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:a26374dc7fbcaae593601086b242120e13f2310558df0991da6dd8b8fac00414", size = 13026427, upload-time = "2026-05-08T19:08:03.503Z" },
{ url = "https://files.pythonhosted.org/packages/3e/27/1dcf88e45e4c69db5f7b106f2dacc3801ba98994e082ca03e1dfdf7bfe57/onnxruntime-1.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:54a8053410fd31fd66469bd754fcfe8a4df9f7eb44756b4b5479bf50c842d948", size = 12796647, upload-time = "2026-05-08T19:07:52.108Z" },
{ url = "https://files.pythonhosted.org/packages/cf/a2/c801242685e0ce48a4ca51dfafbb588765e0446397e123be53ba5598f3f5/onnxruntime-1.26.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccce19c5f771b8268902f77d9fed9e88f9499465d6780808faa6611a789d33f0", size = 18016563, upload-time = "2026-05-08T19:07:28.081Z" },
{ url = "https://files.pythonhosted.org/packages/e2/64/0492c0b1db04e29b2630c87cfa36f9d6872b1ca8614b90c5cad58fac7d76/onnxruntime-1.26.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdbed8cf3b672b66acb032f33a253bc27f42bce6ece48ae3fab4fa483a5e96e0", size = 16052634, upload-time = "2026-05-08T19:07:16.885Z" },
{ url = "https://files.pythonhosted.org/packages/3d/26/4d09ddc755a84fc8d5e192991626b0e0680e8f6c5d58f4f1d05c42bc48cf/onnxruntime-1.26.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c07af6fc6d5557835f2b6ee7a96d8b3235d0c57a8e230efdedaee106a8a3cbc6", size = 18185632, upload-time = "2026-05-08T19:07:38.756Z" },
{ url = "https://files.pythonhosted.org/packages/77/89/3e52249aa08fa301e217ecba07b5246a8338fa2b401e109326e3fc5be0f9/onnxruntime-1.26.0-cp313-cp313-win_amd64.whl", hash = "sha256:61bec80655efa460591c2bc655392d57d2650ce85533a6b9b3b7a790d7ea7916", size = 13026751, upload-time = "2026-05-08T19:08:06.2Z" },
{ url = "https://files.pythonhosted.org/packages/06/b3/c1c8782b14af6797c303de132d6eef26a9fb80dfacd3750ce57911d11c6b/onnxruntime-1.26.0-cp313-cp313-win_arm64.whl", hash = "sha256:a6677545ff451e3539a02746d2f207d8c5baa4a0a818886bb9d6a6eb9511ee89", size = 12796807, upload-time = "2026-05-08T19:07:54.879Z" },
{ url = "https://files.pythonhosted.org/packages/c3/f5/47b0676408abec652c14b84d7173e389837832d850c24f87184277313e8d/onnxruntime-1.26.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e016edc15d3c19f36807e1c6b10be5b27807688c32720f91b5ae480a95215d0", size = 16057265, upload-time = "2026-05-08T19:07:19.603Z" },
{ url = "https://files.pythonhosted.org/packages/3b/45/33ab6deeef010ca844c877dd618cebc079590bbe52d2a3678e7223b1b908/onnxruntime-1.26.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5fc48a91a046a6a5c9b147f83fb41d65d24d24923373b222cdd248f0f4f4aac", size = 18197590, upload-time = "2026-05-08T19:07:41.422Z" },
{ url = "https://files.pythonhosted.org/packages/40/89/17546c1c20f6bfc3ae41c22152378a26edfea918af3129e2139dcd7c99f3/onnxruntime-1.26.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:33a791f31432a3af1a96db5e54818b37aba5e5eefc2e6af5794c10a9118a9993", size = 18019724, upload-time = "2026-05-08T19:07:30.723Z" },
{ url = "https://files.pythonhosted.org/packages/bb/24/89457a35f6af29538a76647f2c18c3a28277e6c19234c847e7b4b7c19860/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e90c00732c4553618103149d93f688e8c3063017938f8983e21a71d9f3b6d22e", size = 16054821, upload-time = "2026-05-08T19:07:22.348Z" },
{ url = "https://files.pythonhosted.org/packages/12/f9/15b2e1815cf570d238e0135529f80d2dce64e8e8818a1489cae83823c5c6/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01498e80ba8988428d08c2d51b1338f89e3de2a93e6ffe555f79c68f26a5c06b", size = 18185815, upload-time = "2026-05-08T19:07:44.179Z" },
{ url = "https://files.pythonhosted.org/packages/d7/65/2e11055faf015e4b07f45b513fa49b391baf2e19d92d77d73ebee13c1004/onnxruntime-1.26.0-cp314-cp314-win_amd64.whl", hash = "sha256:7ead61450d8405167c87dd3a31d8da1d576b490a57dab1aa8b82a7da6825f5aa", size = 13349887, upload-time = "2026-05-08T19:08:08.671Z" },
{ url = "https://files.pythonhosted.org/packages/19/e4/0f9d1a5718b1781c610c1e354765a3820597081754277a6a9a2b50705702/onnxruntime-1.26.0-cp314-cp314-win_arm64.whl", hash = "sha256:31d71a53490e46910877d0902b5ad99c69a5955e5c7ea6c82863519410e1ba7c", size = 13140121, upload-time = "2026-05-08T19:07:57.804Z" },
{ url = "https://files.pythonhosted.org/packages/1c/42/3b8e635f067d06d9f45bede470b8d539d101a4166c272213158dfd08b6ce/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b6d258fb78fdfcf049795bcfaa74dcb90ae7baa277afd21e6fd28b83f2c496", size = 16057240, upload-time = "2026-05-08T19:07:25.163Z" },
{ url = "https://files.pythonhosted.org/packages/93/99/f2be40a31b908d96b861ae0ce98582fa376c18a7f816b9d5eb4cd6aa0a4c/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4eefd386a45202aefb7a5132b94f32df9d506c9edcc7faf2fc60d65183f4b183", size = 18197382, upload-time = "2026-05-08T19:07:46.965Z" },
]
[[package]]
name = "onnxruntime-gpu"
version = "1.26.0"