Compare commits
5 Commits
51e384c32e
..
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 7de23ac5be | |||
| c480902306 | |||
| bee1ed65a4 | |||
| 5578b84fd8 | |||
| 626d4a5a56 |
+3
-1
@@ -1 +1,3 @@
|
|||||||
outputs/
|
outputs/
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
+3
-3
@@ -416,10 +416,10 @@ if __name__ == "__main__":
|
|||||||
p.add_argument("--weights")
|
p.add_argument("--weights")
|
||||||
p.add_argument("--out-dir", default="outputs")
|
p.add_argument("--out-dir", default="outputs")
|
||||||
p.add_argument("--chunk", type=int, default=6)
|
p.add_argument("--chunk", type=int, default=6)
|
||||||
p.add_argument("--enc-left", type=int, default=48)
|
p.add_argument("--enc-left", type=int, default=32)
|
||||||
p.add_argument("--enc-right", type=int, default=2)
|
p.add_argument("--enc-right", type=int, default=4)
|
||||||
p.add_argument("--dec-left", type=int, default=32)
|
p.add_argument("--dec-left", type=int, default=32)
|
||||||
p.add_argument("--dec-right", type=int, default=3)
|
p.add_argument("--dec-right", type=int, default=4)
|
||||||
p.add_argument("--mode", choices=["all", "ssl", "encode", "decode", "global"], default="all")
|
p.add_argument("--mode", choices=["all", "ssl", "encode", "decode", "global"], default="all")
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,413 @@
|
|||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sounddevice as sd
|
||||||
|
import soundfile as sf
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
import torchaudio
|
||||||
|
|
||||||
|
from miocodec.model import MioCodecModel
|
||||||
|
|
||||||
|
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
import gc
|
||||||
|
|
||||||
|
class StreamingISTFT:
    """Chunk-wise inverse STFT based on windowed overlap-add.

    Each call to :meth:`process` converts a group of spectrogram frames to
    audio.  The last ``win - hop`` samples of every call are not final yet
    (later frames still overlap them), so they are held back together with
    their window-energy envelope and merged into the next call.
    """

    def __init__(self, n_fft, hop, device):
        self.n_fft = n_fft
        self.hop = hop
        self.win = n_fft  # the synthesis window spans the whole FFT frame
        # Samples that later frames can still contribute to.
        self.carry = self.win - self.hop
        # Leading samples dropped from the very first emitted chunk.
        self.pad = self.carry // 2
        self.window = torch.hann_window(self.win, device=device)
        self.win_sq = self.window.pow(2).view(1, -1, 1)
        # Unfinished overlap-add signal / envelope from the previous call.
        self.tail_y = torch.zeros(1, 0, device=device)
        self.tail_e = torch.zeros(1, 0, device=device)
        self.started = False

    def reset(self):
        """Forget the carried tail so the next call starts a new stream."""
        self.started = False
        self.tail_y = self.tail_y[:, :0]
        self.tail_e = self.tail_e[:, :0]

    def _overlap_add(self, columns, length):
        """Sum ``win``-long columns placed every ``hop`` samples into ``length`` samples."""
        folded = F.fold(columns, (1, length), (1, self.win), stride=(1, self.hop))
        return folded[:, 0, 0, :]

    def process(self, spec):
        """Synthesize the finished samples for a block of complex frames.

        ``spec`` is indexed as (batch, frequency, frame): the inverse FFT runs
        over dim 1 and the frame count is read from the last dim.  Returns the
        emitted samples with the leading batch dim squeezed away.
        """
        n_frames = spec.shape[-1]
        total = (n_frames - 1) * self.hop + self.win

        frames = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
        frames = frames * self.window.view(1, -1, 1)

        signal = self._overlap_add(frames, total)
        envelope = self._overlap_add(self.win_sq.expand(1, self.win, n_frames), total)

        # Merge what the previous call could not finish yet.
        held = self.tail_y.shape[-1]
        if held:
            signal[:, :held] += self.tail_y
            envelope[:, :held] += self.tail_e

        ready = total - self.carry
        self.tail_y = signal[:, ready:].clone()
        self.tail_e = envelope[:, ready:].clone()

        # Normalise by the summed squared window; clamp avoids division by ~0.
        chunk = signal[:, :ready] / envelope[:, :ready].clamp(min=1e-8)
        if not self.started:
            self.started = True
            chunk = chunk[:, self.pad:]
        return chunk.squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
class StreamingVC:
    """Chunked (streaming) voice conversion driver around a codec model.

    Audio windows are encoded to discrete content tokens, the tokens are
    appended to a running history, and ``_drain`` decodes ``chunk`` tokens at a
    time with ``dec_left`` / ``dec_right`` tokens of context, feeding the
    resulting spectrogram frames to a :class:`StreamingISTFT`.

    All window sizes (``chunk``, ``enc_*``, ``dec_*``) are counted in tokens.
    """

    def __init__(self, model, device, *, chunk=6, enc_left=48, enc_right=2,
                 dec_left=32, dec_right=3, ema_alpha=0.9):
        self.m = model.to(device).eval()
        self.dev = device

        c = model.config
        ssl_fps = self.m.ssl_feature_extractor.ssl_sample_rate // self.m.ssl_feature_extractor.hop_size
        # SSL feature frames per content token; also used to align the EMA.
        self.ds = c.downsample_factor
        self.token_hz = ssl_fps // self.ds
        self.sr = c.sample_rate
        self.tok_samples = self.sr // self.token_hz
        ups_total = self.m.wave_upsampler.total_upsample_factor
        self.frames_per_tok = c.wave_upsample_factor * ups_total
        assert self.frames_per_tok * c.hop_length == self.tok_samples, "token/frame/sample ratios disagree"

        self.chunk = chunk
        self.enc_left, self.enc_right = enc_left, enc_right
        self.dec_left, self.dec_right = dec_left, dec_right
        self.local_layers = list(self.m.local_ssl_layers)

        self.istft = StreamingISTFT(c.n_fft, c.hop_length, device)
        self.global_emb = None          # target-speaker embedding, set by set_target()
        self.src_mean = self.src_std = None  # source feature statistics, set by seed()
        self.tokens = None              # committed token history
        self.decoded = 0                # number of committed tokens already decoded

        self.ema_alpha = ema_alpha
        self.prev_local_feats = None    # smoothed features of the previous window

    def _raw_local(self, audio):
        """Return the SSL features used for content encoding.

        Picks the layers listed in ``local_ssl_layers`` (1-based indices) and
        averages them when more than one is selected.
        """
        feats = self.m.ssl_feature_extractor(audio.to(self.dev))
        sel = [feats[i - 1] for i in self.local_layers]
        return torch.stack(sel, 0).mean(0) if len(sel) > 1 else sel[0]

    def apply_ema(self, local_feats):
        """Smooth ``local_feats`` against the previous window's features.

        Consecutive encoder windows advance by ``chunk`` tokens, i.e.
        ``chunk * ds`` feature frames, so frame ``i`` of this window is the
        same instant as frame ``i + chunk * ds`` of the previous one.  Only
        frames present in both windows are blended; the newest frames pass
        through unchanged.  Blending index-for-index instead would mix each
        frame with audio from one whole chunk earlier.

        NOTE(review): assumes features are (batch, frames, dim) — the time axis
        is dim 1, as in ``seed`` where statistics are taken over dim 1; confirm.
        """
        prev = self.prev_local_feats
        compatible = (
            prev is not None
            and prev.dim() == 3
            and local_feats.dim() == 3
            and prev.shape[0] == local_feats.shape[0]
            and prev.shape[2] == local_feats.shape[2]
        )
        if compatible:
            shift = self.chunk * self.ds
            n = min(local_feats.shape[1], prev.shape[1] - shift)
            if n > 0:
                blended = (self.ema_alpha * local_feats[:, :n]
                           + (1.0 - self.ema_alpha) * prev[:, shift:shift + n])
                # Build a new tensor rather than writing into the caller's input.
                local_feats = torch.cat([blended, local_feats[:, n:]], dim=1)
        self.prev_local_feats = local_feats.clone()
        return local_feats

    @torch.inference_mode()
    def set_target(self, ref_audio):
        """Compute and store the global (speaker) embedding of ``ref_audio``."""
        feats = self.m.encode(ref_audio.to(self.dev), return_content=False, return_global=True)
        self.global_emb = feats.global_embedding.view(1, -1)

    def _encode_features(self, loc):
        """Normalise features with the seeded statistics and quantise to token indices."""
        loc_norm = (loc - self.src_mean) / (self.src_std + 1e-8)
        enc = self.m.local_encoder(loc_norm)
        enc = self.m.conv_downsample(enc.transpose(1, 2)).transpose(1, 2)
        _, idx = self.m.local_quantizer.encode(enc)
        return idx

    @torch.inference_mode()
    def seed(self, seed_audio):
        """Reset the stream and calibrate it from ``seed_audio``.

        Stores the per-dimension mean / std of the seed features (taken over
        dim 1) and commits the seed's tokens as already-decoded history, so
        they serve as decoder left context but are never played back.
        """
        self.reset()
        if seed_audio.dim() == 1:
            seed_audio = seed_audio.unsqueeze(0)

        loc = self._raw_local(seed_audio)
        self.src_mean = loc.mean(dim=1, keepdim=True).clone()
        self.src_std = loc.std(dim=1, keepdim=True).clone()

        idx = self._encode_features(loc)
        self.tokens = idx.clone()
        self.decoded = idx.shape[1]

    def reset(self):
        """Drop all streaming state (ISTFT tail, token history, EMA memory)."""
        self.istft.reset()
        self.tokens = None
        self.decoded = 0
        self.prev_local_feats = None

    @torch.inference_mode()
    def _encode(self, window_audio):
        """Encode one audio window to token indices, with EMA feature smoothing."""
        loc = self._raw_local(window_audio)
        loc = self.apply_ema(loc)
        return self._encode_features(loc)

    @torch.inference_mode()
    def _wave_stages(self, tok_window):
        """Run the decoder stack from token indices up to the ISTFT head input."""
        Tw = tok_window.shape[1]
        emb = self.m.local_quantizer.decode(tok_window)
        x = self.m.wave_prenet(emb)
        x = self.m.wave_conv_upsample(x.transpose(1, 2)).transpose(1, 2)
        # NOTE(review): the factor 2 is hard-coded here while __init__ reads
        # config.wave_upsample_factor — confirm they are meant to agree.
        x = F.interpolate(x.transpose(1, 2), size=2 * Tw, mode=self.m.config.wave_interpolation_mode).transpose(1, 2)
        x = self.m.wave_prior_net(x.transpose(1, 2)).transpose(1, 2)
        x = self.m.wave_decoder(x, condition=self.global_emb.unsqueeze(1))
        x = self.m.wave_post_net(x.transpose(1, 2)).transpose(1, 2)
        return self.m.wave_upsampler(x.transpose(1, 2))

    @torch.inference_mode()
    def _decode(self, tok_window, keep_left, keep_n):
        """Decode a token window and synthesize audio for its kept tokens only.

        ``keep_left`` context tokens are skipped and the next ``keep_n`` tokens
        are turned into spectrogram frames for the streaming ISTFT.
        """
        x = self._wave_stages(tok_window)
        h = self.m.istft_head.out(x).transpose(1, 2)
        mag, phase = h.chunk(2, dim=1)
        mag = torch.exp(mag).clamp(max=1e2)  # cap the magnitude after exp()
        spec = torch.complex(mag * torch.cos(phase), mag * torch.sin(phase))
        f0 = keep_left * self.frames_per_tok
        f1 = (keep_left + keep_n) * self.frames_per_tok
        return self.istft.process(spec[..., f0:f1])

    def _commit_tokens(self, new_idx):
        """Append ``new_idx`` (batch, tokens) to the committed token history."""
        self.tokens = new_idx if self.tokens is None else torch.cat([self.tokens, new_idx], dim=1)

    def _drain(self, final=False):
        """Decode every chunk that has enough committed look-ahead.

        While streaming, a chunk is decoded only once ``chunk + dec_right``
        undecoded tokens exist.  With ``final=True`` the remainder is flushed,
        allowing a shorter last chunk and less right context.  Returns the
        concatenated audio, or an empty tensor when nothing could be decoded —
        including when no tokens have been committed yet.
        """
        out = []
        # tokens is None right after reset(); treat that as an empty history.
        committed = 0 if self.tokens is None else self.tokens.shape[1]
        while True:
            d0 = self.decoded
            avail = committed - d0
            if avail <= 0 or (not final and avail < self.chunk + self.dec_right):
                break
            keep_n = min(self.chunk, avail) if final else self.chunk
            left = min(self.dec_left, d0)
            right = min(self.dec_right, committed - (d0 + keep_n))
            win = self.tokens[:, d0 - left: d0 + keep_n + right]
            out.append(self._decode(win, left, keep_n))
            self.decoded += keep_n
        return torch.cat(out) if out else torch.zeros(0, device=self.dev)
|
||||||
|
|
||||||
|
|
||||||
|
def list_devices():
    """Print one table row per audio device reported by ``sd.query_devices()``.

    Columns: device index, name, max input channels, max output channels and
    the default sample rate truncated to an integer.
    """
    header = f"{'idx':>4} {'name':<50} {'in':>3} {'out':>3} {'sr':>7}"
    rule = "-" * 76
    print(header)
    print(rule)
    for index, info in enumerate(sd.query_devices()):
        name = info['name']
        n_in = info['max_input_channels']
        n_out = info['max_output_channels']
        rate = int(info['default_samplerate'])
        print(f"{index:>4} {name:<50} {n_in:>3} {n_out:>3} {rate:>7}")
|
||||||
|
|
||||||
|
|
||||||
|
def sync_time(fn):
    """Call ``fn()`` and return ``(result, elapsed_milliseconds)``.

    When ``DEVICE`` is a CUDA device, ``torch.cuda.synchronize()`` is called
    both before starting the clock and before stopping it, so the measured
    interval covers the GPU work and not only the kernel launches.
    """
    on_cuda = DEVICE.type == "cuda"

    if on_cuda:
        torch.cuda.synchronize()
    started = time.perf_counter()
    result = fn()
    if on_cuda:
        torch.cuda.synchronize()
    elapsed_ms = (time.perf_counter() - started) * 1000
    return result, elapsed_ms
|
||||||
|
|
||||||
|
|
||||||
|
def load_audio(path, target_sr):
    """Load an audio file as a mono, peak-normalised float32 tensor.

    Args:
        path: Anything ``soundfile.read`` accepts (``Path``, ``str`` or an
            open file object).
        target_sr: Sample rate the returned signal must have; the audio is
            resampled when the file's rate differs.

    Returns:
        1-D tensor scaled so that its largest absolute sample is 1.0.  Signals
        whose peak is at most 1e-8 are returned unscaled.

    Raises:
        ValueError: If the file contains no samples.
    """
    samples, sr = sf.read(path, dtype="float32", always_2d=True)
    # always_2d gives (frames, channels); average the channels down to mono.
    tensor = torch.from_numpy(samples.mean(axis=1))
    if tensor.numel() == 0:
        # max() over an empty tensor would otherwise fail with an opaque error.
        raise ValueError(f"{path} contains no audio samples")

    if sr != target_sr:
        # Plain string paths have no .name attribute; fall back to the value itself.
        label = getattr(path, "name", path)
        print(f"Resampling {label} from {sr} Hz to {target_sr} Hz...")
        tensor = torchaudio.functional.resample(tensor, orig_freq=sr, new_freq=target_sr)

    peak = torch.abs(tensor).max()
    return tensor / peak if peak > 1e-8 else tensor
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run real-time voice conversion from an input device to an output device.

    Flow: parse arguments, load the model and the target speaker, calibrate on
    seed audio (a file, or a 3 s recording from the input device), then loop:
    read one chunk from the input callback queue, gate silence, encode the
    window, decode whatever is ready and hand stereo PCM to a writer thread.
    Stops on Ctrl-C.
    """
    # gc.freeze() moves everything allocated so far into the permanent
    # generation and gc.disable() turns automatic collection off —
    # presumably to keep collector pauses out of the real-time loop.
    gc.collect()
    gc.freeze()
    gc.disable()

    parser = _build_parser()
    args = parser.parse_args()

    if args.list_devices:
        list_devices()
        return

    if args.input is None or args.output is None:
        parser.error("--input and --output required")
    if args.target is None:
        # Without this the failure only surfaces later inside the audio loader.
        parser.error("--target required")

    model = MioCodecModel.from_pretrained("Aratako/MioCodec-25Hz-44.1kHz-v2")

    vc = StreamingVC(
        model, DEVICE, chunk=args.chunk, enc_left=args.enc_left, enc_right=args.enc_right,
        dec_left=args.dec_left, dec_right=args.dec_right, ema_alpha=args.ema_alpha
    )

    sr = vc.sr
    ts = vc.tok_samples
    chunk_samples = vc.chunk * ts
    left_pad = vc.enc_left * ts
    right_pad = vc.enc_right * ts
    required_samples = left_pad + chunk_samples + right_pad
    budget_ms = (vc.chunk / vc.token_hz) * 1000
    # The fade is applied inside the active chunk, so it can never be longer
    # than one chunk (and never negative).
    fade_samples = min(max(int(args.silence_fade_ms * 0.001 * sr), 0), chunk_samples)

    print(f"Sample Rate: {sr} Hz | Chunk: {args.chunk} tokens ({budget_ms:.1f}ms budget)")
    print(f"EMA alpha: {args.ema_alpha} | Silence fade: {args.silence_fade_ms:.0f}ms")

    print(f"Loading target speaker profile: {args.target}...")
    target_audio = load_audio(args.target, sr)
    vc.set_target(target_audio)

    in_info = sd.query_devices(args.input)
    n_in_ch = min(in_info["max_input_channels"], 2)
    if n_in_ch < 1:
        parser.error(f"device {args.input} has no input channels")

    if args.seed_audio:
        print(f"Loading speaker calibration profile: {args.seed_audio}...")
        seed_audio = load_audio(args.seed_audio, sr)
    else:
        print("\n" + "=" * 60)
        print("No seed-audio provided. Recording 3 seconds for normalization calibration.")
        print("Please speak into your microphone...")
        print("=" * 60)
        # Record from the device chosen with --input, not the system default:
        # the channel count above was derived from that same device.
        recorded = sd.rec(int(3.0 * sr), samplerate=sr, channels=n_in_ch,
                          dtype="float32", device=args.input)
        sd.wait()
        print("Recording complete. Calibrating feature scaling...")
        recorded_mono = recorded.mean(axis=1) if recorded.shape[1] > 1 else recorded[:, 0]
        seed_audio = torch.from_numpy(recorded_mono)

    print("Seeding streaming context from speaker profile...")
    vc.seed(seed_audio)

    # Start the input accumulator with exactly left_pad samples of context:
    # the tail of the seed, zero-padded on the left when the seed is shorter.
    seed_np = seed_audio.numpy()
    if seed_np.shape[0] >= left_pad:
        # Index from the front: seed_np[-0:] would return the WHOLE array.
        raw_input_accum = seed_np[seed_np.shape[0] - left_pad:]
    else:
        raw_input_accum = np.pad(seed_np, (left_pad - seed_np.shape[0], 0))

    in_q = queue.Queue(maxsize=8)
    out_q = queue.Queue(maxsize=2)
    stop_event = threading.Event()

    def input_cb(indata, frames, time_info, status):
        # Never block the audio callback: drop the oldest chunk when full.
        if in_q.full():
            in_q.get_nowait()
        mono = indata.mean(axis=1) if indata.shape[1] > 1 else indata[:, 0]
        in_q.put_nowait(mono.copy())

    def write_thread(out_stream):
        # The timeout lets the thread notice stop_event while idle.
        while not stop_event.is_set():
            try:
                pcm = out_q.get(timeout=0.5)
                out_stream.write(pcm)
            except queue.Empty:
                continue

    print(f"\n{'chunk':>6} {'q_in':>4} {'q_out':>5} {'enc':>7} {'dec':>7} {'total':>7} {'budget':>7} {'gap':>7}")
    print("-" * 76)

    chunk_n = 0
    t_last = None
    hangover_counter = 0
    ramp_down = np.linspace(1.0, 0.0, fade_samples, dtype=np.float32) if fade_samples > 0 else None

    with sd.InputStream(device=args.input, channels=n_in_ch, samplerate=sr,
                        blocksize=chunk_samples, dtype="float32",
                        callback=input_cb, latency="low"):
        with sd.OutputStream(device=args.output, channels=2, samplerate=sr,
                             dtype="float32", latency="low") as out_stream:

            writer = threading.Thread(target=write_thread, args=(out_stream,), daemon=True)
            writer.start()

            try:
                while True:
                    raw = in_q.get()
                    t_now = time.perf_counter()
                    gap_ms = (t_now - t_last) * 1000 if t_last is not None else 0.0
                    t_last = t_now

                    rms = float(np.sqrt(np.mean(raw ** 2)))

                    # Gate with hangover: stay open for a few chunks after the
                    # level drops below the floor.
                    if rms >= args.rms_floor:
                        hangover_counter = args.hangover_chunks
                        is_silence = False
                    elif hangover_counter > 0:
                        hangover_counter -= 1
                        is_silence = False
                    else:
                        is_silence = True

                    raw_input_accum = np.concatenate([raw_input_accum, raw])

                    if len(raw_input_accum) >= required_samples:
                        window_np = raw_input_accum[:required_samples]
                        raw_input_accum = raw_input_accum[chunk_samples:]

                        if is_silence:
                            # Work on a copy so the accumulator keeps the real
                            # audio as context for later windows.
                            window_np = window_np.copy()
                            active_start = left_pad
                            active_end = left_pad + chunk_samples
                            if fade_samples > 0:
                                fade_end = active_start + fade_samples
                                window_np[active_start:fade_end] *= ramp_down
                                window_np[fade_end:active_end] = 0.0
                            else:
                                window_np[active_start:active_end] = 0.0

                        window_torch = torch.from_numpy(window_np).unsqueeze(0).to(DEVICE)

                        with torch.no_grad():
                            idx, t_enc = sync_time(lambda: vc._encode(window_torch))
                            # Keep only the tokens of the active chunk.
                            chunk_tokens = idx[:, vc.enc_left : vc.enc_left + vc.chunk]
                            vc._commit_tokens(chunk_tokens)
                            audio_out, t_dec = sync_time(lambda: vc._drain(final=False))

                        if audio_out.numel() == 0:
                            pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
                        else:
                            pcm = audio_out.cpu().numpy()
                            pcm = np.clip(pcm, -1.0, 1.0)
                            pcm_out = np.stack([pcm, pcm], axis=1)
                    else:
                        # Not enough look-ahead yet: play silence for this chunk.
                        pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
                        t_enc, t_dec = 0.0, 0.0

                    out_q.put(pcm_out)

                    total = t_enc + t_dec
                    chunk_n += 1

                    if is_silence:
                        print(
                            f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
                            f"{'--silence--':>31} rms={rms:.4f}",
                            flush=True,
                        )
                    else:
                        print(
                            f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
                            f"{t_enc:>6.1f}ms {t_dec:>6.1f}ms "
                            f"{total:>6.1f}ms {budget_ms:>6.0f}ms {gap_ms:>6.1f}ms",
                            flush=True,
                        )

            except KeyboardInterrupt:
                pass
            finally:
                stop_event.set()
                writer.join()

    print("stopped")


def _build_parser():
    """Create the command-line parser for the streaming voice-conversion script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--list-devices", action="store_true")
    parser.add_argument("--input", type=int)
    parser.add_argument("--output", type=int)
    parser.add_argument("--target", type=Path, help="Target voice reference WAV")
    parser.add_argument("--seed-audio", type=Path, help="Seed speaker calibration WAV (optional)")
    parser.add_argument("--chunk", type=int, default=6)
    parser.add_argument("--enc-left", type=int, default=48)
    parser.add_argument("--enc-right", type=int, default=4)
    parser.add_argument("--dec-left", type=int, default=32)
    parser.add_argument("--dec-right", type=int, default=4)
    parser.add_argument("--ema-alpha", type=float, default=0.9,
                        help="EMA smoothing on local SSL features (0=full smoothing, 1=no smoothing)")
    parser.add_argument("--rms-floor", type=float, default=0.0035,
                        help="RMS threshold below which audio chunk is evaluated as silence")
    parser.add_argument("--hangover-chunks", type=int, default=5,
                        help="Number of chunks to hold the gate open after RMS drop")
    parser.add_argument("--silence-fade-ms", type=float, default=10.0,
                        help="Ramp-down duration in ms at silence boundary (0 to disable)")
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
+105
-86
@@ -8,7 +8,7 @@ import json
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
# This bullshit
|
|
||||||
ort.preload_dlls()
|
ort.preload_dlls()
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
@@ -53,7 +53,20 @@ class StreamingISTFT:
|
|||||||
self.win_sq = self.window ** 2
|
self.win_sq = self.window ** 2
|
||||||
self.tail_y = np.zeros(0, dtype=np.float32)
|
self.tail_y = np.zeros(0, dtype=np.float32)
|
||||||
self.tail_e = np.zeros(0, dtype=np.float32)
|
self.tail_e = np.zeros(0, dtype=np.float32)
|
||||||
self.started = False
|
self.started = False\
|
||||||
|
|
||||||
|
def block(self, real, imag):
    """Inverse-STFT a self-contained group of frames in one shot.

    ``real`` / ``imag`` are the two halves of the complex spectrogram, indexed
    (frequency, frame).  Unlike the streaming path, no tail is read or stored:
    every frame is windowed, overlap-added at multiples of ``hop`` and the sum
    is divided by the accumulated squared window.  Returns float32 samples
    covering the whole ``(T - 1) * hop + win`` span.
    """
    spectrum = real + 1j * imag
    n_frames = spectrum.shape[1]
    length = (n_frames - 1) * self.hop + self.win

    # One time-domain column per frame, already multiplied by the window.
    columns = np.fft.irfft(spectrum, self.n_fft, axis=0) * self.window[:, None]
    columns = columns.astype(np.float32)

    signal = np.zeros(length, dtype=np.float32)
    energy = np.zeros(length, dtype=np.float32)
    for index, column in enumerate(columns.T):
        begin = index * self.hop
        stop = begin + self.win
        signal[begin:stop] += column
        energy[begin:stop] += self.win_sq

    # Floor the envelope so near-zero window sums cannot blow up the output.
    normalised = signal / np.maximum(energy, 1e-8)
    return normalised.astype(np.float32)
|
||||||
|
|
||||||
def process(self, real, imag):
|
def process(self, real, imag):
|
||||||
spec = real + 1j * imag
|
spec = real + 1j * imag
|
||||||
@@ -100,25 +113,33 @@ class StreamingVCONNX:
|
|||||||
|
|
||||||
opts = ort.SessionOptions()
|
opts = ort.SessionOptions()
|
||||||
opts.inter_op_num_threads = 1
|
opts.inter_op_num_threads = 1
|
||||||
opts.intra_op_num_threads = 1
|
opts.intra_op_num_threads = 4
|
||||||
opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
||||||
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
||||||
|
|
||||||
prov = ["CUDAExecutionProvider", "CPUExecutionProvider"] if args.cuda else ["CPUExecutionProvider"]
|
if getattr(args, "openvino", False):
|
||||||
|
prov = [("OpenVINOExecutionProvider", {"device_type": "CPU"}), "CPUExecutionProvider"]
|
||||||
|
elif args.cuda:
|
||||||
|
prov = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||||
|
else:
|
||||||
|
prov = ["CPUExecutionProvider"]
|
||||||
|
|
||||||
self.ssl = ort.InferenceSession(args.ssl, sess_options=opts, providers=prov)
|
self.ssl = ort.InferenceSession(args.ssl, sess_options=opts, providers=prov)
|
||||||
self.enc = ort.InferenceSession(args.encode, sess_options=opts, providers=prov)
|
self.enc = ort.InferenceSession(args.encode, sess_options=opts, providers=prov)
|
||||||
self.dec = ort.InferenceSession(args.decode, sess_options=opts, providers=prov)
|
self.dec = ort.InferenceSession(args.decode, sess_options=opts, providers=prov)
|
||||||
self.glb = ort.InferenceSession(args.global_path, sess_options=opts, providers=prov)
|
self.glb = ort.InferenceSession(args.global_path, sess_options=opts, providers=prov)
|
||||||
|
|
||||||
self.istft = StreamingISTFT(meta["n_fft"], meta["hop_length"])
|
self.istft = StreamingISTFT(meta["n_fft"], meta["hop_length"])
|
||||||
|
self.xfade_frames = 9
|
||||||
|
self.istft_margin = int(np.ceil(meta["n_fft"] / meta["hop_length"]))
|
||||||
|
self.xfade_tail = None
|
||||||
self.global_emb = None
|
self.global_emb = None
|
||||||
self.src_mean = None
|
self.src_mean = None
|
||||||
self.src_std = None
|
self.src_std = None
|
||||||
self.tokens = None
|
self.tokens = None
|
||||||
self.decoded = 0
|
self.decoded = 0
|
||||||
self.prev_local_feats = None
|
self.prev_local_feats = None
|
||||||
self.ema_alpha = 0.8 # Adjust between 0.5 (heavy smoothing) and 1.0 (no smoothing)
|
self.ema_alpha = 0.9
|
||||||
|
|
||||||
def _ssl(self, win16):
|
def _ssl(self, win16):
|
||||||
w = take(win16, 0, self.ssl_in).reshape(1, -1)
|
w = take(win16, 0, self.ssl_in).reshape(1, -1)
|
||||||
@@ -154,37 +175,46 @@ class StreamingVCONNX:
|
|||||||
frames = np.concatenate([l[c : c + keep * self.ds] for keep, l in locals_], axis=0)
|
frames = np.concatenate([l[c : c + keep * self.ds] for keep, l in locals_], axis=0)
|
||||||
self.src_mean = frames.mean(axis=0).astype(np.float32)
|
self.src_mean = frames.mean(axis=0).astype(np.float32)
|
||||||
self.src_std = frames.std(axis=0, ddof=1).astype(np.float32)
|
self.src_std = frames.std(axis=0, ddof=1).astype(np.float32)
|
||||||
|
|
||||||
seed_tokens = np.concatenate(
|
seed_tokens = np.concatenate(
|
||||||
[self._encode(l, self.src_mean, self.src_std)[self.enc_left : self.enc_left + keep] for keep, l in locals_]
|
[self._encode(l, self.src_mean, self.src_std)[self.enc_left : self.enc_left + keep] for keep, l in locals_]
|
||||||
) if locals_ else np.zeros(0, dtype=np.int64)
|
) if locals_ else np.zeros(0, dtype=np.int64)
|
||||||
|
|
||||||
self.tokens = seed_tokens.astype(np.int64)
|
self.tokens = seed_tokens.astype(np.int64)
|
||||||
self.decoded = len(self.tokens)
|
self.decoded = len(self.tokens)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.istft = StreamingISTFT(self.meta["n_fft"], self.meta["hop_length"])
|
self.istft = StreamingISTFT(self.meta["n_fft"], self.meta["hop_length"])
|
||||||
|
self.xfade_tail = None
|
||||||
self.tokens = None
|
self.tokens = None
|
||||||
self.decoded = 0
|
self.decoded = 0
|
||||||
|
|
||||||
def _encode_window(self, win16):
|
def apply_ema(self, local_feats):
|
||||||
local_feats, _ = self._ssl(win16)
|
shift = self.chunk * self.ds
|
||||||
|
if self.prev_local_feats is not None:
|
||||||
# Apply temporal smoothing to the continuous representations
|
n = local_feats.shape[0] - shift
|
||||||
if self.prev_local_feats is not None and local_feats.shape == self.prev_local_feats.shape:
|
if n > 0:
|
||||||
local_feats = self.ema_alpha * local_feats + (1.0 - self.ema_alpha) * self.prev_local_feats
|
local_feats[:n] = (self.ema_alpha * local_feats[:n]
|
||||||
|
+ (1 - self.ema_alpha) * self.prev_local_feats[shift:shift + n])
|
||||||
self.prev_local_feats = local_feats.copy()
|
self.prev_local_feats = local_feats.copy()
|
||||||
return self._encode(local_feats, self.src_mean, self.src_std)
|
return local_feats
|
||||||
|
|
||||||
def _decode(self, win_tokens, keep_left, keep_n):
|
def _decode(self, win_tokens, keep_left, keep_n, right_tokens):
|
||||||
real, imag = self.dec.run(
|
real, imag = self.dec.run(
|
||||||
["spec_real", "spec_imag"],
|
["spec_real", "spec_imag"],
|
||||||
{"content_token_indices": win_tokens, "global_embedding": self.global_emb}
|
{"content_token_indices": win_tokens, "global_embedding": self.global_emb},
|
||||||
)
|
)
|
||||||
f0 = keep_left * self.fpt
|
fpt, hop = self.fpt, self.istft.hop
|
||||||
f1 = (keep_left + keep_n) * self.fpt
|
a = keep_left * fpt
|
||||||
return self.istft.process(real[:, f0:f1], imag[:, f0:f1])
|
b = (keep_left + keep_n) * fpt
|
||||||
|
right_frames = right_tokens * fpt
|
||||||
|
ov = min(self.xfade_frames, max(0, right_frames))
|
||||||
|
m = min(self.istft_margin, a, max(0, right_frames - ov))
|
||||||
|
F0, F1 = a - m, b + ov + m
|
||||||
|
audio = self.istft.block(real[:, F0:F1], imag[:, F0:F1])
|
||||||
|
start = (a - F0) * hop
|
||||||
|
seg = audio[start : start + (keep_n * fpt + ov) * hop]
|
||||||
|
return seg, ov * hop
|
||||||
|
|
||||||
def _commit_tokens(self, new_idx):
|
def _commit_tokens(self, new_idx):
|
||||||
if self.tokens is None:
|
if self.tokens is None:
|
||||||
@@ -194,6 +224,7 @@ class StreamingVCONNX:
|
|||||||
|
|
||||||
def _drain(self, final=False):
|
def _drain(self, final=False):
|
||||||
out = []
|
out = []
|
||||||
|
hop = self.istft.hop
|
||||||
committed = len(self.tokens) if self.tokens is not None else 0
|
committed = len(self.tokens) if self.tokens is not None else 0
|
||||||
while True:
|
while True:
|
||||||
d0 = self.decoded
|
d0 = self.decoded
|
||||||
@@ -203,13 +234,23 @@ class StreamingVCONNX:
|
|||||||
keep_n = min(self.chunk, avail) if final else self.chunk
|
keep_n = min(self.chunk, avail) if final else self.chunk
|
||||||
left = min(self.dec_left, d0)
|
left = min(self.dec_left, d0)
|
||||||
right = min(self.dec_right, committed - (d0 + keep_n))
|
right = min(self.dec_right, committed - (d0 + keep_n))
|
||||||
|
lo, hi = d0 - left, d0 + keep_n + right
|
||||||
lo = d0 - left
|
win = self.tokens[np.clip(np.arange(lo, hi), 0, committed - 1)].astype(np.int64)
|
||||||
hi = d0 + keep_n + right
|
|
||||||
win_idx = np.clip(np.arange(lo, hi), 0, committed - 1)
|
seg, h = self._decode(win, left, keep_n, right)
|
||||||
win = self.tokens[win_idx].astype(np.int64)
|
body_end = keep_n * self.fpt * hop
|
||||||
|
head, body, tail = seg[:h], seg[h:body_end], seg[body_end:]
|
||||||
out.append(self._decode(win, left, keep_n))
|
|
||||||
|
if self.xfade_tail is not None and len(self.xfade_tail) == h and h > 0:
|
||||||
|
t = np.linspace(0.0, 1.0, h, dtype=np.float32)
|
||||||
|
out.append((1.0 - t) * self.xfade_tail + t * head)
|
||||||
|
else:
|
||||||
|
out.append(head)
|
||||||
|
out.append(body)
|
||||||
|
|
||||||
|
self.xfade_tail = None if final else tail
|
||||||
|
if final and tail.size:
|
||||||
|
out.append(tail)
|
||||||
self.decoded += keep_n
|
self.decoded += keep_n
|
||||||
return np.concatenate(out) if out else np.zeros(0, dtype=np.float32)
|
return np.concatenate(out) if out else np.zeros(0, dtype=np.float32)
|
||||||
|
|
||||||
@@ -232,18 +273,17 @@ def main():
|
|||||||
parser.add_argument("--list-devices", action="store_true")
|
parser.add_argument("--list-devices", action="store_true")
|
||||||
parser.add_argument("--input", type=int)
|
parser.add_argument("--input", type=int)
|
||||||
parser.add_argument("--output", type=int)
|
parser.add_argument("--output", type=int)
|
||||||
parser.add_argument("--target", type=Path, required=True, help="Target voice reference WAV")
|
parser.add_argument("--target", type=Path, required=True)
|
||||||
parser.add_argument("--seed-audio", type=Path, help="Seed speaker calibration WAV (optional)")
|
parser.add_argument("--seed-audio", type=Path)
|
||||||
parser.add_argument("--encode", required=True, help="Path to encode.onnx")
|
parser.add_argument("--encode", required=True)
|
||||||
parser.add_argument("--decode", help="Path to decode.onnx (defaults to encode.onnx parent folder)")
|
parser.add_argument("--decode")
|
||||||
parser.add_argument("--global", dest="global_path", help="Path to global.onnx (defaults to encode.onnx parent folder)")
|
parser.add_argument("--global", dest="global_path")
|
||||||
parser.add_argument("--ssl", help="Path to ssl.onnx (defaults to encode.onnx parent folder)")
|
parser.add_argument("--ssl")
|
||||||
parser.add_argument("--meta", help="Path to meta.json (defaults to encode.onnx parent folder)")
|
parser.add_argument("--meta")
|
||||||
parser.add_argument("--cuda", action="store_true", help="Enable CUDA execution provider")
|
parser.add_argument("--cuda", action="store_true")
|
||||||
parser.add_argument("--rms-floor", type=float, default=0.0035,
|
parser.add_argument("--openvino", action="store_true")
|
||||||
help="RMS threshold below which audio chunk is evaluated as silence")
|
parser.add_argument("--rms-floor", type=float, default=0.0035)
|
||||||
parser.add_argument("--hangover-chunks", type=int, default=3,
|
parser.add_argument("--hangover-chunks", type=int, default=3)
|
||||||
help="Number of chunks to hold the gate open after RMS drop to prevent trailing cutoffs")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.list_devices:
|
if args.list_devices:
|
||||||
@@ -264,25 +304,24 @@ def main():
|
|||||||
|
|
||||||
sr = vc.sr
|
sr = vc.sr
|
||||||
sr16 = vc.sr16
|
sr16 = vc.sr16
|
||||||
|
|
||||||
# Calculate sample sizes based on target (playback) sample rate
|
|
||||||
# token_hz is standard (usually 25 Hz), tok_samples is usually 1764 for 44.1 kHz
|
|
||||||
token_hz = meta["token_hz"]
|
token_hz = meta["token_hz"]
|
||||||
tok_samples = sr // token_hz
|
tok_samples = sr // token_hz
|
||||||
chunk_samples = vc.chunk * tok_samples
|
chunk_samples = vc.chunk * tok_samples
|
||||||
budget_ms = (vc.chunk / token_hz) * 1000
|
budget_ms = (vc.chunk / token_hz) * 1000
|
||||||
|
|
||||||
# Calculated parameters for processing 16 kHz streams
|
|
||||||
tok16 = vc.tok16
|
tok16 = vc.tok16
|
||||||
chunk_samples_16k = vc.chunk * tok16
|
chunk_samples_16k = vc.chunk * tok16
|
||||||
left_pad_16k = vc.enc_left * tok16
|
left_pad_16k = vc.enc_left * tok16
|
||||||
right_pad_16k = vc.enc_right * tok16
|
right_pad_16k = vc.enc_right * tok16
|
||||||
ssl_in_16k = vc.ssl_in
|
required_samples_16k = left_pad_16k + chunk_samples_16k + right_pad_16k
|
||||||
|
|
||||||
|
fade_len = int(0.01 * sr16)
|
||||||
|
ramp_down = np.linspace(1.0, 0.0, fade_len, dtype=np.float32)
|
||||||
|
|
||||||
print(f"Sample Rate: {sr} Hz (target) | 16000 Hz (SSL internal)")
|
print(f"Sample Rate: {sr} Hz (target) | 16000 Hz (SSL internal)")
|
||||||
print(f"Chunk Size: {vc.chunk} tokens ({budget_ms:.1f}ms budget)")
|
print(f"Chunk Size: {vc.chunk} tokens ({budget_ms:.1f}ms budget)")
|
||||||
|
|
||||||
print(f"Loading target speaker profile: {args.target}...")
|
|
||||||
target_audio = load_16k(args.target, sr16)
|
target_audio = load_16k(args.target, sr16)
|
||||||
vc.set_target(target_audio)
|
vc.set_target(target_audio)
|
||||||
|
|
||||||
@@ -290,27 +329,19 @@ def main():
|
|||||||
n_in_ch = min(in_info["max_input_channels"], 2)
|
n_in_ch = min(in_info["max_input_channels"], 2)
|
||||||
|
|
||||||
if args.seed_audio:
|
if args.seed_audio:
|
||||||
print(f"Loading speaker calibration profile: {args.seed_audio}...")
|
|
||||||
seed_audio = load_16k(args.seed_audio, sr16)
|
seed_audio = load_16k(args.seed_audio, sr16)
|
||||||
else:
|
else:
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print("No seed-audio provided. Recording 3 seconds for normalization calibration.")
|
|
||||||
print("Please speak into your microphone...")
|
|
||||||
print("=" * 60)
|
|
||||||
recorded = sd.rec(int(3.0 * sr), samplerate=sr, channels=n_in_ch, dtype="float32")
|
recorded = sd.rec(int(3.0 * sr), samplerate=sr, channels=n_in_ch, dtype="float32")
|
||||||
sd.wait()
|
sd.wait()
|
||||||
print("Recording complete. Calibrating feature scaling...")
|
|
||||||
recorded_mono = recorded.mean(axis=1) if recorded.shape[1] > 1 else recorded[:, 0]
|
recorded_mono = recorded.mean(axis=1) if recorded.shape[1] > 1 else recorded[:, 0]
|
||||||
seed_audio = resample(recorded_mono, sr, sr16)
|
seed_audio = resample(recorded_mono, sr, sr16)
|
||||||
|
|
||||||
print("Seeding streaming context from speaker profile...")
|
|
||||||
vc.seed(seed_audio)
|
vc.seed(seed_audio)
|
||||||
|
|
||||||
# Establish initial left-side padding context buffer in 16 kHz
|
|
||||||
if len(seed_audio) >= left_pad_16k:
|
if len(seed_audio) >= left_pad_16k:
|
||||||
raw_input_accum_16k = seed_audio[-left_pad_16k:]
|
accum_16k = seed_audio[-left_pad_16k:]
|
||||||
else:
|
else:
|
||||||
raw_input_accum_16k = np.pad(seed_audio, (left_pad_16k - len(seed_audio), 0))
|
accum_16k = np.pad(seed_audio, (left_pad_16k - len(seed_audio), 0))
|
||||||
|
|
||||||
in_q = queue.Queue(maxsize=8)
|
in_q = queue.Queue(maxsize=8)
|
||||||
out_q = queue.Queue(maxsize=2)
|
out_q = queue.Queue(maxsize=2)
|
||||||
@@ -330,8 +361,8 @@ def main():
|
|||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"\n{'chunk':>6} {'q_in':>4} {'q_out':>5} {'enc':>7} {'dec':>7} {'total':>7} {'budget':>7} {'gap':>7}")
|
print(f"\n{'chunk':>6} {'q_in':>4} {'q_out':>5} {'ssl':>7} {'enc':>7} {'dec':>7} {'total':>7} {'budget':>7} {'gap':>7}")
|
||||||
print("-" * 76)
|
print("-" * 80)
|
||||||
|
|
||||||
chunk_n = 0
|
chunk_n = 0
|
||||||
t_last = None
|
t_last = None
|
||||||
@@ -354,44 +385,32 @@ def main():
|
|||||||
t_last = t_now
|
t_last = t_now
|
||||||
|
|
||||||
rms = float(np.sqrt(np.mean(raw ** 2)))
|
rms = float(np.sqrt(np.mean(raw ** 2)))
|
||||||
|
|
||||||
if rms >= args.rms_floor:
|
if rms >= args.rms_floor:
|
||||||
hangover_counter = args.hangover_chunks
|
hangover_counter = args.hangover_chunks
|
||||||
is_silence = False
|
is_silence = False
|
||||||
|
elif hangover_counter > 0:
|
||||||
|
hangover_counter -= 1
|
||||||
|
is_silence = False
|
||||||
else:
|
else:
|
||||||
if hangover_counter > 0:
|
is_silence = True
|
||||||
hangover_counter -= 1
|
|
||||||
is_silence = False
|
|
||||||
else:
|
|
||||||
is_silence = True
|
|
||||||
|
|
||||||
# Resample current input chunk to 16 kHz
|
|
||||||
raw_16k = resample(raw, sr, sr16)
|
raw_16k = resample(raw, sr, sr16)
|
||||||
raw_input_accum_16k = np.concatenate([raw_input_accum_16k, raw_16k])
|
accum_16k = np.concatenate([accum_16k, raw_16k])
|
||||||
required_samples_16k = left_pad_16k + chunk_samples_16k + right_pad_16k
|
|
||||||
|
|
||||||
if len(raw_input_accum_16k) >= required_samples_16k:
|
if len(accum_16k) >= required_samples_16k:
|
||||||
window_16k = raw_input_accum_16k[:required_samples_16k]
|
window_16k = accum_16k[:required_samples_16k]
|
||||||
raw_input_accum_16k = raw_input_accum_16k[chunk_samples_16k:]
|
accum_16k = accum_16k[chunk_samples_16k:]
|
||||||
|
|
||||||
# Create a simple linear ramp at the beginning of your script or class
|
|
||||||
fade_len = int(0.01 * sr16) # 10ms ramp
|
|
||||||
ramp_down = np.linspace(1.0, 0.0, fade_len, dtype=np.float32)
|
|
||||||
ramp_up = np.linspace(0.0, 1.0, fade_len, dtype=np.float32)
|
|
||||||
|
|
||||||
# Apply a soft gate instead of hard zeroing
|
|
||||||
if is_silence:
|
if is_silence:
|
||||||
window_16k = window_16k.copy()
|
window_16k = window_16k.copy()
|
||||||
# Smoothly ramp down the boundary before zeroing
|
|
||||||
active_start = left_pad_16k
|
active_start = left_pad_16k
|
||||||
active_end = left_pad_16k + chunk_samples_16k
|
active_end = left_pad_16k + chunk_samples_16k
|
||||||
|
|
||||||
# Apply fade out
|
|
||||||
window_16k[active_start : active_start + fade_len] *= ramp_down
|
window_16k[active_start : active_start + fade_len] *= ramp_down
|
||||||
window_16k[active_start + fade_len : active_end] = 0.0
|
window_16k[active_start + fade_len : active_end] = 0.0
|
||||||
|
|
||||||
# Run inference via ONNX models
|
local_feats, t_ssl = sync_time(lambda: vc._ssl(window_16k)[0])
|
||||||
idx, t_enc = sync_time(lambda: vc._encode_window(window_16k))
|
local_feats = vc.apply_ema(local_feats)
|
||||||
|
idx, t_enc = sync_time(lambda: vc._encode(local_feats, vc.src_mean, vc.src_std))
|
||||||
chunk_tokens = idx[vc.enc_left : vc.enc_left + vc.chunk]
|
chunk_tokens = idx[vc.enc_left : vc.enc_left + vc.chunk]
|
||||||
vc._commit_tokens(chunk_tokens)
|
vc._commit_tokens(chunk_tokens)
|
||||||
audio_out, t_dec = sync_time(lambda: vc._drain(final=False))
|
audio_out, t_dec = sync_time(lambda: vc._drain(final=False))
|
||||||
@@ -403,23 +422,23 @@ def main():
|
|||||||
pcm_out = np.stack([pcm, pcm], axis=1)
|
pcm_out = np.stack([pcm, pcm], axis=1)
|
||||||
else:
|
else:
|
||||||
pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
|
pcm_out = np.zeros((chunk_samples, 2), dtype=np.float32)
|
||||||
t_enc, t_dec = 0.0, 0.0
|
t_ssl, t_enc, t_dec = 0.0, 0.0, 0.0
|
||||||
|
|
||||||
out_q.put(pcm_out)
|
out_q.put(pcm_out)
|
||||||
|
|
||||||
total = t_enc + t_dec
|
total = t_ssl + t_enc + t_dec
|
||||||
chunk_n += 1
|
chunk_n += 1
|
||||||
|
|
||||||
if is_silence:
|
if is_silence:
|
||||||
print(
|
print(
|
||||||
f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
|
f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
|
||||||
f"{'--silence--':>31} rms={rms:.4f}",
|
f"{'--silence--':>41} rms={rms:.4f}",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
|
f"{chunk_n:>6} {in_q.qsize():>4} {out_q.qsize():>5} "
|
||||||
f"{t_enc:>6.1f}ms {t_dec:>6.1f}ms "
|
f"{t_ssl:>6.1f}ms {t_enc:>6.1f}ms {t_dec:>6.1f}ms "
|
||||||
f"{total:>6.1f}ms {budget_ms:>6.0f}ms {gap_ms:>6.1f}ms",
|
f"{total:>6.1f}ms {budget_ms:>6.0f}ms {gap_ms:>6.1f}ms",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
|
|||||||
+34
@@ -0,0 +1,34 @@
|
|||||||
|
# optimize_models.py
|
||||||
|
from onnxruntime.transformers.optimizer import optimize_model
|
||||||
|
from onnxruntime.transformers.fusion_options import FusionOptions
|
||||||
|
|
||||||
|
def optimize_custom(input_path, output_path):
|
||||||
|
print(f"Optimizing {input_path}...")
|
||||||
|
|
||||||
|
# Load default BERT fusion options
|
||||||
|
options = FusionOptions("bert")
|
||||||
|
|
||||||
|
# Disable LayerNorm fusions that break on AdaLN / dynamic biases
|
||||||
|
options.enable_skip_layer_norm = False
|
||||||
|
options.enable_layer_norm = False
|
||||||
|
|
||||||
|
# Run the optimizer
|
||||||
|
optimizer = optimize_model(
|
||||||
|
input=input_path,
|
||||||
|
model_type="bert",
|
||||||
|
optimization_options=options
|
||||||
|
)
|
||||||
|
|
||||||
|
optimizer.save_model_to_file(output_path)
|
||||||
|
print(f"Saved optimized model to {output_path}\n")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
optimize_custom("outputs/encode.onnx", "outputs/encode_opt.onnx")
|
||||||
|
optimize_custom("outputs/decode.onnx", "outputs/decode_opt.onnx")
|
||||||
|
|
||||||
|
# ssl.onnx (WavLM) is a standard BERT architecture, so we can leave
|
||||||
|
# all standard fusions enabled for maximum speed.
|
||||||
|
print("Optimizing outputs/ssl.onnx...")
|
||||||
|
ssl_opt = optimize_model("outputs/ssl.onnx", model_type="bert")
|
||||||
|
ssl_opt.save_model_to_file("outputs/ssl_opt.onnx")
|
||||||
|
print("Saved optimized model to outputs/ssl_opt.onnx")
|
||||||
+2
-3
@@ -1,5 +1,5 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "dovc"
|
name = "mioonnx"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = "Add your description here"
|
description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
@@ -7,11 +7,10 @@ requires-python = ">=3.12"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"miocodec",
|
"miocodec",
|
||||||
"numpy>=2.4.6",
|
"numpy>=2.4.6",
|
||||||
"onnxruntime>=1.26.0",
|
|
||||||
"onnxruntime-gpu>=1.26.0",
|
"onnxruntime-gpu>=1.26.0",
|
||||||
"onnxscript>=0.7.0",
|
"onnxscript>=0.7.0",
|
||||||
"sounddevice>=0.5.5",
|
"sounddevice>=0.5.5",
|
||||||
"torch>=2.11.0",
|
"soundfile>=0.13.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
|
|||||||
+153
-29
@@ -1,44 +1,168 @@
|
|||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import onnx
|
||||||
|
import onnxruntime as ort
|
||||||
from onnxruntime.quantization import quantize_dynamic, QuantType
|
from onnxruntime.quantization import quantize_dynamic, QuantType
|
||||||
from onnxruntime.quantization.shape_inference import quant_pre_process
|
from onnxruntime.quantization.shape_inference import quant_pre_process
|
||||||
|
|
||||||
|
ort.set_default_logger_severity(3)
|
||||||
|
OPS = ["Conv", "Gemm", "MatMul"]
|
||||||
|
|
||||||
def quantize_model(input_path: Path, output_path: Path):
|
|
||||||
# Create temporary path for the pre-processed model
|
|
||||||
preprocessed_path = input_path.with_name(f"{input_path.stem}_preprocessed.onnx")
|
|
||||||
|
|
||||||
print(f"Pre-processing {input_path.name}...")
|
def has_external(path):
|
||||||
|
m = onnx.load(str(path), load_external_data=False)
|
||||||
|
return any(t.data_location == onnx.TensorProto.EXTERNAL for t in m.graph.initializer)
|
||||||
|
|
||||||
|
|
||||||
|
def grouped_or_nonconst_convs(path):
|
||||||
|
m = onnx.load(str(path), load_external_data=False)
|
||||||
|
inits = {i.name for i in m.graph.initializer}
|
||||||
|
bad = []
|
||||||
|
for n in m.graph.node:
|
||||||
|
if n.op_type != "Conv":
|
||||||
|
continue
|
||||||
|
group = next((a.i for a in n.attribute if a.name == "group"), 1)
|
||||||
|
w_const = len(n.input) > 1 and n.input[1] in inits
|
||||||
|
if group > 1 or not w_const:
|
||||||
|
bad.append(n.name)
|
||||||
|
return bad
|
||||||
|
|
||||||
|
|
||||||
|
def quantize_one(path, weight_type, reduce_range):
|
||||||
|
stem = path.stem
|
||||||
|
out = path.with_name(f"{stem}_quant.onnx")
|
||||||
|
pre = path.with_name(f"{stem}_pre.onnx")
|
||||||
|
target = path
|
||||||
try:
|
try:
|
||||||
quant_pre_process(str(input_path), str(preprocessed_path))
|
quant_pre_process(str(path), str(pre), skip_optimization=False,
|
||||||
target_input = preprocessed_path
|
skip_onnx_shape=False, skip_symbolic_shape=False, auto_merge=True)
|
||||||
except Exception as e:
|
target = pre
|
||||||
print(f"Pre-processing skipped or failed: {e}")
|
except Exception as e1:
|
||||||
target_input = input_path
|
try:
|
||||||
|
quant_pre_process(str(path), str(pre), skip_optimization=False,
|
||||||
|
skip_onnx_shape=False, skip_symbolic_shape=True)
|
||||||
|
target = pre
|
||||||
|
print(" preprocess: symbolic shape skipped")
|
||||||
|
except Exception as e2:
|
||||||
|
print(f" preprocess failed, quantizing raw: {e2}")
|
||||||
|
|
||||||
print(f"Quantizing {target_input.name}...")
|
exclude = grouped_or_nonconst_convs(target) if stem == "ssl" else []
|
||||||
try:
|
if exclude:
|
||||||
quantize_dynamic(
|
print(f" excluding {len(exclude)} grouped/non-const conv(s)")
|
||||||
model_input=str(target_input),
|
|
||||||
model_output=str(output_path),
|
quantize_dynamic(
|
||||||
weight_type=QuantType.QUInt8,
|
model_input=str(target), model_output=str(out),
|
||||||
# Limit quantization to MatMul. This bypasses the Conv layers
|
weight_type=weight_type, op_types_to_quantize=OPS,
|
||||||
# that cause weight initialization errors, while still optimizing
|
nodes_to_exclude=exclude, reduce_range=reduce_range,
|
||||||
# the heavy transformer layers.
|
)
|
||||||
op_types_to_quantize=["MatMul"]
|
pre.unlink(missing_ok=True)
|
||||||
)
|
b = out.stat().st_size
|
||||||
print(f"Quantization complete: {output_path}")
|
if has_external(path):
|
||||||
finally:
|
print(f" {path.name} -> {out.name} {b/1e6:.3g} MB int8 self-contained (fp32 weights were external)")
|
||||||
# Clean up temporary preprocessed file if it was created
|
else:
|
||||||
if preprocessed_path.exists() and preprocessed_path != input_path:
|
a = path.stat().st_size
|
||||||
preprocessed_path.unlink()
|
print(f" {path.name} -> {out.name} {a/1e6:.3g} -> {b/1e6:.3g} MB ({100*(1-b/a):.0f}% smaller)")
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def feeds_for(sess, meta, rng):
|
||||||
|
feeds = {}
|
||||||
|
for inp in sess.get_inputs():
|
||||||
|
dt = np.int64 if "int64" in inp.type else (np.int32 if "int32" in inp.type else np.float32)
|
||||||
|
shape = [d if isinstance(d, int) and d > 0
|
||||||
|
else (1 if ax == 0 and len(inp.shape) >= 2 else meta.get("enc_ssl_frames", 100))
|
||||||
|
for ax, d in enumerate(inp.shape)]
|
||||||
|
n = inp.name.lower()
|
||||||
|
if np.issubdtype(dt, np.integer):
|
||||||
|
feeds[inp.name] = np.zeros(shape, dtype=dt)
|
||||||
|
else:
|
||||||
|
a = rng.standard_normal(shape).astype(np.float32)
|
||||||
|
feeds[inp.name] = (np.abs(a) + 0.5) if "std" in n else (a * 0.0 if "mean" in n else a)
|
||||||
|
return feeds
|
||||||
|
|
||||||
|
|
||||||
|
def check(fp32, quant, meta):
|
||||||
|
rng = np.random.default_rng(0)
|
||||||
|
s0 = ort.InferenceSession(str(fp32), providers=["CPUExecutionProvider"])
|
||||||
|
s1 = ort.InferenceSession(str(quant), providers=["CPUExecutionProvider"])
|
||||||
|
feeds = feeds_for(s0, meta, rng)
|
||||||
|
out = [o.name for o in s0.get_outputs()]
|
||||||
|
r0 = s0.run(out, feeds)
|
||||||
|
r1 = s1.run(out, feeds)
|
||||||
|
for name, a, b in zip(out, r0, r1):
|
||||||
|
if np.issubdtype(a.dtype, np.integer):
|
||||||
|
print(f" {name}: {100*(a != b).mean():.2f}% tokens changed")
|
||||||
|
else:
|
||||||
|
d = np.abs(a - b)
|
||||||
|
print(f" {name}: max|d|={d.max():.3g} mean|d|={d.mean():.3g}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_real(d, meta, source, target):
|
||||||
|
import infer
|
||||||
|
a = argparse.Namespace(ssl=str(d / "ssl.onnx"), encode=str(d / "encode.onnx"),
|
||||||
|
decode=str(d / "decode.onnx"), global_path=str(d / "global.onnx"),
|
||||||
|
cuda=False)
|
||||||
|
vc = infer.Infer(a, meta)
|
||||||
|
sr16 = meta["ssl_sample_rate"]
|
||||||
|
src16 = infer.load_16k(source, sr16)
|
||||||
|
mean, std, _ = vc.calibrate(src16)
|
||||||
|
qs = {n: ort.InferenceSession(str(d / f"{n}_quant.onnx"), providers=["CPUExecutionProvider"])
|
||||||
|
for n in ["ssl", "encode", "decode", "global"] if (d / f"{n}_quant.onnx").exists()}
|
||||||
|
|
||||||
|
keep, win = next(vc._windows(src16))
|
||||||
|
win1 = infer.take(win, 0, vc.ssl_in).reshape(1, -1)
|
||||||
|
local_real = vc._ssl(win)[0]
|
||||||
|
if "ssl" in qs:
|
||||||
|
l1, g1 = qs["ssl"].run(["local_features", "global_features"], {"audio_16k": win1})
|
||||||
|
l0, g0 = vc.ssl.run(["local_features", "global_features"], {"audio_16k": win1})
|
||||||
|
print(f" ssl local max|d|={np.abs(l0 - l1).max():.3g} global max|d|={np.abs(g0 - g1).max():.3g}")
|
||||||
|
|
||||||
|
if "encode" in qs:
|
||||||
|
feed = {"local_ssl_features": local_real, "mean": mean, "std": std}
|
||||||
|
t0 = vc.enc.run(["content_token_indices"], feed)[0]
|
||||||
|
t1 = qs["encode"].run(["content_token_indices"], feed)[0]
|
||||||
|
k = slice(vc.enc_left, vc.enc_left + keep)
|
||||||
|
print(f" encode tokens (real, center {keep}): {100 * (t0[k] == t1[k]).mean():.1f}% agree")
|
||||||
|
|
||||||
|
if "decode" in qs and target:
|
||||||
|
emb = vc.embed(infer.load_16k(target, sr16))
|
||||||
|
toks = vc.tokens(src16, mean, std)
|
||||||
|
lo = vc.dec_left
|
||||||
|
w = toks[np.clip(np.arange(lo, lo + vc.dec_tokens), 0, len(toks) - 1)].astype(np.int64)
|
||||||
|
feed = {"content_token_indices": w, "global_embedding": emb}
|
||||||
|
r0 = vc.dec.run(["spec_real", "spec_imag"], feed)
|
||||||
|
r1 = qs["decode"].run(["spec_real", "spec_imag"], feed)
|
||||||
|
print(f" decode spec_real max|d|={np.abs(r0[0] - r1[0]).max():.3g} "
|
||||||
|
f"spec_imag max|d|={np.abs(r0[1] - r1[1]).max():.3g}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
p = argparse.ArgumentParser()
|
p = argparse.ArgumentParser()
|
||||||
p.add_argument("--model", required=True, help="Path to the ONNX model to quantize")
|
p.add_argument("--dir", default="outputs")
|
||||||
|
p.add_argument("--models", nargs="*", default=["ssl", "encode", "decode", "global"])
|
||||||
|
p.add_argument("--weight-type", choices=["int8", "uint8"], default="int8")
|
||||||
|
p.add_argument("--no-reduce-range", action="store_true")
|
||||||
|
p.add_argument("--check", action="store_true")
|
||||||
|
p.add_argument("--source")
|
||||||
|
p.add_argument("--target")
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
in_path = Path(args.model)
|
d = Path(args.dir)
|
||||||
out_path = in_path.with_name(f"{in_path.stem}_quant.onnx")
|
wt = QuantType.QInt8 if args.weight_type == "int8" else QuantType.QUInt8
|
||||||
quantize_model(in_path, out_path)
|
meta = json.loads((d / "meta.json").read_text()) if (d / "meta.json").exists() else {}
|
||||||
|
|
||||||
|
for name in args.models:
|
||||||
|
f = d / f"{name}.onnx"
|
||||||
|
if not f.exists():
|
||||||
|
continue
|
||||||
|
print(f"{name}:")
|
||||||
|
q = quantize_one(f, wt, not args.no_reduce_range)
|
||||||
|
if args.check:
|
||||||
|
check(f, q, meta)
|
||||||
|
|
||||||
|
if args.source:
|
||||||
|
print("real-audio check:")
|
||||||
|
check_real(d, meta, args.source, args.target)
|
||||||
@@ -201,32 +201,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
|
{ url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "dovc"
|
|
||||||
version = "0.1.0"
|
|
||||||
source = { virtual = "." }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "miocodec" },
|
|
||||||
{ name = "numpy" },
|
|
||||||
{ name = "onnxruntime" },
|
|
||||||
{ name = "onnxruntime-gpu" },
|
|
||||||
{ name = "onnxscript" },
|
|
||||||
{ name = "sounddevice" },
|
|
||||||
{ name = "torch", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
|
|
||||||
{ name = "torch", version = "2.12.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[package.metadata]
|
|
||||||
requires-dist = [
|
|
||||||
{ name = "miocodec", git = "https://github.com/Aratako/MioCodec" },
|
|
||||||
{ name = "numpy", specifier = ">=2.4.6" },
|
|
||||||
{ name = "onnxruntime", specifier = ">=1.26.0" },
|
|
||||||
{ name = "onnxruntime-gpu", specifier = ">=1.26.0" },
|
|
||||||
{ name = "onnxscript", specifier = ">=0.7.0" },
|
|
||||||
{ name = "sounddevice", specifier = ">=0.5.5" },
|
|
||||||
{ name = "torch", specifier = ">=2.11.0" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "einops"
|
name = "einops"
|
||||||
version = "0.8.2"
|
version = "0.8.2"
|
||||||
@@ -523,6 +497,29 @@ dependencies = [
|
|||||||
{ name = "torchaudio", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
|
{ name = "torchaudio", version = "2.11.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mioonnx"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = { virtual = "." }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "miocodec" },
|
||||||
|
{ name = "numpy" },
|
||||||
|
{ name = "onnxruntime-gpu" },
|
||||||
|
{ name = "onnxscript" },
|
||||||
|
{ name = "sounddevice" },
|
||||||
|
{ name = "soundfile" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.metadata]
|
||||||
|
requires-dist = [
|
||||||
|
{ name = "miocodec", git = "https://github.com/Aratako/MioCodec" },
|
||||||
|
{ name = "numpy", specifier = ">=2.4.6" },
|
||||||
|
{ name = "onnxruntime-gpu", specifier = ">=1.26.0" },
|
||||||
|
{ name = "onnxscript", specifier = ">=0.7.0" },
|
||||||
|
{ name = "sounddevice", specifier = ">=0.5.5" },
|
||||||
|
{ name = "soundfile", specifier = ">=0.13.1" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ml-dtypes"
|
name = "ml-dtypes"
|
||||||
version = "0.5.4"
|
version = "0.5.4"
|
||||||
@@ -833,38 +830,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/8c/aa/f7a53321c60b9ad9ee184b6018292ed6b5389947592a2c8c09c736bb7f9e/onnx_ir-0.2.1-py3-none-any.whl", hash = "sha256:c7285da889312f91882de2092e298a9eeeefbfc1d1951c49d983992967eb09a7", size = 166792, upload-time = "2026-04-20T20:21:46.357Z" },
|
{ url = "https://files.pythonhosted.org/packages/8c/aa/f7a53321c60b9ad9ee184b6018292ed6b5389947592a2c8c09c736bb7f9e/onnx_ir-0.2.1-py3-none-any.whl", hash = "sha256:c7285da889312f91882de2092e298a9eeeefbfc1d1951c49d983992967eb09a7", size = 166792, upload-time = "2026-04-20T20:21:46.357Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "onnxruntime"
|
|
||||||
version = "1.26.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "flatbuffers" },
|
|
||||||
{ name = "numpy" },
|
|
||||||
{ name = "packaging" },
|
|
||||||
{ name = "protobuf" },
|
|
||||||
]
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/81/b1/d111b1df656761f980d9e298a60039a9cb66036b1d039e777537743d0ac3/onnxruntime-1.26.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05b028781b322ad74b57ce5b50aa5280bb1fe96ceec334628ade681e0b24c1ac", size = 18016624, upload-time = "2026-05-12T00:41:01.735Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/f6/a0/3f9d896a0385a36bd04345d6d0b802821a5782adde562e7e135f6bb71c73/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91f2bb870a4b9224eba0a6728c1fa7a9e552b8e59e1083c51fbbc3d013f2b5c0", size = 16052692, upload-time = "2026-05-08T19:07:13.829Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/7c/43/2a4e04f8dbeffad19bbcced4bcd4289bf478921518437404d6b92bdf213b/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b6dd70599005bd1bf29779f04a91978b92b5e719c11a20068a8f8e535f725b6", size = 18185439, upload-time = "2026-05-08T19:07:36.299Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/44/fc/026d0a7162b9c2153dac292baea9e027c42304dc1d9dc6f8ff5b4cfbaedd/onnxruntime-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:a26374dc7fbcaae593601086b242120e13f2310558df0991da6dd8b8fac00414", size = 13026427, upload-time = "2026-05-08T19:08:03.503Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/3e/27/1dcf88e45e4c69db5f7b106f2dacc3801ba98994e082ca03e1dfdf7bfe57/onnxruntime-1.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:54a8053410fd31fd66469bd754fcfe8a4df9f7eb44756b4b5479bf50c842d948", size = 12796647, upload-time = "2026-05-08T19:07:52.108Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/cf/a2/c801242685e0ce48a4ca51dfafbb588765e0446397e123be53ba5598f3f5/onnxruntime-1.26.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccce19c5f771b8268902f77d9fed9e88f9499465d6780808faa6611a789d33f0", size = 18016563, upload-time = "2026-05-08T19:07:28.081Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/e2/64/0492c0b1db04e29b2630c87cfa36f9d6872b1ca8614b90c5cad58fac7d76/onnxruntime-1.26.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdbed8cf3b672b66acb032f33a253bc27f42bce6ece48ae3fab4fa483a5e96e0", size = 16052634, upload-time = "2026-05-08T19:07:16.885Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/3d/26/4d09ddc755a84fc8d5e192991626b0e0680e8f6c5d58f4f1d05c42bc48cf/onnxruntime-1.26.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c07af6fc6d5557835f2b6ee7a96d8b3235d0c57a8e230efdedaee106a8a3cbc6", size = 18185632, upload-time = "2026-05-08T19:07:38.756Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/77/89/3e52249aa08fa301e217ecba07b5246a8338fa2b401e109326e3fc5be0f9/onnxruntime-1.26.0-cp313-cp313-win_amd64.whl", hash = "sha256:61bec80655efa460591c2bc655392d57d2650ce85533a6b9b3b7a790d7ea7916", size = 13026751, upload-time = "2026-05-08T19:08:06.2Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/06/b3/c1c8782b14af6797c303de132d6eef26a9fb80dfacd3750ce57911d11c6b/onnxruntime-1.26.0-cp313-cp313-win_arm64.whl", hash = "sha256:a6677545ff451e3539a02746d2f207d8c5baa4a0a818886bb9d6a6eb9511ee89", size = 12796807, upload-time = "2026-05-08T19:07:54.879Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c3/f5/47b0676408abec652c14b84d7173e389837832d850c24f87184277313e8d/onnxruntime-1.26.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e016edc15d3c19f36807e1c6b10be5b27807688c32720f91b5ae480a95215d0", size = 16057265, upload-time = "2026-05-08T19:07:19.603Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/3b/45/33ab6deeef010ca844c877dd618cebc079590bbe52d2a3678e7223b1b908/onnxruntime-1.26.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5fc48a91a046a6a5c9b147f83fb41d65d24d24923373b222cdd248f0f4f4aac", size = 18197590, upload-time = "2026-05-08T19:07:41.422Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/40/89/17546c1c20f6bfc3ae41c22152378a26edfea918af3129e2139dcd7c99f3/onnxruntime-1.26.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:33a791f31432a3af1a96db5e54818b37aba5e5eefc2e6af5794c10a9118a9993", size = 18019724, upload-time = "2026-05-08T19:07:30.723Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/bb/24/89457a35f6af29538a76647f2c18c3a28277e6c19234c847e7b4b7c19860/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e90c00732c4553618103149d93f688e8c3063017938f8983e21a71d9f3b6d22e", size = 16054821, upload-time = "2026-05-08T19:07:22.348Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/12/f9/15b2e1815cf570d238e0135529f80d2dce64e8e8818a1489cae83823c5c6/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01498e80ba8988428d08c2d51b1338f89e3de2a93e6ffe555f79c68f26a5c06b", size = 18185815, upload-time = "2026-05-08T19:07:44.179Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/d7/65/2e11055faf015e4b07f45b513fa49b391baf2e19d92d77d73ebee13c1004/onnxruntime-1.26.0-cp314-cp314-win_amd64.whl", hash = "sha256:7ead61450d8405167c87dd3a31d8da1d576b490a57dab1aa8b82a7da6825f5aa", size = 13349887, upload-time = "2026-05-08T19:08:08.671Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/19/e4/0f9d1a5718b1781c610c1e354765a3820597081754277a6a9a2b50705702/onnxruntime-1.26.0-cp314-cp314-win_arm64.whl", hash = "sha256:31d71a53490e46910877d0902b5ad99c69a5955e5c7ea6c82863519410e1ba7c", size = 13140121, upload-time = "2026-05-08T19:07:57.804Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/1c/42/3b8e635f067d06d9f45bede470b8d539d101a4166c272213158dfd08b6ce/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b6d258fb78fdfcf049795bcfaa74dcb90ae7baa277afd21e6fd28b83f2c496", size = 16057240, upload-time = "2026-05-08T19:07:25.163Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/93/99/f2be40a31b908d96b861ae0ce98582fa376c18a7f816b9d5eb4cd6aa0a4c/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4eefd386a45202aefb7a5132b94f32df9d506c9edcc7faf2fc60d65183f4b183", size = 18197382, upload-time = "2026-05-08T19:07:46.965Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "onnxruntime-gpu"
|
name = "onnxruntime-gpu"
|
||||||
version = "1.26.0"
|
version = "1.26.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user