向量量化 - Pytorch
$ pip install vector-quantize-pytorch
import torch
from vector_quantize_pytorch import VectorQuantize
vq = VectorQuantize(
dim = 256,
codebook_size = 512, # 码本大小
decay = 0.8, # 指数移动平均衰减,值越低字典变化越快
commitment_weight = 1. # 承诺损失的权重
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x) # (1, 1024, 256), (1, 1024), (1)
import torch
from vector_quantize_pytorch import ResidualVQ
residual_vq = ResidualVQ(
dim = 256,
num_quantizers = 8, # 指定量化器的数量
codebook_size = 1024, # 码本大小
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = residual_vq(x)
print(quantized.shape, indices.shape, commit_loss.shape)
# (1, 1024, 256), (1, 1024, 8), (1, 8)
# 如果你需要所有量化层的代码,只要传递 return_all_codes = True
quantized, indices, commit_loss, all_codes = residual_vq(x, return_all_codes = True)
# (8, 1, 1024, 256)
import torch
from vector_quantize_pytorch import ResidualVQ
residual_vq = ResidualVQ(
dim = 256,
num_quantizers = 8,
codebook_size = 1024,
stochastic_sample_codes = True,
sample_codebook_temp = 0.1, # 随机采样代码的温度,0意味着非随机
shared_codebook = True # 是否共享所有量化器的码本
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = residual_vq(x)
# (1, 1024, 256), (1, 1024, 8), (1, 8)
import torch
from vector_quantize_pytorch import GroupedResidualVQ
residual_vq = GroupedResidualVQ(
dim = 256,
num_quantizers = 8, # 指定量化器的数量
groups = 2,
codebook_size = 1024, # 码本大小
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = residual_vq(x)
# (1, 1024, 256), (2, 1, 1024, 8), (2, 1, 8)
只需在类中设置标志 kmeans_init = True,即可在初始化时使用 k-means 质心来初始化码本。
import torch
from vector_quantize_pytorch import ResidualVQ
residual_vq = ResidualVQ(
dim = 256,
codebook_size = 256,
num_quantizers = 4,
kmeans_init = True, # 设置为True
kmeans_iters = 10 # 初始化时计算码本质心的kmeans迭代次数
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = residual_vq(x)
# (1, 1024, 256), (1, 1024, 4), (1, 4)
import torch
from vector_quantize_pytorch import VectorQuantize
vq = VectorQuantize(
dim = 256,
codebook_size = 256,
codebook_dim = 16 # 论文建议设置为32或低至8以增加码本使用率
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x)
# (1, 1024, 256), (1, 1024), (1,)
改进VQGAN论文还提出对代码和编码向量进行l2归一化,这相当于使用余弦相似度来计算距离。他们声称将向量强制约束在一个球面上有助于提升码本使用率和下游重建效果。你可以通过设置 use_cosine_sim = True 来启用这一功能。
import torch
from vector_quantize_pytorch import VectorQuantize
vq = VectorQuantize(
dim = 256,
codebook_size = 256,
use_cosine_sim = True # 设置为True
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x)
# (1, 1024, 256), (1, 1024), (1,)
import torch
from vector_quantize_pytorch import VectorQuantize
vq = VectorQuantize(
dim = 256,
codebook_size = 512,
threshold_ema_dead_code = 2 # 应该积极替换指数移动平均簇大小小于2的代码
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = vq(x)
# (1, 1024, 256), (1, 1024), (1,)
import torch
from vector_quantize_pytorch import VectorQuantize
vq = VectorQuantize(
dim = 256,
codebook_size = 256,
accept_image_fmap = True, # 设置为True以能够传递图像特征图
orthogonal_reg_weight = 10, # 论文中建议值为10
orthogonal_reg_max_codes = 128, # 这将随机从码本中采样以用于正交正则化损失,以限制内存使用
orthogonal_reg_active_codes_only = False # 如果你有一个非常大的码本,并且只希望对每批激活的代码强制该损失,则设置为True
img_fmap = torch.randn(1, 256, 32, 32)
quantized, indices, loss = vq(img_fmap) # (1, 256, 32, 32), (1, 32, 32), (1,)
# 损失现在包含正交正则化损失及其分配的权重
import torch
from vector_quantize_pytorch import VectorQuantize
vq = VectorQuantize(
dim = 256,
codebook_dim = 32, # 许多论文显示较小的码本维度是可接受的
heads = 8, # 向量量化的头数量,共享码本
separate_codebook_per_head = True, # 是否为每个头设置单独的码本。False意味着共享一个码本
codebook_size = 8196,
accept_image_fmap = True
img_fmap = torch.randn(1, 256, 32, 32)
quantized, indices, loss = vq(img_fmap)
# (1, 256, 32, 32), (1, 32, 32, 8), (1,)
import torch
from vector_quantize_pytorch import RandomProjectionQuantizer
quantizer = RandomProjectionQuantizer(
dim = 512, # 输入维度
num_codebooks = 16, # 在USM中,他们使用了最多16个码本以获得5%的提升
codebook_dim = 256, # 码本维度
codebook_size = 1024 # 码本大小
x = torch.randn(1, 1024, 512)
indices = quantizer(x)
# (1, 1024, 16)
为 True | False
| | VQ | FSQ |
| --- | --- | --- |
| 量化 | $\arg\min_c \lVert z - c \rVert$ | round(f(z)) |
| 梯度 | 直通估计器 (STE) | STE |
| 辅助损失 | 承诺、码本、熵损失等 | N/A |
| 技巧 | 码本的EMA,码本拆分,投影等 | N/A |
| 参数 | 码本 | N/A |
这项来自Google DeepMind的工作旨在极大地简化用于生成建模的矢量量化方法,无需承诺损失和码本的EMA更新,同时解决码本崩溃或利用率不足的问题。他们的做法是将每个标量直接舍入到离散级别,并使用直通梯度(straight-through gradients)进行反向传播;代码变为超立方体中的均匀点。
import torch
from vector_quantize_pytorch import FSQ
quantizer = FSQ(
levels = [8, 5, 5, 5]
x = torch.randn(1, 1024, 4) # 4是因为有4个级别
xhat, indices = quantizer(x)
# (1, 1024, 4), (1, 1024)
assert torch.all(xhat == quantizer.indices_to_codes(indices))
一个改进的Residual FSQ,用于尝试改进音频编码。
import torch
from vector_quantize_pytorch import ResidualFSQ
residual_fsq = ResidualFSQ(
dim = 256,
levels = [8, 5, 5, 3],
num_quantizers = 8
x = torch.randn(1, 1024, 256)
quantized, indices = residual_fsq(x)
# (1, 1024, 256), (1, 1024, 8)
quantized_out = residual_fsq.get_output_from_indices(indices)
# (1, 1024, 256)
assert torch.all(quantized == quantized_out)
该论文提出了一种使用独立二进制潜变量的简单量化器LFQ。LFQ也存在其他实现。不过,该团队表明,采用LFQ的MAGVIT-v2在ImageNet基准上有显著提升。LFQ与2级FSQ之间的差异在于:LFQ包含熵正则化,并且保留了承诺损失。
您可以按如下方式简单使用。在MagViT2 pytorch port上试用。
import torch
from vector_quantize_pytorch import LFQ
# 您可以指定dim或codebook_size
# 如果两者都指定,将相互验证
quantizer = LFQ(
codebook_size = 65536, # 码本大小,必须为2的幂
dim = 16, # 输入特征尺寸,默认为log2(码本大小)如果未定义
entropy_loss_weight = 0.1, # 熵损失的权重
diversity_gamma = 1. # 熵损失中代码多样性的权重,来源于https://arxiv.org/abs/1911.05894
image_feats = torch.randn(1, 16, 32, 32)
quantized, indices, entropy_aux_loss = quantizer(image_feats, inv_temperature=100.) # 您可能需要尝试温度
# (1, 16, 32, 32), (1, 32, 32), ()
assert (quantized == quantizer.indices_to_codes(indices)).all()
您还可以传入视频特征,格式为 (batch, feat, time, height, width)
或序列,格式为(batch, seq, feat)
import torch
from vector_quantize_pytorch import LFQ
quantizer = LFQ(
codebook_size = 65536,
dim = 16,
entropy_loss_weight = 0.1,
diversity_gamma = 1.
seq = torch.randn(1, 32, 16)
quantized, *_ = quantizer(seq)
assert seq.shape == quantized.shape
video_feats = torch.randn(1, 16, 10, 32, 32)
quantized, *_ = quantizer(video_feats)
assert video_feats.shape == quantized.shape
import torch
from vector_quantize_pytorch import LFQ
quantizer = LFQ(
codebook_size = 4096,
dim = 16,
num_codebooks = 4 # 4个码本,总码本维度为log2(4096) * 4
image_feats = torch.randn(1, 16, 32, 32)
quantized, indices, entropy_aux_loss = quantizer(image_feats)
# (1, 16, 32, 32), (1, 32, 32, 4), ()
assert image_feats.shape == quantized.shape
assert (quantized == quantizer.indices_to_codes(indices)).all()
一个改进的Residual LFQ,看看是否可以改进音频压缩。
import torch
from vector_quantize_pytorch import ResidualLFQ
residual_lfq = ResidualLFQ(
dim = 256,
codebook_size = 256,
num_quantizers = 8
x = torch.randn(1, 1024, 256)
quantized, indices, commit_loss = residual_lfq(x)
# (1, 1024, 256), (1, 1024, 8), (8)
quantized_out = residual_lfq.get_output_from_indices(indices)
# (1, 1024, 256)
assert torch.all(quantized == quantized_out)
import torch
from vector_quantize_pytorch import LatentQuantize
# 您可以指定dim或codebook_size
# 如果两者都指定,将相互验证
quantizer = LatentQuantize(
levels = [5, 5, 8], # 每个码本维度的级别数
dim = 16, # 输入维度
image_feats = torch.randn(1, 16, 32, 32)
quantized, indices, loss = quantizer(image_feats)
# (1, 16, 32, 32), (1, 32, 32), ()
assert image_feats.shape == quantized.shape
assert (quantized == quantizer.indices_to_codes(indices)).all()
您还可以传入视频特征,格式为 (batch, feat, time, height, width)
或序列,格式为(batch, seq, feat)
import torch
from vector_quantize_pytorch import LatentQuantize
quantizer = LatentQuantize(
levels = [5, 5, 8],
dim = 16,
seq = torch.randn(1, 32, 16)
quantized, *_ = quantizer(seq)
# (1, 32, 16)
video_feats = torch.randn(1, 16, 10, 32, 32)
quantized, *_ = quantizer(video_feats)
# (1, 16, 10, 32, 32)
import torch
from vector_quantize_pytorch import LatentQuantize
model = LatentQuantize(
levels = [4, 8, 16],
dim = 9,
num_codebooks = 3
input_tensor = torch.randn(2, 3, dim)
output_tensor, indices, loss = model(input_tensor)
# (2, 3, 9), (2, 3, 3), ()
assert output_tensor.shape == input_tensor.shape
assert indices.shape == (2, 3, num_codebooks)
assert loss.item() >= 0
title = {Neural Discrete Representation Learning},
author = {Aaron van den Oord and Oriol Vinyals and Koray Kavukcuoglu},
year = {2018},
eprint = {1711.00937},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
title = {SoundStream: An End-to-End Neural Audio Codec},
author = {Neil Zeghidour and Alejandro Luebs and Ahmed Omran and Jan Skoglund and Marco Tagliasacchi},
year = {2021},
eprint = {2107.03312},
archivePrefix = {arXiv},
primaryClass = {cs.SD}
title = {Vector-quantized Image Modeling with Improved {VQGAN}},
author = {Anonymous},
booktitle = {Submitted to The Tenth International Conference on Learning Representations },
year = {2022},
url = {https://openreview.net/forum?id=pfNyExj7z2},
note = {under review}
title={Autoregressive Image Generation using Residual Quantization},
author={Lee, Doyup and Kim, Chiheon and Kim, Saehoon and Cho, Minsu and Han, Wook-Shin},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
title = {High Fidelity Neural Audio Compression},
author = {Alexandre D{\'e}fossez and Jade Copet and Gabriel Synnaeve and Yossi Adi},
journal = {ArXiv},
year = {2022},
volume = {abs/2210.13438}
title = {Self-supervised Learning with Random-projection Quantizer for Speech Recognition},
author = {Chung-Cheng Chiu and James Qin and Yu Zhang and Jiahui Yu and Yonghui Wu},
booktitle = {International Conference on Machine Learning},
year = {2022}
title = {Google USM: Scaling Automatic Speech Recognition Beyond 100 Languages},
author = {Yu Zhang and Wei Han and James Qin and Yongqiang Wang and Ankur Bapna and Zhehuai Chen and Nanxin Chen and Bo Li and Vera Axelrod and Gary Wang and Zhong Meng and Ke Hu and Andrew Rosenberg and Rohit Prabhavalkar and Daniel S. Park and Parisa Haghani and Jason Riesa and Ginger Perng and Hagen Soltau and Trevor Strohman and Bhuvana Ramabhadran and Tara N. Sainath and Pedro J. Moreno and Chung-Cheng Chiu and Johan Schalkwyk and Fran{\c{c}}oise Beaufays and Yonghui Wu},
year = {2023}
title = {NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers},
author = {Kai Shen and Zeqian Ju and Xu Tan and Yanqing Liu and Yichong Leng and Lei He and Tao Qin and Sheng Zhao and Jiang Bian},
year = {2023}
title = {HiFi-Codec: Group-residual Vector quantization for High Fidelity Audio Codec},
author = {Dongchao Yang and Songxiang Liu and Rongjie Huang and Jinchuan Tian and Chao Weng and Yuexian Zou},
year = {2023}
title = {Bridging Discrete and Backpropagation: Straight-Through and Beyond},
author = {Liyuan Liu and Chengyu Dong and Xiaodong Liu and Bin Yu and Jianfeng Gao},
journal = {ArXiv},
year = {2023},
volume = {abs/2304.08612}
title = {Straightening Out the Straight-Through Estimator: Overcoming Optimization Challenges in Vector Quantized Networks},
author = {Huh, Minyoung and Cheung, Brian and Agrawal, Pulkit and Isola, Phillip},
booktitle = {International Conference on Machine Learning},
year = {2023},
organization = {PMLR}
title = {Einops: Clear and Reliable Tensor Manipulations with Einstein-like Notation},
author = {Alex Rogozhnikov},
booktitle = {International Conference on Learning Representations},
year = {2022},
url = {https://openreview.net/forum?id=oapKSVM2bcj}
title = {Translation-equivariant Image Quantizer for Bi-directional Image-Text Generation},
author = {Woncheol Shin and Gyubok Lee and Jiyoung Lee and Joonseok Lee and Edward Choi},
year = {2021},
eprint = {2112.00384},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
title = {Finite Scalar Quantization: VQ-VAE Made Simple},
author = {Fabian Mentzer and David Minnen and Eirikur Agustsson and Michael Tschannen},
year = {2023},
eprint = {2309.15505},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
title = {Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation},
author = {Lijun Yu and José Lezama and Nitesh B. Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Agrim Gupta and Xiuye Gu and Alexander G. Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A. Ross and Lu Jiang},
year = {2023},
eprint = {2310.05737},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
title = {Image and Video Tokenization with Binary Spherical Quantization},
author = {Yue Zhao and Yuanjun Xiong and Philipp Kr{\"a}henb{\"u}hl},
year = {2024},
url = {https://api.semanticscholar.org/CorpusID:270380237}
title = {Disentanglement via Latent Quantization},
author = {Kyle Hsu and Will Dorrell and James C. R. Whittington and Jiajun Wu and Chelsea Finn},
year = {2023},
eprint = {2305.18378},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
title = {Self-Organising Neural Discrete Representation Learning \`a la Kohonen},
author = {Kazuki Irie and R{\'o}bert Csord{\'a}s and J{\"u}rgen Schmidhuber},
year = {2023},
url = {https://api.semanticscholar.org/CorpusID:256901024}