phenaki-pytorch - PyTorch实现Phenaki长视频AI生成技术

Phenaki - Pytorch

Pytorch版Phenaki Video的实现，它使用Mask GIT来生成长达2分钟的文本引导视频。它还将结合另一种涉及token critic的技术，可能会产生更好的生成效果。

如果你有兴趣在开放环境中复现这项工作，请加入

致谢

感谢Stability.ai慷慨赞助，支持我们进行前沿人工智能研究
感谢🤗 Huggingface提供出色的transformers和accelerate库
感谢Guillem持续的贡献
你？如果你是一位优秀的机器学习工程师和/或研究员，欢迎为开源生成式AI的前沿发展做出贡献

安装

$ pip install phenaki-pytorch

使用方法

C-ViViT

import torch
from phenaki_pytorch import CViViT, CViViTTrainer

cvivit = CViViT(
    dim = 512,
    codebook_size = 65536,
    image_size = 256,
    patch_size = 32,
    temporal_patch_size = 2,
    spatial_depth = 4,
    temporal_depth = 4,
    dim_head = 64,
    heads = 8
).cuda()

trainer = CViViTTrainer(
    cvivit,
    folder = '/path/to/images/or/videos',
    batch_size = 4,
    grad_accum_every = 4,
    train_on_images = False,  # 你可以先在图像上训练，然后再在视频上微调，以提高采样效率
    use_ema = False,          # 建议开启（保持cvivit的指数移动平均），除非资源不足
    num_train_steps = 10000
)

trainer.train()               # 重建结果和检查点将定期保存到./results

Phenaki

import torch
from phenaki_pytorch import CViViT, MaskGit, Phenaki

cvivit = CViViT(
    dim = 512,
    codebook_size = 65536,
    image_size = (256, 128),  # 允许矩形屏幕的视频
    patch_size = 32,
    temporal_patch_size = 2,
    spatial_depth = 4,
    temporal_depth = 4,
    dim_head = 64,
    heads = 8
)

cvivit.load('/path/to/trained/cvivit.pt')

maskgit = MaskGit(
    num_tokens = 5000,
    max_seq_len = 1024,
    dim = 512,
    dim_context = 768,
    depth = 6,
)

phenaki = Phenaki(
    cvivit = cvivit,
    maskgit = maskgit
).cuda()

videos = torch.randn(3, 3, 17, 256, 128).cuda() # (批次, 通道, 帧数, 高度, 宽度)
mask = torch.ones((3, 17)).bool().cuda() # [可选] (批次, 帧数) - 允许在同一批次中共同训练不同长度的视频以及视频和图像

texts = [
    '远处一头鲸鱼跃出水面',
    '小女孩吹灭生日蛋糕上的蜡烛',
    '蓝色和绿色火花的烟花'
]

loss = phenaki(videos, texts = texts, video_frame_mask = mask)
loss.backward()

# 重复上述步骤多次，然后...

video = phenaki.sample(texts = '一只松鼠检查一个橡子', num_frames = 17, cond_scale = 5.) # (1, 3, 17, 256, 128)

# 在论文中，他们并没有真正实现2分钟的连贯视频
# 在每个有新文本条件的新场景中，他们基于前K帧进行条件设置
# 你可以轻松地使用这个框架实现这一点，如下所示

video_prime = video[:, :, -3:] # (1, 3, 3, 256, 128) # 假设 K = 3

video_next = phenaki.sample(texts = '一只猫从远处观察松鼠', prime_frames = video_prime, num_frames = 14) # (1, 3, 14, 256, 128)

# 完整视频

entire_video = torch.cat((video, video_next), dim = 2) # (1, 3, 17 + 14, 256, 128)

# 以此类推...

或者直接导入make_video函数

# ... 上述代码

from phenaki_pytorch import make_video

entire_video, scenes = make_video(phenaki, texts = [
    '一只松鼠在雪中检查埋藏的橡子',
    '一只猫从结霜的窗台上观察松鼠',
    '镜头拉远，展示整个客厅，猫坐在窗台旁'
], num_frames = (17, 14, 14), prime_lengths = (5, 5))

entire_video.shape # (1, 3, 17 + 14 + 14 = 45, 256, 256)

# scenes - List[Tensor[3]] - 每个场景的视频片段

就是这样！

Token Critic

一篇新论文建议，与其依赖每个token的预测概率作为置信度的衡量，不如训练一个额外的评论者来决定在采样过程中迭代地掩蔽什么。你可以选择性地训练这个评论者，以获得可能更好的生成效果，如下所示：

import torch
from phenaki_pytorch import CViViT, MaskGit, TokenCritic, Phenaki

cvivit = CViViT(
    dim = 512,
    codebook_size = 65536,
    image_size = (256, 128),
    patch_size = 32,
    temporal_patch_size = 2,
    spatial_depth = 4,
    temporal_depth = 4,
    dim_head = 64,
    heads = 8
)

maskgit = MaskGit(
    num_tokens = 65536,
    max_seq_len = 1024,
    dim = 512,
    dim_context = 768,
    depth = 6,
)

# (1) 定义评论者
critic = TokenCritic(
    num_tokens = 65536,
    max_seq_len = 1024,
    dim = 512,
    dim_context = 768,
    depth = 6,
    has_cross_attn = True
)

trainer = Phenaki(
    maskgit = maskgit,
    cvivit = cvivit,
    critic = critic    # 然后(2)将其传入Phenaki
).cuda()

texts = [
    '远处一头鲸鱼跃出水面',
    '年轻女孩吹灭生日蛋糕上的蜡烛',
    '蓝色和绿色火花的烟花'
]

videos = torch.randn(3, 3, 3, 256, 128).cuda() # (批次, 通道数, 帧数, 高度, 宽度)

loss = trainer(videos = videos, texts = texts)
loss.backward()

或者更简单地,只需通过在初始化 Phenaki 时设置 self_token_critic = True 来重用 MaskGit 本身作为自我评论家 (Nijkamp et al)

phenaki = Phenaki(
    ...,
    self_token_critic= True  # 将此设置为True
)

现在你的生成效果应该会大大改善!

Phenaki 训练器

该存储库还将努力让研究人员能够先在文本到图像上训练,然后再在文本到视频上训练。同样,对于无条件训练,研究人员应该能够首先在图像上训练,然后在视频上微调。以下是文本到视频的示例

import torch
from torch.utils.data import Dataset
from phenaki_pytorch import CViViT, MaskGit, Phenaki, PhenakiTrainer

cvivit = CViViT(
    dim = 512,
    codebook_size = 65536,
    image_size = 256,
    patch_size = 32,
    temporal_patch_size = 2,
    spatial_depth = 4,
    temporal_depth = 4,
    dim_head = 64,
    heads = 8
)

cvivit.load('/path/to/trained/cvivit.pt')

maskgit = MaskGit(
    num_tokens = 5000,
    max_seq_len = 1024,
    dim = 512,
    dim_context = 768,
    depth = 6,
    unconditional = False
)

phenaki = Phenaki(
    cvivit = cvivit,
    maskgit = maskgit
).cuda()

# 模拟文本视频数据集
# 你需要扩展你自己的数据集,并返回(<视频张量>, <标题>)元组

class MockTextVideoDataset(Dataset):
    def __init__(
        self,
        length = 100,
        image_size = 256,
        num_frames = 17
    ):
        super().__init__()
        self.num_frames = num_frames
        self.image_size = image_size
        self.len = length

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        video = torch.randn(3, self.num_frames, self.image_size, self.image_size)
        caption = '视频标题'
        return video, caption

dataset = MockTextVideoDataset()

# 传入数据集

trainer = PhenakiTrainer(
    phenaki = phenaki,
    batch_size = 4,
    grad_accum_every = 4,
    train_on_images = False, # 如果你上面的模拟数据集返回(图像, 标题)对,将此设置为True
    dataset = dataset,       # 在此处传入你的数据集
    sample_texts_file_path = '/path/to/captions.txt' # 每个标题应该在新的一行,在采样期间会随机抽取
)

trainer.train()

无条件训练如下

例如无条件图像和视频训练

import torch
from phenaki_pytorch import CViViT, MaskGit, Phenaki, PhenakiTrainer

cvivit = CViViT(
    dim = 512,
    codebook_size = 65536,
    image_size = 256,
    patch_size = 32,
    temporal_patch_size = 2,
    spatial_depth = 4,
    temporal_depth = 4,
    dim_head = 64,
    heads = 8
)

cvivit.load('/path/to/trained/cvivit.pt')

maskgit = MaskGit(
    num_tokens = 5000,
    max_seq_len = 1024,
    dim = 512,
    dim_context = 768,
    depth = 6,
    unconditional = False
)

phenaki = Phenaki(
    cvivit = cvivit,
    maskgit = maskgit
).cuda()

# 传入图像或视频的文件夹

trainer = PhenakiTrainer(
    phenaki = phenaki,
    batch_size = 4,
    grad_accum_every = 4,
    train_on_images = True,                # 为了示例,底部是图像文件夹
    dataset = '/path/to/images/or/video'
)

trainer.train()

待办事项

引用

@article{Villegas2022PhenakiVL,
    title   = {Phenaki：基于开放域文本描述的可变长度视频生成},
    author  = {Ruben Villegas 和 Mohammad Babaeizadeh 和 Pieter-Jan Kindermans 和 Hernan Moraldo 和 Han Zhang 和 Mohammad Taghi Saffar 和 Santiago Castro 和 Julius Kunze 和 D. Erhan},
    journal = {ArXiv},
    year    = {2022},
    volume  = {abs/2210.02399}
}

@article{Chang2022MaskGITMG,
    title   = {MaskGIT：掩码生成图像转换器},
    author  = {Huiwen Chang 和 Han Zhang 和 Lu Jiang 和 Ce Liu 和 William T. Freeman},
    journal = {2022年IEEE/CVF计算机视觉与模式识别会议（CVPR）},
    year    = {2022},
    pages   = {11305-11315}
}

@article{Lezama2022ImprovedMI,
    title   = {基于令牌评论家的改进掩码图像生成},
    author  = {José Lezama 和 Huiwen Chang 和 Lu Jiang 和 Irfan Essa},
    journal = {ArXiv},
    year    = {2022},
    volume  = {abs/2209.04439}
}

@misc{ding2021cogview,
    title   = {CogView：通过Transformer掌控文本到图像的生成},
    author  = {Ming Ding 和 Zhuoyi Yang 和 Wenyi Hong 和 Wendi Zheng 和 Chang Zhou 和 Da Yin 和 Junyang Lin 和 Xu Zou 和 Zhou Shao 和 Hongxia Yang 和 Jie Tang},
    year    = {2021},
    eprint  = {2105.13290},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}

@misc{shazeer2020glu,
    title   = {GLU变体改进Transformer},
    author  = {Noam Shazeer},
    year    = {2020},
    url     = {https://arxiv.org/abs/2002.05202}
}

@misc{press2021ALiBi,
    title   = {短训练长测试：具有线性偏置的注意力机制实现输入长度外推},
    author  = {Ofir Press 和 Noah A. Smith 和 Mike Lewis},
    year    = {2021},
    url     = {https://ofir.io/train_short_test_long.pdf}
}

@article{Liu2022SwinTV,
    title   = {Swin Transformer V2：扩展容量和分辨率},
    author  = {Ze Liu 和 Han Hu 和 Yutong Lin 和 Zhuliang Yao 和 Zhenda Xie 和 Yixuan Wei 和 Jia Ning 和 Yue Cao 和 Zheng Zhang 和 Li Dong 和 Furu Wei 和 Baining Guo},
    journal = {2022年IEEE/CVF计算机视觉与模式识别会议（CVPR）},
    year    = {2022},
    pages   = {11999-12009}
}

@inproceedings{Nijkamp2021SCRIPTSP,
    title   = {SCRIPT：Transformer的自我评论预训练},
    author  = {Erik Nijkamp 和 Bo Pang 和 Ying Nian Wu 和 Caiming Xiong},
    booktitle = {北美计算语言学协会章程},
    year    = {2021}
}

@misc{https://doi.org/10.48550/arxiv.2302.01327,
    doi     = {10.48550/ARXIV.2302.01327},
    url     = {https://arxiv.org/abs/2302.01327},
    author  = {Kumar, Manoj 和 Dehghani, Mostafa 和 Houlsby, Neil},
    title   = {双重补丁归一化},
    publisher = {arXiv},
    year    = {2023},
    copyright = {知识共享署名4.0国际许可协议}
}

@misc{gilmer2023intriguing
    title  = {Transformer训练不稳定性的有趣特性},
    author = {Justin Gilmer, Andrea Schioppa, 和 Jeremy Cohen},
    year   = {2023},
    status = {待发表 - 一种注意力稳定技术正在Google Brain内部流传，被多个团队使用}
}

@misc{mentzer2023finite,
    title   = {有限标量量化：简化的VQ-VAE},
    author  = {Fabian Mentzer 和 David Minnen 和 Eirikur Agustsson 和 Michael Tschannen},
    year    = {2023},
    eprint  = {2309.15505},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}

@misc{yu2023language,
    title   = {语言模型胜过扩散模型 -- 分词器是视觉生成的关键},
    author  = {Lijun Yu 和 José Lezama 和 Nitesh B. Gundavarapu 和 Luca Versari 和 Kihyuk Sohn 和 David Minnen 和 Yong Cheng 和 Agrim Gupta 和 Xiuye Gu 和 Alexander G. Hauptmann 和 Boqing Gong 和 Ming-Hsuan Yang 和 Irfan Essa 和 David A. Ross 和 Lu Jiang},
    year    = {2023},
    eprint  = {2310.05737},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}