qblog

latent deepfrying

Destroy image quality by repeatedly encoding and then decoding an image with a VAE (variational autoencoder).

example

code

import imageio
import numpy as np
import torch
from diffusers import AutoencoderKL
from PIL import Image

# Prefer the GPU, but fall back to the CPU so the script still runs (slowly)
# on machines without CUDA instead of failing at `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained autoencoder used for every encode/decode round trip below.
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(device)

# Starting frame: replace the placeholder path with a real image file.
# It is forced to 3-channel RGB and resized to 512x512.
image = Image.open("your_image_here").convert("RGB")
image = image.resize((512, 512))

def encode(image):
    """Turn an image into scaled VAE latents.

    The image is resized to 512x512, converted to a float32 array in
    [0, 1], given a leading batch axis, reordered to channel-first and
    moved to ``device``. It is then mapped to [-1, 1] and passed through
    ``vae.encode``. The result is a *sample* from the returned latent
    distribution multiplied by 0.18215, so two calls on the same image
    can produce different latents.

    ``image`` is expected to offer a PIL-style ``resize`` method.
    """
    resized = image.resize((512, 512))
    pixels = np.array(resized).astype(np.float32) / 255.0
    # (H, W, C) -> (1, H, W, C) -> (1, C, H, W)
    batch = np.transpose(np.expand_dims(pixels, 0), (0, 3, 1, 2))
    tensor = torch.from_numpy(batch).to(device)
    with torch.no_grad():
        posterior = vae.encode(2.0 * tensor - 1.0)
    # 0.18215 is presumably the Stable Diffusion latent scaling factor;
    # decode() divides by the same constant.
    sampled = posterior.latent_dist.sample()
    return 0.18215 * sampled


def decode(latents):
    """Turn scaled VAE latents back into a PIL image.

    Undoes the 0.18215 scaling applied by ``encode``, runs ``vae.decode``,
    maps the output from [-1, 1] to [0, 1] (clamping anything outside),
    converts it to channel-last uint8 and returns the first item of the
    batch as a PIL ``Image``.
    """
    unscaled = (1 / 0.18215) * latents
    with torch.no_grad():
        decoded = vae.decode(unscaled).sample
    normalized = (decoded / 2 + 0.5).clamp(0, 1)
    # (N, C, H, W) -> (N, H, W, C), then over to host memory as a NumPy array.
    channel_last = normalized.detach().permute(0, 2, 3, 1)
    pixels = channel_last.float().cpu().numpy()
    as_bytes = (pixels * 255).round().astype("uint8")
    return Image.fromarray(as_bytes[0])


# Render 100 encode/decode round trips into a 10 fps video. Each decoded
# frame is fed back in as the next input, so the degradation accumulates.
# The context manager closes the writer even if a round trip raises,
# so the file handle is not leaked.
with imageio.get_writer("./latent_deepfrying.mp4", fps=10) as writer:
    for _ in range(100):
        image = decode(encode(image))
        writer.append_data(np.array(image))