Spaces: Build error

zejunyang committed · Commit 3e99418
1 Parent(s): fa7d98a

update

Browse files:
- app.py (+39 -35)
- src/utils/crop_face_single.py (+31 -21)
- src/utils/frame_interpolation.py (+17 -38)
app.py
CHANGED

@@ -98,10 +98,11 @@ vis = FaceMeshVisualizer()
 
 frame_inter_model = init_frame_interpolation_model()
 
-@spaces.GPU(duration=
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=
+@spaces.GPU(duration=300)
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 

@@ -161,8 +162,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     # [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L,
-    for pose_image_np in pose_images[: args_L :
+    args_L = min(args_L, 150)
+    for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
         pose_image_np = cv2.resize(pose_image_np, (width, height))

@@ -183,19 +184,21 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         cfg,
         generator=generator,
     ).videos
 
-
-
-
-
-
-
-
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
+
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=fps,
+    )
+
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
+
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
 
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)

@@ -204,9 +207,10 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
     return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil
 
-@spaces.GPU(duration=
-def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
+@spaces.GPU(duration=300)
+def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 

@@ -248,11 +252,9 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     pose_trans_list = []
     verts_list = []
     bs_list = []
-    src_tensor_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L,
-    for src_image_pil in source_images[: args_L : step*
-        src_tensor_list.append(pose_transform(src_image_pil))
+    args_L = min(args_L, 150*step)
+    for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
         src_img_result = lmk_extractor(src_img_np)

@@ -308,19 +310,21 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
         cfg,
         generator=generator,
     ).videos
 
-
-
-
-
-
-
-
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
+
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=src_fps,
+    )
+
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
+
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
 
     audio_output = f'{save_dir}/audio_from_video.aac'
     # extract audio

@@ -353,7 +357,7 @@ description = r"""
 """
 
 tips = r"""
-
+Here is an accelerated version of AniPortrait. Due to limitations in computing power, the wait time will be quite long. Please utilize the source code to experience the full performance.
 """
 
 with gr.Blocks() as demo:

@@ -372,10 +376,10 @@ with gr.Blocks() as demo:
 
             with gr.Row():
                 a2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-                a2v_step_slider = gr.Slider(minimum=5, maximum=
+                a2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
             with gr.Row():
-                a2v_length = gr.Slider(minimum=0, maximum=
+                a2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
                 a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
             a2v_botton = gr.Button("Generate", variant="primary")

@@ -400,10 +404,10 @@ with gr.Blocks() as demo:
 
             with gr.Row():
                 v2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-                v2v_step_slider = gr.Slider(minimum=5, maximum=
+                v2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
             with gr.Row():
-                v2v_length = gr.Slider(minimum=0, maximum=
+                v2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
                 v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
             v2v_botton = gr.Button("Generate", variant="primary")
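The net effect of the app.py changes: the diffusion pipeline now denoises only every fi_step-th pose frame (with the frame count capped at 150), and the frame-interpolation model fills the fi_step-1 missing frames in each gap before the clip is written with save_videos_grid, instead of dumping PNGs and re-encoding them. Below is a minimal sketch of that subsample-then-interpolate pattern; fake_interpolate is a hypothetical stand-in for batch_images_interpolation_tool, and the shapes and frame counts are illustrative assumptions rather than code from this commit.

import torch

fi_step = 3      # the diff renders every 3rd pose frame
args_L = 60      # requested length, capped at min(length, 150) in the diff

def fake_interpolate(video, inter_frames):
    # Hypothetical stand-in for batch_images_interpolation_tool: linearly blends
    # `inter_frames` frames between each adjacent pair along the frame axis (dim=2).
    out = []
    t = video.shape[2]
    for i in range(t - 1):
        a, b = video[:, :, i], video[:, :, i + 1]
        out.append(a.unsqueeze(2))
        for k in range(1, inter_frames + 1):
            w = k / (inter_frames + 1)
            out.append(((1 - w) * a + w * b).unsqueeze(2))
    out.append(video[:, :, -1].unsqueeze(2))
    return torch.cat(out, dim=2)

generated = torch.rand(1, 3, len(range(0, args_L, fi_step)), 64, 64)  # (bs, C, T, H, W): 20 denoised frames
full = fake_interpolate(generated, inter_frames=fi_step - 1)
print(generated.shape[2], "denoised frames ->", full.shape[2], "output frames")  # 20 -> 58

Only about a third of the frames go through the expensive denoising loop, which is presumably what keeps a run inside the 300-second @spaces.GPU window.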
src/utils/crop_face_single.py
CHANGED

@@ -20,26 +20,36 @@ def crop_face(img, lmk_extractor, expand=1.5):
 
     width = x_max - x_min
     height = y_max - y_min
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if width*height >= W*H*0.15:
+        if W == H:
+            return img
+        size = min(H, W)
+        offset = int((max(H, W) - size)/2)
+        if size == H:
+            return img[:, offset:-offset]
+        else:
+            return img[offset:-offset, :]
+    else:
+        center_x = x_min + width / 2
+        center_y = y_min + height / 2
+
+        width *= expand
+        height *= expand
+
+        size = max(width, height)
+
+        x_min = int(center_x - size / 2)
+        x_max = int(center_x + size / 2)
+        y_min = int(center_y - size / 2)
+        y_max = int(center_y + size / 2)
+
+        top = max(0, -y_min)
+        bottom = max(0, y_max - img.shape[0])
+        left = max(0, -x_min)
+        right = max(0, x_max - img.shape[1])
+        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
+
+        cropped_img = img[y_min + top:y_max + top, x_min + left:x_max + left]
 
     return cropped_img
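With this change, crop_face short-circuits when the detected face box already covers at least 15% of the frame: it then just center-crops the image to a square. Otherwise it cuts a square around the face center, enlarged by expand, zero-padding with cv2.copyMakeBorder wherever the square runs past the image border. A hedged usage sketch follows; the module path is taken from the diff, while the LMKExtractor import and the BGR input convention are assumptions about the rest of the repo.

import cv2
from src.utils.crop_face_single import crop_face   # path as in the diff
from src.utils.mp_utils import LMKExtractor        # assumption: the landmark extractor used elsewhere in AniPortrait

img = cv2.imread("ref.jpg")                         # BGR frame, any aspect ratio
lmk_extractor = LMKExtractor()

face = crop_face(img, lmk_extractor, expand=1.5)
# Large face (box >= 15% of the frame): the whole image is center-cropped to a square.
# Small face: a square of side max(width, height) * expand around the face center,
# padded with black where it would extend beyond the image.
if face is not None:
    cv2.imwrite("ref_cropped.jpg", face)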
src/utils/frame_interpolation.py
CHANGED

@@ -1,37 +1,32 @@
+# Adapted from https://github.com/dajes/frame-interpolation-pytorch
 import os
 import cv2
 import numpy as np
 import torch
 import bisect
 import shutil
+import pdb
+from tqdm import tqdm
 
 def init_frame_interpolation_model():
     print("Initializing frame interpolation model")
     checkpoint_name = os.path.join("./pretrained_model/film_net_fp16.pt")
 
-    model = torch.load(checkpoint_name, map_location='cpu')
+    model = torch.jit.load(checkpoint_name, map_location='cpu')
     model.eval()
     model = model.half()
     model = model.to(device="cuda")
     return model
 
 
-def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
-
-
-
-
-    input_img_list = os.listdir(input_file)
-    input_img_list.sort()
-
-    for idx in range(len(input_img_list)-1):
-        img1 = cv2.imread(os.path.join(input_file, input_img_list[idx]))
-        img2 = cv2.imread(os.path.join(input_file, input_img_list[idx+1]))
-
-
-
-
-        image2 = torch.from_numpy(image2).unsqueeze(0).permute(0, 3, 1, 2)
+def batch_images_interpolation_tool(input_tensor, model, inter_frames=1):
+
+    video_tensor = []
+    frame_num = input_tensor.shape[2]  # bs, channel, frame, height, width
+
+    for idx in tqdm(range(frame_num-1)):
+        image1 = input_tensor[:,:,idx]
+        image2 = input_tensor[:,:,idx+1]
 
         results = [image1, image2]
 

@@ -66,25 +61,9 @@ def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
         results.insert(insert_position, prediction.clamp(0, 1).cpu().float())
         del remains[step]
 
-
-
-
-
-
-
-    final_frames = []
-    final_img_list = os.listdir(image_save_dir)
-    final_img_list.sort()
-    for item in final_img_list:
-        final_frames.append(cv2.imread(os.path.join(image_save_dir, item)))
-    w, h = final_frames[0].shape[1::-1]
-    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
-    video_save_dir = input_file + '.mp4'
-    writer = cv2.VideoWriter(video_save_dir, fourcc, fps, (w, h))
-    for frame in final_frames:
-        writer.write(frame)
-    writer.release()
-
-    shutil.rmtree(image_save_dir)
-
-    return video_save_dir
+        for sub_idx in range(len(results)-1):
+            video_tensor.append(results[sub_idx].unsqueeze(2))
+
+    video_tensor.append(input_tensor[:,:,-1].unsqueeze(2))
+    video_tensor = torch.cat(video_tensor, dim=2)
+    return video_tensor
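batch_images_interpolation_tool now takes and returns an in-memory tensor of shape (bs, channels, frames, height, width) instead of reading a directory of frames and writing an .mp4, and the fps argument disappears because the caller is now responsible for saving. A usage sketch under the assumption that ./pretrained_model/film_net_fp16.pt and a CUDA device are available; the random clip is only there to illustrate the shapes.

import torch
from src.utils.frame_interpolation import (
    init_frame_interpolation_model,
    batch_images_interpolation_tool,
)

model = init_frame_interpolation_model()   # TorchScript FiLM net, loaded in fp16 on CUDA
clip = torch.rand(1, 3, 20, 512, 512)      # (bs, C, T, H, W), e.g. the tensor returned by the diffusion pipeline
full = batch_images_interpolation_tool(clip, model, inter_frames=2)

# Every adjacent pair gains inter_frames new frames, so
# T_out = (T - 1) * (inter_frames + 1) + 1 = 19 * 3 + 1 = 58
print(full.shape)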