Spaces: Build error

zejunyang committed · Commit 3e99418
1 Parent(s): fa7d98a

update

Browse files:
- app.py (+39 -35)
- src/utils/crop_face_single.py (+31 -21)
- src/utils/frame_interpolation.py (+17 -38)
app.py
CHANGED

@@ -98,10 +98,11 @@ vis = FaceMeshVisualizer()
 
 frame_inter_model = init_frame_interpolation_model()
 
-@spaces.GPU(duration=
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=
+@spaces.GPU(duration=300)
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 

@@ -161,8 +162,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     # [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L,
-    for pose_image_np in pose_images[: args_L :
+    args_L = min(args_L, 150)
+    for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
         pose_image_np = cv2.resize(pose_image_np, (width, height))

@@ -183,19 +184,21 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         cfg,
         generator=generator,
     ).videos
 
-
-
-
-
-
-
-
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
+
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=fps,
+    )
+
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
+
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
 
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)

@@ -204,9 +207,10 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
     return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil
 
-@spaces.GPU(duration=
-def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
+@spaces.GPU(duration=300)
+def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 

@@ -248,11 +252,9 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     pose_trans_list = []
     verts_list = []
     bs_list = []
-    src_tensor_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L,
-    for src_image_pil in source_images[: args_L : step*
-        src_tensor_list.append(pose_transform(src_image_pil))
+    args_L = min(args_L, 150*step)
+    for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
         src_img_result = lmk_extractor(src_img_np)

@@ -308,19 +310,21 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
         cfg,
         generator=generator,
     ).videos
 
-
-
-
-
-
-
-
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
+
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=src_fps,
+    )
+
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
+
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
 
     audio_output = f'{save_dir}/audio_from_video.aac'
     # extract audio

@@ -353,7 +357,7 @@ description = r"""
 """
 
 tips = r"""
-
+Here is an accelerated version of AniPortrait. Due to limitations in computing power, the wait time will be quite long. Please utilize the source code to experience the full performance.
 """
 
 with gr.Blocks() as demo:

@@ -372,10 +376,10 @@ with gr.Blocks() as demo:
 
             with gr.Row():
                 a2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-                a2v_step_slider = gr.Slider(minimum=5, maximum=
+                a2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
             with gr.Row():
-                a2v_length = gr.Slider(minimum=0, maximum=
+                a2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
                 a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
             a2v_botton = gr.Button("Generate", variant="primary")

@@ -400,10 +404,10 @@ with gr.Blocks() as demo:
 
             with gr.Row():
                 v2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-                v2v_step_slider = gr.Slider(minimum=5, maximum=
+                v2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
             with gr.Row():
-                v2v_length = gr.Slider(minimum=0, maximum=
+                v2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
                 v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
             v2v_botton = gr.Button("Generate", variant="primary")
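The net effect of the app.py changes: the diffusion pipeline now denoises only every fi_step-th pose frame (with the frame count capped at 150), and the frame-interpolation model fills the fi_step-1 missing frames in each gap before the clip is written with save_videos_grid, instead of dumping PNGs and re-encoding them. Below is a minimal sketch of that subsample-then-interpolate pattern; fake_interpolate is a hypothetical stand-in for batch_images_interpolation_tool, and the shapes and frame counts are illustrative assumptions rather than code from this commit.

import torch

fi_step = 3      # the diff renders every 3rd pose frame
args_L = 60      # requested length, capped at min(length, 150) in the diff

def fake_interpolate(video, inter_frames):
    # Hypothetical stand-in for batch_images_interpolation_tool: linearly blends
    # `inter_frames` frames between each adjacent pair along the frame axis (dim=2).
    out = []
    t = video.shape[2]
    for i in range(t - 1):
        a, b = video[:, :, i], video[:, :, i + 1]
        out.append(a.unsqueeze(2))
        for k in range(1, inter_frames + 1):
            w = k / (inter_frames + 1)
            out.append(((1 - w) * a + w * b).unsqueeze(2))
    out.append(video[:, :, -1].unsqueeze(2))
    return torch.cat(out, dim=2)

generated = torch.rand(1, 3, len(range(0, args_L, fi_step)), 64, 64)  # (bs, C, T, H, W): 20 denoised frames
full = fake_interpolate(generated, inter_frames=fi_step - 1)
print(generated.shape[2], "denoised frames ->", full.shape[2], "output frames")  # 20 -> 58

Only about a third of the frames go through the expensive denoising loop, which is presumably what keeps a run inside the 300-second @spaces.GPU window.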
src/utils/crop_face_single.py
CHANGED

@@ -20,26 +20,36 @@ def crop_face(img, lmk_extractor, expand=1.5):
 
     width = x_max - x_min
     height = y_max - y_min
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if width*height >= W*H*0.15:
+        if W == H:
+            return img
+        size = min(H, W)
+        offset = int((max(H, W) - size)/2)
+        if size == H:
+            return img[:, offset:-offset]
+        else:
+            return img[offset:-offset, :]
+    else:
+        center_x = x_min + width / 2
+        center_y = y_min + height / 2
+
+        width *= expand
+        height *= expand
+
+        size = max(width, height)
+
+        x_min = int(center_x - size / 2)
+        x_max = int(center_x + size / 2)
+        y_min = int(center_y - size / 2)
+        y_max = int(center_y + size / 2)
+
+        top = max(0, -y_min)
+        bottom = max(0, y_max - img.shape[0])
+        left = max(0, -x_min)
+        right = max(0, x_max - img.shape[1])
+        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
+
+        cropped_img = img[y_min + top:y_max + top, x_min + left:x_max + left]
 
     return cropped_img
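With this change, crop_face short-circuits when the detected face box already covers at least 15% of the frame: it then just center-crops the image to a square. Otherwise it cuts a square around the face center, enlarged by expand, zero-padding with cv2.copyMakeBorder wherever the square runs past the image border. A hedged usage sketch follows; the module path is taken from the diff, while the LMKExtractor import and the BGR input convention are assumptions about the rest of the repo.

import cv2
from src.utils.crop_face_single import crop_face   # path as in the diff
from src.utils.mp_utils import LMKExtractor        # assumption: the landmark extractor used elsewhere in AniPortrait

img = cv2.imread("ref.jpg")                         # BGR frame, any aspect ratio
lmk_extractor = LMKExtractor()

face = crop_face(img, lmk_extractor, expand=1.5)
# Large face (box >= 15% of the frame): the whole image is center-cropped to a square.
# Small face: a square of side max(width, height) * expand around the face center,
# padded with black where it would extend beyond the image.
if face is not None:
    cv2.imwrite("ref_cropped.jpg", face)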
src/utils/frame_interpolation.py
CHANGED

@@ -1,37 +1,32 @@
+# Adapted from https://github.com/dajes/frame-interpolation-pytorch
 import os
 import cv2
 import numpy as np
 import torch
 import bisect
 import shutil
+import pdb
+from tqdm import tqdm
 
 def init_frame_interpolation_model():
     print("Initializing frame interpolation model")
     checkpoint_name = os.path.join("./pretrained_model/film_net_fp16.pt")
 
-    model = torch.load(checkpoint_name, map_location='cpu')
+    model = torch.jit.load(checkpoint_name, map_location='cpu')
     model.eval()
     model = model.half()
     model = model.to(device="cuda")
     return model
 
 
-def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
-
-
-
-
-    input_img_list = os.listdir(input_file)
-    input_img_list.sort()
-
-    for idx in range(len(input_img_list)-1):
-        img1 = cv2.imread(os.path.join(input_file, input_img_list[idx]))
-        img2 = cv2.imread(os.path.join(input_file, input_img_list[idx+1]))
-
-
-
-
-        image2 = torch.from_numpy(image2).unsqueeze(0).permute(0, 3, 1, 2)
+def batch_images_interpolation_tool(input_tensor, model, inter_frames=1):
+
+    video_tensor = []
+    frame_num = input_tensor.shape[2]  # bs, channel, frame, height, width
+
+    for idx in tqdm(range(frame_num-1)):
+        image1 = input_tensor[:,:,idx]
+        image2 = input_tensor[:,:,idx+1]
 
         results = [image1, image2]
 

@@ -66,25 +61,9 @@ def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
         results.insert(insert_position, prediction.clamp(0, 1).cpu().float())
         del remains[step]
 
-
-
-
-
-
-
-    final_frames = []
-    final_img_list = os.listdir(image_save_dir)
-    final_img_list.sort()
-    for item in final_img_list:
-        final_frames.append(cv2.imread(os.path.join(image_save_dir, item)))
-    w, h = final_frames[0].shape[1::-1]
-    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
-    video_save_dir = input_file + '.mp4'
-    writer = cv2.VideoWriter(video_save_dir, fourcc, fps, (w, h))
-    for frame in final_frames:
-        writer.write(frame)
-    writer.release()
-
-    shutil.rmtree(image_save_dir)
-
-    return video_save_dir
+        for sub_idx in range(len(results)-1):
+            video_tensor.append(results[sub_idx].unsqueeze(2))
+
+    video_tensor.append(input_tensor[:,:,-1].unsqueeze(2))
+    video_tensor = torch.cat(video_tensor, dim=2)
+    return video_tensor
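batch_images_interpolation_tool now takes and returns an in-memory tensor of shape (bs, channels, frames, height, width) instead of reading a directory of frames and writing an .mp4, and the fps argument disappears because the caller is now responsible for saving. A usage sketch under the assumption that ./pretrained_model/film_net_fp16.pt and a CUDA device are available; the random clip is only there to illustrate the shapes.

import torch
from src.utils.frame_interpolation import (
    init_frame_interpolation_model,
    batch_images_interpolation_tool,
)

model = init_frame_interpolation_model()   # TorchScript FiLM net, loaded in fp16 on CUDA
clip = torch.rand(1, 3, 20, 512, 512)      # (bs, C, T, H, W), e.g. the tensor returned by the diffusion pipeline
full = batch_images_interpolation_tool(clip, model, inter_frames=2)

# Every adjacent pair gains inter_frames new frames, so
# T_out = (T - 1) * (inter_frames + 1) + 1 = 19 * 3 + 1 = 58
print(full.shape)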