Spaces:

Luminia
/

CorridorKey

Running on Zero

Nekochu committed 20 days ago

Commit

f4a7288

1 Parent(s): 18d73cb

Postprocess at model resolution, defer resize + write to CPU (saves ~35s of GPU time)

GPU phase: run postprocessing (clean_matte + despill) at the model resolution (1024)
instead of 4K (~8x fewer pixels), then save the uint8 results to /tmp (~4MB/frame).
No 4K file encoding happens during GPU time.

After the GPU is released (CPU time is free): load the 1024-resolution frames,
resize them to the output resolution (LANCZOS4), and write all 4 outputs
(comp + fg + matte + processed).

Before: 28s inference@4K + 26s write@4K = 62s GPU
Target: 16s inference@1024 + 1s save = ~22s GPU

Files changed (1) hide show

app.py +42 -27

app.py CHANGED Viewed

@@ -539,20 +539,18 @@ def corridorkey_batch_pytorch(model, images_f32, masks_f32, img_size,
         out = model(inp)
     del inp
-    # --- GPU Postprocessing (despill + clean_matte + resize stay on device) ---
     alpha = out["alpha"].float()
     fg = out["fg"].float()
-    alpha = TF.resize(alpha, [h, w])
-    fg = TF.resize(fg, [h, w])
     if auto_despeckle:
         alpha = clean_matte_torch(alpha, area_threshold=int(despeckle_size), dilation=25, blur_size=5)
     fg = despill_torch(fg, despill_strength, screen_channel=screen_channel)
-    # --- Single CPU transfer at the end ---
-    alpha_np = alpha.cpu().numpy()
-    fg_np = fg.cpu().numpy()
     del alpha, fg
     results = []
@@ -791,29 +789,19 @@ def _gpu_phase(video_path, resolution, despill_val, mask_mode,
             logger.info("[GPU phase] done: %d frames in %.1fs (%.2fs/fr)",
                         len(all_results), gpu_elapsed, gpu_elapsed / max(len(all_results), 1))
-            from concurrent.futures import ThreadPoolExecutor
-            bg_lin = srgb_to_linear(create_checkerboard(w, h))
-            comp_dir = os.path.join(tmpdir, "Comp")
-            matte_dir = os.path.join(tmpdir, "Matte")
-            fg_dir = os.path.join(tmpdir, "FG")
-            processed_dir = os.path.join(tmpdir, "Processed")
-            for d in [comp_dir, fg_dir, matte_dir, processed_dir]:
-                os.makedirs(d, exist_ok=True)
-            t_write = time.time()
-            progress(0.86, desc="Writing preview frames...")
-            with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as pool:
-                futs = [pool.submit(_write_frame, idx, alpha, fg, w, h, bg_lin,
-                                    comp_dir, fg_dir, matte_dir, processed_dir)
-                        for idx, alpha, fg in all_results]
-                for f in futs:
-                    f.result()
             del all_results
             gc.collect()
-            logger.info("[GPU phase] Fast write in %.1fs", time.time() - t_write)
             return {
-                "results": "written", "frame_times": frame_times,
                 "use_gpu": True, "batch_size": batch_size,
                 "w": w, "h": h, "fps": fps, "tmpdir": tmpdir,
                 "screen_color": screen_color,
@@ -938,8 +926,35 @@ def process_video(video_path, resolution, despill_val, mask_mode,
     fg_dir = os.path.join(tmpdir, "FG")
     matte_dir = os.path.join(tmpdir, "Matte")
     processed_dir = os.path.join(tmpdir, "Processed")
     try:
         # Phase 3: stitch videos from written frames
         logger.info("[Phase 3] Stitching videos")
@@ -970,7 +985,7 @@ def process_video(video_path, resolution, despill_val, mask_mode,
         status = (f"Processed {n} frames ({w}x{h}) at {resolution}px | "
                   f"{avg:.2f}s/frame | {engine}" +
                   (f" batch={batch_size}" if use_gpu else "") +
-                  f" | {t_cpu:.0f}s CPU + {t_gpu:.0f}s GPU = {wall:.0f}s total" +
                   (f" | {sc} screen" if sc != "green" else ""))
         return (

         out = model(inp)
     del inp
+    # --- GPU Postprocessing at MODEL resolution (1024/2048, NOT output 4K) ---
+    # Resize to output happens on CPU after GPU release (free time)
     alpha = out["alpha"].float()
     fg = out["fg"].float()
     if auto_despeckle:
         alpha = clean_matte_torch(alpha, area_threshold=int(despeckle_size), dilation=25, blur_size=5)
     fg = despill_torch(fg, despill_strength, screen_channel=screen_channel)
+    # Transfer at model resolution (1024×1024 = 4MB/frame, not 4K = 33MB/frame)
+    alpha_np = (alpha.clamp(0, 1) * 255).byte().cpu().numpy()
+    fg_np = (fg.clamp(0, 1) * 255).byte().cpu().numpy()
     del alpha, fg
     results = []
             logger.info("[GPU phase] done: %d frames in %.1fs (%.2fs/fr)",
                         len(all_results), gpu_elapsed, gpu_elapsed / max(len(all_results), 1))
+            # Save model-resolution uint8 results to /tmp (tiny: ~4MB/frame at 1024)
+            raw_dir = os.path.join(tmpdir, "raw")
+            os.makedirs(raw_dir, exist_ok=True)
+            t_save = time.time()
+            for idx, alpha, fg in all_results:
+                np.save(os.path.join(raw_dir, f"alpha_{idx:05d}.npy"), alpha)
+                np.save(os.path.join(raw_dir, f"fg_{idx:05d}.npy"), fg)
             del all_results
             gc.collect()
+            logger.info("[GPU phase] Raw save in %.1fs", time.time() - t_save)
             return {
+                "results": "raw", "raw_dir": raw_dir, "frame_times": frame_times,
                 "use_gpu": True, "batch_size": batch_size,
                 "w": w, "h": h, "fps": fps, "tmpdir": tmpdir,
                 "screen_color": screen_color,
     fg_dir = os.path.join(tmpdir, "FG")
     matte_dir = os.path.join(tmpdir, "Matte")
     processed_dir = os.path.join(tmpdir, "Processed")
+    for d in [comp_dir, fg_dir, matte_dir, processed_dir]:
+        os.makedirs(d, exist_ok=True)
     try:
+        # Phase 2: CPU resize + write (GPU results saved at model resolution)
+        raw_dir = data.get("raw_dir")
+        if raw_dir and use_gpu:
+            from concurrent.futures import ThreadPoolExecutor
+            t_phase2 = time.time()
+            bg_lin = srgb_to_linear(create_checkerboard(w, h))
+            n_frames = len(frame_times)
+            logger.info("[Phase 2] CPU resize %d→%dx%d + write (%d frames)",
+                        int(resolution), w, h, n_frames)
+            progress(0.85, desc=f"Resizing to {w}x{h} + writing...")
+            def _resize_and_write(idx):
+                alpha_1k = np.load(os.path.join(raw_dir, f"alpha_{idx:05d}.npy"))
+                fg_1k = np.load(os.path.join(raw_dir, f"fg_{idx:05d}.npy"))
+                alpha = cv2.resize(alpha_1k, (w, h), interpolation=cv2.INTER_LANCZOS4)
+                fg = cv2.resize(fg_1k, (w, h), interpolation=cv2.INTER_LANCZOS4)
+                alpha = alpha.astype(np.float32) / 255.0
+                fg = fg.astype(np.float32) / 255.0
+                if alpha.ndim == 2:
+                    alpha = alpha[:, :, np.newaxis]
+                _write_frame(idx, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir)
+            with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as pool:
+                list(pool.map(_resize_and_write, range(n_frames)))
+            logger.info("[Phase 2] CPU write in %.1fs", time.time() - t_phase2)
         # Phase 3: stitch videos from written frames
         logger.info("[Phase 3] Stitching videos")
         status = (f"Processed {n} frames ({w}x{h}) at {resolution}px | "
                   f"{avg:.2f}s/frame | {engine}" +
                   (f" batch={batch_size}" if use_gpu else "") +
+                  f" | {t_gpu:.0f}s GPU, {wall:.0f}s total" +
                   (f" | {sc} screen" if sc != "green" else ""))
         return (