FFmpeg: Add VFR media support

Variable frame rate (VFR) files have been difficult to work with. This is because during sequential decoding, the spacing between frames is not always equal, but it was assumed to be equal. This can result in the movie getting out of sync with the sound, and in differences between the preview and the rendered image. A way to resolve these issues was to build and use timecodes, which is quite a lengthy and resource-intensive process. Such issues are also difficult to communicate through the UI, because it is not possible to predict whether timecode usage would be needed. With this patch, a double buffer is used to keep the previously decoded frame. If the current frame has a PTS greater than what we are looking for, it is not time to display it yet, and the previous frame is displayed instead. Each `AVFrame` has information about its duration, so in theory double buffering would not be needed, but in practice this information is unreliable. To ensure the double buffer is always used, function `ffmpeg_decode_video_frame_scan` is used for sequential decoding, even if no scanning is expected. This approach is similar to D6392, but this implementation does not require seeking, so it is much faster. Currently `AVFrame` is only referenced, so no data is copied and therefore no overhead is added. Note: There is one known issue where seeking fails even with double buffering: some files may seek too far in the stream and miss the requested PTS. These require preseeking or a greater negative subframe offset. Fixes: T86361, T72347. Reviewed By: zeddb, sergey. Differential Revision: https://developer.blender.org/D13583
author: Richard Antalik <richardantalik@gmail.com> 2022-06-27 17:16:21 +0300
committer: Richard Antalik <richardantalik@gmail.com> 2022-06-27 17:58:07 +0300
commit: f0a3d2beb23e127ea246f9544d68355fbf3ce438 (patch)
tree: eb36800170d80401e9e0328839c60ce6bd0c45f5 /source/blender/imbuf
parent: 6f7171525b4abd0a4c0e22dc64f4d53d3a313643 (diff)
2 files changed, 177 insertions, 105 deletions
diff --git a/source/blender/imbuf/intern/IMB_anim.h b/source/blender/imbuf/intern/IMB_anim.h
index e99572adbb0..0ac1d7bfb74 100644
--- a/source/blender/imbuf/intern/IMB_anim.h
+++ b/source/blender/imbuf/intern/IMB_anim.h
@@ -109,17 +109,22 @@ struct anim {
   AVFormatContext *pFormatCtx;
   AVCodecContext *pCodecCtx;
   const AVCodec *pCodec;
-  AVFrame *pFrame;
-  int pFrameComplete;
   AVFrame *pFrameRGB;
   AVFrame *pFrameDeinterlaced;
   struct SwsContext *img_convert_ctx;
   int videoStream;
 
+  AVFrame *pFrame;
+  bool pFrame_complete;
+  AVFrame *pFrame_backup;
+  bool pFrame_backup_complete;
+
   struct ImBuf *cur_frame_final;
   int64_t cur_pts;
   int64_t cur_key_frame_pts;
   AVPacket *cur_packet;
+
+  bool seek_before_decode;
 #endif
 
   char index_dir[768];
diff --git a/source/blender/imbuf/intern/anim_movie.c b/source/blender/imbuf/intern/anim_movie.c
index 0052ce19aa1..12ba6a0217d 100644
--- a/source/blender/imbuf/intern/anim_movie.c
+++ b/source/blender/imbuf/intern/anim_movie.c
@@ -675,7 +675,7 @@ static int startffmpeg(struct anim *anim)
   anim->orientation = 0;
   anim->framesize = anim->x * anim->y * 4;
 
-  anim->cur_position = -1;
+  anim->cur_position = 0;
   anim->cur_frame_final = 0;
   anim->cur_pts = -1;
   anim->cur_key_frame_pts = -1;
@@ -683,7 +683,9 @@ static int startffmpeg(struct anim *anim)
   anim->cur_packet->stream_index = -1;
 
   anim->pFrame = av_frame_alloc();
-  anim->pFrameComplete = false;
+  anim->pFrame_backup = av_frame_alloc();
+  anim->pFrame_backup_complete = false;
+  anim->pFrame_complete = false;
   anim->pFrameDeinterlaced = av_frame_alloc();
   anim->pFrameRGB = av_frame_alloc();
   anim->pFrameRGB->format = AV_PIX_FMT_RGBA;
@@ -698,6 +700,7 @@ static int startffmpeg(struct anim *anim)
     av_frame_free(&anim->pFrameRGB);
     av_frame_free(&anim->pFrameDeinterlaced);
     av_frame_free(&anim->pFrame);
+    av_frame_free(&anim->pFrame_backup);
     anim->pCodecCtx = NULL;
     return -1;
   }
@@ -710,6 +713,7 @@ static int startffmpeg(struct anim *anim)
     av_frame_free(&anim->pFrameRGB);
     av_frame_free(&anim->pFrameDeinterlaced);
     av_frame_free(&anim->pFrame);
+    av_frame_free(&anim->pFrame_backup);
     anim->pCodecCtx = NULL;
     return -1;
   }
@@ -747,6 +751,7 @@ static int startffmpeg(struct anim *anim)
     av_frame_free(&anim->pFrameRGB);
     av_frame_free(&anim->pFrameDeinterlaced);
     av_frame_free(&anim->pFrame);
+    av_frame_free(&anim->pFrame_backup);
     anim->pCodecCtx = NULL;
     return -1;
   }
@@ -781,22 +786,71 @@ static int startffmpeg(struct anim *anim)
   return 0;
 }
 
+static double ffmpeg_steps_per_frame_get(struct anim *anim)
+{
+  AVStream *v_st = anim->pFormatCtx->streams[anim->videoStream];
+  AVRational time_base = v_st->time_base;
+  AVRational frame_rate = av_guess_frame_rate(anim->pFormatCtx, v_st, NULL);
+  return av_q2d(av_inv_q(av_mul_q(frame_rate, time_base)));
+}
+
+/* Store backup frame.
+ * With VFR movies, if PTS is not matched perfectly, scanning continues to look for next PTS.
+ * It is likely to overshoot and scanning stops. Having previous frame backed up, it is possible
+ * to use it when overshoot happens.
+ */
+static void ffmpeg_double_buffer_backup_frame_store(struct anim *anim, int64_t pts_to_search)
+{
+  /* `anim->pFrame` is beyond `pts_to_search`. Don't store it. */
+  if (anim->pFrame_backup_complete && anim->cur_pts >= pts_to_search) {
+    return;
+  }
+  if (!anim->pFrame_complete) {
+    return;
+  }
+
+  if (anim->pFrame_backup_complete) {
+    av_frame_unref(anim->pFrame_backup);
+  }
+
+  av_frame_move_ref(anim->pFrame_backup, anim->pFrame);
+  anim->pFrame_backup_complete = true;
+}
+
+/* Free stored backup frame. */
+static void ffmpeg_double_buffer_backup_frame_clear(struct anim *anim)
+{
+  if (anim->pFrame_backup_complete) {
+    av_frame_unref(anim->pFrame_backup);
+  }
+  anim->pFrame_backup_complete = false;
+}
+
+/* Return recently decoded frame. If it does not exist, return frame from backup buffer. */
+static AVFrame *ffmpeg_double_buffer_frame_fallback_get(struct anim *anim)
+{
+  av_log(anim->pFormatCtx, AV_LOG_ERROR, "DECODE UNHAPPY: PTS not matched!\n");
+
+  if (anim->pFrame_complete) {
+    return anim->pFrame;
+  }
+  if (anim->pFrame_backup_complete) {
+    return anim->pFrame_backup;
+  }
+  return NULL;
+}
+
 /* postprocess the image in anim->pFrame and do color conversion
  * and deinterlacing stuff.
  *
  * Output is anim->cur_frame_final
  */
 
-static void ffmpeg_postprocess(struct anim *anim)
+static void ffmpeg_postprocess(struct anim *anim, AVFrame *input)
 {
-  AVFrame *input = anim->pFrame;
   ImBuf *ibuf = anim->cur_frame_final;
   int filter_y = 0;
 
-  if (!anim->pFrameComplete) {
-    return;
-  }
-
   /* This means the data wasn't read properly,
    * this check stops crashing */
   if (input->data[0] == 0 && input->data[1] == 0 && input->data[2] == 0 && input->data[3] == 0) {
@@ -808,7 +862,7 @@ static void ffmpeg_postprocess(struct anim *anim)
 
   av_log(anim->pFormatCtx,
          AV_LOG_DEBUG,
-         "  POSTPROC: anim->pFrame planes: %p %p %p %p\n",
+         "  POSTPROC: AVFrame planes: %p %p %p %p\n",
          input->data[0],
          input->data[1],
          input->data[2],
@@ -852,6 +906,52 @@ static void ffmpeg_postprocess(struct anim *anim)
   }
 }
 
+static void final_frame_log(struct anim *anim,
+                            int64_t frame_pts_start,
+                            int64_t frame_pts_end,
+                            const char *str)
+{
+  av_log(anim->pFormatCtx,
+         AV_LOG_INFO,
+         "DECODE HAPPY: %s frame PTS range %" PRId64 " - %" PRId64 ".\n",
+         str,
+         frame_pts_start,
+         frame_pts_end);
+}
+
+static bool ffmpeg_pts_isect(int64_t pts_start, int64_t pts_end, int64_t pts_to_search)
+{
+  return pts_start <= pts_to_search && pts_to_search < pts_end;
+}
+
+/* Return frame that matches `pts_to_search`, NULL if matching frame does not exist. */
+static AVFrame *ffmpeg_frame_by_pts_get(struct anim *anim, int64_t pts_to_search)
+{
+  /* NOTE: `frame->pts + frame->pkt_duration` does not always match pts of next frame.
+   * See footage from T86361. Here it is OK to use, because PTS must match current or backup frame.
+   * If there is no current frame, return NULL.
+   */
+  if (!anim->pFrame_complete) {
+    return NULL;
+  }
+
+  const bool backup_frame_ready = anim->pFrame_backup_complete;
+  const int64_t recent_start = av_get_pts_from_frame(anim->pFrame);
+  const int64_t recent_end = recent_start + anim->pFrame->pkt_duration;
+  const int64_t backup_start = backup_frame_ready ? av_get_pts_from_frame(anim->pFrame_backup) : 0;
+
+  AVFrame *best_frame = NULL;
+  if (ffmpeg_pts_isect(recent_start, recent_end, pts_to_search)) {
+    final_frame_log(anim, recent_start, recent_end, "Recent");
+    best_frame = anim->pFrame;
+  }
+  else if (backup_frame_ready && ffmpeg_pts_isect(backup_start, recent_start, pts_to_search)) {
+    final_frame_log(anim, backup_start, recent_start, "Backup");
+    best_frame = anim->pFrame_backup;
+  }
+  return best_frame;
+}
+
 static void ffmpeg_decode_store_frame_pts(struct anim *anim)
 {
   anim->cur_pts = av_get_pts_from_frame(anim->pFrame);
@@ -863,7 +963,7 @@ static void ffmpeg_decode_store_frame_pts(struct anim *anim)
   av_log(anim->pFormatCtx,
          AV_LOG_DEBUG,
          "  FRAME DONE: cur_pts=%" PRId64 ", guessed_pts=%" PRId64 "\n",
-         (anim->pFrame->pts == AV_NOPTS_VALUE) ? -1 : (int64_t)anim->pFrame->pts,
+         av_get_pts_from_frame(anim->pFrame),
          (int64_t)anim->cur_pts);
 }
 
@@ -888,8 +988,8 @@ static int ffmpeg_decode_video_frame(struct anim *anim)
 
   /* Sometimes, decoder returns more than one frame per sent packet. Check if frames are available.
    * This frames must be read, otherwise decoding will fail. See T91405. */
-  anim->pFrameComplete = avcodec_receive_frame(anim->pCodecCtx, anim->pFrame) == 0;
-  if (anim->pFrameComplete) {
+  anim->pFrame_complete = avcodec_receive_frame(anim->pCodecCtx, anim->pFrame) == 0;
+  if (anim->pFrame_complete) {
     av_log(anim->pFormatCtx, AV_LOG_DEBUG, "  DECODE FROM CODEC BUFFER\n");
     ffmpeg_decode_store_frame_pts(anim);
     return 1;
@@ -902,20 +1002,22 @@ static int ffmpeg_decode_video_frame(struct anim *anim)
   }
 
   while ((rval = ffmpeg_read_video_frame(anim, anim->cur_packet)) >= 0) {
+    if (anim->cur_packet->stream_index != anim->videoStream) {
+      continue;
+    }
+
     av_log(anim->pFormatCtx,
            AV_LOG_DEBUG,
-           "%sREAD: strID=%d (VID: %d) dts=%" PRId64 " pts=%" PRId64 " %s\n",
-           (anim->cur_packet->stream_index == anim->videoStream) ? "->" : "  ",
+           "READ: strID=%d dts=%" PRId64 " pts=%" PRId64 " %s\n",
            anim->cur_packet->stream_index,
-           anim->videoStream,
            (anim->cur_packet->dts == AV_NOPTS_VALUE) ? -1 : (int64_t)anim->cur_packet->dts,
            (anim->cur_packet->pts == AV_NOPTS_VALUE) ? -1 : (int64_t)anim->cur_packet->pts,
            (anim->cur_packet->flags & AV_PKT_FLAG_KEY) ? " KEY" : "");
 
     avcodec_send_packet(anim->pCodecCtx, anim->cur_packet);
-    anim->pFrameComplete = avcodec_receive_frame(anim->pCodecCtx, anim->pFrame) == 0;
+    anim->pFrame_complete = avcodec_receive_frame(anim->pCodecCtx, anim->pFrame) == 0;
 
-    if (anim->pFrameComplete) {
+    if (anim->pFrame_complete) {
       ffmpeg_decode_store_frame_pts(anim);
       break;
     }
@@ -926,9 +1028,9 @@ static int ffmpeg_decode_video_frame(struct anim *anim)
   if (rval == AVERROR_EOF) {
     /* Flush any remaining frames out of the decoder. */
     avcodec_send_packet(anim->pCodecCtx, NULL);
-    anim->pFrameComplete = avcodec_receive_frame(anim->pCodecCtx, anim->pFrame) == 0;
+    anim->pFrame_complete = avcodec_receive_frame(anim->pCodecCtx, anim->pFrame) == 0;
 
-    if (anim->pFrameComplete) {
+    if (anim->pFrame_complete) {
       ffmpeg_decode_store_frame_pts(anim);
       rval = 0;
     }
@@ -990,15 +1092,6 @@ static int ffmpeg_seek_by_byte(AVFormatContext *pFormatCtx)
   return false;
 }
 
-static double ffmpeg_steps_per_frame_get(struct anim *anim)
-{
-  AVStream *v_st = anim->pFormatCtx->streams[anim->videoStream];
-  AVRational time_base = v_st->time_base;
-  AVRational frame_rate = av_guess_frame_rate(anim->pFormatCtx, v_st, NULL);
-  return av_q2d(av_inv_q(av_mul_q(frame_rate, time_base)));
-  ;
-}
-
 static int64_t ffmpeg_get_seek_pts(struct anim *anim, int64_t pts_to_search)
 {
   /* Step back half a frame position to make sure that we get the requested
@@ -1035,75 +1128,41 @@ static int64_t ffmpeg_get_pts_to_search(struct anim *anim,
   return pts_to_search;
 }
 
-/* Check if the pts will get us the same frame that we already have in memory from last decode. */
-static bool ffmpeg_pts_matches_last_frame(struct anim *anim, int64_t pts_to_search)
+static bool ffmpeg_is_first_frame_decode(struct anim *anim)
 {
-  if (anim->pFrame && anim->cur_frame_final) {
-    int64_t diff = pts_to_search - anim->cur_pts;
-    return diff >= 0 && diff < anim->pFrame->pkt_duration;
-  }
-
-  return false;
+  return anim->pFrame_complete == false;
 }
 
-static bool ffmpeg_is_first_frame_decode(struct anim *anim, int position)
+static void ffmpeg_scan_log(struct anim *anim, int64_t pts_to_search)
 {
-  return position == 0 && anim->cur_position == -1;
+  int64_t frame_pts_start = av_get_pts_from_frame(anim->pFrame);
+  int64_t frame_pts_end = frame_pts_start + anim->pFrame->pkt_duration;
+  av_log(anim->pFormatCtx,
+         AV_LOG_DEBUG,
+         "  SCAN WHILE: PTS range %" PRId64 " - %" PRId64 " in search of %" PRId64 "\n",
+         frame_pts_start,
+         frame_pts_end,
+         pts_to_search);
 }
 
 /* Decode frames one by one until its PTS matches pts_to_search. */
 static void ffmpeg_decode_video_frame_scan(struct anim *anim, int64_t pts_to_search)
 {
-  av_log(anim->pFormatCtx, AV_LOG_DEBUG, "FETCH: within current GOP\n");
-
-  av_log(anim->pFormatCtx,
-         AV_LOG_DEBUG,
-         "SCAN start: considering pts=%" PRId64 " in search of %" PRId64 "\n",
-         (int64_t)anim->cur_pts,
-         (int64_t)pts_to_search);
-
-  int64_t start_gop_frame = anim->cur_key_frame_pts;
-  bool scan_fuzzy = false;
-
-  while (anim->cur_pts < pts_to_search) {
-    av_log(anim->pFormatCtx,
-           AV_LOG_DEBUG,
-           "  WHILE: pts=%" PRId64 " in search of %" PRId64 "\n",
-           (int64_t)anim->cur_pts,
-           (int64_t)pts_to_search);
-    if (!ffmpeg_decode_video_frame(anim)) {
-      break;
+  const int64_t start_gop_frame = anim->cur_key_frame_pts;
+  bool decode_error = false;
+
+  while (!decode_error && anim->cur_pts < pts_to_search) {
+    ffmpeg_scan_log(anim, pts_to_search);
+    ffmpeg_double_buffer_backup_frame_store(anim, pts_to_search);
+    decode_error = ffmpeg_decode_video_frame(anim) < 1;
+
+    /* We should not get a new GOP keyframe while scanning if seeking is working as intended.
+     * If this condition triggers, there may be an error in our seeking code.
+     * Note: This seems to happen if DTS value is used for seeking in ffmpeg internally. There
+     * seems to be no good way to handle such case. */
+    if (anim->seek_before_decode && start_gop_frame != anim->cur_key_frame_pts) {
+      av_log(anim->pFormatCtx, AV_LOG_ERROR, "SCAN: Frame belongs to an unexpected GOP!\n");
     }
-
-    if (start_gop_frame != anim->cur_key_frame_pts) {
-      break;
-    }
-
-    if (anim->cur_pts < pts_to_search &&
-        anim->cur_pts + anim->pFrame->pkt_duration > pts_to_search) {
-      /* Our estimate of the pts was a bit off, but we have the frame we want. */
-      av_log(anim->pFormatCtx, AV_LOG_DEBUG, "SCAN fuzzy frame match\n");
-      scan_fuzzy = true;
-      break;
-    }
-  }
-
-  if (start_gop_frame != anim->cur_key_frame_pts) {
-    /* We went into an other GOP frame. This should never happen as we should have positioned us
-     * correctly by seeking into the GOP frame that contains the frame we want. */
-    av_log(anim->pFormatCtx,
-           AV_LOG_ERROR,
-           "SCAN failed: completely lost in stream, "
-           "bailing out at PTS=%" PRId64 ", searching for PTS=%" PRId64 "\n",
-           (int64_t)anim->cur_pts,
-           (int64_t)pts_to_search);
-  }
-
-  if (scan_fuzzy || anim->cur_pts == pts_to_search) {
-    av_log(anim->pFormatCtx, AV_LOG_DEBUG, "SCAN HAPPY: we found our PTS!\n");
-  }
-  else {
-    av_log(anim->pFormatCtx, AV_LOG_ERROR, "SCAN UNHAPPY: PTS not matched!\n");
   }
 }
 
@@ -1299,6 +1358,7 @@ static int ffmpeg_seek_to_key_frame(struct anim *anim,
   /* Flush the internal buffers of ffmpeg. This needs to be done after seeking to avoid decoding
    * errors. */
   avcodec_flush_buffers(anim->pCodecCtx);
+  ffmpeg_double_buffer_backup_frame_clear(anim);
 
   anim->cur_pts = -1;
 
@@ -1310,6 +1370,13 @@ static int ffmpeg_seek_to_key_frame(struct anim *anim,
   return ret;
 }
 
+static bool ffmpeg_must_seek(struct anim *anim, int position)
+{
+  bool must_seek = position != anim->cur_position + 1 || ffmpeg_is_first_frame_decode(anim);
+  anim->seek_before_decode = must_seek;
+  return must_seek;
+}
+
 static ImBuf *ffmpeg_fetchibuf(struct anim *anim, int position, IMB_Timecode_Type tc)
 {
   if (anim == NULL) {
@@ -1334,23 +1401,11 @@ static ImBuf *ffmpeg_fetchibuf(struct anim *anim, int position, IMB_Timecode_Typ
          frame_rate,
          start_pts);
 
-  if (ffmpeg_pts_matches_last_frame(anim, pts_to_search)) {
-    av_log(anim->pFormatCtx,
-           AV_LOG_DEBUG,
-           "FETCH: frame repeat: pts: %" PRId64 "\n",
-           (int64_t)anim->cur_pts);
-    IMB_refImBuf(anim->cur_frame_final);
-    anim->cur_position = position;
-    return anim->cur_frame_final;
+  if (ffmpeg_must_seek(anim, position)) {
+    ffmpeg_seek_to_key_frame(anim, position, tc_index, pts_to_search);
   }
 
-  if (position == anim->cur_position + 1 || ffmpeg_is_first_frame_decode(anim, position)) {
-    av_log(anim->pFormatCtx, AV_LOG_DEBUG, "FETCH: no seek necessary, just continue...\n");
-    ffmpeg_decode_video_frame(anim);
-  }
-  else if (ffmpeg_seek_to_key_frame(anim, position, tc_index, pts_to_search) >= 0) {
-    ffmpeg_decode_video_frame_scan(anim, pts_to_search);
-  }
+  ffmpeg_decode_video_frame_scan(anim, pts_to_search);
 
   IMB_freeImBuf(anim->cur_frame_final);
 
@@ -1387,7 +1442,18 @@ static ImBuf *ffmpeg_fetchibuf(struct anim *anim, int position, IMB_Timecode_Typ
 
   anim->cur_frame_final->rect_colorspace = colormanage_colorspace_get_named(anim->colorspace);
 
-  ffmpeg_postprocess(anim);
+  AVFrame *final_frame = ffmpeg_frame_by_pts_get(anim, pts_to_search);
+  if (final_frame == NULL) {
+    /* No valid frame was decoded for requested PTS, fall back on most recent decoded frame, even
+     * if it is incorrect. */
+    final_frame = ffmpeg_double_buffer_frame_fallback_get(anim);
+  }
+
+  /* Even with the fallback from above it is possible that the current decode frame is NULL. In
+   * this case skip post-processing and return current image buffer. */
+  if (final_frame != NULL) {
+    ffmpeg_postprocess(anim, final_frame);
+  }
 
   anim->cur_position = position;
 
@@ -1408,6 +1474,7 @@ static void free_anim_ffmpeg(struct anim *anim)
     av_packet_free(&anim->cur_packet);
 
     av_frame_free(&anim->pFrame);
+    av_frame_free(&anim->pFrame_backup);
     av_frame_free(&anim->pFrameRGB);
     av_frame_free(&anim->pFrameDeinterlaced);
author	Richard Antalik <richardantalik@gmail.com>	2022-06-27 17:16:21 +0300
committer	Richard Antalik <richardantalik@gmail.com>	2022-06-27 17:58:07 +0300
commit	f0a3d2beb23e127ea246f9544d68355fbf3ce438 (patch)
tree	eb36800170d80401e9e0328839c60ce6bd0c45f5 /source/blender/imbuf
parent	6f7171525b4abd0a4c0e22dc64f4d53d3a313643 (diff)