doxygen/trunk/vf__mestimate__d3d12_8c_source.html

/*

 * D3D12 Hardware-Accelerated Motion Estimation Filter

 *

 * Copyright (c) 2025 Advanced Micro Devices, Inc.

 *

 * This file is part of FFmpeg.

 *

 * FFmpeg is free software; you can redistribute it and/or

 * modify it under the terms of the GNU Lesser General Public

 * License as published by the Free Software Foundation; either

 * version 2.1 of the License, or (at your option) any later version.

 *

 * FFmpeg is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

 * Lesser General Public License for more details.

 *

 * You should have received a copy of the GNU Lesser General Public

 * License along with FFmpeg; if not, write to the Free Software

 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

 */


#include "libavutil/avassert.h"

#include "libavutil/buffer.h"

#include "libavutil/hwcontext.h"

#include "libavutil/hwcontext_d3d12va_internal.h"

#include "libavutil/hwcontext_d3d12va.h"

#include "libavutil/internal.h"

#include "libavutil/opt.h"

#include "libavutil/motion_vector.h"

#include "libavutil/mem.h"

#include "avfilter.h"

#include "filters.h"

#include "video.h"


typedef struct MEstimateD3D12Context {

    const AVClass *class;


    AVBufferRef *hw_device_ref;

    AVBufferRef *hw_frames_ref;


    AVD3D12VADeviceContext *device_ctx;

    AVD3D12VAFramesContext *frames_ctx;


    ID3D12Device *device;

    ID3D12VideoDevice1 *video_device;

    ID3D12VideoMotionEstimator *motion_estimator;

    ID3D12VideoMotionVectorHeap *motion_vector_heap;

    ID3D12VideoEncodeCommandList *command_list;

    ID3D12CommandQueue *command_queue;

    ID3D12CommandAllocator *command_allocator;


    // Graphics command list and queue for copy operations

    ID3D12GraphicsCommandList *copy_command_list;

    ID3D12CommandAllocator *copy_command_allocator;

    ID3D12CommandQueue *copy_command_queue;


    // Synchronization

    ID3D12Fence *fence;

    HANDLE fence_event;

    uint64_t fence_value;


    // Motion estimation parameters

    int block_size;                     // 8 or 16

    D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE d3d12_block_size;

    D3D12_VIDEO_MOTION_ESTIMATOR_VECTOR_PRECISION precision;


    // Frame buffer

    AVFrame *prev_frame;

    AVFrame *cur_frame;

    AVFrame *next_frame;


    // Output textures for resolved motion vectors (GPU-side, DEFAULT heap)

    ID3D12Resource *resolved_mv_texture_back;

    ID3D12Resource *resolved_mv_texture_fwd;


    // Readback buffers for CPU access (READBACK heap)

    ID3D12Resource *readback_buffer_back;

    ID3D12Resource *readback_buffer_fwd;

    size_t readback_buffer_size;


    int initialized;

} MEstimateD3D12Context;


static int mestimate_d3d12_init(AVFilterContext *ctx)

{

    MEstimateD3D12Context *s = ctx->priv;


    s->initialized = 0;

    s->fence_value = 0;


    // Validate block size - only 8 and 16 are valid

    if (s->block_size != 8 && s->block_size != 16) {

        av_log(ctx, AV_LOG_ERROR, "Invalid block_size %d. Only 8 and 16 are supported.\n", s->block_size);

        return AVERROR(EINVAL);

    }


    // Set D3D12 block size based on user option

    if (s->block_size == 8)

        s->d3d12_block_size = D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_8X8;

    else

        s->d3d12_block_size = D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_16X16;


    // Use quarter-pel precision

    s->precision = D3D12_VIDEO_MOTION_ESTIMATOR_VECTOR_PRECISION_QUARTER_PEL;


    return 0;

}


static int mestimate_d3d12_create_objects(AVFilterContext *ctx)

{

    MEstimateD3D12Context *s = ctx->priv;

    HRESULT hr;

    D3D12_COMMAND_QUEUE_DESC queue_desc = {

        .Type     = D3D12_COMMAND_LIST_TYPE_VIDEO_ENCODE,

        .Priority = 0,

        .Flags    = D3D12_COMMAND_QUEUE_FLAG_NONE,

        .NodeMask = 0,

    };


    // Create fence for synchronization

    hr = ID3D12Device_CreateFence(s->device, 0, D3D12_FENCE_FLAG_NONE,

                                  &IID_ID3D12Fence, (void **)&s->fence);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create fence\n");

        return AVERROR(EINVAL);

    }


    s->fence_event = CreateEvent(NULL, FALSE, FALSE, NULL);

    if (!s->fence_event) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create fence event\n");

        return AVERROR(EINVAL);

    }


    // Create command queue

    hr = ID3D12Device_CreateCommandQueue(s->device, &queue_desc,

                                         &IID_ID3D12CommandQueue, (void **)&s->command_queue);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create command queue\n");

        return AVERROR(EINVAL);

    }


    // Create command allocator

    hr = ID3D12Device_CreateCommandAllocator(s->device, D3D12_COMMAND_LIST_TYPE_VIDEO_ENCODE,

                                             &IID_ID3D12CommandAllocator, (void **)&s->command_allocator);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create command allocator\n");

        return AVERROR(EINVAL);

    }


    // Create command list

    hr = ID3D12Device_CreateCommandList(s->device, 0, D3D12_COMMAND_LIST_TYPE_VIDEO_ENCODE,

                                        s->command_allocator, NULL, &IID_ID3D12VideoEncodeCommandList,

                                        (void **)&s->command_list);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create command list\n");

        return AVERROR(EINVAL);

    }


    hr = ID3D12VideoEncodeCommandList_Close(s->command_list);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to close command list\n");

        return AVERROR(EINVAL);

    }


    return 0;

}


static int mestimate_d3d12_create_motion_estimator(AVFilterContext *ctx, int width, int height)

{

    MEstimateD3D12Context *s = ctx->priv;

    HRESULT hr;

    D3D12_FEATURE_DATA_VIDEO_MOTION_ESTIMATOR feature_data = {0};

    D3D12_VIDEO_MOTION_ESTIMATOR_DESC me_desc              = {0};

    D3D12_VIDEO_MOTION_VECTOR_HEAP_DESC heap_desc          = {0};


    // Check if motion estimation is supported

    // Set the input parameters for what we want to query

    feature_data.NodeIndex      = 0;

    feature_data.InputFormat    = s->frames_ctx->format;

    feature_data.BlockSizeFlags = 0;  // Will be filled by CheckFeatureSupport with supported flags

    feature_data.PrecisionFlags = 0;  // Will be filled by CheckFeatureSupport with supported flags

    feature_data.SizeRange.MaxWidth  = width;

    feature_data.SizeRange.MaxHeight = height;

    feature_data.SizeRange.MinWidth  = width;

    feature_data.SizeRange.MinHeight = height;


    hr = ID3D12VideoDevice1_CheckFeatureSupport(s->video_device,

                                                D3D12_FEATURE_VIDEO_MOTION_ESTIMATOR,

                                                &feature_data, sizeof(feature_data));

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to check motion estimator support (hr=0x%lx)\n", (long)hr);

        return AVERROR(EINVAL);

    }


    // Verify the requested features are actually supported (check returned flags)

    D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_FLAGS requested_block_flag =

        (s->d3d12_block_size == D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_8X8) ?

        D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_FLAG_8X8 :

        D3D12_VIDEO_MOTION_ESTIMATOR_SEARCH_BLOCK_SIZE_FLAG_16X16;


    if (!(feature_data.BlockSizeFlags & requested_block_flag)) {

        av_log(ctx, AV_LOG_ERROR, "Requested block size (%dx%d) not supported by device (supported flags: 0x%x)\n",

               s->block_size, s->block_size, feature_data.BlockSizeFlags);

        return AVERROR(ENOSYS);

    }


    if (!(feature_data.PrecisionFlags & D3D12_VIDEO_MOTION_ESTIMATOR_VECTOR_PRECISION_FLAG_QUARTER_PEL)) {

        av_log(ctx, AV_LOG_ERROR, "Quarter-pel precision not supported by device (supported flags: 0x%x)\n",

               feature_data.PrecisionFlags);

        return AVERROR(ENOSYS);

    }


    av_log(ctx, AV_LOG_VERBOSE, "Motion estimator support confirmed: block_size=%dx%d, precision=quarter-pel\n",

           s->block_size, s->block_size);


    // Create motion estimator

    me_desc.NodeMask    = 0;

    me_desc.InputFormat = s->frames_ctx->format;

    me_desc.BlockSize   = s->d3d12_block_size;

    me_desc.Precision   = s->precision;

    me_desc.SizeRange   = feature_data.SizeRange;


    hr = ID3D12VideoDevice1_CreateVideoMotionEstimator(s->video_device, &me_desc, NULL,

                                                       &IID_ID3D12VideoMotionEstimator,

                                                       (void **)&s->motion_estimator);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create motion estimator\n");

        return AVERROR(EINVAL);

    }


    // Create motion vector heap

    heap_desc.NodeMask    = 0;

    heap_desc.InputFormat = s->frames_ctx->format;

    heap_desc.BlockSize   = s->d3d12_block_size;

    heap_desc.Precision   = s->precision;

    heap_desc.SizeRange   = feature_data.SizeRange;


    hr = ID3D12VideoDevice1_CreateVideoMotionVectorHeap(s->video_device, &heap_desc, NULL,

                                                        &IID_ID3D12VideoMotionVectorHeap,

                                                        (void **)&s->motion_vector_heap);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create motion vector heap\n");

        return AVERROR(EINVAL);

    }


    // Create resolved motion vector textures in DEFAULT heap (GPU writable)

    // ResolveMotionVectorHeap outputs to TEXTURE2D with DXGI_FORMAT_R16G16_SINT

    int mb_width  = (width + s->block_size - 1) / s->block_size;

    int mb_height = (height + s->block_size - 1) / s->block_size;


    D3D12_HEAP_PROPERTIES heap_props_default = {.Type = D3D12_HEAP_TYPE_DEFAULT};

    D3D12_RESOURCE_DESC texture_desc = {

        .Dimension  = D3D12_RESOURCE_DIMENSION_TEXTURE2D,

        .Alignment        = 0,

        .Width            = mb_width,

        .Height           = mb_height,

        .DepthOrArraySize = 1,

        .MipLevels        = 1,

        .Format           = DXGI_FORMAT_R16G16_SINT,  // Motion vector format: signed 16-bit X,Y

        .SampleDesc       = {.Count = 1, .Quality = 0},

        .Layout           = D3D12_TEXTURE_LAYOUT_UNKNOWN,

        .Flags            = D3D12_RESOURCE_FLAG_NONE,

    };


    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_default, D3D12_HEAP_FLAG_NONE,

                                              &texture_desc, D3D12_RESOURCE_STATE_COMMON, NULL,

                                              &IID_ID3D12Resource, (void **)&s->resolved_mv_texture_back);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create backward motion vector texture (hr=0x%lx)\n", (long)hr);

        return AVERROR(EINVAL);

    }


    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_default, D3D12_HEAP_FLAG_NONE,

                                              &texture_desc, D3D12_RESOURCE_STATE_COMMON, NULL,

                                              &IID_ID3D12Resource, (void **)&s->resolved_mv_texture_fwd);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create forward motion vector texture (hr=0x%lx)\n", (long)hr);

        return AVERROR(EINVAL);

    }


    // Create READBACK buffers for CPU access

    // Need to calculate proper size accounting for D3D12 row pitch alignment

    // Get the footprint to determine the actual required buffer size

    D3D12_PLACED_SUBRESOURCE_FOOTPRINT temp_layout;

    UINT64 temp_total_size;


    ID3D12Device_GetCopyableFootprints(s->device, &texture_desc, 0, 1, 0,

                                       &temp_layout, NULL, NULL, &temp_total_size);


    s->readback_buffer_size = temp_total_size;


    av_log(ctx, AV_LOG_DEBUG, "Readback buffer size: %llu bytes (texture: %dx%d, pitch: %u)\n",

           (unsigned long long)s->readback_buffer_size, mb_width, mb_height, temp_layout.Footprint.RowPitch);


    D3D12_HEAP_PROPERTIES heap_props_readback = {.Type = D3D12_HEAP_TYPE_READBACK};

    D3D12_RESOURCE_DESC buffer_desc = {

        .Dimension = D3D12_RESOURCE_DIMENSION_BUFFER,

        .Alignment        = 0,

        .Width            = s->readback_buffer_size,

        .Height           = 1,

        .DepthOrArraySize = 1,

        .MipLevels        = 1,

        .Format           = DXGI_FORMAT_UNKNOWN,

        .SampleDesc       = {.Count = 1, .Quality = 0},

        .Layout           = D3D12_TEXTURE_LAYOUT_ROW_MAJOR,

        .Flags            = D3D12_RESOURCE_FLAG_NONE,

    };


    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_readback, D3D12_HEAP_FLAG_NONE,

                                              &buffer_desc, D3D12_RESOURCE_STATE_COPY_DEST, NULL,

                                              &IID_ID3D12Resource, (void **)&s->readback_buffer_back);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create backward readback buffer (hr=0x%lx)\n", (long)hr);

        return AVERROR(EINVAL);

    }


    hr = ID3D12Device_CreateCommittedResource(s->device, &heap_props_readback, D3D12_HEAP_FLAG_NONE,

                                              &buffer_desc, D3D12_RESOURCE_STATE_COPY_DEST, NULL,

                                              &IID_ID3D12Resource, (void **)&s->readback_buffer_fwd);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create forward readback buffer (hr=0x%lx)\n", (long)hr);

        return AVERROR(EINVAL);

    }


    // Create graphics command queue, allocator and list for copy operations

    D3D12_COMMAND_QUEUE_DESC copy_queue_desc = {

        .Type     = D3D12_COMMAND_LIST_TYPE_DIRECT,

        .Priority = 0,

        .Flags    = D3D12_COMMAND_QUEUE_FLAG_NONE,

        .NodeMask = 0,

    };


    hr = ID3D12Device_CreateCommandQueue(s->device, &copy_queue_desc,

                                         &IID_ID3D12CommandQueue, (void **)&s->copy_command_queue);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create copy command queue\n");

        return AVERROR(EINVAL);

    }


    hr = ID3D12Device_CreateCommandAllocator(s->device, D3D12_COMMAND_LIST_TYPE_DIRECT,

                                             &IID_ID3D12CommandAllocator, (void **)&s->copy_command_allocator);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create copy command allocator\n");

        return AVERROR(EINVAL);

    }


    hr = ID3D12Device_CreateCommandList(s->device, 0, D3D12_COMMAND_LIST_TYPE_DIRECT,

                                        s->copy_command_allocator, NULL, &IID_ID3D12GraphicsCommandList,

                                        (void **)&s->copy_command_list);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to create copy command list\n");

        return AVERROR(EINVAL);

    }


    hr = ID3D12GraphicsCommandList_Close(s->copy_command_list);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to close copy command list\n");

        return AVERROR(EINVAL);

    }


    return 0;

}


static int mestimate_d3d12_config_props(AVFilterLink *outlink)

{

    AVFilterContext     *ctx = outlink->src;

    AVFilterLink     *inlink = ctx->inputs[0];

    FilterLink          *inl = ff_filter_link(inlink);

    FilterLink         *outl = ff_filter_link(outlink);

    MEstimateD3D12Context *s = ctx->priv;

    AVHWFramesContext *hw_frames_ctx;

    HRESULT hr;

    int err;


    if (!inl->hw_frames_ctx) {

        av_log(ctx, AV_LOG_ERROR, "D3D12 hardware frames context required\n");

        return AVERROR(EINVAL);

    }


    hw_frames_ctx = (AVHWFramesContext *)inl->hw_frames_ctx->data;

    if (hw_frames_ctx->format != AV_PIX_FMT_D3D12) {

        av_log(ctx, AV_LOG_ERROR, "Input must be D3D12 frames\n");

        return AVERROR(EINVAL);

    }


    s->hw_frames_ref = av_buffer_ref(inl->hw_frames_ctx);

    if (!s->hw_frames_ref)

        return AVERROR(ENOMEM);


    s->frames_ctx = hw_frames_ctx->hwctx;

    s->hw_device_ref = av_buffer_ref(hw_frames_ctx->device_ref);

    if (!s->hw_device_ref)

        return AVERROR(ENOMEM);


    s->device_ctx = ((AVHWDeviceContext *)s->hw_device_ref->data)->hwctx;

    s->device = s->device_ctx->device;


    // Propagate hardware frames context to output

    outl->hw_frames_ctx = av_buffer_ref(inl->hw_frames_ctx);

    if (!outl->hw_frames_ctx)

        return AVERROR(ENOMEM);


    // Query for ID3D12VideoDevice1 interface from the base video device

    hr = ID3D12VideoDevice_QueryInterface(s->device_ctx->video_device, &IID_ID3D12VideoDevice1,

                                          (void **)&s->video_device);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "ID3D12VideoDevice1 interface not supported\n");

        return AVERROR(ENOSYS);

    }


    err = mestimate_d3d12_create_objects(ctx);

    if (err < 0)

        return err;


    err = mestimate_d3d12_create_motion_estimator(ctx, inlink->w, inlink->h);

    if (err < 0)

        return err;


    s->initialized = 1;


    return 0;

}


static int mestimate_d3d12_sync_gpu(MEstimateD3D12Context *s)

{

    uint64_t completion = ID3D12Fence_GetCompletedValue(s->fence);


    if (completion < s->fence_value) {

        if (FAILED(ID3D12Fence_SetEventOnCompletion(s->fence, s->fence_value, s->fence_event)))

            return AVERROR(EINVAL);

        WaitForSingleObjectEx(s->fence_event, INFINITE, FALSE);

    }


    return 0;

}


static inline void d3d12_barrier_transition(D3D12_RESOURCE_BARRIER *barrier,

                                            ID3D12Resource *resource,

                                            D3D12_RESOURCE_STATES state_before,

                                            D3D12_RESOURCE_STATES state_after)

{

    barrier->Type  = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;

    barrier->Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;

    barrier->Transition.pResource   = resource;

    barrier->Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;

    barrier->Transition.StateBefore = state_before;

    barrier->Transition.StateAfter  = state_after;

}


static void add_mv_data(AVMotionVector *mv, int mb_size,

                        int x, int y, int x_mv, int y_mv, int dir)

{

    mv->w      = mb_size;

    mv->h      = mb_size;

    mv->dst_x  = x + (mb_size >> 1);

    mv->dst_y  = y + (mb_size >> 1);

    mv->src_x  = x_mv + (mb_size >> 1);

    mv->src_y  = y_mv + (mb_size >> 1);

    mv->source = dir ? 1 : -1;

    mv->flags  = 0;

    mv->motion_x = x_mv - x;

    mv->motion_y = y_mv - y;

    mv->motion_scale = 1;

}


static int mestimate_d3d12_read_motion_vectors(AVFilterContext *ctx, AVFrame *out, int direction)

{

    MEstimateD3D12Context *s = ctx->priv;

    uint8_t *mapped_data = NULL;

    HRESULT hr;

    int err = 0;

    AVFrameSideData *sd;

    AVMotionVector *mvs;

    int mb_x, mb_y, mv_idx;

    int mb_width, mb_height;

    int16_t *d3d12_mvs;

    ID3D12Resource *buffer = (direction == 0) ? s->readback_buffer_back : s->readback_buffer_fwd;


    // Map the readback buffer

    hr = ID3D12Resource_Map(buffer, 0, NULL, (void **)&mapped_data);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to map readback buffer (dir=%d, hr=0x%lx)\n", direction, (long)hr);

        return AVERROR(EINVAL);

    }


    // Get the motion vector side data

    sd = av_frame_get_side_data(out, AV_FRAME_DATA_MOTION_VECTORS);

    if (!sd) {

        av_log(ctx, AV_LOG_ERROR, "No motion vector side data found\n");

        ID3D12Resource_Unmap(buffer, 0, NULL);

        return AVERROR(EINVAL);

    }


    mvs       = (AVMotionVector *)sd->data;

    mb_width  = (out->width + s->block_size - 1) / s->block_size;

    mb_height = (out->height + s->block_size - 1) / s->block_size;


    // Calculate offset for this direction (0 = backward, 1 = forward)

    mv_idx = direction * mb_width * mb_height;


    // Parse D3D12 motion vector format

    // According to Microsoft documentation:

    // - Format: DXGI_FORMAT_R16G16_SINT (2D texture)

    // - Data: Signed 16-bit integers

    // - Units: Quarter-PEL (quarter pixel precision)

    // - Layout: X component in R channel, Y component in G channel

    // - Storage: 2D array matching block layout

    //

    // Each motion vector is stored as two int16_t values (X, Y) in quarter-pel units

    // The buffer is organized as a 2D array: [mb_height][mb_width][2]


    d3d12_mvs = (int16_t *)mapped_data;


    for (mb_y = 0; mb_y < mb_height; mb_y++) {

        for (mb_x = 0; mb_x < mb_width; mb_x++) {

            const int x_mb = mb_x * s->block_size;

            const int y_mb = mb_y * s->block_size;

            const int mv_offset = (mb_y * mb_width + mb_x) * 2;


            // Read motion vector components in quarter-pel units

            // R component (index 0) = X motion

            // G component (index 1) = Y motion

            int16_t mv_x_qpel = d3d12_mvs[mv_offset + 0];

            int16_t mv_y_qpel = d3d12_mvs[mv_offset + 1];


            // Convert from quarter-pel to full pixel coordinates

            // Quarter-pel means the value is 4x the actual pixel displacement

            // So divide by 4 to get pixel displacement

            int src_x = x_mb + (mv_x_qpel / 4);

            int src_y = y_mb + (mv_y_qpel / 4);


            // Store the motion vector data

            // This will set dst (current position) and src (where it came from)

            add_mv_data(&mvs[mv_idx++], s->block_size, x_mb, y_mb, src_x, src_y, direction);


            av_log(ctx, AV_LOG_TRACE, "Block[%d,%d] dir=%d: MV=(%d,%d) qpel -> (%d,%d) pixels\n",

                   mb_x, mb_y, direction, mv_x_qpel, mv_y_qpel,

                   mv_x_qpel / 4, mv_y_qpel / 4);

        }

    }


    ID3D12Resource_Unmap(buffer, 0, NULL);


    av_log(ctx, AV_LOG_DEBUG, "Parsed %d motion vectors for direction %d\n",

           mb_width * mb_height, direction);


    return err;

}


static int mestimate_d3d12_filter_frame(AVFilterLink *inlink, AVFrame *frame)

{

    AVFilterContext     *ctx = inlink->dst;

    MEstimateD3D12Context *s = ctx->priv;

    AVFrame *out;

    AVFrameSideData *sd;

    AVD3D12VAFrame *cur_hwframe, *prev_hwframe, *next_hwframe = NULL;

    HRESULT hr;

    int err;

    int mb_width, mb_height, mb_count;


    if (!s->initialized) {

        err = mestimate_d3d12_config_props(ctx->outputs[0]);

        if (err < 0) {

            av_frame_free(&frame);

            return err;

        }

    }


    // Manage frame buffer

    av_frame_free(&s->prev_frame);

    s->prev_frame = s->cur_frame;

    s->cur_frame  = s->next_frame;

    s->next_frame = frame;


    if (!s->cur_frame) {

        s->cur_frame = av_frame_clone(frame);

        if (!s->cur_frame)

            return AVERROR(ENOMEM);

    }


    if (!s->prev_frame)

        return 0;


    // Clone current frame for output

    out = av_frame_clone(s->cur_frame);

    if (!out)

        return AVERROR(ENOMEM);


    mb_width  = (frame->width + s->block_size - 1) / s->block_size;

    mb_height = (frame->height + s->block_size - 1) / s->block_size;

    mb_count  = mb_width * mb_height;


    // Allocate side data for motion vectors (2 directions)

    sd = av_frame_new_side_data(out, AV_FRAME_DATA_MOTION_VECTORS,

                                2 * mb_count * sizeof(AVMotionVector));

    if (!sd) {

        av_frame_free(&out);

        return AVERROR(ENOMEM);

    }


    // Get hardware frame pointers

    cur_hwframe = (AVD3D12VAFrame *)s->cur_frame->data[0];

    prev_hwframe = (AVD3D12VAFrame *)s->prev_frame->data[0];

    if (s->next_frame)

        next_hwframe = (AVD3D12VAFrame *)s->next_frame->data[0];


    // Reset command allocator and list ONCE for both estimations

    hr = ID3D12CommandAllocator_Reset(s->command_allocator);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to reset command allocator\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    hr = ID3D12VideoEncodeCommandList_Reset(s->command_list, s->command_allocator);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to reset command list\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    // Transition current and previous frames to VIDEO_ENCODE_READ

    D3D12_RESOURCE_BARRIER barriers[3];

    int barrier_count = 2;


    d3d12_barrier_transition(&barriers[0], cur_hwframe->texture,

                            D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);

    d3d12_barrier_transition(&barriers[1], prev_hwframe->texture,

                            D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);


    if (next_hwframe) {

        d3d12_barrier_transition(&barriers[2], next_hwframe->texture,

                                D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_VIDEO_ENCODE_READ);

        barrier_count = 3;

    }


    ID3D12VideoEncodeCommandList_ResourceBarrier(s->command_list, barrier_count, barriers);


    // Backward motion estimation (cur -> prev)

    D3D12_VIDEO_MOTION_ESTIMATOR_INPUT input_back = {

        .pInputTexture2D           = cur_hwframe->texture,

        .InputSubresourceIndex     = 0,

        .pReferenceTexture2D       = prev_hwframe->texture,

        .ReferenceSubresourceIndex = 0,

        .pHintMotionVectorHeap     = NULL,

    };


    D3D12_VIDEO_MOTION_ESTIMATOR_OUTPUT output = {

        .pMotionVectorHeap = s->motion_vector_heap,

    };


    ID3D12VideoEncodeCommandList_EstimateMotion(s->command_list, s->motion_estimator,

                                                &output, &input_back);


    D3D12_RESOLVE_VIDEO_MOTION_VECTOR_HEAP_INPUT resolve_input = {

        .pMotionVectorHeap = s->motion_vector_heap,

        .PixelWidth        = s->cur_frame->width,

        .PixelHeight       = s->cur_frame->height,

    };


    D3D12_RESOLVE_VIDEO_MOTION_VECTOR_HEAP_OUTPUT resolve_output_back = {

        .pMotionVectorTexture2D = s->resolved_mv_texture_back,

        .MotionVectorCoordinate = {.X = 0, .Y = 0, .Z = 0, .SubresourceIndex = 0},

    };


    ID3D12VideoEncodeCommandList_ResolveMotionVectorHeap(s->command_list,

                                                         &resolve_output_back, &resolve_input);


    // Copy resolved texture to readback buffer for CPU access

    // CopyTextureRegion is not available on video encode command list

    // We'll need to read directly from the resolved texture after GPU sync


    // Forward motion estimation (cur -> next) if next frame exists

    if (next_hwframe) {

        D3D12_VIDEO_MOTION_ESTIMATOR_INPUT input_fwd = {

            .pInputTexture2D           = cur_hwframe->texture,

            .InputSubresourceIndex     = 0,

            .pReferenceTexture2D       = next_hwframe->texture,

            .ReferenceSubresourceIndex = 0,

            .pHintMotionVectorHeap     = NULL,

        };


        ID3D12VideoEncodeCommandList_EstimateMotion(s->command_list, s->motion_estimator,

                                                    &output, &input_fwd);


        D3D12_RESOLVE_VIDEO_MOTION_VECTOR_HEAP_OUTPUT resolve_output_fwd = {

            .pMotionVectorTexture2D = s->resolved_mv_texture_fwd,

            .MotionVectorCoordinate = {.X = 0, .Y = 0, .Z = 0, .SubresourceIndex = 0},

        };


        ID3D12VideoEncodeCommandList_ResolveMotionVectorHeap(s->command_list,

                                                             &resolve_output_fwd, &resolve_input);


        // Copy will be done after command list execution

    }


    // Transition resources back to COMMON (reuse barriers by swapping states)

    for (int i = 0; i < barrier_count; i++)

        FFSWAP(D3D12_RESOURCE_STATES, barriers[i].Transition.StateBefore, barriers[i].Transition.StateAfter);


    ID3D12VideoEncodeCommandList_ResourceBarrier(s->command_list, barrier_count, barriers);


    // Close command list ONCE

    hr = ID3D12VideoEncodeCommandList_Close(s->command_list);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to close command list (hr=0x%lx)\n", (long)hr);

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    // Wait for input frame sync

    hr = ID3D12CommandQueue_Wait(s->command_queue, cur_hwframe->sync_ctx.fence,

                                 cur_hwframe->sync_ctx.fence_value);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to wait for current frame\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    hr = ID3D12CommandQueue_Wait(s->command_queue, prev_hwframe->sync_ctx.fence,

                                 prev_hwframe->sync_ctx.fence_value);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to wait for previous frame\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    if (next_hwframe) {

        hr = ID3D12CommandQueue_Wait(s->command_queue, next_hwframe->sync_ctx.fence,

                                     next_hwframe->sync_ctx.fence_value);

        if (FAILED(hr)) {

            av_log(ctx, AV_LOG_ERROR, "Failed to wait for next frame\n");

            av_frame_free(&out);

            return AVERROR(EINVAL);

        }

    }


    // Execute command list ONCE

    ID3D12CommandQueue_ExecuteCommandLists(s->command_queue, 1, (ID3D12CommandList **)&s->command_list);


    // Signal completion

    hr = ID3D12CommandQueue_Signal(s->command_queue, s->fence, ++s->fence_value);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to signal fence\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    // Wait for GPU to complete

    err = mestimate_d3d12_sync_gpu(s);

    if (err < 0) {

        av_frame_free(&out);

        return err;

    }


    // Now copy the resolved textures to readback buffers using graphics command list

    hr = ID3D12CommandAllocator_Reset(s->copy_command_allocator);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to reset copy command allocator\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    hr = ID3D12GraphicsCommandList_Reset(s->copy_command_list, s->copy_command_allocator, NULL);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to reset copy command list\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    // Transition resolved textures to COPY_SOURCE state

    D3D12_RESOURCE_BARRIER copy_barriers[2];

    int copy_barrier_count = 1;


    d3d12_barrier_transition(&copy_barriers[0], s->resolved_mv_texture_back,

                            D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);


    if (s->next_frame) {

        d3d12_barrier_transition(&copy_barriers[1], s->resolved_mv_texture_fwd,

                                D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);

        copy_barrier_count = 2;

    }


    ID3D12GraphicsCommandList_ResourceBarrier(s->copy_command_list, copy_barrier_count, copy_barriers);


    // Get texture layout for backward copy

    D3D12_RESOURCE_DESC texture_desc_back;

    D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout_back;

    UINT64 row_size_back, total_size_back;

    UINT num_rows_back;


    // Get the resource description for backward texture

    s->resolved_mv_texture_back->lpVtbl->GetDesc(s->resolved_mv_texture_back, &texture_desc_back);


    av_log(ctx, AV_LOG_DEBUG, "Back texture desc: Width=%llu, Height=%u, Format=%d\n",

           (unsigned long long)texture_desc_back.Width, texture_desc_back.Height, texture_desc_back.Format);


    // Get the copyable footprints for the backward texture

    ID3D12Device_GetCopyableFootprints(s->device, &texture_desc_back, 0, 1, 0,

                                       &layout_back, &num_rows_back, &row_size_back, &total_size_back);


    av_log(ctx, AV_LOG_DEBUG, "Back layout: Offset=%llu, Width=%u, Height=%u, Depth=%u, RowPitch=%u\n",

           (unsigned long long)layout_back.Offset, layout_back.Footprint.Width, layout_back.Footprint.Height,

           layout_back.Footprint.Depth, layout_back.Footprint.RowPitch);


    // Copy backward motion vectors

    D3D12_TEXTURE_COPY_LOCATION src_back = {

        .pResource = s->resolved_mv_texture_back,

        .Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX,

        .SubresourceIndex = 0

    };


    D3D12_TEXTURE_COPY_LOCATION dst_back = {

        .pResource = s->readback_buffer_back,

        .Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT,

        .PlacedFootprint = {

            .Offset = 0,

            .Footprint = layout_back.Footprint

        }

    };


    av_log(ctx, AV_LOG_DEBUG, "Copying backward MVs...\n");

    ID3D12GraphicsCommandList_CopyTextureRegion(s->copy_command_list, &dst_back, 0, 0, 0, &src_back, NULL);


    // Copy forward motion vectors if available

    if (s->next_frame) {

        // Get texture layout for forward copy

        D3D12_RESOURCE_DESC texture_desc_fwd;

        D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout_fwd;

        UINT64 row_size_fwd, total_size_fwd;

        UINT num_rows_fwd;


        // Get the resource description for forward texture

        s->resolved_mv_texture_fwd->lpVtbl->GetDesc(s->resolved_mv_texture_fwd, &texture_desc_fwd);


        av_log(ctx, AV_LOG_DEBUG, "Fwd texture desc: Width=%llu, Height=%u, Format=%d\n",

               (unsigned long long)texture_desc_fwd.Width, texture_desc_fwd.Height, texture_desc_fwd.Format);


        // Get the copyable footprints for the forward texture

        ID3D12Device_GetCopyableFootprints(s->device, &texture_desc_fwd, 0, 1, 0,

                                           &layout_fwd, &num_rows_fwd, &row_size_fwd, &total_size_fwd);


        av_log(ctx, AV_LOG_DEBUG, "Fwd layout: Offset=%llu, Width=%u, Height=%u, Depth=%u, RowPitch=%u\n",

               (unsigned long long)layout_fwd.Offset, layout_fwd.Footprint.Width, layout_fwd.Footprint.Height,

               layout_fwd.Footprint.Depth, layout_fwd.Footprint.RowPitch);


        D3D12_TEXTURE_COPY_LOCATION src_fwd = {

            .pResource = s->resolved_mv_texture_fwd,

            .Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX,

            .SubresourceIndex = 0

        };


        D3D12_TEXTURE_COPY_LOCATION dst_fwd = {

            .pResource = s->readback_buffer_fwd,

            .Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT,

            .PlacedFootprint = {

                .Offset = 0,

                .Footprint = layout_fwd.Footprint

            }

        };


        av_log(ctx, AV_LOG_DEBUG, "Copying forward MVs...\n");

        ID3D12GraphicsCommandList_CopyTextureRegion(s->copy_command_list, &dst_fwd, 0, 0, 0, &src_fwd, NULL);

    }


    // Transition back to COMMON state (reuse barriers by swapping states)

    for (int i = 0; i < copy_barrier_count; i++)

        FFSWAP(D3D12_RESOURCE_STATES, copy_barriers[i].Transition.StateBefore, copy_barriers[i].Transition.StateAfter);


    ID3D12GraphicsCommandList_ResourceBarrier(s->copy_command_list, copy_barrier_count, copy_barriers);


    hr = ID3D12GraphicsCommandList_Close(s->copy_command_list);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to close copy command list (hr=0x%lx)\n", (long)hr);

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    // Execute copy command list on the copy queue

    ID3D12CommandQueue_ExecuteCommandLists(s->copy_command_queue, 1, (ID3D12CommandList **)&s->copy_command_list);


    // Signal and wait for copy completion

    hr = ID3D12CommandQueue_Signal(s->copy_command_queue, s->fence, ++s->fence_value);

    if (FAILED(hr)) {

        av_log(ctx, AV_LOG_ERROR, "Failed to signal fence for copy\n");

        av_frame_free(&out);

        return AVERROR(EINVAL);

    }


    err = mestimate_d3d12_sync_gpu(s);

    if (err < 0) {

        av_frame_free(&out);

        return err;

    }


    // Read motion vectors for both directions

    err = mestimate_d3d12_read_motion_vectors(ctx, out, 0);

    if (err < 0) {

        av_frame_free(&out);

        return err;

    }


    if (s->next_frame) {

        err = mestimate_d3d12_read_motion_vectors(ctx, out, 1);

        if (err < 0) {

            av_frame_free(&out);

            return err;

        }

    }


    return ff_filter_frame(ctx->outputs[0], out);

}


static av_cold void mestimate_d3d12_uninit(AVFilterContext *ctx)

{

    MEstimateD3D12Context *s = ctx->priv;


    av_frame_free(&s->prev_frame);

    av_frame_free(&s->cur_frame);

    av_frame_free(&s->next_frame);


    D3D12_OBJECT_RELEASE(s->copy_command_list);

    D3D12_OBJECT_RELEASE(s->copy_command_allocator);

    D3D12_OBJECT_RELEASE(s->copy_command_queue);

    D3D12_OBJECT_RELEASE(s->readback_buffer_back);

    D3D12_OBJECT_RELEASE(s->readback_buffer_fwd);

    D3D12_OBJECT_RELEASE(s->resolved_mv_texture_back);

    D3D12_OBJECT_RELEASE(s->resolved_mv_texture_fwd);

    D3D12_OBJECT_RELEASE(s->motion_vector_heap);

    D3D12_OBJECT_RELEASE(s->motion_estimator);

    D3D12_OBJECT_RELEASE(s->command_list);

    D3D12_OBJECT_RELEASE(s->command_allocator);

    D3D12_OBJECT_RELEASE(s->command_queue);

    D3D12_OBJECT_RELEASE(s->fence);


    if (s->fence_event)

        CloseHandle(s->fence_event);


    av_buffer_unref(&s->hw_frames_ref);

    av_buffer_unref(&s->hw_device_ref);

}


static const AVFilterPad mestimate_d3d12_inputs[] = {

    {

        .name         = "default",

        .type         = AVMEDIA_TYPE_VIDEO,

        .filter_frame = mestimate_d3d12_filter_frame,

    },

};


static const AVFilterPad mestimate_d3d12_outputs[] = {

    {

        .name         = "default",

        .type         = AVMEDIA_TYPE_VIDEO,

        .config_props = mestimate_d3d12_config_props,

    },

};


#define OFFSET(x) offsetof(MEstimateD3D12Context, x)

#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM


static const AVOption mestimate_d3d12_options[] = {

    { "mb_size", "macroblock size, only 8 and 16 are supported", OFFSET(block_size), AV_OPT_TYPE_INT, {.i64 = 16}, 8, 16, FLAGS },

    { NULL }

};


AVFILTER_DEFINE_CLASS(mestimate_d3d12);


const FFFilter ff_vf_mestimate_d3d12 = {

    .p.name         = "mestimate_d3d12",

    .p.description  = NULL_IF_CONFIG_SMALL("Generate motion vectors using D3D12 hardware acceleration."),

    .p.priv_class   = &mestimate_d3d12_class,

    .p.flags        = AVFILTER_FLAG_METADATA_ONLY | AVFILTER_FLAG_HWDEVICE,

    .priv_size      = sizeof(MEstimateD3D12Context),

    .init           = mestimate_d3d12_init,

    .uninit         = mestimate_d3d12_uninit,

    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,

    FILTER_INPUTS(mestimate_d3d12_inputs),

    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_D3D12),

    FILTER_OUTPUTS(mestimate_d3d12_outputs),

};